Index: projects/nfsv42/sys/amd64/amd64/pmap.c
===================================================================
--- projects/nfsv42/sys/amd64/amd64/pmap.c	(revision 350367)
+++ projects/nfsv42/sys/amd64/amd64/pmap.c	(revision 350368)
@@ -1,9949 +1,9939 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2003 Peter Wemm
  * All rights reserved.
  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * Copyright (c) 2014-2019 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Portions of this software were developed by
  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #define	AMD64_NPT_AWARE
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include "opt_ddb.h"
 #include "opt_pmap.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/bitstring.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rangeset.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/turnstile.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #ifdef DDB
 #include <sys/kdb.h>
 #include <ddb/ddb.h>
 #endif
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #include <x86/ifunc.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/sysarch.h>
 #include <machine/tss.h>
 
 /*
  * Report whether the pmap's page tables are of a guest type, that is,
  * either PT_EPT or PT_RVI.
  */
 static __inline boolean_t
 pmap_type_guest(pmap_t pmap)
 {
 	int type;
 
 	type = pmap->pm_type;
 	return (type == PT_EPT || type == PT_RVI);
 }
 
 /*
  * Report whether PMAP_EMULATE_AD_BITS is set in the pmap's flags.
  */
 static __inline boolean_t
 pmap_emulate_ad_bits(pmap_t pmap)
 {
 
 	if ((pmap->pm_flags & PMAP_EMULATE_AD_BITS) == 0)
 		return (0);
 	return (1);
 }
 
 /*
  * Return the page table entry bit that marks an entry as valid for the
  * given pmap's page table type.  Panics on an unknown type.
  */
 static __inline pt_entry_t
 pmap_valid_bit(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_X86:
 	case PT_RVI:
 		return (X86_PG_V);
 	case PT_EPT:
 		/* EPT uses a different bit when A/D bits are emulated. */
 		if (pmap_emulate_ad_bits(pmap))
 			return (EPT_PG_EMUL_V);
 		return (EPT_PG_READ);
 	default:
 		panic("pmap_valid_bit: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /*
  * Return the page table entry bit that grants write access for the
  * given pmap's page table type.  Panics on an unknown type.
  */
 static __inline pt_entry_t
 pmap_rw_bit(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_X86:
 	case PT_RVI:
 		return (X86_PG_RW);
 	case PT_EPT:
 		/* EPT uses a different bit when A/D bits are emulated. */
 		if (pmap_emulate_ad_bits(pmap))
 			return (EPT_PG_EMUL_RW);
 		return (EPT_PG_WRITE);
 	default:
 		panic("pmap_rw_bit: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /* Value returned by pmap_global_bit() for PT_X86 pmaps. */
 static pt_entry_t pg_g;
 
 /*
  * Return the "global" page table entry bit for the given pmap: pg_g for
  * PT_X86, and no bit at all for the PT_RVI and PT_EPT types.  Panics on
  * an unknown type.
  */
 static __inline pt_entry_t
 pmap_global_bit(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_X86:
 		return (pg_g);
 	case PT_RVI:
 	case PT_EPT:
 		return (0);
 	default:
 		panic("pmap_global_bit: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /*
  * Return the page table entry bit that records an access for the given
  * pmap's page table type.  Panics on an unknown type.
  */
 static __inline pt_entry_t
 pmap_accessed_bit(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_X86:
 	case PT_RVI:
 		return (X86_PG_A);
 	case PT_EPT:
 		/* With A/D emulation the read permission bit stands in. */
 		if (pmap_emulate_ad_bits(pmap))
 			return (EPT_PG_READ);
 		return (EPT_PG_A);
 	default:
 		panic("pmap_accessed_bit: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /*
  * Return the page table entry bit that records a modification for the
  * given pmap's page table type.  Panics on an unknown type.
  */
 static __inline pt_entry_t
 pmap_modified_bit(pmap_t pmap)
 {
 
 	switch (pmap->pm_type) {
 	case PT_X86:
 	case PT_RVI:
 		return (X86_PG_M);
 	case PT_EPT:
 		/* With A/D emulation the write permission bit stands in. */
 		if (pmap_emulate_ad_bits(pmap))
 			return (EPT_PG_WRITE);
 		return (EPT_PG_M);
 	default:
 		panic("pmap_modified_bit: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /*
  * Return the mask of protection key bits in a page table entry; only
  * PT_X86 pmaps have any, all other types yield 0.
  */
 static __inline pt_entry_t
 pmap_pku_mask_bit(pmap_t pmap)
 {
 
 	if (pmap->pm_type != PT_X86)
 		return (0);
 	return (X86_PG_PKU_MASK);
 }
 
 /*
  * PMAP_INLINE expands to nothing when DIAGNOSTIC is defined.  Otherwise
  * it selects an inline form, using the gnu_inline attribute when the
  * compiler defines __GNUC_GNU_INLINE__ and "extern inline" if not.
  */
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
 #else
 #define PMAP_INLINE	extern inline
 #endif
 #else
 #define PMAP_INLINE
 #endif
 
 /* PV_STAT(x) executes the statement x only when PV_STATS is defined. */
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 /*
  * pa_index() maps a physical address to the index of the region of
  * size (1 << PDRSHIFT) containing it; pa_to_pvh() yields the pv_table
  * entry for that region.
  */
 #define	pa_index(pa)	((pa) >> PDRSHIFT)
 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
 
 /* Number of elements in pv_list_locks[] and pv_invl_gen[]. */
 #define	NPV_LIST_LOCKS	MAXCPU
 
 /* Select the PV list lock for a physical address: pa_index() mod N. */
 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
 			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
 
 /*
  * Make *lockp refer to the write-locked PV list lock for "pa".  If
  * *lockp already is that lock nothing happens; otherwise any lock
  * currently referenced by *lockp is write-unlocked first and the new
  * one is write-locked.
  */
 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
 	struct rwlock **_lockp = (lockp);		\
 	struct rwlock *_new_lock;			\
 							\
 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
 	if (_new_lock != *_lockp) {			\
 		if (*_lockp != NULL)			\
 			rw_wunlock(*_lockp);		\
 		*_lockp = _new_lock;			\
 		rw_wlock(*_lockp);			\
 	}						\
 } while (0)
 
 /* Same as CHANGE_PV_LIST_LOCK_TO_PHYS(), keyed by a vm_page. */
 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
 
 /* Write-unlock the lock referenced by *lockp, if any, and clear *lockp. */
 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
 	struct rwlock **_lockp = (lockp);		\
 							\
 	if (*_lockp != NULL) {				\
 		rw_wunlock(*_lockp);			\
 		*_lockp = NULL;				\
 	}						\
 } while (0)
 
 /* Select the PV list lock for a vm_page. */
 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
 
 /*
  * Statically allocated pmap structure.
  * NOTE(review): presumably the backing store for kernel_pmap, which is
  * defined outside this file -- confirm.
  */
 struct pmap kernel_pmap_store;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 
 /* Exported read-only as machdep.nkpt. */
 int nkpt;
 SYSCTL_INT(_machdep, OID_AUTO, nkpt, CTLFLAG_RD, &nkpt, 0,
     "Number of kernel page table pages allocated on bootup");
 
 static int ndmpdp;
 vm_paddr_t dmaplimit;
 vm_offset_t kernel_vm_end = VM_MIN_KERNEL_ADDRESS;
 pt_entry_t pg_nx;
 
 /* Parent node for the vm.pmap.* sysctls declared in this file. */
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
 
 /* Exported as the read-only tunable vm.pmap.pg_ps_enabled; default on. */
 static int pg_ps_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pg_ps_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &pg_ps_enabled, 0, "Are large page mappings enabled?");
 
 #define	PAT_INDEX_SIZE	8
 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
 
 static u_int64_t	KPTphys;	/* phys addr of kernel level 1 */
 static u_int64_t	KPDphys;	/* phys addr of kernel level 2 */
 u_int64_t		KPDPphys;	/* phys addr of kernel level 3 */
 u_int64_t		KPML4phys;	/* phys addr of kernel level 4 */
 
 static u_int64_t	DMPDphys;	/* phys addr of direct mapped level 2 */
 static u_int64_t	DMPDPphys;	/* phys addr of direct mapped level 3 */
 static int		ndmpdpphys;	/* number of DMPDPphys pages */
 
 static vm_paddr_t	KERNend;	/* phys addr of end of bootstrap data */
 
 /*
  * pmap_mapdev support pre initialization (i.e. console)
  *
  * A fixed table of PMAP_PREINIT_MAPPING_COUNT records, each holding a
  * physical address, a virtual address, a size and a mode.
  * NOTE(review): how the table and pmap_initialized are used is not
  * visible in this part of the file.
  */
 #define	PMAP_PREINIT_MAPPING_COUNT	8
 static struct pmap_preinit_mapping {
 	vm_paddr_t	pa;
 	vm_offset_t	va;
 	vm_size_t	sz;
 	int		mode;
 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
 static int pmap_initialized;
 
 /*
  * Data for the pv entry allocation mechanism.
  * Updates to pv_invl_gen are protected by the pv_list_locks[]
  * elements, but reads are not.
  *
  * pv_list_locks[] and pv_invl_gen[] both have NPV_LIST_LOCKS elements
  * and are indexed by pa_index(pa) % NPV_LIST_LOCKS.
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx __exclusive_cache_line pv_chunks_mutex;
 static struct rwlock __exclusive_cache_line pv_list_locks[NPV_LIST_LOCKS];
 static u_long pv_invl_gen[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
 
 /*
  * All those kernel PT submaps that BSD is so fond of
  */
 pt_entry_t *CMAP1 = NULL;
 caddr_t CADDR1 = 0;
 static vm_offset_t qframe = 0;
 static struct mtx qframe_mtx;
 
 static int pmap_flags = PMAP_PDE_SUPERPAGE;	/* flags for x86 pmaps */
 
 /*
  * PMAP_LARGEMAP_MAX_ADDRESS() evaluates to LARGEMAP_MIN_ADDRESS plus
  * lm_ents regions of NBPML4 bytes each.
  */
 static vmem_t *large_vmem;
 static u_int lm_ents;
 #define	PMAP_LARGEMAP_MAX_ADDRESS()			\
     (LARGEMAP_MIN_ADDRESS + NBPML4 * (u_long)lm_ents)
 
 /* Exported as the read-only tunable vm.pmap.pcid_enabled; default on. */
 int pmap_pcid_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pcid_enabled, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &pmap_pcid_enabled, 0, "Is TLB Context ID enabled ?");
 /* Exported read-only as vm.pmap.invpcid_works. */
 int invpcid_works = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, invpcid_works, CTLFLAG_RD, &invpcid_works, 0,
     "Is the invpcid instruction available ?");
 
 /* Exported as the read-only tunable vm.pmap.pti; default off here. */
 int __read_frequently pti = 0;
 SYSCTL_INT(_vm_pmap, OID_AUTO, pti, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
     &pti, 0,
     "Page Table Isolation enabled");
 static vm_object_t pti_obj;
 static pml4_entry_t *pti_pml4;
 static vm_pindex_t pti_pg_idx;
 static bool pti_finalized;
 
 /*
  * Element of a rangeset (it embeds struct rs_el) carrying a key index
  * and flags for an address range.
  */
 struct pmap_pkru_range {
 	struct rs_el	pkru_rs_el;
 	u_int		pkru_keyidx;
 	int		pkru_flags;
 };
 
 /* Forward declarations of the protection key helpers. */
 static uma_zone_t pmap_pkru_ranges_zone;
 static bool pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
 static pt_entry_t pmap_pkru_get(pmap_t pmap, vm_offset_t va);
 static void pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
 static void *pkru_dup_range(void *ctx, void *data);
 static void pkru_free_range(void *ctx, void *node);
 static int pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap);
 static int pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
 static void pmap_pkru_deassign_all(pmap_t pmap);
 
 static int
 pmap_pcid_save_cnt_proc(SYSCTL_HANDLER_ARGS)
 {
 	int i;
 	uint64_t res;
 
 	res = 0;
 	CPU_FOREACH(i) {
 		res += cpuid_to_pcpu[i]->pc_pm_save_cnt;
 	}
 	return (sysctl_handle_64(oidp, &res, 0, req));
 }
 SYSCTL_PROC(_vm_pmap, OID_AUTO, pcid_save_cnt, CTLTYPE_U64 | CTLFLAG_RD |
     CTLFLAG_MPSAFE, NULL, 0, pmap_pcid_save_cnt_proc, "QU",
     "Count of saved TLB context on switch");
 
 /*
  * State of the locked delayed invalidation (DI) variant: a list of the
  * DI blocks in progress, manipulated under invl_gen_mtx.
  */
 static LIST_HEAD(, pmap_invl_gen) pmap_invl_gen_tracker =
     LIST_HEAD_INITIALIZER(&pmap_invl_gen_tracker);
 static struct mtx invl_gen_mtx;
 /* Fake lock object to satisfy turnstiles interface. */
 static struct lock_object invl_gen_ts = {
 	.lo_name = "invlts",
 };
 /*
  * Head of the singly-linked list walked by the lockless DI variant.
  * Its gen field is what pmap_delayed_invl_wait_u() compares against.
  */
 static struct pmap_invl_gen pmap_invl_gen_head = {
 	.gen = 1,
 	.next = NULL,
 };
 /* Global DI generation compared against by pmap_delayed_invl_wait_l(). */
 static u_long pmap_invl_gen = 1;
 /*
  * Count of threads in the blocking path of pmap_delayed_invl_wait_u(),
  * and the callout those threads arm to recover from a missed wakeup.
  */
 static int pmap_invl_waiters;
 static struct callout pmap_invl_callout;
 static bool pmap_invl_callout_inited;
 
 /* Assert that the current thread is not already inside a DI block. */
 #define	PMAP_ASSERT_NOT_IN_DI() \
     KASSERT(pmap_not_in_di(), ("DI already started"))
 
 /*
  * Decide whether the locked variant of delayed invalidation is used.
  * It always is when the CPU lacks CPUID2_CX16; otherwise it is used
  * only if the "vm.pmap.di_locked" tunable is set to a non-zero value.
  */
 static bool
 pmap_di_locked(void)
 {
 	int locked;
 
 	if ((cpu_feature2 & CPUID2_CX16) != 0) {
 		locked = 0;
 		TUNABLE_INT_FETCH("vm.pmap.di_locked", &locked);
 		return (locked != 0);
 	}
 	return (true);
 }
 
 /*
  * Sysctl handler for vm.pmap.di_locked: reports the result of
  * pmap_di_locked() as an integer.
  */
 static int
 sysctl_pmap_di_locked(SYSCTL_HANDLER_ARGS)
 {
 	int val;
 
 	val = pmap_di_locked();
 	return (sysctl_handle_int(oidp, &val, 0, req));
 }
 SYSCTL_PROC(_vm_pmap, OID_AUTO, di_locked,
     CTLTYPE_INT | CTLFLAG_RDTUN | CTLFLAG_MPSAFE, 0, 0,
     sysctl_pmap_di_locked, "", "Locked delayed invalidation");
 
 static bool pmap_not_in_di_l(void);
 static bool pmap_not_in_di_u(void);
 
 /*
  * Resolver: bind pmap_not_in_di() to the locked or the lockless
  * implementation according to pmap_di_locked().
  */
 DEFINE_IFUNC(, bool, pmap_not_in_di, (void))
 {
 
 	if (pmap_di_locked())
 		return (pmap_not_in_di_l);
 	return (pmap_not_in_di_u);
 }
 
 /*
  * Locked variant: the current thread is outside a DI block when its
  * DI generation number is zero.
  */
 static bool
 pmap_not_in_di_l(void)
 {
 
 	return (curthread->td_md.md_invl_gen.gen == 0);
 }
 
 /*
  * Locked variant: initialize a thread's DI state by zeroing its
  * generation number.
  */
 static void
 pmap_thread_init_invl_gen_l(struct thread *td)
 {
 
 	td->td_md.md_invl_gen.gen = 0;
 }
 
 /*
  * Block the current thread on the invl_gen_ts turnstile until woken,
  * but only if the page generation *m_gen is still greater than the
  * generation read from *invl_gen.  The comparison is repeated here,
  * after turnstile_trywait(), and the wait is cancelled if it no longer
  * holds.
  */
 static void
 pmap_delayed_invl_wait_block(u_long *m_gen, u_long *invl_gen)
 {
 	struct turnstile *ts;
 
 	ts = turnstile_trywait(&invl_gen_ts);
 	if (*m_gen > atomic_load_long(invl_gen))
 		turnstile_wait(ts, NULL, TS_SHARED_QUEUE);
 	else
 		turnstile_cancel(ts);
 }
 
 /*
  * Wake up every thread waiting on the shared queue of the invl_gen_ts
  * turnstile.  When new_gen is non-zero it is first stored into the
  * global pmap_invl_gen; callers pass 0 to only perform the wakeup.
  * Both steps happen with the turnstile chain lock held.
  */
 static void
 pmap_delayed_invl_finish_unblock(u_long new_gen)
 {
 	struct turnstile *ts;
 
 	turnstile_chain_lock(&invl_gen_ts);
 	ts = turnstile_lookup(&invl_gen_ts);
 	if (new_gen != 0)
 		pmap_invl_gen = new_gen;
 	/* No turnstile means there is nobody to wake. */
 	if (ts != NULL) {
 		turnstile_broadcast(ts, TS_SHARED_QUEUE);
 		turnstile_unpend(ts);
 	}
 	turnstile_chain_unlock(&invl_gen_ts);
 }
 
 /*
  * Start a new Delayed Invalidation (DI) block of code, executed by
  * the current thread.  Within a DI block, the current thread may
  * destroy both the page table and PV list entries for a mapping and
  * then release the corresponding PV list lock before ensuring that
  * the mapping is flushed from the TLBs of any processors with the
  * pmap active.
  */
 static void
 pmap_delayed_invl_start_l(void)
 {
 	struct pmap_invl_gen *invl_gen;
 	u_long currgen;
 
 	invl_gen = &curthread->td_md.md_invl_gen;
 	PMAP_ASSERT_NOT_IN_DI();
 	mtx_lock(&invl_gen_mtx);
 	if (LIST_EMPTY(&pmap_invl_gen_tracker))
 		currgen = pmap_invl_gen;
 	else
 		currgen = LIST_FIRST(&pmap_invl_gen_tracker)->gen;
 	invl_gen->gen = currgen + 1;
 	LIST_INSERT_HEAD(&pmap_invl_gen_tracker, invl_gen, link);
 	mtx_unlock(&invl_gen_mtx);
 }
 
 /*
  * Finish the DI block, previously started by the current thread.  All
  * required TLB flushes for the pages marked by
  * pmap_delayed_invl_page() must be finished before this function is
  * called.
  *
  * This function works by bumping the global DI generation number to
  * the generation number of the current thread's DI, unless there is a
  * pending DI that started earlier.  In the latter case, bumping the
  * global DI generation number would incorrectly signal that the
  * earlier DI had finished.  Instead, this function bumps the earlier
  * DI's generation number to match the generation number of the
  * current thread's DI.
  */
 static void
 pmap_delayed_invl_finish_l(void)
 {
 	struct pmap_invl_gen *invl_gen, *next;
 
 	invl_gen = &curthread->td_md.md_invl_gen;
 	KASSERT(invl_gen->gen != 0, ("missed invl_start"));
 	mtx_lock(&invl_gen_mtx);
 	/*
 	 * New entries are inserted at the list head, so the next element,
 	 * if any, belongs to a DI block that started earlier.
 	 */
 	next = LIST_NEXT(invl_gen, link);
 	if (next == NULL)
 		pmap_delayed_invl_finish_unblock(invl_gen->gen);
 	else
 		next->gen = invl_gen->gen;
 	LIST_REMOVE(invl_gen, link);
 	mtx_unlock(&invl_gen_mtx);
 	/* A zero generation marks the thread as outside any DI block. */
 	invl_gen->gen = 0;
 }
 
 static bool
 pmap_not_in_di_u(void)
 {
 	struct pmap_invl_gen *invl_gen;
 
 	invl_gen = &curthread->td_md.md_invl_gen;
 	return (((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) != 0);
 }
 
 /*
  * Lockless variant: initialize a thread's DI state with a zero
  * generation and a "next" pointer consisting solely of the
  * PMAP_INVL_GEN_NEXT_INVALID bit.
  */
 static void
 pmap_thread_init_invl_gen_u(struct thread *td)
 {
 	struct pmap_invl_gen *di;
 
 	di = &td->td_md.md_invl_gen;
 	di->gen = 0;
 	di->next = (void *)PMAP_INVL_GEN_NEXT_INVALID;
 }
 
 /*
  * Atomically read the 16-byte {gen, next} pair at *ptr into *out.
  *
  * The read is done with a locked cmpxchg16b whose expected and
  * replacement values are both zero: if the pair is zero it is
  * rewritten with zero, otherwise the instruction fails and returns the
  * current contents in the rdx:rax register pair.
  *
  * Returns false, leaving *out untouched, when the value read has
  * PMAP_INVL_GEN_NEXT_INVALID set in its "next" half; returns true
  * otherwise.
  */
 static bool
 pmap_di_load_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *out)
 {
 	uint64_t new_high, new_low, old_high, old_low;
 	char res;
 
 	old_low = new_low = 0;
 	old_high = new_high = (uintptr_t)0;
 
 	__asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
 	    : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
 	    : "b"(new_low), "c" (new_high)
 	    : "memory", "cc");
 	if (res == 0) {
 		/* Comparison failed: old_low/old_high hold the contents. */
 		if ((old_high & PMAP_INVL_GEN_NEXT_INVALID) != 0)
 			return (false);
 		out->gen = old_low;
 		out->next = (void *)old_high;
 	} else {
 		/* Comparison succeeded: the pair was, and still is, zero. */
 		out->gen = new_low;
 		out->next = (void *)new_high;
 	}
 	return (true);
 }
 
 /*
  * Atomically replace the 16-byte {gen, next} pair at *ptr with
  * *new_val, provided that it currently equals *old_val, using a locked
  * cmpxchg16b.  Returns true if the replacement took place.  *old_val is
  * not updated on failure.
  */
 static bool
 pmap_di_store_invl(struct pmap_invl_gen *ptr, struct pmap_invl_gen *old_val,
     struct pmap_invl_gen *new_val)
 {
 	uint64_t new_high, new_low, old_high, old_low;
 	char res;
 
 	new_low = new_val->gen;
 	new_high = (uintptr_t)new_val->next;
 	old_low = old_val->gen;
 	old_high = (uintptr_t)old_val->next;
 
 	__asm volatile("lock;cmpxchg16b\t%1;sete\t%0"
 	    : "=r" (res), "+m" (*ptr), "+a" (old_low), "+d" (old_high)
 	    : "b"(new_low), "c" (new_high)
 	    : "memory", "cc");
 	return (res);
 }
 
 #ifdef PV_STATS
 /*
  * Statistics for the lockless delayed invalidation code, maintained
  * only when PV_STATS is defined: the number of times entering and
  * leaving a DI block had to be retried, and the largest number of list
  * elements visited while entering a DI block.
  */
 static long invl_start_restart;
 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_start_restart, CTLFLAG_RD,
     &invl_start_restart, 0,
     "Number of delayed TLB invalidation request restarts");
 static long invl_finish_restart;
 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_finish_restart, CTLFLAG_RD,
     &invl_finish_restart, 0,
     "Number of delayed TLB invalidation completion restarts");
 static int invl_max_qlen;
 SYSCTL_INT(_vm_pmap, OID_AUTO, invl_max_qlen, CTLFLAG_RD,
     &invl_max_qlen, 0,
     "Maximum delayed TLB invalidation request queue length");
 #endif
 
 /*
  * Backoff configuration handed to lock_delay_arg_init() by the lockless
  * DI routines before they spin with lock_delay().
  */
 static struct lock_delay_config __read_frequently di_delay;
 LOCK_DELAY_SYSINIT_DEFAULT(di_delay);
 
 /*
  * Lockless variant of pmap_delayed_invl_start(): append the current
  * thread's invl_gen element to the tail of the list rooted at
  * pmap_invl_gen_head, giving it a generation one greater than that of
  * the previous tail.  The whole operation is restarted, after a
  * lock_delay() backoff, whenever an element marked with
  * PMAP_INVL_GEN_NEXT_INVALID is encountered or the tail changes
  * underneath us.
  */
 static void
 pmap_delayed_invl_start_u(void)
 {
 	struct pmap_invl_gen *invl_gen, *p, prev, new_prev;
 	struct thread *td;
 	struct lock_delay_arg lda;
 	uintptr_t prevl;
 	u_char pri;
 #ifdef PV_STATS
 	int i, ii;
 #endif
 
 	td = curthread;
 	invl_gen = &td->td_md.md_invl_gen;
 	PMAP_ASSERT_NOT_IN_DI();
 	lock_delay_arg_init(&lda, &di_delay);
 	/*
 	 * If the thread's base priority value is numerically greater than
 	 * PVM, set it to PVM for the duration of the DI block.  The old
 	 * value is remembered in saved_pri, where zero means "unchanged",
 	 * and is restored by pmap_delayed_invl_finish_u().  The priority
 	 * is re-read under the thread lock before it is changed.
 	 */
 	invl_gen->saved_pri = 0;
 	pri = td->td_base_pri;
 	if (pri > PVM) {
 		thread_lock(td);
 		pri = td->td_base_pri;
 		if (pri > PVM) {
 			invl_gen->saved_pri = pri;
 			sched_prio(td, PVM);
 		}
 		thread_unlock(td);
 	}
 again:
 	PV_STAT(i = 0);
 	/* Walk to the tail: the element whose next pointer is NULL. */
 	for (p = &pmap_invl_gen_head;; p = prev.next) {
 		PV_STAT(i++);
 		prevl = atomic_load_ptr(&p->next);
 		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
 			PV_STAT(atomic_add_long(&invl_start_restart, 1));
 			lock_delay(&lda);
 			goto again;
 		}
 		if (prevl == 0)
 			break;
 		prev.next = (void *)prevl;
 	}
 #ifdef PV_STATS
 	/* Record the longest walk seen so far; losing the race is fine. */
 	if ((ii = invl_max_qlen) < i)
 		atomic_cmpset_int(&invl_max_qlen, ii, i);
 #endif
 
 	/* Snapshot the tail atomically and check that it still is the tail. */
 	if (!pmap_di_load_invl(p, &prev) || prev.next != NULL) {
 		PV_STAT(atomic_add_long(&invl_start_restart, 1));
 		lock_delay(&lda);
 		goto again;
 	}
 
 	new_prev.gen = prev.gen;
 	new_prev.next = invl_gen;
 	invl_gen->gen = prev.gen + 1;
 
 	/* Formal fence between store to invl->gen and updating *p. */
 	atomic_thread_fence_rel();
 
 	/*
 	 * After inserting an invl_gen element with the invalid bit set,
 	 * this thread blocks any other thread trying to enter the
 	 * delayed invalidation block.  Do not allow this thread to be
 	 * removed from the CPU, because that would starve other threads.
 	 */
 	critical_enter();
 
 	/*
 	 * ABA for *p is not possible here, since p->gen can only
 	 * increase.  So if the *p thread finished its di, then
 	 * started a new one and got inserted into the list at the
 	 * same place, its gen will appear greater than the previously
 	 * read gen.
 	 */
 	if (!pmap_di_store_invl(p, &prev, &new_prev)) {
 		critical_exit();
 		PV_STAT(atomic_add_long(&invl_start_restart, 1));
 		lock_delay(&lda);
 		goto again;
 	}
 
 	/*
 	 * Here we clear PMAP_INVL_GEN_NEXT_INVALID in
 	 * invl_gen->next, allowing other threads to iterate past us.
 	 * pmap_di_store_invl() provides a fence between the generation
 	 * write and the update of next.
 	 */
 	invl_gen->next = NULL;
 	critical_exit();
 }
 
 /*
  * Try to unlink invl_gen from the DI list by atomically updating its
  * predecessor p: the predecessor takes over invl_gen's generation and
  * its next pointer (with the invalid bit stripped).  Called by
  * pmap_delayed_invl_finish_u() inside a critical section, after it has
  * set PMAP_INVL_GEN_NEXT_INVALID in invl_gen->next.
  *
  * Returns false, without modifying the list, when p is itself marked
  * invalid, no longer points at invl_gen, or changed between being read
  * and being updated; the caller then retries.
  */
 static bool
 pmap_delayed_invl_finish_u_crit(struct pmap_invl_gen *invl_gen,
     struct pmap_invl_gen *p)
 {
 	struct pmap_invl_gen prev, new_prev;
 	u_long mygen;
 
 	/*
 	 * Load invl_gen->gen after setting invl_gen->next
 	 * PMAP_INVL_GEN_NEXT_INVALID.  This prevents larger
 	 * generations from propagating to our invl_gen->gen.  The lock
 	 * prefix in atomic_set_ptr() works as a seq_cst fence.
 	 */
 	mygen = atomic_load_long(&invl_gen->gen);
 
 	if (!pmap_di_load_invl(p, &prev) || prev.next != invl_gen)
 		return (false);
 
 	KASSERT(prev.gen < mygen,
 	    ("invalid di gen sequence %lu %lu", prev.gen, mygen));
 	new_prev.gen = mygen;
 	new_prev.next = (void *)((uintptr_t)invl_gen->next &
 	    ~PMAP_INVL_GEN_NEXT_INVALID);
 
 	/* Formal fence between load of prev and storing update to it. */
 	atomic_thread_fence_rel();
 
 	return (pmap_di_store_invl(p, &prev, &new_prev));
 }
 
 /*
  * Lockless variant of pmap_delayed_invl_finish(): remove the current
  * thread's invl_gen element from the list rooted at
  * pmap_invl_gen_head, passing its generation on to its predecessor,
  * then wake any waiters and restore the thread priority saved by
  * pmap_delayed_invl_start_u().  The search and removal are restarted,
  * after a lock_delay() backoff, whenever they race with another thread.
  */
 static void
 pmap_delayed_invl_finish_u(void)
 {
 	struct pmap_invl_gen *invl_gen, *p;
 	struct thread *td;
 	struct lock_delay_arg lda;
 	uintptr_t prevl;
 
 	td = curthread;
 	invl_gen = &td->td_md.md_invl_gen;
 	KASSERT(invl_gen->gen != 0, ("missed invl_start: gen 0"));
 	KASSERT(((uintptr_t)invl_gen->next & PMAP_INVL_GEN_NEXT_INVALID) == 0,
 	    ("missed invl_start: INVALID"));
 	lock_delay_arg_init(&lda, &di_delay);
 
 again:
 	/* Find our predecessor: the element whose next pointer is us. */
 	for (p = &pmap_invl_gen_head; p != NULL; p = (void *)prevl) {
 		prevl = atomic_load_ptr(&p->next);
 		if ((prevl & PMAP_INVL_GEN_NEXT_INVALID) != 0) {
 			PV_STAT(atomic_add_long(&invl_finish_restart, 1));
 			lock_delay(&lda);
 			goto again;
 		}
 		if ((void *)prevl == invl_gen)
 			break;
 	}
 
 	/*
 	 * It is legitimate to not find ourself on the list if a
 	 * thread before us finished its DI and started it again.
 	 */
 	if (__predict_false(p == NULL)) {
 		PV_STAT(atomic_add_long(&invl_finish_restart, 1));
 		lock_delay(&lda);
 		goto again;
 	}
 
 	/*
 	 * Mark our element invalid so that other threads do not iterate
 	 * past it, then attempt the unlink.  On failure the mark is
 	 * removed again before retrying.
 	 */
 	critical_enter();
 	atomic_set_ptr((uintptr_t *)&invl_gen->next,
 	    PMAP_INVL_GEN_NEXT_INVALID);
 	if (!pmap_delayed_invl_finish_u_crit(invl_gen, p)) {
 		atomic_clear_ptr((uintptr_t *)&invl_gen->next,
 		    PMAP_INVL_GEN_NEXT_INVALID);
 		critical_exit();
 		PV_STAT(atomic_add_long(&invl_finish_restart, 1));
 		lock_delay(&lda);
 		goto again;
 	}
 	critical_exit();
 	/* Wake threads blocked in pmap_delayed_invl_wait_u(), if any. */
 	if (atomic_load_int(&pmap_invl_waiters) > 0)
 		pmap_delayed_invl_finish_unblock(0);
 	/* Undo the priority change made by pmap_delayed_invl_start_u(). */
 	if (invl_gen->saved_pri != 0) {
 		thread_lock(td);
 		sched_prio(td, invl_gen->saved_pri);
 		thread_unlock(td);
 	}
 }
 
 #ifdef DDB
 /*
  * DDB "show di_queue" command: print one line per element of the
  * lockless DI list, starting with pmap_invl_gen_head.  Each line shows
  * the element's generation, whether its next pointer carries
  * PMAP_INVL_GEN_NEXT_INVALID, and the owning thread and its tid (NULL
  * and -1 for the list head, which belongs to no thread).
  */
 DB_SHOW_COMMAND(di_queue, pmap_di_queue)
 {
 	struct pmap_invl_gen *p, *pn;
 	struct thread *td;
 	uintptr_t nextl;
 	bool first;
 
 	for (p = &pmap_invl_gen_head, first = true; p != NULL; p = pn,
 	    first = false) {
 		nextl = atomic_load_ptr(&p->next);
 		/* Strip the invalid bit to obtain the real successor. */
 		pn = (void *)(nextl & ~PMAP_INVL_GEN_NEXT_INVALID);
 		td = first ? NULL : __containerof(p, struct thread,
 		    td_md.md_invl_gen);
 		db_printf("gen %lu inv %d td %p tid %d\n", p->gen,
 		    (nextl & PMAP_INVL_GEN_NEXT_INVALID) != 0, td,
 		    td != NULL ? td->td_tid : -1);
 	}
 }
 #endif
 
 #ifdef PV_STATS
 /*
  * Statistics updated by the pmap_delayed_invl_wait_*() functions,
  * maintained only when PV_STATS is defined.
  */
 static long invl_wait;
 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait, CTLFLAG_RD, &invl_wait, 0,
     "Number of times DI invalidation blocked pmap_remove_all/write");
 static long invl_wait_slow;
 SYSCTL_LONG(_vm_pmap, OID_AUTO, invl_wait_slow, CTLFLAG_RD, &invl_wait_slow, 0,
     "Number of slow invalidation waits for lockless DI");
 #endif
 
 /*
  * Return a pointer to the DI generation slot shared by all pages whose
  * physical address hashes to the same pv_invl_gen[] index as page m.
  */
 static u_long *
 pmap_delayed_invl_genp(vm_page_t m)
 {
 	u_long slot;
 
 	slot = pa_index(VM_PAGE_TO_PHYS(m)) % NPV_LIST_LOCKS;
 	return (&pv_invl_gen[slot]);
 }
 
 /*
  * Callout handler armed by pmap_delayed_invl_wait_u(): if any thread
  * is registered as a waiter, wake all threads blocked on the DI
  * turnstile.
  */
 static void
 pmap_delayed_invl_callout_func(void *arg __unused)
 {
 
 	if (atomic_load_int(&pmap_invl_waiters) != 0)
 		pmap_delayed_invl_finish_unblock(0);
 }
 
 /*
  * SYSINIT hook: set up the DI wakeup callout and record that fact.
  * Nothing is done when the locked DI variant is in use.
  */
 static void
 pmap_delayed_invl_callout_init(void *arg __unused)
 {
 
 	if (!pmap_di_locked()) {
 		callout_init(&pmap_invl_callout, 1);
 		pmap_invl_callout_inited = true;
 	}
 }
 SYSINIT(pmap_di_callout, SI_SUB_CPU + 1, SI_ORDER_ANY,
     pmap_delayed_invl_callout_init, NULL);
 
 /*
  * Ensure that all currently executing DI blocks, that need to flush
  * TLB for the given page m, actually flushed the TLB at the time the
  * function returned.  If the page m has an empty PV list and we call
  * pmap_delayed_invl_wait(), upon its return we know that no CPU has a
  * valid mapping for the page m in either its page table or TLB.
  *
  * This function works by blocking until the global DI generation
  * number catches up with the generation number associated with the
  * given page m and its PV list.  Since this function's callers
  * typically own an object lock and sometimes own a page lock, it
  * cannot sleep.  Instead, it blocks on a turnstile to relinquish the
  * processor.
  *
  * This is the locked variant; it compares against the global
  * pmap_invl_gen.
  */
 static void
 pmap_delayed_invl_wait_l(vm_page_t m)
 {
 	u_long *m_gen;
 #ifdef PV_STATS
 	bool accounted = false;
 #endif
 
 	m_gen = pmap_delayed_invl_genp(m);
 	while (*m_gen > pmap_invl_gen) {
 #ifdef PV_STATS
 		/* Count each call that had to wait once, not each retry. */
 		if (!accounted) {
 			atomic_add_long(&invl_wait, 1);
 			accounted = true;
 		}
 #endif
 		pmap_delayed_invl_wait_block(m_gen, &pmap_invl_gen);
 	}
 }
 
 /*
  * Lockless variant of pmap_delayed_invl_wait(): wait until the
  * generation of pmap_invl_gen_head is no longer below the generation
  * recorded for page m.  The first iteration, and every iteration while
  * the wakeup callout has not been initialized, backs off with
  * lock_delay(); later iterations block on the DI turnstile.
  */
 static void
 pmap_delayed_invl_wait_u(vm_page_t m)
 {
 	u_long *m_gen;
 	struct lock_delay_arg lda;
 	bool fast;
 
 	fast = true;
 	m_gen = pmap_delayed_invl_genp(m);
 	lock_delay_arg_init(&lda, &di_delay);
 	while (*m_gen > atomic_load_long(&pmap_invl_gen_head.gen)) {
 		if (fast || !pmap_invl_callout_inited) {
 			PV_STAT(atomic_add_long(&invl_wait, 1));
 			lock_delay(&lda);
 			fast = false;
 		} else {
 			/*
 			 * The page's invalidation generation number
 			 * is still below the current thread's number.
 			 * Prepare to block so that we do not waste
 			 * CPU cycles or worse, suffer livelock.
 			 *
 			 * Since it is impossible to block without
 			 * racing with pmap_delayed_invl_finish_u(),
 			 * prepare for the race by incrementing
 			 * pmap_invl_waiters and arming a 1-tick
 			 * callout which will unblock us if we lose
 			 * the race.
 			 */
 			atomic_add_int(&pmap_invl_waiters, 1);
 
 			/*
 			 * Re-check the current thread's invalidation
 			 * generation after incrementing
 			 * pmap_invl_waiters, so that there is no race
 			 * with pmap_delayed_invl_finish_u() setting
 			 * the page generation and checking
 			 * pmap_invl_waiters.  The only race allowed
 			 * is for a missed unblock, which is handled
 			 * by the callout.
 			 */
 			if (*m_gen >
 			    atomic_load_long(&pmap_invl_gen_head.gen)) {
 				callout_reset(&pmap_invl_callout, 1,
 				    pmap_delayed_invl_callout_func, NULL);
 				PV_STAT(atomic_add_long(&invl_wait_slow, 1));
 				pmap_delayed_invl_wait_block(m_gen,
 				    &pmap_invl_gen_head.gen);
 			}
 			atomic_add_int(&pmap_invl_waiters, -1);
 		}
 	}
 }
 
 /*
  * Resolver: bind pmap_thread_init_invl_gen() to the locked or the
  * lockless implementation according to pmap_di_locked().
  */
 DEFINE_IFUNC(, void, pmap_thread_init_invl_gen, (struct thread *))
 {
 
 	if (pmap_di_locked())
 		return (pmap_thread_init_invl_gen_l);
 	return (pmap_thread_init_invl_gen_u);
 }
 
 /*
  * Resolver: bind pmap_delayed_invl_start() to the locked or the
  * lockless implementation according to pmap_di_locked().
  */
 DEFINE_IFUNC(static, void, pmap_delayed_invl_start, (void))
 {
 
 	if (pmap_di_locked())
 		return (pmap_delayed_invl_start_l);
 	return (pmap_delayed_invl_start_u);
 }
 
 /*
  * Resolver: bind pmap_delayed_invl_finish() to the locked or the
  * lockless implementation according to pmap_di_locked().
  */
 DEFINE_IFUNC(static, void, pmap_delayed_invl_finish, (void))
 {
 
 	if (pmap_di_locked())
 		return (pmap_delayed_invl_finish_l);
 	return (pmap_delayed_invl_finish_u);
 }
 
 /*
  * Resolver: bind pmap_delayed_invl_wait() to the locked or the
  * lockless implementation according to pmap_di_locked().
  */
 DEFINE_IFUNC(static, void, pmap_delayed_invl_wait, (vm_page_t))
 {
 
 	if (pmap_di_locked())
 		return (pmap_delayed_invl_wait_l);
 	return (pmap_delayed_invl_wait_u);
 }
 
 /*
  * Record that page m's PV list takes part in the DI block of the
  * current thread.  Threads that concurrently use m's PV list to remove
  * or restrict all mappings of m will then wait until this DI block has
  * completed.
  *
  * This is done by raising the DI generation number kept for m's PV
  * list to the current thread's DI generation number, unless it is
  * already that high.  A caller of pmap_delayed_invl_wait() is thereby
  * made to block until the current thread calls
  * pmap_delayed_invl_finish().  Nothing is recorded when the thread's
  * generation number is zero.  The PV list lock of m must be
  * write-locked.
  */
 static void
 pmap_delayed_invl_page(vm_page_t m)
 {
 	u_long mygen, *pgenp;
 
 	rw_assert(VM_PAGE_TO_PV_LIST_LOCK(m), RA_WLOCKED);
 	mygen = curthread->td_md.md_invl_gen.gen;
 	if (mygen != 0) {
 		pgenp = pmap_delayed_invl_genp(m);
 		if (*pgenp < mygen)
 			*pgenp = mygen;
 	}
 }
 
 /*
  * Crashdump maps.
  */
 static caddr_t crashdumpmap;
 
 /*
  * Internal flags for pmap_enter()'s helper functions.
  *
  * NOTE(review): the values are presumably chosen so as not to collide
  * with the public PMAP_ENTER_* flags declared elsewhere -- confirm
  * before adding new ones.
  */
 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
 
 /*
  * Forward declarations of file-local functions: first the PV entry and
  * PV list management helpers, then the page table manipulation
  * helpers.  The promotion helpers are compiled only when
  * VM_NRESERVLEVEL > 0.
  */
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
 static int	popcnt_pc_map_pq(uint64_t *map);
 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
 static void	reserve_pv_entries(pmap_t pmap, int needed,
 		    struct rwlock **lockp);
 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 		    struct rwlock **lockp);
 static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
 		    u_int flags, struct rwlock **lockp);
 #if VM_NRESERVLEVEL > 0
 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
 		    struct rwlock **lockp);
 #endif
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
 
 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode,
     bool noflush);
 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
 static boolean_t pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde,
     vm_offset_t va, struct rwlock **lockp);
 static boolean_t pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe,
     vm_offset_t va);
 static bool	pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
 		    vm_prot_t prot, struct rwlock **lockp);
 static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
 		    u_int flags, vm_page_t m, struct rwlock **lockp);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted);
 static void pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva,
     vm_offset_t eva);
 static void pmap_invalidate_cache_range_all(vm_offset_t sva,
     vm_offset_t eva);
 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
 		    pd_entry_t pde);
 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
 static vm_page_t pmap_large_map_getptp_unlocked(void);
 static vm_paddr_t pmap_large_map_kextract(vm_offset_t va);
 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask);
 #if VM_NRESERVLEVEL > 0
 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
     struct rwlock **lockp);
 #endif
 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
     vm_prot_t prot);
 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask);
 static void pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva,
     bool exec);
 static pdp_entry_t *pmap_pti_pdpe(vm_offset_t va);
 static pd_entry_t *pmap_pti_pde(vm_offset_t va);
 static void pmap_pti_wire_pte(void *pte);
 static int pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free, struct rwlock **lockp);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
 static void pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     struct spglist *free);
 static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 		    pd_entry_t *pde, struct spglist *free,
 		    struct rwlock **lockp);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m, struct rwlock **lockp);
 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     pd_entry_t newpde);
 static void pmap_update_pde_invalidate(pmap_t, vm_offset_t va, pd_entry_t pde);
 
 static vm_page_t _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex,
 		struct rwlock **lockp);
 static vm_page_t pmap_allocpde(pmap_t pmap, vm_offset_t va,
 		struct rwlock **lockp);
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va,
 		struct rwlock **lockp);
 
 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct spglist *free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 
 /********************/
 /* Inline functions */
 /********************/
 
 /*
  * Convert a virtual address to its page directory index.  The result is
  * simply va shifted right by PDRSHIFT; it is not masked down to the
  * range of a single page directory page.
  */
 static __inline vm_pindex_t
 pmap_pde_pindex(vm_offset_t va)
 {
 	vm_pindex_t pindex;
 
 	pindex = va >> PDRSHIFT;
 	return (pindex);
 }
 
 
 /*
  * Locate the PML4 entry for va within pmap's top-level table.  The
  * entry is not inspected, so the returned pointer is never NULL.
  */
 static __inline pml4_entry_t *
 pmap_pml4e(pmap_t pmap, vm_offset_t va)
 {
 	pml4_entry_t *pml4_base;
 
 	pml4_base = pmap->pm_pml4;
 	return (pml4_base + pmap_pml4e_index(va));
 }
 
 /*
  * Given a PML4 entry, return a pointer to the PDP entry for va.  The
  * PDP page is reached through the direct map using the frame address
  * held in *pml4e; the caller is responsible for the entry being valid.
  */
 static __inline pdp_entry_t *
 pmap_pml4e_to_pdpe(pml4_entry_t *pml4e, vm_offset_t va)
 {
 	pdp_entry_t *pdp_base;
 
 	pdp_base = (pdp_entry_t *)PHYS_TO_DMAP(*pml4e & PG_FRAME);
 	return (pdp_base + pmap_pdpe_index(va));
 }
 
 /*
  * Walk from pmap's PML4 to the PDP entry for va.  Returns NULL when the
  * PML4 entry covering va does not have the pmap's valid bit set.
  */
 static __inline pdp_entry_t *
 pmap_pdpe(pmap_t pmap, vm_offset_t va)
 {
 	pt_entry_t valid;
 	pml4_entry_t *pml4e;
 
 	valid = pmap_valid_bit(pmap);
 	pml4e = pmap_pml4e(pmap, va);
 	return ((*pml4e & valid) != 0 ? pmap_pml4e_to_pdpe(pml4e, va) : NULL);
 }
 
 /*
  * Given a PDP entry, return a pointer to the PD entry for va.  The PD
  * page is reached through the direct map using the frame address held
  * in *pdpe; the caller is responsible for the entry being valid.
  */
 static __inline pd_entry_t *
 pmap_pdpe_to_pde(pdp_entry_t *pdpe, vm_offset_t va)
 {
 	pd_entry_t *pd_base;
 
 	pd_base = (pd_entry_t *)PHYS_TO_DMAP(*pdpe & PG_FRAME);
 	return (pd_base + pmap_pde_index(va));
 }
 
 /*
  * Walk from pmap's PML4 to the PD entry for va.  Returns NULL when
  * either the PML4 entry or the PDP entry on the path is not valid.
  */
 static __inline pd_entry_t *
 pmap_pde(pmap_t pmap, vm_offset_t va)
 {
 	pt_entry_t valid;
 	pdp_entry_t *pdpe;
 
 	valid = pmap_valid_bit(pmap);
 	pdpe = pmap_pdpe(pmap, va);
 	if (pdpe != NULL && (*pdpe & valid) != 0)
 		return (pmap_pdpe_to_pde(pdpe, va));
 	return (NULL);
 }
 
 /*
  * Given a PD entry, return a pointer to the PT entry for va.  The page
  * table page is reached through the direct map using the frame address
  * held in *pde; the caller is responsible for the entry being valid.
  */
 static __inline pt_entry_t *
 pmap_pde_to_pte(pd_entry_t *pde, vm_offset_t va)
 {
 	pt_entry_t *pt_base;
 
 	pt_base = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 	return (pt_base + pmap_pte_index(va));
 }
 
 /*
  * Walk from pmap's PML4 to the PT entry for va.  Returns NULL when any
  * level on the path is not valid.  When the PD entry has PG_PS set, the
  * PD entry itself is returned (cast to a PTE pointer), matching the
  * behavior of the i386 pmap_pte().
  */
 static __inline pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va)
 {
 	pt_entry_t valid;
 	pd_entry_t *pde;
 
 	valid = pmap_valid_bit(pmap);
 	pde = pmap_pde(pmap, va);
 	if (pde != NULL && (*pde & valid) != 0) {
 		/* A superpage mapping has no page table page below it. */
 		if ((*pde & PG_PS) == 0)
 			return (pmap_pde_to_pte(pde, va));
 		return ((pt_entry_t *)pde);
 	}
 	return (NULL);
 }
 
 /*
  * Add count to pmap's resident page count.  The pmap lock must be held.
  */
 static __inline void
 pmap_resident_count_inc(pmap_t pmap, int count)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pmap->pm_stats.resident_count = pmap->pm_stats.resident_count + count;
 }
 
 /*
  * Subtract count from pmap's resident page count.  The pmap lock must
  * be held, and (on kernels with assertions enabled) the counter must
  * not drop below zero.
  */
 static __inline void
 pmap_resident_count_dec(pmap_t pmap, int count)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(pmap->pm_stats.resident_count >= count,
 	    ("pmap %p resident count underflow %ld %d", pmap,
 	    pmap->pm_stats.resident_count, count));
 	pmap->pm_stats.resident_count = pmap->pm_stats.resident_count - count;
 }
 
 /*
  * Return the address, within the recursive PTmap window, of the PTE
  * that maps the kernel virtual address va.  The page number of va is
  * masked to the combined width of all four paging-level indices before
  * being used as an index into PTmap.  Only addresses at or above
  * VM_MAXUSER_ADDRESS are accepted.
  */
 PMAP_INLINE pt_entry_t *
 vtopte(vm_offset_t va)
 {
 	u_int64_t idx_mask, pte_idx;
 
 	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopte on a uva/gpa 0x%0lx", va));
 
 	idx_mask = (1ul << (NPTEPGSHIFT + NPDEPGSHIFT + NPDPEPGSHIFT +
 	    NPML4EPGSHIFT)) - 1;
 	pte_idx = (va >> PAGE_SHIFT) & idx_mask;
 	return (PTmap + pte_idx);
 }
 
 /*
  * Return the address, within the recursive PDmap window, of the PDE
  * that maps the kernel virtual address va.  The superpage number of va
  * is masked to the combined width of the PD, PDP and PML4 indices
  * before being used as an index into PDmap.  Only addresses at or above
  * VM_MAXUSER_ADDRESS are accepted.
  */
 static __inline pd_entry_t *
 vtopde(vm_offset_t va)
 {
 	u_int64_t idx_mask, pde_idx;
 
 	KASSERT(va >= VM_MAXUSER_ADDRESS, ("vtopde on a uva/gpa 0x%0lx", va));
 
 	idx_mask = (1ul << (NPDEPGSHIFT + NPDPEPGSHIFT + NPML4EPGSHIFT)) - 1;
 	pde_idx = (va >> PDRSHIFT) & idx_mask;
 	return (PDmap + pde_idx);
 }
 
 /*
  * Bootstrap page allocator: take n pages starting at *firstaddr, zero
  * them, advance *firstaddr past them, and return the address of the
  * first page.  The address is dereferenced directly by bzero(), so it
  * must be usable as a pointer at the time of the call.
  */
 static u_int64_t
 allocpages(vm_paddr_t *firstaddr, int n)
 {
 	u_int64_t base;
 	int nbytes;
 
 	nbytes = n * PAGE_SIZE;
 	base = *firstaddr;
 	bzero((void *)base, nbytes);
 	*firstaddr += nbytes;
 	return (base);
 }
 
 /* Compile-time check: the number of direct map PML4 slots is a power of 2. */
 CTASSERT(powerof2(NDMPML4E));
 
 /*
  * number of kernel PDP slots
  *
  * Computed as the number of page directory pages (NPDEPG entries each)
  * needed to reference ptpgs page table pages, rounded up.
  */
 #define	NKPDPE(ptpgs)		howmany(ptpgs, NPDEPG)
 
 /*
  * Set nkpt, the number of kernel page table pages allocated at boot.
  *
  * When NKPT is defined at build time it is used verbatim.  Otherwise
  * the count is derived from addr: one page table page per 2MB
  * (1 << PDRSHIFT) of address up to addr, plus NKPDPE() of that count,
  * plus a fixed amount of slop.
  */
 static void
 nkpt_init(vm_paddr_t addr)
 {
 #ifdef NKPT
 	nkpt = NKPT;
 #else
 	int npd, npt;
 
 	npt = howmany(addr, 1 << PDRSHIFT);
 	npd = NKPDPE(npt);
 
 	/*
 	 * Pad the bare minimum needed to bootstrap the kernel with some
 	 * slop, for two reasons.
 	 *
 	 * First, kernel modules must be linked into the negative 2GB of
 	 * the address space.  Running out of KVA there would force
 	 * pmap_growkernel() to allocate page table pages covering the
 	 * entire 512GB of KVA space, which is an unnecessary tax on
 	 * physical memory.
 	 *
 	 * Second, device memory mapped while setting up the low-level
 	 * console(s) comes out of KVA starting at virtual_avail, because
 	 * cninit() runs after pmap_bootstrap() but before vm_init() and
 	 * pmap_init().  A 20MB frame buffer is not uncommon.
 	 */
 	nkpt = npt + npd + 32;	/* 64MB additional slop. */
 #endif
 }
 
 /*
  * Compute the write/execute permission bits for a physical page that
  * belongs to the initial boot allocations.  The decision is made at 2M
  * granularity because that is how the kernel space is mapped.
  *
  *   - outside the kernel image (below the 2M page holding btext, or at
  *     or above the 2M page holding _end): read-write, executable
  *   - at or above the 2M page holding brwsection: read-write, no-execute
  *   - below the 2M-rounded end of text: read-only, executable
  *   - otherwise (read-only data): read-only, no-execute
  *
  * Note that this doesn't currently provide any protection for modules.
  */
 static inline pt_entry_t
 bootaddr_rwx(vm_paddr_t pa)
 {
 
 	/*
 	 * Everything sharing a 2M page with the start of the kernel is
 	 * expected to be static.  The 2M page containing the end of the
 	 * kernel, however, may hold read-write/executable contents since
 	 * the image need not end on a 2M boundary, so it is treated as
 	 * being outside the kernel.
 	 */
 	if (pa >= trunc_2mpage(btext - KERNBASE) &&
 	    pa < trunc_2mpage(_end - KERNBASE)) {
 		/*
 		 * The linker is expected to keep the read-only and
 		 * read-write portions in different 2M pages, so this
 		 * should not weaken read-only data.  Regardless, a page
 		 * holding read-write data has to be writable.
 		 */
 		if (pa >= trunc_2mpage(brwsection - KERNBASE))
 			return (X86_PG_RW | pg_nx);
 
 		/*
 		 * Any 2M page containing kernel text is read-only and
 		 * executable; remaining read-only data is read-only and
 		 * not executable.  (A small part of the read-only data
 		 * section will likely end up executable.  That is
 		 * acceptable because the read-only protection still
 		 * prevents modification.)  Fixups to .text keep working
 		 * until CR0.WP is set.
 		 */
 		if (pa < round_2mpage(etext - KERNBASE))
 			return (0);
 		return (pg_nx);
 	}
 	return (X86_PG_RW);
 }
 
 /*
  * Build the initial kernel page tables.
  *
  * All page table pages are carved out of physical memory starting at
  * *firstaddr via allocpages(); *firstaddr is advanced past everything
  * allocated and, if necessary, up to the 2M-rounded end of the
  * preloaded kernel (KERNend).  The function sets up:
  *
  *   - the direct map (DMPDPphys/DMPDphys, plus DMPDkernphys when 1GB
  *     pages are available), and records its extent in dmaplimit;
  *   - the kernel map (KPML4phys, KPDPphys, KPDphys, KPTphys), with
  *     physical memory from zero to KERNend mapped by 2MB pages whose
  *     permissions come from bootaddr_rwx();
  *   - the recursive PML4 slot used for PTmap.
  *
  * The tables are written through their physical addresses, so this
  * must run while those addresses are directly dereferenceable.
  */
 static void
 create_pagetables(vm_paddr_t *firstaddr)
 {
 	int i, j, ndm1g, nkpdpe, nkdmpde;
 	vm_paddr_t pa;
 	pd_entry_t *pd_p;
 	pdp_entry_t *pdp_p;
 	pml4_entry_t *p4_p;
 	uint64_t DMPDkernphys;
 
 	/* Allocate page table pages for the direct map */
 	ndmpdp = howmany(ptoa(Maxmem), NBPDP);
 	if (ndmpdp < 4)		/* Minimum 4GB of dirmap */
 		ndmpdp = 4;
 	ndmpdpphys = howmany(ndmpdp, NPDPEPG);
 	if (ndmpdpphys > NDMPML4E) {
 		/*
 		 * Each NDMPML4E allows 512 GB, so limit to that,
 		 * and then readjust ndmpdp and ndmpdpphys.
 		 */
 		printf("NDMPML4E limits system to %d GB\n", NDMPML4E * 512);
 		Maxmem = atop(NDMPML4E * NBPML4);
 		ndmpdpphys = NDMPML4E;
 		ndmpdp = NDMPML4E * NPDEPG;
 	}
 	DMPDPphys = allocpages(firstaddr, ndmpdpphys);
 	ndm1g = 0;
 	if ((amd_feature & AMDID_PAGE1GB) != 0) {
 		/*
 		 * Calculate the number of 1G pages that will fully fit in
 		 * Maxmem.
 		 */
 		ndm1g = ptoa(Maxmem) >> PDPSHIFT;
 
 		/*
 		 * Allocate 2M pages for the kernel. These will be used in
 		 * place of the first one or more 1G pages from ndm1g.
 		 */
 		nkdmpde = howmany((vm_offset_t)(brwsection - KERNBASE), NBPDP);
 		DMPDkernphys = allocpages(firstaddr, nkdmpde);
 	}
 	/* PD pages are needed only for the part not covered by 1G pages. */
 	if (ndm1g < ndmpdp)
 		DMPDphys = allocpages(firstaddr, ndmpdp - ndm1g);
 	dmaplimit = (vm_paddr_t)ndmpdp << PDPSHIFT;
 
 	/* Allocate pages */
 	KPML4phys = allocpages(firstaddr, 1);
 	KPDPphys = allocpages(firstaddr, NKPML4E);
 
 	/*
 	 * Allocate the initial number of kernel page table pages required to
 	 * bootstrap.  We defer this until after all memory-size dependent
 	 * allocations are done (e.g. direct map), so that we don't have to
 	 * build in too much slop in our estimate.
 	 *
 	 * Note that when NKPML4E > 1, we have an empty page underneath
 	 * all but the KPML4I'th one, so we need NKPML4E-1 extra (zeroed)
 	 * pages.  (pmap_enter requires a PD page to exist for each KPML4E.)
 	 */
 	nkpt_init(*firstaddr);
 	nkpdpe = NKPDPE(nkpt);
 
 	KPTphys = allocpages(firstaddr, nkpt);
 	KPDphys = allocpages(firstaddr, nkpdpe);
 
 	/*
 	 * Connect the zero-filled PT pages to their PD entries.  This
 	 * implicitly maps the PT pages at their correct locations within
 	 * the PTmap.
 	 */
 	pd_p = (pd_entry_t *)KPDphys;
 	for (i = 0; i < nkpt; i++)
 		pd_p[i] = (KPTphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 
 	/*
 	 * Map from physical address zero to the end of loader preallocated
 	 * memory using 2MB pages.  This replaces some of the PD entries
 	 * created above.
 	 *
 	 * The index is widened to vm_paddr_t before shifting so that the
 	 * physical address is never computed in (signed) int arithmetic.
 	 */
 	for (i = 0; ((vm_paddr_t)i << PDRSHIFT) < KERNend; i++) {
 		pa = (vm_paddr_t)i << PDRSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
 		pd_p[i] = pa | X86_PG_V | PG_PS | pg_g |
 		    X86_PG_M | X86_PG_A | bootaddr_rwx(pa);
 	}
 
 	/*
 	 * Because we map the physical blocks in 2M pages, adjust firstaddr
 	 * to record the physical blocks we've actually mapped into kernel
 	 * virtual address space.
 	 */
 	if (*firstaddr < round_2mpage(KERNend))
 		*firstaddr = round_2mpage(KERNend);
 
 	/* And connect up the PD to the PDP (leaving room for L4 pages) */
 	pdp_p = (pdp_entry_t *)(KPDPphys + ptoa(KPML4I - KPML4BASE));
 	for (i = 0; i < nkpdpe; i++)
 		pdp_p[i + KPDPI] = (KPDphys + ptoa(i)) | X86_PG_RW | X86_PG_V;
 
 	/*
 	 * Now, set up the direct map region using 2MB and/or 1GB pages.  If
 	 * the end of physical memory is not aligned to a 1GB page boundary,
 	 * then the residual physical memory is mapped with 2MB pages.  Later,
 	 * if pmap_mapdev{_attr}() uses the direct map for non-write-back
 	 * memory, pmap_change_attr() will demote any 2MB or 1GB page mappings
 	 * that are partially used.
 	 */
 	pd_p = (pd_entry_t *)DMPDphys;
 	for (i = NPDEPG * ndm1g, j = 0; i < NPDEPG * ndmpdp; i++, j++) {
 		pd_p[j] = (vm_paddr_t)i << PDRSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
 		pd_p[j] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
 		    X86_PG_M | X86_PG_A | pg_nx;
 	}
 	pdp_p = (pdp_entry_t *)DMPDPphys;
 	for (i = 0; i < ndm1g; i++) {
 		pdp_p[i] = (vm_paddr_t)i << PDPSHIFT;
 		/* Preset PG_M and PG_A because demotion expects it. */
 		pdp_p[i] |= X86_PG_RW | X86_PG_V | PG_PS | pg_g |
 		    X86_PG_M | X86_PG_A | pg_nx;
 	}
 	/* Continue from i == ndm1g: point the rest at the 2MB PD pages. */
 	for (j = 0; i < ndmpdp; i++, j++) {
 		pdp_p[i] = DMPDphys + ptoa(j);
 		pdp_p[i] |= X86_PG_RW | X86_PG_V;
 	}
 
 	/*
 	 * Instead of using a 1G page for the memory containing the kernel,
 	 * use 2M pages with appropriate permissions. (If using 1G pages,
 	 * this will partially overwrite the PDPEs above.)
 	 */
 	if (ndm1g) {
 		pd_p = (pd_entry_t *)DMPDkernphys;
 		for (i = 0; i < (NPDEPG * nkdmpde); i++) {
 			/* Widen before shifting; see the kernel map loop. */
 			pa = (vm_paddr_t)i << PDRSHIFT;
 			pd_p[i] = pa | X86_PG_V | PG_PS | pg_g |
 			    X86_PG_M | X86_PG_A | pg_nx |
 			    bootaddr_rwx(pa);
 		}
 		for (i = 0; i < nkdmpde; i++)
 			pdp_p[i] = (DMPDkernphys + ptoa(i)) | X86_PG_RW |
 			    X86_PG_V;
 	}
 
 	/* And recursively map PML4 to itself in order to get PTmap */
 	p4_p = (pml4_entry_t *)KPML4phys;
 	p4_p[PML4PML4I] = KPML4phys;
 	p4_p[PML4PML4I] |= X86_PG_RW | X86_PG_V | pg_nx;
 
 	/* Connect the Direct Map slot(s) up to the PML4. */
 	for (i = 0; i < ndmpdpphys; i++) {
 		p4_p[DMPML4I + i] = DMPDPphys + ptoa(i);
 		p4_p[DMPML4I + i] |= X86_PG_RW | X86_PG_V;
 	}
 
 	/* Connect the KVA slots up to the PML4 */
 	for (i = 0; i < NKPML4E; i++) {
 		p4_p[KPML4BASE + i] = KPDPphys + ptoa(i);
 		p4_p[KPML4BASE + i] |= X86_PG_RW | X86_PG_V;
 	}
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	On amd64 this is called after mapping has already been enabled
  *	and just syncs the pmap module with what has already been done.
  *	[We can't call it easily with mapping off since the kernel is not
  *	mapped with PA == VA, hence we would have to relocate every address
  *	from the linked base (virtual) address "KERNBASE" to the actual
  *	(physical) address starting relative to 0]
  *
  *	firstaddr points at the first free physical address on entry (it
  *	is recorded as KERNend) and is advanced by create_pagetables()
  *	past the memory consumed by the initial page tables.
  */
 void
 pmap_bootstrap(vm_paddr_t *firstaddr)
 {
 	vm_offset_t va;
 	pt_entry_t *pte;
 	uint64_t cr4;
 	u_long res;
 	int i;
 
 	KERNend = *firstaddr;
 	/* Pages between kernphys and KERNend; becomes the resident count. */
 	res = atop(KERNend - (vm_paddr_t)kernphys);
 
 	/* Global mappings are used only when PTI is disabled. */
 	if (!pti)
 		pg_g = X86_PG_G;
 
 	/*
 	 * Create an initial set of page tables to run the kernel in.
 	 */
 	create_pagetables(firstaddr);
 
 	/*
 	 * Add a physical memory segment (vm_phys_seg) corresponding to the
 	 * preallocated kernel page table pages so that vm_page structures
 	 * representing these pages will be created.  The vm_page structures
 	 * are required for promotion of the corresponding kernel virtual
 	 * addresses to superpage mappings.
 	 */
 	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
 
 	/*
 	 * Account for the virtual addresses mapped by create_pagetables().
 	 */
 	virtual_avail = (vm_offset_t)KERNBASE + round_2mpage(KERNend);
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/*
 	 * Enable PG_G global pages, then switch to the kernel page
 	 * table from the bootstrap page table.  After the switch, it
 	 * is possible to enable SMEP and SMAP since PG_U bits are
 	 * correct now.
 	 */
 	cr4 = rcr4();
 	cr4 |= CR4_PGE;
 	load_cr4(cr4);
 	load_cr3(KPML4phys);
 	if (cpu_stdext_feature & CPUID_STDEXT_SMEP)
 		cr4 |= CR4_SMEP;
 	if (cpu_stdext_feature & CPUID_STDEXT_SMAP)
 		cr4 |= CR4_SMAP;
 	load_cr4(cr4);
 
 	/*
 	 * Initialize the kernel pmap (which is statically allocated).
 	 * Count bootstrap data as being resident in case any of this data is
 	 * later unmapped (using pmap_remove()) and freed.
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pml4 = (pdp_entry_t *)PHYS_TO_DMAP(KPML4phys);
 	kernel_pmap->pm_cr3 = KPML4phys;
 	kernel_pmap->pm_ucr3 = PMAP_NO_CR3;
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 	kernel_pmap->pm_stats.resident_count = res;
 	kernel_pmap->pm_flags = pmap_flags;
 
  	/*
 	 * Initialize the TLB invalidations generation number lock.
 	 */
 	mtx_init(&invl_gen_mtx, "invlgn", NULL, MTX_DEF);
 
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
 	 *
 	 * SYSMAP(c, p, v, n) assigns the current VA (cast to type c) to v
 	 * and the current PTE pointer to p, then advances both cursors
 	 * ("va" and "pte") by n pages / n entries.
 	 */
 #define	SYSMAP(c, p, v, n)	\
 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 
 	va = virtual_avail;
 	pte = vtopte(va);
 
 	/*
 	 * Crashdump maps.  The first page is reused as CMAP1 for the
 	 * memory test.
 	 */
 	SYSMAP(caddr_t, CMAP1, crashdumpmap, MAXDUMPPGS)
 	CADDR1 = crashdumpmap;
 
 	/* The VA handed out above is no longer generally available. */
 	virtual_avail = va;
 
 	/*
 	 * Initialize the PAT MSR.
 	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
 	 * side-effect, invalidates stale PG_G TLB entries that might
 	 * have been created in our pre-boot environment.
 	 */
 	pmap_init_pat();
 
 	/* Initialize TLB Context Id. */
 	if (pmap_pcid_enabled) {
 		for (i = 0; i < MAXCPU; i++) {
 			kernel_pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN;
 			kernel_pmap->pm_pcids[i].pm_gen = 1;
 		}
 
 		/*
 		 * PMAP_PCID_KERN + 1 is used for initialization of
 		 * proc0 pmap.  The pmap's PCID state might be used by
 		 * EFIRT entry before first context switch, so it
 		 * needs to be valid.
 		 */
 		PCPU_SET(pcid_next, PMAP_PCID_KERN + 2);
 		PCPU_SET(pcid_gen, 1);
 
 		/*
 		 * pcpu area for APs is zeroed during AP startup.
 		 * pc_pcid_next and pc_pcid_gen are initialized by AP
 		 * during pcpu setup.
 		 */
 		load_cr4(rcr4() | CR4_PCIDE);
 	}
 }
 
 /*
  * Setup the PAT MSR.
  *
  * Fills in the pat_index[] table, which maps each PAT_* memory type to
  * the PAT MSR slot that holds it, and then programs the MSR on the
  * calling CPU with caches disabled and flushed around the write.
  * Panics if the CPU does not advertise PAT support.
  */
 void
 pmap_init_pat(void)
 {
 	uint64_t pat_msr;
 	u_long cr0, cr4;
 	int i;
 
 	/* Bail if this CPU doesn't implement PAT. */
 	if ((cpu_feature & CPUID_PAT) == 0)
 		panic("no PAT??");
 
 	/*
 	 * Set default PAT index table.  Every entry starts out as -1 (no
 	 * PAT slot); the supported types are then given the slot number
 	 * that is programmed with that type in pat_msr below.
 	 */
 	for (i = 0; i < PAT_INDEX_SIZE; i++)
 		pat_index[i] = -1;
 	pat_index[PAT_WRITE_BACK] = 0;
 	pat_index[PAT_WRITE_THROUGH] = 1;
 	pat_index[PAT_UNCACHEABLE] = 3;
 	pat_index[PAT_WRITE_COMBINING] = 6;
 	pat_index[PAT_WRITE_PROTECTED] = 5;
 	pat_index[PAT_UNCACHED] = 2;
 
 	/*
 	 * Initialize default PAT entries.
 	 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
 	 * Program 5 and 6 as WP and WC.
 	 *
 	 * Leave 4 and 7 as WB and UC.  Note that a recursive page table
 	 * mapping for a 2M page uses a PAT value with the bit 3 set due
 	 * to its overload with PG_PS.
 	 */
 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
 	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
 	    PAT_VALUE(2, PAT_UNCACHED) |
 	    PAT_VALUE(3, PAT_UNCACHEABLE) |
 	    PAT_VALUE(4, PAT_WRITE_BACK) |
 	    PAT_VALUE(5, PAT_WRITE_PROTECTED) |
 	    PAT_VALUE(6, PAT_WRITE_COMBINING) |
 	    PAT_VALUE(7, PAT_UNCACHEABLE);
 
 	/* Disable PGE.  The original CR4 value is restored at the end. */
 	cr4 = rcr4();
 	load_cr4(cr4 & ~CR4_PGE);
 
 	/* Disable caches (CD = 1, NW = 0). */
 	cr0 = rcr0();
 	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
 
 	/* Flushes caches and TLBs. */
 	wbinvd();
 	invltlb();
 
 	/* Update PAT and index table. */
 	wrmsr(MSR_PAT, pat_msr);
 
 	/* Flush caches and TLBs again. */
 	wbinvd();
 	invltlb();
 
 	/* Restore caches and PGE. */
 	load_cr0(cr0);
 	load_cr4(cr4);
 }
 
 /*
  *	Set up the machine-dependent part of a vm_page: the page starts
  *	with the write-back PAT mode and an empty PV list.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	m->md.pat_mode = PAT_WRITE_BACK;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
  *	system needs to map virtual memory.
  */
 void
 pmap_init(void)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_page_t m, mpte;
 	vm_size_t s;
 	int error, i, pv_npg, ret, skz63;
 
 	/* L1TF, reserve page @0 unconditionally */
 	vm_page_blacklist_add(0, bootverbose);
 
 	/* Detect bare-metal Skylake Server and Skylake-X. */
 	if (vm_guest == VM_GUEST_NO && cpu_vendor_id == CPU_VENDOR_INTEL &&
 	    CPUID_TO_FAMILY(cpu_id) == 0x6 && CPUID_TO_MODEL(cpu_id) == 0x55) {
 		/*
 		 * Skylake-X errata SKZ63. Processor May Hang When
 		 * Executing Code In an HLE Transaction Region between
 		 * 40000000H and 403FFFFFH.
 		 *
 		 * Mark the pages in the range as preallocated.  It
 		 * seems to be impossible to distinguish between
 		 * Skylake Server and Skylake X.
 		 */
 		skz63 = 1;
 		TUNABLE_INT_FETCH("hw.skz63_enable", &skz63);
 		if (skz63 != 0) {
 			if (bootverbose)
 				printf("SKZ63: skipping 4M RAM starting "
 				    "at physical 1G\n");
 			for (i = 0; i < atop(0x400000); i++) {
 				ret = vm_page_blacklist_add(0x40000000 +
 				    ptoa(i), FALSE);
 				if (!ret && bootverbose)
 					printf("page at %#lx already used\n",
 					    0x40000000 + ptoa(i));
 			}
 		}
 	}
 
 	/*
 	 * Initialize the vm page array entries for the kernel pmap's
 	 * page table pages.
 	 */ 
 	PMAP_LOCK(kernel_pmap);
 	for (i = 0; i < nkpt; i++) {
 		mpte = PHYS_TO_VM_PAGE(KPTphys + (i << PAGE_SHIFT));
 		KASSERT(mpte >= vm_page_array &&
 		    mpte < &vm_page_array[vm_page_array_size],
 		    ("pmap_init: page table page is out of range"));
 		mpte->pindex = pmap_pde_pindex(KERNBASE) + i;
 		mpte->phys_addr = KPTphys + (i << PAGE_SHIFT);
 		mpte->wire_count = 1;
 
 		/*
 		 * Collect the page table pages that were replaced by a 2MB
 		 * page in create_pagetables().  They are zero filled.
 		 */
 		if (i << PDRSHIFT < KERNend &&
 		    pmap_insert_pt_page(kernel_pmap, mpte, false))
 			panic("pmap_init: pmap_insert_pt_page failed");
 	}
 	PMAP_UNLOCK(kernel_pmap);
 	vm_wire_add(nkpt);
 
 	/*
 	 * If the kernel is running on a virtual machine, then it must assume
 	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
 	 * be prepared for the hypervisor changing the vendor and family that
 	 * are reported by CPUID.  Consequently, the workaround for AMD Family
 	 * 10h Erratum 383 is enabled if the processor's feature set does not
 	 * include at least one feature that is only supported by older Intel
 	 * or newer AMD processors.
 	 */
 	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
 	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
 	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
 	    AMDID2_FMA4)) == 0)
 		workaround_erratum383 = 1;
 
 	/*
 	 * Are large page mappings enabled?
 	 */
 	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
 	if (pg_ps_enabled) {
 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 		    ("pmap_init: can't assign to pagesizes[1]"));
 		pagesizes[1] = NBPDR;
 	}
 
 	/*
 	 * Initialize the pv chunk list mutex.
 	 */
 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 
 	/*
 	 * Initialize the pool of pv list locks.
 	 */
 	for (i = 0; i < NPV_LIST_LOCKS; i++)
 		rw_init(&pv_list_locks[i], "pmap pv list");
 
 	/*
 	 * Calculate the size of the pv head table for superpages.
 	 */
 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, NBPDR);
 
 	/*
 	 * Allocate memory for the pv head table for superpages.
 	 */
 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
 	s = round_page(s);
 	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
 	for (i = 0; i < pv_npg; i++)
 		TAILQ_INIT(&pv_table[i].pv_list);
 	TAILQ_INIT(&pv_dummy.pv_list);
 
 	pmap_initialized = 1;
 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 		ppim = pmap_preinit_mapping + i;
 		if (ppim->va == 0)
 			continue;
 		/* Make the direct map consistent */
 		if (ppim->pa < dmaplimit && ppim->pa + ppim->sz <= dmaplimit) {
 			(void)pmap_change_attr(PHYS_TO_DMAP(ppim->pa),
 			    ppim->sz, ppim->mode);
 		}
 		if (!bootverbose)
 			continue;
 		printf("PPIM %u: PA=%#lx, VA=%#lx, size=%#lx, mode=%#x\n", i,
 		    ppim->pa, ppim->va, ppim->sz, ppim->mode);
 	}
 
 	mtx_init(&qframe_mtx, "qfrmlk", NULL, MTX_SPIN);
 	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
 	    (vmem_addr_t *)&qframe);
 	if (error != 0)
 		panic("qframe allocation failed");
 
 	lm_ents = 8;
 	TUNABLE_INT_FETCH("vm.pmap.large_map_pml4_entries", &lm_ents);
 	if (lm_ents > LMEPML4I - LMSPML4I + 1)
 		lm_ents = LMEPML4I - LMSPML4I + 1;
 	if (bootverbose)
 		printf("pmap: large map %u PML4 slots (%lu Gb)\n",
 		    lm_ents, (u_long)lm_ents * (NBPML4 / 1024 / 1024 / 1024));
 	if (lm_ents != 0) {
 		large_vmem = vmem_create("large", LARGEMAP_MIN_ADDRESS,
 		    (vmem_size_t)lm_ents * NBPML4, PAGE_SIZE, 0, M_WAITOK);
 		if (large_vmem == NULL) {
 			printf("pmap: cannot create large map\n");
 			lm_ents = 0;
 		}
 		for (i = 0; i < lm_ents; i++) {
 			m = pmap_large_map_getptp_unlocked();
 			kernel_pmap->pm_pml4[LMSPML4I + i] = X86_PG_V |
 			    X86_PG_RW | X86_PG_A | X86_PG_M | pg_nx |
 			    VM_PAGE_TO_PHYS(m);
 		}
 	}
 }
 
 /* vm.pmap.pde: read-only counters for 2MB page mappings. */
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pde, CTLFLAG_RD, 0,
     "2MB page mapping counters");
 
 /* Exported read-only as vm.pmap.pde.demotions. */
 static u_long pmap_pde_demotions;
 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, demotions, CTLFLAG_RD,
     &pmap_pde_demotions, 0, "2MB page demotions");
 
 /* Exported read-only as vm.pmap.pde.mappings. */
 static u_long pmap_pde_mappings;
 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, mappings, CTLFLAG_RD,
     &pmap_pde_mappings, 0, "2MB page mappings");
 
 /* Exported read-only as vm.pmap.pde.p_failures. */
 static u_long pmap_pde_p_failures;
 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, p_failures, CTLFLAG_RD,
     &pmap_pde_p_failures, 0, "2MB page promotion failures");
 
 /* Exported read-only as vm.pmap.pde.promotions. */
 static u_long pmap_pde_promotions;
 SYSCTL_ULONG(_vm_pmap_pde, OID_AUTO, promotions, CTLFLAG_RD,
     &pmap_pde_promotions, 0, "2MB page promotions");
 
 /* vm.pmap.pdpe: read-only counters for 1GB page mappings. */
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, pdpe, CTLFLAG_RD, 0,
     "1GB page mapping counters");
 
 /* Exported read-only as vm.pmap.pdpe.demotions. */
 static u_long pmap_pdpe_demotions;
 SYSCTL_ULONG(_vm_pmap_pdpe, OID_AUTO, demotions, CTLFLAG_RD,
     &pmap_pdpe_demotions, 0, "1GB page demotions");
 
 /***************************************************
  * Low level helper routines.....
  ***************************************************/
 
 /*
  * Move the PAT bit of a page table entry between its PTE position
  * (X86_PG_PTE_PAT) and its PDE position (X86_PG_PDE_PAT).
  *
  * For PT_X86 and PT_RVI pmaps, if exactly one of the two bits is set in
  * entry, both are toggled so that the set bit ends up in the other
  * position; if neither is set, entry is returned unchanged.  Having
  * both set is asserted against.  For PT_EPT pmaps entry is returned
  * unchanged.  Any other pmap type panics.
  *
  * Returns the adjusted entry.
  */
 static pt_entry_t
 pmap_swap_pat(pmap_t pmap, pt_entry_t entry)
 {
 	int x86_pat_bits = X86_PG_PTE_PAT | X86_PG_PDE_PAT;
 
 	switch (pmap->pm_type) {
 	case PT_X86:
 	case PT_RVI:
 		/* Verify that both PAT bits are not set at the same time */
 		KASSERT((entry & x86_pat_bits) != x86_pat_bits,
 		    ("Invalid PAT bits in entry %#lx", entry));
 
 		/* Swap the PAT bits if one of them is set */
 		if ((entry & x86_pat_bits) != 0)
 			entry ^= x86_pat_bits;
 		break;
 	case PT_EPT:
 		/*
 		 * Nothing to do - the memory attributes are represented
 		 * the same way for regular pages and superpages.
 		 */
 		break;
 	default:
 		/* Report this function's own name, not a stale one. */
 		panic("pmap_swap_pat: bad pm_type %d", pmap->pm_type);
 	}
 
 	return (entry);
 }
 
 /*
  * Report whether mode is a memory attribute this pmap implementation
  * can express: it must be a valid index into pat_index[] and the entry
  * there must be non-negative.  The pmap argument is unused.
  */
 boolean_t
 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
 {
 
 	if (mode < 0 || mode >= PAT_INDEX_SIZE)
 		return (FALSE);
 	return (pat_index[(int)mode] >= 0);
 }
 
 /*
  * Translate a caching mode into the page table entry bits that select
  * it.  is_pde chooses between the PDE and PTE position of the PAT bit.
  * Panics on a mode rejected by pmap_is_valid_memattr() or on an
  * unsupported pmap type.
  */
 int
 pmap_cache_bits(pmap_t pmap, int mode, boolean_t is_pde)
 {
 	int cache_bits, pat_flag, pat_idx;
 
 	if (!pmap_is_valid_memattr(pmap, mode))
 		panic("Unknown caching mode %d\n", mode);
 
 	switch (pmap->pm_type) {
 	case PT_X86:
 	case PT_RVI:
 		/* Look up the PAT index assigned to this caching mode. */
 		pat_idx = pat_index[mode];
 
 		/* PTEs and PDEs keep the PAT bit in different positions. */
 		pat_flag = is_pde ? X86_PG_PDE_PAT : X86_PG_PTE_PAT;
 
 		/*
 		 * Spread the 3-bit index over the PAT (bit 2), PCD (bit 1)
 		 * and PWT (bit 0) flags.
 		 */
 		cache_bits = ((pat_idx & 0x4) != 0 ? pat_flag : 0) |
 		    ((pat_idx & 0x2) != 0 ? PG_NC_PCD : 0) |
 		    ((pat_idx & 0x1) != 0 ? PG_NC_PWT : 0);
 		break;
 
 	case PT_EPT:
 		cache_bits = EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(mode);
 		break;
 
 	default:
 		panic("unsupported pmap type %d", pmap->pm_type);
 	}
 
 	return (cache_bits);
 }
 
 /*
  * Return the mask covering every cache-attribute bit of a page table
  * entry for this pmap type.  is_pde selects the PDE or the PTE layout
  * for PT_X86/PT_RVI pmaps.  Panics on an unknown pmap type.
  */
 static int
 pmap_cache_mask(pmap_t pmap, boolean_t is_pde)
 {
 
 	switch (pmap->pm_type) {
 	case PT_X86:
 	case PT_RVI:
 		if (is_pde)
 			return (X86_PG_PDE_CACHE);
 		return (X86_PG_PTE_CACHE);
 	case PT_EPT:
 		return (EPT_PG_IGNORE_PAT | EPT_PG_MEMORY_TYPE(0x7));
 	default:
 		panic("pmap_cache_mask: invalid pm_type %d", pmap->pm_type);
 	}
 }
 
 /*
  * Report whether superpage mappings may be used in pmap: the global
  * pg_ps_enabled setting must be on and the pmap must carry the
  * PMAP_PDE_SUPERPAGE flag.
  */
 bool
 pmap_ps_enabled(pmap_t pmap)
 {
 
 	if (!pg_ps_enabled)
 		return (false);
 	return ((pmap->pm_flags & PMAP_PDE_SUPERPAGE) != 0);
 }
 
 /*
  * Store newpde into *pde.  For PT_RVI and PT_EPT pmaps the pmap's
  * pm_eptgen generation counter is incremented first; PT_X86 pmaps need
  * nothing extra.  Any other pmap type panics.
  */
 static void
 pmap_update_pde_store(pmap_t pmap, pd_entry_t *pde, pd_entry_t newpde)
 {
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		/*
 		 * XXX
 		 * Bumping the generation number here is somewhat bogus:
 		 * it is meant to be incremented when a region of the
 		 * address space is invalidated in the page tables.
 		 *
 		 * Here the old PDE is still valid, yet any mappings
 		 * established through it must be invalidated in the TLB.
 		 *
 		 * This works as expected because we rendezvous "all"
 		 * host cpus and, as a side-effect, force any vcpu
 		 * context to exit.
 		 */
 		atomic_add_acq_long(&pmap->pm_eptgen, 1);
 	} else if (pmap->pm_type != PT_X86) {
 		panic("pmap_update_pde_store: bad pm_type %d", pmap->pm_type);
 	}
 	pde_store(pde, newpde);
 }
 
 /*
  * Flush the calling processor's TLB after the page size used for va
  * has been changed in the page table.  No other processor's TLB is
  * touched, and nothing is done for guest pmaps.
  *
  * The calling thread must be pinned to a processor.
  */
 static void
 pmap_update_pde_invalidate(pmap_t pmap, vm_offset_t va, pd_entry_t newpde)
 {
 	pt_entry_t PG_G;
 
 	if (pmap_type_guest(pmap))
 		return;
 
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_update_pde_invalidate: invalid type %d", pmap->pm_type));
 
 	PG_G = pmap_global_bit(pmap);
 
 	if ((newpde & PG_PS) == 0) {
 		/* Demotion: only the one 2MB page mapping is stale. */
 		invlpg(va);
 		return;
 	}
 
 	/*
 	 * Promotion: there are too many 4KB page mappings to flush one
 	 * at a time, so flush the whole TLB.  Global (PG_G) mappings are
 	 * included only when the new PDE is itself global.
 	 */
 	if ((newpde & PG_G) != 0)
 		invltlb_glob();
 	else
 		invltlb();
 }
 #ifdef SMP
 
 /*
  * For SMP, these functions have to use the IPI mechanism for coherence.
  *
  * N.B.: Before calling any of the following TLB invalidation functions,
  * the calling processor must ensure that all stores updating a non-
  * kernel page table are globally performed.  Otherwise, another
  * processor could cache an old, pre-update entry without being
  * invalidated.  This can happen one of two ways: (1) The pmap becomes
  * active on another processor after its pm_active field is checked by
  * one of the following functions but before a store updating the page
  * table is globally performed. (2) The pmap becomes active on another
  * processor before its pm_active field is checked but due to
  * speculative loads one of the following functions still reads the
  * pmap as inactive on the other processor.
  * 
  * The kernel page table is exempt because its pm_active field is
  * immutable.  The kernel page table is always active on every
  * processor.
  */
 
 /*
  * Interrupt the cpus that are executing in the guest context.
  * This will force the vcpu to exit and the cached EPT mappings
  * will be invalidated by the host before the next vmresume.
  *
  * The pmap's pm_eptgen counter is incremented first, and then the IPI
  * selected by the PMAP_NESTED_IPIMASK bits of pm_flags is sent to every
  * cpu in pm_active.  The thread is pinned for the duration of the call.
  */
 static __inline void
 pmap_invalidate_ept(pmap_t pmap)
 {
 	int ipinum;
 
 	sched_pin();
 	/* The calling cpu is not expected to be in the guest pmap's set. */
 	KASSERT(!CPU_ISSET(curcpu, &pmap->pm_active),
 	    ("pmap_invalidate_ept: absurd pm_active"));
 
 	/*
 	 * The TLB mappings associated with a vcpu context are not
 	 * flushed each time a different vcpu is chosen to execute.
 	 *
 	 * This is in contrast with a process's vtop mappings that
 	 * are flushed from the TLB on each context switch.
 	 *
 	 * Therefore we need to do more than just a TLB shootdown on
 	 * the active cpus in 'pmap->pm_active'. To do this we keep
 	 * track of the number of invalidations performed on this pmap.
 	 *
 	 * Each vcpu keeps a cache of this counter and compares it
 	 * just before a vmresume. If the counter is out-of-date an
 	 * invept will be done to flush stale mappings from the TLB.
 	 */
 	atomic_add_acq_long(&pmap->pm_eptgen, 1);
 
 	/*
 	 * Force the vcpu to exit and trap back into the hypervisor.
 	 */
 	ipinum = pmap->pm_flags & PMAP_NESTED_IPIMASK;
 	ipi_selected(pmap->pm_active, ipinum);
 	sched_unpin();
 }
 
 /*
  * Return the set of processors that an invalidation of "pmap" is sent to:
  * every processor for the kernel pmap, otherwise the pmap's pm_active set.
  */
 static cpuset_t
 pmap_invalidate_cpu_mask(pmap_t pmap)
 {
 
 	if (pmap == kernel_pmap)
 		return (all_cpus);
 	return (pmap->pm_active);
 }
 
 /*
  * PCID-specific part of a single-page invalidation, run on the calling
  * processor.
  *
  * If "pmap" is the current pmap on this processor and has a user page
  * table root (pm_ucr3 != PMAP_NO_CR3), "va" is also invalidated for the
  * PCID tagged with PMAP_PCID_USER_PT, using INVPCID when "invpcid_works1"
  * is true and pmap_pti_pcid_invlpg() otherwise.  If "pmap" is not current
  * on this processor, this processor's pm_gen is zeroed instead.  In every
  * case pm_gen is zeroed for all other processors, followed by a
  * sequentially consistent fence.
  */
 static inline void
 pmap_invalidate_page_pcid(pmap_t pmap, vm_offset_t va,
     const bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 	u_int cpuid, i;
 
 	cpuid = PCPU_GET(cpuid);
 	if (pmap == PCPU_GET(curpmap)) {
 		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 			/*
 			 * Because pm_pcid is recalculated on a
 			 * context switch, we must disable switching.
 			 * Otherwise, we might use a stale value
 			 * below.
 			 */
 			critical_enter();
 			pcid = pmap->pm_pcids[cpuid].pm_pcid;
 			if (invpcid_works1) {
 				/* Single-address INVPCID on the user PCID. */
 				d.pcid = pcid | PMAP_PCID_USER_PT;
 				d.pad = 0;
 				d.addr = va;
 				invpcid(&d, INVPCID_ADDR);
 			} else {
 				/* No INVPCID: use the CR3-based helper. */
 				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 				ucr3 = pmap->pm_ucr3 | pcid |
 				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
 			}
 			critical_exit();
 		}
 	} else
 		pmap->pm_pcids[cpuid].pm_gen = 0;
 
 	/* Reset the generation recorded for every other processor. */
 	CPU_FOREACH(i) {
 		if (cpuid != i)
 			pmap->pm_pcids[i].pm_gen = 0;
 	}
 
 	/*
 	 * The fence is between stores to pm_gen and the read of the
 	 * pm_active mask.  We need to ensure that it is impossible
 	 * for us to miss the bit update in pm_active and
 	 * simultaneously observe a non-zero pm_gen in
 	 * pmap_activate_sw(), otherwise TLB update is missed.
 	 * Without the fence, IA32 allows such an outcome.  Note that
 	 * pm_active is updated by a locked operation, which provides
 	 * the reciprocal fence.
 	 */
 	atomic_thread_fence_seq_cst();
 }
 
 /* pmap_invalidate_page_pcid() specialized for a usable INVPCID. */
 static void
 pmap_invalidate_page_pcid_invpcid(pmap_t pmap, vm_offset_t va)
 {
 
 	pmap_invalidate_page_pcid(pmap, va, true);
 }
 
 /* pmap_invalidate_page_pcid() specialized for the case without INVPCID. */
 static void
 pmap_invalidate_page_pcid_noinvpcid(pmap_t pmap, vm_offset_t va)
 {
 
 	pmap_invalidate_page_pcid(pmap, va, false);
 }
 
 /*
  * Variant selected when PCID is not enabled; it performs no additional
  * work.
  */
 static void
 pmap_invalidate_page_nopcid(pmap_t pmap, vm_offset_t va)
 {
 }
 
 /*
  * Resolver for pmap_invalidate_page_mode(): choose the implementation
  * according to pmap_pcid_enabled and invpcid_works.
  */
 DEFINE_IFUNC(static, void, pmap_invalidate_page_mode, (pmap_t, vm_offset_t))
 {
 
 	if (pmap_pcid_enabled)
 		return (invpcid_works ? pmap_invalidate_page_pcid_invpcid :
 		    pmap_invalidate_page_pcid_noinvpcid);
 	return (pmap_invalidate_page_nopcid);
 }
 
 /*
  * Invalidate the TLB entry for "va" in "pmap".  Guest pmaps are handed to
  * pmap_invalidate_ept().  Otherwise the entry is flushed on the calling
  * processor when the pmap is the kernel pmap or the current pmap, and the
  * invalidation is then passed to smp_masked_invlpg() for the processors
  * given by pmap_invalidate_cpu_mask().
  */
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
 	if (pmap_type_guest(pmap)) {
 		pmap_invalidate_ept(pmap);
 		return;
 	}
 
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_page: invalid type %d", pmap->pm_type));
 
 	sched_pin();
 	/* Local flush: only meaningful if the mapping can be live here. */
 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
 		invlpg(va);
 	/* User pmaps additionally need the PCID-mode bookkeeping. */
 	if (pmap != kernel_pmap)
 		pmap_invalidate_page_mode(pmap, va);
 	smp_masked_invlpg(pmap_invalidate_cpu_mask(pmap), va, pmap);
 	sched_unpin();
 }
 
 /*
  * Size in bytes of 4096 4KB pages.  pmap_invalidate_range() falls back to
  * pmap_invalidate_all() for ranges at least this large.  Chosen to exceed
  * the total size of the Broadwell L2 TLB.
  */
 #define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
 
 /*
  * PCID-specific part of a range invalidation, run on the calling
  * processor.
  *
  * If "pmap" is the current pmap on this processor and has a user page
  * table root (pm_ucr3 != PMAP_NO_CR3), every page in [sva, eva) is also
  * invalidated for the PCID tagged with PMAP_PCID_USER_PT, using INVPCID
  * when "invpcid_works1" is true and pmap_pti_pcid_invlrng() otherwise.
  * If "pmap" is not current here, this processor's pm_gen is zeroed
  * instead.  In every case pm_gen is zeroed for all other processors,
  * followed by a sequentially consistent fence.
  */
 static void
 pmap_invalidate_range_pcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     const bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 	u_int cpuid, i;
 
 	cpuid = PCPU_GET(cpuid);
 	if (pmap == PCPU_GET(curpmap)) {
 		if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 			/* pm_pcid is read inside the critical section. */
 			critical_enter();
 			pcid = pmap->pm_pcids[cpuid].pm_pcid;
 			if (invpcid_works1) {
 				/* One INVPCID per 4KB page of the range. */
 				d.pcid = pcid | PMAP_PCID_USER_PT;
 				d.pad = 0;
 				d.addr = sva;
 				for (; d.addr < eva; d.addr += PAGE_SIZE)
 					invpcid(&d, INVPCID_ADDR);
 			} else {
 				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 				ucr3 = pmap->pm_ucr3 | pcid |
 				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 				pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
 			}
 			critical_exit();
 		}
 	} else
 		pmap->pm_pcids[cpuid].pm_gen = 0;
 
 	/* Reset the generation recorded for every other processor. */
 	CPU_FOREACH(i) {
 		if (cpuid != i)
 			pmap->pm_pcids[i].pm_gen = 0;
 	}
 	/* See the comment in pmap_invalidate_page_pcid(). */
 	atomic_thread_fence_seq_cst();
 }
 
 /* pmap_invalidate_range_pcid() specialized for a usable INVPCID. */
 static void
 pmap_invalidate_range_pcid_invpcid(pmap_t pmap, vm_offset_t sva,
     vm_offset_t eva)
 {
 
 	pmap_invalidate_range_pcid(pmap, sva, eva, true);
 }
 
 /* pmap_invalidate_range_pcid() specialized for the case without INVPCID. */
 static void
 pmap_invalidate_range_pcid_noinvpcid(pmap_t pmap, vm_offset_t sva,
     vm_offset_t eva)
 {
 
 	pmap_invalidate_range_pcid(pmap, sva, eva, false);
 }
 
 /*
  * Variant selected when PCID is not enabled; it performs no additional
  * work.
  */
 static void
 pmap_invalidate_range_nopcid(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 }
 
 /*
  * Resolver for pmap_invalidate_range_mode(): choose the implementation
  * according to pmap_pcid_enabled and invpcid_works.
  */
 DEFINE_IFUNC(static, void, pmap_invalidate_range_mode, (pmap_t, vm_offset_t,
     vm_offset_t))
 {
 
 	if (pmap_pcid_enabled)
 		return (invpcid_works ? pmap_invalidate_range_pcid_invpcid :
 		    pmap_invalidate_range_pcid_noinvpcid);
 	return (pmap_invalidate_range_nopcid);
 }
 
 /*
  * Invalidate the TLB entries for the pages in [sva, eva) of "pmap".
  *
  * Ranges of PMAP_INVLPG_THRESHOLD bytes or more are handled by
  * pmap_invalidate_all(), and guest pmaps by pmap_invalidate_ept().
  * Otherwise each 4KB page is flushed on the calling processor when the
  * pmap is the kernel pmap or the current pmap, and the range is then
  * passed to smp_masked_invlpg_range() for the processors given by
  * pmap_invalidate_cpu_mask().
  */
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va;
 
 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
 		pmap_invalidate_all(pmap);
 		return;
 	}
 
 	if (pmap_type_guest(pmap)) {
 		pmap_invalidate_ept(pmap);
 		return;
 	}
 
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_range: invalid type %d", pmap->pm_type));
 
 	sched_pin();
 	/* Local flush: only meaningful if the mappings can be live here. */
 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 		va = sva;
 		while (va < eva) {
 			invlpg(va);
 			va += PAGE_SIZE;
 		}
 	}
 	/* User pmaps additionally need the PCID-mode bookkeeping. */
 	if (pmap != kernel_pmap)
 		pmap_invalidate_range_mode(pmap, sva, eva);
 	smp_masked_invlpg_range(pmap_invalidate_cpu_mask(pmap), sva, eva, pmap);
 	sched_unpin();
 }
 
 /*
  * PCID-specific part of a full invalidation, run on the calling
  * processor.
  *
  * For the kernel pmap, INVPCID_CTXGLOB is used when "invpcid_works1" is
  * true and invltlb_glob() otherwise.  For any other pmap that is current
  * on this processor, the pmap's PCID (and the PMAP_PCID_USER_PT PCID when
  * pm_ucr3 is set) is invalidated with INVPCID_CTX, or, without INVPCID,
  * through pmap_pti_pcid_invalidate() or a reload of CR3.  If the pmap is
  * not current here, this processor's pm_gen is zeroed instead.  For
  * non-kernel pmaps pm_gen is zeroed for all other processors.  A
  * sequentially consistent fence is issued in all cases.
  */
 static inline void
 pmap_invalidate_all_pcid(pmap_t pmap, bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 	u_int cpuid, i;
 
 	if (pmap == kernel_pmap) {
 		if (invpcid_works1) {
 			bzero(&d, sizeof(d));
 			invpcid(&d, INVPCID_CTXGLOB);
 		} else {
 			invltlb_glob();
 		}
 	} else {
 		cpuid = PCPU_GET(cpuid);
 		if (pmap == PCPU_GET(curpmap)) {
 			/* pm_pcid is read inside the critical section. */
 			critical_enter();
 			pcid = pmap->pm_pcids[cpuid].pm_pcid;
 			if (invpcid_works1) {
 				d.pcid = pcid;
 				d.pad = 0;
 				d.addr = 0;
 				invpcid(&d, INVPCID_CTX);
 				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 					/* Repeat for the user PCID. */
 					d.pcid |= PMAP_PCID_USER_PT;
 					invpcid(&d, INVPCID_CTX);
 				}
 			} else {
 				/* Note: CR3_PCID_SAVE is not set here. */
 				kcr3 = pmap->pm_cr3 | pcid;
 				ucr3 = pmap->pm_ucr3;
 				if (ucr3 != PMAP_NO_CR3) {
 					ucr3 |= pcid | PMAP_PCID_USER_PT;
 					pmap_pti_pcid_invalidate(ucr3, kcr3);
 				} else {
 					load_cr3(kcr3);
 				}
 			}
 			critical_exit();
 		} else
 			pmap->pm_pcids[cpuid].pm_gen = 0;
 		/* Reset the generation recorded for every other processor. */
 		CPU_FOREACH(i) {
 			if (cpuid != i)
 				pmap->pm_pcids[i].pm_gen = 0;
 		}
 	}
 	/* See the comment in pmap_invalidate_page_pcid(). */
 	atomic_thread_fence_seq_cst();
 }
 
 /* pmap_invalidate_all_pcid() specialized for a usable INVPCID. */
 static void
 pmap_invalidate_all_pcid_invpcid(pmap_t pmap)
 {
 
 	pmap_invalidate_all_pcid(pmap, true);
 }
 
 /* pmap_invalidate_all_pcid() specialized for the case without INVPCID. */
 static void
 pmap_invalidate_all_pcid_noinvpcid(pmap_t pmap)
 {
 
 	pmap_invalidate_all_pcid(pmap, false);
 }
 
 /*
  * Full local invalidation used when PCID is not enabled: a global flush
  * for the kernel pmap, a non-global flush when "pmap" is the current pmap
  * on this processor, and nothing otherwise.
  */
 static void
 pmap_invalidate_all_nopcid(pmap_t pmap)
 {
 
 	if (pmap == kernel_pmap) {
 		invltlb_glob();
 		return;
 	}
 	if (pmap == PCPU_GET(curpmap))
 		invltlb();
 }
 
 /*
  * Resolver for pmap_invalidate_all_mode(): choose the implementation
  * according to pmap_pcid_enabled and invpcid_works.
  */
 DEFINE_IFUNC(static, void, pmap_invalidate_all_mode, (pmap_t))
 {
 
 	if (pmap_pcid_enabled)
 		return (invpcid_works ? pmap_invalidate_all_pcid_invpcid :
 		    pmap_invalidate_all_pcid_noinvpcid);
 	return (pmap_invalidate_all_nopcid);
 }
 
 /*
  * Invalidate all TLB entries for "pmap".  Guest pmaps are handed to
  * pmap_invalidate_ept().  Otherwise the mode-specific local invalidation
  * is performed and the request is passed to smp_masked_invltlb() for the
  * processors given by pmap_invalidate_cpu_mask().
  */
 void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	if (pmap_type_guest(pmap)) {
 		pmap_invalidate_ept(pmap);
 		return;
 	}
 
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_all: invalid type %d", pmap->pm_type));
 
 	/* Stay on one processor across the local and remote steps. */
 	sched_pin();
 	pmap_invalidate_all_mode(pmap);
 	smp_masked_invltlb(pmap_invalidate_cpu_mask(pmap), pmap);
 	sched_unpin();
 }
 
 /*
  * Execute WBINVD on the calling processor and then call
  * smp_cache_flush(), with the thread pinned for the duration.
  */
 void
 pmap_invalidate_cache(void)
 {
 
 	sched_pin();
 	wbinvd();
 	smp_cache_flush();
 	sched_unpin();
 }
 
 /*
  * Argument block passed by pmap_update_pde() to its rendezvous callbacks,
  * pmap_update_pde_action() and pmap_update_pde_teardown().
  */
 struct pde_action {
 	cpuset_t invalidate;	/* processors that invalidate their TLB */
 	pmap_t pmap;		/* pmap that owns the PDE */
 	vm_offset_t va;		/* virtual address whose mapping changes */
 	pd_entry_t *pde;	/* PDE to be rewritten */
 	pd_entry_t newpde;	/* new contents of the PDE */
 	u_int store;		/* processor that updates the PDE */
 };
 
 /*
  * Rendezvous action callback: only the processor recorded in "store"
  * writes the new PDE; every other processor does nothing here.
  */
 static void
 pmap_update_pde_action(void *arg)
 {
 	struct pde_action *request;
 
 	request = arg;
 	if (request->store != PCPU_GET(cpuid))
 		return;
 	pmap_update_pde_store(request->pmap, request->pde, request->newpde);
 }
 
 /*
  * Rendezvous teardown callback: processors listed in the "invalidate" set
  * flush their own TLB for the changed mapping.
  */
 static void
 pmap_update_pde_teardown(void *arg)
 {
 	struct pde_action *request;
 
 	request = arg;
 	if (!CPU_ISSET(PCPU_GET(cpuid), &request->invalidate))
 		return;
 	pmap_update_pde_invalidate(request->pmap, request->va,
 	    request->newpde);
 }
 
 /*
  * Change the page size for the specified virtual address in a way that
  * prevents any possibility of the TLB ever having two entries that map the
  * same virtual address using different page sizes.  This is the recommended
  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
  * machine check exception for a TLB state that is improperly diagnosed as a
  * hardware error.
  */
 static void
 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 {
 	struct pde_action act;
 	cpuset_t active, other_cpus;
 	u_int cpuid;
 
 	sched_pin();
 	cpuid = PCPU_GET(cpuid);
 	/* other_cpus: every processor except the calling one. */
 	other_cpus = all_cpus;
 	CPU_CLR(cpuid, &other_cpus);
 	/* Kernel and guest pmaps are treated as active on every processor. */
 	if (pmap == kernel_pmap || pmap_type_guest(pmap)) 
 		active = all_cpus;
 	else {
 		active = pmap->pm_active;
 	}
 	if (CPU_OVERLAP(&active, &other_cpus)) { 
 		/*
 		 * At least one other processor is affected.  Rendezvous:
 		 * this processor stores the PDE in the action callback and
 		 * the processors in "invalidate" flush in the teardown
 		 * callback.  "invalidate" is captured before the calling
 		 * processor is added to the rendezvous set, so the caller
 		 * flushes only if it was already in "active".
 		 */
 		act.store = cpuid;
 		act.invalidate = active;
 		act.va = va;
 		act.pmap = pmap;
 		act.pde = pde;
 		act.newpde = newpde;
 		CPU_SET(cpuid, &active);
 		smp_rendezvous_cpus(active,
 		    smp_no_rendezvous_barrier, pmap_update_pde_action,
 		    pmap_update_pde_teardown, &act);
 	} else {
 		/* No other processor is affected: update and flush locally. */
 		pmap_update_pde_store(pmap, pde, newpde);
 		if (CPU_ISSET(cpuid, &active))
 			pmap_update_pde_invalidate(pmap, va, newpde);
 	}
 	sched_unpin();
 }
 #else /* !SMP */
 /*
  * Normal, non-SMP, invalidation functions.
  */
 /*
  * Invalidate the TLB entry for "va" in "pmap" on a uniprocessor kernel.
  *
  * For RVI/EPT pmaps only pm_eptgen is incremented.  For the kernel pmap or
  * the current pmap the entry is flushed with INVLPG; if the current pmap
  * also has a user page table root and PCID is enabled, the entry is
  * flushed for the PMAP_PCID_USER_PT PCID as well.  For any other pmap the
  * PCID generation is zeroed when PCID is enabled.
  */
 void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 	uint32_t pcid;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
 		return;
 	}
 	/* The message names this function, not pmap_invalidate_range(). */
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_page: unknown type %d", pmap->pm_type));
 
 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 		invlpg(va);
 		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
 		    pmap->pm_ucr3 != PMAP_NO_CR3) {
 			/* pm_pcid is read inside the critical section. */
 			critical_enter();
 			pcid = pmap->pm_pcids[0].pm_pcid;
 			if (invpcid_works) {
 				/* Single-address INVPCID on the user PCID. */
 				d.pcid = pcid | PMAP_PCID_USER_PT;
 				d.pad = 0;
 				d.addr = va;
 				invpcid(&d, INVPCID_ADDR);
 			} else {
 				/* No INVPCID: use the CR3-based helper. */
 				kcr3 = pmap->pm_cr3 | pcid | CR3_PCID_SAVE;
 				ucr3 = pmap->pm_ucr3 | pcid |
 				    PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 				pmap_pti_pcid_invlpg(ucr3, kcr3, va);
 			}
 			critical_exit();
 		}
 	} else if (pmap_pcid_enabled)
 		pmap->pm_pcids[0].pm_gen = 0;
 }
 
 /*
  * Invalidate the TLB entries for the pages in [sva, eva) of "pmap" on a
  * uniprocessor kernel.
  *
  * For RVI/EPT pmaps only pm_eptgen is incremented.  For the kernel pmap or
  * the current pmap each 4KB page is flushed with INVLPG; if the current
  * pmap also has a user page table root and PCID is enabled, the range is
  * flushed for the PMAP_PCID_USER_PT PCID as well.  For any other pmap the
  * PCID generation is zeroed when PCID is enabled.
  */
 void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct invpcid_descr d;
 	vm_offset_t addr;
 	uint64_t kcr3, ucr3;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
 		return;
 	}
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_range: unknown type %d", pmap->pm_type));
 
 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap)) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
 		if (pmap == PCPU_GET(curpmap) && pmap_pcid_enabled &&
 		    pmap->pm_ucr3 != PMAP_NO_CR3) {
 			/* pm_pcid is read inside the critical section. */
 			critical_enter();
 			if (invpcid_works) {
 				/* One INVPCID per 4KB page of the range. */
 				d.pcid = pmap->pm_pcids[0].pm_pcid |
 				    PMAP_PCID_USER_PT;
 				d.pad = 0;
 				d.addr = sva;
 				for (; d.addr < eva; d.addr += PAGE_SIZE)
 					invpcid(&d, INVPCID_ADDR);
 			} else {
 				/* No INVPCID: use the CR3-based helper. */
 				kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].
 				    pm_pcid | CR3_PCID_SAVE;
 				ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[0].
 				    pm_pcid | PMAP_PCID_USER_PT | CR3_PCID_SAVE;
 				pmap_pti_pcid_invlrng(ucr3, kcr3, sva, eva);
 			}
 			critical_exit();
 		}
 	} else if (pmap_pcid_enabled) {
 		pmap->pm_pcids[0].pm_gen = 0;
 	}
 }
 
 /*
  * Invalidate all TLB entries for "pmap" on a uniprocessor kernel.
  *
  * For RVI/EPT pmaps only pm_eptgen is incremented.  The kernel pmap gets
  * a global flush.  The current pmap is flushed per PCID when PCID is
  * enabled, or with invltlb() otherwise.  For any other pmap the PCID
  * generation is zeroed when PCID is enabled.
  */
 void
 pmap_invalidate_all(pmap_t pmap)
 {
 	struct invpcid_descr d;
 	uint64_t kcr3, ucr3;
 
 	if (pmap->pm_type == PT_RVI || pmap->pm_type == PT_EPT) {
 		pmap->pm_eptgen++;
 		return;
 	}
 	KASSERT(pmap->pm_type == PT_X86,
 	    ("pmap_invalidate_all: unknown type %d", pmap->pm_type));
 
 	if (pmap == kernel_pmap) {
 		if (pmap_pcid_enabled && invpcid_works) {
 			bzero(&d, sizeof(d));
 			invpcid(&d, INVPCID_CTXGLOB);
 		} else {
 			invltlb_glob();
 		}
 	} else if (pmap == PCPU_GET(curpmap)) {
 		if (pmap_pcid_enabled) {
 			/* pm_pcid is read inside the critical section. */
 			critical_enter();
 			if (invpcid_works) {
 				d.pcid = pmap->pm_pcids[0].pm_pcid;
 				d.pad = 0;
 				d.addr = 0;
 				invpcid(&d, INVPCID_CTX);
 				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 					/* Repeat for the user PCID. */
 					d.pcid |= PMAP_PCID_USER_PT;
 					invpcid(&d, INVPCID_CTX);
 				}
 			} else {
 				/* Note: CR3_PCID_SAVE is not set here. */
 				kcr3 = pmap->pm_cr3 | pmap->pm_pcids[0].pm_pcid;
 				if (pmap->pm_ucr3 != PMAP_NO_CR3) {
 					ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[
 					    0].pm_pcid | PMAP_PCID_USER_PT;
 					pmap_pti_pcid_invalidate(ucr3, kcr3);
 				} else
 					load_cr3(kcr3);
 			}
 			critical_exit();
 		} else {
 			invltlb();
 		}
 	} else if (pmap_pcid_enabled) {
 		pmap->pm_pcids[0].pm_gen = 0;
 	}
 }
 
 /* Uniprocessor variant: execute WBINVD on the calling processor. */
 PMAP_INLINE void
 pmap_invalidate_cache(void)
 {
 
 	wbinvd();
 }
 
 /*
  * Uniprocessor variant: store the new PDE, then flush the local TLB if
  * "pmap" is the kernel pmap or the current pmap; otherwise zero the
  * pmap's PCID generation.
  */
 static void
 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 {
 
 	pmap_update_pde_store(pmap, pde, newpde);
 	if (pmap == kernel_pmap || pmap == PCPU_GET(curpmap))
 		pmap_update_pde_invalidate(pmap, va, newpde);
 	else
 		pmap->pm_pcids[0].pm_gen = 0;
 }
 #endif /* !SMP */
 
 /*
  * Invalidate the TLB entries associated with the 2MB page mapping "pde"
  * at "va".
  */
 static void
 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
 {
 
 	/*
 	 * With PG_PROMOTED clear, the TLB will not hold any 4KB page
 	 * mappings for the address range [va, va + NBPDR), so a single
 	 * INVLPG suffices to invalidate the 2MB page mapping.
 	 */
 	if ((pde & PG_PROMOTED) == 0) {
 		pmap_invalidate_page(pmap, va);
 		return;
 	}
 
 	/*
 	 * With PG_PROMOTED set, the 2MB page mapping was created by a
 	 * promotion that did not invalidate the 512 4KB page mappings that
 	 * might exist in the TLB.  The TLB may therefore hold both 4KB and
 	 * 2MB page mappings for [va, va + NBPDR), and the entire range
 	 * must be invalidated.
 	 */
 	pmap_invalidate_range(pmap, va, va + NBPDR - 1);
 }
 
 /*
  * Resolver for pmap_invalidate_cache_range(): prefer the self-snoop
  * variant when CPUID_SS is set, then the CLFLUSH-based variant when
  * CPUID_CLFSH is set, and otherwise fall back to flushing the whole cache.
  */
 DEFINE_IFUNC(, void, pmap_invalidate_cache_range,
     (vm_offset_t sva, vm_offset_t eva))
 {
 
 	if ((cpu_feature & CPUID_SS) != 0)
 		return (pmap_invalidate_cache_range_selfsnoop);
 	if ((cpu_feature & CPUID_CLFSH) != 0)
 		return (pmap_force_invalidate_cache_range);
 	return (pmap_invalidate_cache_range_all);
 }
 
 /*
  * Size in bytes (2MB).  pmap_invalidate_cache_pages() flushes the whole
  * cache instead of individual lines once the page count reaches this size
  * divided by PAGE_SIZE.
  */
 #define PMAP_CLFLUSH_THRESHOLD   (2 * 1024 * 1024)
 
 /* Assert that both ends of a cache invalidation range are page-aligned. */
 static void
 pmap_invalidate_cache_range_check_align(vm_offset_t sva, vm_offset_t eva)
 {
 
 	KASSERT((sva & PAGE_MASK) == 0,
 	    ("pmap_invalidate_cache_range: sva not page-aligned"));
 	KASSERT((eva & PAGE_MASK) == 0,
 	    ("pmap_invalidate_cache_range: eva not page-aligned"));
 }
 
 /*
  * Variant selected when the CPU reports self-snoop (CPUID_SS): only the
  * alignment of the range is checked; no flush instruction is issued.
  */
 static void
 pmap_invalidate_cache_range_selfsnoop(vm_offset_t sva, vm_offset_t eva)
 {
 
 	pmap_invalidate_cache_range_check_align(sva, eva);
 }
 
 /*
  * Flush the cache lines covering [sva, eva) one line at a time, using
  * CLFLUSHOPT when the CPU advertises it and CLFLUSH otherwise.  "sva" is
  * first rounded down to a cache line boundary.
  */
 void
 pmap_force_invalidate_cache_range(vm_offset_t sva, vm_offset_t eva)
 {
 
 	sva &= ~(vm_offset_t)(cpu_clflush_line_size - 1);
 
 	/*
 	 * XXX: Some CPUs fault, hang, or trash the local APIC
 	 * registers if we use CLFLUSH on the local APIC range.  The
 	 * local APIC is always uncached, so we don't need to flush
 	 * for that range anyway.
 	 */
 	if (pmap_kextract(sva) == lapic_paddr)
 		return;
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) == 0) {
 		/*
 		 * Writes are ordered by CLFLUSH on Intel CPUs, so the
 		 * fences are only issued for other vendors.
 		 */
 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 		while (sva < eva) {
 			clflush(sva);
 			sva += cpu_clflush_line_size;
 		}
 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 		return;
 	}
 
 	/*
 	 * Do per-cache line flush.  Use the sfence instruction to
 	 * ensure that previous stores are included in the write-back.
 	 * The processor propagates flush to other processors in the
 	 * cache coherence domain.
 	 */
 	sfence();
 	while (sva < eva) {
 		clflushopt(sva);
 		sva += cpu_clflush_line_size;
 	}
 	sfence();
 }
 
 /*
  * Fallback variant: after checking the alignment of the range, flush the
  * entire cache with pmap_invalidate_cache().
  */
 static void
 pmap_invalidate_cache_range_all(vm_offset_t sva, vm_offset_t eva)
 {
 
 	pmap_invalidate_cache_range_check_align(sva, eva);
 	pmap_invalidate_cache();
 }
 
 /*
  * Remove the specified set of pages from the data and instruction caches.
  *
  * Unlike pmap_invalidate_cache_range(), this function does not rely on
  * the CPU's self-snoop feature, because it is intended for use when
  * moving pages into a different cache domain.
  *
  * Each page is flushed through its direct map address.  The whole cache
  * is flushed instead when "count" reaches PMAP_CLFLUSH_THRESHOLD /
  * PAGE_SIZE or when neither CLFLUSH nor CLFLUSHOPT is available.
  */
 void
 pmap_invalidate_cache_pages(vm_page_t *pages, int count)
 {
 	vm_offset_t line, page_end;
 	int idx;
 	bool have_clflushopt;
 
 	have_clflushopt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
 
 	/* Too many pages, or no per-line flush instruction: flush it all. */
 	if (count >= PMAP_CLFLUSH_THRESHOLD / PAGE_SIZE ||
 	    (!have_clflushopt && (cpu_feature & CPUID_CLFSH) == 0)) {
 		pmap_invalidate_cache();
 		return;
 	}
 
 	/* Fence before the flush loop ... */
 	if (have_clflushopt)
 		sfence();
 	else if (cpu_vendor_id != CPU_VENDOR_INTEL)
 		mfence();
 
 	for (idx = 0; idx < count; idx++) {
 		line = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pages[idx]));
 		page_end = line + PAGE_SIZE;
 		while (line < page_end) {
 			if (have_clflushopt)
 				clflushopt(line);
 			else
 				clflush(line);
 			line += cpu_clflush_line_size;
 		}
 	}
 
 	/* ... and the matching fence after it. */
 	if (have_clflushopt)
 		sfence();
 	else if (cpu_vendor_id != CPU_VENDOR_INTEL)
 		mfence();
 }
 
 /*
  * Write back the cache lines covering the page-aligned range [sva, eva).
  * CLWB is used when the CPU advertises it; otherwise the work is handed
  * to pmap_force_invalidate_cache_range().
  */
 void
 pmap_flush_cache_range(vm_offset_t sva, vm_offset_t eva)
 {
 
 	pmap_invalidate_cache_range_check_align(sva, eva);
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0) {
 		/*
 		 * Skip the local APIC page; see the comment in
 		 * pmap_force_invalidate_cache_range().
 		 */
 		if (pmap_kextract(sva) != lapic_paddr) {
 			sfence();
 			while (sva < eva) {
 				clwb(sva);
 				sva += cpu_clflush_line_size;
 			}
 			sfence();
 		}
 	} else {
 		pmap_force_invalidate_cache_range(sva, eva);
 	}
 }
 
 /*
  * Flush the cache for the page-aligned physical range [spa, epa).
  *
  * The part of the range below dmaplimit is flushed through the direct
  * map.  Any remainder is flushed one page at a time through a single page
  * of kernel virtual address space allocated from kernel_arena, mapped
  * with the cache bits that correspond to "mattr".
  */
 void
 pmap_flush_cache_phys_range(vm_paddr_t spa, vm_paddr_t epa, vm_memattr_t mattr)
 {
 	pt_entry_t *pte;
 	vm_offset_t vaddr;
 	int error, pte_bits;
 
 	KASSERT((spa & PAGE_MASK) == 0,
 	    ("pmap_flush_cache_phys_range: spa not page-aligned"));
 	KASSERT((epa & PAGE_MASK) == 0,
 	    ("pmap_flush_cache_phys_range: epa not page-aligned"));
 
 	if (spa < dmaplimit) {
 		/* Flush what the direct map covers; stop if that is all. */
 		pmap_flush_cache_range(PHYS_TO_DMAP(spa), PHYS_TO_DMAP(MIN(
 		    dmaplimit, epa)));
 		if (dmaplimit >= epa)
 			return;
 		spa = dmaplimit;
 	}
 
 	pte_bits = pmap_cache_bits(kernel_pmap, mattr, 0) | X86_PG_RW |
 	    X86_PG_V;
 	/* One page of KVA is reused for every remaining physical page. */
 	error = vmem_alloc(kernel_arena, PAGE_SIZE, M_BESTFIT | M_WAITOK,
 	    &vaddr);
 	KASSERT(error == 0, ("vmem_alloc failed: %d", error));
 	pte = vtopte(vaddr);
 	for (; spa < epa; spa += PAGE_SIZE) {
 		/*
 		 * Stay on one processor while the temporary mapping is
 		 * installed, flushed from the local TLB, and used.
 		 */
 		sched_pin();
 		pte_store(pte, spa | pte_bits);
 		invlpg(vaddr);
 		/* XXXKIB sfences inside flush_cache_range are excessive */
 		pmap_flush_cache_range(vaddr, vaddr + PAGE_SIZE);
 		sched_unpin();
 	}
 	vmem_free(kernel_arena, vaddr, PAGE_SIZE);
 }
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Return the physical address that the given pmap maps
  *		the given virtual address to, or 0 if a valid mapping
  *		is not found at the PDP or PD level.  The walk is done
  *		with the pmap lock held and handles PG_PS mappings at
  *		both of those levels.
  */
 vm_paddr_t 
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_V;
 	vm_paddr_t pa;
 
 	pa = 0;
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 
 	pdpe = pmap_pdpe(pmap, va);
 	if (pdpe == NULL || (*pdpe & PG_V) == 0)
 		goto out;
 	if ((*pdpe & PG_PS) != 0) {
 		/* Large mapping at the PDP level. */
 		pa = (*pdpe & PG_PS_FRAME) | (va & PDPMASK);
 		goto out;
 	}
 
 	pde = pmap_pdpe_to_pde(pdpe, va);
 	if ((*pde & PG_V) == 0)
 		goto out;
 	if ((*pde & PG_PS) != 0) {
 		/* Large mapping at the PD level. */
 		pa = (*pde & PG_PS_FRAME) | (va & PDRMASK);
 	} else {
 		/* 4KB mapping: the PTE is used without a validity check. */
 		pte = pmap_pde_to_pte(pde, va);
 		pa = (*pte & PG_FRAME) | (va & PAGE_MASK);
 	}
 out:
 	PMAP_UNLOCK(pmap);
 	return (pa);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Atomically extract and hold the physical page
  *		with the given pmap and virtual address pair
  *		if that mapping permits the given protection.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pd_entry_t pde, *pdep;
 	pt_entry_t pte, PG_RW, PG_V;
 	vm_paddr_t pa;
 	vm_page_t m;
 
 	pa = 0;
 	m = NULL;
 	PG_RW = pmap_rw_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 retry:
 	pdep = pmap_pde(pmap, va);
 	if (pdep != NULL && (pde = *pdep)) {
 		if (pde & PG_PS) {
 			/*
 			 * 2MB mapping: acceptable if it is writable or the
 			 * caller did not ask for write access.
 			 */
 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 				/*
 				 * A non-zero return restarts the lookup;
 				 * presumably the pmap lock was dropped while
 				 * acquiring the page lock -- confirm against
 				 * vm_page_pa_tryrelock().
 				 */
 				if (vm_page_pa_tryrelock(pmap, (pde &
 				    PG_PS_FRAME) | (va & PDRMASK), &pa))
 					goto retry;
 				m = PHYS_TO_VM_PAGE(pa);
 			}
 		} else {
 			/* 4KB mapping: same protection test on the PTE. */
 			pte = *pmap_pde_to_pte(pdep, va);
 			if ((pte & PG_V) &&
 			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
 				    &pa))
 					goto retry;
 				m = PHYS_TO_VM_PAGE(pa);
 			}
 		}
 		/* The page is returned wired. */
 		if (m != NULL)
 			vm_page_wire(m);
 	}
 	PA_UNLOCK_COND(pa);
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /*
  * Return the physical address that backs the kernel virtual address "va".
  * Direct map and large map addresses are translated by their own helpers;
  * every other address is translated through the kernel page table.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	pd_entry_t pde;
 	vm_paddr_t pa;
 
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 		pa = DMAP_TO_PHYS(va);
 		return (pa);
 	}
 	if (LARGEMAP_MIN_ADDRESS <= va &&
 	    va < PMAP_LARGEMAP_MAX_ADDRESS()) {
 		pa = pmap_large_map_kextract(va);
 		return (pa);
 	}
 
 	pde = *vtopde(va);
 	if (pde & PG_PS) {
 		pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 		return (pa);
 	}
 
 	/*
 	 * Beware of a concurrent promotion that changes the PDE at this
 	 * point!  For example, vtopte() must not be used to access the
 	 * PTE because it would use the new PDE.  It is, however, safe to
 	 * use the old PDE because the page table page is preserved by the
 	 * promotion.
 	 */
 	pa = *pmap_pde_to_pte(&pde, va);
 	pa = (pa & PG_FRAME) | (va & PAGE_MASK);
 	return (pa);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Map the physical page "pa" at kernel virtual address "va" with a
  * read/write, valid PTE that also carries the pg_g bits.
  * Note: not SMP coherent.
  */
 PMAP_INLINE void 
 pmap_kenter(vm_offset_t va, vm_paddr_t pa)
 {
 
 	pte_store(vtopte(va), pa | X86_PG_RW | X86_PG_V | pg_g);
 }
 
 /*
  * Like pmap_kenter(), but the PTE also carries the cache bits that
  * pmap_cache_bits() returns for the caching mode "mode".
  */
 static __inline void
 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 {
 	pt_entry_t newpte, *ptep;
 
 	ptep = vtopte(va);
 	newpte = pa | X86_PG_RW | X86_PG_V | pg_g;
 	newpte |= pmap_cache_bits(kernel_pmap, mode, 0);
 	pte_store(ptep, newpte);
 }
 
 /*
  * Clear the kernel PTE that maps "va".
  * Note: not SMP coherent.
  */
 PMAP_INLINE void
 pmap_kremove(vm_offset_t va)
 {
 
 	pte_clear(vtopte(va));
 }
 
 /*
  *	Used to map a range of physical addresses into kernel
  *	virtual address space.
  *
  *	The value passed in '*virt' is a suggested virtual address for
  *	the mapping.  Architectures that support a direct-mapped
  *	physical to virtual region can return the appropriate address
  *	within that region, leaving '*virt' unchanged; other
  *	architectures map the pages starting at '*virt' and update
  *	'*virt' with the first usable address after the mapped region.
  *
  *	This implementation returns the direct map address of 'start'
  *	and uses none of its other arguments.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 
 	return (PHYS_TO_DMAP(start));
 }
 
 
 /*
  * Add a list of wired pages to the kva.  This routine is only used for
  * temporary kernel mappings that do not need to have page modification
  * or references recorded.  Old mappings are simply written over.  The
  * pages *must* be wired.
  *
  * A PTE whose frame and cache bits already match is left untouched.  The
  * range is invalidated only if at least one overwritten PTE was valid.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pt_entry_t *limit, newpa, *ptep, replaced;
 	vm_page_t pg;
 	int cache_bits;
 
 	replaced = 0;
 	ptep = vtopte(sva);
 	limit = ptep + count;
 	for (; ptep < limit; ptep++, ma++) {
 		pg = *ma;
 		cache_bits = pmap_cache_bits(kernel_pmap, pg->md.pat_mode, 0);
 		newpa = VM_PAGE_TO_PHYS(pg) | cache_bits;
 		if ((*ptep & (PG_FRAME | X86_PG_PTE_CACHE)) == newpa)
 			continue;
 		/* Accumulate the old bits to learn whether any was valid. */
 		replaced |= *ptep;
 		pte_store(ptep, newpa | pg_g | pg_nx | X86_PG_RW | X86_PG_V);
 	}
 	if (__predict_false((replaced & X86_PG_V) != 0))
 		pmap_invalidate_range(kernel_pmap, sva, sva + count *
 		    PAGE_SIZE);
 }
 
 /*
  * Tear out "count" page mappings from the kernel, starting at "sva".
  * This routine is meant only for temporary mappings.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 
 	for (va = sva; count > 0; count--, va += PAGE_SIZE) {
 		KASSERT(va >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", va));
 		pmap_kremove(va);
 	}
 	/* "va" is now one page past the last mapping removed. */
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 /*
  * Schedule the specified unused page table page to be freed: insert it
  * at the head of the given list of pages that will be released to the
  * physical memory manager after the TLB has been updated.  The page's
  * PG_ZERO flag is set or cleared according to "set_PG_ZERO".
  */
 static __inline void
 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
     boolean_t set_PG_ZERO)
 {
 
 	if (!set_PG_ZERO)
 		m->flags &= ~PG_ZERO;
 	else
 		m->flags |= PG_ZERO;
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
 	
 /*
  * Inserts the specified page table page into the specified pmap's collection
  * of idle page table pages.  Each of a pmap's page table pages is responsible
  * for mapping a distinct range of virtual addresses.  The pmap's collection is
  * ordered by this virtual address range.
  *
  * If "promoted" is false, then the page table page "mpte" must be zero filled.
  */
 static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/* The valid field is set from "promoted" before the insertion. */
 	if (promoted)
 		mpte->valid = VM_PAGE_BITS_ALL;
 	else
 		mpte->valid = 0;
 	return (vm_radix_insert(&pmap->pm_root, mpte));
 }
 
 /*
  * Removes the page table page mapping the specified virtual address from the
  * specified pmap's collection of idle page table pages, and returns it.
  * Otherwise, returns NULL if there is no page table page corresponding to the
  * specified virtual address.
  */
 static __inline vm_page_t
 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/* The collection is keyed by the page directory index of "va". */
 	return (vm_radix_remove(&pmap->pm_root, pmap_pde_pindex(va)));
 }
 
 /*
  * Drop one reference from a page table page's wire count, which is used
  * to record the number of valid page table entries within the page.  When
  * the count reaches zero the page table page is unmapped and TRUE is
  * returned; otherwise FALSE is returned.
  */
 static inline boolean_t
 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	if (--m->wire_count != 0)
 		return (FALSE);
 	_pmap_unwire_ptp(pmap, va, m, free);
 	return (TRUE);
 }
 
 /*
  * Unmap the page table page "m", whose wire count has dropped to zero.
  *
  * The paging-structure entry that refers to "m" is cleared; which level
  * that is follows from m->pindex.  The pmap's resident count is reduced
  * by one, the wire count of the parent page table page (if any) is
  * dropped in turn, and "m" is queued on "free" with PG_ZERO set.
  */
 static void
 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/*
 	 * unmap the page table page
 	 */
 	if (m->pindex >= (NUPDE + NUPDPE)) {
 		/* PDP page */
 		pml4_entry_t *pml4;
 		pml4 = pmap_pml4e(pmap, va);
 		*pml4 = 0;
 		/* Also clear the entry in the user PML4, when one exists. */
 		if (pmap->pm_pml4u != NULL && va <= VM_MAXUSER_ADDRESS) {
 			pml4 = &pmap->pm_pml4u[pmap_pml4e_index(va)];
 			*pml4 = 0;
 		}
 	} else if (m->pindex >= NUPDE) {
 		/* PD page */
 		pdp_entry_t *pdp;
 		pdp = pmap_pdpe(pmap, va);
 		*pdp = 0;
 	} else {
 		/* PTE page */
 		pd_entry_t *pd;
 		pd = pmap_pde(pmap, va);
 		*pd = 0;
 	}
 	pmap_resident_count_dec(pmap, 1);
 	if (m->pindex < NUPDE) {
 		/* We just released a PT, unhold the matching PD */
 		vm_page_t pdpg;
 
 		pdpg = PHYS_TO_VM_PAGE(*pmap_pdpe(pmap, va) & PG_FRAME);
 		pmap_unwire_ptp(pmap, va, pdpg, free);
 	}
 	if (m->pindex >= NUPDE && m->pindex < (NUPDE + NUPDPE)) {
 		/* We just released a PD, unhold the matching PDP */
 		vm_page_t pdppg;
 
 		pdppg = PHYS_TO_VM_PAGE(*pmap_pml4e(pmap, va) & PG_FRAME);
 		pmap_unwire_ptp(pmap, va, pdppg, free);
 	}
 
 	/* 
 	 * Put page on a list so that it is released after
 	 * *ALL* TLB shootdown is done
 	 */
 	pmap_add_delayed_free_list(m, free, TRUE);
 }
 
 /*
  * After removing a page table entry, this routine is used to
  * conditionally free the page table page and manage its wire count.
  * Nothing is done, and 0 is returned, for addresses at or above
  * VM_MAXUSER_ADDRESS; otherwise the result of pmap_unwire_ptp() on the
  * page table page referenced by "ptepde" is returned.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
     struct spglist *free)
 {
 
 	if (va >= VM_MAXUSER_ADDRESS)
 		return (0);
 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 	return (pmap_unwire_ptp(pmap, va,
 	    PHYS_TO_VM_PAGE(ptepde & PG_FRAME), free));
 }
 
 /*
  * Initialize "pmap" so that it uses the kernel's top-level page table
  * (KPML4phys) and has no user page table root, then activate it with
  * pmap_activate_boot().  Presumably this is the pmap of the initial
  * process -- confirm against the caller.
  *
  * When "pti" is set, the current thread's process is flagged P_MD_KPTI.
  * When the CPU advertises PKU, the UMA zone for pkru ranges is created
  * here as well.
  */
 void
 pmap_pinit0(pmap_t pmap)
 {
 	struct proc *p;
 	struct thread *td;
 	int i;
 
 	PMAP_LOCK_INIT(pmap);
 	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(KPML4phys);
 	pmap->pm_pml4u = NULL;
 	pmap->pm_cr3 = KPML4phys;
 	/* hack to keep pmap_pti_pcid_invalidate() alive */
 	pmap->pm_ucr3 = PMAP_NO_CR3;
 	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap->pm_flags = pmap_flags;
 	/* Every CPU starts with PCID PMAP_PCID_KERN + 1 and generation 1. */
 	CPU_FOREACH(i) {
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_KERN + 1;
 		pmap->pm_pcids[i].pm_gen = 1;
 	}
 	pmap_activate_boot(pmap);
 	td = curthread;
 	if (pti) {
 		p = td->td_proc;
 		PROC_LOCK(p);
 		p->p_md.md_flags |= P_MD_KPTI;
 		PROC_UNLOCK(p);
 	}
 	pmap_thread_init_invl_gen(td);
 
 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
 		pmap_pkru_ranges_zone = uma_zcreate("pkru ranges",
 		    sizeof(struct pmap_pkru_range), NULL, NULL, NULL, NULL,
 		    UMA_ALIGN_PTR, 0);
 	}
 }
 
 /*
  * Fill in the kernel portion of the PML4 page "pml4pg": the kernel address
  * space entries, the direct map entries, the self-referential entry, and
  * the large map entries copied from the kernel pmap.
  */
 void
 pmap_pinit_pml4(vm_page_t pml4pg)
 {
 	pml4_entry_t *pml4;
 	int idx;
 
 	pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
 
 	/* Wire in kernel global address entries. */
 	idx = 0;
 	while (idx < NKPML4E) {
 		pml4[KPML4BASE + idx] = (KPDPphys + ptoa(idx)) | X86_PG_RW |
 		    X86_PG_V;
 		idx++;
 	}
 
 	/* Direct map entries. */
 	idx = 0;
 	while (idx < ndmpdpphys) {
 		pml4[DMPML4I + idx] = (DMPDPphys + ptoa(idx)) | X86_PG_RW |
 		    X86_PG_V;
 		idx++;
 	}
 
 	/* install self-referential address mapping entry(s) */
 	pml4[PML4PML4I] = VM_PAGE_TO_PHYS(pml4pg) | X86_PG_V | X86_PG_RW |
 	    X86_PG_A | X86_PG_M;
 
 	/* install large map entries if configured */
 	idx = 0;
 	while (idx < lm_ents) {
 		pml4[LMSPML4I + idx] = kernel_pmap->pm_pml4[LMSPML4I + idx];
 		idx++;
 	}
 }
 
 /*
  * Fill the PML4 page "pml4pg" with a copy of all NPML4EPG entries of the
  * pti_pml4 table.
  */
 static void
 pmap_pinit_pml4_pti(vm_page_t pml4pg)
 {
 	pml4_entry_t *dst;
 	int slot;
 
 	/* Reach the page through the direct map. */
 	dst = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4pg));
 
 	slot = 0;
 	while (slot < NPML4EPG) {
 		dst[slot] = pti_pml4[slot];
 		slot++;
 	}
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  *
  * A wired, zero-filled page is allocated for the top-level (PML4) table;
  * the allocation is made with VM_ALLOC_WAITOK.  For PT_X86 pmaps the
  * kernel entries are installed by pmap_pinit_pml4(), a second PML4 page
  * filled by pmap_pinit_pml4_pti() is allocated when the current process
  * has P_MD_KPTI set, and the PKRU rangeset is initialized when the CPU
  * advertises PKU.  For any other pmap type, pm_cr3 stays at PMAP_NO_CR3
  * and no kernel entries are installed.
  *
  * "pm_type" is recorded in pm_type and "flags" in pm_flags.  Always
  * returns 1.
  */
 int
 pmap_pinit_type(pmap_t pmap, enum pmap_type pm_type, int flags)
 {
 	vm_page_t pml4pg, pml4pgu;
 	vm_paddr_t pml4phys;
 	int i;
 
 	/*
 	 * Allocate the top-level (PML4) page table page.
 	 */
 	pml4pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
 
 	pml4phys = VM_PAGE_TO_PHYS(pml4pg);
 	pmap->pm_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(pml4phys);
 	/* No PCID has been assigned on any CPU yet. */
 	CPU_FOREACH(i) {
 		pmap->pm_pcids[i].pm_pcid = PMAP_PCID_NONE;
 		pmap->pm_pcids[i].pm_gen = 0;
 	}
 	pmap->pm_cr3 = PMAP_NO_CR3;	/* initialize to an invalid value */
 	pmap->pm_ucr3 = PMAP_NO_CR3;
 	pmap->pm_pml4u = NULL;
 
 	pmap->pm_type = pm_type;
 	/* VM_ALLOC_ZERO is only a preference; zero the page if needed. */
 	if ((pml4pg->flags & PG_ZERO) == 0)
 		pagezero(pmap->pm_pml4);
 
 	/*
 	 * Do not install the host kernel mappings in the nested page
 	 * tables. These mappings are meaningless in the guest physical
 	 * address space.
 	 * Install minimal kernel mappings in PTI case.
 	 */
 	if (pm_type == PT_X86) {
 		pmap->pm_cr3 = pml4phys;
 		pmap_pinit_pml4(pml4pg);
 		if ((curproc->p_md.md_flags & P_MD_KPTI) != 0) {
 			/*
 			 * This page is not requested zeroed; every entry
 			 * is overwritten by pmap_pinit_pml4_pti().
 			 */
 			pml4pgu = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
 			pmap->pm_pml4u = (pml4_entry_t *)PHYS_TO_DMAP(
 			    VM_PAGE_TO_PHYS(pml4pgu));
 			pmap_pinit_pml4_pti(pml4pgu);
 			pmap->pm_ucr3 = VM_PAGE_TO_PHYS(pml4pgu);
 		}
 		if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0) {
 			rangeset_init(&pmap->pm_pkru, pkru_dup_range,
 			    pkru_free_range, pmap, M_NOWAIT);
 		}
 	}
 
 	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap->pm_flags = flags;
 	pmap->pm_eptgen = 0;
 
 	return (1);
 }
 
 /*
  * Initialize "pmap" as a PT_X86 pmap using the global pmap_flags.
  * Returns whatever pmap_pinit_type() returns.
  */
 int
 pmap_pinit(pmap_t pmap)
 {
 	int rv;
 
 	rv = pmap_pinit_type(pmap, PT_X86, pmap_flags);
 	return (rv);
 }
 
 /*
  * This routine is called if the desired page table page does not exist.
  *
  * "ptepindex" selects the level of the page being allocated:
  *   ptepindex >= NUPDE + NUPDPE	a PDP page, entered into the PML4
  *   ptepindex >= NUPDE		a PD page, entered into a PDP page
  *   otherwise			a PT page, entered into a PD page
  * Missing parent pages are allocated by recursion; existing parent pages
  * gain one wire_count reference.
  *
  * Returns the new, wired page after bumping the pmap's resident count,
  * or NULL if the allocation must be retried.  The pmap lock must be held
  * by the caller.
  *
  * If page table page allocation fails, this routine may sleep before
  * returning NULL.  It sleeps only if a lock pointer was given.
  *
  * Note: If a page allocation fails at page table level two or three,
  * one or two pages may be held during the wait, only to be released
  * afterwards.  This conservative approach is easily argued to avoid
  * race conditions.
  */
 static vm_page_t
 _pmap_allocpte(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 {
 	vm_page_t m, pdppg, pdpg;
 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* The bit encodings are obtained from the pmap itself. */
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (lockp != NULL) {
 			/* Drop the locks held before sleeping for memory. */
 			RELEASE_PV_LIST_LOCK(lockp);
 			PMAP_UNLOCK(pmap);
 			PMAP_ASSERT_NOT_IN_DI();
 			vm_wait(NULL);
 			PMAP_LOCK(pmap);
 		}
 
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	if (ptepindex >= (NUPDE + NUPDPE)) {
 		pml4_entry_t *pml4, *pml4u;
 		vm_pindex_t pml4index;
 
 		/* Wire up a new PDPE page */
 		pml4index = ptepindex - (NUPDE + NUPDPE);
 		pml4 = &pmap->pm_pml4[pml4index];
 		*pml4 = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 		if (pmap->pm_pml4u != NULL && pml4index < NUPML4E) {
 			/*
 			 * PTI: Make all user-space mappings in the
 			 * kernel-mode page table no-execute so that
 			 * we detect any programming errors that leave
 			 * the kernel-mode page table active on return
 			 * to user space.
 			 */
 			if (pmap->pm_ucr3 != PMAP_NO_CR3)
 				*pml4 |= pg_nx;
 
 			/* Mirror the entry, without pg_nx, in pm_pml4u. */
 			pml4u = &pmap->pm_pml4u[pml4index];
 			*pml4u = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V |
 			    PG_A | PG_M;
 		}
 
 	} else if (ptepindex >= NUPDE) {
 		vm_pindex_t pml4index;
 		vm_pindex_t pdpindex;
 		pml4_entry_t *pml4;
 		pdp_entry_t *pdp;
 
 		/* Wire up a new PDE page */
 		pdpindex = ptepindex - NUPDE;
 		pml4index = pdpindex >> NPML4EPGSHIFT;
 
 		pml4 = &pmap->pm_pml4[pml4index];
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pdp, recurse */
 			if (_pmap_allocpte(pmap, NUPDE + NUPDPE + pml4index,
 			    lockp) == NULL) {
 				/* Undo our own allocation before failing. */
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
 				return (NULL);
 			}
 		} else {
 			/* Add reference to pdp page */
 			pdppg = PHYS_TO_VM_PAGE(*pml4 & PG_FRAME);
 			pdppg->wire_count++;
 		}
 		pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 
 		/* Now find the pdp page */
 		pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 		*pdp = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 
 	} else {
 		vm_pindex_t pml4index;
 		vm_pindex_t pdpindex;
 		pml4_entry_t *pml4;
 		pdp_entry_t *pdp;
 		pd_entry_t *pd;
 
 		/* Wire up a new PTE page */
 		pdpindex = ptepindex >> NPDPEPGSHIFT;
 		pml4index = pdpindex >> NPML4EPGSHIFT;
 
 		/* First, find the pdp and check that its valid. */
 		pml4 = &pmap->pm_pml4[pml4index];
 		if ((*pml4 & PG_V) == 0) {
 			/* Have to allocate a new pd, recurse */
 			if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 			    lockp) == NULL) {
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
 				return (NULL);
 			}
 			/* The recursion filled in *pml4; locate the pdp entry. */
 			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 		} else {
 			pdp = (pdp_entry_t *)PHYS_TO_DMAP(*pml4 & PG_FRAME);
 			pdp = &pdp[pdpindex & ((1ul << NPDPEPGSHIFT) - 1)];
 			if ((*pdp & PG_V) == 0) {
 				/* Have to allocate a new pd, recurse */
 				if (_pmap_allocpte(pmap, NUPDE + pdpindex,
 				    lockp) == NULL) {
 					vm_page_unwire_noq(m);
 					vm_page_free_zero(m);
 					return (NULL);
 				}
 			} else {
 				/* Add reference to the pd page */
 				pdpg = PHYS_TO_VM_PAGE(*pdp & PG_FRAME);
 				pdpg->wire_count++;
 			}
 		}
 		pd = (pd_entry_t *)PHYS_TO_DMAP(*pdp & PG_FRAME);
 
 		/* Now we know where the page directory page is */
 		pd = &pd[ptepindex & ((1ul << NPDEPGSHIFT) - 1)];
 		*pd = VM_PAGE_TO_PHYS(m) | PG_U | PG_RW | PG_V | PG_A | PG_M;
 	}
 
 	/* Account for the newly installed page table page. */
 	pmap_resident_count_inc(pmap, 1);
 
 	return (m);
 }
 
 /*
  * Return the page directory page that covers "va", adding one reference
  * to it if it already exists and allocating it otherwise.  When "lockp"
  * is non-NULL a failed allocation is retried until it succeeds; when it
  * is NULL the function may return NULL.
  */
 static vm_page_t
 pmap_allocpde(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	pdp_entry_t *pdpe, PG_V;
 	vm_page_t pdpg;
 	vm_pindex_t pdpindex;
 
 	PG_V = pmap_valid_bit(pmap);
 
 	for (;;) {
 		pdpe = pmap_pdpe(pmap, va);
 		if (pdpe != NULL && (*pdpe & PG_V) != 0) {
 			/* The pd page exists; just take a reference. */
 			pdpg = PHYS_TO_VM_PAGE(*pdpe & PG_FRAME);
 			pdpg->wire_count++;
 			return (pdpg);
 		}
 
 		/* No pd page yet; try to allocate one. */
 		pdpindex = pmap_pde_pindex(va) >> NPDPEPGSHIFT;
 		pdpg = _pmap_allocpte(pmap, NUPDE + pdpindex, lockp);
 		if (pdpg != NULL || lockp == NULL)
 			return (pdpg);
 		/* Allocation failed and sleeping was allowed: look again. */
 	}
 }
 
 /*
  * Return the page table page that maps "va", adding one reference to it
  * if it already exists and allocating it otherwise.  A valid 2MB mapping
  * found at "va" is demoted first.  When "lockp" is non-NULL a failed
  * allocation is retried until it succeeds; when it is NULL the function
  * may return NULL.
  */
 static vm_page_t
 pmap_allocpte(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	pd_entry_t *pd, PG_V;
 	vm_page_t m;
 	vm_pindex_t ptepindex;
 
 	PG_V = pmap_valid_bit(pmap);
 
 	/* Index of the page table page that covers "va". */
 	ptepindex = pmap_pde_pindex(va);
 
 	do {
 		pd = pmap_pde(pmap, va);
 
 		/*
 		 * Switch a 2MB page mapping to 4KB page mappings.  If the
 		 * demotion fails, the mapping was invalidated, which may
 		 * have deallocated the underlying PD page, so forget "pd".
 		 */
 		if (pd != NULL && (*pd & (PG_PS | PG_V)) == (PG_PS | PG_V) &&
 		    !pmap_demote_pde_locked(pmap, pd, va, lockp))
 			pd = NULL;
 
 		if (pd != NULL && (*pd & PG_V) != 0) {
 			/* The page table page is mapped; add a reference. */
 			m = PHYS_TO_VM_PAGE(*pd & PG_FRAME);
 			m->wire_count++;
 			break;
 		}
 
 		/*
 		 * The page table page is not mapped, or it has been
 		 * deallocated.
 		 */
 		m = _pmap_allocpte(pmap, ptepindex, lockp);
 	} while (m == NULL && lockp != NULL);
 
 	return (m);
 }
 
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Give back the resources of a pmap that was set up by pmap_pinit.
  * The pmap must no longer contain any valid mappings: its resident
  * count must be zero, it must hold no reserved page table pages, and
  * it must not be active on any CPU.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	pml4_entry_t *pml4;
 	vm_page_t pg;
 	int idx;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap has reserved page table page(s)"));
 	KASSERT(CPU_EMPTY(&pmap->pm_active),
 	    ("releasing active pmap %p", pmap));
 
 	pml4 = pmap->pm_pml4;
 	pg = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pml4));
 
 	/* Clear the kernel-owned slots before the page is freed as zeroed. */
 	for (idx = 0; idx < NKPML4E; idx++)		/* KVA */
 		pml4[KPML4BASE + idx] = 0;
 	for (idx = 0; idx < ndmpdpphys; idx++)		/* Direct Map */
 		pml4[DMPML4I + idx] = 0;
 	pml4[PML4PML4I] = 0;				/* Recursive Mapping */
 	for (idx = 0; idx < lm_ents; idx++)		/* Large Map */
 		pml4[LMSPML4I + idx] = 0;
 
 	vm_page_unwire_noq(pg);
 	vm_page_free_zero(pg);
 
 	/* Release the second PML4 page, if one was allocated. */
 	if (pmap->pm_pml4u != NULL) {
 		pg = PHYS_TO_VM_PAGE(DMAP_TO_PHYS(
 		    (vm_offset_t)pmap->pm_pml4u));
 		vm_page_unwire_noq(pg);
 		vm_page_free(pg);
 	}
 
 	/* Tear down the PKRU rangeset under the conditions it was set up. */
 	if (pmap->pm_type == PT_X86 &&
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0)
 		rangeset_fini(&pmap->pm_pkru);
 }
 
 /*
  * Sysctl handler for vm.kvm_size: reports the distance between
  * VM_MIN_KERNEL_ADDRESS and VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long kva_bytes;
 
 	kva_bytes = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 	return (sysctl_handle_long(oidp, &kva_bytes, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD,
     0, 0, kvm_size, "LU", "Size of KVM");
 
 /*
  * Sysctl handler for vm.kvm_free: reports the distance between
  * kernel_vm_end and VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long kva_left;
 
 	kva_left = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 	return (sysctl_handle_long(oidp, &kva_left, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD,
     0, 0, kvm_free, "LU", "Amount of KVM free");
 
 /*
  * grow the number of kernel page table entries, if needed
  *
  * Advances "kernel_vm_end" in NBPDR steps until it reaches "addr"
  * (rounded up to NBPDR and clamped to the end of kernel_map), allocating
  * page directory and page table pages for the kernel pmap as required.
  * Panics if a page cannot be allocated.  The kernel map's system mutex
  * must be held by the caller.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	vm_paddr_t paddr;
 	vm_page_t nkpg;
 	pd_entry_t *pde, newpdir;
 	pdp_entry_t *pdpe;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 
 	/*
 	 * Return if "addr" is within the range of kernel page table pages
 	 * that were preallocated during pmap bootstrap.  Moreover, leave
 	 * "kernel_vm_end" and the kernel page table as they were.
 	 *
 	 * The correctness of this action is based on the following
 	 * argument: vm_map_insert() allocates contiguous ranges of the
 	 * kernel virtual address space.  It calls this function if a range
 	 * ends after "kernel_vm_end".  If the kernel is mapped between
 	 * "kernel_vm_end" and "addr", then the range cannot begin at
 	 * "kernel_vm_end".  In fact, its beginning address cannot be less
 	 * than the kernel.  Thus, there is no immediate need to allocate
 	 * any new kernel page table pages between "kernel_vm_end" and
 	 * "KERNBASE".
 	 */
 	if (KERNBASE < addr && addr <= KERNBASE + nkpt * NBPDR)
 		return;
 
 	addr = roundup2(addr, NBPDR);
 	/* "addr - 1" also catches a roundup2() result that wrapped to 0. */
 	if (addr - 1 >= vm_map_max(kernel_map))
 		addr = vm_map_max(kernel_map);
 	while (kernel_vm_end < addr) {
 		pdpe = pmap_pdpe(kernel_pmap, kernel_vm_end);
 		if ((*pdpe & X86_PG_V) == 0) {
 			/* We need a new PDP entry */
 			nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDPSHIFT,
 			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 			if (nkpg == NULL)
 				panic("pmap_growkernel: no memory to grow kernel");
 			if ((nkpg->flags & PG_ZERO) == 0)
 				pmap_zero_page(nkpg);
 			paddr = VM_PAGE_TO_PHYS(nkpg);
 			*pdpe = (pdp_entry_t)(paddr | X86_PG_V | X86_PG_RW |
 			    X86_PG_A | X86_PG_M);
 			continue; /* try again */
 		}
 		pde = pmap_pdpe_to_pde(pdpe, kernel_vm_end);
 		if ((*pde & X86_PG_V) != 0) {
 			/* Already populated; step to the next 2MB boundary. */
 			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 				kernel_vm_end = vm_map_max(kernel_map);
 				break;                       
 			}
 			continue;
 		}
 
 		/* Allocate and install a new page table page for this PDE. */
 		nkpg = vm_page_alloc(NULL, pmap_pde_pindex(kernel_vm_end),
 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 		    VM_ALLOC_ZERO);
 		if (nkpg == NULL)
 			panic("pmap_growkernel: no memory to grow kernel");
 		if ((nkpg->flags & PG_ZERO) == 0)
 			pmap_zero_page(nkpg);
 		paddr = VM_PAGE_TO_PHYS(nkpg);
 		newpdir = paddr | X86_PG_V | X86_PG_RW | X86_PG_A | X86_PG_M;
 		pde_store(pde, newpdir);
 
 		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 			kernel_vm_end = vm_map_max(kernel_map);
 			break;                       
 		}
 	}
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 /*
  * Compile-time checks on the pv chunk layout: a chunk is exactly one page
  * in size, _NPCM is 3 and _NPCPV is 168.
  */
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 3);
 CTASSERT(_NPCPV == 168);
 
 /*
  * Return the pv chunk that contains "pv": the page-aligned address
  * obtained by clearing the PAGE_MASK bits of the pv entry's address.
  */
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 	uintptr_t base;
 
 	base = (uintptr_t)pv & ~(uintptr_t)PAGE_MASK;
 	return ((struct pv_chunk *)base);
 }
 
 /* The pmap recorded in the chunk that contains the given pv entry. */
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 /*
  * Values of the three pc_map[] words of a chunk whose pv entries are all
  * free; a set bit marks a free entry.  Together the masks contain
  * 64 + 64 + 40 = 168 one bits.
  */
 #define	PC_FREE0	0xfffffffffffffffful
 #define	PC_FREE1	0xfffffffffffffffful
 #define	PC_FREE2	0x000000fffffffffful
 
 /* The same masks, indexed by pc_map[] word. */
 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 
 #ifdef PV_STATS
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 	"Current number of pv entry chunks");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 	"Current number of pv entry chunks allocated");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 	"Current number of pv entry chunks frees");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 	"Number of times tried to get a chunk page but failed.");
 
 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 static int pv_entry_spare;
 
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 	"Current number of pv entry frees");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 	"Current number of pv entry allocs");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 	"Current number of pv entries");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
 #endif
 
 /*
  * Finish reclamation work on "pmap": invalidate all of its TLB entries,
  * unlock it unless it is the caller's own "locked_pmap", and close the
  * delayed invalidation block if "start_di" says one was opened.  Does
  * nothing when "pmap" is NULL.
  */
 static void
 reclaim_pv_chunk_leave_pmap(pmap_t pmap, pmap_t locked_pmap, bool start_di)
 {
 
 	if (pmap != NULL) {
 		pmap_invalidate_all(pmap);
 		if (pmap != locked_pmap)
 			PMAP_UNLOCK(pmap);
 		if (start_di)
 			pmap_delayed_invl_finish();
 	}
 }
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.
  *
  * Returns NULL if PV entries were reclaimed from the specified pmap.
  * Otherwise, returns a page for the caller to reuse: either the page of
  * a pv chunk that became entirely free or a page table page that was
  * freed as a side effect, with its wire count set to 1.  NULL is also
  * returned if the scan ends without freeing a page.
  *
  * "locked_pmap" must be locked by the caller and "lockp" must not be
  * NULL; the PV list lock it refers to may be released or changed.
  *
  * We do not, however, unmap 2mpages because subsequent accesses will
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
  */
 static vm_page_t
 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 {
 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
 	struct md_page *pvh;
 	pd_entry_t *pde;
 	pmap_t next_pmap, pmap;
 	pt_entry_t *pte, tpte;
 	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 	pv_entry_t pv;
 	vm_offset_t va;
 	vm_page_t m, m_pc;
 	struct spglist free;
 	uint64_t inuse;
 	int bit, field, freed;
 	bool start_di;
 	static int active_reclaims = 0;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
 	pmap = NULL;
 	m_pc = NULL;
 	PG_G = PG_A = PG_M = PG_RW = 0;
 	SLIST_INIT(&free);
 	/*
 	 * The markers are zeroed chunk headers on the stack, so their
 	 * pc_pmap is NULL; that is how the scan below recognizes a marker.
 	 */
 	bzero(&pc_marker_b, sizeof(pc_marker_b));
 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
 	pc_marker = (struct pv_chunk *)&pc_marker_b;
 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
 
 	/*
 	 * A delayed invalidation block should already be active if
 	 * pmap_advise() or pmap_remove() called this function by way
 	 * of pmap_demote_pde_locked().
 	 */
 	start_di = pmap_not_in_di();
 
 	mtx_lock(&pv_chunks_mutex);
 	active_reclaims++;
 	/* Bracket the chunks to scan; the chunk examined follows pc_marker. */
 	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
 	    SLIST_EMPTY(&free)) {
 		next_pmap = pc->pc_pmap;
 		if (next_pmap == NULL) {
 			/*
 			 * The next chunk is a marker.  However, it is
 			 * not our marker, so active_reclaims must be
 			 * > 1.  Consequently, the next_chunk code
 			 * will not rotate the pv_chunks list.
 			 */
 			goto next_chunk;
 		}
 		mtx_unlock(&pv_chunks_mutex);
 
 		/*
 		 * A pv_chunk can only be removed from the pc_lru list
 		 * when both pv_chunks_mutex is owned and the
 		 * corresponding pmap is locked.
 		 */
 		if (pmap != next_pmap) {
 			reclaim_pv_chunk_leave_pmap(pmap, locked_pmap,
 			    start_di);
 			pmap = next_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap) {
 				RELEASE_PV_LIST_LOCK(lockp);
 				PMAP_LOCK(pmap);
 				if (start_di)
 					pmap_delayed_invl_start();
 				mtx_lock(&pv_chunks_mutex);
 				/* Re-read the chunk after the marker. */
 				continue;
 			} else if (pmap != locked_pmap) {
 				if (PMAP_TRYLOCK(pmap)) {
 					if (start_di)
 						pmap_delayed_invl_start();
 					mtx_lock(&pv_chunks_mutex);
 					continue;
 				} else {
 					pmap = NULL; /* pmap is not locked */
 					mtx_lock(&pv_chunks_mutex);
 					pc = TAILQ_NEXT(pc_marker, pc_lru);
 					if (pc == NULL ||
 					    pc->pc_pmap != next_pmap)
 						continue;
 					/* Skip the chunk of the busy pmap. */
 					goto next_chunk;
 				}
 			} else if (start_di)
 				pmap_delayed_invl_start();
 			PG_G = pmap_global_bit(pmap);
 			PG_A = pmap_accessed_bit(pmap);
 			PG_M = pmap_modified_bit(pmap);
 			PG_RW = pmap_rw_bit(pmap);
 		}
 
 		/*
 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
 		 */
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/* "inuse" has a one bit for each allocated pv entry. */
 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 			    inuse != 0; inuse &= ~(1UL << bit)) {
 				bit = bsfq(inuse);
 				pv = &pc->pc_pventry[field * 64 + bit];
 				va = pv->pv_va;
 				pde = pmap_pde(pmap, va);
 				if ((*pde & PG_PS) != 0)
 					continue;
 				pte = pmap_pde_to_pte(pde, va);
 				if ((*pte & PG_W) != 0)
 					continue;
 				tpte = pte_load_clear(pte);
 				if ((tpte & PG_G) != 0)
 					pmap_invalidate_page(pmap, va);
 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 				/* Carry dirty and referenced state to the page. */
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 					vm_page_dirty(m);
 				if ((tpte & PG_A) != 0)
 					vm_page_aflag_set(m, PGA_REFERENCED);
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 				m->md.pv_gen++;
 				if (TAILQ_EMPTY(&m->md.pv_list) &&
 				    (m->flags & PG_FICTITIOUS) == 0) {
 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						vm_page_aflag_clear(m,
 						    PGA_WRITEABLE);
 					}
 				}
 				pmap_delayed_invl_page(m);
 				/* Mark the pv entry free in the chunk bitmap. */
 				pc->pc_map[field] |= 1UL << bit;
 				pmap_unuse_pt(pmap, va, *pde, &free);
 				freed++;
 			}
 		}
 		if (freed == 0) {
 			mtx_lock(&pv_chunks_mutex);
 			goto next_chunk;
 		}
 		/* Every freed mapping is for a 4 KB page. */
 		pmap_resident_count_dec(pmap, freed);
 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 		    pc->pc_map[2] == PC_FREE2) {
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 			/* Entire chunk is free; return it. */
 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 			dump_drop_page(m_pc->phys_addr);
 			mtx_lock(&pv_chunks_mutex);
 			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 			break;
 		}
 		/* The chunk now has free entries; put it at the list head. */
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		mtx_lock(&pv_chunks_mutex);
 		/* One freed pv entry in locked_pmap is sufficient. */
 		if (pmap == locked_pmap)
 			break;
 next_chunk:
 		/* Advance our marker past the chunk just examined. */
 		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
 		if (active_reclaims == 1 && pmap != NULL) {
 			/*
 			 * Rotate the pv chunks list so that we do not
 			 * scan the same pv chunks that could not be
 			 * freed (because they contained a wired
 			 * and/or superpage mapping) on every
 			 * invocation of reclaim_pv_chunk().
 			 */
 			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
 				MPASS(pc->pc_pmap != NULL);
 				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 			}
 		}
 	}
 	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
 	active_reclaims--;
 	mtx_unlock(&pv_chunks_mutex);
 	reclaim_pv_chunk_leave_pmap(pmap, locked_pmap, start_di);
 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 		m_pc = SLIST_FIRST(&free);
 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 		/* Recycle a freed page table page. */
 		m_pc->wire_count = 1;
 	}
 	/* Free whatever page table pages remain on the local list. */
 	vm_page_free_pages_toq(&free, true);
 	return (m_pc);
 }
 
 /*
  * Return the pv entry "pv" to its chunk.  If that leaves the whole chunk
  * unused, the chunk is unlinked from the pmap and freed; otherwise the
  * chunk is moved to the front of the pmap's chunk list.  The pmap lock
  * must be held.
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	struct pv_chunk *pc;
 	int slot;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 
 	/* Set the entry's bit in the chunk's free bitmap. */
 	pc = pv_to_chunk(pv);
 	slot = pv - &pc->pc_pventry[0];
 	pc->pc_map[slot / 64] |= 1ul << (slot % 64);
 
 	if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 	    pc->pc_map[2] == PC_FREE2) {
 		/* Nothing in the chunk is in use any more; release it. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		free_pv_chunk(pc);
 		return;
 	}
 
 	/* 98% of the time, pc is already at the head of the list. */
 	if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	}
 }
 
 /*
  * Release a pv chunk: take it off the global pv_chunks list, drop its
  * page from the dump set, and unwire and free the page.
  */
 static void
 free_pv_chunk(struct pv_chunk *pc)
 {
 	vm_page_t chunk_pg;
 
 	/* The chunk is addressed through the direct map. */
 	chunk_pg = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 
 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 
 	/* The whole chunk is unused; give its page back. */
 	dump_drop_page(chunk_pg->phys_addr);
 	vm_page_unwire_noq(chunk_pg);
 	vm_page_free(chunk_pg);
 }
 
 /*
  * Returns a new PV entry, allocating a new PV chunk from the system when
  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
  * returned.
  *
  * Only the first chunk on the pmap's chunk list is searched for a free
  * entry; a chunk that becomes full is moved to the tail of that list.
  * The pmap lock must be held.
  *
  * The given PV list lock may be released.
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 {
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		/* Find the first bitmap word with a free (set) bit. */
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = bsfq(pc->pc_map[field]);
 				break;
 			}
 		}
 		if (field < _NPCM) {
 			pv = &pc->pc_pventry[field * 64 + bit];
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 			    pc->pc_map[2] == 0) {
 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 				    pc_list);
 			}
 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 			return (pv);
 		}
 	}
 	/* No free items, allocate another chunk */
 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED);
 	if (m == NULL) {
 		if (lockp == NULL) {
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
 		m = reclaim_pv_chunk(pmap, lockp);
 		/*
 		 * No page was returned; search the pmap's first chunk
 		 * again, since entries may have been freed in this pmap.
 		 */
 		if (m == NULL)
 			goto retry;
 	}
 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 	dump_add_page(m->phys_addr);
 	/* Use the page, through the direct map, as the new chunk. */
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	pc->pc_map[2] = PC_FREE2;
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 	/* Entry 0 is the one handed out; its bit was cleared above. */
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 	return (pv);
 }
 
 /*
  * Returns the number of one bits within the given PV chunk map.
  *
  * "map" must point to at least three 64-bit words; map[0], map[1] and
  * map[2] are counted with the POPCNT instruction and the three counts
  * are summed.
  *
  * The errata for Intel processors state that "POPCNT Instruction May
  * Take Longer to Execute Than Expected".  It is believed that the
  * issue is the spurious dependency on the destination register.
  * Provide a hint to the register rename logic that the destination
  * value is overwritten, by clearing it, as suggested in the
  * optimization manual.  It should be cheap for unaffected processors
  * as well.
  *
  * Reference numbers for the errata are
  * 4th Gen Core: HSD146
  * 5th Gen Core: BDM85
  * 6th Gen Core: SKL029
  */
 static int
 popcnt_pc_map_pq(uint64_t *map)
 {
 	u_long result, tmp;
 
 	/*
 	 * Each popcntq is preceded by an xorl that clears its destination
 	 * register.  Both outputs are early-clobber ("=&r") because they
 	 * are written before all of the inputs have been consumed.
 	 */
 	__asm __volatile("xorl %k0,%k0;popcntq %2,%0;"
 	    "xorl %k1,%k1;popcntq %3,%1;addl %k1,%k0;"
 	    "xorl %k1,%k1;popcntq %4,%1;addl %k1,%k0"
 	    : "=&r" (result), "=&r" (tmp)
 	    : "m" (map[0]), "m" (map[1]), "m" (map[2]));
 	return (result);
 }
 
 /*
  * Ensure that the number of spare PV entries in the specified pmap meets or
  * exceeds the given count, "needed".
  *
  * Spare entries are counted by walking the pmap's chunk list until a
  * chunk without free entries is found or the count is satisfied; any
  * shortfall is made up by allocating, or reclaiming, whole chunks.  The
  * pmap lock must be held and "lockp" must not be NULL.
  *
  * The given PV list lock may be released.
  */
 static void
 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 {
 	struct pch new_tail;
 	struct pv_chunk *pc;
 	vm_page_t m;
 	int avail, free;
 	bool reclaimed;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 
 	/*
 	 * Newly allocated PV chunks must be stored in a private list until
 	 * the required number of PV chunks have been allocated.  Otherwise,
 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
 	 * contrast, these chunks must be added to the pmap upon allocation.
 	 */
 	TAILQ_INIT(&new_tail);
 retry:
 	avail = 0;
 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 		/*
 		 * Count the free entries in this chunk.  Unless the build
 		 * assumes POPCNT, fall back to bit_count() on CPUs that
 		 * lack the instruction; the "else" below pairs with the
 		 * popcnt call that follows the #endif.
 		 */
 #ifndef __POPCNT__
 		if ((cpu_feature2 & CPUID2_POPCNT) == 0)
 			bit_count((bitstr_t *)pc->pc_map, 0,
 			    sizeof(pc->pc_map) * NBBY, &free);
 		else
 #endif
 		free = popcnt_pc_map_pq(pc->pc_map);
 		if (free == 0)
 			break;
 		avail += free;
 		if (avail >= needed)
 			break;
 	}
 	/* Add one whole chunk (_NPCPV entries) per iteration. */
 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 		    VM_ALLOC_WIRED);
 		if (m == NULL) {
 			m = reclaim_pv_chunk(pmap, lockp);
 			if (m == NULL)
 				goto retry;
 			reclaimed = true;
 		}
 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 		dump_add_page(m->phys_addr);
 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 		pc->pc_pmap = pmap;
 		pc->pc_map[0] = PC_FREE0;
 		pc->pc_map[1] = PC_FREE1;
 		pc->pc_map[2] = PC_FREE2;
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
 
 		/*
 		 * The reclaim might have freed a chunk from the current pmap.
 		 * If that chunk contained available entries, we need to
 		 * re-count the number of available entries.
 		 */
 		if (reclaimed)
 			goto retry;
 	}
 	/* Publish the privately held chunks on the global pv_chunks list. */
 	if (!TAILQ_EMPTY(&new_tail)) {
 		mtx_lock(&pv_chunks_mutex);
 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 		mtx_unlock(&pv_chunks_mutex);
 	}
 }
 
 /*
  * Search the given pv list for the entry that belongs to "pmap" and maps
  * "va".  If it is found, unlink it, advance the list's generation count,
  * and return it; otherwise return NULL.  Works for the pv lists of both
  * 4KB and 2MB page mappings.
  */
 static __inline pv_entry_t
 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t entry;
 
 	TAILQ_FOREACH(entry, &pvh->pv_list, pv_next) {
 		if (pmap != PV_PMAP(entry) || va != entry->pv_va)
 			continue;
 		TAILQ_REMOVE(&pvh->pv_list, entry, pv_next);
 		pvh->pv_gen++;
 		return (entry);
 	}
 	return (NULL);
 }
 
 /*
  * After demotion from a 2MB page mapping to 512 4KB page mappings,
  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
  * entries for each of the 4KB page mappings.
  *
  * "pa" must be 2MB aligned and the pmap lock must be held.  The pv
  * entries are taken from spare entries already present in the pmap's
  * chunks; nothing is allocated here, and a missing spare entry trips an
  * assertion.  The PV list lock is switched to the one covering "pa".
  */
 static void
 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	struct pv_chunk *pc;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 	int bit, field;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 2mpage aligned"));
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the 2mpage's pv entry for this mapping to the first
 	 * page's pv list.  Once this transfer begins, the pv list lock
 	 * must not be released until the last pv entry is reinstantiated.
 	 */
 	pvh = pa_to_pvh(pa);
 	va = trunc_2mpage(va);
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 	m->md.pv_gen++;
 	/* Instantiate the remaining NPTEPG - 1 pv entries. */
 	PV_STAT(atomic_add_long(&pv_entry_allocs, NPTEPG - 1));
 	va_last = va + NBPDR - PAGE_SIZE;
 	for (;;) {
 		/* Always consume spare entries from the first chunk. */
 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
 		    pc->pc_map[2] != 0, ("pmap_pv_demote_pde: missing spare"));
 		for (field = 0; field < _NPCM; field++) {
 			while (pc->pc_map[field]) {
 				bit = bsfq(pc->pc_map[field]);
 				pc->pc_map[field] &= ~(1ul << bit);
 				pv = &pc->pc_pventry[field * 64 + bit];
 				/* Advance to the next 4KB page and its va. */
 				va += PAGE_SIZE;
 				pv->pv_va = va;
 				m++;
 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 			    ("pmap_pv_demote_pde: page %p is not managed", m));
 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 				m->md.pv_gen++;
 				if (va == va_last)
 					goto out;
 			}
 		}
 		/* This chunk is now full; move it to the tail. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 out:
 	/* If the last chunk used ended up full, move it to the tail too. */
 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 	PV_STAT(atomic_add_long(&pv_entry_count, NPTEPG - 1));
 	PV_STAT(atomic_subtract_int(&pv_entry_spare, NPTEPG - 1));
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * After 512 4KB page mappings have been promoted to one 2MB page mapping,
  * collapse their pv entries into a single pv entry for the 2MB mapping.
  * "pa" must be 2MB aligned.  The PV list lock is switched to the one
  * covering "pa".
  */
 static void
 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 2mpage aligned"));
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Move the first 4KB page's pv entry onto the 2mpage's pv list
 	 * rather than allocating a new one.  Besides saving a call to
 	 * get_pv_entry(), this rules out get_pv_entry() invoking
 	 * reclaim_pv_chunk() and thereby removing one of the mappings
 	 * that is being promoted.
 	 */
 	va = trunc_2mpage(va);
 	m = PHYS_TO_VM_PAGE(pa);
 	pv = pmap_pvh_remove(&m->md, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 	pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 	pvh->pv_gen++;
 
 	/* Release the pv entries of the other NPTEPG - 1 pages. */
 	va_last = va + NBPDR - PAGE_SIZE;
 	while (va < va_last) {
 		va += PAGE_SIZE;
 		m++;
 		pmap_pvh_free(&m->md, pmap, va);
 	}
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  * Unlink the pv entry for (pmap, va) from the given pv list and release it.
  * The list may be that of a 4KB page or of a 2MB page.  The entry is
  * expected to be present; its absence trips an assertion.
  */
 static void
 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t victim;
 
 	victim = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(victim != NULL, ("pmap_pvh_free: pv not found"));
 	free_pv_entry(pmap, victim);
 }
 
 /*
  * Try to record a 4KB page mapping of page "m" at "va" on the page's pv
  * list.  The pv entry is only allocated if that is possible without
  * reclaiming pv entries from elsewhere.  Returns TRUE if the entry was
  * inserted and FALSE if no entry could be obtained.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct rwlock **lockp)
 {
 	pv_entry_t entry;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* A NULL lock pointer tells get_pv_entry() not to reclaim. */
 	entry = get_pv_entry(pmap, NULL);
 	if (entry == NULL)
 		return (FALSE);
 
 	entry->pv_va = va;
 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, entry, pv_next);
 	m->md.pv_gen++;
 	return (TRUE);
 }
 
 /*
  * Record a 2MB page mapping at "va" on the pv list of the 2MB page named by
  * "pde".  Without PMAP_ENTER_NORECLAIM in "flags" this always succeeds and
  * returns true.  With PMAP_ENTER_NORECLAIM, false is returned when a pv
  * entry cannot be obtained without resorting to reclamation.
  */
 static bool
 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags,
     struct rwlock **lockp)
 {
 	struct md_page *pvh_2m;
 	struct rwlock **reclaim_lockp;
 	pv_entry_t entry;
 	vm_paddr_t super_pa;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* A NULL lock pointer tells get_pv_entry() not to reclaim. */
 	if ((flags & PMAP_ENTER_NORECLAIM) != 0)
 		reclaim_lockp = NULL;
 	else
 		reclaim_lockp = lockp;
 	entry = get_pv_entry(pmap, reclaim_lockp);
 	if (entry == NULL)
 		return (false);
 
 	entry->pv_va = va;
 	super_pa = pde & PG_PS_FRAME;
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, super_pa);
 	pvh_2m = pa_to_pvh(super_pa);
 	TAILQ_INSERT_TAIL(&pvh_2m->pv_list, entry, pv_next);
 	pvh_2m->pv_gen++;
 	return (true);
 }
 
 /*
  * Populate all NPTEPG entries of a page table page, starting at "firstpte",
  * so that they map consecutive physical pages: entry 0 receives "newpte" and
  * each following entry receives the previous value plus PAGE_SIZE.
  */
 static void
 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 {
 	u_long idx;
 
 	for (idx = 0; idx < NPTEPG; idx++, newpte += PAGE_SIZE)
 		firstpte[idx] = newpte;
 }
 
 /*
  * Attempt to demote the 2MB page mapping at "pde".  This is a wrapper around
  * pmap_demote_pde_locked() for callers that do not already hold a pv list
  * lock: any lock acquired during the demotion is dropped before returning.
  * On failure the 2MB page mapping has been invalidated.
  */
 static boolean_t
 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	struct rwlock *pvlock;
 	boolean_t demoted;
 
 	pvlock = NULL;
 	demoted = pmap_demote_pde_locked(pmap, pde, va, &pvlock);
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	return (demoted);
 }
 
 /*
  * Debugging aid for demotion: verify that the page table page starting at
  * "firstpte" maps the physical frames expected from "newpte".
  *
  * With INVARIANTS and DIAGNOSTIC, every one of the NPTEPG entries is
  * compared against the expected frame; on the first mismatch the whole page
  * table page is dumped and the kernel panics.  With INVARIANTS alone, only
  * the first entry is asserted.  Without INVARIANTS this is a no-op.
  */
 static void
 pmap_demote_pde_check(pt_entry_t *firstpte __unused, pt_entry_t newpte __unused)
 {
 #if defined(INVARIANTS) && defined(DIAGNOSTIC)
 	pt_entry_t *cur, *dump;
 
 	for (cur = firstpte; cur < firstpte + NPTEPG;
 	    cur++, newpte += PAGE_SIZE) {
 		if ((*cur & PG_FRAME) == (newpte & PG_FRAME))
 			continue;
 
 		/* Mismatch: report it, dump the page table page, and stop. */
 		printf("pmap_demote_pde: xpte %zd and newpte map "
 		    "different pages: found %#lx, expected %#lx\n",
 		    cur - firstpte, *cur, newpte);
 		printf("page table dump\n");
 		for (dump = firstpte; dump < firstpte + NPTEPG; dump++)
 			printf("%zd %#lx\n", dump - firstpte, *dump);
 		panic("firstpte");
 	}
 #elif defined(INVARIANTS)
 	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 	    ("pmap_demote_pde: firstpte and newpte map different physical"
 	    " addresses"));
 #endif
 }
 
 /*
  * Failure path of a demotion: destroy the 2MB page mapping that covers "va"
  * instead of demoting it.  "oldpde" is the PDE value observed by the caller
  * before the mapping was touched.
  */
 static void
 pmap_demote_pde_abort(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     pd_entry_t oldpde, struct rwlock **lockp)
 {
 	struct spglist freed;
 	vm_offset_t base;
 
 	base = trunc_2mpage(va);
 	SLIST_INIT(&freed);
 	pmap_remove_pde(pmap, pde, base, &freed, lockp);
 
 	/*
 	 * pmap_remove_pde() itself invalidates a global mapping, so only a
 	 * non-global mapping still needs its TLB entries flushed here.
 	 */
 	if ((oldpde & pmap_global_bit(pmap)) == 0)
 		pmap_invalidate_pde_page(pmap, base, oldpde);
 
 	/* Release whatever page table pages the removal queued. */
 	vm_page_free_pages_toq(&freed, true);
 	CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#lx in pmap %p",
 	    va, pmap);
 }
 
 /*
  * Demote the 2MB page mapping at "pde", which covers "va", into NPTEPG 4KB
  * page mappings held in a page table page.
  *
  * The pmap must be locked by the caller.  "lockp" is the caller's pv list
  * lock cursor; it is handed to reserve_pv_entries(), pmap_pv_demote_pde()
  * and, on failure, to pmap_demote_pde_abort(), any of which may change the
  * lock that it refers to.
  *
  * Returns TRUE if the mapping was demoted.  Returns FALSE if the mapping
  * was never accessed or if a page table page could not be allocated; in
  * both of those cases the 2MB page mapping has been destroyed by
  * pmap_demote_pde_abort().
  */
 static boolean_t
 pmap_demote_pde_locked(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
 	pt_entry_t PG_A, PG_G, PG_M, PG_PKU_MASK, PG_RW, PG_V;
 	vm_paddr_t mptepa;
 	vm_page_t mpte;
 	int PG_PTE_CACHE;
 	bool in_kernel;
 
 	/*
 	 * The PTE bit definitions depend on the pmap, so they are looked up
 	 * per call.  Some of these locals are presumably referenced only
 	 * through macros such as PG_PTE_PROMOTE -- verify against the macro
 	 * definitions before removing any of them.
 	 */
 	PG_A = pmap_accessed_bit(pmap);
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	in_kernel = va >= VM_MAXUSER_ADDRESS;
 	oldpde = *pde;
 	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
 
 	/*
 	 * Invalidate the 2MB page mapping and return "failure" if the
 	 * mapping was never accessed.
 	 */
 	if ((oldpde & PG_A) == 0) {
 		KASSERT((oldpde & PG_W) == 0,
 		    ("pmap_demote_pde: a wired mapping is missing PG_A"));
 		pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
 		return (FALSE);
 	}
 
 	/*
 	 * Look for a page table page that was saved for this 2MB region,
 	 * e.g. by pmap_promote_pde().  If there is none, allocate one.
 	 */
 	mpte = pmap_remove_pt_page(pmap, va);
 	if (mpte == NULL) {
 		KASSERT((oldpde & PG_W) == 0,
 		    ("pmap_demote_pde: page table page for a wired mapping"
 		    " is missing"));
 
 		/*
 		 * If the page table page is missing and the mapping
 		 * is for a kernel address, the mapping must belong to
 		 * the direct map.  Page table pages are preallocated
 		 * for every other part of the kernel address space,
 		 * so the direct map region is the only part of the
 		 * kernel address space that must be handled here.
 		 */
 		KASSERT(!in_kernel || (va >= DMAP_MIN_ADDRESS &&
 		    va < DMAP_MAX_ADDRESS),
 		    ("pmap_demote_pde: No saved mpte for va %#lx", va));
 
 		/*
 		 * If the 2MB page mapping belongs to the direct map
 		 * region of the kernel's address space, then the page
 		 * allocation request specifies the highest possible
 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
 		 * priority is normal.
 		 */
 		mpte = vm_page_alloc(NULL, pmap_pde_pindex(va),
 		    (in_kernel ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 
 		/*
 		 * If the allocation of the new page table page fails,
 		 * invalidate the 2MB page mapping and return "failure".
 		 */
 		if (mpte == NULL) {
 			pmap_demote_pde_abort(pmap, va, pde, oldpde, lockp);
 			return (FALSE);
 		}
 
 		/*
 		 * For a user address, the new page table page is about to
 		 * be filled with NPTEPG mappings, so its wire count is set
 		 * to NPTEPG and the page itself is added to the pmap's
 		 * resident count.
 		 */
 		if (!in_kernel) {
 			mpte->wire_count = NPTEPG;
 			pmap_resident_count_inc(pmap, 1);
 		}
 	}
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 
 	/*
 	 * The new PDE points at the page table page.  Of the old PDE's
 	 * bits, only PG_U is carried over; the remaining attributes live in
 	 * the individual PTEs.
 	 */
 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 	    ("pmap_demote_pde: oldpde is missing PG_M"));
 
 	/*
 	 * The template for the 4KB mappings is the old PDE without PG_PS
 	 * and with the PAT index moved to its position for a PTE.
 	 */
 	newpte = oldpde & ~PG_PS;
 	newpte = pmap_swap_pat(pmap, newpte);
 
 	/*
 	 * If the page table page is not leftover from an earlier promotion,
 	 * initialize it.
 	 */
 	if (mpte->valid == 0)
 		pmap_fill_ptp(firstpte, newpte);
 
 	pmap_demote_pde_check(firstpte, newpte);
 
 	/*
 	 * If the mapping has changed attributes, update the page table
 	 * entries.
 	 */
 	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 		pmap_fill_ptp(firstpte, newpte);
 
 	/*
 	 * The spare PV entries must be reserved prior to demoting the
 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
 	 * of the PDE and the PV lists will be inconsistent, which can result
 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
 	 * wrong PV list and pmap_pv_demote_pde() failing to find the expected
 	 * PV entry for the 2MB page mapping that is being demoted.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
 		reserve_pv_entries(pmap, NPTEPG - 1, lockp);
 
 	/*
 	 * Demote the mapping.  This pmap is locked.  The old PDE has
 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 	 * set.  Thus, there is no danger of a race with another
 	 * processor changing the setting of PG_A and/or PG_M between
 	 * the read above and the store below.
 	 */
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, newpde);
 	else
 		pde_store(pde, newpde);
 
 	/*
 	 * Invalidate a stale recursive mapping of the page table page.
 	 */
 	if (in_kernel)
 		pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 
 	/*
 	 * Demote the PV entry.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
 		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME, lockp);
 
 	atomic_add_long(&pmap_pde_demotions, 1);
 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#lx in pmap %p",
 	    va, pmap);
 	return (TRUE);
 }
 
 /*
  * pmap_remove_kernel_pde: finish removing a kernel superpage mapping by
  * pointing its PDE back at the page table page that was saved for the 2MB
  * region.  Only valid for the kernel pmap, which must be locked.  A missing
  * page table page is fatal.
  */
 static void
 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	vm_page_t ptpage;
 	vm_paddr_t ptpage_pa;
 	pd_entry_t ptpde;
 
 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	ptpage = pmap_remove_pt_page(pmap, va);
 	if (ptpage == NULL)
 		panic("pmap_remove_kernel_pde: Missing pt page.");
 	ptpage_pa = VM_PAGE_TO_PHYS(ptpage);
 
 	/*
 	 * A page table page that was set aside by a promotion still holds
 	 * valid mappings.  Zero it so that none of them come back to life
 	 * when the PDE is pointed at it.
 	 */
 	if (ptpage->valid != 0)
 		pagezero((void *)PHYS_TO_DMAP(ptpage_pa));
 
 	/* Replace the superpage PDE with one referencing the page table page. */
 	ptpde = ptpage_pa | X86_PG_M | X86_PG_A | X86_PG_RW | X86_PG_V;
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, ptpde);
 	else
 		pde_store(pde, ptpde);
 
 	/* Flush a stale recursive mapping of the page table page. */
 	pmap_invalidate_page(pmap, (vm_offset_t)vtopte(va));
 }
 
 /*
  * pmap_remove_pde: do the things to unmap a superpage in a process
  *
  * Clears the 2MB page mapping stored at "pdq", which maps the 2MB-aligned
  * address "sva", and adjusts the pmap's wired and resident counts.  For a
  * managed mapping, the 2MB pv entry is freed and the old PDE's modified and
  * accessed state is propagated to each of the constituent 4KB pages.
  *
  * For the kernel pmap, the PDE is then re-pointed at the saved page table
  * page by pmap_remove_kernel_pde().  For any other pmap, a saved page table
  * page, if there is one, is queued on "free".
  *
  * Only a global (PG_G) mapping is invalidated in the TLB here; for a
  * non-global mapping the invalidation is left to the caller.
  *
  * The pmap must be locked.  Returns the value of pmap_unuse_pt(), which is
  * called with the page directory pointer entry for "sva".
  */
 static int
 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pd_entry_t oldpde;
 	vm_offset_t eva, va;
 	vm_page_t m, mpte;
 	pt_entry_t PG_G, PG_A, PG_M, PG_RW;
 
 	PG_G = pmap_global_bit(pmap);
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & PDRMASK) == 0,
 	    ("pmap_remove_pde: sva is not 2mpage aligned"));
 	/* Atomically fetch and clear the PDE; oldpde is the prior value. */
 	oldpde = pte_load_clear(pdq);
 	if (oldpde & PG_W)
 		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 	if ((oldpde & PG_G) != 0)
 		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 	pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 	if (oldpde & PG_MANAGED) {
 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, oldpde & PG_PS_FRAME);
 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 		pmap_pvh_free(pvh, pmap, sva);
 		eva = sva + NBPDR;
 		/* Visit each 4KB page that the superpage mapping covered. */
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++) {
 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(m);
 			if (oldpde & PG_A)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			/*
 			 * With neither a 4KB nor a 2MB pv entry left, no
 			 * tracked mapping of the page remains, so it
 			 * cannot be writeable through any mapping.
 			 */
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 			pmap_delayed_invl_page(m);
 		}
 	}
 	if (pmap == kernel_pmap) {
 		pmap_remove_kernel_pde(pmap, pdq, sva);
 	} else {
 		/*
 		 * Dispose of the page table page that was saved for this
 		 * region, if any.
 		 */
 		mpte = pmap_remove_pt_page(pmap, sva);
 		if (mpte != NULL) {
 			KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
 			    ("pmap_remove_pde: pte page not promoted"));
 			pmap_resident_count_dec(pmap, 1);
 			KASSERT(mpte->wire_count == NPTEPG,
 			    ("pmap_remove_pde: pte page wire count error"));
 			mpte->wire_count = 0;
 			pmap_add_delayed_free_list(mpte, free, FALSE);
 		}
 	}
 	return (pmap_unuse_pt(pmap, sva, *pmap_pdpe(pmap, sva), free));
 }
 
 /*
  * pmap_remove_pte: tear down the 4KB page mapping stored at "ptq", which
  * maps "va".  The PTE is cleared, the pmap's wired and resident counts are
  * adjusted, and for a managed page the modified/accessed state is passed
  * on to the vm_page and the pv entry is freed.  TLB invalidation is not
  * performed here.  Returns the value of pmap_unuse_pt().
  */
 static int
 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp)
 {
 	pt_entry_t old, PG_A, PG_M, PG_RW;
 	vm_page_t page;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	old = pte_load_clear(ptq);
 	if ((old & PG_W) != 0)
 		pmap->pm_stats.wired_count--;
 	pmap_resident_count_dec(pmap, 1);
 
 	/* An unmanaged mapping has no vm_page state or pv entry to update. */
 	if ((old & PG_MANAGED) == 0)
 		return (pmap_unuse_pt(pmap, va, ptepde, free));
 
 	page = PHYS_TO_VM_PAGE(old & PG_FRAME);
 	if ((old & (PG_M | PG_RW)) == (PG_M | PG_RW))
 		vm_page_dirty(page);
 	if ((old & PG_A) != 0)
 		vm_page_aflag_set(page, PGA_REFERENCED);
 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, page);
 	pmap_pvh_free(&page->md, pmap, va);
 
 	/*
 	 * Once the page has no 4KB pv entries left and, for a non-fictitious
 	 * page, no 2MB pv entries either, it can no longer be writeable.
 	 */
 	if (TAILQ_EMPTY(&page->md.pv_list) &&
 	    (page->flags & PG_FICTITIOUS) == 0 &&
 	    TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(page))->pv_list))
 		vm_page_aflag_clear(page, PGA_WRITEABLE);
 	pmap_delayed_invl_page(page);
 	return (pmap_unuse_pt(pmap, va, ptepde, free));
 }
 
 /*
  * Remove the single 4KB page mapping at "va", if one exists, and flush its
  * TLB entry.  "pde" is the page directory entry covering "va".  Nothing is
  * done when either the PDE or the PTE is not valid.
  */
 static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     struct spglist *free)
 {
 	struct rwlock *pvlock;
 	pt_entry_t PG_V, *ptep;
 
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if ((*pde & PG_V) != 0) {
 		ptep = pmap_pde_to_pte(pde, va);
 		if ((*ptep & PG_V) != 0) {
 			pvlock = NULL;
 			(void)pmap_remove_pte(pmap, ptep, va, *pde, free,
 			    &pvlock);
 			/* Drop the pv list lock that the removal took. */
 			if (pvlock != NULL)
 				rw_wunlock(pvlock);
 			pmap_invalidate_page(pmap, va);
 		}
 	}
 }
 
 /*
  * Removes the specified range of addresses from the page table page.
  *
  * Walks the PTEs for [sva, eva) under "pde" and removes every non-zero one
  * with pmap_remove_pte().  The walk stops early if pmap_remove_pte()
  * returns non-zero (presumably because the page table page itself was
  * released -- confirm against pmap_unuse_pt()).
  *
  * TLB handling differs by mapping type:
  *  - Global (PG_G) mappings are invalidated here, batched into
  *    pmap_invalidate_range() calls.  "va" holds the start of the pending
  *    range; va == eva means that no range is pending.
  *  - Non-global mappings are not invalidated here.  Instead, true is
  *    returned if at least one was encountered, so that the caller can
  *    perform the invalidation.
  */
 static bool
 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     pd_entry_t *pde, struct spglist *free, struct rwlock **lockp)
 {
 	pt_entry_t PG_G, *pte;
 	vm_offset_t va;
 	bool anyvalid;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PG_G = pmap_global_bit(pmap);
 	anyvalid = false;
 	va = eva;
 	for (pte = pmap_pde_to_pte(pde, sva); sva != eva; pte++,
 	    sva += PAGE_SIZE) {
 		if (*pte == 0) {
 			/* An empty PTE ends any pending range; flush it. */
 			if (va != eva) {
 				pmap_invalidate_range(pmap, va, sva);
 				va = eva;
 			}
 			continue;
 		}
 		if ((*pte & PG_G) == 0)
 			anyvalid = true;
 		else if (va == eva)
 			va = sva;	/* Start a new pending range. */
 		if (pmap_remove_pte(pmap, pte, sva, *pde, free, lockp)) {
 			/*
 			 * Advance past the PTE just removed so that the
 			 * final flush below includes it.
 			 */
 			sva += PAGE_SIZE;
 			break;
 		}
 	}
 	/* Flush the range that was still pending when the walk ended. */
 	if (va != eva)
 		pmap_invalidate_range(pmap, va, sva);
 	return (anyvalid);
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	The range [sva, eva) is walked one page directory entry at a
  *	time.  A 2MB page mapping that lies entirely within the range is
  *	removed as a unit; one that is only partially covered is first
  *	demoted to 4KB page mappings.  Page table pages released along
  *	the way are collected on a local list and freed only after the
  *	pmap lock has been dropped.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct rwlock *lock;
 	vm_offset_t va_next;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t ptpaddr, *pde;
 	pt_entry_t PG_G, PG_V;
 	struct spglist free;
 	int anyvalid;
 
 	PG_G = pmap_global_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	/*
 	 * anyvalid records whether a non-global mapping was removed, in
 	 * which case the whole pmap is invalidated at "out".
 	 */
 	anyvalid = 0;
 	SLIST_INIT(&free);
 
 	pmap_delayed_invl_start();
 	PMAP_LOCK(pmap);
 	pmap_pkru_on_remove(pmap, sva, eva);
 
 	/*
 	 * special handling of removing one page.  a very
 	 * common operation and easy to short circuit some
 	 * code.
 	 */
 	if (sva + PAGE_SIZE == eva) {
 		pde = pmap_pde(pmap, sva);
 		if (pde && (*pde & PG_PS) == 0) {
 			/*
 			 * pmap_remove_page() flushes the TLB entry itself,
 			 * so anyvalid is left at zero.
 			 */
 			pmap_remove_page(pmap, sva, pde, &free);
 			goto out;
 		}
 	}
 
 	lock = NULL;
 	for (; sva < eva; sva = va_next) {
 
 		/* Stop early once the pmap has no resident pages left. */
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		/*
 		 * At each level, skip to the next entry boundary when the
 		 * entry is invalid.  A va_next that is less than sva means
 		 * that the address wrapped around, so the walk is ended.
 		 */
 		pml4e = pmap_pml4e(pmap, sva);
 		if ((*pml4e & PG_V) == 0) {
 			va_next = (sva + NBPML4) & ~PML4MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 		if ((*pdpe & PG_V) == 0) {
 			va_next = (sva + NBPDP) & ~PDPMASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		va_next = (sva + NBPDR) & ~PDRMASK;
 		if (va_next < sva)
 			va_next = eva;
 
 		pde = pmap_pdpe_to_pde(pdpe, sva);
 		ptpaddr = *pde;
 
 		/*
 		 * Weed out invalid mappings.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			/*
 			 * Are we removing the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == va_next && eva >= va_next) {
 				/*
 				 * The TLB entry for a PG_G mapping is
 				 * invalidated by pmap_remove_pde().
 				 */
 				if ((ptpaddr & PG_G) == 0)
 					anyvalid = 1;
 				pmap_remove_pde(pmap, pde, sva, &free, &lock);
 				continue;
 			} else if (!pmap_demote_pde_locked(pmap, pde, sva,
 			    &lock)) {
 				/* The large page mapping was destroyed. */
 				continue;
 			} else
 				ptpaddr = *pde;
 		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (va_next > eva)
 			va_next = eva;
 
 		if (pmap_remove_ptes(pmap, sva, va_next, pde, &free, &lock))
 			anyvalid = 1;
 	}
 	/* Release the pv list lock, if one is still held. */
 	if (lock != NULL)
 		rw_wunlock(lock);
 out:
 	if (anyvalid)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 	pmap_delayed_invl_finish();
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The work is done in two passes under the page's pv list
  *		lock.  First, every 2MB page mapping that covers the page
  *		is demoted (or destroyed, if the demotion fails).  Second,
  *		every 4KB page mapping of the page is removed.
  *
  *		A pmap lock is only try-locked while the pv list lock is
  *		held.  If that fails, the pv list lock is dropped, the
  *		pmap lock is acquired, and the pv list lock is retaken.
  *		Generation counts reveal whether a pv list changed in
  *		the meantime, in which case everything restarts.
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pt_entry_t *pte, tpte, PG_A, PG_M, PG_RW;
 	pd_entry_t *pde;
 	vm_offset_t va;
 	struct spglist free;
 	int pvh_gen, md_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* A fictitious page uses pv_dummy in place of a 2MB pv list head. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 retry:
 	rw_wlock(lock);
 	/* Pass 1: demote every 2MB page mapping that includes the page. */
 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			/* The 2MB pv list changed while unlocked; restart. */
 			if (pvh_gen != pvh->pv_gen) {
 				rw_wunlock(lock);
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		va = pv->pv_va;
 		pde = pmap_pde(pmap, va);
 		/*
 		 * The result is deliberately ignored: whether the demotion
 		 * succeeds or the mapping is destroyed, the pv entry leaves
 		 * this list.
 		 */
 		(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: remove every 4KB page mapping of the page. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			/* Either pv list changed while unlocked; restart. */
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				rw_wunlock(lock);
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		/* The PTE bit definitions depend on the owning pmap. */
 		PG_A = pmap_accessed_bit(pmap);
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		pmap_resident_count_dec(pmap, 1);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 		    " a 2mpage in page %p's pv list", m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 		tpte = pte_load_clear(pte);
 		if (tpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (tpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		pmap_unuse_pt(pmap, pv->pv_va, *pde, &free);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	/* No mappings remain, so the page cannot be writeable anywhere. */
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(lock);
 	pmap_delayed_invl_wait(m);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * pmap_protect_pde: apply the protection "prot" to the 2MB page mapping
  * stored at "pde", which maps the 2MB-aligned address "sva".
  *
  * Write permission and/or execute permission are revoked as requested.
  * Before a managed, modified, writeable mapping loses PG_RW and PG_M, all
  * of its constituent 4KB pages are marked dirty.  The PDE is updated with
  * a compare-and-set, and the whole computation is redone if the PDE
  * changed underneath.
  *
  * Returns TRUE if a non-global mapping was changed, in which case the
  * caller is responsible for the TLB invalidation.  A changed global
  * mapping is invalidated here, and FALSE is returned for it, as it is when
  * nothing had to be changed.
  */
 static boolean_t
 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 {
 	pd_entry_t newpde, oldpde;
 	vm_page_t first;
 	pt_entry_t PG_G, PG_M, PG_RW;
 	int i;
 
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & PDRMASK) == 0,
 	    ("pmap_protect_pde: sva is not 2mpage aligned"));
 
 	for (;;) {
 		oldpde = *pde;
 		newpde = oldpde;
 		if ((prot & VM_PROT_WRITE) == 0) {
 			if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
 			    (PG_MANAGED | PG_M | PG_RW)) {
 				first = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 				for (i = 0; i < NBPDR / PAGE_SIZE; i++)
 					vm_page_dirty(&first[i]);
 			}
 			newpde &= ~(PG_RW | PG_M);
 		}
 		if ((prot & VM_PROT_EXECUTE) == 0)
 			newpde |= pg_nx;
 
 		/* The mapping already has the requested protection. */
 		if (newpde == oldpde)
 			return (FALSE);
 
 		/*
 		 * As an optimization to future operations on this PDE, clear
 		 * PG_PROMOTED.  The impending invalidation will remove any
 		 * lingering 4KB page mappings from the TLB.
 		 */
 		if (atomic_cmpset_long(pde, oldpde, newpde & ~PG_PROMOTED))
 			break;
 	}
 
 	if ((oldpde & PG_G) != 0) {
 		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 		return (FALSE);
 	}
 	return (TRUE);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	VM_PROT_NONE is handled by removing the range.  A request that
  *	keeps both write and execute permission changes nothing, so it
  *	returns immediately.  Otherwise, write and/or execute permission
  *	is revoked from every valid mapping in [sva, eva).  A 2MB page
  *	mapping that is only partially covered by the range is demoted
  *	first.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	vm_offset_t va_next;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t ptpaddr, *pde;
 	pt_entry_t *pte, PG_G, PG_M, PG_RW, PG_V;
 	boolean_t anychanged;
 
 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 	if (prot == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 	if ((prot & (VM_PROT_WRITE|VM_PROT_EXECUTE)) ==
 	    (VM_PROT_WRITE|VM_PROT_EXECUTE))
 		return;
 
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	/*
 	 * anychanged records whether a non-global mapping was changed, in
 	 * which case the whole pmap is invalidated before unlocking.
 	 */
 	anychanged = FALSE;
 
 	/*
 	 * Although this function delays and batches the invalidation
 	 * of stale TLB entries, it does not need to call
 	 * pmap_delayed_invl_start() and
 	 * pmap_delayed_invl_finish(), because it does not
 	 * ordinarily destroy mappings.  Stale TLB entries from
 	 * protection-only changes need only be invalidated before the
 	 * pmap lock is released, because protection-only changes do
 	 * not destroy PV entries.  Even operations that iterate over
 	 * a physical page's PV list of mappings, like
 	 * pmap_remove_write(), acquire the pmap lock for each
 	 * mapping.  Consequently, for protection-only changes, the
 	 * pmap lock suffices to synchronize both page table and TLB
 	 * updates.
 	 *
 	 * This function only destroys a mapping if pmap_demote_pde()
 	 * fails.  In that case, stale TLB entries are immediately
 	 * invalidated.
 	 */
 	
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 
 		/*
 		 * At each level, skip to the next entry boundary when the
 		 * entry is invalid.  A va_next that is less than sva means
 		 * that the address wrapped around, so the walk is ended.
 		 */
 		pml4e = pmap_pml4e(pmap, sva);
 		if ((*pml4e & PG_V) == 0) {
 			va_next = (sva + NBPML4) & ~PML4MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 		if ((*pdpe & PG_V) == 0) {
 			va_next = (sva + NBPDP) & ~PDPMASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + NBPDR) & ~PDRMASK;
 		if (va_next < sva)
 			va_next = eva;
 
 		pde = pmap_pdpe_to_pde(pdpe, sva);
 		ptpaddr = *pde;
 
 		/*
 		 * Weed out invalid mappings.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			/*
 			 * Are we protecting the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == va_next && eva >= va_next) {
 				/*
 				 * The TLB entry for a PG_G mapping is
 				 * invalidated by pmap_protect_pde().
 				 */
 				if (pmap_protect_pde(pmap, pde, sva, prot))
 					anychanged = TRUE;
 				continue;
 			} else if (!pmap_demote_pde(pmap, pde, sva)) {
 				/*
 				 * The large page mapping was destroyed.
 				 */
 				continue;
 			}
 		}
 
 		/* Do not scan past the end of the requested range. */
 		if (va_next > eva)
 			va_next = eva;
 
 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
 			pt_entry_t obits, pbits;
 			vm_page_t m;
 
 retry:
 			/* obits is the PTE as read; pbits is its replacement. */
 			obits = pbits = *pte;
 			if ((pbits & PG_V) == 0)
 				continue;
 
 			if ((prot & VM_PROT_WRITE) == 0) {
 				/*
 				 * Mark the page dirty before PG_M is
 				 * cleared from the mapping.
 				 */
 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 				    (PG_MANAGED | PG_M | PG_RW)) {
 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				pbits &= ~(PG_RW | PG_M);
 			}
 			if ((prot & VM_PROT_EXECUTE) == 0)
 				pbits |= pg_nx;
 
 			if (pbits != obits) {
 				/*
 				 * The PTE changed since it was read, so
 				 * recompute from its current value.
 				 */
 				if (!atomic_cmpset_long(pte, obits, pbits))
 					goto retry;
 				/*
 				 * A global mapping is invalidated right
 				 * away; others are covered by the
 				 * pmap_invalidate_all() below.
 				 */
 				if (obits & PG_G)
 					pmap_invalidate_page(pmap, sva);
 				else
 					anychanged = TRUE;
 			}
 		}
 	}
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * Tries to promote the 512, contiguous 4KB page mappings that are within a
  * single page table page (PTP) to a single 2MB page mapping.  For promotion
  * to occur, two conditions must be met: (1) the 4KB page mappings must map
  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
  * identical characteristics.
  *
  * "pde" is the page directory entry that currently references the PTP, and
  * "va" is an address within the 2MB region.  The pmap must be locked.
  * "lockp" is the caller's pv list lock cursor, which is passed on to
  * pmap_pv_promote_pde().  Nothing is returned; every failed attempt bumps
  * pmap_pde_p_failures and a success bumps pmap_pde_promotions.
  */
 static void
 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pd_entry_t newpde;
 	pt_entry_t *firstpte, oldpte, pa, *pte;
 	pt_entry_t PG_G, PG_A, PG_M, PG_RW, PG_V, PG_PKU_MASK;
 	vm_page_t mpte;
 	int PG_PTE_CACHE;
 
 	/*
 	 * The PTE bit definitions depend on the pmap, so they are looked up
 	 * per call.  Some of these locals are presumably referenced only
 	 * through macros such as PG_PTE_PROMOTE -- verify against the macro
 	 * definitions before removing any of them.
 	 */
 	PG_A = pmap_accessed_bit(pmap);
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	PG_PKU_MASK = pmap_pku_mask_bit(pmap);
 	PG_PTE_CACHE = pmap_cache_mask(pmap, 0);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
 	 * either invalid, unused, or does not map the first 4KB physical page
 	 * within a 2MB page.
 	 */
 	firstpte = (pt_entry_t *)PHYS_TO_DMAP(*pde & PG_FRAME);
 setpde:
 	newpde = *firstpte;
 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 		atomic_add_long(&pmap_pde_p_failures, 1);
 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return;
 	}
 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 		/*
 		 * When PG_M is already clear, PG_RW can be cleared without
 		 * a TLB invalidation.
 		 */
 		if (!atomic_cmpset_long(firstpte, newpde, newpde & ~PG_RW))
 			goto setpde;
 		newpde &= ~PG_RW;
 	}
 
 	/*
 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
 	 * PTE maps an unexpected 4KB physical page or does not have identical
 	 * characteristics to the first PTE.
 	 *
 	 * The scan runs from the last PTE down to the second.  "pa" is the
 	 * expected value of a PTE's frame, PG_A and PG_V bits; it starts at
 	 * the last 4KB page of the 2MB region and steps down by PAGE_SIZE.
 	 */
 	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 setpte:
 		oldpte = *pte;
 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 			atomic_add_long(&pmap_pde_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return;
 		}
 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 			/*
 			 * When PG_M is already clear, PG_RW can be cleared
 			 * without a TLB invalidation.
 			 */
 			if (!atomic_cmpset_long(pte, oldpte, oldpte & ~PG_RW))
 				goto setpte;
 			oldpte &= ~PG_RW;
 			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#lx"
 			    " in pmap %p", (oldpte & PG_FRAME & PDRMASK) |
 			    (va & ~PDRMASK), pmap);
 		}
 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 			atomic_add_long(&pmap_pde_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return;
 		}
 		pa -= PAGE_SIZE;
 	}
 
 	/*
 	 * Save the page table page in its current state until the PDE
 	 * mapping the superpage is demoted by pmap_demote_pde() or
 	 * destroyed by pmap_remove_pde().
 	 */
 	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 	KASSERT(mpte >= vm_page_array &&
 	    mpte < &vm_page_array[vm_page_array_size],
 	    ("pmap_promote_pde: page table page is out of range"));
 	KASSERT(mpte->pindex == pmap_pde_pindex(va),
 	    ("pmap_promote_pde: page table page's pindex is wrong"));
 	/* A non-zero return from pmap_insert_pt_page() aborts the promotion. */
 	if (pmap_insert_pt_page(pmap, mpte, true)) {
 		atomic_add_long(&pmap_pde_p_failures, 1);
 		CTR2(KTR_PMAP,
 		    "pmap_promote_pde: failure for va %#lx in pmap %p", va,
 		    pmap);
 		return;
 	}
 
 	/*
 	 * Promote the pv entries.
 	 */
 	if ((newpde & PG_MANAGED) != 0)
 		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME, lockp);
 
 	/*
 	 * Propagate the PAT index to its proper position.
 	 */
 	newpde = pmap_swap_pat(pmap, newpde);
 
 	/*
 	 * Map the superpage.  Note that PG_PROMOTED is only set on the
 	 * pde_store() path; the erratum 383 workaround path installs the
 	 * PDE without it.
 	 */
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 	else
 		pde_store(pde, PG_PROMOTED | PG_PS | newpde);
 
 	atomic_add_long(&pmap_pde_promotions, 1);
 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	When destroying both a page table and PV entry, this function
  *	performs the TLB invalidation before releasing the PV list
  *	lock, so we do not need pmap_delayed_invl_page() calls here.
  */
 int
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     u_int flags, int8_t psind)
 {
 	struct rwlock *lock;
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_G, PG_A, PG_M, PG_RW, PG_V;
 	pt_entry_t newpte, origpte;
 	pv_entry_t pv;
 	vm_paddr_t opa, pa;
 	vm_page_t mpte, om;
 	int rv;
 	boolean_t nosleep;
 
 	/*
 	 * Look up this pmap's encodings of the accessed, global, modified,
 	 * valid and writeable PTE bits; they are obtained per pmap rather
 	 * than assumed to be fixed constants.
 	 */
 	PG_A = pmap_accessed_bit(pmap);
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	va = trunc_page(va);
 	KASSERT(va <= VM_MAX_KERNEL_ADDRESS, ("pmap_enter: toobig"));
 	KASSERT(va < UPT_MIN_ADDRESS || va >= UPT_MAX_ADDRESS,
 	    ("pmap_enter: invalid to pmap_enter page table pages (va: 0x%lx)",
 	    va));
 	KASSERT((m->oflags & VPO_UNMANAGED) != 0 || va < kmi.clean_sva ||
 	    va >= kmi.clean_eva,
 	    ("pmap_enter: managed mapping within the clean submap"));
 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 		VM_OBJECT_ASSERT_LOCKED(m->object);
 	KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
 	    ("pmap_enter: flags %u has reserved bits set", flags));
 
 	/*
 	 * Compose the new PTE from the page's physical address and the
 	 * requested protection and flags.  This is done before the pmap
 	 * lock is taken.  "flags" carries the access type: a write access
 	 * pre-sets the modified bit, which is only legal if "prot" also
 	 * permits writing (asserted below).
 	 */
 	pa = VM_PAGE_TO_PHYS(m);
 	newpte = (pt_entry_t)(pa | PG_A | PG_V);
 	if ((flags & VM_PROT_WRITE) != 0)
 		newpte |= PG_M;
 	if ((prot & VM_PROT_WRITE) != 0)
 		newpte |= PG_RW;
 	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
 	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 	if ((flags & PMAP_ENTER_WIRED) != 0)
 		newpte |= PG_W;
 	if (va < VM_MAXUSER_ADDRESS)
 		newpte |= PG_U;
 	if (pmap == kernel_pmap)
 		newpte |= PG_G;
 	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
 
 	/*
 	 * Set modified bit gratuitously for writeable mappings if
 	 * the page is unmanaged. We do not want to take a fault
 	 * to do the dirty bit accounting for these mappings.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) != 0) {
 		if ((newpte & PG_RW) != 0)
 			newpte |= PG_M;
 	} else
 		newpte |= PG_MANAGED;
 
 	/*
 	 * "lock" tracks the PV list lock, if any, that gets acquired along
 	 * the way; whatever it points to is released at "out".
 	 */
 	lock = NULL;
 	PMAP_LOCK(pmap);
 	if (psind == 1) {
 		/* Assert the required virtual and physical alignment. */
 		KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 		/* A 2MB request is handled entirely by pmap_enter_pde(). */
 		rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m, &lock);
 		goto out;
 	}
 	mpte = NULL;
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 retry:
 	pde = pmap_pde(pmap, va);
 	/*
 	 * Proceed when the PDE is valid and either already refers to a page
 	 * table page or is a 2MB mapping that can be demoted to one.
 	 */
 	if (pde != NULL && (*pde & PG_V) != 0 && ((*pde & PG_PS) == 0 ||
 	    pmap_demote_pde_locked(pmap, pde, va, &lock))) {
 		pte = pmap_pde_to_pte(pde, va);
 		/*
 		 * For a user address, take a reference on the page table
 		 * page on behalf of the mapping being created, unless one
 		 * was already obtained through _pmap_allocpte() below.  The
 		 * reference is dropped again if "va" turns out to be mapped
 		 * already.
 		 */
 		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
 			mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 			mpte->wire_count++;
 		}
 	} else if (va < VM_MAXUSER_ADDRESS) {
 		/*
 		 * Here if the pte page isn't mapped, or if it has been
 		 * deallocated.
 		 */
 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 		/*
 		 * The PV list lock pointer is only handed over when the
 		 * allocation is allowed to sleep.
 		 */
 		mpte = _pmap_allocpte(pmap, pmap_pde_pindex(va),
 		    nosleep ? NULL : &lock);
 		if (mpte == NULL && nosleep) {
 			rv = KERN_RESOURCE_SHORTAGE;
 			goto out;
 		}
 		/* Re-evaluate the PDE from scratch after the allocation. */
 		goto retry;
 	} else
 		panic("pmap_enter: invalid page directory va=%#lx", va);
 
 	origpte = *pte;
 	pv = NULL;
 	/* User mappings in a PT_X86 pmap carry the key assigned to "va". */
 	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86)
 		newpte |= pmap_pkru_get(pmap, va);
 
 	/*
 	 * Is the specified virtual address already mapped?
 	 */
 	if ((origpte & PG_V) != 0) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
 			pmap->pm_stats.wired_count++;
 		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove the extra PT page reference.
 		 */
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			KASSERT(mpte->wire_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%lx", va));
 		}
 
 		/*
 		 * Has the physical page changed?
 		 */
 		opa = origpte & PG_FRAME;
 		if (opa == pa) {
 			/*
 			 * No, might be a protection or wiring change.
 			 */
 			if ((origpte & PG_MANAGED) != 0 &&
 			    (newpte & PG_RW) != 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 			/*
 			 * If the old and new PTEs differ at most in the
 			 * modified and accessed bits, leave the PTE alone.
 			 */
 			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
 				goto unchanged;
 			goto validate;
 		}
 
 		/*
 		 * The physical page has changed.  Temporarily invalidate
 		 * the mapping.  This ensures that all threads sharing the
 		 * pmap keep a consistent view of the mapping, which is
 		 * necessary for the correct handling of COW faults.  It
 		 * also permits reuse of the old mapping's PV entry,
 		 * avoiding an allocation.
 		 *
 		 * For consistency, handle unmanaged mappings the same way.
 		 */
 		origpte = pte_load_clear(pte);
 		KASSERT((origpte & PG_FRAME) == opa,
 		    ("pmap_enter: unexpected pa update for %#lx", va));
 		if ((origpte & PG_MANAGED) != 0) {
 			om = PHYS_TO_VM_PAGE(opa);
 
 			/*
 			 * The pmap lock is sufficient to synchronize with
 			 * concurrent calls to pmap_page_test_mappings() and
 			 * pmap_ts_referenced().
 			 */
 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(om);
 			if ((origpte & PG_A) != 0)
 				vm_page_aflag_set(om, PGA_REFERENCED);
 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 			pv = pmap_pvh_remove(&om->md, pmap, va);
 			KASSERT(pv != NULL,
 			    ("pmap_enter: no PV entry for %#lx", va));
 			/*
 			 * The old PV entry is kept in "pv" for reuse below
 			 * when the new mapping is managed; otherwise it is
 			 * freed here.
 			 */
 			if ((newpte & PG_MANAGED) == 0)
 				free_pv_entry(pmap, pv);
 			/*
 			 * Clear PGA_WRITEABLE on the old page once it has no
 			 * 4KB PV entries left and, unless it is fictitious,
 			 * no 2MB PV entries either.
 			 */
 			if ((om->aflags & PGA_WRITEABLE) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list) &&
 			    ((om->flags & PG_FICTITIOUS) != 0 ||
 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
 		}
 		if ((origpte & PG_A) != 0)
 			pmap_invalidate_page(pmap, va);
 		/*
 		 * The old mapping is gone; with PG_V clear in "origpte" the
 		 * "Update the PTE" step below takes the plain pte_store()
 		 * path.
 		 */
 		origpte = 0;
 	} else {
 		/*
 		 * Increment the counters.
 		 */
 		if ((newpte & PG_W) != 0)
 			pmap->pm_stats.wired_count++;
 		pmap_resident_count_inc(pmap, 1);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((newpte & PG_MANAGED) != 0) {
 		/* Allocate a PV entry unless one was salvaged above. */
 		if (pv == NULL) {
 			pv = get_pv_entry(pmap, &lock);
 			pv->pv_va = va;
 		}
 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		if ((newpte & PG_RW) != 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
 	/*
 	 * Update the PTE.
 	 */
 	if ((origpte & PG_V) != 0) {
 		/*
 		 * "validate" is also entered directly from above when the
 		 * same physical page is being remapped with different
 		 * attributes.
 		 */
 validate:
 		origpte = pte_load_store(pte, newpte);
 		KASSERT((origpte & PG_FRAME) == pa,
 		    ("pmap_enter: unexpected pa update for %#lx", va));
 		if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
 		    (PG_M | PG_RW)) {
 			if ((origpte & PG_MANAGED) != 0)
 				vm_page_dirty(m);
 
 			/*
 			 * Although the PTE may still have PG_RW set, TLB
 			 * invalidation may nonetheless be required because
 			 * the PTE no longer has PG_M set.
 			 */
 		} else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
 			/*
 			 * This PTE change does not require TLB invalidation.
 			 */
 			goto unchanged;
 		}
 		if ((origpte & PG_A) != 0)
 			pmap_invalidate_page(pmap, va);
 	} else
 		pte_store(pte, newpte);
 
 unchanged:
 
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * If both the page table page and the reservation are fully
 	 * populated, then attempt promotion.
 	 */
 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 	    pmap_ps_enabled(pmap) &&
 	    (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0)
 		pmap_promote_pde(pmap, pde, va, &lock);
 #endif
 
 	rv = KERN_SUCCESS;
 	/*
 	 * Common exit: release the PV list lock, if one is held, and then
 	 * the pmap lock.  "rv" holds the result.
 	 */
 out:
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  * Attempts to install a 2MB page mapping that never grants write access
  * (read and/or execute only).  The request is passed to pmap_enter_pde()
  * with PMAP_ENTER_NOSLEEP, PMAP_ENTER_NOREPLACE and PMAP_ENTER_NORECLAIM,
  * so the attempt fails rather than (1) sleeping for a page table page,
  * (2) replacing a mapping that already exists at "va", or (3) reclaiming
  * another PV entry.  Returns true only if pmap_enter_pde() reports
  * KERN_SUCCESS.  The pmap lock must be held by the caller.
  */
 static bool
 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     struct rwlock **lockp)
 {
 	pd_entry_t pde_bits;
 	pt_entry_t PG_V;
 	int result;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PG_V = pmap_valid_bit(pmap);
 
 	/* Start from the physical frame and fold in one attribute at a time. */
 	pde_bits = VM_PAGE_TO_PHYS(m);
 	pde_bits |= pmap_cache_bits(pmap, m->md.pat_mode, 1);
 	pde_bits |= PG_PS | PG_V;
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		pde_bits |= PG_MANAGED;
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		pde_bits |= pg_nx;
 	if (va < VM_MAXUSER_ADDRESS)
 		pde_bits |= PG_U;
 
 	/* No page pointer is passed: the mapping is never writeable. */
 	result = pmap_enter_pde(pmap, va, pde_bits, PMAP_ENTER_NOSLEEP |
 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp);
 	return (result == KERN_SUCCESS);
 }
 
 /*
  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
  * a mapping already exists at the specified virtual address.  Returns
  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
  *
  * The parameter "m" is only used when creating a managed, writeable mapping.
  */
 static int
 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
     vm_page_t m, struct rwlock **lockp)
 {
 	struct spglist free;
 	pd_entry_t oldpde, *pde;
 	pt_entry_t PG_G, PG_RW, PG_V;
 	vm_page_t mt, pdpg;
 
 	KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0,
 	    ("pmap_enter_pde: cannot create wired user mapping"));
 	PG_G = pmap_global_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	/* A writeable 2MB mapping must arrive with the modified bit preset. */
 	KASSERT((newpde & (pmap_modified_bit(pmap) | PG_RW)) != PG_RW,
 	    ("pmap_enter_pde: newpde is missing PG_M"));
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Obtain the page directory page that covers "va".  The PV list
 	 * lock pointer is withheld when PMAP_ENTER_NOSLEEP is set.
 	 */
 	if ((pdpg = pmap_allocpde(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
 	    NULL : lockp)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return (KERN_RESOURCE_SHORTAGE);
 	}
 
 	/*
 	 * If pkru is not same for the whole pde range, return failure
 	 * and let vm_fault() cope.  Check after pde allocation, since
 	 * it could sleep.
 	 */
 	if (!pmap_pkru_same(pmap, va, va + NBPDR)) {
 		/* Give back the page directory page reference taken above. */
 		SLIST_INIT(&free);
 		if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
 			pmap_invalidate_page(pmap, va);
 			vm_page_free_pages_toq(&free, true);
 		}
 		return (KERN_FAILURE);
 	}
 	/*
 	 * For a user address in a PT_X86 pmap, replace whatever protection
 	 * key bits "newpde" carries with the key assigned to "va".
 	 */
 	if (va < VM_MAXUSER_ADDRESS && pmap->pm_type == PT_X86) {
 		newpde &= ~X86_PG_PKU_MASK;
 		newpde |= pmap_pkru_get(pmap, va);
 	}
 
 	/* Locate the PDE for "va" through the direct map. */
 	pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 	pde = &pde[pmap_pde_index(va)];
 	oldpde = *pde;
 	if ((oldpde & PG_V) != 0) {
 		KASSERT(pdpg->wire_count > 1,
 		    ("pmap_enter_pde: pdpg's wire count is too low"));
 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
 			/* Drop the reference from pmap_allocpde() and fail. */
 			pdpg->wire_count--;
 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return (KERN_FAILURE);
 		}
 		/* Break the existing mapping(s). */
 		SLIST_INIT(&free);
 		if ((oldpde & PG_PS) != 0) {
 			/*
 			 * The reference to the PD page that was acquired by
 			 * pmap_allocpde() ensures that it won't be freed.
 			 * However, if the PDE resulted from a promotion, then
 			 * a reserved PT page could be freed.
 			 */
 			(void)pmap_remove_pde(pmap, pde, va, &free, lockp);
 			if ((oldpde & PG_G) == 0)
 				pmap_invalidate_pde_page(pmap, va, oldpde);
 		} else {
 			/*
 			 * The old PDE refers to a page table page: remove
 			 * every 4KB mapping in the 2MB range.
 			 */
 			pmap_delayed_invl_start();
 			if (pmap_remove_ptes(pmap, va, va + NBPDR, pde, &free,
 			    lockp))
 		               pmap_invalidate_all(pmap);
 			pmap_delayed_invl_finish();
 		}
 		vm_page_free_pages_toq(&free, true);
 		if (va >= VM_MAXUSER_ADDRESS) {
 			/*
 			 * Both pmap_remove_pde() and pmap_remove_ptes() will
 			 * leave the kernel page table page zero filled.
 			 */
 			mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 			if (pmap_insert_pt_page(pmap, mt, false))
 				panic("pmap_enter_pde: trie insert failed");
 		} else
 			KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
 			    pde));
 	}
 	if ((newpde & PG_MANAGED) != 0) {
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
 		if (!pmap_pv_insert_pde(pmap, va, newpde, flags, lockp)) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_ptp(pmap, va, pdpg, &free)) {
 				/*
 				 * Although "va" is not mapped, paging-
 				 * structure caches could nonetheless have
 				 * entries that refer to the freed page table
 				 * pages.  Invalidate those entries.
 				 */
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, true);
 			}
 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 		/*
 		 * This is the only use of "m": flag every 4KB page backing
 		 * the managed, writeable 2MB mapping as PGA_WRITEABLE.
 		 */
 		if ((newpde & PG_RW) != 0) {
 			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 				vm_page_aflag_set(mt, PGA_WRITEABLE);
 		}
 	}
 
 	/*
 	 * Increment counters.
 	 */
 	if ((newpde & PG_W) != 0)
 		pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
 	pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
 
 	/*
 	 * Map the superpage.  (This is not a promoted mapping; there will not
 	 * be any lingering 4KB page mappings in the TLB.)
 	 */
 	pde_store(pde, newpde);
 
 	atomic_add_long(&pmap_pde_mappings, 1);
 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (KERN_SUCCESS);
 }
 
 /*
  * Maps a run of resident pages that all belong to one object.  The run
  * starts at m_start, which is mapped at the virtual address start.  Every
  * later page is mapped at start plus the distance, in pages, between its
  * object offset and that of m_start.  The run ends with the last page
  * whose virtual address still lies below end.  Only offsets for which a
  * resident page exists are mapped, so holes between start and end are
  * possible.  Where the address is 2MB aligned, the whole 2MB fits below
  * end, the page is the head of a superpage and superpages are enabled, a
  * single 2MB mapping is tried first; otherwise, or if that fails, the page
  * is entered individually.
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	struct rwlock *pvlock;
 	vm_offset_t va;
 	vm_page_t m, ptpage;
 	vm_pindex_t npages, pgoff;
 
 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
 
 	npages = atop(end - start);
 	ptpage = NULL;
 	pvlock = NULL;
 	PMAP_LOCK(pmap);
 	for (m = m_start; m != NULL; m = TAILQ_NEXT(m, listq)) {
 		pgoff = m->pindex - m_start->pindex;
 		if (pgoff >= npages)
 			break;
 		va = start + ptoa(pgoff);
 
 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
 		    pmap_enter_2mpage(pmap, va, m, prot, &pvlock)) {
 			/*
 			 * Step to the last 4KB page of the superpage so that
 			 * the loop increment moves past it.
 			 */
 			m = &m[NBPDR / PAGE_SIZE - 1];
 			continue;
 		}
 
 		/* Remember the page table page as a hint for the next page. */
 		ptpage = pmap_enter_quick_locked(pmap, va, m, prot, ptpage,
 		    &pvlock);
 	}
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Fast path for entering a single mapping.  It is *MUCH* cheaper than
  * pmap_enter(), but only because it relies on some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. The mapping is not wired.
  * 3. Only read access is wanted.
  * 4. No page table pages are being mapped.
  *
  * The work is done by pmap_enter_quick_locked(), called with the pmap
  * lock held and without a page table page hint.
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	struct rwlock *pvlock = NULL;
 
 	PMAP_LOCK(pmap);
 	/* The page table page returned by the helper is of no interest here. */
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &pvlock);
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Creates a 4KB mapping of page "m" at "va" if, and only if, nothing is
  * mapped there yet.  The function never replaces an existing entry, never
  * retries a failed page table page allocation, and gives up if a PV entry
  * cannot be inserted.  The PTE it stores has neither the writeable nor the
  * wired bit set.
  *
  * "mpte" is a hint: the page table page used by a preceding call, or NULL.
  * It is reused when its pindex matches the page table page needed for "va".
  *
  * Returns the page table page holding the new mapping for a user address.
  * Returns NULL for kernel addresses and whenever no mapping was created
  * (2MB mapping present, allocation failure, "va" already mapped, or PV
  * entry insertion failure).
  *
  * The pmap lock must be held.
  */
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 {
 	struct spglist free;
 	pt_entry_t newpte, *pte, PG_V;
 
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		vm_pindex_t ptepindex;
 		pd_entry_t *ptepa;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		ptepindex = pmap_pde_pindex(va);
 		if (mpte && (mpte->pindex == ptepindex)) {
 			/* The hint matches: just take another reference. */
 			mpte->wire_count++;
 		} else {
 			/*
 			 * Get the page directory entry
 			 */
 			ptepa = pmap_pde(pmap, va);
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.  Otherwise, we
 			 * attempt to allocate a page table page.  If this
 			 * attempt fails, we don't retry.  Instead, we give up.
 			 */
 			if (ptepa && (*ptepa & PG_V) != 0) {
 				/* A 2MB mapping already covers "va". */
 				if (*ptepa & PG_PS)
 					return (NULL);
 				mpte = PHYS_TO_VM_PAGE(*ptepa & PG_FRAME);
 				mpte->wire_count++;
 			} else {
 				/*
 				 * Pass NULL instead of the PV list lock
 				 * pointer, because we don't intend to sleep.
 				 */
 				mpte = _pmap_allocpte(pmap, ptepindex, NULL);
 				if (mpte == NULL)
 					return (mpte);
 			}
 		}
 		/* Reach the PTE through the direct map of the page table page. */
 		pte = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 		pte = &pte[pmap_pte_index(va)];
 	} else {
 		/* Kernel address: no page table page is tracked. */
 		mpte = NULL;
 		pte = vtopte(va);
 	}
 	/*
 	 * Any non-zero PTE counts as an existing mapping: undo the reference
 	 * taken above and report that nothing was entered.
 	 */
 	if (*pte) {
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 		if (mpte != NULL) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 				/*
 				 * Although "va" is not mapped, paging-
 				 * structure caches could nonetheless have
 				 * entries that refer to the freed page table
 				 * pages.  Invalidate those entries.
 				 */
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, true);
 			}
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap_resident_count_inc(pmap, 1);
 
 	/*
 	 * Build the PTE: valid, with the page's cache attributes, managed if
 	 * the page is, no-execute unless execute permission was requested,
 	 * and for user addresses the user bit plus the key assigned to "va".
 	 */
 	newpte = VM_PAGE_TO_PHYS(m) | PG_V |
 	    pmap_cache_bits(pmap, m->md.pat_mode, 0);
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		newpte |= PG_MANAGED;
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		newpte |= pg_nx;
 	if (va < VM_MAXUSER_ADDRESS)
 		newpte |= PG_U | pmap_pkru_get(pmap, va);
 	pte_store(pte, newpte);
 	return (mpte);
 }
 
 /*
  * Installs a temporary mapping of the physical address "pa" in slot "i"
  * of the crash dump map.  This is meant for panic dumps only.  Note that
  * the value returned is always the base of the crash dump map, not the
  * address of slot "i".
  */
 void *
 pmap_kenter_temporary(vm_paddr_t pa, int i)
 {
 	vm_offset_t slot_va;
 
 	slot_va = (vm_offset_t)crashdumpmap;
 	slot_va += i * PAGE_SIZE;
 
 	pmap_kenter(slot_va, pa);
 	/* Flush any stale translation for the slot on this CPU. */
 	invlpg(slot_va);
 
 	return ((void *)crashdumpmap);
 }
 
 /*
  * This code maps large physical mmap regions into the
  * processor address space.  Note that some shortcuts
  * are taken, but the code works.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 	pd_entry_t *pde;
 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 	vm_paddr_t pa, ptepa;
 	vm_page_t p, pdpg;
 	int pat_mode;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 	/*
 	 * Nothing is done unless both the start address and the size are
 	 * multiples of 2MB; only 2MB page mappings are ever created here.
 	 */
 	if ((addr & (NBPDR - 1)) == 0 && (size & (NBPDR - 1)) == 0) {
 		if (!pmap_ps_enabled(pmap))
 			return;
 		if (!vm_object_populate(object, pindex, pindex + atop(size)))
 			return;
 		p = vm_page_lookup(object, pindex);
 		KASSERT(p->valid == VM_PAGE_BITS_ALL,
 		    ("pmap_object_init_pt: invalid page %p", p));
 		/* Every page must share the first page's memory attribute. */
 		pat_mode = p->md.pat_mode;
 
 		/*
 		 * Abort the mapping if the first page is not physically
 		 * aligned to a 2MB page boundary.
 		 */
 		ptepa = VM_PAGE_TO_PHYS(p);
 		if (ptepa & (NBPDR - 1))
 			return;
 
 		/*
 		 * Skip the first page.  Abort the mapping if the rest of
 		 * the pages are not physically contiguous or have differing
 		 * memory attributes.
 		 */
 		p = TAILQ_NEXT(p, listq);
 		for (pa = ptepa + PAGE_SIZE; pa < ptepa + size;
 		    pa += PAGE_SIZE) {
 			KASSERT(p->valid == VM_PAGE_BITS_ALL,
 			    ("pmap_object_init_pt: invalid page %p", p));
 			if (pa != VM_PAGE_TO_PHYS(p) ||
 			    pat_mode != p->md.pat_mode)
 				return;
 			p = TAILQ_NEXT(p, listq);
 		}
 
 		/*
 		 * Map using 2MB pages.  Since "ptepa" is 2M aligned and
 		 * "size" is a multiple of 2M, adding the PAT setting to "pa"
 		 * will not affect the termination of this loop.
 		 */
 		PMAP_LOCK(pmap);
 		for (pa = ptepa | pmap_cache_bits(pmap, pat_mode, 1);
 		    pa < ptepa + size; pa += NBPDR) {
 			/* NULL lock pointer: the allocation does not block. */
 			pdpg = pmap_allocpde(pmap, addr, NULL);
 			if (pdpg == NULL) {
 				/*
 				 * The creation of mappings below is only an
 				 * optimization.  If a page directory page
 				 * cannot be allocated without blocking,
 				 * continue on to the next mapping rather than
 				 * blocking.
 				 */
 				addr += NBPDR;
 				continue;
 			}
 			pde = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pdpg));
 			pde = &pde[pmap_pde_index(addr)];
 			if ((*pde & PG_V) == 0) {
 				/*
 				 * "pa" already includes the cache bits.  The
 				 * mapping is a user-accessible, writeable 2MB
 				 * page with the accessed and modified bits
 				 * preset.
 				 */
 				pde_store(pde, pa | PG_PS | PG_M | PG_A |
 				    PG_U | PG_RW | PG_V);
 				pmap_resident_count_inc(pmap, NBPDR / PAGE_SIZE);
 				atomic_add_long(&pmap_pde_mappings, 1);
 			} else {
 				/* Continue on if the PDE is already valid. */
 				pdpg->wire_count--;
 				KASSERT(pdpg->wire_count > 0,
 				    ("pmap_object_init_pt: missing reference "
 				    "to page directory page, va: 0x%lx", addr));
 			}
 			addr += NBPDR;
 		}
 		PMAP_UNLOCK(pmap);
 	}
 }
 
 /*
  *	Clear the wired attribute from the mappings for the specified range of
  *	addresses in the given pmap.  Every valid mapping within that range
  *	must have the wired attribute set.  In contrast, invalid mappings
  *	cannot have the wired attribute set, so they are ignored.
  *
  *	The wired attribute of the page table entry is not a hardware
  *	feature, so there is no need to invalidate any TLB entries.
  *	Since pmap_demote_pde() for the wired entry must never fail,
  *	pmap_delayed_invl_start()/finish() calls around the
  *	function are not needed.
  */
 void
 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va_next;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_V;
 
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 	/*
 	 * Walk the range one page directory entry (2MB) at a time, skipping
 	 * whole regions whose upper-level entries are invalid.  In each
 	 * case "va_next" is the start of the next region; a result that is
 	 * smaller than "sva" means the address wrapped, so the walk is
 	 * ended by jumping to "eva".
 	 */
 	for (; sva < eva; sva = va_next) {
 		pml4e = pmap_pml4e(pmap, sva);
 		if ((*pml4e & PG_V) == 0) {
 			va_next = (sva + NBPML4) & ~PML4MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 		if ((*pdpe & PG_V) == 0) {
 			va_next = (sva + NBPDP) & ~PDPMASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 		va_next = (sva + NBPDR) & ~PDRMASK;
 		if (va_next < sva)
 			va_next = eva;
 		pde = pmap_pdpe_to_pde(pdpe, sva);
 		if ((*pde & PG_V) == 0)
 			continue;
 		if ((*pde & PG_PS) != 0) {
 			if ((*pde & PG_W) == 0)
 				panic("pmap_unwire: pde %#jx is missing PG_W",
 				    (uintmax_t)*pde);
 
 			/*
 			 * Are we unwiring the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == va_next && eva >= va_next) {
 				atomic_clear_long(pde, PG_W);
 				/* One 2MB mapping counts as NBPDR / PAGE_SIZE pages. */
 				pmap->pm_stats.wired_count -= NBPDR /
 				    PAGE_SIZE;
 				continue;
 			} else if (!pmap_demote_pde(pmap, pde, sva))
 				panic("pmap_unwire: demotion failed");
 		}
 		/* Do not run past the end of the requested range. */
 		if (va_next > eva)
 			va_next = eva;
 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
 			if ((*pte & PG_V) == 0)
 				continue;
 			if ((*pte & PG_W) == 0)
 				panic("pmap_unwire: pte %#jx is missing PG_W",
 				    (uintmax_t)*pte);
 
 			/*
 			 * PG_W must be cleared atomically.  Although the pmap
 			 * lock synchronizes access to PG_W, another processor
 			 * could be setting PG_M and/or PG_A concurrently.
 			 */
 			atomic_clear_long(pte, PG_W);
 			pmap->pm_stats.wired_count--;
 		}
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
     vm_offset_t src_addr)
 {
 	struct rwlock *lock;
 	struct spglist free;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde, srcptepaddr;
 	pt_entry_t *dst_pte, PG_A, PG_M, PG_V, ptetemp, *src_pte;
 	vm_offset_t addr, end_addr, va_next;
 	vm_page_t dst_pdpg, dstmpte, srcmpte;
 
 	/* Only copies to the same virtual address are attempted. */
 	if (dst_addr != src_addr)
 		return;
 
 	/* Both pmaps must use the same page table format. */
 	if (dst_pmap->pm_type != src_pmap->pm_type)
 		return;
 
 	/*
 	 * EPT page table entries that require emulation of A/D bits are
 	 * sensitive to clearing the PG_A bit (aka EPT_PG_READ). Although
 	 * we clear PG_M (aka EPT_PG_WRITE) concomitantly, the PG_U bit
 	 * (aka EPT_PG_EXECUTE) could still be set. Since some EPT
 	 * implementations flag an EPT misconfiguration for exec-only
 	 * mappings we skip this function entirely for emulated pmaps.
 	 */
 	if (pmap_emulate_ad_bits(dst_pmap))
 		return;
 
 	end_addr = src_addr + len;
 	lock = NULL;
 	/*
 	 * Lock both pmaps, lower address first, so that the pair is always
 	 * acquired in the same order.
 	 */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 
 	/*
 	 * The bit encodings are taken from the destination pmap; the type
 	 * check above established that the source pmap has the same type.
 	 */
 	PG_A = pmap_accessed_bit(dst_pmap);
 	PG_M = pmap_modified_bit(dst_pmap);
 	PG_V = pmap_valid_bit(dst_pmap);
 
 	/*
 	 * Walk the source page tables one page directory entry at a time.
 	 * "va_next" is the start of the next region; if computing it wraps
 	 * around, the walk ends by jumping to "end_addr".
 	 */
 	for (addr = src_addr; addr < end_addr; addr = va_next) {
 		KASSERT(addr < UPT_MIN_ADDRESS,
 		    ("pmap_copy: invalid to pmap_copy page tables"));
 
 		pml4e = pmap_pml4e(src_pmap, addr);
 		if ((*pml4e & PG_V) == 0) {
 			va_next = (addr + NBPML4) & ~PML4MASK;
 			if (va_next < addr)
 				va_next = end_addr;
 			continue;
 		}
 
 		pdpe = pmap_pml4e_to_pdpe(pml4e, addr);
 		if ((*pdpe & PG_V) == 0) {
 			va_next = (addr + NBPDP) & ~PDPMASK;
 			if (va_next < addr)
 				va_next = end_addr;
 			continue;
 		}
 
 		va_next = (addr + NBPDR) & ~PDRMASK;
 		if (va_next < addr)
 			va_next = end_addr;
 
 		pde = pmap_pdpe_to_pde(pdpe, addr);
 		srcptepaddr = *pde;
 		if (srcptepaddr == 0)
 			continue;
 
 		if (srcptepaddr & PG_PS) {
 			/*
 			 * A 2MB source mapping is copied only as a whole: it
 			 * is skipped unless "addr" is 2MB aligned and the
 			 * full 2MB lies below "end_addr".
 			 */
 			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
 				continue;
 			dst_pdpg = pmap_allocpde(dst_pmap, addr, NULL);
 			/* Allocation failure ends the copy. */
 			if (dst_pdpg == NULL)
 				break;
 			/* From here on, "pde" refers to the destination PDE. */
 			pde = (pd_entry_t *)
 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_pdpg));
 			pde = &pde[pmap_pde_index(addr)];
 			/*
 			 * Install the copy only into an empty destination
 			 * PDE and, for a managed mapping, only if a PV entry
 			 * can be inserted without reclaiming one.  The wired
 			 * bit is not copied.
 			 */
 			if (*pde == 0 && ((srcptepaddr & PG_MANAGED) == 0 ||
 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
 			    PMAP_ENTER_NORECLAIM, &lock))) {
 				*pde = srcptepaddr & ~PG_W;
 				pmap_resident_count_inc(dst_pmap, NBPDR /
 				    PAGE_SIZE);
 				atomic_add_long(&pmap_pde_mappings, 1);
 			} else
 				dst_pdpg->wire_count--;
 			continue;
 		}
 
 		/* The source PDE refers to a page table page. */
 		srcptepaddr &= PG_FRAME;
 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 		KASSERT(srcmpte->wire_count > 0,
 		    ("pmap_copy: source page table page is unused"));
 
 		if (va_next > end_addr)
 			va_next = end_addr;
 
 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 		src_pte = &src_pte[pmap_pte_index(addr)];
 		/*
 		 * "dstmpte" caches the destination page table page for this
 		 * 2MB region; it is allocated on first use.
 		 */
 		dstmpte = NULL;
 		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
 			ptetemp = *src_pte;
 
 			/*
 			 * We only virtual copy managed pages.
 			 */
 			if ((ptetemp & PG_MANAGED) == 0)
 				continue;
 
 			if (dstmpte != NULL) {
 				KASSERT(dstmpte->pindex ==
 				    pmap_pde_pindex(addr),
 				    ("dstmpte pindex/addr mismatch"));
 				dstmpte->wire_count++;
 			} else if ((dstmpte = pmap_allocpte(dst_pmap, addr,
 			    NULL)) == NULL)
 				goto out;
 			dst_pte = (pt_entry_t *)
 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 			dst_pte = &dst_pte[pmap_pte_index(addr)];
 			if (*dst_pte == 0 &&
 			    pmap_try_insert_pv_entry(dst_pmap, addr,
 			    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME), &lock)) {
 				/*
 				 * Clear the wired, modified, and accessed
 				 * (referenced) bits during the copy.
 				 */
 				*dst_pte = ptetemp & ~(PG_W | PG_M | PG_A);
 				pmap_resident_count_inc(dst_pmap, 1);
 			} else {
 				/*
 				 * The destination slot is occupied or no PV
 				 * entry was available: release the reference
 				 * taken above and abandon the whole copy.
 				 */
 				SLIST_INIT(&free);
 				if (pmap_unwire_ptp(dst_pmap, addr, dstmpte,
 				    &free)) {
 					/*
 					 * Although "addr" is not mapped,
 					 * paging-structure caches could
 					 * nonetheless have entries that refer
 					 * to the freed page table pages.
 					 * Invalidate those entries.
 					 */
 					pmap_invalidate_page(dst_pmap, addr);
 					vm_page_free_pages_toq(&free, true);
 				}
 				goto out;
 			}
 			/* Have we copied all of the valid mappings? */
 			if (dstmpte->wire_count >= srcmpte->wire_count)
 				break;
 		}
 	}
 	/* Release the PV list lock, if one is held, then both pmap locks. */
 out:
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }
 
 /*
  * Copies the protection key assignments of src_pmap into dst_pmap by way
  * of pmap_pkru_copy().  Nothing is done, and 0 is returned, unless both
  * pmaps are of type PT_X86 and the CPUID_STDEXT2_PKU feature bit is set.
  * When the copy fails with ENOMEM, whatever was assigned to dst_pmap is
  * removed again, the locks are dropped, and the copy is retried after
  * vm_wait().  Any other result of pmap_pkru_copy() is returned as is.
  */
 int
 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
 {
 	int error;
 
 	if (dst_pmap->pm_type != src_pmap->pm_type)
 		return (0);
 	if (dst_pmap->pm_type != PT_X86)
 		return (0);
 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
 		return (0);
 
 	do {
 		/* Take the two pmap locks lower address first. */
 		if (dst_pmap < src_pmap) {
 			PMAP_LOCK(dst_pmap);
 			PMAP_LOCK(src_pmap);
 		} else {
 			PMAP_LOCK(src_pmap);
 			PMAP_LOCK(dst_pmap);
 		}
 
 		error = pmap_pkru_copy(dst_pmap, src_pmap);
 
 		/* Clean up partial copy on failure due to no memory. */
 		if (error == ENOMEM)
 			pmap_pkru_deassign_all(dst_pmap);
 
 		PMAP_UNLOCK(src_pmap);
 		PMAP_UNLOCK(dst_pmap);
 
 		/* Wait, with no locks held, before trying again. */
 		if (error == ENOMEM)
 			vm_wait(NULL);
 	} while (error == ENOMEM);
 
 	return (error);
 }
 
 /*
  * Fills the given hardware page with zeroes, addressing it through the
  * direct map.
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	void *page_kva;
 
 	page_kva = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	pagezero(page_kva);
 }
 
 /*
  * Zero an area within a single hardware page.  The area described by off
  * and size must not extend beyond that one page.  A request that covers
  * the entire page is handed to pagezero(); anything smaller is cleared
  * with bzero().
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	vm_offset_t page_kva;
 
 	page_kva = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	if (off != 0 || size != PAGE_SIZE) {
 		/* Partial page. */
 		bzero((char *)page_kva + off, size);
 		return;
 	}
 	pagezero((void *)page_kva);
 }
 
 /*
  * Copies the contents of one hardware page (msrc) to another (mdst).
  * Both pages are addressed through the direct map.
  */
 void
 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	void *from, *to;
 
 	from = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 	to = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 	pagecopy(from, to);
 }
 
 /*
  * Initialized to 1 here.  NOTE(review): nothing in this part of the file
  * reads this variable; presumably it tells code elsewhere that unmapped
  * buffers may be used on this platform -- confirm against its consumers.
  */
 int unmapped_buf_allowed = 1;
 
 /*
  * Copies xfersize bytes from the page array ma, starting at byte offset
  * a_offset, to the page array mb, starting at byte offset b_offset.  The
  * transfer is cut into pieces that cross a page boundary in neither
  * array.  For each piece the source and destination pages are made
  * addressable with pmap_map_io_transient(), and the mapping is undone
  * afterwards if that function reports that it created one.
  */
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
 {
 	vm_page_t pair[2];
 	vm_offset_t kva[2], src_off, dst_off;
 	char *src, *dst;
 	int chunk;
 	boolean_t transient;
 
 	for (; xfersize > 0; xfersize -= chunk) {
 		/* pair[0] is the source page, pair[1] the destination page. */
 		src_off = a_offset & PAGE_MASK;
 		pair[0] = ma[a_offset >> PAGE_SHIFT];
 		dst_off = b_offset & PAGE_MASK;
 		pair[1] = mb[b_offset >> PAGE_SHIFT];
 
 		/* Limit the piece to what is left in each of the two pages. */
 		chunk = min(xfersize, PAGE_SIZE - src_off);
 		chunk = min(chunk, PAGE_SIZE - dst_off);
 
 		transient = pmap_map_io_transient(pair, kva, 2, FALSE);
 		src = (char *)kva[0] + src_off;
 		dst = (char *)kva[1] + dst_off;
 		bcopy(src, dst, chunk);
 		if (__predict_false(transient))
 			pmap_unmap_io_transient(pair, kva, 2, FALSE);
 
 		a_offset += chunk;
 		b_offset += chunk;
 	}
 }
 
 /*
  * Reports whether one of the first 16 PV entries linked to the page
  * belongs to the given pmap.  The page's own PV list is examined first
  * and then, unless the page is fictitious, the PV list of its 2MB page;
  * the limit of 16 applies to both lists combined.  The limit may be
  * raised or lowered in the future: proper page aging only requires that
  * true be returned for a small subset of pmaps.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *pvlock;
 	pv_entry_t pv;
 	boolean_t found;
 	int budget;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 
 	found = FALSE;
 	budget = 16;
 	pvlock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(pvlock);
 
 	/* 4KB mappings of the page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			goto done;
 		}
 		if (--budget == 0)
 			goto done;
 	}
 
 	/* Fictitious pages have no 2MB page PV list to look at. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto done;
 
 	/* 2MB mappings containing the page, with what is left of the budget. */
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			goto done;
 		}
 		if (--budget == 0)
 			goto done;
 	}
 
 done:
 	rw_runlock(pvlock);
 	return (found);
 }
 
 /*
  *	pmap_page_wired_mappings:
  *
  *	Return the number of managed mappings to the given physical page
  *	that are wired.
  */
 int
 pmap_page_wired_mappings(vm_page_t m)
 {
 	struct rwlock *lock;
 	struct md_page *pvh;
 	pmap_t pmap;
 	pt_entry_t *pte;
 	pv_entry_t pv;
 	int count, md_gen, pvh_gen;
 
 	/* Unmanaged pages are reported as having no wired mappings. */
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (0);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 	/* The count starts over whenever the scan has to be restarted. */
 restart:
 	count = 0;
 	/* First pass: the page's 4KB mappings. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * The pmap lock could not be taken without waiting.  Record
 		 * the PV list generation, drop the PV list lock, wait for
 		 * the pmap lock, and retake the PV list lock.  If the
 		 * generation changed in the meantime, start over.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va);
 		if ((*pte & PG_W) != 0)
 			count++;
 		PMAP_UNLOCK(pmap);
 	}
 	/* Second pass: 2MB mappings; skipped for fictitious pages. */
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			/*
 			 * Same protocol as above, except that the generation
 			 * counts of both PV lists are checked.
 			 */
 			if (!PMAP_TRYLOCK(pmap)) {
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			/* Here "pte" holds a pointer to the PDE. */
 			pte = pmap_pde(pmap, pv->pv_va);
 			if ((*pte & PG_W) != 0)
 				count++;
 			PMAP_UNLOCK(pmap);
 		}
 	}
 	rw_runlock(lock);
 	return (count);
 }
 
 /*
  * Returns TRUE if the given page has at least one managed mapping, either
  * an individual one or as part of a 2mpage.  Otherwise, returns FALSE;
  * unmanaged pages always yield FALSE.
  */
 boolean_t
 pmap_page_is_mapped(vm_page_t m)
 {
 	struct rwlock *pvlock;
 	boolean_t mapped;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (FALSE);
 
 	pvlock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(pvlock);
 	if (!TAILQ_EMPTY(&m->md.pv_list)) {
 		/* At least one 4KB mapping exists. */
 		mapped = TRUE;
 	} else if ((m->flags & PG_FICTITIOUS) != 0) {
 		/* The 2mpage PV list is not consulted for fictitious pages. */
 		mapped = FALSE;
 	} else {
 		mapped = !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list);
 	}
 	rw_runlock(pvlock);
 	return (mapped);
 }
 
 /*
  * Destroy all managed, non-wired mappings in the given user-space
  * pmap.  This pmap cannot be active on any processor besides the
  * caller.
  *
  * This function cannot be applied to the kernel pmap.  Moreover, it
  * is not intended for general use.  It is only to be used during
  * process termination.  Consequently, it can be implemented in ways
  * that make it faster than pmap_remove().  First, it can more quickly
  * destroy mappings by iterating over the pmap's collection of PV
  * entries, rather than searching the page table.  Second, it doesn't
  * have to test and clear the page table entries atomically, because
  * no processor is currently accessing the user address space.  In
  * particular, a page table entry's dirty bit won't change state once
  * this function starts.
  *
  * Although this function destroys all of the pmap's managed,
  * non-wired mappings, it can delay and batch the invalidation of TLB
  * entries without calling pmap_delayed_invl_start() and
  * pmap_delayed_invl_finish().  Because the pmap is not active on
  * any other processor, none of these TLB entries will ever be used
  * before their eventual invalidation.  Consequently, there is no need
  * for either pmap_remove_all() or pmap_remove_write() to wait for
  * that eventual TLB invalidation.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	pd_entry_t ptepde;
 	pt_entry_t *pte, tpte;
 	pt_entry_t PG_M, PG_RW, PG_V;
 	struct spglist free;
 	vm_page_t m, mpte, mt;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	struct pv_chunk *pc, *npc;
 	struct rwlock *lock;
 	int64_t bit;
 	uint64_t inuse, bitmask;
 	int allfree, field, freed, idx;
 	boolean_t superpage;
 	vm_paddr_t pa;
 
 	/*
 	 * Assert that the given pmap is only active on the current
 	 * CPU.  Unfortunately, we cannot block another CPU from
 	 * activating the pmap while this function is executing.
 	 */
 	KASSERT(pmap == PCPU_GET(curpmap), ("non-current pmap %p", pmap));
 #ifdef INVARIANTS
 	{
 		cpuset_t other_cpus;
 
 		other_cpus = all_cpus;
 		critical_enter();
 		CPU_CLR(PCPU_GET(cpuid), &other_cpus);
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		critical_exit();
 		KASSERT(CPU_EMPTY(&other_cpus), ("pmap active %p", pmap));
 	}
 #endif
 
 	/* No PV list lock is held yet; one is acquired on demand below. */
 	lock = NULL;
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	/* Page table pages released below are batched on "free". */
 	SLIST_INIT(&free);
 	PMAP_LOCK(pmap);
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		/* "allfree" is cleared if any entry in this chunk must stay. */
 		allfree = 1;
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/* A clear bit in pc_map[] marks an allocated PV entry. */
 			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = bsfq(inuse);
 				bitmask = 1UL << bit;
 				idx = field * 64 + bit;
 				pv = &pc->pc_pventry[idx];
 				/*
 				 * Clear the bit in the local copy now so that
 				 * the "continue" below advances to the next
 				 * entry.
 				 */
 				inuse &= ~bitmask;
 
 				/*
 				 * Walk from the PDPE down to the entry that
 				 * maps pv_va.  "ptepde" ends up holding the
 				 * entry one level above "tpte".
 				 */
 				pte = pmap_pdpe(pmap, pv->pv_va);
 				ptepde = *pte;
 				pte = pmap_pdpe_to_pde(pte, pv->pv_va);
 				tpte = *pte;
 				if ((tpte & (PG_PS | PG_V)) == PG_V) {
 					superpage = FALSE;
 					ptepde = tpte;
 					pte = (pt_entry_t *)PHYS_TO_DMAP(tpte &
 					    PG_FRAME);
 					pte = &pte[pmap_pte_index(pv->pv_va)];
 					tpte = *pte;
 				} else {
 					/*
 					 * Keep track whether 'tpte' is a
 					 * superpage explicitly instead of
 					 * relying on PG_PS being set.
 					 *
 					 * This is because PG_PS is numerically
 					 * identical to PG_PTE_PAT and thus a
 					 * regular page could be mistaken for
 					 * a superpage.
 					 */
 					superpage = TRUE;
 				}
 
 				if ((tpte & PG_V) == 0) {
 					panic("bad pte va %lx pte %lx",
 					    pv->pv_va, tpte);
 				}
 
 				/*
 				 * We cannot remove wired pages from a
 				 * process' mapping at this time.  The PV
 				 * entry stays allocated, so the chunk cannot
 				 * be freed.
 				 */
 				if (tpte & PG_W) {
 					allfree = 0;
 					continue;
 				}
 
 				if (superpage)
 					pa = tpte & PG_PS_FRAME;
 				else
 					pa = tpte & PG_FRAME;
 
 				m = PHYS_TO_VM_PAGE(pa);
 				KASSERT(m->phys_addr == pa,
 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 				    m, (uintmax_t)m->phys_addr,
 				    (uintmax_t)tpte));
 
 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 				    m < &vm_page_array[vm_page_array_size],
 				    ("pmap_remove_pages: bad tpte %#jx",
 				    (uintmax_t)tpte));
 
 				pte_clear(pte);
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 */
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 					if (superpage) {
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 							vm_page_dirty(mt);
 					} else
 						vm_page_dirty(m);
 				}
 
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 
 				/* Mark free */
 				pc->pc_map[field] |= bitmask;
 				if (superpage) {
 					pmap_resident_count_dec(pmap, NBPDR / PAGE_SIZE);
 					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 					pvh->pv_gen++;
 					/*
 					 * Once the last 2MB mapping is gone,
 					 * constituent pages that also have no
 					 * 4KB mappings are no longer writeable.
 					 */
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
 							    TAILQ_EMPTY(&mt->md.pv_list))
 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
 					}
 					/*
 					 * Release the page table page, if any,
 					 * that was saved for this 2MB mapping.
 					 */
 					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
 					if (mpte != NULL) {
 						KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
 						    ("pmap_remove_pages: pte page not promoted"));
 						pmap_resident_count_dec(pmap, 1);
 						KASSERT(mpte->wire_count == NPTEPG,
 						    ("pmap_remove_pages: pte page wire count error"));
 						mpte->wire_count = 0;
 						pmap_add_delayed_free_list(mpte, &free, FALSE);
 					}
 				} else {
 					pmap_resident_count_dec(pmap, 1);
 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 					m->md.pv_gen++;
 					/*
 					 * PGA_WRITEABLE is cleared only when
 					 * neither a 4KB nor a 2MB mapping of
 					 * the page remains.
 					 */
 					if ((m->aflags & PGA_WRITEABLE) != 0 &&
 					    TAILQ_EMPTY(&m->md.pv_list) &&
 					    (m->flags & PG_FICTITIOUS) == 0) {
 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 						if (TAILQ_EMPTY(&pvh->pv_list))
 							vm_page_aflag_clear(m, PGA_WRITEABLE);
 					}
 				}
 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 				freed++;
 			}
 		}
 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		/* Release the chunk itself if every entry in it was freed. */
 		if (allfree) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			free_pv_chunk(pc);
 		}
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
 	/* A single batched invalidation covers every entry cleared above. */
 	pmap_invalidate_all(pmap);
 	pmap_pkru_deassign_all(pmap);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * Tests whether any mapping of the given page has all of the requested
  * bits set.  If "modified" is TRUE, the mapping's PG_RW and PG_M bits
  * are required; if "accessed" is TRUE, its PG_V and PG_A bits are
  * required.  The bit values are looked up per pmap.
  *
  * The page's own PV list is scanned first, testing the entry returned
  * by pmap_pte(), and then, unless the page is fictitious, the list
  * returned by pa_to_pvh() is scanned, testing the entry returned by
  * pmap_pde().  Returns TRUE as soon as a matching mapping is found and
  * FALSE if none is.
  */
 static boolean_t
 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 {
 	struct rwlock *lock;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	pt_entry_t *pte, mask;
 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 	pmap_t pmap;
 	int md_gen, pvh_gen;
 	boolean_t rv;
 
 	rv = FALSE;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock cannot be taken without blocking, drop
 		 * the PV list lock, block on the pmap lock, and retake the
 		 * PV list lock.  Restart the scan if the list's generation
 		 * count changed while the PV list lock was released.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va);
 		/* Build the mask with this pmap's own bit encodings. */
 		mask = 0;
 		if (modified) {
 			PG_M = pmap_modified_bit(pmap);
 			PG_RW = pmap_rw_bit(pmap);
 			mask |= PG_RW | PG_M;
 		}
 		if (accessed) {
 			PG_A = pmap_accessed_bit(pmap);
 			PG_V = pmap_valid_bit(pmap);
 			mask |= PG_V | PG_A;
 		}
 		rv = (*pte & mask) == mask;
 		PMAP_UNLOCK(pmap);
 		if (rv)
 			goto out;
 	}
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			/*
 			 * Same lock hand-off as above, but a change to either
 			 * list's generation count forces a restart.
 			 */
 			if (!PMAP_TRYLOCK(pmap)) {
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			/* Entries on this list are tested at the PDE level. */
 			pte = pmap_pde(pmap, pv->pv_va);
 			mask = 0;
 			if (modified) {
 				PG_M = pmap_modified_bit(pmap);
 				PG_RW = pmap_rw_bit(pmap);
 				mask |= PG_RW | PG_M;
 			}
 			if (accessed) {
 				PG_A = pmap_accessed_bit(pmap);
 				PG_V = pmap_valid_bit(pmap);
 				mask |= PG_V | PG_A;
 			}
 			rv = (*pte & mask) == mask;
 			PMAP_UNLOCK(pmap);
 			if (rv)
 				goto out;
 		}
 	}
 out:
 	rw_runlock(lock);
 	return (rv);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Report whether any physical map holds a mapping of the specified
  *	managed page that has been modified.  The page's object must be
  *	write locked by the caller.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	/*
 	 * The mappings only need to be inspected if the page is exclusive
 	 * busied or is marked PGA_WRITEABLE.  Otherwise, with the object
 	 * locked, PGA_WRITEABLE cannot be concurrently set, and a page
 	 * without PGA_WRITEABLE has no PTE with PG_M set.
 	 */
 	if (vm_page_xbusied(m) || (m->aflags & PGA_WRITEABLE) != 0)
 		return (pmap_page_test_mappings(m, FALSE, TRUE));
 	return (FALSE);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Report whether the specified virtual address is a candidate for
  *	prefaulting: TRUE only if a valid, non-superpage page directory
  *	entry covers the address and the page table entry beneath it is
  *	not yet valid.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pd_entry_t *pdep;
 	pt_entry_t PG_V, *ptep;
 	boolean_t eligible;
 
 	eligible = FALSE;
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 	pdep = pmap_pde(pmap, addr);
 	/* A missing, invalid, or 2MB page directory entry disqualifies. */
 	if (pdep == NULL || (*pdep & (PG_PS | PG_V)) != PG_V)
 		goto done;
 	ptep = pmap_pde_to_pte(pdep, addr);
 	if ((*ptep & PG_V) == 0)
 		eligible = TRUE;
 done:
 	PMAP_UNLOCK(pmap);
 	return (eligible);
 }
 
 /*
  *	pmap_is_referenced:
  *
  *	Report whether any physical map holds a mapping of the specified
  *	managed page that has been referenced.
  */
 boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 	boolean_t referenced;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 	/* Test only the accessed state; the modified state is ignored. */
 	referenced = pmap_page_test_mappings(m, TRUE, FALSE);
 	return (referenced);
 }
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  *
  * The page must be managed and its object write locked.  Writable 2MB
  * mappings of the page are demoted first; then PG_RW and PG_M are
  * cleared from every 4KB mapping.  If a cleared mapping had PG_M set,
  * the page is marked dirty.  On return PGA_WRITEABLE has been cleared
  * from the page.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	struct md_page *pvh;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pv_entry_t next_pv, pv;
 	pd_entry_t *pde;
 	pt_entry_t oldpte, *pte, PG_M, PG_RW;
 	vm_offset_t va;
 	int pvh_gen, md_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 
 	/*
 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 	 * set by another thread while the object is locked.  Thus,
 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* Fictitious pages use the dummy list in place of a pvh entry. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 retry_pv_loop:
 	rw_wlock(lock);
 	/* Pass 1: demote every writable 2MB mapping of the page. */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock cannot be taken without blocking, drop
 		 * the PV list lock, block on the pmap lock, and retake the
 		 * PV list lock.  If the list's generation count changed in
 		 * the interim, release both locks and start over.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		PG_RW = pmap_rw_bit(pmap);
 		va = pv->pv_va;
 		pde = pmap_pde(pmap, va);
 		if ((*pde & PG_RW) != 0)
 			(void)pmap_demote_pde_locked(pmap, pde, va, &lock);
 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 		    ("inconsistent pv lock %p %p for page %p",
 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: write protect every 4KB mapping of the page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen ||
 			    md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0,
 		    ("pmap_remove_write: found a 2mpage in page %p's pv list",
 		    m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 retry:
 		/*
 		 * Clear PG_RW and PG_M with a compare-and-set, retrying if
 		 * the PTE changed after it was read.
 		 */
 		oldpte = *pte;
 		if (oldpte & PG_RW) {
 			if (!atomic_cmpset_long(pte, oldpte, oldpte &
 			    ~(PG_RW | PG_M)))
 				goto retry;
 			/* Preserve the modified state in the vm_page. */
 			if ((oldpte & PG_M) != 0)
 				vm_page_dirty(m);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	pmap_delayed_invl_wait(m);
 }
 
 /*
  * Decide whether the referenced bit of the given entry may be cleared
  * in place.  This is always permitted for pmaps that do not emulate
  * accessed/dirty bits.  For the remaining (EPT) pmaps the answer
  * depends on the entry's write and execute permissions.
  */
 static __inline boolean_t
 safe_to_clear_referenced(pmap_t pmap, pt_entry_t pte)
 {
 
 	if (!pmap_emulate_ad_bits(pmap))
 		return (TRUE);
 
 	KASSERT(pmap->pm_type == PT_EPT, ("invalid pm_type %d", pmap->pm_type));
 
 	/*
 	 * The referenced bit doubles as EPT_PG_READ.  Clearing it while
 	 * EPT_PG_WRITE is set would produce XWR = 010 or 110, either of
 	 * which causes an unconditional EPT misconfiguration.
 	 */
 	if ((pte & EPT_PG_WRITE) != 0)
 		return (FALSE);
 
 	/*
 	 * Clearing it on an executable entry would produce XWR = 100,
 	 * which is allowed only if PMAP_SUPPORTS_EXEC_ONLY is set.
 	 */
 	if ((pte & EPT_PG_EXECUTE) != 0 &&
 	    (pmap->pm_flags & PMAP_SUPPORTS_EXEC_ONLY) == 0)
 		return (FALSE);
 
 	return (TRUE);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
  *	to pmap_is_modified().  However, since this function stops after
  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  *	dirty pages.  Those dirty pages will only be detected by a future call
  *	to pmap_is_modified().
  *
  *	A DI block is not needed within this function, because
  *	invalidations are performed before the PV list lock is
  *	released.
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv, pvf;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pd_entry_t oldpde, *pde;
 	pt_entry_t *pte, PG_A, PG_M, PG_RW;
 	vm_offset_t va;
 	vm_paddr_t pa;
 	int cleared, md_gen, not_cleared, pvh_gen;
 	struct spglist free;
 	boolean_t demoted;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	SLIST_INIT(&free);
 	cleared = 0;
 	pa = VM_PAGE_TO_PHYS(m);
 	lock = PHYS_TO_PV_LIST_LOCK(pa);
 	/* Fictitious pages use the dummy list in place of a pvh entry. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
 	rw_wlock(lock);
 retry:
 	/* "cleared" survives a retry; only "not_cleared" is recounted. */
 	not_cleared = 0;
 	/*
 	 * "pvf" remembers the first entry examined.  Each examined entry is
 	 * rotated to the tail of the list, so the loop ends once "pvf" is
 	 * at the head again.
 	 */
 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 		goto small_mappings;
 	pv = pvf;
 	do {
 		/* The previous first entry was destroyed; pick a new one. */
 		if (pvf == NULL)
 			pvf = pv;
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock cannot be taken without blocking, drop
 		 * the PV list lock, block on the pmap lock, and retake the
 		 * PV list lock.  Retry if the list changed in the interim.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		PG_A = pmap_accessed_bit(pmap);
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		va = pv->pv_va;
 		pde = pmap_pde(pmap, pv->pv_va);
 		oldpde = *pde;
 		if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 			/*
 			 * Although "oldpde" is mapping a 2MB page, because
 			 * this function is called at a 4KB page granularity,
 			 * we only update the 4KB page under test.
 			 */
 			vm_page_dirty(m);
 		}
 		if ((oldpde & PG_A) != 0) {
 			/*
 			 * Since this reference bit is shared by 512 4KB
 			 * pages, it should not be cleared every time it is
 			 * tested.  Apply a simple "hash" function on the
 			 * physical page number, the virtual superpage number,
 			 * and the pmap address to select one 4KB page out of
 			 * the 512 on which testing the reference bit will
 			 * result in clearing that reference bit.  This
 			 * function is designed to avoid the selection of the
 			 * same 4KB page for every 2MB page mapping.
 			 *
 			 * On demotion, a mapping that hasn't been referenced
 			 * is simply destroyed.  To avoid the possibility of a
 			 * subsequent page fault on a demoted wired mapping,
 			 * always leave its reference bit set.  Moreover,
 			 * since the superpage is wired, the current state of
 			 * its reference bit won't affect page replacement.
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 			    (oldpde & PG_W) == 0) {
 				if (safe_to_clear_referenced(pmap, oldpde)) {
 					atomic_clear_long(pde, PG_A);
 					pmap_invalidate_page(pmap, pv->pv_va);
 					demoted = FALSE;
 				} else if (pmap_demote_pde_locked(pmap, pde,
 				    pv->pv_va, &lock)) {
 					/*
 					 * Remove the mapping to a single page
 					 * so that a subsequent access may
 					 * repromote.  Since the underlying
 					 * page table page is fully populated,
 					 * this removal never frees a page
 					 * table page.
 					 */
 					demoted = TRUE;
 					va += VM_PAGE_TO_PHYS(m) - (oldpde &
 					    PG_PS_FRAME);
 					pte = pmap_pde_to_pte(pde, va);
 					pmap_remove_pte(pmap, pte, va, *pde,
 					    NULL, &lock);
 					pmap_invalidate_page(pmap, va);
 				} else
 					demoted = TRUE;
 
 				if (demoted) {
 					/*
 					 * The superpage mapping was removed
 					 * entirely and therefore 'pv' is no
 					 * longer valid.
 					 */
 					if (pvf == pv)
 						pvf = NULL;
 					pv = NULL;
 				}
 				cleared++;
 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 				    ("inconsistent pv lock %p %p for page %p",
 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 			pvh->pv_gen++;
 		}
 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 			goto out;
 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 small_mappings:
 	/* Now examine the page's 4KB mappings in the same rotating manner. */
 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 		goto out;
 	pv = pvf;
 	do {
 		if (pvf == NULL)
 			pvf = pv;
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		PG_A = pmap_accessed_bit(pmap);
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0,
 		    ("pmap_ts_referenced: found a 2mpage in page %p's pv list",
 		    m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		if ((*pte & PG_A) != 0) {
 			if (safe_to_clear_referenced(pmap, *pte)) {
 				atomic_clear_long(pte, PG_A);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 			} else if ((*pte & PG_W) == 0) {
 				/*
 				 * Wired pages cannot be paged out so
 				 * doing accessed bit emulation for
 				 * them is wasted effort. We do the
 				 * hard work for unwired pages only.
 				 */
 				pmap_remove_pte(pmap, pte, pv->pv_va,
 				    *pde, &free, &lock);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 				/* The mapping is gone, so 'pv' is too. */
 				if (pvf == pv)
 					pvf = NULL;
 				pv = NULL;
 				KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 				    ("inconsistent pv lock %p %p for page %p",
 				    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 			m->md.pv_gen++;
 		}
 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 	    not_cleared < PMAP_TS_REFERENCED_MAX);
 out:
 	rw_wunlock(lock);
 	/* Release any page table pages freed by pmap_remove_pte(). */
 	vm_page_free_pages_toq(&free, true);
 	return (cleared + not_cleared);
 }
 
 /*
  *	Apply the given advice to the specified range of addresses within the
  *	given pmap.  Depending on the advice, clear the referenced and/or
  *	modified flags in each mapping and set the mapped page's dirty field.
  *
  *	Only MADV_DONTNEED and MADV_FREE are acted upon; any other advice,
  *	and any pmap that emulates accessed/dirty bits, makes this a no-op.
  */
 void
 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 {
 	struct rwlock *lock;
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t oldpde, *pde;
 	pt_entry_t *pte, PG_A, PG_G, PG_M, PG_RW, PG_V;
 	vm_offset_t va, va_next;
 	vm_page_t m;
 	boolean_t anychanged;
 
 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
 		return;
 
 	/*
 	 * A/D bit emulation requires an alternate code path when clearing
 	 * the modified and accessed bits below. Since this function is
 	 * advisory in nature we skip it entirely for pmaps that require
 	 * A/D bit emulation.
 	 */
 	if (pmap_emulate_ad_bits(pmap))
 		return;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_G = pmap_global_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 	anychanged = FALSE;
 	pmap_delayed_invl_start();
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		/*
 		 * Skip to the next boundary whenever an upper-level entry
 		 * is invalid.  If computing that boundary wraps around,
 		 * stop at "eva".
 		 */
 		pml4e = pmap_pml4e(pmap, sva);
 		if ((*pml4e & PG_V) == 0) {
 			va_next = (sva + NBPML4) & ~PML4MASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 		pdpe = pmap_pml4e_to_pdpe(pml4e, sva);
 		if ((*pdpe & PG_V) == 0) {
 			va_next = (sva + NBPDP) & ~PDPMASK;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 		va_next = (sva + NBPDR) & ~PDRMASK;
 		if (va_next < sva)
 			va_next = eva;
 		pde = pmap_pdpe_to_pde(pdpe, sva);
 		oldpde = *pde;
 		if ((oldpde & PG_V) == 0)
 			continue;
 		else if ((oldpde & PG_PS) != 0) {
 			/* Unmanaged 2MB mappings are left untouched. */
 			if ((oldpde & PG_MANAGED) == 0)
 				continue;
 			lock = NULL;
 			if (!pmap_demote_pde_locked(pmap, pde, sva, &lock)) {
 				if (lock != NULL)
 					rw_wunlock(lock);
 
 				/*
 				 * The large page mapping was destroyed.
 				 */
 				continue;
 			}
 
 			/*
 			 * Unless the page mappings are wired, remove the
 			 * mapping to a single page so that a subsequent
 			 * access may repromote.  Since the underlying page
 			 * table page is fully populated, this removal never
 			 * frees a page table page.
 			 */
 			if ((oldpde & PG_W) == 0) {
 				pte = pmap_pde_to_pte(pde, sva);
 				KASSERT((*pte & PG_V) != 0,
 				    ("pmap_advise: invalid PTE"));
 				pmap_remove_pte(pmap, pte, sva, *pde, NULL,
 				    &lock);
 				anychanged = TRUE;
 			}
 			if (lock != NULL)
 				rw_wunlock(lock);
 		}
 		if (va_next > eva)
 			va_next = eva;
 		/*
 		 * "va" marks the start of a pending run of changed global
 		 * mappings that still needs a ranged invalidation.  While
 		 * va == va_next, no such run is pending.
 		 */
 		va = va_next;
 		for (pte = pmap_pde_to_pte(pde, sva); sva != va_next; pte++,
 		    sva += PAGE_SIZE) {
 			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
 				goto maybe_invlrng;
 			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 				if (advice == MADV_DONTNEED) {
 					/*
 					 * Future calls to pmap_is_modified()
 					 * can be avoided by making the page
 					 * dirty now.
 					 */
 					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				atomic_clear_long(pte, PG_M | PG_A);
 			} else if ((*pte & PG_A) != 0)
 				atomic_clear_long(pte, PG_A);
 			else
 				goto maybe_invlrng;
 
 			/*
 			 * The PTE was changed.  A global mapping starts or
 			 * extends the pending run; any other mapping is
 			 * covered by the pmap_invalidate_all() call at the
 			 * end.
 			 */
 			if ((*pte & PG_G) != 0) {
 				if (va == va_next)
 					va = sva;
 			} else
 				anychanged = TRUE;
 			continue;
 maybe_invlrng:
 			/* An unchanged PTE ends the pending run; flush it. */
 			if (va != va_next) {
 				pmap_invalidate_range(pmap, va, sva);
 				va = va_next;
 			}
 		}
 		/* Flush a run that extends to the end of this page table. */
 		if (va != va_next)
 			pmap_invalidate_range(pmap, va, sva);
 	}
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	PMAP_UNLOCK(pmap);
 	pmap_delayed_invl_finish();
 }
 
 /*
  *	Clear the modify bits on the specified physical page.
  *
  *	The page must be managed, must not be exclusive busied, and its
  *	object must be write locked.  Writable 2MB mappings of the page
  *	are demoted first, and then PG_M is cleared from each writable,
  *	modified 4KB mapping.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	struct md_page *pvh;
 	pmap_t pmap;
 	pv_entry_t next_pv, pv;
 	pd_entry_t oldpde, *pde;
-	pt_entry_t oldpte, *pte, PG_M, PG_RW, PG_V;
+	pt_entry_t *pte, PG_M, PG_RW;
 	struct rwlock *lock;
 	vm_offset_t va;
 	int md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	KASSERT(!vm_page_xbusied(m),
 	    ("pmap_clear_modify: page %p is exclusive busied", m));
 
 	/*
 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
 	 * If the object containing the page is locked and the page is not
 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	/* Fictitious pages use the dummy list in place of a pvh entry. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_wlock(lock);
 restart:
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock cannot be taken without blocking, drop
 		 * the PV list lock, block on the pmap lock, and retake the
 		 * PV list lock.  Restart if the list changed in the interim.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		PG_M = pmap_modified_bit(pmap);
-		PG_V = pmap_valid_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		va = pv->pv_va;
 		pde = pmap_pde(pmap, va);
 		oldpde = *pde;
-		if ((oldpde & PG_RW) != 0) {
-			if (pmap_demote_pde_locked(pmap, pde, va, &lock)) {
-				if ((oldpde & PG_W) == 0) {
-					/*
-					 * Write protect the mapping to a
-					 * single page so that a subsequent
-					 * write access may repromote.
-					 */
-					va += VM_PAGE_TO_PHYS(m) - (oldpde &
-					    PG_PS_FRAME);
-					pte = pmap_pde_to_pte(pde, va);
-					oldpte = *pte;
-					if ((oldpte & PG_V) != 0) {
-						while (!atomic_cmpset_long(pte,
-						    oldpte,
-						    oldpte & ~(PG_M | PG_RW)))
-							oldpte = *pte;
-						vm_page_dirty(m);
-						pmap_invalidate_page(pmap, va);
-					}
-				}
-			}
+		/* If oldpde has PG_RW set, then it also has PG_M set. */
+		if ((oldpde & PG_RW) != 0 &&
+		    pmap_demote_pde_locked(pmap, pde, va, &lock) &&
+		    (oldpde & PG_W) == 0) {
+			/*
+			 * Write protect the mapping to a single page so that
+			 * a subsequent write access may repromote.
+			 */
+			va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
+			pte = pmap_pde_to_pte(pde, va);
+			atomic_clear_long(pte, PG_M | PG_RW);
+			vm_page_dirty(m);
+			pmap_invalidate_page(pmap, va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		/* Same lock hand-off as above, checking both generations. */
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		PG_M = pmap_modified_bit(pmap);
 		PG_RW = pmap_rw_bit(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 		    " a 2mpage in page %p's pv list", m));
 		pte = pmap_pde_to_pte(pde, pv->pv_va);
 		/* Only PG_M is cleared here; the mapping stays writable. */
 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 			atomic_clear_long(pte, PG_M);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 }
 
 /*
  * Miscellaneous support routines follow
  */
 
 /*
  * Set the cache mode of a 4KB page mapping: replace the bits selected
  * by "mask" in the given PTE with "cache_bits".
  */
 static __inline void
 pmap_pte_attr(pt_entry_t *pte, int cache_bits, int mask)
 {
 	u_int *lowp, cur, want;
 
 	/*
 	 * Every cache mode bit lives in the low 32 bits of the PTE, so
 	 * it suffices to compare-and-set that half until it sticks.
 	 */
 	lowp = (u_int *)pte;
 	for (;;) {
 		cur = *lowp;
 		want = (cur & ~mask) | cache_bits;
 		/* Stop if nothing needs to change or the update succeeded. */
 		if (want == cur || atomic_cmpset_int(lowp, cur, want))
 			break;
 	}
 }
 
 /*
  * Set the cache mode of a 2MB page mapping: replace the bits selected
  * by "mask" in the given PDE with "cache_bits".
  */
 static __inline void
 pmap_pde_attr(pd_entry_t *pde, int cache_bits, int mask)
 {
 	u_int *lowp, cur, want;
 
 	/*
 	 * Every cache mode bit lives in the low 32 bits of the PDE, so
 	 * it suffices to compare-and-set that half until it sticks.
 	 */
 	lowp = (u_int *)pde;
 	for (;;) {
 		cur = *lowp;
 		want = (cur & ~mask) | cache_bits;
 		/* Stop if nothing needs to change or the update succeeded. */
 		if (want == cur || atomic_cmpset_int(lowp, cur, want))
 			break;
 	}
 }
 
 /*
  * Map a set of physical memory pages into the kernel virtual
  * address space. Return a pointer to where it is mapped. This
  * routine is intended to be used for mapping device memory,
  * NOT real memory.
  *
  * "pa" and "size" need not be page aligned; the mapping is widened to
  * whole pages and the returned pointer retains the sub-page offset of
  * "pa".  "mode" is the memory type to apply to the mapping.  If
  * "noflush" is true, the cache range invalidation at the end is
  * skipped, and the flag is also passed to pmap_change_attr_locked().
  */
 static void *
 pmap_mapdev_internal(vm_paddr_t pa, vm_size_t size, int mode, bool noflush)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_offset_t va, offset;
 	vm_size_t tmpsize;
 	int i;
 
 	offset = pa & PAGE_MASK;
 	size = round_page(offset + size);
 	pa = trunc_page(pa);
 
 	if (!pmap_initialized) {
 		/*
 		 * Before the pmap module is initialized, record the mapping
 		 * in the first unused preinit slot (va == 0) and carve its
 		 * virtual addresses out of virtual_avail.
 		 */
 		va = 0;
 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 			ppim = pmap_preinit_mapping + i;
 			if (ppim->va == 0) {
 				ppim->pa = pa;
 				ppim->sz = size;
 				ppim->mode = mode;
 				ppim->va = virtual_avail;
 				virtual_avail += size;
 				va = ppim->va;
 				break;
 			}
 		}
 		if (va == 0)
 			panic("%s: too many preinit mappings", __func__);
 	} else {
 		/*
 		 * If we have a preinit mapping, re-use it.
 		 */
 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 			ppim = pmap_preinit_mapping + i;
 			if (ppim->pa == pa && ppim->sz == size &&
 			    ppim->mode == mode)
 				return ((void *)(ppim->va + offset));
 		}
 		/*
 		 * If the specified range of physical addresses fits within
 		 * the direct map window, use the direct map.
 		 */
 		if (pa < dmaplimit && pa + size <= dmaplimit) {
 			va = PHYS_TO_DMAP(pa);
 			PMAP_LOCK(kernel_pmap);
 			i = pmap_change_attr_locked(va, size, mode, noflush);
 			PMAP_UNLOCK(kernel_pmap);
 			/*
 			 * A zero result means the attribute change succeeded.
 			 * Otherwise, fall through and create a new mapping.
 			 */
 			if (!i)
 				return ((void *)(va + offset));
 		}
 		va = kva_alloc(size);
 		if (va == 0)
 			panic("%s: Couldn't allocate KVA", __func__);
 	}
 	/* Enter each page, and then invalidate the TLB for the range. */
 	for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
 		pmap_kenter_attr(va + tmpsize, pa + tmpsize, mode);
 	pmap_invalidate_range(kernel_pmap, va, va + tmpsize);
 	if (!noflush)
 		pmap_invalidate_cache_range(va, va + tmpsize);
 	return ((void *)(va + offset));
 }
 
 /*
  * Map device memory into the kernel address space using the memory
  * type given by "mode".  Cache flushing is not suppressed.
  */
 void *
 pmap_mapdev_attr(vm_paddr_t pa, vm_size_t size, int mode)
 {
 	void *kva;
 
 	kva = pmap_mapdev_internal(pa, size, mode, false);
 	return (kva);
 }
 
 /*
  * Map device memory into the kernel address space as uncacheable
  * memory.  Cache flushing is not suppressed.
  */
 void *
 pmap_mapdev(vm_paddr_t pa, vm_size_t size)
 {
 	void *kva;
 
 	kva = pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, false);
 	return (kva);
 }
 
 /*
  * Map device memory into the kernel address space as uncacheable
  * memory, asking pmap_mapdev_internal() to skip the cache flush.
  */
 void *
 pmap_mapdev_pciecfg(vm_paddr_t pa, vm_size_t size)
 {
 	void *kva;
 
 	kva = pmap_mapdev_internal(pa, size, PAT_UNCACHEABLE, true);
 	return (kva);
 }
 
 /*
  * Map the given physical range into the kernel address space as
  * write-back memory.  Cache flushing is not suppressed.
  */
 void *
 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 {
 	void *kva;
 
 	kva = pmap_mapdev_internal(pa, size, PAT_WRITE_BACK, false);
 	return (kva);
 }
 
 /*
  * Undo a mapping made by one of the pmap_mapdev*() functions.  "va" and
  * "size" need not be page aligned; the range is widened to whole pages
  * before it is matched or freed.
  */
 void
 pmap_unmapdev(vm_offset_t va, vm_size_t size)
 {
 	struct pmap_preinit_mapping *slot, *end;
 
 	/* Direct map addresses handed out by pmap_mapdev need no teardown. */
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 		return;
 
 	size = round_page((va & PAGE_MASK) + size);
 	va = trunc_page(va);
 
 	end = pmap_preinit_mapping + PMAP_PREINIT_MAPPING_COUNT;
 	for (slot = pmap_preinit_mapping; slot < end; slot++) {
 		if (slot->va != va || slot->sz != size)
 			continue;
 
 		/*
 		 * A preinit mapping is left in place once the pmap module is
 		 * initialized.  Before that, its slot is recycled, and its
 		 * virtual addresses are returned if they were the most
 		 * recently allocated.
 		 */
 		if (!pmap_initialized) {
 			slot->pa = 0;
 			slot->va = 0;
 			slot->sz = 0;
 			slot->mode = 0;
 			if (va + size == virtual_avail)
 				virtual_avail = va;
 		}
 		return;
 	}
 
 	if (pmap_initialized)
 		kva_free(va, size);
 }
 
 /*
  * Tries to demote a 1GB page mapping.
  *
  * A new page directory page is allocated and filled with NPDEPG
  * entries derived from the old PDPE, each mapping the next NBPDR bytes,
  * and the PDPE is then pointed at that page.  The pmap lock must be
  * held.  Returns TRUE on success, or FALSE if the page directory page
  * could not be allocated, in which case the mapping is left unchanged.
  */
 static boolean_t
 pmap_demote_pdpe(pmap_t pmap, pdp_entry_t *pdpe, vm_offset_t va)
 {
 	pdp_entry_t newpdpe, oldpdpe;
 	pd_entry_t *firstpde, newpde, *pde;
 	pt_entry_t PG_A, PG_M, PG_RW, PG_V;
 	vm_paddr_t pdpgpa;
 	vm_page_t pdpg;
 
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpdpe = *pdpe;
 	KASSERT((oldpdpe & (PG_PS | PG_V)) == (PG_PS | PG_V),
 	    ("pmap_demote_pdpe: oldpdpe is missing PG_PS and/or PG_V"));
 	if ((pdpg = vm_page_alloc(NULL, va >> PDPSHIFT, VM_ALLOC_INTERRUPT |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_demote_pdpe: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return (FALSE);
 	}
 	pdpgpa = VM_PAGE_TO_PHYS(pdpg);
 	firstpde = (pd_entry_t *)PHYS_TO_DMAP(pdpgpa);
 	/*
 	 * The new PDPE is always writable, accessed, and modified; only
 	 * the PG_U bit is inherited from the old entry.
 	 */
 	newpdpe = pdpgpa | PG_M | PG_A | (oldpdpe & PG_U) | PG_RW | PG_V;
 	KASSERT((oldpdpe & PG_A) != 0,
 	    ("pmap_demote_pdpe: oldpdpe is missing PG_A"));
 	KASSERT((oldpdpe & (PG_M | PG_RW)) != PG_RW,
 	    ("pmap_demote_pdpe: oldpdpe is missing PG_M"));
 	/* Each PDE starts as a copy of the old PDPE, PG_PS included. */
 	newpde = oldpdpe;
 
 	/*
 	 * Initialize the page directory page.
 	 */
 	for (pde = firstpde; pde < firstpde + NPDEPG; pde++) {
 		*pde = newpde;
 		newpde += NBPDR;
 	}
 
 	/*
 	 * Demote the mapping.
 	 */
 	*pdpe = newpdpe;
 
 	/*
 	 * Invalidate a stale recursive mapping of the page directory page.
 	 */
 	pmap_invalidate_page(pmap, (vm_offset_t)vtopde(va));
 
 	pmap_pdpe_demotions++;
 	CTR2(KTR_PMAP, "pmap_demote_pdpe: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
 }
 
 /*
  * Records "ma" as the memory attribute of page "m" and, for pages that
  * are not fictitious, applies it to the page's direct map mapping.
  * Panics if the direct map cannot be updated.
  */
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
 	vm_offset_t dmap_va;
 
 	m->md.pat_mode = ma;
 
 	/* Fictitious pages get no direct map update. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	/*
 	 * "m" is a normal page: update its direct mapping.  The update
 	 * can be relied upon to perform any cache operations that data
 	 * coherence requires.
 	 */
 	dmap_va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	if (pmap_change_attr(dmap_va, PAGE_SIZE, m->md.pat_mode) != 0)
 		panic("memory attribute change on the direct map failed");
 }
 
 /*
  * Sets the memory type of the virtual address range [va, va + size) to
  * "mode".  The range must lie entirely within the direct map or entirely
  * within the kernel map.  For a kernel map range, the direct map ranges
  * that alias the same physical pages are changed as well, because Intel
  * documents processor behavior as "undefined" when two or more mappings
  * of one physical page disagree on the memory type.
  *
  * Return values:
  *	0	the change completed
  *	EINVAL	part of the range was not mapped
  *	ENOMEM	memory ran out while performing the change; the memory
  *		type may already have been changed on part of the range or
  *		of the direct map
  *
  * This is the locking wrapper: it acquires the kernel pmap lock around
  * pmap_change_attr_locked(), which is called with noflush set to false.
  */
 int
 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 {
 	int rv;
 
 	PMAP_LOCK(kernel_pmap);
 	rv = pmap_change_attr_locked(va, size, mode, false);
 	PMAP_UNLOCK(kernel_pmap);
 
 	return (rv);
 }
 
 /*
  * Worker for pmap_change_attr().  The kernel pmap lock must be held.
  *
  * The range is widened to whole pages and then processed in two passes:
  *
  *  1. Validation/demotion.  Every page must be mapped, otherwise EINVAL is
  *     returned.  A 1GB or 2MB mapping whose memory type differs from the
  *     requested one is demoted unless it is aligned and lies entirely
  *     inside the range; a failed demotion returns ENOMEM.
  *
  *  2. Update.  The cache bits of each mapping are rewritten where they
  *     differ.  For addresses at or above VM_MIN_KERNEL_ADDRESS, runs of
  *     contiguous physical addresses below dmaplimit are collected and the
  *     function recurses on the corresponding direct map range.
  *
  * If anything was changed, the TLB is invalidated for the processed range
  * and, unless "noflush" is true, the CPU caches are flushed for it too.
  */
 static int
 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode, bool noflush)
 {
 	vm_offset_t base, offset, tmpva;
 	vm_paddr_t pa_start, pa_end, pa_end1;
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	int cache_bits_pte, cache_bits_pde, error;
 	boolean_t changed;
 
 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
 
 	/*
 	 * Only supported on kernel virtual addresses, including the direct
 	 * map but excluding the recursive map.
 	 */
 	if (base < DMAP_MIN_ADDRESS)
 		return (EINVAL);
 
 	/* Large-page and 4KB entries encode the memory type differently. */
 	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
 	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
 	changed = FALSE;
 
 	/*
 	 * Pages that aren't mapped aren't supported.  Also break down 2MB pages
 	 * into 4KB pages if required.
 	 */
 	for (tmpva = base; tmpva < base + size; ) {
 		pdpe = pmap_pdpe(kernel_pmap, tmpva);
 		if (pdpe == NULL || *pdpe == 0)
 			return (EINVAL);
 		if (*pdpe & PG_PS) {
 			/*
 			 * If the current 1GB page already has the required
 			 * memory type, then we need not demote this page. Just
 			 * increment tmpva to the next 1GB page frame.
 			 */
 			if ((*pdpe & X86_PG_PDE_CACHE) == cache_bits_pde) {
 				tmpva = trunc_1gpage(tmpva) + NBPDP;
 				continue;
 			}
 
 			/*
 			 * If the current offset aligns with a 1GB page frame
 			 * and there is at least 1GB left within the range, then
 			 * we need not break down this page into 2MB pages.
 			 */
 			if ((tmpva & PDPMASK) == 0 &&
 			    tmpva + PDPMASK < base + size) {
 				tmpva += NBPDP;
 				continue;
 			}
 			if (!pmap_demote_pdpe(kernel_pmap, pdpe, tmpva))
 				return (ENOMEM);
 		}
 		pde = pmap_pdpe_to_pde(pdpe, tmpva);
 		if (*pde == 0)
 			return (EINVAL);
 		if (*pde & PG_PS) {
 			/*
 			 * If the current 2MB page already has the required
 			 * memory type, then we need not demote this page. Just
 			 * increment tmpva to the next 2MB page frame.
 			 */
 			if ((*pde & X86_PG_PDE_CACHE) == cache_bits_pde) {
 				tmpva = trunc_2mpage(tmpva) + NBPDR;
 				continue;
 			}
 
 			/*
 			 * If the current offset aligns with a 2MB page frame
 			 * and there is at least 2MB left within the range, then
 			 * we need not break down this page into 4KB pages.
 			 */
 			if ((tmpva & PDRMASK) == 0 &&
 			    tmpva + PDRMASK < base + size) {
 				tmpva += NBPDR;
 				continue;
 			}
 			if (!pmap_demote_pde(kernel_pmap, pde, tmpva))
 				return (ENOMEM);
 		}
 		pte = pmap_pde_to_pte(pde, tmpva);
 		if (*pte == 0)
 			return (EINVAL);
 		tmpva += PAGE_SIZE;
 	}
 	error = 0;
 
 	/*
 	 * Ok, all the pages exist, so run through them updating their
 	 * cache mode if required.
 	 */
 	/* pa_start == pa_end means that no physical run is pending. */
 	pa_start = pa_end = 0;
 	for (tmpva = base; tmpva < base + size; ) {
 		pdpe = pmap_pdpe(kernel_pmap, tmpva);
 		if (*pdpe & PG_PS) {
 			if ((*pdpe & X86_PG_PDE_CACHE) != cache_bits_pde) {
 				pmap_pde_attr(pdpe, cache_bits_pde,
 				    X86_PG_PDE_CACHE);
 				changed = TRUE;
 			}
 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 			    (*pdpe & PG_PS_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
 					pa_start = *pdpe & PG_PS_FRAME;
 					pa_end = pa_start + NBPDP;
 				} else if (pa_end == (*pdpe & PG_PS_FRAME))
 					pa_end += NBPDP;
 				else {
 					/* Run ended, update direct map. */
 					error = pmap_change_attr_locked(
 					    PHYS_TO_DMAP(pa_start),
 					    pa_end - pa_start, mode, noflush);
 					if (error != 0)
 						break;
 					/* Start physical address run. */
 					pa_start = *pdpe & PG_PS_FRAME;
 					pa_end = pa_start + NBPDP;
 				}
 			}
 			tmpva = trunc_1gpage(tmpva) + NBPDP;
 			continue;
 		}
 		pde = pmap_pdpe_to_pde(pdpe, tmpva);
 		if (*pde & PG_PS) {
 			if ((*pde & X86_PG_PDE_CACHE) != cache_bits_pde) {
 				pmap_pde_attr(pde, cache_bits_pde,
 				    X86_PG_PDE_CACHE);
 				changed = TRUE;
 			}
 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 			    (*pde & PG_PS_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
 					pa_start = *pde & PG_PS_FRAME;
 					pa_end = pa_start + NBPDR;
 				} else if (pa_end == (*pde & PG_PS_FRAME))
 					pa_end += NBPDR;
 				else {
 					/* Run ended, update direct map. */
 					error = pmap_change_attr_locked(
 					    PHYS_TO_DMAP(pa_start),
 					    pa_end - pa_start, mode, noflush);
 					if (error != 0)
 						break;
 					/* Start physical address run. */
 					pa_start = *pde & PG_PS_FRAME;
 					pa_end = pa_start + NBPDR;
 				}
 			}
 			tmpva = trunc_2mpage(tmpva) + NBPDR;
 		} else {
 			pte = pmap_pde_to_pte(pde, tmpva);
 			if ((*pte & X86_PG_PTE_CACHE) != cache_bits_pte) {
 				pmap_pte_attr(pte, cache_bits_pte,
 				    X86_PG_PTE_CACHE);
 				changed = TRUE;
 			}
 			if (tmpva >= VM_MIN_KERNEL_ADDRESS &&
 			    (*pte & PG_FRAME) < dmaplimit) {
 				if (pa_start == pa_end) {
 					/* Start physical address run. */
 					pa_start = *pte & PG_FRAME;
 					pa_end = pa_start + PAGE_SIZE;
 				} else if (pa_end == (*pte & PG_FRAME))
 					pa_end += PAGE_SIZE;
 				else {
 					/* Run ended, update direct map. */
 					error = pmap_change_attr_locked(
 					    PHYS_TO_DMAP(pa_start),
 					    pa_end - pa_start, mode, noflush);
 					if (error != 0)
 						break;
 					/* Start physical address run. */
 					pa_start = *pte & PG_FRAME;
 					pa_end = pa_start + PAGE_SIZE;
 				}
 			}
 			tmpva += PAGE_SIZE;
 		}
 	}
 	/*
 	 * Flush the final pending run, clipped so that it does not extend
 	 * past the end of the direct map.
 	 */
 	if (error == 0 && pa_start != pa_end && pa_start < dmaplimit) {
 		pa_end1 = MIN(pa_end, dmaplimit);
 		if (pa_start != pa_end1)
 			error = pmap_change_attr_locked(PHYS_TO_DMAP(pa_start),
 			    pa_end1 - pa_start, mode, noflush);
 	}
 
 	/*
 	 * Flush CPU caches if required to make sure any data isn't cached that
 	 * shouldn't be, etc.
 	 */
 	if (changed) {
 		/* tmpva is where the update loop stopped, even on error. */
 		pmap_invalidate_range(kernel_pmap, base, tmpva);
 		if (!noflush)
 			pmap_invalidate_cache_range(base, tmpva);
 	}
 	return (error);
 }
 
 /*
  * Breaks up any direct map mapping that is larger than the given range of
  * physical addresses.  "len" must be a power of two and "base" must be a
  * multiple of "len".  The demotion leaves the attributes of the mapping
  * unchanged, so a TLB invalidation is optional; it is performed only if
  * the caller passes a true "invalidate" and something was demoted.
  *
  * Nothing is done when "len" is zero, when "len" is at least NBPDP, or
  * when "base" is not below dmaplimit.  An invalid entry or a failed
  * demotion panics.
  */
 void
 pmap_demote_DMAP(vm_paddr_t base, vm_size_t len, boolean_t invalidate)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	vm_offset_t va;
 	boolean_t demoted;
 
 	if (len == 0)
 		return;
 	KASSERT(powerof2(len), ("pmap_demote_DMAP: len is not a power of 2"));
 	KASSERT((base & (len - 1)) == 0,
 	    ("pmap_demote_DMAP: base is not a multiple of len"));
 
 	/* No mapping can be larger than the range, or it is not in the DMAP. */
 	if (len >= NBPDP || base >= dmaplimit)
 		return;
 
 	va = PHYS_TO_DMAP(base);
 	demoted = FALSE;
 	PMAP_LOCK(kernel_pmap);
 
 	/* First level: split a 1GB mapping. */
 	pdpe = pmap_pdpe(kernel_pmap, va);
 	if ((*pdpe & X86_PG_V) == 0)
 		panic("pmap_demote_DMAP: invalid PDPE");
 	if ((*pdpe & PG_PS) != 0) {
 		if (!pmap_demote_pdpe(kernel_pmap, pdpe, va))
 			panic("pmap_demote_DMAP: PDPE failed");
 		demoted = TRUE;
 	}
 
 	/* Second level: split a 2MB mapping if the range is smaller still. */
 	if (len < NBPDR) {
 		pde = pmap_pdpe_to_pde(pdpe, va);
 		if ((*pde & X86_PG_V) == 0)
 			panic("pmap_demote_DMAP: invalid PDE");
 		if ((*pde & PG_PS) != 0) {
 			if (!pmap_demote_pde(kernel_pmap, pde, va))
 				panic("pmap_demote_DMAP: PDE failed");
 			demoted = TRUE;
 		}
 	}
 
 	if (demoted && invalidate)
 		pmap_invalidate_page(kernel_pmap, va);
 	PMAP_UNLOCK(kernel_pmap);
 }
 
 /*
  * perform the pmap work for mincore
  *
  * Looks up the mapping of "addr" in "pmap" and returns a mask of MINCORE_*
  * flags describing it: MINCORE_SUPER for a 2MB mapping, MINCORE_INCORE if
  * the mapping is valid, the MODIFIED flags if it is both modified and
  * writable, and the REFERENCED flags if it has been accessed.
  *
  * If the mapping is valid and managed but is not reported as both modified
  * and referenced, vm_page_pa_tryrelock() is called for the page's physical
  * address with "locked_pa"; the lookup is redone if that call returns
  * non-zero.  Otherwise PA_UNLOCK_COND() is applied to *locked_pa.
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 {
 	pd_entry_t *pdep;
 	pt_entry_t pte, PG_A, PG_M, PG_RW, PG_V;
 	vm_paddr_t pa;
 	int val;
 
 	/* The bit positions depend on the pmap type, so look them up. */
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	PMAP_LOCK(pmap);
 retry:
 	pdep = pmap_pde(pmap, addr);
 	if (pdep != NULL && (*pdep & PG_V)) {
 		if (*pdep & PG_PS) {
 			pte = *pdep;
 			/* Compute the physical address of the 4KB page. */
 			pa = ((*pdep & PG_PS_FRAME) | (addr & PDRMASK)) &
 			    PG_FRAME;
 			val = MINCORE_SUPER;
 		} else {
 			pte = *pmap_pde_to_pte(pdep, addr);
 			pa = pte & PG_FRAME;
 			val = 0;
 		}
 	} else {
 		/* No page directory entry: report nothing resident. */
 		pte = 0;
 		pa = 0;
 		val = 0;
 	}
 	if ((pte & PG_V) != 0) {
 		val |= MINCORE_INCORE;
 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 		if ((pte & PG_A) != 0)
 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 	}
 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
 	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 			goto retry;
 	} else
 		PA_UNLOCK_COND(*locked_pa);
 	PMAP_UNLOCK(pmap);
 	return (val);
 }
 
 /*
  * Makes sure that "pmap" has a PCID that is valid for the current per-CPU
  * generation on CPU "cpuid", allocating a fresh one when necessary.
  *
  * Returns CR3_PCID_SAVE if the PCID already recorded in the pmap can be
  * reused, and 0 otherwise.  A pmap that uses PMAP_PCID_KERN never gets a
  * new PCID; for it the result is 0 with pti and CR3_PCID_SAVE without.
  * When the per-CPU PCID space is exhausted, the generation is advanced
  * (skipping zero) and numbering restarts right after PMAP_PCID_KERN.
  *
  * Must be called in a critical section.
  */
 static uint64_t
 pmap_pcid_alloc(pmap_t pmap, u_int cpuid)
 {
 	uint32_t cur_gen, next_pcid, use_gen;
 
 	CRITICAL_ASSERT(curthread);
 	cur_gen = PCPU_GET(pcid_gen);
 	if (pmap->pm_pcids[cpuid].pm_pcid == PMAP_PCID_KERN)
 		return (pti ? 0 : CR3_PCID_SAVE);
 	if (pmap->pm_pcids[cpuid].pm_gen == cur_gen)
 		return (CR3_PCID_SAVE);
 
 	next_pcid = PCPU_GET(pcid_next);
 	KASSERT((!pti && next_pcid <= PMAP_PCID_OVERMAX) ||
 	    (pti && next_pcid <= PMAP_PCID_OVERMAX_KERN),
 	    ("cpu %d pcid_next %#x", cpuid, next_pcid));
 
 	use_gen = cur_gen;
 	if ((!pti && next_pcid == PMAP_PCID_OVERMAX) ||
 	    (pti && next_pcid == PMAP_PCID_OVERMAX_KERN)) {
 		/* Out of PCIDs: begin a new generation. */
 		use_gen = cur_gen + 1;
 		if (use_gen == 0)
 			use_gen = 1;
 		PCPU_SET(pcid_gen, use_gen);
 		next_pcid = PMAP_PCID_KERN + 1;
 	}
 
 	pmap->pm_pcids[cpuid].pm_pcid = next_pcid;
 	pmap->pm_pcids[cpuid].pm_gen = use_gen;
 	PCPU_SET(pcid_next, next_pcid + 1);
 	return (0);
 }
 
 /*
  * Calls pmap_pcid_alloc() and then asserts that the resulting PCID is
  * below PMAP_PCID_OVERMAX and that only the kernel pmap holds
  * PMAP_PCID_KERN.  The value from pmap_pcid_alloc() is passed through.
  */
 static uint64_t
 pmap_pcid_alloc_checked(pmap_t pmap, u_int cpuid)
 {
 	uint64_t reuse;
 
 	reuse = pmap_pcid_alloc(pmap, cpuid);
 
 	KASSERT(pmap->pm_pcids[cpuid].pm_pcid < PMAP_PCID_OVERMAX,
 	    ("pmap %p cpu %d pcid %#x", pmap, cpuid,
 	    pmap->pm_pcids[cpuid].pm_pcid));
 	KASSERT(pmap->pm_pcids[cpuid].pm_pcid != PMAP_PCID_KERN ||
 	    pmap == kernel_pmap,
 	    ("non-kernel pmap pmap %p cpu %d pcid %#x",
 	    pmap, cpuid, pmap->pm_pcids[cpuid].pm_pcid));
 
 	return (reuse);
 }
 
 /*
  * Final step of the PTI activation variants: selects the tss_rsp0 value
  * in this CPU's TSS.  A pmap that has a user page table (pm_ucr3 is not
  * PMAP_NO_CR3) uses the per-CPU pti_rsp0 value; any other pmap uses the
  * address of the thread's pcb.
  */
 static void
 pmap_activate_sw_pti_post(struct thread *td, pmap_t pmap)
 {
 
 	if (pmap->pm_ucr3 != PMAP_NO_CR3)
 		PCPU_GET(tssp)->tss_rsp0 = PCPU_GET(pti_rsp0);
 	else
 		PCPU_GET(tssp)->tss_rsp0 = (uintptr_t)td->td_pcb;
 }
 
 /*
  * Common body of the PCID + PTI activation variants.
  *
  * Obtains a PCID for "pmap" on this CPU, loads %cr3 with the kernel page
  * table if a different one is currently loaded, updates curpmap, and
  * publishes the kernel and user %cr3 values in the per-CPU area.  If the
  * PCID was not reusable and the pmap has a user page table, translations
  * cached for the user PCID are invalidated explicitly, with INVPCID when
  * "invpcid_works1" is true and with pmap_pti_pcid_invalidate() otherwise.
  */
 static void inline
 pmap_activate_sw_pcid_pti(pmap_t pmap, u_int cpuid, const bool invpcid_works1)
 {
 	struct invpcid_descr d;
 	uint64_t cached, cr3, kcr3, ucr3;
 
 	cached = pmap_pcid_alloc_checked(pmap, cpuid);
 	cr3 = rcr3();
 	/* Reload only when the page table root actually differs. */
 	if ((cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
 		load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid);
 	PCPU_SET(curpmap, pmap);
 	kcr3 = pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid;
 	ucr3 = pmap->pm_ucr3 | pmap->pm_pcids[cpuid].pm_pcid |
 	    PMAP_PCID_USER_PT;
 
 	if (!cached && pmap->pm_ucr3 != PMAP_NO_CR3) {
 		/*
 		 * Explicitly invalidate translations cached from the
 		 * user page table.  They are not automatically
 		 * flushed by reload of cr3 with the kernel page table
 		 * pointer above.
 		 *
 		 * Note that the if() condition is resolved statically
 		 * by using the function argument instead of
 		 * runtime-evaluated invpcid_works value.
 		 */
 		if (invpcid_works1) {
 			d.pcid = PMAP_PCID_USER_PT |
 			    pmap->pm_pcids[cpuid].pm_pcid;
 			d.pad = 0;
 			d.addr = 0;
 			invpcid(&d, INVPCID_CTX);
 		} else {
 			pmap_pti_pcid_invalidate(ucr3, kcr3);
 		}
 	}
 
 	/* The published values carry CR3_PCID_SAVE. */
 	PCPU_SET(kcr3, kcr3 | CR3_PCID_SAVE);
 	PCPU_SET(ucr3, ucr3 | CR3_PCID_SAVE);
 	if (cached)
 		PCPU_INC(pm_save_cnt);
 }
 
 /*
  * Activation variant for PCID with PTI when INVPCID is usable: runs the
  * common PCID/PTI body with invpcid_works1 set to true and then updates
  * the TSS via pmap_activate_sw_pti_post().
  */
 static void
 pmap_activate_sw_pcid_invpcid_pti(struct thread *td, pmap_t pmap, u_int cpuid)
 {
 
 	pmap_activate_sw_pcid_pti(pmap, cpuid, true);
 	pmap_activate_sw_pti_post(td, pmap);
 }
 
 /*
  * Activation variant for PCID with PTI when INVPCID is not usable: runs
  * the common PCID/PTI body with invpcid_works1 set to false and with
  * interrupts disabled, then updates the TSS via
  * pmap_activate_sw_pti_post().
  */
 static void
 pmap_activate_sw_pcid_noinvpcid_pti(struct thread *td, pmap_t pmap,
     u_int cpuid)
 {
 	register_t rflags;
 
 	/*
 	 * If the INVPCID instruction is not available,
 	 * invltlb_pcid_handler() is used to handle an invalidate_all
 	 * IPI, which checks for curpmap == smp_tlb_pmap.  The below
 	 * sequence of operations has a window where %CR3 is loaded
 	 * with the new pmap's PML4 address, but the curpmap value has
 	 * not yet been updated.  This causes the invltlb IPI handler,
 	 * which is called between the updates, to execute as a NOP,
 	 * which leaves stale TLB entries.
 	 *
 	 * Note that the most typical use of pmap_activate_sw(), from
 	 * the context switch, is immune to this race, because
 	 * interrupts are disabled (while the thread lock is owned),
 	 * and the IPI happens after curpmap is updated.  Protect
 	 * other callers in a similar way, by disabling interrupts
 	 * around the %cr3 register reload and curpmap assignment.
 	 */
 	rflags = intr_disable();
 	pmap_activate_sw_pcid_pti(pmap, cpuid, false);
 	intr_restore(rflags);
 	pmap_activate_sw_pti_post(td, pmap);
 }
 
 /*
  * Activation variant for PCID without PTI.  Obtains a PCID for "pmap" on
  * this CPU and reloads %cr3 when the PCID was not reusable or a different
  * page table root is currently loaded.  The value returned by
  * pmap_pcid_alloc_checked() is OR-ed into the new %cr3 value.  curpmap is
  * updated afterwards, and pm_save_cnt is bumped when the PCID was reused.
  */
 static void
 pmap_activate_sw_pcid_nopti(struct thread *td __unused, pmap_t pmap,
     u_int cpuid)
 {
 	uint64_t cached, cr3;
 
 	cached = pmap_pcid_alloc_checked(pmap, cpuid);
 	cr3 = rcr3();
 	if (!cached || (cr3 & ~CR3_PCID_MASK) != pmap->pm_cr3)
 		load_cr3(pmap->pm_cr3 | pmap->pm_pcids[cpuid].pm_pcid |
 		    cached);
 	PCPU_SET(curpmap, pmap);
 	if (cached)
 		PCPU_INC(pm_save_cnt);
 }
 
 /*
  * Activation variant for PCID without PTI when INVPCID is not usable:
  * the same as pmap_activate_sw_pcid_nopti(), but executed with interrupts
  * disabled so that the %cr3 reload and the curpmap update cannot be
  * separated by an interrupt.
  */
 static void
 pmap_activate_sw_pcid_noinvpcid_nopti(struct thread *td __unused, pmap_t pmap,
     u_int cpuid)
 {
 	register_t rflags;
 
 	rflags = intr_disable();
 	pmap_activate_sw_pcid_nopti(td, pmap, cpuid);
 	intr_restore(rflags);
 }
 
 /*
  * Activation variant without PCID and without PTI: unconditionally loads
  * %cr3 with the pmap's page table root and records the pmap as curpmap.
  */
 static void
 pmap_activate_sw_nopcid_nopti(struct thread *td __unused, pmap_t pmap,
     u_int cpuid __unused)
 {
 
 	load_cr3(pmap->pm_cr3);
 	PCPU_SET(curpmap, pmap);
 }
 
 /*
  * Activation variant without PCID but with PTI: performs the plain %cr3
  * switch, publishes the kernel and user %cr3 values in the per-CPU area,
  * and updates the TSS via pmap_activate_sw_pti_post().
  */
 static void
 pmap_activate_sw_nopcid_pti(struct thread *td, pmap_t pmap,
     u_int cpuid __unused)
 {
 
 	pmap_activate_sw_nopcid_nopti(td, pmap, cpuid);
 	PCPU_SET(kcr3, pmap->pm_cr3);
 	PCPU_SET(ucr3, pmap->pm_ucr3);
 	pmap_activate_sw_pti_post(td, pmap);
 }
 
 /*
  * Resolver for pmap_activate_sw_mode(): picks the activation variant that
  * matches the pmap_pcid_enabled, pti and invpcid_works settings.  Without
  * PCID the invpcid_works setting is irrelevant.
  */
 DEFINE_IFUNC(static, void, pmap_activate_sw_mode, (struct thread *, pmap_t,
     u_int))
 {
 
 	if (!pmap_pcid_enabled) {
 		if (pti)
 			return (pmap_activate_sw_nopcid_pti);
 		return (pmap_activate_sw_nopcid_nopti);
 	}
 	if (pti) {
 		if (invpcid_works)
 			return (pmap_activate_sw_pcid_invpcid_pti);
 		return (pmap_activate_sw_pcid_noinvpcid_pti);
 	}
 	if (invpcid_works)
 		return (pmap_activate_sw_pcid_nopti);
 	return (pmap_activate_sw_pcid_noinvpcid_nopti);
 }
 
 /*
  * Switches this CPU to the pmap of the process that owns thread "td".
  * Nothing is done if that pmap is already the current one.  Otherwise the
  * CPU is added to the new pmap's active set before the mode-specific
  * switch is performed, and is removed from the old pmap's active set only
  * afterwards.
  */
 void
 pmap_activate_sw(struct thread *td)
 {
 	pmap_t oldpmap, pmap;
 	u_int cpuid;
 
 	oldpmap = PCPU_GET(curpmap);
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	if (oldpmap == pmap)
 		return;
 	cpuid = PCPU_GET(cpuid);
 #ifdef SMP
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
 #else
 	CPU_SET(cpuid, &pmap->pm_active);
 #endif
 	/* Dispatches to the variant chosen by the ifunc resolver. */
 	pmap_activate_sw_mode(td, pmap, cpuid);
 #ifdef SMP
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
 #else
 	CPU_CLR(cpuid, &oldpmap->pm_active);
 #endif
 }
 
 /*
  * Activates the pmap of the process that owns thread "td" by calling
  * pmap_activate_sw() inside a critical section.
  */
 void
 pmap_activate(struct thread *td)
 {
 
 	critical_enter();
 	pmap_activate_sw(td);
 	critical_exit();
 }
 
 /*
  * Records "pmap" as active on this CPU without loading %cr3: the CPU is
  * added to the pmap's active set, curpmap is set, and the per-CPU kcr3 and
  * ucr3 values are initialized.  With pti, kcr3 is the pmap's page table
  * root, combined with the pmap's PCID and CR3_PCID_SAVE if PCID is
  * enabled; without pti it is PMAP_NO_CR3.  ucr3 is always PMAP_NO_CR3.
  */
 void
 pmap_activate_boot(pmap_t pmap)
 {
 	uint64_t kcr3;
 	u_int cpuid;
 
 	/*
 	 * kernel_pmap must never be deactivated, which is guaranteed by
 	 * never activating it in the first place.
 	 */
 	MPASS(pmap != kernel_pmap);
 
 	cpuid = PCPU_GET(cpuid);
 #ifdef SMP
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
 #else
 	CPU_SET(cpuid, &pmap->pm_active);
 #endif
 	PCPU_SET(curpmap, pmap);
 
 	kcr3 = PMAP_NO_CR3;
 	if (pti) {
 		kcr3 = pmap->pm_cr3;
 		if (pmap_pcid_enabled)
 			kcr3 |= pmap->pm_pcids[cpuid].pm_pcid | CR3_PCID_SAVE;
 	}
 	PCPU_SET(kcr3, kcr3);
 	PCPU_SET(ucr3, PMAP_NO_CR3);
 }
 
 /*
  * Instruction cache synchronization hook.  The body is intentionally
  * empty in this implementation; all arguments are ignored.
  */
 void
 pmap_sync_icache(pmap_t pm, vm_offset_t va, vm_size_t sz)
 {
 }
 
 /*
  *	Advances the starting virtual address of the given mapping when a
  *	different alignment could yield more superpage mappings.  The
  *	address is only ever moved forward, to the next address whose
  *	offset within a 2MB frame equals that of the (color-adjusted)
  *	object offset.  Mappings too small to contain a full superpage
  *	after alignment are left alone.
  */
 void
 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 	vm_offset_t addr_offset, superpage_offset;
 
 	if (size < NBPDR)
 		return;
 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 		offset += ptoa(object->pg_color);
 	superpage_offset = offset & PDRMASK;
 
 	/* Would at least one whole superpage remain after aligning? */
 	if (size - ((NBPDR - superpage_offset) & PDRMASK) < NBPDR)
 		return;
 
 	addr_offset = *addr & PDRMASK;
 	if (addr_offset == superpage_offset)
 		return;
 	if (addr_offset < superpage_offset) {
 		/* The target offset lies ahead within the same 2MB frame. */
 		*addr = (*addr & ~PDRMASK) + superpage_offset;
 	} else {
 		/* Already past it: use the next 2MB frame. */
 		*addr = ((*addr + PDRMASK) & ~PDRMASK) + superpage_offset;
 	}
 }
 
 #ifdef INVARIANTS
 /*
  * Counters for the software emulation of accessed/dirty bits.  They are
  * updated by pmap_emulate_accessed_dirty(), exist only in INVARIANTS
  * kernels, and are exported read-write under the _vm_pmap sysctl node.
  */
 
 /* Successful emulations for a write fault on a 4KB mapping. */
 static unsigned long num_dirty_emulations;
 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_dirty_emulations, CTLFLAG_RW,
 	     &num_dirty_emulations, 0, NULL);
 
 /* Successful emulations for a read fault on a 4KB mapping. */
 static unsigned long num_accessed_emulations;
 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_accessed_emulations, CTLFLAG_RW,
 	     &num_accessed_emulations, 0, NULL);
 
 /* Emulations for a read fault on a superpage mapping. */
 static unsigned long num_superpage_accessed_emulations;
 SYSCTL_ULONG(_vm_pmap, OID_AUTO, num_superpage_accessed_emulations, CTLFLAG_RW,
 	     &num_superpage_accessed_emulations, 0, NULL);
 
 /* Calls to pmap_promote_pde() made from the emulation path. */
 static unsigned long ad_emulation_superpage_promotions;
 SYSCTL_ULONG(_vm_pmap, OID_AUTO, ad_emulation_superpage_promotions, CTLFLAG_RW,
 	     &ad_emulation_superpage_promotions, 0, NULL);
 #endif	/* INVARIANTS */
 
 /*
  * Emulates the setting of the accessed and modified bits for the mapping
  * of "va" in a pmap for which pmap_emulate_ad_bits() is true.  "ftype"
  * must be VM_PROT_READ or VM_PROT_WRITE.
  *
  * Returns 0 if the bits were set, and -1 otherwise: the pmap does not
  * emulate A/D bits, there is no valid mapping, the fault is a write to a
  * superpage mapping, or the fault is a write to a PTE that is not
  * writable.  After a 4KB PTE is updated, promotion to a superpage is
  * attempted when VM_NRESERVLEVEL > 0.
  */
 int
 pmap_emulate_accessed_dirty(pmap_t pmap, vm_offset_t va, int ftype)
 {
 	int rv;
 	struct rwlock *lock;
 #if VM_NRESERVLEVEL > 0
 	vm_page_t m, mpte;
 #endif
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_A, PG_M, PG_RW, PG_V;
 
 	KASSERT(ftype == VM_PROT_READ || ftype == VM_PROT_WRITE,
 	    ("pmap_emulate_accessed_dirty: invalid fault type %d", ftype));
 
 	if (!pmap_emulate_ad_bits(pmap))
 		return (-1);
 
 	/* The bit positions depend on the pmap type, so look them up. */
 	PG_A = pmap_accessed_bit(pmap);
 	PG_M = pmap_modified_bit(pmap);
 	PG_V = pmap_valid_bit(pmap);
 	PG_RW = pmap_rw_bit(pmap);
 
 	rv = -1;
 	lock = NULL;
 	PMAP_LOCK(pmap);
 
 	pde = pmap_pde(pmap, va);
 	if (pde == NULL || (*pde & PG_V) == 0)
 		goto done;
 
 	if ((*pde & PG_PS) != 0) {
 		/*
 		 * Superpage: only the accessed bit is emulated here.  A
 		 * write fault falls through to "done" with rv still -1.
 		 */
 		if (ftype == VM_PROT_READ) {
 #ifdef INVARIANTS
 			atomic_add_long(&num_superpage_accessed_emulations, 1);
 #endif
 			*pde |= PG_A;
 			rv = 0;
 		}
 		goto done;
 	}
 
 	pte = pmap_pde_to_pte(pde, va);
 	if ((*pte & PG_V) == 0)
 		goto done;
 
 	if (ftype == VM_PROT_WRITE) {
 		if ((*pte & PG_RW) == 0)
 			goto done;
 		/*
 		 * Set the modified and accessed bits simultaneously.
 		 *
 		 * Intel EPT PTEs that do software emulation of A/D bits map
 		 * PG_A and PG_M to EPT_PG_READ and EPT_PG_WRITE respectively.
 		 * An EPT misconfiguration is triggered if the PTE is writable
 		 * but not readable (WR=10). This is avoided by setting PG_A
 		 * and PG_M simultaneously.
 		 */
 		*pte |= PG_M | PG_A;
 	} else {
 		*pte |= PG_A;
 	}
 
 #if VM_NRESERVLEVEL > 0
 	/* try to promote the mapping */
 	if (va < VM_MAXUSER_ADDRESS)
 		mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 	else
 		mpte = NULL;
 
 	m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 
 	/*
 	 * Promotion is attempted only if the page table page (when one is
 	 * tracked) has a wire count of NPTEPG, superpages are enabled for
 	 * the pmap, the page is not fictitious, and
 	 * vm_reserv_level_iffullpop() returns 0 for it.
 	 */
 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 	    pmap_ps_enabled(pmap) &&
 	    (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0) {
 		pmap_promote_pde(pmap, pde, va, &lock);
 #ifdef INVARIANTS
 		atomic_add_long(&ad_emulation_superpage_promotions, 1);
 #endif
 	}
 #endif
 
 #ifdef INVARIANTS
 	if (ftype == VM_PROT_WRITE)
 		atomic_add_long(&num_dirty_emulations, 1);
 	else
 		atomic_add_long(&num_accessed_emulations, 1);
 #endif
 	rv = 0;		/* success */
 done:
 	/* "lock" is non-NULL only if pmap_promote_pde() set it. */
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  * Copies the page table entries that translate "va" in "pmap" into
  * "ptr", starting with the PML4 entry and descending one level at a
  * time.  The walk stops after the first entry that is not valid or that
  * maps a large page.  The number of entries stored, between one and four,
  * is returned through "num".
  */
 void
 pmap_get_mapping(pmap_t pmap, vm_offset_t va, uint64_t *ptr, int *num)
 {
 	pml4_entry_t *pml4;
 	pdp_entry_t *pdp;
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_V;
 	int idx;
 
 	idx = 0;
 	PG_V = pmap_valid_bit(pmap);
 	PMAP_LOCK(pmap);
 
 	pml4 = pmap_pml4e(pmap, va);
 	ptr[idx++] = *pml4;
 	if ((*pml4 & PG_V) != 0) {
 		pdp = pmap_pml4e_to_pdpe(pml4, va);
 		ptr[idx++] = *pdp;
 		if ((*pdp & PG_V) != 0 && (*pdp & PG_PS) == 0) {
 			pde = pmap_pdpe_to_pde(pdp, va);
 			ptr[idx++] = *pde;
 			if ((*pde & PG_V) != 0 && (*pde & PG_PS) == 0) {
 				pte = pmap_pde_to_pte(pde, va);
 				ptr[idx++] = *pte;
 			}
 		}
 	}
 
 	PMAP_UNLOCK(pmap);
 	*num = idx;
 }
 
 /**
  * Get the kernel virtual address of a set of physical pages. If there are
  * physical addresses not covered by the DMAP perform a transient mapping
  * that will be removed when calling pmap_unmap_io_transient.
  *
  * When can_fault is FALSE and a transient mapping is needed, the thread is
  * pinned with sched_pin() and stays pinned on return; the matching
  * sched_unpin() is issued by pmap_unmap_io_transient.
  *
  * \param page        The pages the caller wishes to obtain the virtual
  *                    address on the kernel memory map.
  * \param vaddr       On return contains the kernel virtual memory address
  *                    of the pages passed in the page parameter.
  * \param count       Number of pages passed in.
  * \param can_fault   TRUE if the thread using the mapped pages can take
  *                    page faults, FALSE otherwise.
  *
  * \returns TRUE if the caller must call pmap_unmap_io_transient when
  *          finished or FALSE otherwise.
  *
  */
 boolean_t
 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	vm_paddr_t paddr;
 	boolean_t needs_mapping;
 	pt_entry_t *pte;
 	int cache_bits, error __unused, i;
 
 	/*
 	 * Allocate any KVA space that we need, this is done in a separate
 	 * loop to prevent calling vmem_alloc while pinned.
 	 */
 	needs_mapping = FALSE;
 	for (i = 0; i < count; i++) {
 		paddr = VM_PAGE_TO_PHYS(page[i]);
 		if (__predict_false(paddr >= dmaplimit)) {
 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
 			needs_mapping = TRUE;
 		} else {
 			vaddr[i] = PHYS_TO_DMAP(paddr);
 		}
 	}
 
 	/* Exit early if everything is covered by the DMAP */
 	if (!needs_mapping)
 		return (FALSE);
 
 	/*
 	 * NB:  The sequence of updating a page table followed by accesses
 	 * to the corresponding pages used in the !DMAP case is subject to
 	 * the situation described in the "AMD64 Architecture Programmer's
 	 * Manual Volume 2: System Programming" rev. 3.23, "7.3.1 Special
 	 * Coherency Considerations".  Therefore, issuing the INVLPG right
 	 * after modifying the PTE bits is crucial.
 	 */
 	if (!can_fault)
 		sched_pin();
 	for (i = 0; i < count; i++) {
 		paddr = VM_PAGE_TO_PHYS(page[i]);
 		if (paddr >= dmaplimit) {
 			if (can_fault) {
 				/*
 				 * Slow path, since we can get page faults
 				 * while mappings are active don't pin the
 				 * thread to the CPU and instead add a global
 				 * mapping visible to all CPUs.
 				 */
 				pmap_qenter(vaddr[i], &page[i], 1);
 			} else {
 				/*
 				 * Fast path: write the PTE directly and
 				 * invalidate the TLB entry on this CPU only.
 				 */
 				pte = vtopte(vaddr[i]);
 				cache_bits = pmap_cache_bits(kernel_pmap,
 				    page[i]->md.pat_mode, 0);
 				pte_store(pte, paddr | X86_PG_RW | X86_PG_V |
 				    cache_bits);
 				invlpg(vaddr[i]);
 			}
 		}
 	}
 
 	return (needs_mapping);
 }
 
 /*
  * Releases the transient mappings set up by pmap_map_io_transient().
  * The thread is unpinned first if "can_fault" is false.  For every page
  * at or above dmaplimit the mapping is removed with pmap_qremove() when
  * "can_fault" is true, and the KVA page is returned to kernel_arena in
  * either case.  Pages covered by the direct map need no work.
  */
 void
 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	vm_paddr_t paddr;
 	int i;
 
 	if (!can_fault)
 		sched_unpin();
 	for (i = 0; i < count; i++) {
 		paddr = VM_PAGE_TO_PHYS(page[i]);
 		if (paddr < dmaplimit)
 			continue;
 		if (can_fault)
 			pmap_qremove(vaddr[i], 1);
 		vmem_free(kernel_arena, vaddr[i], PAGE_SIZE);
 	}
 }
 
 /*
  * Returns a kernel virtual address through which page "m" can be
  * accessed.  A page below dmaplimit is reached through the direct map.
  * Any other page is mapped at the single "qframe" address; in that case
  * the function returns with the qframe_mtx spin mutex held, and the mutex
  * is dropped by pmap_quick_remove_page().
  */
 vm_offset_t
 pmap_quick_enter_page(vm_page_t m)
 {
 	pt_entry_t *qpte;
 	vm_paddr_t paddr;
 
 	paddr = VM_PAGE_TO_PHYS(m);
 	if (paddr < dmaplimit)
 		return (PHYS_TO_DMAP(paddr));
 
 	mtx_lock_spin(&qframe_mtx);
 	qpte = vtopte(qframe);
 	KASSERT(*qpte == 0, ("qframe busy"));
 	pte_store(qpte, paddr | X86_PG_RW | X86_PG_V | X86_PG_A |
 	    X86_PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0));
 	return (qframe);
 }
 
 /*
  * Undoes pmap_quick_enter_page().  Only the "qframe" address needs any
  * work: its PTE is cleared, the TLB entry is invalidated, and the
  * qframe_mtx spin mutex is released.  Any other address is ignored.
  */
 void
 pmap_quick_remove_page(vm_offset_t addr)
 {
 
 	if (addr == qframe) {
 		pte_store(vtopte(qframe), 0);
 		invlpg(qframe);
 		mtx_unlock_spin(&qframe_mtx);
 	}
 }
 
 /*
  * Pdp pages from the large map are managed differently from either
  * kernel or user page table pages.  They are permanently allocated at
  * initialization time, and their wire count is permanently set to
  * zero.  The pml4 entries pointing to those pages are copied into
  * each allocated pmap.
  *
  * In contrast, pd and pt pages are managed like user page table
  * pages.  They are dynamically allocated, and their wire count
  * represents the number of valid entries within the page.
  */
 
 /*
  * Allocates one zero-filled page for use as a large map page table page.
  * Returns NULL if no page is available; this function does not sleep
  * and does not touch the pmap lock.
  */
 static vm_page_t
 pmap_large_map_getptp_unlocked(void)
 {
 	vm_page_t ptp;
 
 	ptp = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_ZERO);
 	if (ptp == NULL)
 		return (NULL);
 
 	/* The allocator may hand back a page that is not yet zeroed. */
 	if ((ptp->flags & PG_ZERO) == 0)
 		pmap_zero_page(ptp);
 	return (ptp);
 }
 
 /*
  * Allocates a large map page table page with the kernel pmap lock held.
  * If the allocation fails, the lock is dropped while waiting for free
  * memory and is reacquired before returning NULL.  Because the lock was
  * released, a caller that sees NULL has to redo its page table lookup and
  * try again.
  */
 static vm_page_t
 pmap_large_map_getptp(void)
 {
 	vm_page_t ptp;
 
 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 	ptp = pmap_large_map_getptp_unlocked();
 	if (ptp != NULL)
 		return (ptp);
 
 	PMAP_UNLOCK(kernel_pmap);
 	vm_wait(NULL);
 	PMAP_LOCK(kernel_pmap);
 	/* Callers retry. */
 	return (NULL);
 }
 
 /*
  * Returns a pointer to the page directory pointer entry that covers the
  * large map address "va".  The PML4 slot must belong to the large map
  * and must be valid; both conditions are asserted.
  */
 static pdp_entry_t *
 pmap_large_map_pdpe(vm_offset_t va)
 {
 	pdp_entry_t *pdp_page;
 	vm_pindex_t pml4_idx;
 	vm_paddr_t mphys;
 
 	pml4_idx = pmap_pml4e_index(va);
 	KASSERT(LMSPML4I <= pml4_idx && pml4_idx < LMSPML4I + lm_ents,
 	    ("pmap_large_map_pdpe: va %#jx out of range idx %#jx LMSPML4I "
 	    "%#jx lm_ents %d",
 	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
 	KASSERT((kernel_pmap->pm_pml4[pml4_idx] & X86_PG_V) != 0,
 	    ("pmap_large_map_pdpe: invalid pml4 for va %#jx idx %#jx "
 	    "LMSPML4I %#jx lm_ents %d",
 	    (uintmax_t)va, (uintmax_t)pml4_idx, LMSPML4I, lm_ents));
 
 	/* Reach the pdp page through the direct map and index into it. */
 	mphys = kernel_pmap->pm_pml4[pml4_idx] & PG_FRAME;
 	pdp_page = (pdp_entry_t *)PHYS_TO_DMAP(mphys);
 	return (pdp_page + pmap_pdpe_index(va));
 }
 
 /*
  * Returns a pointer to the page directory entry that covers the large map
  * address "va", allocating and installing the page directory page if the
  * pdp entry is empty.  The lookup is restarted whenever the allocation
  * had to wait, because the pmap lock was dropped in the meantime.  An
  * existing pdp entry must not be a 1GB mapping.
  */
 static pd_entry_t *
 pmap_large_map_pde(vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
 	vm_page_t pdpg;
 	vm_paddr_t pdpa;
 
 	for (;;) {
 		pdpe = pmap_large_map_pdpe(va);
 		if (*pdpe != 0) {
 			MPASS((*pdpe & X86_PG_PS) == 0);
 			pdpa = *pdpe & PG_FRAME;
 			break;
 		}
 		pdpg = pmap_large_map_getptp();
 		if (pdpg == NULL)
 			continue;
 		pdpa = VM_PAGE_TO_PHYS(pdpg);
 		*pdpe = pdpa | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
 		break;
 	}
 	return ((pd_entry_t *)PHYS_TO_DMAP(pdpa) + pmap_pde_index(va));
 }
 
 /*
  * Returns a pointer to the page table entry that covers the large map
  * address "va", allocating and installing the page table page if the page
  * directory entry is empty.  A newly installed entry is accounted for in
  * the wire count of the page directory page that holds it.  The lookup is
  * restarted whenever the allocation had to wait, because the pmap lock
  * was dropped in the meantime.  An existing page directory entry must not
  * be a 2MB mapping.
  */
 static pt_entry_t *
 pmap_large_map_pte(vm_offset_t va)
 {
 	pd_entry_t *pde;
 	vm_page_t ptpg;
 	vm_paddr_t ptpa;
 
 	for (;;) {
 		pde = pmap_large_map_pde(va);
 		if (*pde != 0) {
 			MPASS((*pde & X86_PG_PS) == 0);
 			ptpa = *pde & PG_FRAME;
 			break;
 		}
 		ptpg = pmap_large_map_getptp();
 		if (ptpg == NULL)
 			continue;
 		ptpa = VM_PAGE_TO_PHYS(ptpg);
 		*pde = ptpa | X86_PG_A | X86_PG_RW | X86_PG_V | pg_nx;
 		PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->wire_count++;
 		break;
 	}
 	return ((pt_entry_t *)PHYS_TO_DMAP(ptpa) + pmap_pte_index(va));
 }
 
 /*
  * Translates the large map virtual address "va" to a physical address by
  * walking the page tables from the pdp level down.  The walk ends at the
  * first level that maps a large page (1GB or 2MB) or at the 4KB PTE.
  * Every entry on the path is asserted to be valid; nothing is allocated.
  */
 static vm_paddr_t
 pmap_large_map_kextract(vm_offset_t va)
 {
 	pdp_entry_t *pdpe, pdp;
 	pd_entry_t *pde, pd;
 	pt_entry_t *pte, pt;
 
 	KASSERT(LARGEMAP_MIN_ADDRESS <= va && va < PMAP_LARGEMAP_MAX_ADDRESS(),
 	    ("not largemap range %#lx", (u_long)va));
 	pdpe = pmap_large_map_pdpe(va);
 	pdp = *pdpe;
 	KASSERT((pdp & X86_PG_V) != 0,
 	    ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
 	    (u_long)pdpe, pdp));
 	if ((pdp & X86_PG_PS) != 0) {
 		/* 1GB mapping: frame from the PDPE, offset from va. */
 		KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
 		    ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
 		    (u_long)pdpe, pdp));
 		return ((pdp & PG_PS_PDP_FRAME) | (va & PDPMASK));
 	}
 	pde = pmap_pdpe_to_pde(pdpe, va);
 	pd = *pde;
 	KASSERT((pd & X86_PG_V) != 0,
 	    ("invalid pd va %#lx pde %#lx pd %#lx", va, (u_long)pde, pd));
 	/* 2MB mapping: frame from the PDE, offset from va. */
 	if ((pd & X86_PG_PS) != 0)
 		return ((pd & PG_PS_FRAME) | (va & PDRMASK));
 	pte = pmap_pde_to_pte(pde, va);
 	pt = *pte;
 	KASSERT((pt & X86_PG_V) != 0,
 	    ("invalid pte va %#lx pte %#lx pt %#lx", va, (u_long)pte, pt));
 	return ((pt & PG_FRAME) | (va & PAGE_MASK));
 }
 
 /*
  * Allocates "len" bytes of virtual address space from the large map
  * arena with the requested alignment and phase.  The address is stored in
  * "vmem_res".  Returns the result of vmem_xalloc().
  */
 static int
 pmap_large_map_getva(vm_size_t len, vm_offset_t align, vm_offset_t phase,
     vmem_addr_t *vmem_res)
 {
 	int error;
 
 	/*
 	 * Large mappings are close to static, so nothing is gained by
 	 * sleeping until an earlier allocation is freed; the request is
 	 * therefore made with M_NOWAIT.
 	 */
 	error = vmem_xalloc(large_vmem, len, align, phase, 0, VMEM_ADDR_MIN,
 	    VMEM_ADDR_MAX, M_NOWAIT | M_BESTFIT, vmem_res);
 	return (error);
 }
 
 /*
  * Maps the physical range [spa, spa + len) into kernel virtual address
  * space with memory attribute "mattr" and stores the resulting address in
  * "*addr".
  *
  * If the range ends at or below dmaplimit, the direct map address is used
  * and only its memory attribute is changed.  Otherwise virtual address
  * space is allocated from the large map arena, preferring an alignment
  * that permits 1GB and then 2MB mappings, and the page tables are filled
  * using the largest page size that fits at each step.
  *
  * Returns 0 on success, EINVAL if "len" is zero or "spa + len" wraps
  * around, and otherwise the error from pmap_change_attr() or from the
  * address space allocation.
  *
  * NOTE(review): the fill loop subtracts page-sized increments from the
  * unsigned "len", so it appears to assume that "len" is a multiple of
  * PAGE_SIZE -- confirm that callers guarantee this.
  */
 int
 pmap_large_map(vm_paddr_t spa, vm_size_t len, void **addr,
     vm_memattr_t mattr)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	vm_offset_t va, inc;
 	vmem_addr_t vmem_res;
 	vm_paddr_t pa;
 	int error;
 
 	if (len == 0 || spa + len < spa)
 		return (EINVAL);
 
 	/* See if DMAP can serve. */
 	if (spa + len <= dmaplimit) {
 		va = PHYS_TO_DMAP(spa);
 		*addr = (void *)va;
 		return (pmap_change_attr(va, len, mattr));
 	}
 
 	/*
 	 * No, allocate KVA.  Fit the address with best possible
 	 * alignment for superpages.  Fall back to worse align if
 	 * failed.
 	 */
 	error = ENOMEM;
 	/* Try 1GB alignment if the range holds an aligned 1GB chunk. */
 	if ((amd_feature & AMDID_PAGE1GB) != 0 && rounddown2(spa + len,
 	    NBPDP) >= roundup2(spa, NBPDP) + NBPDP)
 		error = pmap_large_map_getva(len, NBPDP, spa & PDPMASK,
 		    &vmem_res);
 	/* Then 2MB alignment if the range holds an aligned 2MB chunk. */
 	if (error != 0 && rounddown2(spa + len, NBPDR) >= roundup2(spa,
 	    NBPDR) + NBPDR)
 		error = pmap_large_map_getva(len, NBPDR, spa & PDRMASK,
 		    &vmem_res);
 	if (error != 0)
 		error = pmap_large_map_getva(len, PAGE_SIZE, 0, &vmem_res);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Fill pagetable.  PG_M is not pre-set, we scan modified bits
 	 * in the pagetable to minimize flushing.  No need to
 	 * invalidate TLB, since we only update invalid entries.
 	 */
 	PMAP_LOCK(kernel_pmap);
 	for (pa = spa, va = vmem_res; len > 0; pa += inc, va += inc,
 	    len -= inc) {
 		if ((amd_feature & AMDID_PAGE1GB) != 0 && len >= NBPDP &&
 		    (pa & PDPMASK) == 0 && (va & PDPMASK) == 0) {
 			/* 1GB mapping; pdp pages are not wire-counted. */
 			pdpe = pmap_large_map_pdpe(va);
 			MPASS(*pdpe == 0);
 			*pdpe = pa | pg_g | X86_PG_PS | X86_PG_RW |
 			    X86_PG_V | X86_PG_A | pg_nx |
 			    pmap_cache_bits(kernel_pmap, mattr, TRUE);
 			inc = NBPDP;
 		} else if (len >= NBPDR && (pa & PDRMASK) == 0 &&
 		    (va & PDRMASK) == 0) {
 			/* 2MB mapping; count it in the page directory page. */
 			pde = pmap_large_map_pde(va);
 			MPASS(*pde == 0);
 			*pde = pa | pg_g | X86_PG_PS | X86_PG_RW |
 			    X86_PG_V | X86_PG_A | pg_nx |
 			    pmap_cache_bits(kernel_pmap, mattr, TRUE);
 			PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde))->
 			    wire_count++;
 			inc = NBPDR;
 		} else {
 			/* 4KB mapping; count it in the page table page. */
 			pte = pmap_large_map_pte(va);
 			MPASS(*pte == 0);
 			*pte = pa | pg_g | X86_PG_RW | X86_PG_V |
 			    X86_PG_A | pg_nx | pmap_cache_bits(kernel_pmap,
 			    mattr, FALSE);
 			PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte))->
 			    wire_count++;
 			inc = PAGE_SIZE;
 		}
 	}
 	PMAP_UNLOCK(kernel_pmap);
 	MPASS(len == 0);
 
 	*addr = (void *)vmem_res;
 	return (0);
 }
 
 /*
  * Remove the large map mapping covering [svaa, svaa + len) and return
  * its KVA to the large map arena.
  *
  * Nothing is done when len is zero, when the range wraps, or when the
  * range lies inside the direct map window.  Otherwise the range is walked
  * under the kernel pmap lock: each 1GB, 2MB or 4KB entry is cleared, the
  * wire count of the page table page that held it is dropped, and page
  * table pages whose count reaches zero are unlinked from their parent and
  * collected for freeing.  The TLB range is invalidated before the lock is
  * released; the collected pages and the KVA are freed afterwards.
  */
 void
 pmap_large_unmap(void *svaa, vm_size_t len)
 {
 	vm_offset_t sva, va;
 	vm_size_t inc;
 	pdp_entry_t *pdpe, pdp;
 	pd_entry_t *pde, pd;
 	pt_entry_t *pte;
 	vm_page_t m;
 	struct spglist spgf;
 
 	sva = (vm_offset_t)svaa;
 	if (len == 0 || sva + len < sva || (sva >= DMAP_MIN_ADDRESS &&
 	    sva + len <= DMAP_MIN_ADDRESS + dmaplimit))
 		return;
 
 	/* spgf collects page table pages that become empty. */
 	SLIST_INIT(&spgf);
 	KASSERT(LARGEMAP_MIN_ADDRESS <= sva &&
 	    sva + len <= PMAP_LARGEMAP_MAX_ADDRESS(),
 	    ("not largemap range %#lx %#lx", (u_long)svaa, (u_long)svaa + len));
 	PMAP_LOCK(kernel_pmap);
 	for (va = sva; va < sva + len; va += inc) {
 		pdpe = pmap_large_map_pdpe(va);
 		pdp = *pdpe;
 		KASSERT((pdp & X86_PG_V) != 0,
 		    ("invalid pdp va %#lx pdpe %#lx pdp %#lx", va,
 		    (u_long)pdpe, pdp));
 		if ((pdp & X86_PG_PS) != 0) {
 			/* 1GB page: must be aligned and fully covered. */
 			KASSERT((amd_feature & AMDID_PAGE1GB) != 0,
 			    ("no 1G pages, va %#lx pdpe %#lx pdp %#lx", va,
 			    (u_long)pdpe, pdp));
 			KASSERT((va & PDPMASK) == 0,
 			    ("PDPMASK bit set, va %#lx pdpe %#lx pdp %#lx", va,
 			    (u_long)pdpe, pdp));
 			KASSERT(va + NBPDP <= sva + len,
 			    ("unmap covers partial 1GB page, sva %#lx va %#lx "
 			    "pdpe %#lx pdp %#lx len %#lx", sva, va,
 			    (u_long)pdpe, pdp, len));
 			*pdpe = 0;
 			inc = NBPDP;
 			continue;
 		}
 		pde = pmap_pdpe_to_pde(pdpe, va);
 		pd = *pde;
 		KASSERT((pd & X86_PG_V) != 0,
 		    ("invalid pd va %#lx pde %#lx pd %#lx", va,
 		    (u_long)pde, pd));
 		if ((pd & X86_PG_PS) != 0) {
 			/* 2MB page: must be aligned and fully covered. */
 			KASSERT((va & PDRMASK) == 0,
 			    ("PDRMASK bit set, va %#lx pde %#lx pd %#lx", va,
 			    (u_long)pde, pd));
 			KASSERT(va + NBPDR <= sva + len,
 			    ("unmap covers partial 2MB page, sva %#lx va %#lx "
 			    "pde %#lx pd %#lx len %#lx", sva, va, (u_long)pde,
 			    pd, len));
 			pde_store(pde, 0);
 			inc = NBPDR;
 			/* Drop the reference held on the page directory page. */
 			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
 			m->wire_count--;
 			if (m->wire_count == 0) {
 				*pdpe = 0;
 				SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
 			}
 			continue;
 		}
 		pte = pmap_pde_to_pte(pde, va);
 		KASSERT((*pte & X86_PG_V) != 0,
 		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
 		    (u_long)pte, *pte));
 		pte_clear(pte);
 		inc = PAGE_SIZE;
 		/* Drop the reference held on the page table page. */
 		m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pte));
 		m->wire_count--;
 		if (m->wire_count == 0) {
 			/*
 			 * The page table page is empty: unlink it and
 			 * drop the reference it held on the page
 			 * directory page, which may become empty too.
 			 */
 			*pde = 0;
 			SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
 			m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pde));
 			m->wire_count--;
 			if (m->wire_count == 0) {
 				*pdpe = 0;
 				SLIST_INSERT_HEAD(&spgf, m, plinks.s.ss);
 			}
 		}
 	}
 	pmap_invalidate_range(kernel_pmap, sva, sva + len);
 	PMAP_UNLOCK(kernel_pmap);
 	vm_page_free_pages_toq(&spgf, false);
 	vmem_free(large_vmem, sva, len);
 }
 
 /* Write-back fence variant that issues mfence(). */
 static void
 pmap_large_map_wb_fence_mfence(void)
 {
 
 	mfence();
 }
 
 /* Write-back fence variant that issues sfence(). */
 static void
 pmap_large_map_wb_fence_sfence(void)
 {
 
 	sfence();
 }
 
 /* Write-back fence variant that does nothing. */
 static void
 pmap_large_map_wb_fence_nop(void)
 {
 }
 
 /*
  * Resolver choosing the fence used around cache write-back:
  *  - non-Intel CPUs get the mfence variant;
  *  - Intel CPUs advertising neither CLWB nor CLFLUSHOPT get the sfence
  *    variant;
  *  - all other Intel CPUs get the no-op variant.
  *
  * NOTE(review): the comment on the last branch talks about clflush, yet
  * that branch is the one taken when CLWB or CLFLUSHOPT is advertised --
  * confirm the intended pairing against the Intel SDM ordering rules.
  */
 DEFINE_IFUNC(static, void, pmap_large_map_wb_fence, (void))
 {
 
 	if (cpu_vendor_id != CPU_VENDOR_INTEL)
 		return (pmap_large_map_wb_fence_mfence);
 	else if ((cpu_stdext_feature & (CPUID_STDEXT_CLWB |
 	    CPUID_STDEXT_CLFLUSHOPT)) == 0)
 		return (pmap_large_map_wb_fence_sfence);
 	else
 		/* clflush is strongly enough ordered */
 		return (pmap_large_map_wb_fence_nop);
 }
 
 /*
  * Write back, using CLWB, every cache line overlapping [va, va + len).
  *
  * The walk is bounded by the end address instead of counting len down in
  * cache-line steps: len is unsigned, so a length that is not a multiple
  * of cpu_clflush_line_size would wrap past zero and keep the loop
  * running.  For a line-aligned va and a len that is a multiple of the
  * line size the set of lines written back is unchanged.
  */
 static void
 pmap_large_map_flush_range_clwb(vm_offset_t va, vm_size_t len)
 {
 	vm_offset_t eva;
 
 	if (len == 0)
 		return;
 	eva = va + len;
 	/*
 	 * Start from the line containing va so that an unaligned start
 	 * cannot step over the line holding the tail of the range.
 	 * Assumes cpu_clflush_line_size is a power of two.
 	 */
 	va &= ~((vm_offset_t)cpu_clflush_line_size - 1);
 	for (; va < eva; va += cpu_clflush_line_size)
 		clwb(va);
 }
 
 /*
  * Flush, using CLFLUSHOPT, every cache line overlapping [va, va + len).
  *
  * The walk is bounded by the end address instead of counting len down in
  * cache-line steps: len is unsigned, so a length that is not a multiple
  * of cpu_clflush_line_size would wrap past zero and keep the loop
  * running.  For a line-aligned va and a len that is a multiple of the
  * line size the set of flushed lines is unchanged.
  */
 static void
 pmap_large_map_flush_range_clflushopt(vm_offset_t va, vm_size_t len)
 {
 	vm_offset_t eva;
 
 	if (len == 0)
 		return;
 	eva = va + len;
 	/*
 	 * Start from the line containing va so that an unaligned start
 	 * cannot step over the line holding the tail of the range.
 	 * Assumes cpu_clflush_line_size is a power of two.
 	 */
 	va &= ~((vm_offset_t)cpu_clflush_line_size - 1);
 	for (; va < eva; va += cpu_clflush_line_size)
 		clflushopt(va);
 }
 
 /*
  * Flush, using CLFLUSH, every cache line overlapping [va, va + len).
  *
  * The walk is bounded by the end address instead of counting len down in
  * cache-line steps: len is unsigned, so a length that is not a multiple
  * of cpu_clflush_line_size would wrap past zero and keep the loop
  * running.  For a line-aligned va and a len that is a multiple of the
  * line size the set of flushed lines is unchanged.
  */
 static void
 pmap_large_map_flush_range_clflush(vm_offset_t va, vm_size_t len)
 {
 	vm_offset_t eva;
 
 	if (len == 0)
 		return;
 	eva = va + len;
 	/*
 	 * Start from the line containing va so that an unaligned start
 	 * cannot step over the line holding the tail of the range.
 	 * Assumes cpu_clflush_line_size is a power of two.
 	 */
 	va &= ~((vm_offset_t)cpu_clflush_line_size - 1);
 	for (; va < eva; va += cpu_clflush_line_size)
 		clflush(va);
 }
 
 /* Flush variant that does nothing; both arguments are unused. */
 static void
 pmap_large_map_flush_range_nop(vm_offset_t sva __unused, vm_size_t len __unused)
 {
 }
 
 /*
  * Resolver choosing the cache-line flush routine from the CPU feature
  * bits, in order of preference: CLWB, then CLFLUSHOPT, then CLFLUSH.
  * When none of them is advertised the no-op variant is selected.
  */
 DEFINE_IFUNC(static, void, pmap_large_map_flush_range, (vm_offset_t, vm_size_t))
 {
 
 	if ((cpu_stdext_feature & CPUID_STDEXT_CLWB) != 0)
 		return (pmap_large_map_flush_range_clwb);
 	else if ((cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0)
 		return (pmap_large_map_flush_range_clflushopt);
 	else if ((cpu_feature & CPUID_CLFSH) != 0)
 		return (pmap_large_map_flush_range_clflush);
 	else
 		return (pmap_large_map_flush_range_nop);
 }
 
 /*
  * Write back the caches for the large map range [sva, eva).
  *
  * The range is walked one mapping at a time; the step is the size of the
  * page table entry (1GB, 2MB or 4KB) that maps the current address.  A
  * mapping is flushed only if its entry has PG_M set, or if a concurrent
  * write-back of the same entry was observed.  X86_PG_AVAIL1 in the entry
  * marks a write-back in progress; it is set together with clearing PG_M
  * by a compare-and-set and cleared again once the flush is done.
  */
 static void
 pmap_large_map_wb_large(vm_offset_t sva, vm_offset_t eva)
 {
 	volatile u_long *pe;
 	u_long p;
 	vm_offset_t va;
 	vm_size_t inc;
 	bool seen_other;
 
 	for (va = sva; va < eva; va += inc) {
 		/* Locate the entry mapping va; inc == 0 means not found yet. */
 		inc = 0;
 		if ((amd_feature & AMDID_PAGE1GB) != 0) {
 			pe = (volatile u_long *)pmap_large_map_pdpe(va);
 			p = *pe;
 			if ((p & X86_PG_PS) != 0)
 				inc = NBPDP;
 		}
 		if (inc == 0) {
 			pe = (volatile u_long *)pmap_large_map_pde(va);
 			p = *pe;
 			if ((p & X86_PG_PS) != 0)
 				inc = NBPDR;
 		}
 		if (inc == 0) {
 			pe = (volatile u_long *)pmap_large_map_pte(va);
 			p = *pe;
 			inc = PAGE_SIZE;
 		}
 		seen_other = false;
 		for (;;) {
 			if ((p & X86_PG_AVAIL1) != 0) {
 				/*
 				 * Spin-wait for the end of a parallel
 				 * write-back.
 				 */
 				cpu_spinwait();
 				p = *pe;
 
 				/*
 				 * If we saw other write-back
 				 * occurring, we cannot rely on PG_M to
 				 * indicate state of the cache.  The
 				 * PG_M bit is cleared before the
 				 * flush to avoid ignoring new writes,
 				 * and writes which are relevant for
 				 * us might happen after.
 				 */
 				seen_other = true;
 				continue;
 			}
 
 			if ((p & X86_PG_M) != 0 || seen_other) {
 				/*
 				 * Claim the entry: clear PG_M and set
 				 * PG_AVAIL1 in one atomic step.  On
 				 * failure p holds the fresh value and
 				 * the checks above are repeated.
 				 */
 				if (!atomic_fcmpset_long(pe, &p,
 				    (p & ~X86_PG_M) | X86_PG_AVAIL1))
 					/*
 					 * If we saw PG_M without
 					 * PG_AVAIL1, and then on the
 					 * next attempt we do not
 					 * observe either PG_M or
 					 * PG_AVAIL1, the other
 					 * write-back started after us
 					 * and finished before us.  We
 					 * can rely on it doing our
 					 * work.
 					 */
 					continue;
 				pmap_large_map_flush_range(va, inc);
 				atomic_clear_long(pe, X86_PG_AVAIL1);
 			}
 			break;
 		}
 		maybe_yield();
 	}
 }
 
 /*
  * Write-back cache lines for the given address range.
  *
  * Must be called only on the range or sub-range returned from
  * pmap_large_map().  Must not be called on the coalesced ranges.
  *
  * Does nothing on CPUs without CLWB, CLFLUSHOPT, or CLFLUSH
  * instructions support.
  *
  * A range that lies inside the direct map window is flushed directly;
  * any other range must belong to the large map and is handled per page
  * table entry by pmap_large_map_wb_large().  The selected write-back
  * fence is issued both before and after the flush.
  */
 void
 pmap_large_map_wb(void *svap, vm_size_t len)
 {
 	vm_offset_t eva, sva;
 
 	sva = (vm_offset_t)svap;
 	eva = sva + len;
 	pmap_large_map_wb_fence();
 	if (sva >= DMAP_MIN_ADDRESS && eva <= DMAP_MIN_ADDRESS + dmaplimit) {
 		pmap_large_map_flush_range(sva, len);
 	} else {
 		KASSERT(sva >= LARGEMAP_MIN_ADDRESS &&
 		    eva <= LARGEMAP_MIN_ADDRESS + lm_ents * NBPML4,
 		    ("pmap_large_map_wb: not largemap %#lx %#lx", sva, len));
 		pmap_large_map_wb_large(sva, eva);
 	}
 	pmap_large_map_wb_fence();
 }
 
 /*
  * Grab the next page of the PTI object: wired, zeroed and not busied.
  * The object must be write-locked; the object page index is advanced on
  * every call.
  */
 static vm_page_t
 pmap_pti_alloc_page(void)
 {
 	int allocflags;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 	allocflags = VM_ALLOC_NOBUSY | VM_ALLOC_WIRED | VM_ALLOC_ZERO;
 	return (vm_page_grab(pti_obj, pti_pg_idx++, allocflags));
 }
 
 /*
  * Drop one wiring of a PTI page table page.  The page is freed when
  * vm_page_unwire_noq() reports success, in which case true is returned;
  * otherwise the page stays allocated and false is returned.
  */
 static bool
 pmap_pti_free_page(vm_page_t m)
 {
 	bool released;
 
 	KASSERT(m->wire_count > 0, ("page %p not wired", m));
 	released = vm_page_unwire_noq(m);
 	if (released)
 		vm_page_free_zero(m);
 	return (released);
 }
 
 /*
  * Build the initial PTI page table.  Does nothing unless pti is enabled.
  *
  * Allocates the backing VM object and the PTI PML4 page, sets up a PDP
  * page for the kernel address range in NBPML4 steps, and then enters the
  * per-CPU data, GDT, IDT, TSS, the IST stacks of every CPU and the kernel
  * text into the PTI table.  Finally marks the table as finalized.
  */
 static void
 pmap_pti_init(void)
 {
 	vm_page_t pml4_pg;
 	pdp_entry_t *pdpe;
 	vm_offset_t va;
 	int i;
 
 	if (!pti)
 		return;
 	pti_obj = vm_pager_allocate(OBJT_PHYS, NULL, 0, VM_PROT_ALL, 0, NULL);
 	VM_OBJECT_WLOCK(pti_obj);
 	pml4_pg = pmap_pti_alloc_page();
 	pti_pml4 = (pml4_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(pml4_pg));
 	/*
 	 * Set up a PDP page per kernel PML4 slot and take an extra
 	 * wiring on it.  The additional comparisons presumably stop
 	 * the loop once va wraps around.
 	 */
 	for (va = VM_MIN_KERNEL_ADDRESS; va <= VM_MAX_KERNEL_ADDRESS &&
 	    va >= VM_MIN_KERNEL_ADDRESS && va > NBPML4; va += NBPML4) {
 		pdpe = pmap_pti_pdpe(va);
 		pmap_pti_wire_pte(pdpe);
 	}
 	/* Non-executable data the PTI table must map. */
 	pmap_pti_add_kva_locked((vm_offset_t)&__pcpu[0],
 	    (vm_offset_t)&__pcpu[0] + sizeof(__pcpu[0]) * MAXCPU, false);
 	pmap_pti_add_kva_locked((vm_offset_t)gdt, (vm_offset_t)gdt +
 	    sizeof(struct user_segment_descriptor) * NGDT * MAXCPU, false);
 	pmap_pti_add_kva_locked((vm_offset_t)idt, (vm_offset_t)idt +
 	    sizeof(struct gate_descriptor) * NIDT, false);
 	pmap_pti_add_kva_locked((vm_offset_t)common_tss,
 	    (vm_offset_t)common_tss + sizeof(struct amd64tss) * MAXCPU, false);
 	CPU_FOREACH(i) {
 		/* Doublefault stack IST 1 */
 		va = common_tss[i].tss_ist1;
 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 		/* NMI stack IST 2 */
 		va = common_tss[i].tss_ist2 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 		/* MC# stack IST 3 */
 		va = common_tss[i].tss_ist3 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 		/* DB# stack IST 4 */
 		va = common_tss[i].tss_ist4 + sizeof(struct nmi_pcpu);
 		pmap_pti_add_kva_locked(va - PAGE_SIZE, va, false);
 	}
 	/* Kernel text up to etext is entered as executable. */
 	pmap_pti_add_kva_locked((vm_offset_t)kernphys + KERNBASE,
 	    (vm_offset_t)etext, true);
 	pti_finalized = true;
 	VM_OBJECT_WUNLOCK(pti_obj);
 }
 SYSINIT(pmap_pti, SI_SUB_CPU + 1, SI_ORDER_ANY, pmap_pti_init, NULL);
 
 /*
  * Return a pointer to the PTI page table's PDP entry for va, allocating
  * the PDP page if the PML4 slot is still empty.  The PTI object must be
  * write-locked.  Panics if a new PML4 slot would have to be populated
  * after the PTI table has been finalized.
  */
 static pdp_entry_t *
 pmap_pti_pdpe(vm_offset_t va)
 {
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	vm_page_t m;
 	vm_pindex_t pml4_idx;
 	vm_paddr_t mphys;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 
 	pml4_idx = pmap_pml4e_index(va);
 	pml4e = &pti_pml4[pml4_idx];
 	m = NULL;
 	if (*pml4e == 0) {
 		if (pti_finalized)
 			panic("pml4 alloc after finalization\n");
 		m = pmap_pti_alloc_page();
 		/*
 		 * Re-check the slot after the allocation; if it was
 		 * filled in the meantime, give the new page back and use
 		 * the installed one.
 		 */
 		if (*pml4e != 0) {
 			pmap_pti_free_page(m);
 			mphys = *pml4e & ~PAGE_MASK;
 		} else {
 			mphys = VM_PAGE_TO_PHYS(m);
 			*pml4e = mphys | X86_PG_RW | X86_PG_V;
 		}
 	} else {
 		mphys = *pml4e & ~PAGE_MASK;
 	}
 	pdpe = (pdp_entry_t *)PHYS_TO_DMAP(mphys) + pmap_pdpe_index(va);
 	return (pdpe);
 }
 
 static void
 pmap_pti_wire_pte(void *pte)
 {
 	vm_page_t m;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
 	m->wire_count++;
 }
 
 /*
  * Drop one wiring from the page table page that contains the entry pde
  * points at.  Unless only_ref is true the page is asserted to keep at
  * least one further wiring.  The PTI object must be write-locked.
  */
 static void
 pmap_pti_unwire_pde(void *pde, bool only_ref)
 {
 	vm_page_t m;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pde));
 	MPASS(m->wire_count > 0);
 	MPASS(only_ref || m->wire_count > 1);
 	pmap_pti_free_page(m);
 }
 
 /*
  * Drop one wiring from the page table page that contains the entry pte
  * points at.  If that frees the page, the page directory entry for va is
  * cleared and the page directory page is unwired in turn.  The PTI object
  * must be write-locked.
  */
 static void
 pmap_pti_unwire_pte(void *pte, vm_offset_t va)
 {
 	vm_page_t m;
 	pd_entry_t *pde;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((uintptr_t)pte));
 	MPASS(m->wire_count > 0);
 	if (pmap_pti_free_page(m)) {
 		pde = pmap_pti_pde(va);
 		/* The entry must be a valid pointer to a page table. */
 		MPASS((*pde & (X86_PG_PS | X86_PG_V)) == X86_PG_V);
 		*pde = 0;
 		pmap_pti_unwire_pde(pde, false);
 	}
 }
 
 /*
  * Return a pointer to the PTI page table's page directory entry for va,
  * allocating the page directory page if the PDP entry is still empty.
  * The PDP entry is asserted not to be a large-page mapping.  The PTI
  * object must be write-locked.
  */
 static pd_entry_t *
 pmap_pti_pde(vm_offset_t va)
 {
 	pdp_entry_t *pdpe;
 	pd_entry_t *pde;
 	vm_page_t m;
 	vm_pindex_t pd_idx;
 	vm_paddr_t mphys;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 
 	pdpe = pmap_pti_pdpe(va);
 	if (*pdpe == 0) {
 		m = pmap_pti_alloc_page();
 		/*
 		 * Re-check the entry after the allocation; if it was
 		 * filled in the meantime, give the new page back.
 		 */
 		if (*pdpe != 0) {
 			pmap_pti_free_page(m);
 			MPASS((*pdpe & X86_PG_PS) == 0);
 			mphys = *pdpe & ~PAGE_MASK;
 		} else {
 			mphys =  VM_PAGE_TO_PHYS(m);
 			*pdpe = mphys | X86_PG_RW | X86_PG_V;
 		}
 	} else {
 		MPASS((*pdpe & X86_PG_PS) == 0);
 		mphys = *pdpe & ~PAGE_MASK;
 	}
 
 	pde = (pd_entry_t *)PHYS_TO_DMAP(mphys);
 	pd_idx = pmap_pde_index(va);
 	pde += pd_idx;
 	return (pde);
 }
 
 /*
  * Return a pointer to the PTI page table's PTE for va, allocating the
  * page table page if the page directory entry is still empty.  The PTI
  * object must be write-locked.
  *
  * When unwire_pde is not NULL an extra wiring is taken on the page
  * directory page up front.  *unwire_pde is set to false if this call
  * installed a new page table page into the directory, and to true
  * otherwise, i.e. when the caller should drop the extra wiring again.
  */
 static pt_entry_t *
 pmap_pti_pte(vm_offset_t va, bool *unwire_pde)
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	vm_page_t m;
 	vm_paddr_t mphys;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 
 	pde = pmap_pti_pde(va);
 	if (unwire_pde != NULL) {
 		*unwire_pde = true;
 		pmap_pti_wire_pte(pde);
 	}
 	if (*pde == 0) {
 		m = pmap_pti_alloc_page();
 		/*
 		 * Re-check the entry after the allocation; if it was
 		 * filled in the meantime, give the new page back.
 		 */
 		if (*pde != 0) {
 			pmap_pti_free_page(m);
 			MPASS((*pde & X86_PG_PS) == 0);
 			mphys = *pde & ~(PAGE_MASK | pg_nx);
 		} else {
 			mphys = VM_PAGE_TO_PHYS(m);
 			*pde = mphys | X86_PG_RW | X86_PG_V;
 			if (unwire_pde != NULL)
 				*unwire_pde = false;
 		}
 	} else {
 		MPASS((*pde & X86_PG_PS) == 0);
 		mphys = *pde & ~(PAGE_MASK | pg_nx);
 	}
 
 	pte = (pt_entry_t *)PHYS_TO_DMAP(mphys);
 	pte += pmap_pte_index(va);
 
 	return (pte);
 }
 
 /*
  * Enter the kernel virtual range [sva, eva), widened to page boundaries,
  * into the PTI page table.  Each page is mapped to the physical address
  * reported by pmap_kextract(), read/write and global, and non-executable
  * unless exec is true.  The PTI object must be write-locked.
  *
  * An already populated PTE is tolerated only before the PTI table has
  * been finalized, and only if it is identical to the entry that would
  * have been written.
  */
 static void
 pmap_pti_add_kva_locked(vm_offset_t sva, vm_offset_t eva, bool exec)
 {
 	vm_paddr_t pa;
 	pd_entry_t *pde;
 	pt_entry_t *pte, ptev;
 	bool unwire_pde;
 
 	VM_OBJECT_ASSERT_WLOCKED(pti_obj);
 
 	sva = trunc_page(sva);
 	MPASS(sva > VM_MAXUSER_ADDRESS);
 	eva = round_page(eva);
 	MPASS(sva < eva);
 	for (; sva < eva; sva += PAGE_SIZE) {
 		pte = pmap_pti_pte(sva, &unwire_pde);
 		pa = pmap_kextract(sva);
 		ptev = pa | X86_PG_RW | X86_PG_V | X86_PG_A | X86_PG_G |
 		    (exec ? 0 : pg_nx) | pmap_cache_bits(kernel_pmap,
 		    VM_MEMATTR_DEFAULT, FALSE);
 		if (*pte == 0) {
 			pte_store(pte, ptev);
 			/* The new entry wires its page table page. */
 			pmap_pti_wire_pte(pte);
 		} else {
 			KASSERT(!pti_finalized,
 			    ("pti overlap after fin %#lx %#lx %#lx",
 			    sva, *pte, ptev));
 			KASSERT(*pte == ptev,
 			    ("pti non-identical pte after fin %#lx %#lx %#lx",
 			    sva, *pte, ptev));
 		}
 		/* Drop the extra wiring taken by pmap_pti_pte(). */
 		if (unwire_pde) {
 			pde = pmap_pti_pde(sva);
 			pmap_pti_unwire_pde(pde, true);
 		}
 	}
 }
 
 /*
  * Locking wrapper around pmap_pti_add_kva_locked(): enters [sva, eva)
  * into the PTI page table while holding the PTI object write lock.
  * Does nothing when pti is disabled.
  */
 void
 pmap_pti_add_kva(vm_offset_t sva, vm_offset_t eva, bool exec)
 {
 
 	if (pti) {
 		VM_OBJECT_WLOCK(pti_obj);
 		pmap_pti_add_kva_locked(sva, eva, exec);
 		VM_OBJECT_WUNLOCK(pti_obj);
 	}
 }
 
 /*
  * Remove the kernel virtual range [sva, eva), widened to page
  * boundaries, from the PTI page table.  Every page in the range is
  * asserted to be mapped; its PTE is cleared and the page table page is
  * unwired.  The TLB range is invalidated before the PTI object lock is
  * released.  Does nothing when pti is disabled.
  */
 void
 pmap_pti_remove_kva(vm_offset_t sva, vm_offset_t eva)
 {
 	pt_entry_t *pte;
 	vm_offset_t va;
 
 	if (!pti)
 		return;
 	sva = rounddown2(sva, PAGE_SIZE);
 	MPASS(sva > VM_MAXUSER_ADDRESS);
 	eva = roundup2(eva, PAGE_SIZE);
 	MPASS(sva < eva);
 	VM_OBJECT_WLOCK(pti_obj);
 	for (va = sva; va < eva; va += PAGE_SIZE) {
 		pte = pmap_pti_pte(va, NULL);
 		KASSERT((*pte & X86_PG_V) != 0,
 		    ("invalid pte va %#lx pte %#lx pt %#lx", va,
 		    (u_long)pte, *pte));
 		pte_clear(pte);
 		pmap_pti_unwire_pte(pte, va);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, eva);
 	VM_OBJECT_WUNLOCK(pti_obj);
 }
 
 /*
  * Duplicate a PKRU range record.  Returns a freshly allocated copy of
  * the record data points at, or NULL if the non-sleeping allocation
  * fails.
  */
 static void *
 pkru_dup_range(void *ctx __unused, void *data)
 {
 	struct pmap_pkru_range *copy;
 
 	copy = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
 	if (copy != NULL)
 		memcpy(copy, data, sizeof(*copy));
 	return (copy);
 }
 
 /* Return a PKRU range record to its UMA zone; ctx is unused. */
 static void
 pkru_free_range(void *ctx __unused, void *node)
 {
 
 	uma_zfree(pmap_pkru_ranges_zone, node);
 }
 
 /*
  * Record protection key keyidx for [sva, eva) in the pmap's PKRU range
  * set.  The pmap lock must be held.
  *
  * Returns EBUSY when AMD64_PKRU_EXCL is requested and the range set is
  * not empty over [sva, eva), ENOMEM when the record cannot be allocated
  * without sleeping, and otherwise the result of rangeset_insert().
  * Only the AMD64_PKRU_PERSIST flag is stored in the record.
  */
 static int
 pmap_pkru_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
     int flags)
 {
 	struct pmap_pkru_range *ppr;
 	int error;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	MPASS(pmap->pm_type == PT_X86);
 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
 	if ((flags & AMD64_PKRU_EXCL) != 0 &&
 	    !rangeset_check_empty(&pmap->pm_pkru, sva, eva))
 		return (EBUSY);
 	ppr = uma_zalloc(pmap_pkru_ranges_zone, M_NOWAIT);
 	if (ppr == NULL)
 		return (ENOMEM);
 	ppr->pkru_keyidx = keyidx;
 	ppr->pkru_flags = flags & AMD64_PKRU_PERSIST;
 	error = rangeset_insert(&pmap->pm_pkru, sva, eva, ppr);
 	/* On failure the record was not taken over by the range set. */
 	if (error != 0)
 		uma_zfree(pmap_pkru_ranges_zone, ppr);
 	return (error);
 }
 
 /*
  * Remove [sva, eva) from the pmap's PKRU range set and return the result
  * of rangeset_remove().  The pmap lock must be held.
  */
 static int
 pmap_pkru_deassign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	MPASS(pmap->pm_type == PT_X86);
 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
 	return (rangeset_remove(&pmap->pm_pkru, sva, eva));
 }
 
 /*
  * Empty the pmap's PKRU range set.  Nothing is done for pmaps that are
  * not of type PT_X86 or when the CPU lacks PKU support.  The pmap lock
  * must be held.
  */
 static void
 pmap_pkru_deassign_all(pmap_t pmap)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pmap->pm_type != PT_X86 ||
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
 		return;
 	rangeset_remove_all(&pmap->pm_pkru);
 }
 
 /*
  * Report whether every page of [sva, eva) carries the same protection
  * key state: either no page is covered by a PKRU range, or all pages are
  * covered by ranges with one and the same key index.  The pmap lock must
  * be held.
  *
  * Always returns true for pmaps that are not of type PT_X86, when the
  * CPU lacks PKU support, or when sva is not a user address.
  */
 static bool
 pmap_pkru_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct pmap_pkru_range *ppr, *prev_ppr;
 	vm_offset_t va;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pmap->pm_type != PT_X86 ||
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0 ||
 	    sva >= VM_MAXUSER_ADDRESS)
 		return (true);
 	MPASS(eva <= VM_MAXUSER_ADDRESS);
 	/*
 	 * prev_ppr trails ppr by one lookup; the loop increment runs on
 	 * "continue" as well, so it is refreshed on every iteration.
 	 */
 	for (va = sva, prev_ppr = NULL; va < eva; prev_ppr = ppr) {
 		ppr = rangeset_lookup(&pmap->pm_pkru, va);
 		if (va == sva) {
 			/*
 			 * There is nothing to compare the first lookup
 			 * against; it defines the state that the rest of
 			 * the range has to match.
 			 */
 			prev_ppr = ppr;
 		} else if ((ppr == NULL) ^ (prev_ppr == NULL)) {
 			/* Covered and uncovered pages are mixed. */
 			return (false);
 		}
 		if (ppr == NULL) {
 			va += PAGE_SIZE;
 			continue;
 		}
 		/* Both are non-NULL here, so the dereference is safe. */
 		if (prev_ppr->pkru_keyidx != ppr->pkru_keyidx)
 			return (false);
 		/* Skip to the end of the range that was just checked. */
 		va = ppr->pkru_rs_el.re_end;
 	}
 	return (true);
 }
 
 /*
  * Return the page table protection key bits that apply to va, or 0 when
  * no PKRU range covers va, the pmap is not of type PT_X86, the CPU lacks
  * PKU support, or va is not a user address.  The pmap lock must be held.
  */
 static pt_entry_t
 pmap_pkru_get(pmap_t pmap, vm_offset_t va)
 {
 	struct pmap_pkru_range *range;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pmap->pm_type == PT_X86 &&
 	    (cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0 &&
 	    va < VM_MAXUSER_ADDRESS) {
 		range = rangeset_lookup(&pmap->pm_pkru, va);
 		if (range != NULL)
 			return (X86_PG_PKU(range->pkru_keyidx));
 	}
 	return (0);
 }
 
 /*
  * Predicate handed to rangeset_remove_pred(): true for range records
  * that do not have AMD64_PKRU_PERSIST set, false for persistent ones.
  */
 static bool
 pred_pkru_on_remove(void *ctx __unused, void *r)
 {
 	struct pmap_pkru_range *range;
 
 	range = r;
 	if ((range->pkru_flags & AMD64_PKRU_PERSIST) != 0)
 		return (false);
 	return (true);
 }
 
 /*
  * Drop the PKRU range records within [sva, eva) that are accepted by
  * pred_pkru_on_remove().  Nothing is done for pmaps that are not of type
  * PT_X86 or when the CPU lacks PKU support.  The pmap lock must be held.
  */
 static void
 pmap_pkru_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (pmap->pm_type != PT_X86)
 		return;
 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
 		return;
 	rangeset_remove_pred(&pmap->pm_pkru, sva, eva, pred_pkru_on_remove);
 }
 
 /*
  * Copy the PKRU range set of src_pmap into dst_pmap and return the
  * result of rangeset_copy().  Both pmap locks must be held.  Returns 0
  * without copying when the source range set has no data context.
  */
 static int
 pmap_pkru_copy(pmap_t dst_pmap, pmap_t src_pmap)
 {
 
 	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
 	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
 	MPASS(dst_pmap->pm_type == PT_X86);
 	MPASS(src_pmap->pm_type == PT_X86);
 	MPASS((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) != 0);
 	if (src_pmap->pm_pkru.rs_data_ctx == NULL)
 		return (0);
 	return (rangeset_copy(&dst_pmap->pm_pkru, &src_pmap->pm_pkru));
 }
 
 /*
  * Rewrite the protection key bits of every valid mapping in [sva, eva)
  * to keyidx.  The pmap lock must be held.
  *
  * Address ranges whose PML4, PDP or page directory entry is not present
  * are skipped.  A 2MB mapping that lies entirely inside the range is
  * updated in place; one that is only partly covered is demoted first and
  * skipped if the demotion fails.  The TLB range is invalidated only if
  * at least one entry was changed.
  */
 static void
 pmap_pkru_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     u_int keyidx)
 {
 	pml4_entry_t *pml4e;
 	pdp_entry_t *pdpe;
 	pd_entry_t newpde, ptpaddr, *pde;
 	pt_entry_t newpte, *ptep, pte;
 	vm_offset_t va, va_next;
 	bool changed;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	MPASS(pmap->pm_type == PT_X86);
 	MPASS(keyidx <= PMAP_MAX_PKRU_IDX);
 
 	for (changed = false, va = sva; va < eva; va = va_next) {
 		pml4e = pmap_pml4e(pmap, va);
 		if ((*pml4e & X86_PG_V) == 0) {
 			/* Skip to the next PML4 slot, guarding against wrap. */
 			va_next = (va + NBPML4) & ~PML4MASK;
 			if (va_next < va)
 				va_next = eva;
 			continue;
 		}
 
 		pdpe = pmap_pml4e_to_pdpe(pml4e, va);
 		if ((*pdpe & X86_PG_V) == 0) {
 			/* Skip to the next PDP slot, guarding against wrap. */
 			va_next = (va + NBPDP) & ~PDPMASK;
 			if (va_next < va)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (va + NBPDR) & ~PDRMASK;
 		if (va_next < va)
 			va_next = eva;
 
 		pde = pmap_pdpe_to_pde(pdpe, va);
 		ptpaddr = *pde;
 		if (ptpaddr == 0)
 			continue;
 
 		MPASS((ptpaddr & X86_PG_V) != 0);
 		if ((ptpaddr & PG_PS) != 0) {
 			/* True only if the whole 2MB page is in range. */
 			if (va + NBPDR == va_next && eva >= va_next) {
 				newpde = (ptpaddr & ~X86_PG_PKU_MASK) |
 				    X86_PG_PKU(keyidx);
 				if (newpde != ptpaddr) {
 					*pde = newpde;
 					changed = true;
 				}
 				continue;
 			} else if (!pmap_demote_pde(pmap, pde, va)) {
 				continue;
 			}
 		}
 
 		if (va_next > eva)
 			va_next = eva;
 
 		for (ptep = pmap_pde_to_pte(pde, va); va != va_next;
 		    ptep++, va += PAGE_SIZE) {
 			pte = *ptep;
 			if ((pte & X86_PG_V) == 0)
 				continue;
 			newpte = (pte & ~X86_PG_PKU_MASK) | X86_PG_PKU(keyidx);
 			if (newpte != pte) {
 				*ptep = newpte;
 				changed = true;
 			}
 		}
 	}
 	if (changed)
 		pmap_invalidate_range(pmap, sva, eva);
 }
 
 /*
  * Validate the arguments of a protection key request.
  *
  * Returns EINVAL for a pmap that is not of type PT_X86, a key index
  * above PMAP_MAX_PKRU_IDX or unknown flag bits; EFAULT for an empty or
  * reversed range or one ending above VM_MAXUSER_ADDRESS; ENOTSUP when
  * the CPU lacks PKU support; and 0 otherwise.  The checks are made in
  * that order.
  */
 static int
 pmap_pkru_check_uargs(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     u_int keyidx, int flags)
 {
 
 	if (pmap->pm_type != PT_X86)
 		return (EINVAL);
 	if (keyidx > PMAP_MAX_PKRU_IDX)
 		return (EINVAL);
 	if ((flags & ~(AMD64_PKRU_PERSIST | AMD64_PKRU_EXCL)) != 0)
 		return (EINVAL);
 	if (eva <= sva)
 		return (EFAULT);
 	if (eva > VM_MAXUSER_ADDRESS)
 		return (EFAULT);
 	if ((cpu_stdext_feature2 & CPUID_STDEXT2_PKU) == 0)
 		return (ENOTSUP);
 	return (0);
 }
 
 /*
  * Assign protection key keyidx to the user range [sva, eva), widened to
  * page boundaries, and update the existing mappings accordingly.
  *
  * Returns the error from pmap_pkru_check_uargs() if the arguments are
  * rejected, otherwise the result of pmap_pkru_assign().  An ENOMEM
  * result is not returned: the pmap lock is dropped, vm_wait() is called
  * and the assignment is retried.
  */
 int
 pmap_pkru_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, u_int keyidx,
     int flags)
 {
 	int error;
 
 	sva = trunc_page(sva);
 	eva = round_page(eva);
 	error = pmap_pkru_check_uargs(pmap, sva, eva, keyidx, flags);
 	if (error != 0)
 		return (error);
 	do {
 		PMAP_LOCK(pmap);
 		error = pmap_pkru_assign(pmap, sva, eva, keyidx, flags);
 		if (error == 0)
 			pmap_pkru_update_range(pmap, sva, eva, keyidx);
 		PMAP_UNLOCK(pmap);
 		if (error == ENOMEM)
 			vm_wait(NULL);
 	} while (error == ENOMEM);
 	return (error);
 }
 
 /*
  * Remove the protection key assignment from the user range [sva, eva),
  * widened to page boundaries, and reset the key bits of the existing
  * mappings to key 0.
  *
  * Returns the error from pmap_pkru_check_uargs() if the arguments are
  * rejected, otherwise the result of pmap_pkru_deassign().  An ENOMEM
  * result is not returned: the pmap lock is dropped, vm_wait() is called
  * and the removal is retried.
  */
 int
 pmap_pkru_clear(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	int error;
 
 	sva = trunc_page(sva);
 	eva = round_page(eva);
 	error = pmap_pkru_check_uargs(pmap, sva, eva, 0, 0);
 	if (error != 0)
 		return (error);
 	do {
 		PMAP_LOCK(pmap);
 		error = pmap_pkru_deassign(pmap, sva, eva);
 		if (error == 0)
 			pmap_pkru_update_range(pmap, sva, eva, 0);
 		PMAP_UNLOCK(pmap);
 		if (error == ENOMEM)
 			vm_wait(NULL);
 	} while (error == ENOMEM);
 	return (error);
 }
 
 #ifdef DDB
 /*
  * DDB "show pte <addr>": print the page table entries that translate the
  * given virtual address, one level at a time.  The pmap of the thread
  * selected in the debugger is used if there is one, otherwise the
  * current CPU's pmap.  The walk stops at the first entry that is not
  * valid or, below the PML4 level, that maps a large page.
  */
 DB_SHOW_COMMAND(pte, pmap_print_pte)
 {
 	pmap_t pmap;
 	pml4_entry_t *pml4;
 	pdp_entry_t *pdp;
 	pd_entry_t *pde;
 	pt_entry_t *pte, PG_V;
 	vm_offset_t va;
 
 	if (!have_addr) {
 		db_printf("show pte addr\n");
 		return;
 	}
 	va = (vm_offset_t)addr;
 
 	if (kdb_thread != NULL)
 		pmap = vmspace_pmap(kdb_thread->td_proc->p_vmspace);
 	else
 		pmap = PCPU_GET(curpmap);
 
 	/* The valid bit is looked up per pmap. */
 	PG_V = pmap_valid_bit(pmap);
 	pml4 = pmap_pml4e(pmap, va);
 	db_printf("VA %#016lx pml4e %#016lx", va, *pml4);
 	if ((*pml4 & PG_V) == 0) {
 		db_printf("\n");
 		return;
 	}
 	pdp = pmap_pml4e_to_pdpe(pml4, va);
 	db_printf(" pdpe %#016lx", *pdp);
 	if ((*pdp & PG_V) == 0 || (*pdp & PG_PS) != 0) {
 		db_printf("\n");
 		return;
 	}
 	pde = pmap_pdpe_to_pde(pdp, va);
 	db_printf(" pde %#016lx", *pde);
 	if ((*pde & PG_V) == 0 || (*pde & PG_PS) != 0) {
 		db_printf("\n");
 		return;
 	}
 	pte = pmap_pde_to_pte(pde, va);
 	db_printf(" pte %#016lx\n", *pte);
 }
 
 /*
  * DDB "show phys2dmap <addr>": print the direct map virtual address of
  * the given physical address, or a usage line when no address is given.
  */
 DB_SHOW_COMMAND(phys2dmap, pmap_phys2dmap)
 {
 	vm_paddr_t a;
 
 	if (!have_addr) {
 		db_printf("show phys2dmap addr\n");
 		return;
 	}
 	a = (vm_paddr_t)addr;
 	db_printf("0x%jx\n", (uintmax_t)PHYS_TO_DMAP(a));
 }
 #endif
Index: projects/nfsv42/sys/arm64/arm64/pmap.c
===================================================================
--- projects/nfsv42/sys/arm64/arm64/pmap.c	(revision 350367)
+++ projects/nfsv42/sys/arm64/arm64/pmap.c	(revision 350368)
@@ -1,5832 +1,5930 @@
 /*-
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2003 Peter Wemm
  * All rights reserved.
  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  * Copyright (c) 2014 Andrew Turner
  * All rights reserved.
  * Copyright (c) 2014-2016 The FreeBSD Foundation
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * This software was developed by Andrew Turner under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/bitstring.h>
 #include <sys/bus.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/_unrhdr.h>
 #include <sys/smp.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 #include <machine/machdep.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 
 #include <arm/include/physmem.h>
 
 /*
  * Number of entries that fit into one page of each translation table
  * level (a page divided by the size of one entry).
  */
 #define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
 #define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
 #define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
 #define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
 
 /*
  * Cumulative entry counts: L0_ENTRIES level 0 entries, multiplied by the
  * per-page entry count at each further level.
  * NOTE(review): the "U" presumably stands for the user address space --
  * confirm against the users of these macros.
  */
 #define	NUL0E		L0_ENTRIES
 #define	NUL1E		(NUL0E * NL1PG)
 #define	NUL2E		(NUL1E * NL2PG)
 
 /*
  * PMAP_INLINE expands to an inline specifier unless DIAGNOSTIC is
  * defined, in which case it expands to nothing.  The gnu_inline attribute
  * form is used when the compiler defines __GNUC_GNU_INLINE__; otherwise
  * "extern inline" is used.
  */
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
 #else
 #define PMAP_INLINE	extern inline
 #endif
 #else
 #define PMAP_INLINE
 #endif
 
 /*
  * These are configured by the mair_el1 register. This is set up in locore.S
  */
 #define	DEVICE_MEMORY	0
 #define	UNCACHED_MEMORY	1
 #define	CACHED_MEMORY	2
 
 
 /*
  * PV_STAT(x) evaluates the statement x only when PV_STATS is defined;
  * otherwise it expands to an empty statement.
  */
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 /* Index of the L2-sized region containing v. */
 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
 /* pv_table slot for the L2-sized region containing physical address pa. */
 #define	pa_to_pvh(pa)		(&pv_table[pmap_l2_pindex(pa)])
 
 /* Size of the pv_list_locks array; one lock per possible CPU. */
 #define	NPV_LIST_LOCKS	MAXCPU
 
 /* PV list lock covering pa: pa_index(pa) hashed over the lock array. */
 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
 			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
 
 /*
  * Make *lockp the write-locked PV list lock that covers physical address
  * pa.  If *lockp already is that lock nothing happens; otherwise the
  * currently held lock (if *lockp is not NULL) is released first and the
  * new one is acquired.
  */
 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
 	struct rwlock **_lockp = (lockp);		\
 	struct rwlock *_new_lock;			\
 							\
 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
 	if (_new_lock != *_lockp) {			\
 		if (*_lockp != NULL)			\
 			rw_wunlock(*_lockp);		\
 		*_lockp = _new_lock;			\
 		rw_wlock(*_lockp);			\
 	}						\
 } while (0)
 
 /* Same as above, keyed by the physical address of vm_page m. */
 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
 
 /*
  * Release the write-locked PV list lock tracked by *lockp, if any, and
  * reset *lockp to NULL.
  */
 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
 	struct rwlock **_lockp = (lockp);		\
 							\
 	if (*_lockp != NULL) {				\
 		rw_wunlock(*_lockp);			\
 		*_lockp = NULL;				\
 	}						\
 } while (0)
 
 /* PV list lock covering the physical address of vm_page m. */
 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
 
 /*
  * The presence of this flag indicates that the mapping is writeable.
  * If the ATTR_AP_RO bit is also set, then the mapping is clean, otherwise it is
  * dirty.  This flag may only be set on managed mappings.
  */
 /* Held in a variable rather than a macro; its value is assigned elsewhere. */
 static pt_entry_t ATTR_SW_DBM;
 
 /* Storage for the kernel's pmap. */
 struct pmap kernel_pmap_store;
 
 /* Used for mapping ACPI memory before VM is initialized */
 #define	PMAP_PREINIT_MAPPING_COUNT	32
 /* Total KVA reserved for the pre-init mappings: one L2 block per slot. */
 #define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
 static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
 static int vm_initialized = 0;		/* No need to use pre-init maps when set */
 
 /*
  * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
  * Always map entire L2 block for simplicity.
  * VA of L2 block = preinit_map_va + i * L2_SIZE
  *
  * Each slot records the physical address, the virtual address and the
  * size of one pre-init mapping.
  */
 static struct pmap_preinit_mapping {
 	vm_paddr_t	pa;
 	vm_offset_t	va;
 	vm_size_t	size;
 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 /* NOTE(review): presumably the current end of mapped kernel VA -- confirm. */
 vm_offset_t kernel_vm_end = 0;
 
 /*
  * Data for the pv entry allocation mechanism.
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx pv_chunks_mutex;
 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
 
 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
 
 /* This code assumes all L1 DMAP entries will be used */
 CTASSERT((DMAP_MIN_ADDRESS  & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
 CTASSERT((DMAP_MAX_ADDRESS  & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
 
 #define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
 extern pt_entry_t pagetable_dmap[];
 
 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
 static vm_paddr_t physmap[PHYSMAP_SIZE];
 static u_int physmap_idx;
 
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0, "VM/pmap parameters");
 
 static int superpages_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
     "Are large page mappings enabled?");
 
 /*
  * Internal flags for pmap_enter()'s helper functions.
  */
 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
 
 /*
  * Forward declarations of file-local helpers.
  */
 
 /* PV entry and PV chunk management. */
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
 
 /* Mapping attribute changes, demotion, entry and removal. */
 static int pmap_change_attr(vm_offset_t va, vm_size_t size, int mode);
 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
     vm_offset_t va, struct rwlock **lockp);
 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
     u_int flags, vm_page_t m, struct rwlock **lockp);
 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m, struct rwlock **lockp);
 
 /* Page table page allocation and release. */
 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
 		struct rwlock **lockp);
 
 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct spglist *free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
 
 /*
  * Accessors for page table entries.  'table' is a pointer to the entry.
  *
  * The updating accessors are atomic because the System MMU may write to
  * the table at the same time as the CPU:
  *   pmap_clear(table)              store 0
  *   pmap_clear_bits(table, bits)   clear 'bits'
  *   pmap_load_clear(table)         swap in 0, yielding the old value
  *   pmap_load_store(table, entry)  swap in 'entry', yielding the old value
  *   pmap_set_bits(table, bits)     set 'bits'
  *   pmap_store(table, entry)       store 'entry'
  *
  * pmap_load(table) is a plain dereference.  Its argument is parenthesized
  * so that an expression such as pmap_load(base + idx) reads the intended
  * entry rather than expanding to (*base + idx).
  */
 #define	pmap_clear(table)		atomic_store_64(table, 0)
 #define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
 #define	pmap_load(table)		(*(table))
 #define	pmap_load_clear(table)		atomic_swap_64(table, 0)
 #define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
 #define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
 #define	pmap_store(table, entry)	atomic_store_64(table, entry)
 
 /********************/
 /* Inline functions */
 /********************/
 
 /*
  * Copy one page (PAGE_SIZE bytes) from 's' to 'd'.  Note the argument
  * order: the source comes first, the destination second.
  */
 static __inline void
 pagecopy(void *s, void *d)
 {
 	(void)memcpy(d, s, PAGE_SIZE);
 }
 
 /*
  * Return a pointer to the L0 entry of 'pmap' that covers 'va'.
  */
 static __inline pd_entry_t *
 pmap_l0(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l0_table;
 
 	l0_table = pmap->pm_l0;
 	return (l0_table + pmap_l0_index(va));
 }
 
 /*
  * Given an L0 entry, return a pointer to the L1 entry covering 'va' in
  * the L1 table that the L0 entry references.  The table is reached
  * through the direct map; the L0 entry's type is not checked here.
  */
 static __inline pd_entry_t *
 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
 {
 	pd_entry_t *l1_table;
 
 	l1_table = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
 	return (l1_table + pmap_l1_index(va));
 }
 
 /*
  * Return a pointer to the L1 entry of 'pmap' that covers 'va', or NULL
  * when the L0 entry is not a table descriptor.
  */
 static __inline pd_entry_t *
 pmap_l1(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l0_entry;
 
 	l0_entry = pmap_l0(pmap, va);
 	if ((pmap_load(l0_entry) & ATTR_DESCR_MASK) == L0_TABLE)
 		return (pmap_l0_to_l1(l0_entry, va));
 
 	return (NULL);
 }
 
 /*
  * Given an L1 entry, return a pointer to the L2 entry covering 'va' in
  * the L2 table that the L1 entry references.  The table is reached
  * through the direct map; the L1 entry's type is not checked here.
  */
 static __inline pd_entry_t *
 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
 {
 	pd_entry_t *l2_table;
 
 	l2_table = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
 	return (l2_table + pmap_l2_index(va));
 }
 
 /*
  * Return a pointer to the L2 entry of 'pmap' that covers 'va', or NULL
  * when the walk cannot reach an L2 table: either the L0 entry is not a
  * table (pmap_l1() returns NULL) or the L1 entry is not a table.
  */
 static __inline pd_entry_t *
 pmap_l2(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l1;
 
 	l1 = pmap_l1(pmap, va);
 	/* pmap_l1() yields NULL for an invalid L0 entry; do not load it. */
 	if (l1 == NULL)
 		return (NULL);
 	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
 		return (NULL);
 
 	return (pmap_l1_to_l2(l1, va));
 }
 
 /*
  * Given an L2 entry, return a pointer to the L3 entry covering 'va' in
  * the L3 table that the L2 entry references.  The table is reached
  * through the direct map; the L2 entry's type is not checked here.
  */
 static __inline pt_entry_t *
 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
 {
 	pt_entry_t *l3;
 
 	/* The L3 table holds page entries, so cast to pt_entry_t *. */
 	l3 = (pt_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK);
 	return (&l3[pmap_l3_index(va)]);
 }
 
 /*
  * Walk the table descriptors for 'va' and return the deepest entry that
  * is itself a table descriptor, storing its level in '*level':
  *   -1 and NULL  the L0 entry is not a table,
  *    0 and L0    the L1 entry is not a table,
  *    1 and L1    the L2 entry is not a table,
  *    2 and L2    the L2 entry is a table.
  * What the returned entry's table holds for 'va' may or may not be a
  * valid page or block.
  */
 static __inline pd_entry_t *
 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
 {
 	pd_entry_t *l0e, *l1e, *l2e;
 
 	l0e = pmap_l0(pmap, va);
 	if ((pmap_load(l0e) & ATTR_DESCR_MASK) != L0_TABLE) {
 		*level = -1;
 		return (NULL);
 	}
 
 	l1e = pmap_l0_to_l1(l0e, va);
 	if ((pmap_load(l1e) & ATTR_DESCR_MASK) != L1_TABLE) {
 		*level = 0;
 		return (l0e);
 	}
 
 	l2e = pmap_l1_to_l2(l1e, va);
 	if ((pmap_load(l2e) & ATTR_DESCR_MASK) == L2_TABLE) {
 		*level = 2;
 		return (l2e);
 	}
 
 	*level = 1;
 	return (l1e);
 }
 
 /*
  * Find the block or page entry that maps 'va'.  On success the entry is
  * returned and '*level' is the level it was found at (1 or 2 for a block,
  * 3 for a page).  If no valid mapping exists NULL is returned and
  * '*level' is the level at which the walk stopped (0 when the L0 entry
  * is not a table).
  */
 static __inline pt_entry_t *
 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
 {
 	pd_entry_t *l1e, *l2e, type;
 	pt_entry_t *l3e;
 
 	l1e = pmap_l1(pmap, va);
 	if (l1e == NULL) {
 		*level = 0;
 		return (NULL);
 	}
 
 	/* Level 1: a block ends the walk, anything but a table fails it. */
 	*level = 1;
 	type = pmap_load(l1e) & ATTR_DESCR_MASK;
 	if (type == L1_BLOCK)
 		return (l1e);
 	if (type != L1_TABLE)
 		return (NULL);
 
 	/* Level 2: same rules as level 1. */
 	*level = 2;
 	l2e = pmap_l1_to_l2(l1e, va);
 	type = pmap_load(l2e) & ATTR_DESCR_MASK;
 	if (type == L2_BLOCK)
 		return (l2e);
 	if (type != L2_TABLE)
 		return (NULL);
 
 	/* Level 3: only a page descriptor is a valid mapping. */
 	*level = 3;
 	l3e = pmap_l2_to_l3(l2e, va);
 	if ((pmap_load(l3e) & ATTR_DESCR_MASK) == L3_PAGE)
 		return (l3e);
 
 	return (NULL);
 }
 
 /*
  * Report whether superpage mappings are enabled.  The answer comes from
  * the global 'superpages_enabled' setting; the pmap argument is unused.
  */
 bool
 pmap_ps_enabled(pmap_t pmap __unused)
 {
 	bool enabled;
 
 	enabled = (superpages_enabled != 0);
 	return (enabled);
 }
 
 /*
  * Walk the page tables of 'pmap' for 'va' and hand back pointers to the
  * entries at each level.
  *
  * Returns true when the walk ends at an L1 block, an L2 block or an L3
  * entry.  Levels below a block are reported as NULL.  Returns false when
  * the pmap has no L0 table or when an entry on the way is neither a block
  * nor a table; in that case only the outputs for the levels already
  * visited have been written.
  */
 bool
 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
     pd_entry_t **l2, pt_entry_t **l3)
 {
 	pd_entry_t *l0e, *l1e, *l2e, type;
 
 	if (pmap->pm_l0 == NULL)
 		return (false);
 
 	l0e = pmap_l0(pmap, va);
 	*l0 = l0e;
 	type = pmap_load(l0e) & ATTR_DESCR_MASK;
 	if (type != L0_TABLE)
 		return (false);
 
 	l1e = pmap_l0_to_l1(l0e, va);
 	*l1 = l1e;
 	type = pmap_load(l1e) & ATTR_DESCR_MASK;
 	if (type == L1_BLOCK) {
 		/* The mapping ends here; there are no lower levels. */
 		*l2 = NULL;
 		*l3 = NULL;
 		return (true);
 	}
 	if (type != L1_TABLE)
 		return (false);
 
 	l2e = pmap_l1_to_l2(l1e, va);
 	*l2 = l2e;
 	type = pmap_load(l2e) & ATTR_DESCR_MASK;
 	if (type == L2_BLOCK) {
 		*l3 = NULL;
 		return (true);
 	}
 	if (type != L2_TABLE)
 		return (false);
 
 	/* The L3 entry is returned without checking that it is valid. */
 	*l3 = pmap_l2_to_l3(l2e, va);
 	return (true);
 }
 
 /*
  * Return non-zero when the L3 entry value 'l3' is a page descriptor.
  */
 static __inline int
 pmap_l3_valid(pt_entry_t l3)
 {
 	pt_entry_t type;
 
 	type = l3 & ATTR_DESCR_MASK;
 	return (type == L3_PAGE);
 }
 
 
 CTASSERT(L1_BLOCK == L2_BLOCK);
 
 /*
  * Return non-zero when the managed entry 'pte' is dirty, i.e. it carries
  * ATTR_SW_DBM and its access permission field is read/write.  Asserts
  * that the entry is managed and that a writeable entry has ATTR_SW_DBM.
  */
 static inline int
 pmap_pte_dirty(pt_entry_t pte)
 {
 	pt_entry_t state;
 
 	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
 
 	state = pte & (ATTR_AP_RW_BIT | ATTR_SW_DBM);
 	KASSERT(state != 0,
 	    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
 
 	return (state == (ATTR_AP(ATTR_AP_RW) | ATTR_SW_DBM));
 }
 
 /*
  * Add 'count' to the pmap's resident page count.  The pmap lock must be
  * held.
  */
 static __inline void
 pmap_resident_count_inc(pmap_t pmap, int count)
 {
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	pmap->pm_stats.resident_count = pmap->pm_stats.resident_count + count;
 }
 
 /*
  * Subtract 'count' from the pmap's resident page count, asserting that
  * the counter does not underflow.  The pmap lock must be held.
  */
 static __inline void
 pmap_resident_count_dec(pmap_t pmap, int count)
 {
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	KASSERT(pmap->pm_stats.resident_count >= count,
 	    ("pmap %p resident count underflow %ld %d", pmap,
 	    pmap->pm_stats.resident_count, count));
 
 	pmap->pm_stats.resident_count = pmap->pm_stats.resident_count - count;
 }
 
 /*
  * Early-boot helper: compute the L1 and L2 slot numbers for 'va' and
  * return the bootstrap L2 table (init_pt_va).  Asserts that the L1 slot
  * in the table at 'l1pt' holds a table descriptor.
  */
 static pt_entry_t *
 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
     u_int *l2_slot)
 {
 	pd_entry_t *boot_l1;
 	u_int slot;
 
 	boot_l1 = (pd_entry_t *)l1pt;
 	slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
 	*l1_slot = slot;
 
 	/* locore is expected to have installed a table entry here. */
 	KASSERT((boot_l1[slot] & ATTR_DESCR_MASK) == L1_TABLE,
 	   ("Invalid bootstrap L1 table"));
 
 	*l2_slot = pmap_l2_index(va);
 
 	/* The L2 table is always the bootstrap one. */
 	return ((pt_entry_t *)init_pt_va);
 }
 
 /*
  * Early-boot virtual to physical translation: take the output address of
  * the bootstrap L2 entry covering 'va' and add the offset of 'va' within
  * that L2 region.
  */
 static vm_paddr_t
 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
 {
 	pt_entry_t *boot_l2, entry;
 	u_int l1_idx, l2_idx;
 
 	boot_l2 = pmap_early_page_idx(l1pt, va, &l1_idx, &l2_idx);
 	entry = boot_l2[l2_idx];
 
 	return ((entry & ~ATTR_MASK) + (va & L2_OFFSET));
 }
 
 /*
  * Build the direct map in pagetable_dmap for every range in physmap[].
  *
  * Each range is mapped in up to three pieces: L2 blocks for a head that
  * is not aligned to an L1 boundary, L1 blocks while at least L1_SIZE of
  * the range remains, and L2 blocks for whatever is left.  Each L2 table
  * that is needed is carved out of 'freemempos', one page at a time.
  *
  * Sets dmap_phys_base, dmap_phys_max and dmap_max_addr, flushes the TLB,
  * and returns the updated 'freemempos'.  'kern_l1' is only used to
  * translate the address of a newly taken L2 table to a physical address.
  */
 static vm_offset_t
 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa,
     vm_offset_t freemempos)
 {
 	pt_entry_t *l2;
 	vm_offset_t va;
 	vm_paddr_t l2_pa, pa;
 	u_int l1_slot, l2_slot, prev_l1_slot;
 	int i;
 
 	dmap_phys_base = min_pa & ~L1_OFFSET;
 	dmap_phys_max = 0;
 	dmap_max_addr = 0;
 	l2 = NULL;
 	/* No L1 slot has an L2 table yet. */
 	prev_l1_slot = -1;
 
 	/* Same definition as the file-scope DMAP_TABLES. */
 #define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
 	memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES);
 
 	/* physmap[] holds (start, end) pairs. */
 	for (i = 0; i < (physmap_idx * 2); i += 2) {
 		pa = physmap[i] & ~L2_OFFSET;
 		va = pa - dmap_phys_base + DMAP_MIN_ADDRESS;
 
 		/* Create L2 mappings at the start of the region */
 		if ((pa & L1_OFFSET) != 0) {
 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
 			/* Take a fresh L2 table unless this slot already has one. */
 			if (l1_slot != prev_l1_slot) {
 				prev_l1_slot = l1_slot;
 				l2 = (pt_entry_t *)freemempos;
 				l2_pa = pmap_early_vtophys(kern_l1,
 				    (vm_offset_t)l2);
 				freemempos += PAGE_SIZE;
 
 				pmap_store(&pagetable_dmap[l1_slot],
 				    (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
 
 				memset(l2, 0, PAGE_SIZE);
 			}
 			KASSERT(l2 != NULL,
 			    ("pmap_bootstrap_dmap: NULL l2 map"));
 			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
 			    pa += L2_SIZE, va += L2_SIZE) {
 				/*
 				 * We are on a boundary, stop to
 				 * create a level 1 block
 				 */
 				if ((pa & L1_OFFSET) == 0)
 					break;
 
 				l2_slot = pmap_l2_index(va);
 				KASSERT(l2_slot != 0, ("..."));
 				pmap_store(&l2[l2_slot],
 				    (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN |
 				    ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
 			}
 			KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS),
 			    ("..."));
 		}
 
 		/* Map with L1 blocks while at least L1_SIZE of the range remains. */
 		for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] &&
 		    (physmap[i + 1] - pa) >= L1_SIZE;
 		    pa += L1_SIZE, va += L1_SIZE) {
 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
 			pmap_store(&pagetable_dmap[l1_slot],
 			    (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_XN |
 			    ATTR_IDX(CACHED_MEMORY) | L1_BLOCK);
 		}
 
 		/* Create L2 mappings at the end of the region */
 		if (pa < physmap[i + 1]) {
 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
 			if (l1_slot != prev_l1_slot) {
 				prev_l1_slot = l1_slot;
 				l2 = (pt_entry_t *)freemempos;
 				l2_pa = pmap_early_vtophys(kern_l1,
 				    (vm_offset_t)l2);
 				freemempos += PAGE_SIZE;
 
 				pmap_store(&pagetable_dmap[l1_slot],
 				    (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
 
 				memset(l2, 0, PAGE_SIZE);
 			}
 			KASSERT(l2 != NULL,
 			    ("pmap_bootstrap_dmap: NULL l2 map"));
 			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
 			    pa += L2_SIZE, va += L2_SIZE) {
 				l2_slot = pmap_l2_index(va);
 				pmap_store(&l2[l2_slot],
 				    (pa & ~L2_OFFSET) | ATTR_DEFAULT | ATTR_XN |
 				    ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
 			}
 		}
 
 		/* Track the highest physical/virtual address mapped so far. */
 		if (pa > dmap_phys_max) {
 			dmap_phys_max = pa;
 			dmap_max_addr = va;
 		}
 	}
 
 	cpu_tlb_flushID();
 
 	return (freemempos);
 }
 
 /*
  * Install L2 tables for the kernel address range from 'va' (which must be
  * L1 aligned) up to VM_MAX_KERNEL_ADDRESS.  One page per L1 slot is taken
  * consecutively from 'l2_start'; the new tables are zeroed after the L1
  * entries have been written.  Returns the first address after the last
  * table taken.
  */
 static vm_offset_t
 pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
 {
 	pd_entry_t *l1_table;
 	vm_offset_t next_table;
 	vm_paddr_t table_pa;
 	u_int slot;
 
 	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
 
 	l1_table = (pd_entry_t *)l1pt;
 	next_table = l2_start;
 	slot = pmap_l1_index(va);
 
 	while (va < VM_MAX_KERNEL_ADDRESS) {
 		KASSERT(slot < Ln_ENTRIES, ("Invalid L1 index"));
 
 		/* Point this L1 slot at the next free page. */
 		table_pa = pmap_early_vtophys(l1pt, next_table);
 		pmap_store(&l1_table[slot],
 		    (table_pa & ~Ln_TABLE_MASK) | L1_TABLE);
 
 		next_table += PAGE_SIZE;
 		slot++;
 		va += L1_SIZE;
 	}
 
 	/* Zero every L2 table that was just handed out. */
 	memset((void *)l2_start, 0, next_table - l2_start);
 
 	return (next_table);
 }
 
 /*
  * Install L3 tables for the kernel address range from 'va' (which must be
  * L2 aligned) up to VM_MAX_KERNEL_ADDRESS.  One page per L2 slot is taken
  * consecutively from 'l3_start'; the new tables are zeroed after the L2
  * entries have been written.  Returns the first address after the last
  * table taken.
  */
 static vm_offset_t
 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
 {
 	pd_entry_t *l2_table;
 	vm_offset_t next_table;
 	vm_paddr_t table_pa;
 	u_int slot;
 
 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
 
 	/* Find the L2 entry for 'va', then step back to its table's base. */
 	l2_table = pmap_l2(kernel_pmap, va);
 	l2_table = (pd_entry_t *)rounddown2((uintptr_t)l2_table, PAGE_SIZE);
 	slot = pmap_l2_index(va);
 	next_table = l3_start;
 
 	while (va < VM_MAX_KERNEL_ADDRESS) {
 		KASSERT(slot < Ln_ENTRIES, ("Invalid L2 index"));
 
 		/* Point this L2 slot at the next free page. */
 		table_pa = pmap_early_vtophys(l1pt, next_table);
 		pmap_store(&l2_table[slot],
 		    (table_pa & ~Ln_TABLE_MASK) | L2_TABLE);
 
 		next_table += PAGE_SIZE;
 		slot++;
 		va += L2_SIZE;
 	}
 
 	/* Zero every L3 table that was just handed out. */
 	memset((void *)l3_start, 0, next_table - l3_start);
 
 	return (next_table);
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	'l0pt' and 'l1pt' are the addresses of the initial L0 and L1 tables,
  *	'kernstart' is the physical address that KERNBASE is mapped to and
  *	'kernlen' the length of the kernel image.
  *
  *	In order: records the kernel L0 table, collects the physical memory
  *	map, builds the direct map, measures how much locore already mapped,
  *	creates the remaining kernel L2 tables and the L3 tables for the
  *	early devmap, allocates the dynamic per-cpu area and the message
  *	buffer, sets up the pre-init mapping window and the
  *	virtual_avail/virtual_end bounds, and finally excludes the physical
  *	memory consumed so far from allocation.
  */
 void
 pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
     vm_size_t kernlen)
 {
 	u_int l1_slot, l2_slot;
 	pt_entry_t *l2;
 	vm_offset_t va, freemempos;
 	vm_offset_t dpcpu, msgbufpv;
 	vm_paddr_t start_pa, pa, min_pa;
 	uint64_t kern_delta;
 	int i;
 
 #ifdef notyet
 	/* Determine whether the hardware implements DBM management. */
 	uint64_t reg = READ_SPECIALREG(ID_AA64MMFR1_EL1);
 	ATTR_SW_DBM = ID_AA64MMFR1_HAFDBS(reg) == ID_AA64MMFR1_HAFDBS_AF_DBS ?
 	    ATTR_DBM : _ATTR_SW_DBM;
 #else
 	ATTR_SW_DBM = _ATTR_SW_DBM;
 #endif
 
 	/* Offset between the kernel's virtual and physical addresses. */
 	kern_delta = KERNBASE - kernstart;
 
 	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
 	printf("%lx\n", l1pt);
 	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
 
 	/* Set this early so we can use the pagetable walking functions */
 	kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
 	PMAP_LOCK_INIT(kernel_pmap);
 
 	/* Assume the address we were loaded to is a valid physical address */
 	min_pa = KERNBASE - kern_delta;
 
 	/* arm_physmem_avail() returns an entry count; keep the pair count. */
 	physmap_idx = arm_physmem_avail(physmap, nitems(physmap));
 	physmap_idx /= 2;
 
 	/*
 	 * Find the minimum physical address. physmap is sorted,
 	 * but may contain empty ranges.
 	 */
 	for (i = 0; i < (physmap_idx * 2); i += 2) {
 		if (physmap[i] == physmap[i + 1])
 			continue;
 		if (physmap[i] <= min_pa)
 			min_pa = physmap[i];
 	}
 
 	/* Early allocations start at the first page after the kernel image. */
 	freemempos = KERNBASE + kernlen;
 	freemempos = roundup2(freemempos, PAGE_SIZE);
 
 	/* Create a direct map region early so we can use it for pa -> va */
 	freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos);
 
 	va = KERNBASE;
 	start_pa = pa = KERNBASE - kern_delta;
 
 	/*
 	 * Read the page table to find out what is already mapped.
 	 * This assumes we have mapped a block of memory from KERNBASE
 	 * using a single L1 entry.
 	 */
 	l2 = pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
 
 	/* Sanity check the index, KERNBASE should be the first VA */
 	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
 
 	/* Find how many pages we have mapped */
 	for (; l2_slot < Ln_ENTRIES; l2_slot++) {
 		if ((l2[l2_slot] & ATTR_DESCR_MASK) == 0)
 			break;
 
 		/* Check locore used L2 blocks */
 		KASSERT((l2[l2_slot] & ATTR_DESCR_MASK) == L2_BLOCK,
 		    ("Invalid bootstrap L2 table"));
 		KASSERT((l2[l2_slot] & ~ATTR_MASK) == pa,
 		    ("Incorrect PA in L2 table"));
 
 		va += L2_SIZE;
 		pa += L2_SIZE;
 	}
 
 	va = roundup2(va, L1_SIZE);
 
 	/* Create the l2 tables up to VM_MAX_KERNEL_ADDRESS */
 	freemempos = pmap_bootstrap_l2(l1pt, va, freemempos);
 	/* And the l3 tables for the early devmap */
 	freemempos = pmap_bootstrap_l3(l1pt,
 	    VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos);
 
 	cpu_tlb_flushID();
 
 	/*
 	 * Take 'np' zeroed pages from freemempos and store their start
 	 * address in 'var'.  This expands to several statements, so it
 	 * must only be used as a statement of its own.
 	 */
 #define alloc_pages(var, np)						\
 	(var) = freemempos;						\
 	freemempos += (np * PAGE_SIZE);					\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
 
 	/* Allocate dynamic per-cpu area. */
 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
 	dpcpu_init((void *)dpcpu, 0);
 
 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
 	msgbufp = (void *)msgbufpv;
 
 	/* Reserve some VA space for early BIOS/ACPI mapping */
 	preinit_map_va = roundup2(freemempos, L2_SIZE);
 
 	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
 	virtual_avail = roundup2(virtual_avail, L1_SIZE);
 	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
 	kernel_vm_end = virtual_avail;
 
 	/* Physical address of the end of everything allocated above. */
 	pa = pmap_early_vtophys(l1pt, freemempos);
 
 	/* Keep the kernel image and the early allocations out of the allocator. */
 	arm_physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
 
 	cpu_tlb_flushID();
 }
 
 /*
  *	Set up the machine-dependent part of a vm_page: default the memory
  *	attribute to write-back and start with an empty pv list.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
 	TAILQ_INIT(&m->md.pv_list);
 }
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init to set up the structures the pmap system needs
  *	for mapping virtual memory: the superpage setting, the pv chunk
  *	mutex, the pool of pv list locks and the superpage pv head table.
  */
 void
 pmap_init(void)
 {
 	vm_size_t table_bytes;
 	int idx, nheads;
 
 	/* Honour the tunable and, if enabled, advertise the L2 page size. */
 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
 	if (superpages_enabled != 0) {
 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 		    ("pmap_init: can't assign to pagesizes[1]"));
 		pagesizes[1] = L2_SIZE;
 	}
 
 	/* The mutex protecting the pv chunk list. */
 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 
 	/* The pool of pv list locks. */
 	for (idx = 0; idx < NPV_LIST_LOCKS; idx++)
 		rw_init(&pv_list_locks[idx], "pmap pv list");
 
 	/*
 	 * One pv list head per L2_SIZE of physical memory, up to the end
 	 * of the last physical segment.
 	 */
 	nheads = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
 
 	/* Allocate the zeroed, page-rounded table and set up each list. */
 	table_bytes = (vm_size_t)(nheads * sizeof(struct md_page));
 	table_bytes = round_page(table_bytes);
 	pv_table = (struct md_page *)kmem_malloc(table_bytes,
 	    M_WAITOK | M_ZERO);
 	for (idx = 0; idx < nheads; idx++)
 		TAILQ_INIT(&pv_table[idx].pv_list);
 	TAILQ_INIT(&pv_dummy.pv_list);
 
 	vm_initialized = 1;
 }
 
 /*
  * Read-only counters of 2MB (L2) mapping events, exported under the
  * vm.pmap.l2 sysctl node.
  */
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
     "2MB page mapping counters");
 
 static u_long pmap_l2_demotions;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
     &pmap_l2_demotions, 0, "2MB page demotions");
 
 static u_long pmap_l2_mappings;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
     &pmap_l2_mappings, 0, "2MB page mappings");
 
 static u_long pmap_l2_p_failures;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
     &pmap_l2_p_failures, 0, "2MB page promotion failures");
 
 static u_long pmap_l2_promotions;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
     &pmap_l2_promotions, 0, "2MB page promotions");
 
 /*
  * Invalidate a single TLB entry.
  *
  * Issues "tlbi vaae1is" for the page containing 'va', bracketed by
  * barriers, while the thread is pinned.  The 'pmap' argument is not
  * used by the instruction sequence.
  */
 static __inline void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
 	sched_pin();
 	__asm __volatile(
 	    "dsb  ishst		\n"
 	    "tlbi vaae1is, %0	\n"
 	    "dsb  ish		\n"
 	    "isb		\n"
 	    : : "r"(va >> PAGE_SHIFT));
 	sched_unpin();
 }
 
 /*
  * Invalidate the TLB entries for every page in [sva, eva), one
  * "tlbi vaae1is" per page, with a single barrier before and after the
  * loop.  Unlike pmap_invalidate_range() this does not pin the thread.
  * The 'pmap' argument is not used.
  */
 static __inline void
 pmap_invalidate_range_nopin(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t addr;
 
 	dsb(ishst);
 	for (addr = sva; addr < eva; addr += PAGE_SIZE) {
 		__asm __volatile(
 		    "tlbi vaae1is, %0" : : "r"(addr >> PAGE_SHIFT));
 	}
 	__asm __volatile(
 	    "dsb  ish	\n"
 	    "isb	\n");
 }
 
 /*
  * Invalidate the TLB entries for every page in [sva, eva) with the
  * thread pinned for the duration.
  */
 static __inline void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 
 	sched_pin();
 	pmap_invalidate_range_nopin(pmap, sva, eva);
 	sched_unpin();
 }
 
 /*
  * Invalidate TLB entries with "tlbi vmalle1is", bracketed by barriers,
  * while the thread is pinned.  The 'pmap' argument is not used by the
  * instruction sequence.
  */
 static __inline void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	sched_pin();
 	__asm __volatile(
 	    "dsb  ishst		\n"
 	    "tlbi vmalle1is	\n"
 	    "dsb  ish		\n"
 	    "isb		\n");
 	sched_unpin();
 }
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Return the physical address that 'va' maps to in 'pmap',
  *		or 0 when there is no valid block or page mapping.
  *		The pmap lock is taken for the duration of the lookup.
  */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	pt_entry_t *entry, val;
 	vm_offset_t offset;
 	vm_paddr_t result;
 	int level;
 
 	result = 0;
 	PMAP_LOCK(pmap);
 	/* pmap_pte() yields a valid block/page entry or NULL. */
 	entry = pmap_pte(pmap, va, &level);
 	if (entry != NULL) {
 		val = pmap_load(entry);
 		/* The offset within the mapping depends on its level. */
 		if (level == 1) {
 			KASSERT((val & ATTR_DESCR_MASK) == L1_BLOCK,
 			    ("pmap_extract: Invalid L1 pte found: %lx",
 			    val & ATTR_DESCR_MASK));
 			offset = va & L1_OFFSET;
 		} else if (level == 2) {
 			KASSERT((val & ATTR_DESCR_MASK) == L2_BLOCK,
 			    ("pmap_extract: Invalid L2 pte found: %lx",
 			    val & ATTR_DESCR_MASK));
 			offset = va & L2_OFFSET;
 		} else if (level == 3) {
 			KASSERT((val & ATTR_DESCR_MASK) == L3_PAGE,
 			    ("pmap_extract: Invalid L3 pte found: %lx",
 			    val & ATTR_DESCR_MASK));
 			offset = va & L3_OFFSET;
 		} else {
 			offset = 0;
 		}
 		result = (val & ~ATTR_MASK) | offset;
 	}
 	PMAP_UNLOCK(pmap);
 
 	return (result);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Atomically extract and hold the physical page
  *		with the given pmap and virtual address pair
  *		if that mapping permits the given protection.
  *
  *	Returns the wired vm_page, or NULL when 'va' has no valid mapping
  *	or when write access was requested and the mapping is not
  *	read/write.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pt_entry_t *pte, tpte;
 	vm_offset_t off;
 	vm_paddr_t pa;
 	vm_page_t m;
 	int lvl;
 
 	pa = 0;
 	m = NULL;
 	PMAP_LOCK(pmap);
 retry:
 	pte = pmap_pte(pmap, va, &lvl);
 	if (pte != NULL) {
 		tpte = pmap_load(pte);
 
 		KASSERT(lvl > 0 && lvl <= 3,
 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
 		CTASSERT(L1_BLOCK == L2_BLOCK);
 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
 		     tpte & ATTR_DESCR_MASK));
 		/* Proceed if the mapping is read/write or no write is wanted. */
 		if (((tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)) ||
 		    ((prot & VM_PROT_WRITE) == 0)) {
 			/* Offset of 'va' within a block mapping; 0 for a page. */
 			switch(lvl) {
 			case 1:
 				off = va & L1_OFFSET;
 				break;
 			case 2:
 				off = va & L2_OFFSET;
 				break;
 			case 3:
 			default:
 				off = 0;
 			}
 			/* A non-zero return means the lookup must be redone. */
 			if (vm_page_pa_tryrelock(pmap,
 			    (tpte & ~ATTR_MASK) | off, &pa))
 				goto retry;
 			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
 			vm_page_wire(m);
 		}
 	}
 	PA_UNLOCK_COND(pa);
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /*
  * Return the physical address that the kernel virtual address 'va' maps
  * to.  Addresses inside the direct map are converted arithmetically;
  * everything else is looked up in the kernel page tables, yielding 0
  * when there is no valid mapping.  No lock is taken.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	pt_entry_t *entry, val;
 	vm_offset_t offset;
 	int level;
 
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
 		return (DMAP_TO_PHYS(va));
 
 	entry = pmap_pte(kernel_pmap, va, &level);
 	if (entry == NULL)
 		return (0);
 
 	val = pmap_load(entry);
 	/* The offset within the mapping depends on its level. */
 	if (level == 1) {
 		KASSERT((val & ATTR_DESCR_MASK) == L1_BLOCK,
 		    ("pmap_kextract: Invalid L1 pte found: %lx",
 		    val & ATTR_DESCR_MASK));
 		offset = va & L1_OFFSET;
 	} else if (level == 2) {
 		KASSERT((val & ATTR_DESCR_MASK) == L2_BLOCK,
 		    ("pmap_kextract: Invalid L2 pte found: %lx",
 		    val & ATTR_DESCR_MASK));
 		offset = va & L2_OFFSET;
 	} else if (level == 3) {
 		KASSERT((val & ATTR_DESCR_MASK) == L3_PAGE,
 		    ("pmap_kextract: Invalid L3 pte found: %lx",
 		    val & ATTR_DESCR_MASK));
 		offset = va & L3_OFFSET;
 	} else {
 		offset = 0;
 	}
 
 	return ((val & ~ATTR_MASK) | offset);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Map 'size' bytes of physical memory starting at 'pa' into the kernel
  * at 'sva', one L3 page at a time, using memory attribute index 'mode'.
  * Device memory is additionally mapped execute-never.  All three of
  * 'sva', 'pa' and 'size' must be page aligned, and the L3 tables for the
  * range must already exist.  The range is invalidated in the TLB
  * afterwards.
  */
 void
 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
 {
 	pd_entry_t *l2e;
 	pt_entry_t *l3e, bits;
 	vm_offset_t va;
 	int level;
 
 	KASSERT((pa & L3_OFFSET) == 0,
 	   ("pmap_kenter: Invalid physical address"));
 	KASSERT((sva & L3_OFFSET) == 0,
 	   ("pmap_kenter: Invalid virtual address"));
 	KASSERT((size & PAGE_MASK) == 0,
 	    ("pmap_kenter: Mapping is not page-sized"));
 
 	bits = ATTR_DEFAULT | ATTR_IDX(mode) | L3_PAGE;
 	if (mode == DEVICE_MEMORY)
 		bits |= ATTR_XN;
 
 	for (va = sva; size != 0;
 	    va += PAGE_SIZE, pa += PAGE_SIZE, size -= PAGE_SIZE) {
 		l2e = pmap_pde(kernel_pmap, va, &level);
 		KASSERT(l2e != NULL,
 		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
 		KASSERT(level == 2, ("pmap_kenter: Invalid level %d", level));
 
 		l3e = pmap_l2_to_l3(l2e, va);
 		pmap_load_store(l3e, (pa & ~L3_OFFSET) | bits);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  * Map a physical range into the kernel as device memory; a thin wrapper
  * around pmap_kenter() with mode DEVICE_MEMORY.
  */
 void
 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
 {
 
 	pmap_kenter(sva, size, pa, DEVICE_MEMORY);
 }
 
 /*
  * Remove the kernel mapping of the single page at 'va': clear its L3
  * entry and invalidate the page in the TLB.  The address must currently
  * be mapped by an L3 page entry.
  */
 PMAP_INLINE void
 pmap_kremove(vm_offset_t va)
 {
 	pt_entry_t *l3e;
 	int level;
 
 	l3e = pmap_pte(kernel_pmap, va, &level);
 	KASSERT(l3e != NULL, ("pmap_kremove: Invalid address"));
 	KASSERT(level == 3, ("pmap_kremove: Invalid pte level %d", level));
 
 	pmap_clear(l3e);
 	pmap_invalidate_page(kernel_pmap, va);
 }
 
 /*
  * Remove the kernel mappings for 'size' bytes starting at 'sva'.  Both
  * values must be page aligned and every page in the range must be mapped
  * by an L3 page entry.  The range is invalidated in the TLB afterwards.
  */
 void
 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
 {
 	pt_entry_t *l3e;
 	vm_offset_t va;
 	int level;
 
 	KASSERT((sva & L3_OFFSET) == 0,
 	   ("pmap_kremove_device: Invalid virtual address"));
 	KASSERT((size & PAGE_MASK) == 0,
 	    ("pmap_kremove_device: Mapping is not page-sized"));
 
 	for (va = sva; size != 0; va += PAGE_SIZE, size -= PAGE_SIZE) {
 		l3e = pmap_pte(kernel_pmap, va, &level);
 		KASSERT(l3e != NULL, ("Invalid page table, va: 0x%lx", va));
 		KASSERT(level == 3,
 		    ("Invalid device pagetable level: %d != 3", level));
 		pmap_clear(l3e);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  *	Used to map a range of physical addresses into kernel
  *	virtual address space.
  *
  *	This implementation answers from the direct map: it returns the
  *	direct-map address of 'start' and creates no new mapping.  '*virt',
  *	'end' and 'prot' are not consulted and '*virt' is left unchanged.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 	vm_offset_t dmap_va;
 
 	dmap_va = PHYS_TO_DMAP(start);
 	return (dmap_va);
 }
 
 
 /*
  * Map the 'count' pages in 'ma' at consecutive kernel virtual addresses
  * starting at 'sva'.  Intended for temporary kernel mappings that do not
  * need page modification or reference tracking.  Existing mappings are
  * simply overwritten, and the pages *must* be wired.
  *
  * Each page is mapped read/write with its own memory attribute; device
  * memory is additionally mapped execute-never.  The whole range is
  * invalidated in the TLB afterwards.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pd_entry_t *l2e;
 	pt_entry_t entry;
 	vm_offset_t va;
 	vm_page_t pg;
 	int idx, level;
 
 	for (idx = 0, va = sva; idx < count; idx++, va += L3_SIZE) {
 		l2e = pmap_pde(kernel_pmap, va, &level);
 		KASSERT(l2e != NULL,
 		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
 		KASSERT(level == 2,
 		    ("pmap_qenter: Invalid level %d", level));
 
 		pg = ma[idx];
 		entry = VM_PAGE_TO_PHYS(pg) | ATTR_DEFAULT |
 		    ATTR_AP(ATTR_AP_RW) | ATTR_IDX(pg->md.pv_memattr) |
 		    L3_PAGE;
 		if (pg->md.pv_memattr == DEVICE_MEMORY)
 			entry |= ATTR_XN;
 
 		pmap_load_store(pmap_l2_to_l3(l2e, va), entry);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  * Tear out 'count' page mappings from the kernel starting at 'sva'.
  * Meant only for temporary mappings.  Pages that have no valid L3 entry
  * are skipped; the whole range is invalidated in the TLB afterwards.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	pt_entry_t *l3e;
 	vm_offset_t va;
 	int idx, level;
 
 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
 
 	for (idx = 0, va = sva; idx < count; idx++, va += PAGE_SIZE) {
 		l3e = pmap_pte(kernel_pmap, va, &level);
 		KASSERT(level == 3,
 		    ("Invalid device pagetable level: %d != 3", level));
 		if (l3e == NULL)
 			continue;
 		pmap_clear(l3e);
 	}
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 /*
  * Queue the unused page table page 'm' on the list 'free' so that it can
  * be released to the physical memory manager once the TLB has been
  * updated.  The page's PG_ZERO flag is set or cleared according to
  * 'set_PG_ZERO'.
  */
 static __inline void
 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
     boolean_t set_PG_ZERO)
 {
 	if (!set_PG_ZERO)
 		m->flags &= ~PG_ZERO;
 	else
 		m->flags |= PG_ZERO;
 
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
 
 /*
  * Drop one reference from a page table page's wire count, which records
  * the number of valid page table entries within the page.  When the
  * count reaches zero the page table page is unmapped via
  * _pmap_unwire_l3().  Returns TRUE if the page was unmapped and FALSE
  * otherwise.
  */
 static inline boolean_t
 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 	if (--m->wire_count != 0)
 		return (FALSE);
 
 	_pmap_unwire_l3(pmap, va, m, free);
 	return (TRUE);
 }
 
 /*
  * Unmap the page table page 'm', whose wire count has dropped to zero.
  *
  * The level of the page is derived from its pindex: at or above
  * (NUL2E + NUL1E) it is an L1 page, at or above NUL2E an L2 page, and
  * otherwise an L3 page.  The entry in the parent table is cleared, the
  * resident count is decremented, and the parent page table page loses
  * one reference in turn, which may unmap it as well.  Finally 'va' is
  * invalidated in the TLB and the page is queued on 'free'.
  * The pmap lock must be held.
  */
 static void
 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/*
 	 * unmap the page table page
 	 */
 	if (m->pindex >= (NUL2E + NUL1E)) {
 		/* l1 page */
 		pd_entry_t *l0;
 
 		l0 = pmap_l0(pmap, va);
 		pmap_clear(l0);
 	} else if (m->pindex >= NUL2E) {
 		/* l2 page */
 		pd_entry_t *l1;
 
 		l1 = pmap_l1(pmap, va);
 		pmap_clear(l1);
 	} else {
 		/* l3 page */
 		pd_entry_t *l2;
 
 		l2 = pmap_l2(pmap, va);
 		pmap_clear(l2);
 	}
 	pmap_resident_count_dec(pmap, 1);
 	/* An L1 page has no parent page table page to release. */
 	if (m->pindex < NUL2E) {
 		/* We just released an l3, unhold the matching l2 */
 		pd_entry_t *l1, tl1;
 		vm_page_t l2pg;
 
 		l1 = pmap_l1(pmap, va);
 		tl1 = pmap_load(l1);
 		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
 		pmap_unwire_l3(pmap, va, l2pg, free);
 	} else if (m->pindex < (NUL2E + NUL1E)) {
 		/* We just released an l2, unhold the matching l1 */
 		pd_entry_t *l0, tl0;
 		vm_page_t l1pg;
 
 		l0 = pmap_l0(pmap, va);
 		tl0 = pmap_load(l0);
 		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
 		pmap_unwire_l3(pmap, va, l1pg, free);
 	}
 	pmap_invalidate_page(pmap, va);
 
 	/*
 	 * Put page on a list so that it is released after
 	 * *ALL* TLB shootdown is done
 	 */
 	pmap_add_delayed_free_list(m, free, TRUE);
 }
 
 /*
  * Called after a page table entry has been removed: drop one reference
  * on the page table page named by 'ptepde', which may free it.  Kernel
  * addresses (at or above VM_MAXUSER_ADDRESS) are left alone.  Returns
  * the result of pmap_unwire_l3(), or 0 for kernel addresses.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
     struct spglist *free)
 {
 	vm_page_t ptpage;
 
 	if (va < VM_MAXUSER_ADDRESS) {
 		KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 		ptpage = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
 		return (pmap_unwire_l3(pmap, va, ptpage, free));
 	}
 
 	return (0);
 }
 
 /*
  * Initialize "pmap" so that it uses the kernel pmap's level 0 table instead
  * of a table of its own, with zeroed statistics and an empty pm_root.
  */
 void
 pmap_pinit0(pmap_t pmap)
 {
 
 	PMAP_LOCK_INIT(pmap);
 
 	/* Borrow the kernel's l0 table; no page is allocated here. */
 	pmap->pm_l0 = kernel_pmap->pm_l0;
 	pmap->pm_root.rt_root = 0;
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 }
 
 /*
  * Initialize "pmap" with a freshly allocated, zeroed level 0 table, zeroed
  * statistics and an empty pm_root.  Always returns 1.
  */
 int
 pmap_pinit(pmap_t pmap)
 {
 	vm_page_t l0pg;
 
 	/*
 	 * Allocate the wired l0 page, calling vm_wait() and retrying for as
 	 * long as the allocation fails.
 	 */
 	for (;;) {
 		l0pg = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 		if (l0pg != NULL)
 			break;
 		vm_wait(NULL);
 	}
 
 	/* The l0 table is accessed through its direct map address. */
 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l0pg));
 
 	/* Zero the page here unless the allocator flagged it PG_ZERO. */
 	if ((l0pg->flags & PG_ZERO) == 0)
 		pagezero(pmap->pm_l0);
 
 	pmap->pm_root.rt_root = 0;
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 
 	return (1);
 }
 
 /*
  * This routine is called if the desired page table page does not exist.
  *
  * "ptepindex" selects both the level and the slot of the page to allocate:
  * values below NUL2E name an l3 page, values in [NUL2E, NUL2E + NUL1E) an
  * l2 page, and values at or above NUL2E + NUL1E an l1 page.  Missing
  * parent pages are allocated by recursion; parents that already exist get
  * their wire count incremented instead.
  *
  * Returns the new, wired page table page, or NULL if an allocation failed.
  *
  * If page table page allocation fails, this routine may sleep before
  * returning NULL.  It sleeps only if a lock pointer was given.
  *
  * Note: If a page allocation fails at page table level two or three,
  * one or two pages may be held during the wait, only to be released
  * afterwards.  This conservative approach is easily argued to avoid
  * race conditions.
  */
 static vm_page_t
 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 {
 	vm_page_t m, l1pg, l2pg;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (lockp != NULL) {
 			/*
 			 * Drop the PV list lock and the pmap lock around
 			 * the wait, then retake only the pmap lock.
 			 */
 			RELEASE_PV_LIST_LOCK(lockp);
 			PMAP_UNLOCK(pmap);
 			vm_wait(NULL);
 			PMAP_LOCK(pmap);
 		}
 
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 	/* Zero the page here unless the allocator flagged it PG_ZERO. */
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	if (ptepindex >= (NUL2E + NUL1E)) {
 		/* New l1 page: install it directly in the l0 table. */
 		pd_entry_t *l0;
 		vm_pindex_t l0index;
 
 		l0index = ptepindex - (NUL2E + NUL1E);
 		l0 = &pmap->pm_l0[l0index];
 		pmap_store(l0, VM_PAGE_TO_PHYS(m) | L0_TABLE);
 	} else if (ptepindex >= NUL2E) {
 		/* New l2 page: make sure its l1 parent exists first. */
 		vm_pindex_t l0index, l1index;
 		pd_entry_t *l0, *l1;
 		pd_entry_t tl0;
 
 		l1index = ptepindex - NUL2E;
 		l0index = l1index >> L0_ENTRIES_SHIFT;
 
 		l0 = &pmap->pm_l0[l0index];
 		tl0 = pmap_load(l0);
 		if (tl0 == 0) {
 			/* recurse for allocating page dir */
 			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
 			    lockp) == NULL) {
 				/* Parent allocation failed: give "m" back. */
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
 				return (NULL);
 			}
 		} else {
 			/* The l1 page exists; it gains one more child. */
 			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
 			l1pg->wire_count++;
 		}
 
 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
 		l1 = &l1[ptepindex & Ln_ADDR_MASK];
 		pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
 	} else {
 		/* New l3 page: make sure its l2 parent exists first. */
 		vm_pindex_t l0index, l1index;
 		pd_entry_t *l0, *l1, *l2;
 		pd_entry_t tl0, tl1;
 
 		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
 		l0index = l1index >> L0_ENTRIES_SHIFT;
 
 		l0 = &pmap->pm_l0[l0index];
 		tl0 = pmap_load(l0);
 		if (tl0 == 0) {
 			/*
 			 * recurse for allocating page dir; with the l0
 			 * entry empty, allocating the l2 page recurses once
 			 * more to create the l1 page as well
 			 */
 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
 			    lockp) == NULL) {
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
 				return (NULL);
 			}
 			/* Reload: the recursion filled in the l0 entry. */
 			tl0 = pmap_load(l0);
 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
 			l1 = &l1[l1index & Ln_ADDR_MASK];
 		} else {
 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
 			l1 = &l1[l1index & Ln_ADDR_MASK];
 			tl1 = pmap_load(l1);
 			if (tl1 == 0) {
 				/* recurse for allocating page dir */
 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
 				    lockp) == NULL) {
 					vm_page_unwire_noq(m);
 					vm_page_free_zero(m);
 					return (NULL);
 				}
 			} else {
 				/* The l2 page exists; it gains a child. */
 				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
 				l2pg->wire_count++;
 			}
 		}
 
 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
 		pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
 	}
 
 	/* Each page table page counts toward the resident count. */
 	pmap_resident_count_inc(pmap, 1);
 
 	return (m);
 }
 
 /*
  * Return the L2 page table page that covers "va", taking a new wire
  * reference on it.  If the L1 entry is not a table descriptor, the page is
  * allocated with _pmap_alloc_l3().  When that allocation fails, the lookup
  * is repeated if "lockp" is not NULL; otherwise NULL is returned.
  */
 static vm_page_t
 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	pd_entry_t *l1;
 	vm_page_t l2pg;
 	vm_pindex_t l2pindex;
 
 	for (;;) {
 		l1 = pmap_l1(pmap, va);
 		if (l1 == NULL ||
 		    (pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE) {
 			/* Allocate a L2 page. */
 			l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
 			l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
 			if (l2pg != NULL || lockp == NULL)
 				return (l2pg);
 			/* The allocation failed; look the entry up again. */
 			continue;
 		}
 
 		/* Add a reference to the L2 page. */
 		l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK);
 		l2pg->wire_count++;
 		return (l2pg);
 	}
 }
 
 /*
  * Return the L3 page table page that covers "va", taking a new wire
  * reference on it.  If pmap_pde() reports a non-zero level 2 entry, the
  * page it points to is reused; otherwise a page is allocated with
  * _pmap_alloc_l3().  When that allocation fails, the lookup is repeated if
  * "lockp" is not NULL; otherwise NULL is returned.
  */
 static vm_page_t
 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t ptepindex;
 	pd_entry_t *pde, tpde;
 #ifdef INVARIANTS
 	pt_entry_t *pte;
 #endif
 	vm_page_t m;
 	int lvl;
 
 	/*
 	 * Calculate pagetable page index
 	 */
 	ptepindex = pmap_l2_pindex(va);
 retry:
 	/*
 	 * Get the page directory entry
 	 */
 	pde = pmap_pde(pmap, va, &lvl);
 
 	/*
 	 * If the page table page is mapped, we just increment the hold count,
 	 * and activate it. If we get a level 2 pde it will point to a level 3
 	 * table.
 	 */
 	switch (lvl) {
 	case -1:
 		/* NOTE(review): presumably no valid entry at any level. */
 		break;
 	case 0:
 #ifdef INVARIANTS
 		/* The walk stopped at level 0: the l1 entry must be empty. */
 		pte = pmap_l0_to_l1(pde, va);
 		KASSERT(pmap_load(pte) == 0,
 		    ("pmap_alloc_l3: TODO: l0 superpages"));
 #endif
 		break;
 	case 1:
 #ifdef INVARIANTS
 		/* The walk stopped at level 1: the l2 entry must be empty. */
 		pte = pmap_l1_to_l2(pde, va);
 		KASSERT(pmap_load(pte) == 0,
 		    ("pmap_alloc_l3: TODO: l1 superpages"));
 #endif
 		break;
 	case 2:
 		tpde = pmap_load(pde);
 		if (tpde != 0) {
 			/* The L3 page exists: take another reference. */
 			m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
 			m->wire_count++;
 			return (m);
 		}
 		break;
 	default:
 		panic("pmap_alloc_l3: Invalid level %d", lvl);
 	}
 
 	/*
 	 * Here if the pte page isn't mapped, or if it has been deallocated.
 	 */
 	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
 	if (m == NULL && lockp != NULL)
 		goto retry;
 
 	return (m);
 }
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Free the level 0 page table page of a pmap that was set up by
  * pmap_pinit().  The pmap must no longer hold any valid mappings: both a
  * zero resident count and an empty pm_root tree are asserted.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	vm_paddr_t l0pa;
 	vm_page_t l0pg;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap has reserved page table page(s)"));
 
 	/* pm_l0 is a direct map address; recover the page behind it. */
 	l0pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_l0);
 	l0pg = PHYS_TO_VM_PAGE(l0pa);
 
 	vm_page_unwire_noq(l0pg);
 	vm_page_free_zero(l0pg);
 }
 
 /*
  * Sysctl handler for vm.kvm_size: reports the distance between
  * VM_MIN_KERNEL_ADDRESS and VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long size;
 
 	size = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 	return (sysctl_handle_long(oidp, &size, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD,
     0, 0, kvm_size, "LU", "Size of KVM");
 
 /*
  * Sysctl handler for vm.kvm_free: reports the distance between
  * kernel_vm_end and VM_MAX_KERNEL_ADDRESS.
  */
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long remaining;
 
 	remaining = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 	return (sysctl_handle_long(oidp, &remaining, 0, req));
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD,
     0, 0, kvm_free, "LU", "Amount of KVM free");
 
 /*
  * grow the number of kernel page table entries, if needed
  *
  * Advances kernel_vm_end in L2_SIZE steps until it reaches "addr" (rounded
  * up to an L2_SIZE boundary and clamped to the end of kernel_map),
  * allocating and installing page table pages for every empty L1 and L2
  * entry on the way.  Panics if a page cannot be allocated.  The kernel
  * map's system mutex must be held.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	vm_paddr_t paddr;
 	vm_page_t nkpg;
 	pd_entry_t *l0, *l1, *l2;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 
 	addr = roundup2(addr, L2_SIZE);
 	/* "addr - 1" keeps the comparison valid if roundup2() wrapped to 0. */
 	if (addr - 1 >= vm_map_max(kernel_map))
 		addr = vm_map_max(kernel_map);
 	while (kernel_vm_end < addr) {
 		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
 		KASSERT(pmap_load(l0) != 0,
 		    ("pmap_growkernel: No level 0 kernel entry"));
 
 		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
 		if (pmap_load(l1) == 0) {
 			/*
 			 * Empty L1 entry: allocate a page for the next
 			 * level table and install it as an L1_TABLE entry.
 			 */
 			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
 			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 			if (nkpg == NULL)
 				panic("pmap_growkernel: no memory to grow kernel");
 			if ((nkpg->flags & PG_ZERO) == 0)
 				pmap_zero_page(nkpg);
 			paddr = VM_PAGE_TO_PHYS(nkpg);
 			pmap_store(l1, paddr | L1_TABLE);
 			continue; /* try again */
 		}
 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
 		if ((pmap_load(l2) & ATTR_AF) != 0) {
 			/*
 			 * An L2 entry with ATTR_AF set is treated as already
 			 * populated: skip to the next L2_SIZE boundary.
 			 */
 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 				kernel_vm_end = vm_map_max(kernel_map);
 				break;
 			}
 			continue;
 		}
 
 		/* Allocate a page table page and install it in the L2 entry. */
 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 		    VM_ALLOC_ZERO);
 		if (nkpg == NULL)
 			panic("pmap_growkernel: no memory to grow kernel");
 		if ((nkpg->flags & PG_ZERO) == 0)
 			pmap_zero_page(nkpg);
 		paddr = VM_PAGE_TO_PHYS(nkpg);
 		pmap_load_store(l2, paddr | L2_TABLE);
 		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
 
 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 			kernel_vm_end = vm_map_max(kernel_map);
 			break;
 		}
 	}
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 /*
  * Compile-time checks on the pv chunk layout that the code below relies
  * on: a chunk is exactly one page, its free bitmap has three words, and it
  * holds 168 pv entries.
  */
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 3);
 CTASSERT(_NPCPV == 168);
 
 /*
  * Return the pv chunk that contains "pv", found by clearing the page
  * offset bits of the entry's address.
  */
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 	uintptr_t chunk_base;
 
 	chunk_base = (uintptr_t)pv & ~(uintptr_t)PAGE_MASK;
 	return ((struct pv_chunk *)chunk_base);
 }
 
 /* Map a pv entry to the pmap recorded in its enclosing chunk. */
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 /*
  * Bitmap values of a completely free chunk, one per pc_map[] word.  A set
  * bit marks a free pv entry.  Together the masks have 64 + 64 + 40 = 168
  * set bits, which matches the asserted value of _NPCPV.
  */
 #define	PC_FREE0	0xfffffffffffffffful
 #define	PC_FREE1	0xfffffffffffffffful
 #define	PC_FREE2	0x000000fffffffffful
 
 /* The same masks as an array, indexed like pc_map[]. */
 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 
 /*
  * PV statistics counters and their vm.pmap sysctls.  The whole block is
  * compiled out by the enclosing "#if 0", independent of PV_STATS.
  */
 #if 0
 #ifdef PV_STATS
 /* Counters for pv chunks. */
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 	"Current number of pv entry chunks");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 	"Current number of pv entry chunks allocated");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 	"Current number of pv entry chunks frees");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 	"Number of times tried to get a chunk page but failed.");
 
 /* Counters for individual pv entries. */
 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 static int pv_entry_spare;
 
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 	"Current number of pv entry frees");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 	"Current number of pv entry allocs");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 	"Current number of pv entries");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
 #endif
 #endif /* 0 */
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.
  *
  * Returns NULL if PV entries were reclaimed from the specified pmap.
  *
  * Otherwise the result is a page the caller can reuse: the page behind a
  * chunk that became entirely free or, failing that, the first page table
  * page freed while destroying mappings (handed back with wire_count set to
  * 1).  Any other freed page table pages are released before returning.
  * NULL is also returned when no page could be obtained.
  *
  * "locked_pmap" must be locked by the caller and "lockp" must not be NULL;
  * the PV list lock it refers to may be released or changed.
  *
  * We do not, however, unmap 2mpages because subsequent accesses will
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
  */
 static vm_page_t
 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 {
 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
 	struct md_page *pvh;
 	pd_entry_t *pde;
 	pmap_t next_pmap, pmap;
 	pt_entry_t *pte, tpte;
 	pv_entry_t pv;
 	vm_offset_t va;
 	vm_page_t m, m_pc;
 	struct spglist free;
 	uint64_t inuse;
 	int bit, field, freed, lvl;
 	static int active_reclaims = 0;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
 
 	pmap = NULL;
 	m_pc = NULL;
 	SLIST_INIT(&free);
 	/*
 	 * The markers are zeroed chunk headers, so their pc_pmap is NULL;
 	 * that is how the scan below tells a marker from a real chunk.
 	 */
 	bzero(&pc_marker_b, sizeof(pc_marker_b));
 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
 	pc_marker = (struct pv_chunk *)&pc_marker_b;
 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
 
 	mtx_lock(&pv_chunks_mutex);
 	active_reclaims++;
 	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
 	/* Scan until the end marker or until a page table page was freed. */
 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
 	    SLIST_EMPTY(&free)) {
 		next_pmap = pc->pc_pmap;
 		if (next_pmap == NULL) {
 			/*
 			 * The next chunk is a marker.  However, it is
 			 * not our marker, so active_reclaims must be
 			 * > 1.  Consequently, the next_chunk code
 			 * will not rotate the pv_chunks list.
 			 */
 			goto next_chunk;
 		}
 		mtx_unlock(&pv_chunks_mutex);
 
 		/*
 		 * A pv_chunk can only be removed from the pc_lru list
 		 * when both pv_chunks_mutex is owned and the
 		 * corresponding pmap is locked.
 		 */
 		if (pmap != next_pmap) {
 			if (pmap != NULL && pmap != locked_pmap)
 				PMAP_UNLOCK(pmap);
 			pmap = next_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap) {
 				RELEASE_PV_LIST_LOCK(lockp);
 				PMAP_LOCK(pmap);
 				mtx_lock(&pv_chunks_mutex);
 				continue;
 			} else if (pmap != locked_pmap) {
 				if (PMAP_TRYLOCK(pmap)) {
 					mtx_lock(&pv_chunks_mutex);
 					continue;
 				} else {
 					pmap = NULL; /* pmap is not locked */
 					mtx_lock(&pv_chunks_mutex);
 					pc = TAILQ_NEXT(pc_marker, pc_lru);
 					if (pc == NULL ||
 					    pc->pc_pmap != next_pmap)
 						continue;
 					goto next_chunk;
 				}
 			}
 		}
 
 		/*
 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
 		 */
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/* A clear bit in pc_map marks an entry in use. */
 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 			    inuse != 0; inuse &= ~(1UL << bit)) {
 				bit = ffsl(inuse) - 1;
 				pv = &pc->pc_pventry[field * 64 + bit];
 				va = pv->pv_va;
 				pde = pmap_pde(pmap, va, &lvl);
 				/* Only mappings reached through a level 2 pde. */
 				if (lvl != 2)
 					continue;
 				pte = pmap_l2_to_l3(pde, va);
 				tpte = pmap_load(pte);
 				if ((tpte & ATTR_SW_WIRED) != 0)
 					continue;
 				tpte = pmap_load_clear(pte);
 				pmap_invalidate_page(pmap, va);
 				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
 				/* Carry dirty/referenced state over to the page. */
 				if (pmap_pte_dirty(tpte))
 					vm_page_dirty(m);
 				if ((tpte & ATTR_AF) != 0)
 					vm_page_aflag_set(m, PGA_REFERENCED);
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 				m->md.pv_gen++;
 				if (TAILQ_EMPTY(&m->md.pv_list) &&
 				    (m->flags & PG_FICTITIOUS) == 0) {
 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						vm_page_aflag_clear(m,
 						    PGA_WRITEABLE);
 					}
 				}
 				/* Mark the pv entry free in the chunk bitmap. */
 				pc->pc_map[field] |= 1UL << bit;
 				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
 				freed++;
 			}
 		}
 		if (freed == 0) {
 			mtx_lock(&pv_chunks_mutex);
 			goto next_chunk;
 		}
 		/* Every freed mapping is for a 4 KB page. */
 		pmap_resident_count_dec(pmap, freed);
 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 		    pc->pc_map[2] == PC_FREE2) {
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 			/* Entire chunk is free; return it. */
 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 			dump_drop_page(m_pc->phys_addr);
 			mtx_lock(&pv_chunks_mutex);
 			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 			break;
 		}
 		/* Partially free: put it at the head of the pmap's list. */
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		mtx_lock(&pv_chunks_mutex);
 		/* One freed pv entry in locked_pmap is sufficient. */
 		if (pmap == locked_pmap)
 			break;
 
 next_chunk:
 		/* Advance our marker past the chunk just examined. */
 		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
 		if (active_reclaims == 1 && pmap != NULL) {
 			/*
 			 * Rotate the pv chunks list so that we do not
 			 * scan the same pv chunks that could not be
 			 * freed (because they contained a wired
 			 * and/or superpage mapping) on every
 			 * invocation of reclaim_pv_chunk().
 			 */
 			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
 				MPASS(pc->pc_pmap != NULL);
 				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 			}
 		}
 	}
 	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
 	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
 	active_reclaims--;
 	mtx_unlock(&pv_chunks_mutex);
 	if (pmap != NULL && pmap != locked_pmap)
 		PMAP_UNLOCK(pmap);
 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
 		m_pc = SLIST_FIRST(&free);
 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 		/* Recycle a freed page table page. */
 		m_pc->wire_count = 1;
 	}
 	/* Release whatever freed page table pages are left on the list. */
 	vm_page_free_pages_toq(&free, true);
 	return (m_pc);
 }
 
 /*
  * Return the pv entry "pv" to its chunk.  A chunk that still has entries
  * in use is moved to the head of the pmap's chunk list; a chunk that has
  * become entirely free is unlinked from the pmap and released.  The pmap
  * lock must be held.
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	struct pv_chunk *chunk;
 	int slot;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 
 	/* Set the entry's bit in the chunk bitmap to mark it free. */
 	chunk = pv_to_chunk(pv);
 	slot = pv - &chunk->pc_pventry[0];
 	chunk->pc_map[slot / 64] |= 1ul << (slot % 64);
 
 	if (chunk->pc_map[0] == PC_FREE0 && chunk->pc_map[1] == PC_FREE1 &&
 	    chunk->pc_map[2] == PC_FREE2) {
 		/* Every entry is free again: give the whole chunk back. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, chunk, pc_list);
 		free_pv_chunk(chunk);
 		return;
 	}
 
 	/* 98% of the time, the chunk is already at the head of the list. */
 	if (__predict_false(chunk != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, chunk, pc_list);
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, chunk, pc_list);
 	}
 }
 
 /*
  * Release the pv chunk "pc": unlink it from the global pv_chunks list and
  * free the page behind it.  The pmap's own pm_pvchunk list is not touched
  * here; free_pv_entry() unlinks the chunk from it before calling.
  */
 static void
 free_pv_chunk(struct pv_chunk *pc)
 {
 	vm_page_t chunk_pg;
 
 	/* The global chunk list is modified under pv_chunks_mutex. */
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 
 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 
 	/* The chunk is a direct map address; find its page and free it. */
 	chunk_pg = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 	dump_drop_page(chunk_pg->phys_addr);
 	vm_page_unwire_noq(chunk_pg);
 	vm_page_free(chunk_pg);
 }
 
 /*
  * Returns a new PV entry, allocating a new PV chunk from the system when
  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
  * returned.
  *
  * Only the chunk at the head of the pmap's chunk list is searched for a
  * free entry.  The pmap lock must be held.
  *
  * The given PV list lock may be released.
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 {
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		/* Find the first bitmap word with a free (set) bit. */
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = ffsl(pc->pc_map[field]) - 1;
 				break;
 			}
 		}
 		if (field < _NPCM) {
 			pv = &pc->pc_pventry[field * 64 + bit];
 			/* Clear the bit to mark the entry in use. */
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 			    pc->pc_map[2] == 0) {
 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 				    pc_list);
 			}
 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 			return (pv);
 		}
 	}
 	/* No free items, allocate another chunk */
 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED);
 	if (m == NULL) {
 		if (lockp == NULL) {
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
 		m = reclaim_pv_chunk(pmap, lockp);
 		/*
 		 * No page came back; search this pmap's chunk list again,
 		 * since the reclaim may have freed entries in it.
 		 */
 		if (m == NULL)
 			goto retry;
 	}
 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 	dump_add_page(m->phys_addr);
 	/* The new chunk is addressed through the direct map. */
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	pc->pc_map[2] = PC_FREE2;
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 	/* Entry 0 is the one handed out; its bit was cleared above. */
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 	return (pv);
 }
 
 /*
  * Ensure that the number of spare PV entries in the specified pmap meets or
  * exceeds the given count, "needed".
  *
  * Spare entries are counted from the head of the pmap's chunk list up to
  * the first chunk with no free entry.  Any shortfall is covered by
  * allocating whole chunks, falling back to reclaim_pv_chunk() when page
  * allocation fails.  The pmap lock must be held and "lockp" must not be
  * NULL.
  *
  * The given PV list lock may be released.
  */
 static void
 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 {
 	struct pch new_tail;
 	struct pv_chunk *pc;
 	vm_page_t m;
 	int avail, free;
 	bool reclaimed;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 
 	/*
 	 * Newly allocated PV chunks must be stored in a private list until
 	 * the required number of PV chunks have been allocated.  Otherwise,
 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
 	 * contrast, these chunks must be added to the pmap upon allocation.
 	 */
 	TAILQ_INIT(&new_tail);
 retry:
 	avail = 0;
 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 		/* Count the set (free) bits in this chunk's bitmap. */
 		bit_count((bitstr_t *)pc->pc_map, 0,
 		    sizeof(pc->pc_map) * NBBY, &free);
 		if (free == 0)
 			break;
 		avail += free;
 		if (avail >= needed)
 			break;
 	}
 	/* Each new chunk contributes _NPCPV spare entries. */
 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 		    VM_ALLOC_WIRED);
 		if (m == NULL) {
 			m = reclaim_pv_chunk(pmap, lockp);
 			if (m == NULL)
 				goto retry;
 			reclaimed = true;
 		}
 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 		dump_add_page(m->phys_addr);
 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 		pc->pc_pmap = pmap;
 		pc->pc_map[0] = PC_FREE0;
 		pc->pc_map[1] = PC_FREE1;
 		pc->pc_map[2] = PC_FREE2;
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
 
 		/*
 		 * The reclaim might have freed a chunk from the current pmap.
 		 * If that chunk contained available entries, we need to
 		 * re-count the number of available entries.
 		 */
 		if (reclaimed)
 			goto retry;
 	}
 	/* Publish the privately held chunks on the global chunk list. */
 	if (!TAILQ_EMPTY(&new_tail)) {
 		mtx_lock(&pv_chunks_mutex);
 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 		mtx_unlock(&pv_chunks_mutex);
 	}
 }
 
 /*
  * Search the given pv list for the entry that belongs to "pmap" and maps
  * "va".  If one exists, unlink it from the list, bump the list's generation
  * count and return it; otherwise return NULL.  Works on pv lists for both
  * 4KB and 2MB page mappings.
  */
 static __inline pv_entry_t
 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t pv;
 
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		if (pmap != PV_PMAP(pv) || va != pv->pv_va)
 			continue;
 		TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 		pvh->pv_gen++;
 		return (pv);
 	}
 	return (NULL);
 }
 
 /*
  * After demotion from a 2MB page mapping to 512 4KB page mappings,
  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
  * entries for each of the 4KB page mappings.
  *
  * "va" and "pa" must both be 2MB aligned.  The spare pv entries consumed
  * here are taken straight from the pmap's chunk list without allocating,
  * so they must already be available (see the "missing spare" assertion).
  * The pmap lock must be held.
  */
 static void
 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	struct pv_chunk *pc;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 	int bit, field;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((va & L2_OFFSET) == 0,
 	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
 	KASSERT((pa & L2_OFFSET) == 0,
 	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the 2mpage's pv entry for this mapping to the first
 	 * page's pv list.  Once this transfer begins, the pv list lock
 	 * must not be released until the last pv entry is reinstantiated.
 	 */
 	pvh = pa_to_pvh(pa);
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 	m->md.pv_gen++;
 	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
 	va_last = va + L2_SIZE - PAGE_SIZE;
 	for (;;) {
 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
 		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
 		for (field = 0; field < _NPCM; field++) {
 			while (pc->pc_map[field]) {
 				/* Take the lowest free entry of this word. */
 				bit = ffsl(pc->pc_map[field]) - 1;
 				pc->pc_map[field] &= ~(1ul << bit);
 				pv = &pc->pc_pventry[field * 64 + bit];
 				/* Advance to the next 4KB page of the range. */
 				va += PAGE_SIZE;
 				pv->pv_va = va;
 				m++;
 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 			    ("pmap_pv_demote_l2: page %p is not managed", m));
 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 				m->md.pv_gen++;
 				if (va == va_last)
 					goto out;
 			}
 		}
 		/* The head chunk is now full; move it to the tail. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 out:
 	/* If the last chunk used became full, move it to the tail too. */
 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
 	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
 }
 
 /*
  * Unlink the pv entry for the given pmap and virtual address from the given
  * pv list and hand it back to free_pv_entry().  The entry is asserted to
  * exist.  Works on pv lists for both 4KB and 2MB page mappings.
  */
 static void
 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t victim;
 
 	victim = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(victim != NULL, ("pmap_pvh_free: pv not found"));
 	free_pv_entry(pmap, victim);
 }
 
 /*
  * Try to create the PV entry for a 4KB page mapping of "m" at "va" without
  * resorting to reclamation.  Returns TRUE if the entry was allocated and
  * appended to the page's pv list, and FALSE if no entry could be obtained.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct rwlock **lockp)
 {
 	pv_entry_t pv;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Pass NULL instead of the lock pointer to disable reclamation. */
 	pv = get_pv_entry(pmap, NULL);
 	if (pv == NULL)
 		return (FALSE);
 
 	pv->pv_va = va;
 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 	m->md.pv_gen++;
 	return (TRUE);
 }
 
 /*
  * Create the PV entry for a 2MB page mapping described by "l2e" at "va".
  * With PMAP_ENTER_NORECLAIM set in "flags", the entry is allocated without
  * reclamation and false is returned if that fails.  Without the flag, the
  * lock pointer is passed on so that get_pv_entry() may reclaim.
  */
 static bool
 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
     struct rwlock **lockp)
 {
 	struct rwlock **reclaim_lockp;
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_paddr_t pa;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Pass NULL instead of the lock pointer to disable reclamation. */
 	if ((flags & PMAP_ENTER_NORECLAIM) != 0)
 		reclaim_lockp = NULL;
 	else
 		reclaim_lockp = lockp;
 	pv = get_pv_entry(pmap, reclaim_lockp);
 	if (pv == NULL)
 		return (false);
 
 	pv->pv_va = va;
 	pa = l2e & ~ATTR_MASK;
 
 	/* The entry goes on the pv list of the 2MB page, not of a 4KB page. */
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 	pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 	pvh->pv_gen++;
 	return (true);
 }
 
 /*
  * Replace a kernel L2 entry that the caller has already cleared with a
  * table entry pointing at the L3 page table page saved for "va".
  *
  * The saved page is taken out of the pmap with pmap_remove_pt_page(); it
  * is a panic if none exists.  "pmap" must be the kernel pmap, "va" must
  * not lie in the direct map, and the pmap lock must be held.
  */
 static void
 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
 {
 	pt_entry_t newl2, oldl2;
 	vm_page_t ml3;
 	vm_paddr_t ml3pa;
 
 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	ml3 = pmap_remove_pt_page(pmap, va);
 	if (ml3 == NULL)
 		panic("pmap_remove_kernel_l2: Missing pt page");
 
 	ml3pa = VM_PAGE_TO_PHYS(ml3);
 	newl2 = ml3pa | L2_TABLE;
 
 	/*
 	 * If this page table page was unmapped by a promotion, then it
 	 * contains valid mappings.  Zero it to invalidate those mappings.
 	 */
 	if (ml3->valid != 0)
 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
 
 	/*
 	 * Demote the mapping.  The caller must have already invalidated the
 	 * mapping (i.e., the "break" in break-before-make).
 	 */
 	oldl2 = pmap_load_store(l2, newl2);
 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
 	    __func__, l2, oldl2));
 }
 
 /*
  * pmap_remove_l2: Do the things to unmap a level 2 superpage.
  *
  * Clears the L2 block entry for the 2MB-aligned address "sva", invalidates
  * it, adjusts the wired and resident counts, and for a managed mapping
  * frees its pv entry and updates the state of every 4KB page it covers.
  * "l1e" is the L1 entry that points at the L2 page; the return value is
  * that of pmap_unuse_pt() for it.  The pmap lock must be held.
  */
 static int
 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pt_entry_t old_l2;
 	vm_offset_t eva, va;
 	vm_page_t m, ml3;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
 	old_l2 = pmap_load_clear(l2);
 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
 	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
 
 	/*
 	 * Since a promotion must break the 4KB page mappings before making
 	 * the 2MB page mapping, a pmap_invalidate_page() suffices.
 	 */
 	pmap_invalidate_page(pmap, sva);
 
 	/* One superpage accounts for L2_SIZE / PAGE_SIZE 4KB pages. */
 	if (old_l2 & ATTR_SW_WIRED)
 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
 	if (old_l2 & ATTR_SW_MANAGED) {
 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
 		pvh = pa_to_pvh(old_l2 & ~ATTR_MASK);
 		pmap_pvh_free(pvh, pmap, sva);
 		eva = sva + L2_SIZE;
 		/* Carry the superpage's state over to each 4KB page. */
 		for (va = sva, m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
 		    va < eva; va += PAGE_SIZE, m++) {
 			if (pmap_pte_dirty(old_l2))
 				vm_page_dirty(m);
 			if (old_l2 & ATTR_AF)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
 	}
 	if (pmap == kernel_pmap) {
 		/* Kernel: put the saved L3 table back into the L2 entry. */
 		pmap_remove_kernel_l2(pmap, l2, sva);
 	} else {
 		/* User: free the saved L3 page table page, if there is one. */
 		ml3 = pmap_remove_pt_page(pmap, sva);
 		if (ml3 != NULL) {
 			KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
 			    ("pmap_remove_l2: l3 page not promoted"));
 			pmap_resident_count_dec(pmap, 1);
 			KASSERT(ml3->wire_count == NL3PG,
 			    ("pmap_remove_l2: l3 page wire count error"));
 			ml3->wire_count = 0;
 			pmap_add_delayed_free_list(ml3, free, FALSE);
 		}
 	}
 	return (pmap_unuse_pt(pmap, sva, l1e, free));
 }
 
 /*
  * pmap_remove_l3: do the things to unmap a page in a process
  *
  * Clears the L3 entry for "va", invalidates it, adjusts the wired and
  * resident counts, and for a managed mapping updates the page's dirty and
  * referenced state and frees its pv entry.  "l2e" is the L2 entry that
  * points at the L3 page; the return value is that of pmap_unuse_pt() for
  * it.  The pmap lock must be held.
  */
-static int __unused
+static int
 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pt_entry_t old_l3;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	old_l3 = pmap_load_clear(l3);
 	pmap_invalidate_page(pmap, va);
 	if (old_l3 & ATTR_SW_WIRED)
 		pmap->pm_stats.wired_count -= 1;
 	pmap_resident_count_dec(pmap, 1);
 	if (old_l3 & ATTR_SW_MANAGED) {
 		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
 		/* Carry dirty/referenced state over to the page. */
 		if (pmap_pte_dirty(old_l3))
 			vm_page_dirty(m);
 		if (old_l3 & ATTR_AF)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 		pmap_pvh_free(&m->md, pmap, va);
 		/*
 		 * With neither a 4KB nor a 2MB pv entry left, the page is
 		 * no longer mapped writeable anywhere.
 		 */
 		if (TAILQ_EMPTY(&m->md.pv_list) &&
 		    (m->flags & PG_FICTITIOUS) == 0) {
 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 			if (TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
 	}
 	return (pmap_unuse_pt(pmap, va, l2e, free));
 }
 
 /*
  * Remove the specified range of addresses from the L3 page table that is
  * identified by the given L2 entry.
  *
  * The range [sva, eva) must not cross an L3 page table boundary.  TLB
  * invalidations are batched: "va" holds the start of the current run of
  * removed mappings that still needs invalidating, or "eva" when no run is
  * pending.  The loop stops early once pmap_unuse_pt() reports that the L3
  * page table page itself was unmapped.  The pmap lock must be held.
  */
 static void
 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
     vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	struct rwlock *new_lock;
 	pt_entry_t *l3, old_l3;
 	vm_offset_t va;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
 	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
 	va = eva;
 	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
 		if (!pmap_l3_valid(pmap_load(l3))) {
 			/* A hole ends the pending run; flush it now. */
 			if (va != eva) {
 				pmap_invalidate_range(pmap, va, sva);
 				va = eva;
 			}
 			continue;
 		}
 		old_l3 = pmap_load_clear(l3);
 		if ((old_l3 & ATTR_SW_WIRED) != 0)
 			pmap->pm_stats.wired_count--;
 		pmap_resident_count_dec(pmap, 1);
 		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
 			m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
 			if (pmap_pte_dirty(old_l3))
 				vm_page_dirty(m);
 			if ((old_l3 & ATTR_AF) != 0)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			/* Switch to the PV list lock that covers this page. */
 			new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m));
 			if (new_lock != *lockp) {
 				if (*lockp != NULL) {
 					/*
 					 * Pending TLB invalidations must be
 					 * performed before the PV list lock is
 					 * released.  Otherwise, a concurrent
 					 * pmap_remove_all() on a physical page
 					 * could return while a stale TLB entry
 					 * still provides access to that page.
 					 */
 					if (va != eva) {
 						pmap_invalidate_range(pmap, va,
 						    sva);
 						va = eva;
 					}
 					rw_wunlock(*lockp);
 				}
 				*lockp = new_lock;
 				rw_wlock(*lockp);
 			}
 			pmap_pvh_free(&m->md, pmap, sva);
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    (m->flags & PG_FICTITIOUS) == 0) {
 				pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 				if (TAILQ_EMPTY(&pvh->pv_list))
 					vm_page_aflag_clear(m, PGA_WRITEABLE);
 			}
 		}
 		/* Start a new pending run if none is open. */
 		if (va == eva)
 			va = sva;
 		if (pmap_unuse_pt(pmap, sva, l2e, free)) {
 			/*
 			 * The L3 page table page was unmapped.  Step past
 			 * the current page so the final invalidation below
 			 * covers it, then stop.
 			 */
 			sva += L3_SIZE;
 			break;
 		}
 	}
 	/* Flush whatever run is still pending. */
 	if (va != eva)
 		pmap_invalidate_range(pmap, va, sva);
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	The range is walked one L2 entry at a time.  A 2MB block mapping
  *	that lies entirely inside the range is removed whole; one that is
  *	only partly covered is demoted first.  4KB mappings are removed by
  *	pmap_remove_l3_range().  Page table pages collected on the local
  *	"free" list are released after the pmap lock is dropped.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct rwlock *lock;
 	vm_offset_t va_next;
 	pd_entry_t *l0, *l1, *l2;
 	pt_entry_t l3_paddr;
 	struct spglist free;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	SLIST_INIT(&free);
 
 	PMAP_LOCK(pmap);
 
 	/* No PV list lock is held yet; the helpers acquire one on demand. */
 	lock = NULL;
 	for (; sva < eva; sva = va_next) {
 
 		/* Nothing is left to remove once the pmap is empty. */
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		/*
 		 * Skip over an empty L0 or L1 entry.  If rounding up to the
 		 * next boundary wraps around, stop at "eva" instead.
 		 */
 		l0 = pmap_l0(pmap, sva);
 		if (pmap_load(l0) == 0) {
 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		l1 = pmap_l0_to_l1(l0, sva);
 		if (pmap_load(l1) == 0) {
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (l2 == NULL)
 			continue;
 
 		l3_paddr = pmap_load(l2);
 
 		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
 			/*
 			 * Remove the whole block when "sva" is 2MB aligned
 			 * and the range covers all of it.  Otherwise demote
 			 * it, and skip the region if the demotion fails.
 			 */
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
 				    &free, &lock);
 				continue;
 			} else if (pmap_demote_l2_locked(pmap, l2, sva,
 			    &lock) == NULL)
 				continue;
 			/* Reload the entry that the demotion installed. */
 			l3_paddr = pmap_load(l2);
 		}
 
 		/*
 		 * Weed out invalid mappings.
 		 */
 		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
 			continue;
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (va_next > eva)
 			va_next = eva;
 
 		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
 		    &lock);
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The page must be managed (asserted).  The work is done
  *		in two passes under the page's PV list lock: every 2MB
  *		mapping found on the pvh list is demoted, then every
  *		4KB mapping on the page's own pv list is destroyed.
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pd_entry_t *pde, tpde;
 	pt_entry_t *pte, tpte;
 	vm_offset_t va;
 	struct spglist free;
 	int lvl, pvh_gen, md_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* Fictitious pages use the shared dummy header instead of a pvh. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 retry:
 	rw_wlock(lock);
 	/* Pass 1: demote each 2MB mapping recorded on the pvh list. */
 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock cannot be taken without blocking, drop
 		 * the PV list lock, take both locks in pmap-then-PV order,
 		 * and start over if the pvh list changed in the meantime
 		 * (detected through its generation count).
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				rw_wunlock(lock);
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		va = pv->pv_va;
 		pte = pmap_pte(pmap, va, &lvl);
 		KASSERT(pte != NULL,
 		    ("pmap_remove_all: no page table entry found"));
 		KASSERT(lvl == 2,
 		    ("pmap_remove_all: invalid pte level %d", lvl));
 
 		pmap_demote_l2_locked(pmap, pte, va, &lock);
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: destroy each 4KB mapping on the page's own pv list. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		/* Same lock dance as above, now watching both lists. */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				rw_wunlock(lock);
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		pmap_resident_count_dec(pmap, 1);
 
 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
 		KASSERT(pde != NULL,
 		    ("pmap_remove_all: no page directory entry found"));
 		KASSERT(lvl == 2,
 		    ("pmap_remove_all: invalid pde level %d", lvl));
 		tpde = pmap_load(pde);
 
 		/* Clear the L3 entry and invalidate its TLB entry. */
 		pte = pmap_l2_to_l3(pde, pv->pv_va);
 		tpte = pmap_load_clear(pte);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		if (tpte & ATTR_SW_WIRED)
 			pmap->pm_stats.wired_count--;
 		if ((tpte & ATTR_AF) != 0)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if (pmap_pte_dirty(tpte))
 			vm_page_dirty(m);
 		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
 		/* Unlink the pv entry and bump the list's generation. */
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	/* Both lists were drained above, so drop PGA_WRITEABLE. */
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(lock);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * pmap_protect_l2: apply access restrictions to a single 2MB block mapping
  *
  * Replaces the bits selected by "mask" in the L2 block entry "l2" with
  * "nbits" and then invalidates the TLB entry for "sva".  Nothing is done
  * when the entry already carries the requested bits.  The pmap lock must be
  * held, "sva" must be 2MB aligned, and "l2" must hold a block mapping.
  */
 static void
 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
     pt_entry_t nbits)
 {
 	pd_entry_t cur;
 	vm_page_t first, pg;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & L2_OFFSET) == 0,
 	    ("pmap_protect_l2: sva is not 2mpage aligned"));
 	cur = pmap_load(l2);
 	KASSERT((cur & ATTR_DESCR_MASK) == L2_BLOCK,
 	    ("pmap_protect_l2: L2e %lx is not a block mapping", cur));
 
 	/*
 	 * Loop until the compare-and-set succeeds.  A failed attempt leaves
 	 * the entry's current value in "cur", which is then re-examined.
 	 */
 	for (;;) {
 		/* The desired restrictions are already in place. */
 		if ((cur & mask) == nbits)
 			return;
 
 		/*
 		 * Write protecting a dirty, managed superpage: record the
 		 * modification on every constituent 4KB page first.
 		 */
 		if ((cur & ATTR_SW_MANAGED) != 0 &&
 		    (nbits & ATTR_AP(ATTR_AP_RO)) != 0 && pmap_pte_dirty(cur)) {
 			first = PHYS_TO_VM_PAGE(cur & ~ATTR_MASK);
 			for (pg = first; pg < &first[L2_SIZE / PAGE_SIZE]; pg++)
 				vm_page_dirty(pg);
 		}
 
 		if (atomic_fcmpset_64(l2, &cur, (cur & ~mask) | nbits))
 			break;
 	}
 
 	/*
 	 * A promotion breaks the 4KB page mappings before it makes the 2MB
 	 * page mapping, so invalidating a single page is sufficient here.
 	 */
 	pmap_invalidate_page(pmap, sva);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	VM_PROT_NONE is handled by removing the range.  Otherwise the
  *	request is translated into a pair (mask, nbits): "mask" selects
  *	the entry bits to be replaced and "nbits" is their new value.
  *	Removing write permission makes entries read-only and clears
  *	ATTR_SW_DBM; removing execute permission sets ATTR_XN.  If
  *	neither restriction is requested there is nothing to do.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	vm_offset_t va, va_next;
 	pd_entry_t *l0, *l1, *l2;
 	pt_entry_t *l3p, l3, mask, nbits;
 
 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 	if (prot == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 	mask = nbits = 0;
 	if ((prot & VM_PROT_WRITE) == 0) {
 		mask |= ATTR_AP_RW_BIT | ATTR_SW_DBM;
 		nbits |= ATTR_AP(ATTR_AP_RO);
 	}
 	if ((prot & VM_PROT_EXECUTE) == 0) {
 		mask |= ATTR_XN;
 		nbits |= ATTR_XN;
 	}
 	if (mask == 0)
 		return;
 
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 
 		/*
 		 * Skip over an empty L0 or L1 entry.  If rounding up to the
 		 * next boundary wraps around, stop at "eva" instead.
 		 */
 		l0 = pmap_l0(pmap, sva);
 		if (pmap_load(l0) == 0) {
 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		l1 = pmap_l0_to_l1(l0, sva);
 		if (pmap_load(l1) == 0) {
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (pmap_load(l2) == 0)
 			continue;
 
 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
 			/*
 			 * Protect the whole block when the range covers all
 			 * of it.  Otherwise demote it, and skip the region
 			 * if the demotion fails.
 			 */
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 				pmap_protect_l2(pmap, l2, sva, mask, nbits);
 				continue;
 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
 				continue;
 		}
 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
 		    ("pmap_protect: Invalid L2 entry after demotion"));
 
 		if (va_next > eva)
 			va_next = eva;
 
 		/*
 		 * "va" is the start of the run of modified entries that has
 		 * not been invalidated yet; "va == va_next" means that no
 		 * invalidation is pending.
 		 */
 		va = va_next;
 		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
 		    sva += L3_SIZE) {
 			l3 = pmap_load(l3p);
 retry:
 			/*
 			 * Go to the next L3 entry if the current one is
 			 * invalid or already has the desired access
 			 * restrictions in place.  (The latter case occurs
 			 * frequently.  For example, in a "buildworld"
 			 * workload, almost 1 out of 4 L3 entries already
 			 * have the desired restrictions.)
 			 */
 			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
 				if (va != va_next) {
 					pmap_invalidate_range(pmap, va, sva);
 					va = va_next;
 				}
 				continue;
 			}
 
 			/*
 			 * When a dirty read/write mapping is write protected,
 			 * update the page's dirty field.
 			 */
 			if ((l3 & ATTR_SW_MANAGED) != 0 &&
 			    (nbits & ATTR_AP(ATTR_AP_RO)) != 0 &&
 			    pmap_pte_dirty(l3))
 				vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK));
 
 			/*
 			 * A failed compare-and-set leaves the entry's current
 			 * value in "l3"; re-evaluate it from the top.
 			 */
 			if (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) | nbits))
 				goto retry;
 			/* Start a new pending invalidation run here. */
 			if (va == va_next)
 				va = sva;
 		}
 		/* Flush whatever run is still pending for this L3 table. */
 		if (va != va_next)
 			pmap_invalidate_range(pmap, va, sva);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Adds the page table page "mpte" to the pmap's collection of idle page
  * table pages (the radix trie rooted at pm_root).  Every page table page of a
  * pmap maps its own distinct range of virtual addresses, and the collection
  * is kept ordered by that range.
  *
  * The page's "valid" field records how it got here: all bits set when
  * "promoted" is true, zero otherwise.  In the latter case the caller must
  * supply a zero-filled page.  The pmap lock must be held.  Returns the result
  * of vm_radix_insert().
  */
 static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
 {
 	int error;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (promoted)
 		mpte->valid = VM_PAGE_BITS_ALL;
 	else
 		mpte->valid = 0;
 	error = vm_radix_insert(&pmap->pm_root, mpte);
 	return (error);
 }
 
 /*
  * Looks up the idle page table page that maps the virtual address "va" in
  * the pmap's collection, removes it from the collection, and returns it.
  * Returns NULL when the collection holds no page table page for "va".  The
  * pmap lock must be held.
  */
 static __inline vm_page_t
 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 {
 	vm_pindex_t ptindex;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	ptindex = pmap_l2_pindex(va);
 	return (vm_radix_remove(&pmap->pm_root, ptindex));
 }
 
 /*
  * Performs a break-before-make update of a pmap entry. This is needed when
  * either promoting or demoting pages to ensure the TLB doesn't get into an
  * inconsistent state.
  *
  * "pte" is the entry to rewrite, "newpte" is its new value, and
  * [va, va + size) is the virtual address range whose TLB entries are
  * invalidated between clearing the old value and storing the new one.
  * The pmap lock must be held.
  */
 static void
 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
     vm_offset_t va, vm_size_t size)
 {
 	register_t intr;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Ensure we don't get switched out with the page table in an
 	 * inconsistent state. We also need to ensure no interrupts fire
 	 * as they may make use of an address we are about to invalidate.
 	 */
 	intr = intr_disable();
 	critical_enter();
 
 	/* Clear the old mapping */
 	pmap_clear(pte);
 	pmap_invalidate_range_nopin(pmap, va, va + size);
 
 	/* Create the new mapping */
 	pmap_store(pte, newpte);
 	dsb(ishst);
 
 	/* Leave in the reverse order of entry. */
 	critical_exit();
 	intr_restore(intr);
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
  * replace the many pv entries for the 4KB page mappings by a single pv entry
  * for the 2MB page mapping.
  *
  * "va" is any address within the promoted 2MB region (it is rounded down
  * here) and "pa" is the 2MB-aligned physical address of the superpage.
  * "*lockp" is switched to the PV list lock that covers "pa".
  */
 static void
 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_offset_t va_last;
 	vm_page_t m;
 
 	KASSERT((pa & L2_OFFSET) == 0,
 	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
 	 * a transfer avoids the possibility that get_pv_entry() calls
 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
 	 * mappings that is being promoted.
 	 */
 	m = PHYS_TO_VM_PAGE(pa);
 	va = va & ~L2_OFFSET;
 	pv = pmap_pvh_remove(&m->md, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
 	pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 	pvh->pv_gen++;
 	/* Free the remaining NPTEPG - 1 pv entries. */
 	/* "va_last" is the address of the last 4KB page in the region. */
 	va_last = va + L2_SIZE - PAGE_SIZE;
 	do {
 		m++;
 		va += PAGE_SIZE;
 		pmap_pvh_free(&m->md, pmap, va);
 	} while (va < va_last);
 }
 
 /*
  * Tries to promote the 512, contiguous 4KB page mappings that are within a
  * single level 2 table entry to a single 2MB page mapping.  For promotion
  * to occur, two conditions must be met: (1) the 4KB page mappings must map
  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
  * identical characteristics.
  *
  * "l2" is the L2 table entry that points at the L3 page table, "va" is any
  * address within the candidate 2MB region, and "*lockp" is the PV list lock
  * pointer handed on to pmap_pv_promote_l2() for managed mappings.  The pmap
  * lock must be held.  Failures are counted in pmap_l2_p_failures and leave
  * the L2 entry untouched.
  */
 static void
 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
 	vm_page_t mpte;
 	vm_offset_t sva;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	sva = va & ~L2_OFFSET;
 	firstl3 = pmap_l2_to_l3(l2, sva);
 	newl2 = pmap_load(firstl3);
 
 setl2:
 	/*
 	 * The first L3 entry must map a 2MB-aligned physical address and
 	 * must have ATTR_AF set.
 	 */
 	if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) {
 		atomic_add_long(&pmap_l2_p_failures, 1);
 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 		return;
 	}
 
 	/*
 	 * A read-only entry that still carries ATTR_SW_DBM has that bit
 	 * cleared before it is compared with the others.  "newl2" was loaded
 	 * from the first L3 entry, so the compare-and-set must target
 	 * "firstl3" -- not "l2", which holds the table descriptor and would
 	 * never compare equal.  A failed attempt leaves the entry's current
 	 * value in "newl2", which is then re-evaluated from "setl2".
 	 */
 	if ((newl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) ==
 	    (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) {
 		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
 			goto setl2;
 		newl2 &= ~ATTR_SW_DBM;
 	}
 
 	/*
 	 * Walk the remaining entries from last to first.  "pa" is the value
 	 * that each entry must have: the first entry's value with the
 	 * physical address advanced to the page being examined.
 	 */
 	pa = newl2 + L2_SIZE - PAGE_SIZE;
 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
 		oldl3 = pmap_load(l3);
 setl3:
 		if ((oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) ==
 		    (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) {
 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
 			    ~ATTR_SW_DBM))
 				goto setl3;
 			oldl3 &= ~ATTR_SW_DBM;
 		}
 		if (oldl3 != pa) {
 			atomic_add_long(&pmap_l2_p_failures, 1);
 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return;
 		}
 		pa -= PAGE_SIZE;
 	}
 
 	/*
 	 * Save the page table page in its current state until the L2
 	 * mapping the superpage is demoted by pmap_demote_l2() or
 	 * destroyed by pmap_remove_l3().
 	 */
 	mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
 	KASSERT(mpte >= vm_page_array &&
 	    mpte < &vm_page_array[vm_page_array_size],
 	    ("pmap_promote_l2: page table page is out of range"));
 	KASSERT(mpte->pindex == pmap_l2_pindex(va),
 	    ("pmap_promote_l2: page table page's pindex is wrong"));
 	if (pmap_insert_pt_page(pmap, mpte, true)) {
 		atomic_add_long(&pmap_l2_p_failures, 1);
 		CTR2(KTR_PMAP,
 		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
 		    pmap);
 		return;
 	}
 
 	/* Collapse the 4KB pv entries into a single 2MB pv entry. */
 	if ((newl2 & ATTR_SW_MANAGED) != 0)
 		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
 
 	/* Turn the first L3 entry's value into an L2 block descriptor. */
 	newl2 &= ~ATTR_DESCR_MASK;
 	newl2 |= L2_BLOCK;
 
 	pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);
 
 	atomic_add_long(&pmap_l2_promotions, 1);
 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
 		    pmap);
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	"psind" == 1 requests a 2MB mapping, which is delegated to
  *	pmap_enter_l2().  Returns KERN_SUCCESS, or
  *	KERN_RESOURCE_SHORTAGE when PMAP_ENTER_NOSLEEP is given and
  *	no L3 page table page could be allocated; for "psind" == 1
  *	the result of pmap_enter_l2() is returned.
  */
 int
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     u_int flags, int8_t psind)
 {
 	struct rwlock *lock;
 	pd_entry_t *pde;
 	pt_entry_t new_l3, orig_l3;
 	pt_entry_t *l2, *l3;
 	pv_entry_t pv;
 	vm_paddr_t opa, pa;
 	vm_page_t mpte, om;
 	boolean_t nosleep;
 	int lvl, rv;
 
 	va = trunc_page(va);
 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 		VM_OBJECT_ASSERT_LOCKED(m->object);
 	/* Build the new L3 entry from the page, "prot", and "flags". */
 	pa = VM_PAGE_TO_PHYS(m);
 	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
 	    L3_PAGE);
 	if ((prot & VM_PROT_WRITE) == 0)
 		new_l3 |= ATTR_AP(ATTR_AP_RO);
 	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
 		new_l3 |= ATTR_XN;
 	if ((flags & PMAP_ENTER_WIRED) != 0)
 		new_l3 |= ATTR_SW_WIRED;
 	if (va < VM_MAXUSER_ADDRESS)
 		new_l3 |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		new_l3 |= ATTR_SW_MANAGED;
 		/*
 		 * A managed, writeable mapping is tagged with ATTR_SW_DBM.
 		 * It starts out read-only unless "flags" indicates a write
 		 * access.
 		 */
 		if ((prot & VM_PROT_WRITE) != 0) {
 			new_l3 |= ATTR_SW_DBM;
 			if ((flags & VM_PROT_WRITE) == 0)
 				new_l3 |= ATTR_AP(ATTR_AP_RO);
 		}
 	}
 
 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
 
 	lock = NULL;
 	PMAP_LOCK(pmap);
 	if (psind == 1) {
 		/* Assert the required virtual and physical alignment. */
 		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
 		    flags, m, &lock);
 		goto out;
 	}
 	mpte = NULL;
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 retry:
 	pde = pmap_pde(pmap, va, &lvl);
 	if (pde != NULL && lvl == 2) {
 		/* An L3 table exists; take a reference on it for user va. */
 		l3 = pmap_l2_to_l3(pde, va);
 		if (va < VM_MAXUSER_ADDRESS && mpte == NULL) {
 			mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
 			mpte->wire_count++;
 		}
 		goto havel3;
 	} else if (pde != NULL && lvl == 1) {
 		/* "va" may be covered by a 2MB block; try to demote it. */
 		l2 = pmap_l1_to_l2(pde, va);
 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
 		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
 			l3 = &l3[pmap_l3_index(va)];
 			if (va < VM_MAXUSER_ADDRESS) {
 				mpte = PHYS_TO_VM_PAGE(
 				    pmap_load(l2) & ~ATTR_MASK);
 				mpte->wire_count++;
 			}
 			goto havel3;
 		}
 		/* We need to allocate an L3 table. */
 	}
 	if (va < VM_MAXUSER_ADDRESS) {
 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 
 		/*
 		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
 		 * to handle the possibility that a superpage mapping for "va"
 		 * was created while we slept.
 		 */
 		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
 		    nosleep ? NULL : &lock);
 		if (mpte == NULL && nosleep) {
 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
 			rv = KERN_RESOURCE_SHORTAGE;
 			goto out;
 		}
 		goto retry;
 	} else
 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
 
 havel3:
 	orig_l3 = pmap_load(l3);
 	opa = orig_l3 & ~ATTR_MASK;
 	pv = NULL;
 
 	/*
 	 * Is the specified virtual address already mapped?
 	 */
 	if (pmap_l3_valid(orig_l3)) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
 		    (orig_l3 & ATTR_SW_WIRED) == 0)
 			pmap->pm_stats.wired_count++;
 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
 		    (orig_l3 & ATTR_SW_WIRED) != 0)
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove the extra PT page reference.
 		 */
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			KASSERT(mpte->wire_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%lx", va));
 		}
 
 		/*
 		 * Has the physical page changed?
 		 */
 		if (opa == pa) {
 			/*
 			 * No, might be a protection or wiring change.
 			 */
 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
 			    (new_l3 & ATTR_SW_DBM) != 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 			goto validate;
 		}
 
 		/*
 		 * The physical page has changed.  Temporarily invalidate
 		 * the mapping.
 		 */
 		orig_l3 = pmap_load_clear(l3);
 		KASSERT((orig_l3 & ~ATTR_MASK) == opa,
 		    ("pmap_enter: unexpected pa update for %#lx", va));
 		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
 			om = PHYS_TO_VM_PAGE(opa);
 
 			/*
 			 * The pmap lock is sufficient to synchronize with
 			 * concurrent calls to pmap_page_test_mappings() and
 			 * pmap_ts_referenced().
 			 */
 			if (pmap_pte_dirty(orig_l3))
 				vm_page_dirty(om);
 			if ((orig_l3 & ATTR_AF) != 0)
 				vm_page_aflag_set(om, PGA_REFERENCED);
 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 			/*
 			 * Detach the old page's pv entry.  It is reused below
 			 * for the new page unless that page is unmanaged, in
 			 * which case it is freed right away.
 			 */
 			pv = pmap_pvh_remove(&om->md, pmap, va);
 			if ((m->oflags & VPO_UNMANAGED) != 0)
 				free_pv_entry(pmap, pv);
 			if ((om->aflags & PGA_WRITEABLE) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list) &&
 			    ((om->flags & PG_FICTITIOUS) != 0 ||
 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
 		}
 		pmap_invalidate_page(pmap, va);
 		/* From here on the slot is treated as a new mapping. */
 		orig_l3 = 0;
 	} else {
 		/*
 		 * Increment the counters.
 		 */
 		if ((new_l3 & ATTR_SW_WIRED) != 0)
 			pmap->pm_stats.wired_count++;
 		pmap_resident_count_inc(pmap, 1);
 	}
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		if (pv == NULL) {
 			pv = get_pv_entry(pmap, &lock);
 			pv->pv_va = va;
 		}
 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		if ((new_l3 & ATTR_SW_DBM) != 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
 validate:
 	/*
 	 * Sync the icache if exec permission is requested and the memory
 	 * attribute is VM_MEMATTR_WRITE_BACK.  Do it now, before the mapping
 	 * is stored and made valid for the hardware table walk.  If done
 	 * later, others could access this page before the caches are
 	 * properly synced.
 	 * Don't do it for kernel memory, which is mapped with exec
 	 * permission even if the memory isn't going to hold executable
 	 * code.  The only time an icache sync is needed there is after a
 	 * kernel module is loaded and the relocation info is processed,
 	 * and that is done in elf_cpu_load_file().
 	 */
 	if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
 	    (opa != pa || (orig_l3 & ATTR_XN)))
 		cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
 
 	/*
 	 * Update the L3 entry
 	 */
 	if (pmap_l3_valid(orig_l3)) {
 		KASSERT(opa == pa, ("pmap_enter: invalid update"));
 		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
 			/* same PA, different attributes */
 			/* XXXMJ need to reload orig_l3 for hardware DBM. */
 			pmap_load_store(l3, new_l3);
 			pmap_invalidate_page(pmap, va);
 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
 			    pmap_pte_dirty(orig_l3))
 				vm_page_dirty(m);
 		} else {
 			/*
 			 * orig_l3 == new_l3
 			 * This can happen if multiple threads simultaneously
 			 * access a not yet mapped page.  This is bad for
 			 * performance since it can cause a full
 			 * demotion-NOP-promotion cycle.
 			 * Other possible reasons are:
 			 * - VM and pmap memory layout have diverged
 			 * - a TLB flush is missing somewhere and the CPU
 			 *   doesn't see the actual mapping.
 			 */
 			CTR4(KTR_PMAP, "%s: already mapped page - "
 			    "pmap %p va 0x%#lx pte 0x%lx",
 			    __func__, pmap, va, new_l3);
 		}
 	} else {
 		/* New mapping */
 		pmap_store(l3, new_l3);
 		dsb(ishst);
 	}
 
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Attempt a 2MB promotion for a user pmap when the L3 table is
 	 * either untracked (mpte == NULL) or has a wire count of NL3PG,
 	 * superpages are enabled, and the page is a non-fictitious page for
 	 * which vm_reserv_level_iffullpop() returns 0.
 	 */
 	if (pmap != pmap_kernel() &&
 	    (mpte == NULL || mpte->wire_count == NL3PG) &&
 	    pmap_ps_enabled(pmap) &&
 	    (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0) {
 		pmap_promote_l2(pmap, pde, va, &lock);
 	}
 #endif
 
 	rv = KERN_SUCCESS;
 out:
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  * Attempts to install a 2MB page mapping that grants at most read and
  * execute access.  The attempt never sleeps, never replaces an existing
  * mapping, and never reclaims a PV entry.  Returns true on success and false
  * when (1) a page table page cannot be allocated without sleeping, (2) "va"
  * is already mapped, or (3) a PV entry cannot be allocated without
  * reclaiming another one.  The pmap lock must be held.
  */
 static bool
 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     struct rwlock **lockp)
 {
 	pd_entry_t l2e;
 	u_int enter_flags;
 	int rv;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Start from a read-only block descriptor for the page. */
 	l2e = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
 	    ATTR_IDX(m->md.pv_memattr) | ATTR_AP(ATTR_AP_RO) | L2_BLOCK);
 
 	/* A managed mapping is tagged as such and starts with ATTR_AF clear. */
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		l2e = (l2e | ATTR_SW_MANAGED) & ~ATTR_AF;
 
 	/* No execute permission requested, or device memory: set ATTR_XN. */
 	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
 		l2e |= ATTR_XN;
 
 	/* User addresses get user access and ATTR_PXN. */
 	if (va < VM_MAXUSER_ADDRESS)
 		l2e |= ATTR_AP(ATTR_AP_USER) | ATTR_PXN;
 
 	enter_flags = PMAP_ENTER_NOSLEEP | PMAP_ENTER_NOREPLACE |
 	    PMAP_ENTER_NORECLAIM;
 	rv = pmap_enter_l2(pmap, va, l2e, enter_flags, NULL, lockp);
 	return (rv == KERN_SUCCESS);
 }
 
 /*
  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
  * a mapping already exists at the specified virtual address.  Returns
  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
  *
  * The parameter "m" is only used when creating a managed, writeable mapping.
  *
  * "new_l2" is the complete L2 block entry to install.  The pmap lock must be
  * held.
  */
 static int
 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
     vm_page_t m, struct rwlock **lockp)
 {
 	struct spglist free;
 	pd_entry_t *l2, old_l2;
 	vm_page_t l2pg, mt;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Without PMAP_ENTER_NOSLEEP the allocation is given "lockp". */
 	if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
 	    NULL : lockp)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
 		    va, pmap);
 		return (KERN_RESOURCE_SHORTAGE);
 	}
 
 	/* Locate the L2 entry for "va" through the direct map. */
 	l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
 	l2 = &l2[pmap_l2_index(va)];
 	if ((old_l2 = pmap_load(l2)) != 0) {
 		KASSERT(l2pg->wire_count > 1,
 		    ("pmap_enter_l2: l2pg's wire count is too low"));
 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
 			/* Undo the reference taken by pmap_alloc_l2(). */
 			l2pg->wire_count--;
 			CTR2(KTR_PMAP,
 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
 			    va, pmap);
 			return (KERN_FAILURE);
 		}
 		/* Tear down whatever currently occupies the 2MB region. */
 		SLIST_INIT(&free);
 		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
 			(void)pmap_remove_l2(pmap, l2, va,
 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
 		else
 			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
 			    &free, lockp);
 		vm_page_free_pages_toq(&free, true);
 		if (va >= VM_MAXUSER_ADDRESS) {
 			/*
 			 * Both pmap_remove_l2() and pmap_remove_l3_range()
 			 * will leave the kernel page table page zero filled.
 			 * Nonetheless, the TLB could have an intermediate
 			 * entry for the kernel page table page.
 			 */
 			mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
 			if (pmap_insert_pt_page(pmap, mt, false))
 				panic("pmap_enter_l2: trie insert failed");
 			pmap_clear(l2);
 			pmap_invalidate_page(pmap, va);
 		} else
 			KASSERT(pmap_load(l2) == 0,
 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
 	}
 
 	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_l3(pmap, va, l2pg, &free)) {
 				/*
 				 * Although "va" is not mapped, the TLB could
 				 * nonetheless have intermediate entries that
 				 * refer to the freed page table pages.
 				 * Invalidate those entries.
 				 *
 				 * XXX redundant invalidation (See
 				 * _pmap_unwire_l3().)
 				 */
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, true);
 			}
 			CTR2(KTR_PMAP,
 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
 			    va, pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 		/* Mark every constituent 4KB page of a writeable mapping. */
 		if ((new_l2 & ATTR_SW_DBM) != 0)
 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
 				vm_page_aflag_set(mt, PGA_WRITEABLE);
 	}
 
 	/*
 	 * Increment counters.
 	 */
 	if ((new_l2 & ATTR_SW_WIRED) != 0)
 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
 
 	/*
 	 * Map the superpage.
 	 */
 	pmap_store(l2, new_l2);
 	dsb(ishst);
 
 	atomic_add_long(&pmap_l2_mappings, 1);
 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
 	    va, pmap);
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Maps a run of resident pages that all belong to one object.  The run
  * starts at the page m_start, which is mapped at the virtual address start.
  * Every later page is mapped at the virtual address whose offset from start
  * equals that page's offset from m_start within the object.  The run ends
  * with the last page whose virtual address is still below end.  Virtual
  * pages between start and end that have no resident page at the matching
  * offset are simply left unmapped.
  *
  * A 2MB mapping is attempted whenever the address and the page allow it;
  * otherwise, or when that attempt fails, a single 4KB mapping is entered.
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	struct rwlock *pv_lock;
 	vm_offset_t addr;
 	vm_page_t m, ptpage;
 	vm_pindex_t npages, off;
 	bool super_ok;
 
 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
 
 	npages = atop(end - start);
 	ptpage = NULL;
 	pv_lock = NULL;
 	PMAP_LOCK(pmap);
 	for (m = m_start; m != NULL; m = TAILQ_NEXT(m, listq)) {
 		/* Stop at the first page that falls outside the range. */
 		off = m->pindex - m_start->pindex;
 		if (off >= npages)
 			break;
 		addr = start + ptoa(off);
 
 		/*
 		 * A superpage needs a 2MB-aligned address, room for the whole
 		 * 2MB below "end", a page with psind 1, and a pmap with
 		 * superpages enabled.
 		 */
 		super_ok = (addr & L2_OFFSET) == 0 && addr + L2_SIZE <= end &&
 		    m->psind == 1 && pmap_ps_enabled(pmap);
 		if (super_ok &&
 		    pmap_enter_2mpage(pmap, addr, m, prot, &pv_lock)) {
 			/* Step to the last page covered by the superpage. */
 			m += L2_SIZE / PAGE_SIZE - 1;
 		} else {
 			ptpage = pmap_enter_quick_locked(pmap, addr, m, prot,
 			    ptpage, &pv_lock);
 		}
 	}
 	if (pv_lock != NULL)
 		rw_wunlock(pv_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Fast-path mapping of a single page.  This routine relies on some *MAJOR*
  * assumptions:
  * 1. The current pmap and the given pmap exist.
  * 2. The mapping is not wired.
  * 3. Only read access is needed.
  * 4. No page table pages.
  * In exchange it is *MUCH* faster than pmap_enter...
  *
  * This is a locking wrapper: it takes the pmap lock, calls
  * pmap_enter_quick_locked(), and releases any PV list lock that the call
  * left held.
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	struct rwlock *pv_lock = NULL;
 
 	PMAP_LOCK(pmap);
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &pv_lock);
 	if (pv_lock != NULL)
 		rw_wunlock(pv_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Worker for pmap_enter_quick() and pmap_enter_object(): try to install a
  * read-only L3 mapping of page "m" at "va" without sleeping.  The pmap lock
  * must be held by the caller (asserted below).
  *
  * "mpte" is a page table page hint from a previous call; it is reused when
  * its pindex equals the L2 pindex of "va".  "lockp" is handed to
  * pmap_try_insert_pv_entry() when "m" is a managed page.
  *
  * Returns the page table page backing the new mapping so that the caller can
  * pass it back in as "mpte".  Returns NULL for addresses at or above
  * VM_MAXUSER_ADDRESS and whenever the attempt is abandoned: an L2 block
  * already covers "va", the page table page cannot be allocated, a mapping
  * already exists, or the PV entry cannot be inserted.
  */
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 {
 	struct spglist free;
 	pd_entry_t *pde;
 	pt_entry_t *l2, *l3, l3_val;
 	vm_paddr_t pa;
 	int lvl;
 
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		vm_pindex_t l2pindex;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		l2pindex = pmap_l2_pindex(va);
 		if (mpte && (mpte->pindex == l2pindex)) {
 			/* The hint covers "va": just take another reference. */
 			mpte->wire_count++;
 		} else {
 			/*
 			 * Get the l2 entry
 			 */
 			pde = pmap_pde(pmap, va, &lvl);
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.  Otherwise, we
 			 * attempt to allocate a page table page.  If this
 			 * attempt fails, we don't retry.  Instead, we give up.
 			 */
 			if (lvl == 1) {
 				/*
 				 * Give up if the L2 entry is a block mapping
 				 * that already covers "va".
 				 */
 				l2 = pmap_l1_to_l2(pde, va);
 				if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
 				    L2_BLOCK)
 					return (NULL);
 			}
 			if (lvl == 2 && pmap_load(pde) != 0) {
 				mpte =
 				    PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
 				mpte->wire_count++;
 			} else {
 				/*
 				 * Pass NULL instead of the PV list lock
 				 * pointer, because we don't intend to sleep.
 				 */
 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
 				if (mpte == NULL)
 					return (mpte);
 			}
 		}
 		/* Locate the L3 entry through the direct map of the table page. */
 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 		l3 = &l3[pmap_l3_index(va)];
 	} else {
 		/*
 		 * Addresses at or above VM_MAXUSER_ADDRESS are looked up in
 		 * kernel_pmap and require an existing L3 table; no page table
 		 * page is tracked for them.
 		 */
 		mpte = NULL;
 		pde = pmap_pde(kernel_pmap, va, &lvl);
 		KASSERT(pde != NULL,
 		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
 		     va));
 		KASSERT(lvl == 2,
 		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
 		l3 = pmap_l2_to_l3(pde, va);
 	}
 
 	/*
 	 * Abort if a mapping already exists.
 	 */
 	if (pmap_load(l3) != 0) {
 		if (mpte != NULL) {
 			/* Drop the reference taken above. */
 			mpte->wire_count--;
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 		if (mpte != NULL) {
 			/*
 			 * Release the page table page reference; if
 			 * pmap_unwire_l3() reports that pages were freed,
 			 * invalidate "va" and return them to the allocator.
 			 */
 			SLIST_INIT(&free);
 			if (pmap_unwire_l3(pmap, va, mpte, &free)) {
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, true);
 			}
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap_resident_count_inc(pmap, 1);
 
 	/*
 	 * Build a read-only L3 page entry.  ATTR_XN is set when execute
 	 * permission was not requested or the page is device memory;
 	 * otherwise user addresses get ATTR_PXN.
 	 */
 	pa = VM_PAGE_TO_PHYS(m);
 	l3_val = pa | ATTR_DEFAULT | ATTR_IDX(m->md.pv_memattr) |
 	    ATTR_AP(ATTR_AP_RO) | L3_PAGE;
 	if ((prot & VM_PROT_EXECUTE) == 0 || m->md.pv_memattr == DEVICE_MEMORY)
 		l3_val |= ATTR_XN;
 	else if (va < VM_MAXUSER_ADDRESS)
 		l3_val |= ATTR_PXN;
 
 	/*
 	 * Managed mappings are tagged ATTR_SW_MANAGED and are entered with
 	 * ATTR_AF clear.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0) {
 		l3_val |= ATTR_SW_MANAGED;
 		l3_val &= ~ATTR_AF;
 	}
 
 	/* Sync icache before the mapping is stored to PTE */
 	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
 		cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
 
 	/*
 	 * Now validate mapping with RO protection
 	 */
 	pmap_store(l3, l3_val);
 	dsb(ishst);
 
 	return (mpte);
 }
 
 /*
  * Hook for pre-populating mappings of a device-backed object.  This
  * implementation creates no mappings: it only checks that the object is
  * write-locked and that its type is OBJT_DEVICE or OBJT_SG.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 	/* Argument validation only; pmap, addr, pindex and size are unused. */
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 }
 
 /*
  *	Clear the wired attribute from the mappings for the specified range of
  *	addresses in the given pmap.  Every valid mapping within that range
  *	must have the wired attribute set.  In contrast, invalid mappings
  *	cannot have the wired attribute set, so they are ignored.
  *
  *	The wired attribute of the page table entry is not a hardware feature,
  *	so there is no need to invalidate any TLB entries.
  */
 void
 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va_next;
 	pd_entry_t *l0, *l1, *l2;
 	pt_entry_t *l3;
 
 	PMAP_LOCK(pmap);
 	/*
 	 * Walk [sva, eva) one L2 entry at a time.  "va_next" is the start of
 	 * the next region to examine; whenever the addition wraps around
 	 * (va_next < sva) it is clamped to eva so that the loop terminates.
 	 */
 	for (; sva < eva; sva = va_next) {
 		/* Nothing is mapped under an empty L0 entry: skip it. */
 		l0 = pmap_l0(pmap, sva);
 		if (pmap_load(l0) == 0) {
 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		/* Likewise for an empty L1 entry. */
 		l1 = pmap_l0_to_l1(l0, sva);
 		if (pmap_load(l1) == 0) {
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (pmap_load(l2) == 0)
 			continue;
 
 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
 			/* A valid block mapping in the range must be wired. */
 			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
 				panic("pmap_unwire: l2 %#jx is missing "
 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
 
 			/*
 			 * Are we unwiring the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 				pmap_clear_bits(l2, ATTR_SW_WIRED);
 				pmap->pm_stats.wired_count -= L2_SIZE /
 				    PAGE_SIZE;
 				continue;
 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
 				panic("pmap_unwire: demotion failed");
 		}
 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
 		    ("pmap_unwire: Invalid l2 entry after demotion"));
 
 		/* Do not walk L3 entries beyond the end of the request. */
 		if (va_next > eva)
 			va_next = eva;
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			/* Invalid entries cannot be wired; ignore them. */
 			if (pmap_load(l3) == 0)
 				continue;
 			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
 				panic("pmap_unwire: l3 %#jx is missing "
 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
 
 			/*
 			 * ATTR_SW_WIRED must be cleared atomically.  Although
 			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
 			 * the System MMU may write to the entry concurrently.
 			 */
 			pmap_clear_bits(l3, ATTR_SW_WIRED);
 			pmap->pm_stats.wired_count--;
 		}
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  *
  *	Because the executable mappings created by this routine are copied,
  *	it should not have to flush the instruction cache.
  */
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
     vm_offset_t src_addr)
 {
 	struct rwlock *lock;
 	struct spglist free;
 	pd_entry_t *l0, *l1, *l2, srcptepaddr;
 	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
 	vm_offset_t addr, end_addr, va_next;
 	vm_page_t dst_l2pg, dstmpte, srcmpte;
 
 	/* Only copies to the same virtual address are attempted. */
 	if (dst_addr != src_addr)
 		return;
 	end_addr = src_addr + len;
 	lock = NULL;
 	/*
 	 * Lock both pmaps, taking the one at the lower address first
 	 * (presumably to keep a consistent lock order between callers).
 	 */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 	/*
 	 * Walk the source page table one L2 entry at a time, skipping
 	 * regions whose L0/L1/L2 entry is empty.  "va_next" is clamped to
 	 * end_addr whenever the address computation wraps.
 	 */
 	for (addr = src_addr; addr < end_addr; addr = va_next) {
 		l0 = pmap_l0(src_pmap, addr);
 		if (pmap_load(l0) == 0) {
 			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
 			if (va_next < addr)
 				va_next = end_addr;
 			continue;
 		}
 		l1 = pmap_l0_to_l1(l0, addr);
 		if (pmap_load(l1) == 0) {
 			va_next = (addr + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < addr)
 				va_next = end_addr;
 			continue;
 		}
 		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < addr)
 			va_next = end_addr;
 		l2 = pmap_l1_to_l2(l1, addr);
 		srcptepaddr = pmap_load(l2);
 		if (srcptepaddr == 0)
 			continue;
 		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
 			/*
 			 * Source is a block mapping.  It is copied only when
 			 * the whole, aligned L2_SIZE region lies inside the
 			 * requested range.
 			 */
 			if ((addr & L2_OFFSET) != 0 ||
 			    addr + L2_SIZE > end_addr)
 				continue;
 			dst_l2pg = pmap_alloc_l2(dst_pmap, addr, NULL);
 			if (dst_l2pg == NULL)
 				break;
 			/* From here on "l2" refers to the destination entry. */
 			l2 = (pd_entry_t *)
 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dst_l2pg));
 			l2 = &l2[pmap_l2_index(addr)];
 			if (pmap_load(l2) == 0 &&
 			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
 			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
 			    PMAP_ENTER_NORECLAIM, &lock))) {
 				/*
 				 * Copy the entry with ATTR_AF and
 				 * ATTR_SW_WIRED cleared; when ATTR_SW_DBM is
 				 * set in the source, ATTR_AP_RW_BIT is set in
 				 * the copy.
 				 */
 				mask = ATTR_AF | ATTR_SW_WIRED;
 				nbits = 0;
 				if ((srcptepaddr & ATTR_SW_DBM) != 0)
 					nbits |= ATTR_AP_RW_BIT;
 				pmap_store(l2, (srcptepaddr & ~mask) | nbits);
 				pmap_resident_count_inc(dst_pmap, L2_SIZE /
 				    PAGE_SIZE);
 				atomic_add_long(&pmap_l2_mappings, 1);
 			} else
 				/*
 				 * Nothing was stored: drop one reference on
 				 * the destination L2 page (presumably the one
 				 * taken by pmap_alloc_l2() - confirm).
 				 */
 				dst_l2pg->wire_count--;
 			continue;
 		}
 		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
 		    ("pmap_copy: invalid L2 entry"));
 		/* Source is an L3 table: copy its entries one page at a time. */
 		srcptepaddr &= ~ATTR_MASK;
 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
 		KASSERT(srcmpte->wire_count > 0,
 		    ("pmap_copy: source page table page is unused"));
 		if (va_next > end_addr)
 			va_next = end_addr;
 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
 		src_pte = &src_pte[pmap_l3_index(addr)];
 		dstmpte = NULL;
 		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
 			ptetemp = pmap_load(src_pte);
 
 			/*
 			 * We only virtual copy managed pages.
 			 */
 			if ((ptetemp & ATTR_SW_MANAGED) == 0)
 				continue;
 
 			/*
 			 * Take a reference on the destination page table
 			 * page: reuse the one found on an earlier iteration,
 			 * or allocate it on first use.  Allocation failure
 			 * ends the whole copy.
 			 */
 			if (dstmpte != NULL) {
 				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
 				    ("dstmpte pindex/addr mismatch"));
 				dstmpte->wire_count++;
 			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
 			    NULL)) == NULL)
 				goto out;
 			dst_pte = (pt_entry_t *)
 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
 			dst_pte = &dst_pte[pmap_l3_index(addr)];
 			if (pmap_load(dst_pte) == 0 &&
 			    pmap_try_insert_pv_entry(dst_pmap, addr,
 			    PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) {
 				/*
 				 * Clear the wired, modified, and accessed
 				 * (referenced) bits during the copy.
 				 */
 				mask = ATTR_AF | ATTR_SW_WIRED;
 				nbits = 0;
 				if ((ptetemp & ATTR_SW_DBM) != 0)
 					nbits |= ATTR_AP_RW_BIT;
 				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
 				pmap_resident_count_inc(dst_pmap, 1);
 			} else {
 				/*
 				 * The slot is occupied or no PV entry could
 				 * be inserted: release the reference taken
 				 * above and stop copying.
 				 */
 				SLIST_INIT(&free);
 				if (pmap_unwire_l3(dst_pmap, addr, dstmpte,
 				    &free)) {
 					/*
 					 * Although "addr" is not mapped,
 					 * the TLB could nonetheless have
 					 * intermediate entries that refer
 					 * to the freed page table pages.
 					 * Invalidate those entries.
 					 *
 					 * XXX redundant invalidation
 					 */
 					pmap_invalidate_page(dst_pmap, addr);
 					vm_page_free_pages_toq(&free, true);
 				}
 				goto out;
 			}
 			/* Have we copied all of the valid mappings? */
 			if (dstmpte->wire_count >= srcmpte->wire_count)
 				break;
 		}
 	}
 out:
 	/*
 	 * XXX This barrier may not be needed because the destination pmap is
 	 * not active.
 	 */
 	dsb(ishst);
 
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }
 
 /*
  *	pmap_zero_page zeros the specified hardware page.  The page is
  *	reached through its PHYS_TO_DMAP() address and cleared with
  *	pagezero().
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	vm_paddr_t pa;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	pagezero((void *)PHYS_TO_DMAP(pa));
 }
 
 /*
  *	pmap_zero_page_area zeros "size" bytes starting at byte offset "off"
  *	within the specified hardware page, addressing the page through
  *	PHYS_TO_DMAP().  A request covering the whole page uses pagezero();
  *	anything else uses bzero().
  *
  *	off and size may not cover an area beyond a single hardware page.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	char *base;
 
 	base = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 
 	/* Partial page: clear just the requested bytes. */
 	if (off != 0 || size != PAGE_SIZE) {
 		bzero(base + off, size);
 		return;
 	}
 
 	/* Whole page. */
 	pagezero((void *)base);
 }
 
 /*
  *	pmap_copy_page copies the contents of page "msrc" to page "mdst".
  *	Both pages are addressed through PHYS_TO_DMAP() and the copy is
  *	done by pagecopy().
  */
 void
 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	void *from, *to;
 
 	from = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 	to = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 	pagecopy(from, to);
 }
 
 /*
  * Global flag, initialized to 1 here and not referenced elsewhere in this
  * part of the file.  NOTE(review): presumably read by other kernel code to
  * decide whether unmapped buffers may be used - confirm against its consumers.
  */
 int unmapped_buf_allowed = 1;
 
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
 {
 	char *from, *to;
 	vm_paddr_t src_pa, dst_pa;
 	vm_offset_t src_off, dst_off;
 	int chunk;
 
 	/*
 	 * Copy xfersize bytes from byte offset a_offset within the page
 	 * array ma[] to byte offset b_offset within mb[].  Each pass copies
 	 * the largest piece that crosses neither a source nor a destination
 	 * page boundary.  A page whose physical address is outside the
 	 * direct map causes a panic.
 	 */
 	for (; xfersize > 0; xfersize -= chunk) {
 		src_off = a_offset & PAGE_MASK;
 		src_pa = ma[a_offset >> PAGE_SHIFT]->phys_addr;
 		dst_off = b_offset & PAGE_MASK;
 		dst_pa = mb[b_offset >> PAGE_SHIFT]->phys_addr;
 
 		/* Limit the piece to what is left in both pages. */
 		chunk = min(xfersize, PAGE_SIZE - src_off);
 		chunk = min(chunk, PAGE_SIZE - dst_off);
 
 		if (__predict_false(!PHYS_IN_DMAP(src_pa)))
 			panic("!DMAP a %lx", src_pa);
 		from = (char *)PHYS_TO_DMAP(src_pa) + src_off;
 
 		if (__predict_false(!PHYS_IN_DMAP(dst_pa)))
 			panic("!DMAP b %lx", dst_pa);
 		to = (char *)PHYS_TO_DMAP(dst_pa) + dst_off;
 
 		bcopy(from, to, chunk);
 		a_offset += chunk;
 		b_offset += chunk;
 	}
 }
 
 vm_offset_t
 pmap_quick_enter_page(vm_page_t m)
 {
 	vm_paddr_t pa;
 
 	/* No mapping is created: hand back the page's PHYS_TO_DMAP() address. */
 	pa = VM_PAGE_TO_PHYS(m);
 	return (PHYS_TO_DMAP(pa));
 }
 
 void
 pmap_quick_remove_page(vm_offset_t addr)
 {
 	/* Intentionally empty: this implementation does no work for "addr". */
 }
 
 /*
  * Report whether the given pmap owns one of the first 16 PV entries
  * examined for this page.  The page's own PV list is scanned first and,
  * for non-fictitious pages, the list returned by pa_to_pvh() afterwards;
  * both scans share the same budget of 16 entries.  The limit may be
  * raised or lowered in the future; it is only necessary that true be
  * returned for a small subset of pmaps for proper page aging.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pv_entry_t pv;
 	int examined;
 	boolean_t found;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	found = FALSE;
 	examined = 0;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 
 	/* First pass: the page's own PV list. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			goto done;
 		}
 		if (++examined >= 16)
 			goto done;
 	}
 
 	/* Fictitious pages have no second list to search. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto done;
 
 	/* Second pass: the list obtained from pa_to_pvh(). */
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			break;
 		}
 		if (++examined >= 16)
 			break;
 	}
 done:
 	rw_runlock(lock);
 	return (found);
 }
 
 /*
  *	pmap_page_wired_mappings:
  *
  *	Return the number of managed mappings to the given physical page
  *	that are wired.
  */
 int
 pmap_page_wired_mappings(vm_page_t m)
 {
 	struct rwlock *lock;
 	struct md_page *pvh;
 	pmap_t pmap;
 	pt_entry_t *pte;
 	pv_entry_t pv;
 	int count, lvl, md_gen, pvh_gen;
 
 	/* Unmanaged pages are reported as having no wired mappings. */
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (0);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	/* The count starts over whenever the scan is restarted. */
 	count = 0;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock cannot be taken without blocking, drop
 		 * the PV list lock, block on the pmap lock, retake the PV
 		 * list lock, and restart if the list generation changed in
 		 * the meantime.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
 		if (pte != NULL && (pmap_load(pte) & ATTR_SW_WIRED) != 0)
 			count++;
 		PMAP_UNLOCK(pmap);
 	}
 	/*
 	 * Non-fictitious pages also have a PV list reached via pa_to_pvh();
 	 * scan it the same way, checking both generation counters after
 	 * the locks had to be dropped.
 	 */
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			if (!PMAP_TRYLOCK(pmap)) {
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			pte = pmap_pte(pmap, pv->pv_va, &lvl);
 			if (pte != NULL &&
 			    (pmap_load(pte) & ATTR_SW_WIRED) != 0)
 				count++;
 			PMAP_UNLOCK(pmap);
 		}
 	}
 	rw_runlock(lock);
 	return (count);
 }
 
 /*
  * Destroy all managed, non-wired mappings in the given user-space
  * pmap.  This pmap cannot be active on any processor besides the
  * caller.
  *
  * This function cannot be applied to the kernel pmap.  Moreover, it
  * is not intended for general use.  It is only to be used during
  * process termination.  Consequently, it can be implemented in ways
  * that make it faster than pmap_remove().  First, it can more quickly
  * destroy mappings by iterating over the pmap's collection of PV
  * entries, rather than searching the page table.  Second, it doesn't
  * have to test and clear the page table entries atomically, because
  * no processor is currently accessing the user address space.  In
  * particular, a page table entry's dirty bit won't change state once
  * this function starts.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte, tpte;
 	struct spglist free;
 	vm_page_t m, ml3, mt;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	struct pv_chunk *pc, *npc;
 	struct rwlock *lock;
 	int64_t bit;
 	uint64_t inuse, bitmask;
 	int allfree, field, freed, idx, lvl;
 	vm_paddr_t pa;
 
 	lock = NULL;
 
 	SLIST_INIT(&free);
 	PMAP_LOCK(pmap);
 	/*
 	 * Visit every PV chunk owned by the pmap.  Within a chunk, each bit
 	 * that is clear in pc_map[] (restricted to pc_freemask[]) marks a PV
 	 * entry that is in use.
 	 */
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		allfree = 1;
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = ffsl(inuse) - 1;
 				bitmask = 1UL << bit;
 				idx = field * 64 + bit;
 				pv = &pc->pc_pventry[idx];
 				inuse &= ~bitmask;
 
 				pde = pmap_pde(pmap, pv->pv_va, &lvl);
 				KASSERT(pde != NULL,
 				    ("Attempting to remove an unmapped page"));
 
 				/*
 				 * lvl 1 means the mapping is an L2 block;
 				 * lvl 2 means it is an L3 page.  The entry
 				 * is loaded exactly once into "tpte".
 				 */
 				switch(lvl) {
 				case 1:
 					pte = pmap_l1_to_l2(pde, pv->pv_va);
 					tpte = pmap_load(pte);
 					KASSERT((tpte & ATTR_DESCR_MASK) ==
 					    L2_BLOCK,
 					    ("Attempting to remove an invalid "
 					    "block: %lx", tpte));
 					break;
 				case 2:
 					pte = pmap_l2_to_l3(pde, pv->pv_va);
 					tpte = pmap_load(pte);
 					KASSERT((tpte & ATTR_DESCR_MASK) ==
 					    L3_PAGE,
 					    ("Attempting to remove an invalid "
 					     "page: %lx", tpte));
 					break;
 				default:
 					panic(
 					    "Invalid page directory level: %d",
 					    lvl);
 				}
 
 				/*
 				 * We cannot remove wired pages from a
 				 * process' mapping at this time.  Leaving
 				 * one behind keeps the chunk allocated.
 				 */
 				if (tpte & ATTR_SW_WIRED) {
 					allfree = 0;
 					continue;
 				}
 
 				pa = tpte & ~ATTR_MASK;
 
 				m = PHYS_TO_VM_PAGE(pa);
 				KASSERT(m->phys_addr == pa,
 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 				    m, (uintmax_t)m->phys_addr,
 				    (uintmax_t)tpte));
 
 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 				    m < &vm_page_array[vm_page_array_size],
 				    ("pmap_remove_pages: bad pte %#jx",
 				    (uintmax_t)tpte));
 
 				/*
 				 * Because this pmap is not active on other
 				 * processors, the dirty bit cannot have
 				 * changed state since we last loaded pte.
 				 */
 				pmap_clear(pte);
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 * A dirty block dirties every page it spans.
 				 */
 				if (pmap_pte_dirty(tpte)) {
 					switch (lvl) {
 					case 1:
 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
 							vm_page_dirty(mt);
 						break;
 					case 2:
 						vm_page_dirty(m);
 						break;
 					}
 				}
 
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 
 				/* Mark free */
 				pc->pc_map[field] |= bitmask;
 				switch (lvl) {
 				case 1:
 					pmap_resident_count_dec(pmap,
 					    L2_SIZE / PAGE_SIZE);
 					pvh = pa_to_pvh(tpte & ~ATTR_MASK);
 					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
 					pvh->pv_gen++;
 					/*
 					 * With the last block mapping gone,
 					 * clear PGA_WRITEABLE on each spanned
 					 * page that has no PV entries left.
 					 */
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
 							if ((mt->aflags & PGA_WRITEABLE) != 0 &&
 							    TAILQ_EMPTY(&mt->md.pv_list))
 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
 					}
 					/*
 					 * Queue the L3 page table page kept
 					 * for this block, if any, for freeing.
 					 */
 					ml3 = pmap_remove_pt_page(pmap,
 					    pv->pv_va);
 					if (ml3 != NULL) {
 						KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
 						    ("pmap_remove_pages: l3 page not promoted"));
 						pmap_resident_count_dec(pmap,1);
 						KASSERT(ml3->wire_count == NL3PG,
 						    ("pmap_remove_pages: l3 page wire count error"));
 						ml3->wire_count = 0;
 						pmap_add_delayed_free_list(ml3,
 						    &free, FALSE);
 					}
 					break;
 				case 2:
 					pmap_resident_count_dec(pmap, 1);
 					TAILQ_REMOVE(&m->md.pv_list, pv,
 					    pv_next);
 					m->md.pv_gen++;
 					if ((m->aflags & PGA_WRITEABLE) != 0 &&
 					    TAILQ_EMPTY(&m->md.pv_list) &&
 					    (m->flags & PG_FICTITIOUS) == 0) {
 						pvh = pa_to_pvh(
 						    VM_PAGE_TO_PHYS(m));
 						if (TAILQ_EMPTY(&pvh->pv_list))
 							vm_page_aflag_clear(m,
 							    PGA_WRITEABLE);
 					}
 					break;
 				}
 				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
 				    &free);
 				freed++;
 			}
 		}
 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		/* Release the chunk unless a wired mapping kept it in use. */
 		if (allfree) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			free_pv_chunk(pc);
 		}
 	}
 	pmap_invalidate_all(pmap);
 	if (lock != NULL)
 		rw_wunlock(lock);
 	PMAP_UNLOCK(pmap);
 	/* Free the collected page table pages after the locks are dropped. */
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * This is used to check if a page has been accessed or modified. As we
  * don't have a bit to see if it has been modified we have to assume it
  * has been if the page is read/write.
  */
 static boolean_t
 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 {
 	struct rwlock *lock;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	pt_entry_t *pte, mask, value;
 	pmap_t pmap;
 	int lvl, md_gen, pvh_gen;
 	boolean_t rv;
 
 	/*
 	 * Returns TRUE as soon as one mapping of "m" matches the requested
 	 * test, FALSE if none does.  "modified" tests for a read/write
 	 * access permission; "accessed" tests for ATTR_AF on a valid entry
 	 * of the expected type.
 	 */
 	rv = FALSE;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	/* First the page's own PV list, whose mappings are L3 pages. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock is contended, drop the PV list lock,
 		 * block on the pmap lock, retake the PV list lock, and
 		 * restart if the list generation changed meanwhile.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
 		KASSERT(lvl == 3,
 		    ("pmap_page_test_mappings: Invalid level %d", lvl));
 		/* Build the bit pattern the entry has to match. */
 		mask = 0;
 		value = 0;
 		if (modified) {
 			mask |= ATTR_AP_RW_BIT;
 			value |= ATTR_AP(ATTR_AP_RW);
 		}
 		if (accessed) {
 			mask |= ATTR_AF | ATTR_DESCR_MASK;
 			value |= ATTR_AF | L3_PAGE;
 		}
 		rv = (pmap_load(pte) & mask) == value;
 		PMAP_UNLOCK(pmap);
 		if (rv)
 			goto out;
 	}
 	/*
 	 * Then, for non-fictitious pages, the PV list reached through
 	 * pa_to_pvh(), whose mappings are L2 blocks.
 	 */
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			if (!PMAP_TRYLOCK(pmap)) {
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			pte = pmap_pte(pmap, pv->pv_va, &lvl);
 			KASSERT(lvl == 2,
 			    ("pmap_page_test_mappings: Invalid level %d", lvl));
 			mask = 0;
 			value = 0;
 			if (modified) {
 				mask |= ATTR_AP_RW_BIT;
 				value |= ATTR_AP(ATTR_AP_RW);
 			}
 			if (accessed) {
 				mask |= ATTR_AF | ATTR_DESCR_MASK;
 				value |= ATTR_AF | L2_BLOCK;
 			}
 			rv = (pmap_load(pte) & mask) == value;
 			PMAP_UNLOCK(pmap);
 			if (rv)
 				goto out;
 		}
 	}
 out:
 	rw_runlock(lock);
 	return (rv);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Return whether or not the specified physical page was modified
  *	in any physical maps.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 
 	/*
 	 * While the object is locked, PGA_WRITEABLE can only be set
 	 * concurrently on a page that is exclusive busied.  The mappings
 	 * therefore need to be examined only when the page is exclusive
 	 * busied or already has PGA_WRITEABLE set; in every other case no
 	 * PTE can be marked modified.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (vm_page_xbusied(m) || (m->aflags & PGA_WRITEABLE) != 0)
 		return (pmap_page_test_mappings(m, FALSE, TRUE));
 	return (FALSE);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	boolean_t rv;
 	int lvl;
 
 	/*
 	 * Return TRUE if and only if the L3 entry for the specified virtual
 	 * address is allocated but invalid.  An address that already has a
 	 * valid mapping is not a prefault candidate: pmap_enter_quick_locked()
 	 * aborts when it finds an existing mapping.  An address without an
 	 * L3 table (pmap_pde() does not report level 2) is not eligible
 	 * either.
 	 */
 	rv = FALSE;
 	PMAP_LOCK(pmap);
 	pde = pmap_pde(pmap, addr, &lvl);
 	if (pde != NULL && lvl == 2) {
 		pte = pmap_l2_to_l3(pde, addr);
 		rv = pmap_load(pte) == 0;
 	}
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  *	pmap_is_referenced:
  *
  *	Return whether or not the specified physical page was referenced
  *	in any physical maps.
  */
 boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 	boolean_t referenced;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 
 	/* Test the mappings for the accessed condition only. */
 	referenced = pmap_page_test_mappings(m, TRUE, FALSE);
 	return (referenced);
 }
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	struct md_page *pvh;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pv_entry_t next_pv, pv;
 	pt_entry_t oldpte, *pte;
 	vm_offset_t va;
 	int lvl, md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 
 	/*
 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 	 * set by another thread while the object is locked.  Thus,
 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* Fictitious pages use the pv_dummy list head instead of pa_to_pvh(). */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 retry_pv_loop:
 	rw_wlock(lock);
 	/*
 	 * Pass 1: for each entry on the pvh list whose page table entry has
 	 * ATTR_SW_DBM set, call pmap_demote_l2_locked() on that entry.
 	 */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock is contended, drop the PV list lock,
 		 * block on the pmap lock, retake the PV list lock, and
 		 * start over if the list generation changed meanwhile.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		va = pv->pv_va;
 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
 		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
 			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 		    ("inconsistent pv lock %p %p for page %p",
 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 		PMAP_UNLOCK(pmap);
 	}
 	/*
 	 * Pass 2: walk the page's own PV list and, for every entry with
 	 * ATTR_SW_DBM set, atomically set ATTR_AP_RW_BIT and clear
 	 * ATTR_SW_DBM.
 	 */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen ||
 			    md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		pte = pmap_pte(pmap, pv->pv_va, &lvl);
 		oldpte = pmap_load(pte);
 retry:
 		if ((oldpte & ATTR_SW_DBM) != 0) {
 			/*
 			 * atomic_fcmpset_long() refreshes "oldpte" when it
 			 * fails, so the test above is repeated on the
 			 * current value.
 			 */
 			if (!atomic_fcmpset_long(pte, &oldpte,
 			    (oldpte | ATTR_AP_RW_BIT) & ~ATTR_SW_DBM))
 				goto retry;
 			/*
 			 * The replaced entry had a read/write access
 			 * permission, so record the page as dirty.
 			 */
 			if ((oldpte & ATTR_AP_RW_BIT) ==
 			    ATTR_AP(ATTR_AP_RW))
 				vm_page_dirty(m);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
  *	to pmap_is_modified().  However, since this function stops after
  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  *	dirty pages.  Those dirty pages will only be detected by a future call
  *	to pmap_is_modified().
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv, pvf;
 	pmap_t pmap;
 	struct rwlock *lock;
 	pd_entry_t *pde, tpde;
 	pt_entry_t *pte, tpte;
 	vm_paddr_t pa;
 	int cleared, lvl, md_gen, not_cleared, pvh_gen;
 	struct spglist free;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	/*
 	 * NOTE(review): nothing in this function adds pages to "free"; the
 	 * list is still handed to vm_page_free_pages_toq() on exit.
 	 */
 	SLIST_INIT(&free);
 	cleared = 0;
 	pa = VM_PAGE_TO_PHYS(m);
 	lock = PHYS_TO_PV_LIST_LOCK(pa);
 	/* Fictitious pages use the pv_dummy list head instead of pa_to_pvh(). */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
 	rw_wlock(lock);
 retry:
 	not_cleared = 0;
 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 		goto small_mappings;
 	/*
 	 * First the L2 block mappings on the pvh list.  Each visited entry
 	 * is rotated to the tail, so the walk ends when the entry it
 	 * started from ("pvf") is back at the head.
 	 */
 	pv = pvf;
 	do {
 		if (pvf == NULL)
 			pvf = pv;
 		pmap = PV_PMAP(pv);
 		/*
 		 * If the pmap lock is contended, drop the PV list lock,
 		 * block on the pmap lock, retake the PV list lock, and
 		 * retry if the list generation changed meanwhile.
 		 */
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
 		KASSERT(pde != NULL, ("pmap_ts_referenced: no l1 table found"));
 		KASSERT(lvl == 1,
 		    ("pmap_ts_referenced: invalid pde level %d", lvl));
 		tpde = pmap_load(pde);
 		KASSERT((tpde & ATTR_DESCR_MASK) == L1_TABLE,
 		    ("pmap_ts_referenced: found an invalid l1 table"));
 		pte = pmap_l1_to_l2(pde, pv->pv_va);
 		tpte = pmap_load(pte);
 		if (pmap_pte_dirty(tpte)) {
 			/*
 			 * Although "tpte" is mapping a 2MB page, because
 			 * this function is called at a 4KB page granularity,
 			 * we only update the 4KB page under test.
 			 */
 			vm_page_dirty(m);
 		}
 
 		if ((tpte & ATTR_AF) != 0) {
 			/*
 			 * Since this reference bit is shared by 512 4KB pages,
 			 * it should not be cleared every time it is tested.
 			 * Apply a simple "hash" function on the physical page
 			 * number, the virtual superpage number, and the pmap
 			 * address to select one 4KB page out of the 512 on
 			 * which testing the reference bit will result in
 			 * clearing that reference bit.  This function is
 			 * designed to avoid the selection of the same 4KB page
 			 * for every 2MB page mapping.
 			 *
 			 * On demotion, a mapping that hasn't been referenced
 			 * is simply destroyed.  To avoid the possibility of a
 			 * subsequent page fault on a demoted wired mapping,
 			 * always leave its reference bit set.  Moreover,
 			 * since the superpage is wired, the current state of
 			 * its reference bit won't affect page replacement.
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
 			    (tpte & ATTR_SW_WIRED) == 0) {
 				pmap_clear_bits(pte, ATTR_AF);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 			pvh->pv_gen++;
 		}
 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 			goto out;
 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 small_mappings:
 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 		goto out;
 	/*
 	 * Then the L3 page mappings on the page's own PV list, using the
 	 * same rotate-until-back-at-"pvf" walk.
 	 */
 	pv = pvf;
 	do {
 		if (pvf == NULL)
 			pvf = pv;
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
 		KASSERT(pde != NULL, ("pmap_ts_referenced: no l2 table found"));
 		KASSERT(lvl == 2,
 		    ("pmap_ts_referenced: invalid pde level %d", lvl));
 		tpde = pmap_load(pde);
 		KASSERT((tpde & ATTR_DESCR_MASK) == L2_TABLE,
 		    ("pmap_ts_referenced: found an invalid l2 table"));
 		pte = pmap_l2_to_l3(pde, pv->pv_va);
 		tpte = pmap_load(pte);
 		if (pmap_pte_dirty(tpte))
 			vm_page_dirty(m);
 		if ((tpte & ATTR_AF) != 0) {
 			/* Wired mappings keep ATTR_AF but are still counted. */
 			if ((tpte & ATTR_SW_WIRED) == 0) {
 				pmap_clear_bits(pte, ATTR_AF);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 			m->md.pv_gen++;
 		}
 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 	    not_cleared < PMAP_TS_REFERENCED_MAX);
 out:
 	rw_wunlock(lock);
 	vm_page_free_pages_toq(&free, true);
 	return (cleared + not_cleared);
 }
 
 /*
  *	Apply the given advice to the specified range of addresses within the
  *	given pmap.  Depending on the advice, clear the referenced and/or
  *	modified flags in each mapping and set the mapped page's dirty field.
  *
  *	Only MADV_DONTNEED and MADV_FREE are acted upon; any other advice
  *	returns immediately.  The range [sva, eva) is walked one L2 entry at
  *	a time with the pmap locked.  A managed L2 block mapping is demoted
  *	before its L3 entries are examined; unmanaged blocks are skipped.
  *	For each managed L3 page, a dirty mapping has ATTR_AF cleared and is
  *	made read-only (after vm_page_dirty() for MADV_DONTNEED), while a
  *	clean but referenced mapping just has ATTR_AF cleared.  TLB
  *	invalidations are batched: "va" marks the start of the current run of
  *	modified entries and equals "va_next" when no run is pending.
  */
 void
 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 {
+	struct rwlock *lock;
+	vm_offset_t va, va_next;
+	vm_page_t m;
+	pd_entry_t *l0, *l1, *l2, oldl2;
+	pt_entry_t *l3, oldl3;
+
+	if (advice != MADV_DONTNEED && advice != MADV_FREE)
+		return;
+
+	PMAP_LOCK(pmap);
+	for (; sva < eva; sva = va_next) {
+		l0 = pmap_l0(pmap, sva);
+		if (pmap_load(l0) == 0) {
+			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
+			if (va_next < sva)
+				va_next = eva;	/* The address wrapped. */
+			continue;
+		}
+		l1 = pmap_l0_to_l1(l0, sva);
+		if (pmap_load(l1) == 0) {
+			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
+			if (va_next < sva)
+				va_next = eva;
+			continue;
+		}
+		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
+		if (va_next < sva)
+			va_next = eva;
+		l2 = pmap_l1_to_l2(l1, sva);
+		oldl2 = pmap_load(l2);
+		if (oldl2 == 0)
+			continue;
+		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
+			if ((oldl2 & ATTR_SW_MANAGED) == 0)
+				continue;
+			lock = NULL;
+			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
+				if (lock != NULL)
+					rw_wunlock(lock);
+
+				/*
+				 * The 2MB page mapping was destroyed.
+				 */
+				continue;
+			}
+
+			/*
+			 * Unless the page mappings are wired, remove the
+			 * mapping to a single page so that a subsequent
+			 * access may repromote.  Since the underlying page
+			 * table page is fully populated, this removal never
+			 * frees a page table page.
+			 */
+			if ((oldl2 & ATTR_SW_WIRED) == 0) {
+				l3 = pmap_l2_to_l3(l2, sva);
+				KASSERT(pmap_load(l3) != 0,
+				    ("pmap_advise: invalid PTE"));
+				pmap_remove_l3(pmap, l3, sva, pmap_load(l2),
+				    NULL, &lock);
+			}
+			if (lock != NULL)
+				rw_wunlock(lock);
+		}
+		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
+		    ("pmap_advise: invalid L2 entry after demotion"));
+		if (va_next > eva)
+			va_next = eva;
+		va = va_next;	/* No invalidation run is pending. */
+		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
+		    sva += L3_SIZE) {
+			oldl3 = pmap_load(l3);
+			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
+			    (ATTR_SW_MANAGED | L3_PAGE))
+				goto maybe_invlrng;
+			else if (pmap_pte_dirty(oldl3)) {
+				if (advice == MADV_DONTNEED) {
+					/*
+					 * Future calls to pmap_is_modified()
+					 * can be avoided by making the page
+					 * dirty now.
+					 */
+					m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK);
+					vm_page_dirty(m);
+				}
+				while (!atomic_fcmpset_long(l3, &oldl3,
+				    (oldl3 & ~ATTR_AF) | ATTR_AP(ATTR_AP_RO)))
+					cpu_spinwait();
+			} else if ((oldl3 & ATTR_AF) != 0)
+				pmap_clear_bits(l3, ATTR_AF);
+			else
+				goto maybe_invlrng;
+			if (va == va_next)
+				va = sva;	/* Start a new run at this page. */
+			continue;
+maybe_invlrng:
+			if (va != va_next) {
+				pmap_invalidate_range(pmap, va, sva);
+				va = va_next;	/* The run has been flushed. */
+			}
+		}
+		if (va != va_next)
+			pmap_invalidate_range(pmap, va, sva);
+	}
+	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Clear the modify bits on the specified physical page.
  *
  *	The page must be managed, its object write-locked, and the page not
  *	exclusive busied (all asserted below).  Mappings reached through the
  *	superpage PV head ("pvh") are handled first, at L2; the page's own PV
  *	list is handled second, at L3.  The page's PV list lock is held across
  *	both scans.  If a pmap lock cannot be taken without blocking, the PV
  *	list lock is dropped while waiting and the whole scan restarts if a
  *	PV list generation count changed in the meantime.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pmap_t pmap;
 	pv_entry_t next_pv, pv;
 	pd_entry_t *l2, oldl2;
 	pt_entry_t *l3, oldl3;
 	vm_offset_t va;
 	int md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	KASSERT(!vm_page_xbusied(m),
 	    ("pmap_clear_modify: page %p is exclusive busied", m));
 
 	/*
 	 * If the page is not PGA_WRITEABLE, then no PTEs can have ATTR_SW_DBM
 	 * set.  If the object containing the page is locked and the page is not
 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	/* Fictitious pages use the shared dummy head instead of a real one. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_wlock(lock);
 restart:
 	/* First scan: mappings recorded on pvh, examined at L2. */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Block on the pmap lock without holding the PV list
 			 * lock, then verify that the list did not change.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		va = pv->pv_va;
 		l2 = pmap_l2(pmap, va);
 		oldl2 = pmap_load(l2);
-		if ((oldl2 & ATTR_SW_DBM) != 0) {
-			if (pmap_demote_l2_locked(pmap, l2, va, &lock)) {
-				if ((oldl2 & ATTR_SW_WIRED) == 0) {
-					/*
-					 * Write protect the mapping to a
-					 * single page so that a subsequent
-					 * write access may repromote.
-					 */
-					va += VM_PAGE_TO_PHYS(m) -
-					    (oldl2 & ~ATTR_MASK);
-					l3 = pmap_l2_to_l3(l2, va);
-					oldl3 = pmap_load(l3);
-					if (pmap_l3_valid(oldl3)) {
-						while (!atomic_fcmpset_long(l3,
-						    &oldl3, (oldl3 & ~ATTR_SW_DBM) |
-						    ATTR_AP(ATTR_AP_RO)))
-							cpu_spinwait();
-						vm_page_dirty(m);
-						pmap_invalidate_page(pmap, va);
-					}
-				}
-			}
+		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
+		if ((oldl2 & ATTR_SW_DBM) != 0 &&
+		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
+		    (oldl2 & ATTR_SW_WIRED) == 0) {
+			/*
+			 * Write protect the mapping to a single page so that
+			 * a subsequent write access may repromote.
+			 */
+			va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK);
+			l3 = pmap_l2_to_l3(l2, va);
+			oldl3 = pmap_load(l3);
+			while (!atomic_fcmpset_long(l3, &oldl3,
+			    (oldl3 & ~ATTR_SW_DBM) | ATTR_AP(ATTR_AP_RO)))
+				cpu_spinwait();
+			vm_page_dirty(m);
+			pmap_invalidate_page(pmap, va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	/* Second scan: the page's own PV list, examined at L3. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		l2 = pmap_l2(pmap, pv->pv_va);
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		oldl3 = pmap_load(l3);
 		/*
 		 * Act only on a valid entry that has ATTR_SW_DBM set and
 		 * ATTR_AP_RW_BIT clear; such an entry is switched to
 		 * read-only and its TLB entry invalidated.
 		 */
 		if (pmap_l3_valid(oldl3) &&
 		    (oldl3 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
 			pmap_set_bits(l3, ATTR_AP(ATTR_AP_RO));
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 }
 
 /*
  * Map "size" bytes of physical memory starting at "pa" into the kernel's
  * address space and return the virtual address that corresponds to "pa".
  *
  * Before the VM system is initialized, whole L2 blocks are taken from the
  * reserved preinit_map_va region and recorded in pmap_preinit_mapping[].
  * In that mode NULL is returned if "size" is zero or preinit_map_va is not
  * set, and the function panics if no contiguous run of free slots is large
  * enough.  Afterwards, KVA is obtained from kva_alloc() and mapped with
  * pmap_kenter(); failure to allocate KVA panics.  Both paths use the
  * CACHED_MEMORY attribute.
  */
 void *
 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_offset_t va, offset;
 	pd_entry_t *pde;
 	pt_entry_t *l2;
 	int i, lvl, l2_blocks, free_l2_count, start_idx;
 
 	if (!vm_initialized) {
 		/*
 		 * No L3 ptables so map entire L2 blocks where start VA is:
 		 * 	preinit_map_va + start_idx * L2_SIZE
 		 * There may be duplicate mappings (multiple VA -> same PA) but
 		 * ARM64 dcache is always PIPT so that's acceptable.
 		 */
 		 if (size == 0)
 			 return (NULL);
 
 		 /* Calculate how many L2 blocks are needed for the mapping */
 		l2_blocks = (roundup2(pa + size, L2_SIZE) -
 		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
 
 		/* Offset of "pa" within its first L2 block. */
 		offset = pa & L2_OFFSET;
 
 		if (preinit_map_va == 0)
 			return (NULL);
 
 		/* Map 2MiB L2 blocks from reserved VA space */
 
 		free_l2_count = 0;
 		start_idx = -1;
 		/* Find enough free contiguous VA space */
 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 			ppim = pmap_preinit_mapping + i;
 			if (free_l2_count > 0 && ppim->pa != 0) {
 				/* Not enough space here */
 				free_l2_count = 0;
 				start_idx = -1;
 				continue;
 			}
 
 			/* A slot whose pa is zero is treated as unused. */
 			if (ppim->pa == 0) {
 				/* Free L2 block */
 				if (start_idx == -1)
 					start_idx = i;
 				free_l2_count++;
 				if (free_l2_count == l2_blocks)
 					break;
 			}
 		}
 		if (free_l2_count != l2_blocks)
 			panic("%s: too many preinit mappings", __func__);
 
 		va = preinit_map_va + (start_idx * L2_SIZE);
 		/*
 		 * Every slot of the run records the same pa, va and size;
 		 * pmap_unmapbios() finds the slots again by comparing va.
 		 */
 		for (i = start_idx; i < start_idx + l2_blocks; i++) {
 			/* Mark entries as allocated */
 			ppim = pmap_preinit_mapping + i;
 			ppim->pa = pa;
 			ppim->va = va + offset;
 			ppim->size = size;
 		}
 
 		/* Map L2 blocks */
 		pa = rounddown2(pa, L2_SIZE);
 		for (i = 0; i < l2_blocks; i++) {
 			pde = pmap_pde(kernel_pmap, va, &lvl);
 			KASSERT(pde != NULL,
 			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
 			    va));
 			KASSERT(lvl == 1,
 			    ("pmap_mapbios: Invalid level %d", lvl));
 
 			/* Insert L2_BLOCK */
 			l2 = pmap_l1_to_l2(pde, va);
 			pmap_load_store(l2,
 			    pa | ATTR_DEFAULT | ATTR_XN |
 			    ATTR_IDX(CACHED_MEMORY) | L2_BLOCK);
 
 			va += L2_SIZE;
 			pa += L2_SIZE;
 		}
 		pmap_invalidate_all(kernel_pmap);
 
 		/* The loop above advanced va; reset it to the run's start. */
 		va = preinit_map_va + (start_idx * L2_SIZE);
 
 	} else {
 		/* kva_alloc may be used to map the pages */
 		offset = pa & PAGE_MASK;
 		size = round_page(offset + size);
 
 		va = kva_alloc(size);
 		if (va == 0)
 			panic("%s: Couldn't allocate KVA", __func__);
 
 		pde = pmap_pde(kernel_pmap, va, &lvl);
 		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
 
 		/* L3 table is linked */
 		va = trunc_page(va);
 		pa = trunc_page(pa);
 		pmap_kenter(va, size, pa, CACHED_MEMORY);
 	}
 
 	/* Re-apply the sub-block or sub-page offset of the original "pa". */
 	return ((void *)(va + offset));
 }
 
 /*
  * Remove a mapping of "size" bytes at "va".  Slots of pmap_preinit_mapping[]
  * whose recorded va equals "va" are released and the corresponding L2 block
  * entries are cleared, followed by a full kernel TLB invalidation.  If no
  * slot matches and the VM system is initialized, the range is treated as a
  * kva_alloc() mapping: its pages are removed with pmap_kremove() and the KVA
  * is freed.  Otherwise nothing is done.
  */
 void
 pmap_unmapbios(vm_offset_t va, vm_size_t size)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_offset_t offset, tmpsize, va_trunc;
 	pd_entry_t *pde;
 	pt_entry_t *l2;
 	int i, lvl, l2_blocks, block;
 	bool preinit_map;
 
 	/* Number of L2 blocks touched by [va, va + size). */
 	l2_blocks =
 	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
 	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
 
 	/* Remove preinit mapping */
 	preinit_map = false;
 	block = 0;
 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 		ppim = pmap_preinit_mapping + i;
 		if (ppim->va == va) {
 			KASSERT(ppim->size == size,
 			    ("pmap_unmapbios: size mismatch"));
 			ppim->va = 0;
 			ppim->pa = 0;
 			ppim->size = 0;
 			preinit_map = true;
 			/* "block" counts the matching slots seen so far. */
 			offset = block * L2_SIZE;
 			va_trunc = rounddown2(va, L2_SIZE) + offset;
 
 			/* Remove L2_BLOCK */
 			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
 			KASSERT(pde != NULL,
 			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
 			    va_trunc));
 			l2 = pmap_l1_to_l2(pde, va_trunc);
 			pmap_clear(l2);
 
 			if (block == (l2_blocks - 1))
 				break;
 			block++;
 		}
 	}
 	if (preinit_map) {
 		pmap_invalidate_all(kernel_pmap);
 		return;
 	}
 
 	/* Unmap the pages reserved with kva_alloc. */
 	if (vm_initialized) {
 		offset = va & PAGE_MASK;
 		size = round_page(offset + size);
 		va = trunc_page(va);
 
 		pde = pmap_pde(kernel_pmap, va, &lvl);
 		KASSERT(pde != NULL,
 		    ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va));
 		KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl));
 
 		/* Unmap and invalidate the pages */
                 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
 			pmap_kremove(va + tmpsize);
 
 		kva_free(va, size);
 	}
 }
 
 /*
  * Record "ma" as the memory attribute of page "m".  For a page that is not
  * fictitious, the page's direct map alias is switched to the same attribute;
  * a failure to do so is fatal.
  */
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
 	vm_offset_t dmap_va;
 
 	m->md.pv_memattr = ma;
 
 	/* Only normal (non-fictitious) pages have their direct mapping updated. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	/*
 	 * The direct map update can be relied upon to perform any cache
 	 * operations that are required for data coherence.
 	 */
 	dmap_va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	if (pmap_change_attr(dmap_va, PAGE_SIZE, m->md.pv_memattr) != 0)
 		panic("memory attribute change on the direct map failed");
 }
 
 /*
  * Changes the memory type of the virtual address range [va, va + size) to
  * that given by the parameter "mode".
  *
  * This is the locking wrapper around pmap_change_attr_locked(): it acquires
  * the kernel pmap lock, performs the change, and releases the lock again.
  * The locked routine only accepts ranges that start within the direct map.
  *
  * Returns zero if the change completed successfully.  Otherwise the error
  * from pmap_change_attr_locked() is returned unchanged; that routine reports
  * an unmapped address, a range outside the direct map, and a failed
  * demotion all as EINVAL.  After a failure, the memory type may already
  * have been changed on some leading part of the range.
  */
 static int
 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
 {
 	int rv;
 
 	PMAP_LOCK(kernel_pmap);
 	rv = pmap_change_attr_locked(va, size, mode);
 	PMAP_UNLOCK(kernel_pmap);
 
 	return (rv);
 }
 
 /*
  * Set the memory attribute index of every page in [va, va + size) to
  * "mode".  The kernel pmap lock must be held by the caller, and the
  * page-aligned base of the range must lie within the direct map.
  *
  * A mapping that already carries the requested attribute is skipped as a
  * whole (L1 block, L2 block or L3 page).  Otherwise an L1 or L2 block is
  * demoted down to L3 and the L3 entry for the current page is rewritten.
  *
  * Returns 0 on success, or EINVAL if the range does not start in the direct
  * map, an address in the range is not mapped, or a demotion fails.
  */
 static int
 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
 {
 	vm_offset_t base, offset, tmpva;
 	pt_entry_t l3, *pte, *newpte;
 	int lvl;
 
 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
 	/* Widen the request to whole pages. */
 	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
 
 	if (!VIRT_IN_DMAP(base))
 		return (EINVAL);
 
 	for (tmpva = base; tmpva < base + size; ) {
 		pte = pmap_pte(kernel_pmap, tmpva, &lvl);
 		if (pte == NULL)
 			return (EINVAL);
 
 		if ((pmap_load(pte) & ATTR_IDX_MASK) == ATTR_IDX(mode)) {
 			/*
 			 * We already have the correct attribute,
 			 * ignore this entry.
 			 */
 			switch (lvl) {
 			default:
 				panic("Invalid DMAP table level: %d\n", lvl);
 			case 1:
 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
 				break;
 			case 2:
 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
 				break;
 			case 3:
 				tmpva += PAGE_SIZE;
 				break;
 			}
 		} else {
 			/*
 			 * Split the entry to a level 3 table, then
 			 * set the new attribute.
 			 *
 			 * NOTE(review): a failed demotion is reported as
 			 * EINVAL although the demotion routines fail when an
 			 * allocation fails; confirm whether ENOMEM was
 			 * intended.
 			 */
 			switch (lvl) {
 			default:
 				panic("Invalid DMAP table level: %d\n", lvl);
 			case 1:
 				newpte = pmap_demote_l1(kernel_pmap, pte,
 				    tmpva & ~L1_OFFSET);
 				if (newpte == NULL)
 					return (EINVAL);
 				pte = pmap_l1_to_l2(pte, tmpva);
 				/* FALLTHROUGH */
 			case 2:
 				newpte = pmap_demote_l2(kernel_pmap, pte,
 				    tmpva);
 				if (newpte == NULL)
 					return (EINVAL);
 				pte = pmap_l2_to_l3(pte, tmpva);
 				/* FALLTHROUGH */
 			case 3:
 				/* Update the entry */
 				l3 = pmap_load(pte);
 				l3 &= ~ATTR_IDX_MASK;
 				l3 |= ATTR_IDX(mode);
 				if (mode == DEVICE_MEMORY)
 					l3 |= ATTR_XN;
 
 				pmap_update_entry(kernel_pmap, pte, l3, tmpva,
 				    PAGE_SIZE);
 
 				/*
 				 * If moving to a non-cacheable entry flush
 				 * the cache.
 				 */
 				if (mode == VM_MEMATTR_UNCACHEABLE)
 					cpu_dcache_wbinv_range(tmpva, L3_SIZE);
 
 				break;
 			}
 			/* Only one page was changed; advance by one page. */
 			tmpva += PAGE_SIZE;
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Create an L2 table to map all addresses within an L1 mapping.
  *
  * "l1" must point at an unmanaged L1 block entry and "va" must be aligned
  * to L1_SIZE; the pmap lock must be held.  A new page table page is filled
  * with Ln_ENTRIES L2 entries that carry the attributes of the old L1 entry
  * and cover consecutive L2_SIZE pieces of the same physical range, and the
  * L1 entry is then replaced by a table entry that points at the new page.
  *
  * If the L1 entry is itself addressed through the range being demoted, the
  * page containing it is temporarily mapped at a scratch kernel virtual
  * address so that it stays reachable while the entry is updated.
  *
  * Returns a direct-map pointer to the new L2 table, or NULL if the scratch
  * address or the page table page could not be allocated.  On failure the L1
  * entry is left unmodified and no resources remain allocated.
  */
 static pt_entry_t *
 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
 {
 	pt_entry_t *l2, newl2, oldl1;
 	vm_offset_t tmpl1;
 	vm_paddr_t l2phys, phys;
 	vm_page_t ml2;
 	int i;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldl1 = pmap_load(l1);
 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
 	    ("pmap_demote_l1: Demoting a non-block entry"));
 	KASSERT((va & L1_OFFSET) == 0,
 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
 
 	/* Reserve a scratch address if "l1" lies within [va, va + L1_SIZE). */
 	tmpl1 = 0;
 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
 		tmpl1 = kva_alloc(PAGE_SIZE);
 		if (tmpl1 == 0)
 			return (NULL);
 	}
 
 	if ((ml2 = vm_page_alloc(NULL, 0, VM_ALLOC_INTERRUPT |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
 		    " in pmap %p", va, pmap);
 
 		/*
 		 * The scratch address was reserved above but nothing has
 		 * been mapped there yet, so releasing the KVA is sufficient.
 		 * Without this the reservation would be leaked.
 		 */
 		if (tmpl1 != 0)
 			kva_free(tmpl1, PAGE_SIZE);
 		return (NULL);
 	}
 
 	l2phys = VM_PAGE_TO_PHYS(ml2);
 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
 
 	/* Address the range points at */
 	phys = oldl1 & ~ATTR_MASK;
 	/* The attributes from the old l1 table to be copied */
 	newl2 = oldl1 & ATTR_MASK;
 
 	/* Create the new entries */
 	for (i = 0; i < Ln_ENTRIES; i++) {
 		l2[i] = newl2 | phys;
 		phys += L2_SIZE;
 	}
 	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
 	    ("Invalid l2 page (%lx != %lx)", l2[0],
 	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
 
 	/* Switch to the scratch alias of the L1 entry before updating it. */
 	if (tmpl1 != 0) {
 		pmap_kenter(tmpl1, PAGE_SIZE,
 		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET, CACHED_MEMORY);
 		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
 	}
 
 	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
 
 	if (tmpl1 != 0) {
 		pmap_kremove(tmpl1);
 		kva_free(tmpl1, PAGE_SIZE);
 	}
 
 	return (l2);
 }
 
 /*
  * Fill the Ln_ENTRIES entries of the L3 table starting at "firstl3".  The
  * first entry receives "newl3" and each following entry receives the
  * previous value plus L3_SIZE.
  */
 static void
 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
 {
 	int idx;
 
 	for (idx = 0; idx < Ln_ENTRIES; idx++, newl3 += L3_SIZE)
 		firstl3[idx] = newl3;
 }
 
 /*
  * Give up on demoting the L2 mapping at "va": remove the mapping with
  * pmap_remove_l2() and release any pages that the removal put on the
  * local free list.
  */
 static void
 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
     struct rwlock **lockp)
 {
 	struct spglist freed;
 	pd_entry_t l1e;
 
 	SLIST_INIT(&freed);
 	l1e = pmap_load(pmap_l1(pmap, va));
 	(void)pmap_remove_l2(pmap, l2, va, l1e, &freed, lockp);
 	vm_page_free_pages_toq(&freed, true);
 }
 
 /*
  * Create an L3 table to map all addresses within an L2 mapping.
  *
  * The pmap lock must be held and "l2" must point at an L2 block entry.
  * "lockp" is handed to the PV entry routines used here; the callers in this
  * file release "*lockp" afterwards if it is not NULL.
  *
  * Returns a direct-map pointer to the L3 table on success.  Returns NULL if
  * a needed scratch address cannot be allocated, or, after the 2MB mapping
  * has been removed through pmap_demote_l2_abort(), if the mapping was never
  * accessed or no page table page could be allocated.
  */
 static pt_entry_t *
 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pt_entry_t *l3, newl3, oldl2;
 	vm_offset_t tmpl2;
 	vm_paddr_t l3phys;
 	vm_page_t ml3;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	l3 = NULL;
 	oldl2 = pmap_load(l2);
 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
 	    ("pmap_demote_l2: Demoting a non-block entry"));
 	va &= ~L2_OFFSET;
 
 	/* Reserve a scratch address if "l2" lies within [va, va + L2_SIZE). */
 	tmpl2 = 0;
 	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
 		tmpl2 = kva_alloc(PAGE_SIZE);
 		if (tmpl2 == 0)
 			return (NULL);
 	}
 
 	/*
 	 * Invalidate the 2MB page mapping and return "failure" if the
 	 * mapping was never accessed.
 	 */
 	if ((oldl2 & ATTR_AF) == 0) {
 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
 		    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
 		pmap_demote_l2_abort(pmap, va, l2, lockp);
 		CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
 		    va, pmap);
 		goto fail;
 	}
 
 	/* Prefer a page table page saved by an earlier promotion, if any. */
 	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
 		    ("pmap_demote_l2: page table page for a wired mapping"
 		    " is missing"));
 
 		/*
 		 * If the page table page is missing and the mapping
 		 * is for a kernel address, the mapping must belong to
 		 * the direct map.  Page table pages are preallocated
 		 * for every other part of the kernel address space,
 		 * so the direct map region is the only part of the
 		 * kernel address space that must be handled here.
 		 */
 		KASSERT(va < VM_MAXUSER_ADDRESS || VIRT_IN_DMAP(va),
 		    ("pmap_demote_l2: No saved mpte for va %#lx", va));
 
 		/*
 		 * If the 2MB page mapping belongs to the direct map
 		 * region of the kernel's address space, then the page
 		 * allocation request specifies the highest possible
 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
 		 * priority is normal.
 		 */
 		ml3 = vm_page_alloc(NULL, pmap_l2_pindex(va),
 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : VM_ALLOC_NORMAL) |
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED);
 
 		/*
 		 * If the allocation of the new page table page fails,
 		 * invalidate the 2MB page mapping and return "failure".
 		 */
 		if (ml3 == NULL) {
 			pmap_demote_l2_abort(pmap, va, l2, lockp);
 			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			goto fail;
 		}
 
 		/* Extra accounting is done for user addresses only. */
 		if (va < VM_MAXUSER_ADDRESS) {
 			ml3->wire_count = NL3PG;
 			pmap_resident_count_inc(pmap, 1);
 		}
 	}
 	l3phys = VM_PAGE_TO_PHYS(ml3);
 	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
 	/* The first L3 entry: the old L2 entry with an L3 page descriptor. */
 	newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
 	KASSERT((oldl2 & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) !=
 	    (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM),
 	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));
 
 	/*
 	 * If the page table page is not leftover from an earlier promotion,
 	 * or the mapping attributes have changed, (re)initialize the L3 table.
 	 */
 	if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK))
 		pmap_fill_l3(l3, newl3);
 
 	/*
 	 * Map the temporary page so we don't lose access to the l2 table.
 	 */
 	if (tmpl2 != 0) {
 		pmap_kenter(tmpl2, PAGE_SIZE,
 		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET, CACHED_MEMORY);
 		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
 	}
 
 	/*
 	 * The spare PV entries must be reserved prior to demoting the
 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
 	 * of the L2 and the PV lists will be inconsistent, which can result
 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
 	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
 	 * PV entry for the 2MB page mapping that is being demoted.
 	 */
 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
 
 	/*
 	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
 	 * the 2MB page mapping.
 	 */
 	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
 
 	/*
 	 * Demote the PV entry.
 	 */
 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
 		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
 
 	atomic_add_long(&pmap_l2_demotions, 1);
 	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
 	    " in pmap %p %lx", va, pmap, l3[0]);
 
 	/*
 	 * Common exit for success and failure: "l3" is still NULL when
 	 * arriving here through "goto fail".
 	 */
 fail:
 	if (tmpl2 != 0) {
 		pmap_kremove(tmpl2);
 		kva_free(tmpl2, PAGE_SIZE);
 	}
 
 	return (l3);
 
 }
 
 /*
  * Demote the L2 mapping at "va" on behalf of a caller that does not track
  * a PV list lock.  Any lock left in place by pmap_demote_l2_locked() is
  * released before returning that routine's result.
  */
 static pt_entry_t *
 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
 {
 	struct rwlock *pvlock;
 	pt_entry_t *table;
 
 	pvlock = NULL;
 	table = pmap_demote_l2_locked(pmap, l2, va, &pvlock);
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 
 	return (table);
 }
 
 /*
  * perform the pmap work for mincore
  *
  * Looks up the mapping of "addr" in "pmap" and returns a mask of MINCORE_*
  * flags describing it, or 0 if there is no mapping.  A mapping found above
  * level 3 is additionally reported as MINCORE_SUPER.  For a managed mapping
  * that is not reported as both modified and referenced, the physical
  * address is locked through "locked_pa"; otherwise any lock held through
  * "locked_pa" is released.
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 {
 	pt_entry_t *pte, tpte;
 	vm_paddr_t mask, pa;
 	int lvl, val;
 	bool managed;
 
 	PMAP_LOCK(pmap);
 retry:
 	val = 0;
 	pte = pmap_pte(pmap, addr, &lvl);
 	if (pte != NULL) {
 		tpte = pmap_load(pte);
 
 		/* Select the in-mapping offset mask for this level. */
 		switch (lvl) {
 		case 3:
 			mask = L3_OFFSET;
 			break;
 		case 2:
 			mask = L2_OFFSET;
 			break;
 		case 1:
 			mask = L1_OFFSET;
 			break;
 		default:
 			panic("pmap_mincore: invalid level %d", lvl);
 		}
 
 		managed = (tpte & ATTR_SW_MANAGED) != 0;
 		val = MINCORE_INCORE;
 		if (lvl != 3)
 			val |= MINCORE_SUPER;
 		/*
 		 * A managed mapping counts as modified when it is dirty; an
 		 * unmanaged mapping counts as modified when it is writable.
 		 */
 		if ((managed && pmap_pte_dirty(tpte)) || (!managed &&
 		    (tpte & ATTR_AP_RW_BIT) == ATTR_AP(ATTR_AP_RW)))
 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 		if ((tpte & ATTR_AF) == ATTR_AF)
 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 
 		pa = (tpte & ~ATTR_MASK) | (addr & mask);
 	} else
 		managed = false;
 
 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 			goto retry;
 	} else
 		PA_UNLOCK_COND(*locked_pa);
 	PMAP_UNLOCK(pmap);
 
 	return (val);
 }
 
 /*
  * Install the address space of the process that owns "td".  Within a
  * critical section, the physical address of the pmap's L0 table is recorded
  * in the process's md_l0addr, written to TTBR0_EL1, and followed by
  * pmap_invalidate_all() on that pmap.
  */
 void
 pmap_activate(struct thread *td)
 {
 	pmap_t	pmap;
 
 	critical_enter();
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	td->td_proc->p_md.md_l0addr = vtophys(pmap->pm_l0);
 	/* Load the new table base; the isb follows the register write. */
 	__asm __volatile(
 	    "msr ttbr0_el1, %0	\n"
 	    "isb		\n"
 	    : : "r"(td->td_proc->p_md.md_l0addr));
 	pmap_invalidate_all(pmap);
 	critical_exit();
 }
 
 /*
  * Switch the per-CPU state from thread "old" to thread "new" and return the
  * pcb of "new".  The per-CPU curthread and curpcb are always updated.  The
  * translation table base is reloaded, the TLB invalidated, and the optional
  * per-CPU branch predictor hardening hook called only when "old" is NULL
  * or the two threads' processes have different md_l0addr values.
  */
 struct pcb *
 pmap_switch(struct thread *old, struct thread *new)
 {
 	pcpu_bp_harden bp_harden;
 	struct pcb *pcb;
 
 	/* Store the new curthread */
 	PCPU_SET(curthread, new);
 
 	/* And the new pcb */
 	pcb = new->td_pcb;
 	PCPU_SET(curpcb, pcb);
 
 	/*
 	 * TODO: We may need to flush the cache here if switching
 	 * to a user process.
 	 */
 
 	/* Threads whose processes share md_l0addr need no reload. */
 	if (old == NULL ||
 	    old->td_proc->p_md.md_l0addr != new->td_proc->p_md.md_l0addr) {
 		__asm __volatile(
 		    /* Switch to the new pmap */
 		    "msr	ttbr0_el1, %0	\n"
 		    "isb			\n"
 
 		    /* Invalidate the TLB */
 		    "dsb	ishst		\n"
 		    "tlbi	vmalle1is	\n"
 		    "dsb	ish		\n"
 		    "isb			\n"
 		    : : "r"(new->td_proc->p_md.md_l0addr));
 
 		/*
 		 * Stop userspace from training the branch predictor against
 		 * other processes. This will call into a CPU specific
 		 * function that clears the branch predictor state.
 		 */
 		bp_harden = PCPU_GET(bp_harden);
 		if (bp_harden != NULL)
 			bp_harden();
 	}
 
 	return (pcb);
 }
 
 /*
  * Synchronize the instruction cache for "sz" bytes starting at "va".  A
  * kernel address is synchronized directly through "va".  Any other range is
  * processed in pieces that do not cross a page boundary; each piece with a
  * non-zero physical address in "pmap" is synchronized through its direct
  * map address, and pieces without one are skipped.
  */
 void
 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
 {
 	vm_paddr_t phys;
 	u_int chunk, page_off;
 
 	if (va >= VM_MIN_KERNEL_ADDRESS) {
 		cpu_icache_sync_range(va, sz);
 		return;
 	}
 
 	/* The first piece runs at most to the end of the page holding va. */
 	page_off = va & PAGE_MASK;
 	for (chunk = imin(PAGE_SIZE - page_off, sz); sz != 0;
 	    chunk = imin(PAGE_SIZE, sz)) {
 		/* Extract the physical address & find it in the DMAP */
 		phys = pmap_extract(pmap, va);
 		if (phys != 0)
 			cpu_icache_sync_range(PHYS_TO_DMAP(phys), chunk);
 
 		/* Move to the next page */
 		sz -= chunk;
 		va += chunk;
 	}
 }
 
 /*
  * Try to resolve an abort entirely within the pmap layer.  "esr" is the
  * exception syndrome value and "far" the faulting address.  Returns
  * KERN_SUCCESS if the fault was handled here and KERN_FAILURE otherwise.
  *
  * Only instruction and data aborts are considered:
  *  - access flag fault: ATTR_AF is set on the existing mapping;
  *  - permission fault caused by a data write: a mapping that is read-only
  *    but has ATTR_SW_DBM set is made writable;
  *  - translation fault: the MMU is asked to translate the address again,
  *    and a successful translation is treated as success.
  */
 int
 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
 {
 	pt_entry_t *pte;
 	register_t intr;
 	uint64_t ec, par;
 	int lvl, rv;
 
 	rv = KERN_FAILURE;
 
 	/* Reject every exception class other than the four abort classes. */
 	ec = ESR_ELx_EXCEPTION(esr);
 	switch (ec) {
 	case EXCP_INSN_ABORT_L:
 	case EXCP_INSN_ABORT:
 	case EXCP_DATA_ABORT_L:
 	case EXCP_DATA_ABORT:
 		break;
 	default:
 		return (rv);
 	}
 
 	/* Data and insn aborts use same encoding for FSC field. */
 	switch (esr & ISS_DATA_DFSC_MASK) {
 	case ISS_DATA_DFSC_AFF_L1:
 	case ISS_DATA_DFSC_AFF_L2:
 	case ISS_DATA_DFSC_AFF_L3:
 		PMAP_LOCK(pmap);
 		pte = pmap_pte(pmap, far, &lvl);
 		if (pte != NULL) {
 			pmap_set_bits(pte, ATTR_AF);
 			rv = KERN_SUCCESS;
 			/*
 			 * XXXMJ as an optimization we could mark the entry
 			 * dirty if this is a write fault.
 			 */
 		}
 		PMAP_UNLOCK(pmap);
 		break;
 	case ISS_DATA_DFSC_PF_L1:
 	case ISS_DATA_DFSC_PF_L2:
 	case ISS_DATA_DFSC_PF_L3:
 		/* Only data aborts with ISS_DATA_WnR set are handled here. */
 		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
 		    (esr & ISS_DATA_WnR) == 0)
 			return (rv);
 		PMAP_LOCK(pmap);
 		pte = pmap_pte(pmap, far, &lvl);
 		/*
 		 * A read-only entry with ATTR_SW_DBM set is made writable by
 		 * clearing ATTR_AP_RW_BIT.
 		 */
 		if (pte != NULL &&
 		    (pmap_load(pte) & (ATTR_AP_RW_BIT | ATTR_SW_DBM)) ==
 		    (ATTR_AP(ATTR_AP_RO) | ATTR_SW_DBM)) {
 			pmap_clear_bits(pte, ATTR_AP_RW_BIT);
 			pmap_invalidate_page(pmap, trunc_page(far));
 			rv = KERN_SUCCESS;
 		}
 		PMAP_UNLOCK(pmap);
 		break;
 	case ISS_DATA_DFSC_TF_L0:
 	case ISS_DATA_DFSC_TF_L1:
 	case ISS_DATA_DFSC_TF_L2:
 	case ISS_DATA_DFSC_TF_L3:
 		PMAP_LOCK(pmap);
 		/* Ask the MMU to check the address */
 		intr = intr_disable();
 		if (pmap == kernel_pmap)
 			par = arm64_address_translate_s1e1r(far);
 		else
 			par = arm64_address_translate_s1e0r(far);
 		intr_restore(intr);
 		PMAP_UNLOCK(pmap);
 
 		/*
 		 * If the translation was successful the address was invalid
 		 * due to a break-before-make sequence. We can unlock and
 		 * return success to the trap handler.
 		 */
 		if (PAR_SUCCESS(par))
 			rv = KERN_SUCCESS;
 		break;
 	}
 
 	return (rv);
 }
 
 /*
  *	Move the starting virtual address of the given mapping upward when
  *	doing so gives it the same offset within an L2 superpage as the
  *	backing object offset, which might result in more superpage
  *	mappings.  The address is left alone if the mapping is too small to
  *	contain a whole superpage or if it is already suitably aligned.
  */
 void
 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 	vm_offset_t addr_offset, lead, superpage_offset;
 
 	if (size < L2_SIZE)
 		return;
 
 	/* Account for the colour of a coloured object. */
 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 		offset += ptoa(object->pg_color);
 	superpage_offset = offset & L2_OFFSET;
 
 	/* Bytes that precede the first superpage boundary of the mapping. */
 	lead = (L2_SIZE - superpage_offset) & L2_OFFSET;
 	if (size - lead < L2_SIZE)
 		return;
 
 	addr_offset = *addr & L2_OFFSET;
 	if (addr_offset == superpage_offset)
 		return;
 
 	if (addr_offset < superpage_offset)
 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
 	else
 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
 }
 
 /**
  * Obtain kernel virtual addresses for a set of physical pages.  Pages that
  * are covered by the DMAP are returned through their DMAP address.  For any
  * other page a page of kernel virtual address space is allocated; mapping
  * such a page is not implemented yet and ends in a panic.
  *
  * \param page        The pages the caller wishes to obtain the virtual
  *                    address on the kernel memory map.
  * \param vaddr       On return contains the kernel virtual memory address
  *                    of the pages passed in the page parameter.
  * \param count       Number of pages passed in.
  * \param can_fault   TRUE if the thread using the mapped pages can take
  *                    page faults, FALSE otherwise.
  *
  * \returns TRUE if the caller must call pmap_unmap_io_transient when
  *          finished or FALSE otherwise.
  *
  */
 boolean_t
 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	vm_paddr_t pa;
 	boolean_t transient;
 	int idx, rv;
 
 	/*
 	 * First pass: hand out DMAP addresses and allocate KVA for the rest.
 	 * This is kept separate from the second pass so that vmem_alloc is
 	 * never called while pinned.
 	 */
 	transient = FALSE;
 	for (idx = 0; idx < count; idx++) {
 		pa = VM_PAGE_TO_PHYS(page[idx]);
 		if (__predict_false(!PHYS_IN_DMAP(pa))) {
 			rv = vmem_alloc(kernel_arena, PAGE_SIZE,
 			    M_BESTFIT | M_WAITOK, &vaddr[idx]);
 			KASSERT(rv == 0, ("vmem_alloc failed: %d", rv));
 			transient = TRUE;
 			continue;
 		}
 		vaddr[idx] = PHYS_TO_DMAP(pa);
 	}
 
 	/* Nothing more to do when the DMAP covers every page. */
 	if (!transient)
 		return (FALSE);
 
 	if (!can_fault)
 		sched_pin();
 	for (idx = 0; idx < count; idx++) {
 		pa = VM_PAGE_TO_PHYS(page[idx]);
 		if (PHYS_IN_DMAP(pa))
 			continue;
 		panic(
 		   "pmap_map_io_transient: TODO: Map out of DMAP data");
 	}
 
 	return (transient);
 }
 
 /*
  * Counterpart of pmap_map_io_transient().  Unpins the thread when
  * "can_fault" is false.  Removing a mapping for a page outside the DMAP is
  * not implemented and ends in a panic.
  */
 void
 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	vm_paddr_t pa;
 	int idx;
 
 	if (!can_fault)
 		sched_unpin();
 
 	for (idx = 0; idx < count; idx++) {
 		pa = VM_PAGE_TO_PHYS(page[idx]);
 		if (PHYS_IN_DMAP(pa))
 			continue;
 		panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
 	}
 }
 
 /*
  * Report whether "mode" lies within the inclusive range
  * VM_MEMATTR_DEVICE through VM_MEMATTR_WRITE_THROUGH.  The pmap argument
  * is unused.
  */
 boolean_t
 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
 {
 
 	if (mode < VM_MEMATTR_DEVICE)
 		return (FALSE);
 	if (mode > VM_MEMATTR_WRITE_THROUGH)
 		return (FALSE);
 	return (TRUE);
 }
Index: projects/nfsv42/sys/cam/ata/ata_all.c
===================================================================
--- projects/nfsv42/sys/cam/ata/ata_all.c	(revision 350367)
+++ projects/nfsv42/sys/cam/ata/ata_all.c	(revision 350368)
@@ -1,1271 +1,1280 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2009 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 
 #ifdef _KERNEL
 #include "opt_scsi.h"
 
 #include <sys/systm.h>
 #include <sys/libkern.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #else
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #ifndef min
 #define min(a,b) (((a)<(b))?(a):(b))
 #endif
 #endif
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_queue.h>
 #include <cam/cam_xpt.h>
 #include <sys/ata.h>
 #include <cam/ata/ata_all.h>
 #include <sys/sbuf.h>
 #include <sys/endian.h>
 
 /*
  * Return the index (15..0) of the highest bit set in the low 16 bits of
  * ver.  The value 0xffff and a value with none of those bits set both
  * yield 0.
  */
 int
 ata_version(int ver)
 {
 	int bit;
 
 	if (ver == 0xffff)
 		return (0);
 
 	bit = 15;
 	while (bit >= 0) {
 		if ((ver & (1 << bit)) != 0)
 			return (bit);
 		bit--;
 	}
 	return (0);
 }
 
 /*
  * Return a constant, human-readable name for the command described by cmd.
  *
  * If bit 0x04 is set in cmd->control the result is "SOFT_RESET".  Otherwise
  * the name is selected by cmd->command and, for some opcodes, refined by
  * cmd->features, cmd->sector_count_exp or cmd->lba_low.  An opcode with no
  * entry in the table yields "UNKNOWN".  The returned pointer always refers
  * to a string literal; nothing is allocated.
  *
  * NOTE(review): the literal returned for SETFEATURES subcommand 0x50 is
  * spelled "BACKGROUD"; it is left untouched here because it is runtime
  * output.
  */
 char *
 ata_op_string(struct ata_cmd *cmd)
 {
 
 	if (cmd->control & 0x04)
 		return ("SOFT_RESET");
 	switch (cmd->command) {
 	case 0x00:
 		switch (cmd->features) {
 		case 0x00: return ("NOP FLUSHQUEUE");
 		case 0x01: return ("NOP AUTOPOLL");
 		}
 		return ("NOP");
 	case 0x03: return ("CFA_REQUEST_EXTENDED_ERROR");
 	case 0x06:
 		switch (cmd->features) {
 		case 0x01: return ("DSM TRIM");
 		}
 		return "DSM";
 	case 0x07:
 		switch (cmd->features) {
 		case 0x01: return ("DSM_XL TRIM");
 		}
 		return "DSM_XL";
 	case 0x08: return ("DEVICE_RESET");
 	case 0x0b: return ("REQUEST_SENSE_DATA_EXT");
 	case 0x12: return ("GET_PHYSICAL_ELEMENT_STATUS");
 	case 0x20: return ("READ");
 	case 0x24: return ("READ48");
 	case 0x25: return ("READ_DMA48");
 	case 0x26: return ("READ_DMA_QUEUED48");
 	case 0x27: return ("READ_NATIVE_MAX_ADDRESS48");
 	case 0x29: return ("READ_MUL48");
 	case 0x2a: return ("READ_STREAM_DMA48");
 	case 0x2b: return ("READ_STREAM48");
 	case 0x2f: return ("READ_LOG_EXT");
 	case 0x30: return ("WRITE");
 	case 0x34: return ("WRITE48");
 	case 0x35: return ("WRITE_DMA48");
 	case 0x36: return ("WRITE_DMA_QUEUED48");
 	case 0x37: return ("SET_MAX_ADDRESS48");
 	case 0x39: return ("WRITE_MUL48");
 	case 0x3a: return ("WRITE_STREAM_DMA48");
 	case 0x3b: return ("WRITE_STREAM48");
 	case 0x3d: return ("WRITE_DMA_FUA48");
 	case 0x3e: return ("WRITE_DMA_QUEUED_FUA48");
 	case 0x3f: return ("WRITE_LOG_EXT");
 	case 0x40: return ("READ_VERIFY");
 	case 0x42: return ("READ_VERIFY48");
 	case 0x44:
 		switch (cmd->features) {
 		case 0x01: return ("ZERO_EXT TRIM");
 		}
 		return "ZERO_EXT";
 	case 0x45:
 		switch (cmd->features) {
 		case 0x55: return ("WRITE_UNCORRECTABLE48 PSEUDO");
 		case 0xaa: return ("WRITE_UNCORRECTABLE48 FLAGGED");
 		}
 		return "WRITE_UNCORRECTABLE48";
 	case 0x47: return ("READ_LOG_DMA_EXT");
 	case 0x4a: return ("ZAC_MANAGEMENT_IN");
 	case 0x51: return ("CONFIGURE_STREAM");
 	case 0x57: return ("WRITE_LOG_DMA_EXT");
 	case 0x5b: return ("TRUSTED_NON_DATA");
 	case 0x5c: return ("TRUSTED_RECEIVE");
 	case 0x5d: return ("TRUSTED_RECEIVE_DMA");
 	case 0x5e: return ("TRUSTED_SEND");
 	case 0x5f: return ("TRUSTED_SEND_DMA");
 	case 0x60: return ("READ_FPDMA_QUEUED");
 	case 0x61: return ("WRITE_FPDMA_QUEUED");
 	case 0x63:
 		switch (cmd->features & 0xf) {
 		case 0x00: return ("NCQ_NON_DATA ABORT NCQ QUEUE");
 		case 0x01: return ("NCQ_NON_DATA DEADLINE HANDLING");
 		case 0x02: return ("NCQ_NON_DATA HYBRID DEMOTE BY SIZE");
 		case 0x03: return ("NCQ_NON_DATA HYBRID CHANGE BY LBA RANGE");
 		case 0x04: return ("NCQ_NON_DATA HYBRID CONTROL");
 		case 0x05: return ("NCQ_NON_DATA SET FEATURES");
 		/*
 		 * XXX KDM need common decoding between NCQ and non-NCQ
 		 * versions of SET FEATURES.
 		 */
 		case 0x06: return ("NCQ_NON_DATA ZERO EXT");
 		case 0x07: return ("NCQ_NON_DATA ZAC MANAGEMENT OUT");
 		}
 		return ("NCQ_NON_DATA");
 	case 0x64:
 		switch (cmd->sector_count_exp & 0xf) {
 		case 0x00: return ("SEND_FPDMA_QUEUED DATA SET MANAGEMENT");
 		case 0x01: return ("SEND_FPDMA_QUEUED HYBRID EVICT");
 		case 0x02: return ("SEND_FPDMA_QUEUED WRITE LOG DMA EXT");
 		case 0x03: return ("SEND_FPDMA_QUEUED ZAC MANAGEMENT OUT");
 		case 0x04: return ("SEND_FPDMA_QUEUED DATA SET MANAGEMENT XL");
 		}
 		return ("SEND_FPDMA_QUEUED");
 	case 0x65:
 		switch (cmd->sector_count_exp & 0xf) {
 		case 0x01: return ("RECEIVE_FPDMA_QUEUED READ LOG DMA EXT");
 		case 0x02: return ("RECEIVE_FPDMA_QUEUED ZAC MANAGEMENT IN");
 		}
 		return ("RECEIVE_FPDMA_QUEUED");
 	case 0x67:
 		if (cmd->features == 0xec)
 			return ("SEP_ATTN IDENTIFY");
 		switch (cmd->lba_low) {
 		case 0x00: return ("SEP_ATTN READ BUFFER");
 		case 0x02: return ("SEP_ATTN RECEIVE DIAGNOSTIC RESULTS");
 		case 0x80: return ("SEP_ATTN WRITE BUFFER");
 		case 0x82: return ("SEP_ATTN SEND DIAGNOSTIC");
 		}
 		return ("SEP_ATTN");
 	case 0x70: return ("SEEK");
 	case 0x77: return ("SET_DATE_TIME_EXT");
 	case 0x78:
 		switch (cmd->features) {
 		case 0x00: return ("GET_NATIVE_MAX_ADDRESS_EXT");
 		case 0x01: return ("SET_ACCESSIBLE_MAX_ADDRESS_EXT");
 		case 0x02: return ("FREEZE_ACCESSIBLE_MAX_ADDRESS_EXT");
 		}
 		return ("ACCESSIBLE_MAX_ADDRESS_CONFIGURATION");
 	case 0x7C: return ("REMOVE_ELEMENT_AND_TRUNCATE");
 	case 0x87: return ("CFA_TRANSLATE_SECTOR");
 	case 0x90: return ("EXECUTE_DEVICE_DIAGNOSTIC");
 	case 0x92: return ("DOWNLOAD_MICROCODE");
 	case 0x93: return ("DOWNLOAD_MICROCODE_DMA");
 	case 0x9a: return ("ZAC_MANAGEMENT_OUT");
 	case 0xa0: return ("PACKET");
 	case 0xa1: return ("ATAPI_IDENTIFY");
 	case 0xa2: return ("SERVICE");
 	case 0xb0:
 		switch(cmd->features) {
 		case 0xd0: return ("SMART READ ATTR VALUES");
 		case 0xd1: return ("SMART READ ATTR THRESHOLDS");
 		case 0xd3: return ("SMART SAVE ATTR VALUES");
 		case 0xd4: return ("SMART EXECUTE OFFLINE IMMEDIATE");
 		case 0xd5: return ("SMART READ LOG");
 		case 0xd6: return ("SMART WRITE LOG");
 		case 0xd8: return ("SMART ENABLE OPERATION");
 		case 0xd9: return ("SMART DISABLE OPERATION");
 		case 0xda: return ("SMART RETURN STATUS");
 		}
 		return ("SMART");
 	case 0xb1: return ("DEVICE CONFIGURATION");
 	case 0xb2: return ("SET_SECTOR_CONFIGURATION_EXT");
-	case 0xb4: return ("SANITIZE_DEVICE");
+	case 0xb4:
+		switch(cmd->features) {
+		case 0x00: return ("SANITIZE_STATUS_EXT");
+		case 0x11: return ("CRYPTO_SCRAMBLE_EXT");
+		case 0x12: return ("BLOCK_ERASE_EXT");
+		case 0x14: return ("OVERWRITE_EXT");
+		case 0x20: return ("SANITIZE_FREEZE_LOCK_EXT");
+		case 0x40: return ("SANITIZE_ANTIFREEZE_LOCK_EXT");
+		}
+		return ("SANITIZE_DEVICE");
 	case 0xc0: return ("CFA_ERASE");
 	case 0xc4: return ("READ_MUL");
 	case 0xc5: return ("WRITE_MUL");
 	case 0xc6: return ("SET_MULTI");
 	case 0xc7: return ("READ_DMA_QUEUED");
 	case 0xc8: return ("READ_DMA");
 	case 0xca: return ("WRITE_DMA");
 	case 0xcc: return ("WRITE_DMA_QUEUED");
 	case 0xcd: return ("CFA_WRITE_MULTIPLE_WITHOUT_ERASE");
 	case 0xce: return ("WRITE_MUL_FUA48");
 	case 0xd1: return ("CHECK_MEDIA_CARD_TYPE");
 	case 0xda: return ("GET_MEDIA_STATUS");
 	case 0xde: return ("MEDIA_LOCK");
 	case 0xdf: return ("MEDIA_UNLOCK");
 	case 0xe0: return ("STANDBY_IMMEDIATE");
 	case 0xe1: return ("IDLE_IMMEDIATE");
 	case 0xe2: return ("STANDBY");
 	case 0xe3: return ("IDLE");
 	case 0xe4: return ("READ_BUFFER/PM");
 	case 0xe5: return ("CHECK_POWER_MODE");
 	case 0xe6: return ("SLEEP");
 	case 0xe7: return ("FLUSHCACHE");
 	case 0xe8: return ("WRITE_BUFFER/PM");
 	case 0xe9: return ("READ_BUFFER_DMA");
 	case 0xea: return ("FLUSHCACHE48");
 	case 0xeb: return ("WRITE_BUFFER_DMA");
 	case 0xec: return ("ATA_IDENTIFY");
 	case 0xed: return ("MEDIA_EJECT");
 	case 0xef:
 		/*
 		 * XXX KDM need common decoding between NCQ and non-NCQ
 		 * versions of SET FEATURES.
 		 */
 		switch (cmd->features) {
 	        case 0x02: return ("SETFEATURES ENABLE WCACHE");
 	        case 0x03: return ("SETFEATURES SET TRANSFER MODE");
 		case 0x05: return ("SETFEATURES ENABLE APM");
 	        case 0x06: return ("SETFEATURES ENABLE PUIS");
 	        case 0x07: return ("SETFEATURES SPIN-UP");
 		case 0x0b: return ("SETFEATURES ENABLE WRITE READ VERIFY");
 		case 0x0c: return ("SETFEATURES ENABLE DEVICE LIFE CONTROL");
 	        case 0x10: return ("SETFEATURES ENABLE SATA FEATURE");
 		case 0x41: return ("SETFEATURES ENABLE FREEFALL CONTROL");
 		case 0x43: return ("SETFEATURES SET MAX HOST INT SECT TIMES");
 		case 0x45: return ("SETFEATURES SET RATE BASIS");
 		case 0x4a: return ("SETFEATURES EXTENDED POWER CONDITIONS");
 		case 0x50: return ("SETFEATURES ADVANCED BACKGROUD OPERATION");
 	        case 0x55: return ("SETFEATURES DISABLE RCACHE");
 		case 0x5d: return ("SETFEATURES ENABLE RELIRQ");
 		case 0x5e: return ("SETFEATURES ENABLE SRVIRQ");
 		case 0x62: return ("SETFEATURES LONG PHYS SECT ALIGN ERC");
 		case 0x63: return ("SETFEATURES DSN");
 		case 0x66: return ("SETFEATURES DISABLE DEFAULTS");
 	        case 0x82: return ("SETFEATURES DISABLE WCACHE");
 	        case 0x85: return ("SETFEATURES DISABLE APM");
 	        case 0x86: return ("SETFEATURES DISABLE PUIS");
 		case 0x8b: return ("SETFEATURES DISABLE WRITE READ VERIFY");
 		case 0x8c: return ("SETFEATURES DISABLE DEVICE LIFE CONTROL");
 	        case 0x90: return ("SETFEATURES DISABLE SATA FEATURE");
 	        case 0xaa: return ("SETFEATURES ENABLE RCACHE");
 		case 0xC1: return ("SETFEATURES DISABLE FREEFALL CONTROL");
 		case 0xC3: return ("SETFEATURES SENSE DATA REPORTING");
 		case 0xC4: return ("SETFEATURES NCQ SENSE DATA RETURN");
 		case 0xCC: return ("SETFEATURES ENABLE DEFAULTS");
 		case 0xdd: return ("SETFEATURES DISABLE RELIRQ");
 		case 0xde: return ("SETFEATURES DISABLE SRVIRQ");
 	        }
 	        return "SETFEATURES";
 	case 0xf1: return ("SECURITY_SET_PASSWORD");
 	case 0xf2: return ("SECURITY_UNLOCK");
 	case 0xf3: return ("SECURITY_ERASE_PREPARE");
 	case 0xf4: return ("SECURITY_ERASE_UNIT");
 	case 0xf5: return ("SECURITY_FREEZE_LOCK");
 	case 0xf6: return ("SECURITY_DISABLE_PASSWORD");
 	case 0xf8: return ("READ_NATIVE_MAX_ADDRESS");
 	case 0xf9: return ("SET_MAX_ADDRESS");
 	}
 	return "UNKNOWN";
 }
 
 /*
  * Format the registers of cmd as hex bytes into the caller's buffer
  * cmd_string (len bytes) and return the formatted text.  Returns "" when
  * len is 0 or when sbuf_finish() fails with anything other than ENOMEM.
  */
 char *
 ata_cmd_string(struct ata_cmd *cmd, char *cmd_string, size_t len)
 {
 	struct sbuf sb;
 
 	if (len == 0)
 		return ("");
 
 	sbuf_new(&sb, cmd_string, len, SBUF_FIXEDLEN);
 	ata_cmd_sbuf(cmd, &sb);
 
 	/* ENOMEM is accepted alongside success; other errors give "". */
 	switch (sbuf_finish(&sb)) {
 	case 0:
 	case ENOMEM:
 		return (sbuf_data(&sb));
 	default:
 		return ("");
 	}
 }
 
 /*
  * Append the twelve command registers of cmd to sb as space-separated
  * two-digit hex values, in the order command, features, lba_low, lba_mid,
  * lba_high, device, the three lba *_exp bytes, features_exp, sector_count
  * and sector_count_exp.
  */
 void
 ata_cmd_sbuf(struct ata_cmd *cmd, struct sbuf *sb)
 {
 	sbuf_printf(sb,
 	    "%02x %02x %02x %02x %02x %02x "
 	    "%02x %02x %02x %02x %02x %02x",
 	    cmd->command,
 	    cmd->features,
 	    cmd->lba_low,
 	    cmd->lba_mid,
 	    cmd->lba_high,
 	    cmd->device,
 	    cmd->lba_low_exp,
 	    cmd->lba_mid_exp,
 	    cmd->lba_high_exp,
 	    cmd->features_exp,
 	    cmd->sector_count,
 	    cmd->sector_count_exp);
 }
 
 /*
  * Format the registers of res as hex bytes into the caller's buffer
  * res_string (len bytes) and return the formatted text.  Returns "" when
  * len is 0 or when sbuf_finish() fails with anything other than ENOMEM.
  */
 char *
 ata_res_string(struct ata_res *res, char *res_string, size_t len)
 {
 	struct sbuf sb;
 
 	if (len == 0)
 		return ("");
 
 	sbuf_new(&sb, res_string, len, SBUF_FIXEDLEN);
 	ata_res_sbuf(res, &sb);
 
 	/* ENOMEM is accepted alongside success; other errors give "". */
 	switch (sbuf_finish(&sb)) {
 	case 0:
 	case ENOMEM:
 		return (sbuf_data(&sb));
 	default:
 		return ("");
 	}
 }
 
 int
 ata_res_sbuf(struct ata_res *res, struct sbuf *sb)
 {
 
 	sbuf_printf(sb, "%02x %02x %02x %02x "
 	    "%02x %02x %02x %02x %02x %02x %02x",
 	    res->status, res->error,
 	    res->lba_low, res->lba_mid, res->lba_high, res->device,
 	    res->lba_low_exp, res->lba_mid_exp, res->lba_high_exp,
 	    res->sector_count, res->sector_count_exp);
 
 	return (0);
 }
 
 /*
  * ata_command_sbuf() appends the command name, the text ". ACB: " and the
  * hex dump of the command registers to sb.  As written it always returns 0.
  */
 int
 ata_command_sbuf(struct ccb_ataio *ataio, struct sbuf *sb)
 {
 	struct ata_cmd *cmd;
 
 	cmd = &ataio->cmd;
 	sbuf_printf(sb, "%s. ACB: ", ata_op_string(cmd));
 	ata_cmd_sbuf(cmd, sb);
 
 	return (0);
 }
 
 /*
  * ata_status_abuf() returns 0 for success and -1 for failure.
  */
 int
 ata_status_sbuf(struct ccb_ataio *ataio, struct sbuf *sb)
 {
 
 	sbuf_printf(sb, "ATA status: %02x (%s%s%s%s%s%s%s%s)",
 	    ataio->res.status,
 	    (ataio->res.status & 0x80) ? "BSY " : "",
 	    (ataio->res.status & 0x40) ? "DRDY " : "",
 	    (ataio->res.status & 0x20) ? "DF " : "",
 	    (ataio->res.status & 0x10) ? "SERV " : "",
 	    (ataio->res.status & 0x08) ? "DRQ " : "",
 	    (ataio->res.status & 0x04) ? "CORR " : "",
 	    (ataio->res.status & 0x02) ? "IDX " : "",
 	    (ataio->res.status & 0x01) ? "ERR" : "");
 	if (ataio->res.status & 1) {
 	    sbuf_printf(sb, ", error: %02x (%s%s%s%s%s%s%s%s)",
 		ataio->res.error,
 		(ataio->res.error & 0x80) ? "ICRC " : "",
 		(ataio->res.error & 0x40) ? "UNC " : "",
 		(ataio->res.error & 0x20) ? "MC " : "",
 		(ataio->res.error & 0x10) ? "IDNF " : "",
 		(ataio->res.error & 0x08) ? "MCR " : "",
 		(ataio->res.error & 0x04) ? "ABRT " : "",
 		(ataio->res.error & 0x02) ? "NM " : "",
 		(ataio->res.error & 0x01) ? "ILI" : "");
 	}
 
 	return(0);
 }
 
 /*
  * Print a one-line description of the device: the short identity from
  * ata_print_ident_short(), then the protocol/version label and, when the
  * SATA capabilities word is neither 0 nor 0xffff, a SATA generation label.
  */
 void
 ata_print_ident(struct ata_params *ident_data)
 {
 	const char *proto;
 	char ata[12], sata[12];
 	int version;
 
 	ata_print_ident_short(ident_data);
 
 	if (ident_data->config == ATA_PROTO_CFA)
 		proto = "CFA";
 	else if (ident_data->config & ATA_PROTO_ATAPI)
 		proto = "ATAPI";
 	else
 		proto = "ATA";
 
 	/* Versions 1-7 print as "-N", 8 as "8-ACS", higher as "ACS-(N-7)". */
 	version = ata_version(ident_data->version_major);
 	if (version == 0)
 		snprintf(ata, sizeof(ata), "%s", proto);
 	else if (version <= 7)
 		snprintf(ata, sizeof(ata), "%s-%d", proto, version);
 	else if (version == 8)
 		snprintf(ata, sizeof(ata), "%s8-ACS", proto);
 	else
 		snprintf(ata, sizeof(ata), "ACS-%d %s", version - 7, proto);
 
 	sata[0] = 0;
 	if (ident_data->satacapabilities != 0 &&
 	    ident_data->satacapabilities != 0xffff) {
 		if (ident_data->satacapabilities & ATA_SATA_GEN3)
 			snprintf(sata, sizeof(sata), " SATA 3.x");
 		else if (ident_data->satacapabilities & ATA_SATA_GEN2)
 			snprintf(sata, sizeof(sata), " SATA 2.x");
 		else if (ident_data->satacapabilities & ATA_SATA_GEN1)
 			snprintf(sata, sizeof(sata), " SATA 1.x");
 		else
 			snprintf(sata, sizeof(sata), " SATA");
 	}
 
 	printf(" %s%s device\n", ata, sata);
 }
 
 /*
  * sbuf flavour of ata_print_ident(): append the short identity, a space,
  * the protocol/version label, an optional SATA generation label and
  * " device\n" to sb.
  */
 void
 ata_print_ident_sbuf(struct ata_params *ident_data, struct sbuf *sb)
 {
 	const char *proto, *sata;
 	int version;
 
 	ata_print_ident_short_sbuf(ident_data, sb);
 	sbuf_printf(sb, " ");
 
 	if (ident_data->config == ATA_PROTO_CFA)
 		proto = "CFA";
 	else if (ident_data->config & ATA_PROTO_ATAPI)
 		proto = "ATAPI";
 	else
 		proto = "ATA";
 
 	/* Versions 1-7 print as "-N", 8 as "8-ACS", anything else as ACS. */
 	version = ata_version(ident_data->version_major);
 	if (version == 0)
 		sbuf_printf(sb, "%s", proto);
 	else if (version >= 1 && version <= 7)
 		sbuf_printf(sb, "%s-%d", proto, version);
 	else if (version == 8)
 		sbuf_printf(sb, "%s8-ACS", proto);
 	else
 		sbuf_printf(sb, "ACS-%d %s", version - 7, proto);
 
 	sata = "";
 	if (ident_data->satacapabilities != 0 &&
 	    ident_data->satacapabilities != 0xffff) {
 		if (ident_data->satacapabilities & ATA_SATA_GEN3)
 			sata = " SATA 3.x";
 		else if (ident_data->satacapabilities & ATA_SATA_GEN2)
 			sata = " SATA 2.x";
 		else if (ident_data->satacapabilities & ATA_SATA_GEN1)
 			sata = " SATA 1.x";
 		else
 			sata = " SATA";
 	}
 	sbuf_printf(sb, "%s device\n", sata);
 }
 
 /*
  * Print "<model revision>" for the device, after passing both identify
  * strings through cam_strvis().
  */
 void
 ata_print_ident_short(struct ata_params *ident_data)
 {
 	char model_txt[48];
 	char rev_txt[16];
 
 	cam_strvis(rev_txt, ident_data->revision,
 	    sizeof(ident_data->revision), sizeof(rev_txt));
 	cam_strvis(model_txt, ident_data->model,
 	    sizeof(ident_data->model), sizeof(model_txt));
 
 	printf("<%s %s>", model_txt, rev_txt);
 }
 
 /*
  * Append "<model revision>" to sb.  The model and revision strings are
  * emitted through cam_strvis_sbuf(); the brackets and the separating
  * space are written directly.
  */
 void
 ata_print_ident_short_sbuf(struct ata_params *ident_data, struct sbuf *sb)
 {
 
 	sbuf_printf(sb, "<");
 	cam_strvis_sbuf(sb, ident_data->model, sizeof(ident_data->model), 0);
 	sbuf_printf(sb, " ");
 	cam_strvis_sbuf(sb, ident_data->revision, sizeof(ident_data->revision), 0);
 	sbuf_printf(sb, ">");
 }
 
 /*
  * Print a one-line description of an enclosure device: the short identity
  * from semb_print_ident_short(), then " SEMB <interface id> <interface
  * rev> device".
  */
 void
 semb_print_ident(struct sep_identify_data *ident_data)
 {
 	char iface_id[7];
 	char iface_rev[5];
 
 	semb_print_ident_short(ident_data);
 
 	cam_strvis(iface_rev, ident_data->interface_rev, 4,
 	    sizeof(iface_rev));
 	cam_strvis(iface_id, ident_data->interface_id, 6,
 	    sizeof(iface_id));
 
 	printf(" SEMB %s %s device\n", iface_id, iface_rev);
 }
 
 /*
  * sbuf flavour of semb_print_ident(): append the short identity followed
  * by " SEMB <interface id> <interface rev> device\n" to sb.  Six bytes of
  * interface_id and four bytes of interface_rev are emitted through
  * cam_strvis_sbuf().
  */
 void
 semb_print_ident_sbuf(struct sep_identify_data *ident_data, struct sbuf *sb)
 {
 
 	semb_print_ident_short_sbuf(ident_data, sb);
 
 	sbuf_printf(sb, " SEMB ");
 	cam_strvis_sbuf(sb, ident_data->interface_id, 6, 0);
 	sbuf_printf(sb, " ");
 	cam_strvis_sbuf(sb, ident_data->interface_rev, 4, 0);
 	sbuf_printf(sb, " device\n");
 }
 
 /*
  * Print "<vendor product revision firmware>" for an enclosure device,
  * after passing each identify field through cam_strvis().
  */
 void
 semb_print_ident_short(struct sep_identify_data *ident_data)
 {
 	char vend_txt[9];
 	char prod_txt[17];
 	char rev_txt[5];
 	char fw_txt[5];
 
 	cam_strvis(fw_txt, ident_data->firmware_rev, 4, sizeof(fw_txt));
 	cam_strvis(rev_txt, ident_data->product_rev, 4, sizeof(rev_txt));
 	cam_strvis(prod_txt, ident_data->product_id, 16, sizeof(prod_txt));
 	cam_strvis(vend_txt, ident_data->vendor_id, 8, sizeof(vend_txt));
 
 	printf("<%s %s %s %s>", vend_txt, prod_txt, rev_txt, fw_txt);
 }
 
 /*
  * Append "<vendor product revision firmware>" to sb.  The four identify
  * fields (8, 16, 4 and 4 bytes respectively) are emitted through
  * cam_strvis_sbuf(); brackets and separators are written directly.
  */
 void
 semb_print_ident_short_sbuf(struct sep_identify_data *ident_data, struct sbuf *sb)
 {
 
 	sbuf_printf(sb, "<");
 	cam_strvis_sbuf(sb, ident_data->vendor_id, 8, 0);
 	sbuf_printf(sb, " ");
 	cam_strvis_sbuf(sb, ident_data->product_id, 16, 0);
 	sbuf_printf(sb, " ");
 	cam_strvis_sbuf(sb, ident_data->product_rev, 4, 0);
 	sbuf_printf(sb, " ");
 	cam_strvis_sbuf(sb, ident_data->firmware_rev, 4, 0);
 	sbuf_printf(sb, ">");
 }
 
 uint32_t
 ata_logical_sector_size(struct ata_params *ident_data)
 {
 	if ((ident_data->pss & ATA_PSS_VALID_MASK) == ATA_PSS_VALID_VALUE &&
 	    (ident_data->pss & ATA_PSS_LSSABOVE512)) {
 		return (((u_int32_t)ident_data->lss_1 |
 		    ((u_int32_t)ident_data->lss_2 << 16)) * 2);
 	}
 	return (512);
 }
 
 /*
  * Return the physical sector size in bytes.  With a valid pss word this is
  * the logical sector size, multiplied by 2^(pss & ATA_PSS_LSPPS) when
  * ATA_PSS_MULTLS is set.  With an invalid pss word it is 512.
  */
 uint64_t
 ata_physical_sector_size(struct ata_params *ident_data)
 {
 	uint64_t size;
 
 	if ((ident_data->pss & ATA_PSS_VALID_MASK) != ATA_PSS_VALID_VALUE)
 		return (512);
 
 	size = (uint64_t)ata_logical_sector_size(ident_data);
 	if (ident_data->pss & ATA_PSS_MULTLS)
 		size = size * (1 << (ident_data->pss & ATA_PSS_LSPPS));
 	return (size);
 }
 
 /*
  * Return the logical sector offset in bytes: the low 14 bits of lsalign
  * times the logical sector size, but only when the top two bits of
  * lsalign read 01.  Otherwise the offset is 0.
  */
 uint64_t
 ata_logical_sector_offset(struct ata_params *ident_data)
 {
 	uint64_t lss;
 
 	if ((ident_data->lsalign & 0xc000) != 0x4000)
 		return (0);
 
 	lss = (uint64_t)ata_logical_sector_size(ident_data);
 	return (lss * (ident_data->lsalign & 0x3fff));
 }
 
 void
 ata_28bit_cmd(struct ccb_ataio *ataio, uint8_t cmd, uint8_t features,
     uint32_t lba, uint8_t sector_count)
 {
 	bzero(&ataio->cmd, sizeof(ataio->cmd));
 	ataio->cmd.flags = 0;
 	if (cmd == ATA_READ_DMA ||
 	    cmd == ATA_READ_DMA_QUEUED ||
 	    cmd == ATA_WRITE_DMA ||
 	    cmd == ATA_WRITE_DMA_QUEUED ||
 	    cmd == ATA_TRUSTED_RECEIVE_DMA ||
 	    cmd == ATA_TRUSTED_SEND_DMA ||
 	    cmd == ATA_DOWNLOAD_MICROCODE_DMA ||
 	    cmd == ATA_READ_BUFFER_DMA ||
 	    cmd == ATA_WRITE_BUFFER_DMA)
 		ataio->cmd.flags |= CAM_ATAIO_DMA;
 	ataio->cmd.command = cmd;
 	ataio->cmd.features = features;
 	ataio->cmd.lba_low = lba;
 	ataio->cmd.lba_mid = lba >> 8;
 	ataio->cmd.lba_high = lba >> 16;
 	ataio->cmd.device = ATA_DEV_LBA | ((lba >> 24) & 0x0f);
 	ataio->cmd.sector_count = sector_count;
 }
 
 void
 ata_48bit_cmd(struct ccb_ataio *ataio, uint8_t cmd, uint16_t features,
     uint64_t lba, uint16_t sector_count)
 {
 
 	ataio->cmd.flags = CAM_ATAIO_48BIT;
 	if (cmd == ATA_READ_DMA48 ||
 	    cmd == ATA_READ_DMA_QUEUED48 ||
 	    cmd == ATA_READ_STREAM_DMA48 ||
 	    cmd == ATA_WRITE_DMA48 ||
 	    cmd == ATA_WRITE_DMA_FUA48 ||
 	    cmd == ATA_WRITE_DMA_QUEUED48 ||
 	    cmd == ATA_WRITE_DMA_QUEUED_FUA48 ||
 	    cmd == ATA_WRITE_STREAM_DMA48 ||
 	    cmd == ATA_DATA_SET_MANAGEMENT ||
 	    cmd == ATA_READ_LOG_DMA_EXT ||
 	    cmd == ATA_WRITE_LOG_DMA_EXT)
 		ataio->cmd.flags |= CAM_ATAIO_DMA;
 	ataio->cmd.command = cmd;
 	ataio->cmd.features = features;
 	ataio->cmd.lba_low = lba;
 	ataio->cmd.lba_mid = lba >> 8;
 	ataio->cmd.lba_high = lba >> 16;
 	ataio->cmd.device = ATA_DEV_LBA;
 	ataio->cmd.lba_low_exp = lba >> 24;
 	ataio->cmd.lba_mid_exp = lba >> 32;
 	ataio->cmd.lba_high_exp = lba >> 40;
 	ataio->cmd.features_exp = features >> 8;
 	ataio->cmd.sector_count = sector_count;
 	ataio->cmd.sector_count_exp = sector_count >> 8;
 	ataio->cmd.control = 0;
 }
 
 void
 ata_ncq_cmd(struct ccb_ataio *ataio, uint8_t cmd,
     uint64_t lba, uint16_t sector_count)
 {
 
 	ataio->cmd.flags = CAM_ATAIO_48BIT | CAM_ATAIO_FPDMA;
 	ataio->cmd.command = cmd;
 	ataio->cmd.features = sector_count;
 	ataio->cmd.lba_low = lba;
 	ataio->cmd.lba_mid = lba >> 8;
 	ataio->cmd.lba_high = lba >> 16;
 	ataio->cmd.device = ATA_DEV_LBA;
 	ataio->cmd.lba_low_exp = lba >> 24;
 	ataio->cmd.lba_mid_exp = lba >> 32;
 	ataio->cmd.lba_high_exp = lba >> 40;
 	ataio->cmd.features_exp = sector_count >> 8;
 	ataio->cmd.sector_count = 0;
 	ataio->cmd.sector_count_exp = 0;
 	ataio->cmd.control = 0;
 }
 
 void
 ata_reset_cmd(struct ccb_ataio *ataio)
 {
 	bzero(&ataio->cmd, sizeof(ataio->cmd));
 	ataio->cmd.flags = CAM_ATAIO_CONTROL | CAM_ATAIO_NEEDRESULT;
 	ataio->cmd.control = 0x04;
 }
 
 void
 ata_pm_read_cmd(struct ccb_ataio *ataio, int reg, int port)
 {
 	bzero(&ataio->cmd, sizeof(ataio->cmd));
 	ataio->cmd.flags = CAM_ATAIO_NEEDRESULT;
 	ataio->cmd.command = ATA_READ_PM;
 	ataio->cmd.features = reg;
 	ataio->cmd.device = port & 0x0f;
 }
 
 void
 ata_pm_write_cmd(struct ccb_ataio *ataio, int reg, int port, uint32_t val)
 {
 	bzero(&ataio->cmd, sizeof(ataio->cmd));
 	ataio->cmd.flags = 0;
 	ataio->cmd.command = ATA_WRITE_PM;
 	ataio->cmd.features = reg;
 	ataio->cmd.sector_count = val;
 	ataio->cmd.lba_low = val >> 8;
 	ataio->cmd.lba_mid = val >> 16;
 	ataio->cmd.lba_high = val >> 24;
 	ataio->cmd.device = port & 0x0f;
 }
 
 /*
  * Build a READ LOG (DMA) EXT request in ataio.
  *
  * retries, cbfcnp, data_ptr, dxfer_len and timeout are handed to
  * cam_fill_ataio() for a CAM_DIR_IN transfer.  The caller's retries value
  * is now honoured; it used to be ignored in favour of a hard-coded 1.
  *
  * log_address (low 8 bits) and page_number (low 16 bits) are packed into
  * the LBA: log address in bits 0-7, page number low byte in bits 8-15 and
  * page number high byte in bits 40-47.  block_count becomes the sector
  * count.  The DMA opcode is used when protocol has CAM_ATAIO_DMA set,
  * the PIO opcode otherwise.
  */
 void
 ata_read_log(struct ccb_ataio *ataio, uint32_t retries,
 	     void (*cbfcnp)(struct cam_periph *, union ccb *),
 	     uint32_t log_address, uint32_t page_number, uint16_t block_count,
 	     uint32_t protocol, uint8_t *data_ptr, uint32_t dxfer_len,
 	     uint32_t timeout)
 {
 	uint64_t lba;
 
 	cam_fill_ataio(ataio,
 	    /*retries*/ retries,
 	    /*cbfcnp*/ cbfcnp,
 	    /*flags*/ CAM_DIR_IN,
 	    /*tag_action*/ 0,
 	    /*data_ptr*/ data_ptr,
 	    /*dxfer_len*/ dxfer_len,
 	    /*timeout*/ timeout);
 
 	lba = (((uint64_t)page_number & 0xff00) << 32) |
 	      ((page_number & 0x00ff) << 8) |
 	      (log_address & 0xff);
 
 	ata_48bit_cmd(ataio,
 	    /*cmd*/ (protocol & CAM_ATAIO_DMA) ? ATA_READ_LOG_DMA_EXT :
 		     ATA_READ_LOG_EXT,
 	    /*features*/ 0,
 	    /*lba*/ lba,
 	    /*sector_count*/ block_count);
 }
 
 /*
  * Convert each 16-bit word in buf from big-endian to host byte order, in
  * place.  len is the buffer length in bytes; len / 2 whole words are
  * converted and a non-positive len is a no-op.
  *
  * The words are walked by index rather than by decrementing a pointer
  * until it drops below buf, which formed a pointer before the start of
  * the buffer.  NOTE(review): for an odd len the old loop touched the
  * (misaligned) words ending at buf + len; this version converts the
  * words starting at buf instead.  Callers are presumed to pass even
  * lengths -- confirm.
  */
 void
 ata_bswap(int8_t *buf, int len)
 {
 	u_int16_t *words = (u_int16_t*)buf;
 	int i, nwords;
 
 	nwords = len / 2;
 	for (i = 0; i < nwords; i++)
 		words[i] = be16toh(words[i]);
 }
 
 /*
  * Normalise a fixed-width identify string in place: every NUL and '_' in
  * the first len bytes becomes a space, then the run of trailing spaces is
  * overwritten with NUL bytes.  A buffer that is entirely blank ends up all
  * NUL; a non-positive len is a no-op.
  *
  * The trailing scan counts down an index instead of decrementing a pointer
  * past the start of buf, which the previous form did for an all-blank
  * buffer or a zero len.
  */
 void
 ata_btrim(int8_t *buf, int len)
 {
 	int i;
 
 	for (i = 0; i < len; i++) {
 		if (buf[i] == 0 || buf[i] == '_')
 			buf[i] = ' ';
 	}
 	for (i = len; i > 0 && buf[i - 1] == ' '; i--)
 		buf[i - 1] = 0;
 }
 
 /*
  * Copy len bytes from src to dst while squeezing blanks: each run of
  * spaces is reduced to a single space, a run that starts at index 0 is
  * dropped entirely, and whatever remains of dst up to len is filled with
  * NUL bytes.
  */
 void
 ata_bpack(int8_t *src, int8_t *dst, int len)
 {
 	int in, out, in_run;
 
 	out = 0;
 	in_run = 0;
 	for (in = 0; in < len; in++) {
 		if (src[in] == ' ') {
 			/* Only the first space of a run may be copied. */
 			if (in_run)
 				continue;
 			in_run = 1;
 			/* ...and not even that one at the very start. */
 			if (in == 0)
 				continue;
 		} else {
 			in_run = 0;
 		}
 		dst[out++] = src[in];
 	}
 	for (; out < len; out++)
 		dst[out] = 0x00;
 }
 
 /*
  * Pick a PIO mode from the identify data.  The sources are checked in
  * order: apiomodes (only when ATA_FLAG_64_70 is set in atavalid), then
  * mwdmamodes, then the retired_piomode field.  ATA_PIO0 is the fallback.
  */
 int
 ata_max_pmode(struct ata_params *ap)
 {
 
 	if (ap->atavalid & ATA_FLAG_64_70) {
 		if (ap->apiomodes & 0x02)
 			return (ATA_PIO4);
 		if (ap->apiomodes & 0x01)
 			return (ATA_PIO3);
 	}
 
 	if (ap->mwdmamodes & 0x04)
 		return (ATA_PIO4);
 	if (ap->mwdmamodes & 0x02)
 		return (ATA_PIO3);
 	if (ap->mwdmamodes & 0x01)
 		return (ATA_PIO2);
 
 	switch (ap->retired_piomode & ATA_RETIRED_PIO_MASK) {
 	case 0x200:
 		return (ATA_PIO2);
 	case 0x100:
 		return (ATA_PIO1);
 	default:
 		return (ATA_PIO0);
 	}
 }
 
 /*
  * Return the highest multiword DMA mode advertised in mwdmamodes, or -1
  * when none of the three low bits is set.
  */
 int
 ata_max_wmode(struct ata_params *ap)
 {
 	int mode;
 
 	if (ap->mwdmamodes & 0x04)
 		mode = ATA_WDMA2;
 	else if (ap->mwdmamodes & 0x02)
 		mode = ATA_WDMA1;
 	else if (ap->mwdmamodes & 0x01)
 		mode = ATA_WDMA0;
 	else
 		mode = -1;
 	return (mode);
 }
 
 /*
  * Return the highest Ultra DMA mode advertised in udmamodes, or -1 when
  * ATA_FLAG_88 is clear in atavalid or none of bits 0-6 is set.
  */
 int
 ata_max_umode(struct ata_params *ap)
 {
 	/* Indexed by bit number in udmamodes. */
 	static const int umode_for_bit[] = {
 		ATA_UDMA0, ATA_UDMA1, ATA_UDMA2, ATA_UDMA3,
 		ATA_UDMA4, ATA_UDMA5, ATA_UDMA6
 	};
 	int bit;
 
 	if ((ap->atavalid & ATA_FLAG_88) == 0)
 		return (-1);
 
 	for (bit = 6; bit >= 0; bit--) {
 		if (ap->udmamodes & (1 << bit))
 			return (umode_for_bit[bit]);
 	}
 	return (-1);
 }
 
 /*
  * Return the best transfer mode the device advertises, capped at maxmode.
  * A maxmode of 0 means ATA_DMA_MAX.  Ultra DMA is preferred, then
  * multiword DMA, then PIO; a DMA family is only considered when maxmode
  * reaches it and the device reports a mode greater than 0 for it.
  */
 int
 ata_max_mode(struct ata_params *ap, int maxmode)
 {
 	int mode;
 
 	if (maxmode == 0)
 		maxmode = ATA_DMA_MAX;
 
 	if (maxmode >= ATA_UDMA0) {
 		mode = ata_max_umode(ap);
 		if (mode > 0)
 			return (min(maxmode, mode));
 	}
 	if (maxmode >= ATA_WDMA0) {
 		mode = ata_max_wmode(ap);
 		if (mode > 0)
 			return (min(maxmode, mode));
 	}
 	mode = ata_max_pmode(ap);
 	return (min(maxmode, mode));
 }
 
 /*
  * Return a constant name for a transfer mode.  -1 is "UNSUPPORTED" and 0
  * is "NONE".  A value that matches none of the known modes is reported
  * as "BIOSDMA" when it has any ATA_DMA_MASK bit set, else "BIOSPIO".
  */
 char *
 ata_mode2string(int mode)
 {
 
 	switch (mode) {
 	case -1:
 		return ("UNSUPPORTED");
 	case 0:
 		return ("NONE");
 	case ATA_PIO0:
 		return ("PIO0");
 	case ATA_PIO1:
 		return ("PIO1");
 	case ATA_PIO2:
 		return ("PIO2");
 	case ATA_PIO3:
 		return ("PIO3");
 	case ATA_PIO4:
 		return ("PIO4");
 	case ATA_WDMA0:
 		return ("WDMA0");
 	case ATA_WDMA1:
 		return ("WDMA1");
 	case ATA_WDMA2:
 		return ("WDMA2");
 	case ATA_UDMA0:
 		return ("UDMA0");
 	case ATA_UDMA1:
 		return ("UDMA1");
 	case ATA_UDMA2:
 		return ("UDMA2");
 	case ATA_UDMA3:
 		return ("UDMA3");
 	case ATA_UDMA4:
 		return ("UDMA4");
 	case ATA_UDMA5:
 		return ("UDMA5");
 	case ATA_UDMA6:
 		return ("UDMA6");
 	default:
 		break;
 	}
 	return ((mode & ATA_DMA_MASK) ? "BIOSDMA" : "BIOSPIO");
 }
 
 /*
  * Translate a mode name into its ATA_* constant, ignoring case.  Ultra DMA
  * modes are accepted both by index ("UDMA0".."UDMA6") and by the aliases
  * "UDMA16", "UDMA25", "UDMA33", "UDMA44", "UDMA66", "UDMA100" and
  * "UDMA133".  Returns -1 for an unrecognised name.
  */
 int
 ata_string2mode(char *str)
 {
 	static const struct {
 		const char	*name;
 		int		mode;
 	} names[] = {
 		{ "PIO0",	ATA_PIO0 },
 		{ "PIO1",	ATA_PIO1 },
 		{ "PIO2",	ATA_PIO2 },
 		{ "PIO3",	ATA_PIO3 },
 		{ "PIO4",	ATA_PIO4 },
 		{ "WDMA0",	ATA_WDMA0 },
 		{ "WDMA1",	ATA_WDMA1 },
 		{ "WDMA2",	ATA_WDMA2 },
 		{ "UDMA0",	ATA_UDMA0 },
 		{ "UDMA16",	ATA_UDMA0 },
 		{ "UDMA1",	ATA_UDMA1 },
 		{ "UDMA25",	ATA_UDMA1 },
 		{ "UDMA2",	ATA_UDMA2 },
 		{ "UDMA33",	ATA_UDMA2 },
 		{ "UDMA3",	ATA_UDMA3 },
 		{ "UDMA44",	ATA_UDMA3 },
 		{ "UDMA4",	ATA_UDMA4 },
 		{ "UDMA66",	ATA_UDMA4 },
 		{ "UDMA5",	ATA_UDMA5 },
 		{ "UDMA100",	ATA_UDMA5 },
 		{ "UDMA6",	ATA_UDMA6 },
 		{ "UDMA133",	ATA_UDMA6 },
 	};
 	unsigned int i;
 
 	for (i = 0; i < sizeof(names) / sizeof(names[0]); i++) {
 		if (strcasecmp(str, names[i].name) == 0)
 			return (names[i].mode);
 	}
 	return (-1);
 }
 
 
 /*
  * Return the speed figure associated with a transfer mode.  Any mode that
  * is not listed gets the ATA_PIO0 figure, 3300.
  * NOTE(review): the unit is not stated here; it looks like KB/s -- confirm.
  */
 u_int
 ata_mode2speed(int mode)
 {
 	switch (mode) {
 	case ATA_PIO1:
 		return (5200);
 	case ATA_PIO2:
 		return (8300);
 	case ATA_PIO3:
 		return (11100);
 	case ATA_WDMA0:
 		return (4200);
 	case ATA_WDMA1:
 		return (13300);
 	case ATA_PIO4:
 	case ATA_WDMA2:
 	case ATA_UDMA0:
 		return (16700);
 	case ATA_UDMA1:
 		return (25000);
 	case ATA_UDMA2:
 		return (33300);
 	case ATA_UDMA3:
 		return (44400);
 	case ATA_UDMA4:
 		return (66700);
 	case ATA_UDMA5:
 		return (100000);
 	case ATA_UDMA6:
 		return (133000);
 	case ATA_PIO0:
 	default:
 		return (3300);
 	}
 }
 
 /*
  * Map a revision number to its speed figure: 2 gives 300000, 3 gives
  * 600000 and every other value (including 1) gives 150000.
  */
 u_int
 ata_revision2speed(int revision)
 {
 
 	if (revision == 2)
 		return (300000);
 	if (revision == 3)
 		return (600000);
 	return (150000);
 }
 
 /*
  * Inverse of ata_revision2speed(): 0, 150000, 300000 and 600000 map to
  * 0, 1, 2 and 3 respectively.  Any other speed yields -1.
  */
 int
 ata_speed2revision(u_int speed)
 {
 
 	if (speed == 0)
 		return (0);
 	if (speed == 150000)
 		return (1);
 	if (speed == 300000)
 		return (2);
 	if (speed == 600000)
 		return (3);
 	return (-1);
 }
 
 /*
  * Compare identify data against a scsi_inquiry_pattern table entry.
  * Returns 0 when both the model matches the entry's product pattern and
  * the revision matches its revision pattern (per cam_strmatch()), and -1
  * otherwise.  The revision is only examined once the model has matched.
  */
 int
 ata_identify_match(caddr_t identbuffer, caddr_t table_entry)
 {
 	struct scsi_inquiry_pattern *pattern;
 	struct ata_params *params;
 
 	params = (struct ata_params *)identbuffer;
 	pattern = (struct scsi_inquiry_pattern *)table_entry;
 
 	if (cam_strmatch(params->model, pattern->product,
 	    sizeof(params->model)) != 0)
 		return (-1);
 	if (cam_strmatch(params->revision, pattern->revision,
 	    sizeof(params->revision)) != 0)
 		return (-1);
 	return (0);
 }
 
 /*
  * Same comparison as ata_identify_match(), but for a
  * scsi_static_inquiry_pattern table entry.  Returns 0 when model and
  * revision both match, -1 otherwise.
  */
 int
 ata_static_identify_match(caddr_t identbuffer, caddr_t table_entry)
 {
 	struct scsi_static_inquiry_pattern *pattern;
 	struct ata_params *params;
 
 	params = (struct ata_params *)identbuffer;
 	pattern = (struct scsi_static_inquiry_pattern *)table_entry;
 
 	if (cam_strmatch(params->model, pattern->product,
 	    sizeof(params->model)) != 0)
 		return (-1);
 	if (cam_strmatch(params->revision, pattern->revision,
 	    sizeof(params->revision)) != 0)
 		return (-1);
 	return (0);
 }
 
 /*
  * Build a SEP_ATTN "receive diagnostic results" request (lba 0x02) in
  * ataio.  length is clamped to 1020 bytes and rounded up to a multiple of
  * four; the count passed on is length / 4.  page_code is placed in the
  * features register only when pcv is non-zero.
  */
 void
 semb_receive_diagnostic_results(struct ccb_ataio *ataio,
     u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*),
     uint8_t tag_action, int pcv, uint8_t page_code,
     uint8_t *data_ptr, uint16_t length, uint32_t timeout)
 {
 	uint8_t features;
 
 	if (length > 1020)
 		length = 1020;
 	length = (length + 3) & ~3;
 
 	cam_fill_ataio(ataio, retries, cbfcnp, /*flags*/CAM_DIR_IN,
 	    tag_action, data_ptr, length, timeout);
 
 	if (pcv)
 		features = page_code;
 	else
 		features = 0;
 	ata_28bit_cmd(ataio, ATA_SEP_ATTN, features, 0x02, length / 4);
 }
 
 /*
  * Build a SEP_ATTN "send diagnostic" request (lba 0x82) in ataio.  length
  * is clamped to 1020 bytes and rounded up to a multiple of four; the
  * count passed on is length / 4.  A zero length makes it a no-data
  * command with features 0; otherwise the first byte of data_ptr is used
  * as the features value.
  */
 void
 semb_send_diagnostic(struct ccb_ataio *ataio,
     u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *),
     uint8_t tag_action, uint8_t *data_ptr, uint16_t length, uint32_t timeout)
 {
 	uint8_t features;
 
 	if (length > 1020)
 		length = 1020;
 	length = (length + 3) & ~3;
 
 	cam_fill_ataio(ataio, retries, cbfcnp,
 	    /*flags*/(length != 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
 	    tag_action, data_ptr, length, timeout);
 
 	if (length > 0)
 		features = data_ptr[0];
 	else
 		features = 0;
 	ata_28bit_cmd(ataio, ATA_SEP_ATTN, features, 0x82, length / 4);
 }
 
 /*
  * Build a SEP_ATTN "read buffer" request (lba 0x00) in ataio.  length is
  * clamped to 1020 bytes and rounded up to a multiple of four; the count
  * passed on is length / 4.  page_code goes in the features register.
  */
 void
 semb_read_buffer(struct ccb_ataio *ataio,
     u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb*),
     uint8_t tag_action, uint8_t page_code,
     uint8_t *data_ptr, uint16_t length, uint32_t timeout)
 {
 
 	if (length > 1020)
 		length = 1020;
 	length = (length + 3) & ~3;
 
 	cam_fill_ataio(ataio, retries, cbfcnp, /*flags*/CAM_DIR_IN,
 	    tag_action, data_ptr, length, timeout);
 
 	ata_28bit_cmd(ataio, ATA_SEP_ATTN, page_code, 0x00, length / 4);
 }
 
 /*
  * Build a SEP_ATTN "write buffer" request (lba 0x80) in ataio.  length is
  * clamped to 1020 bytes and rounded up to a multiple of four; the count
  * passed on is length / 4.  A zero length makes it a no-data command with
  * features 0; otherwise the first byte of data_ptr is used as the
  * features value.
  */
 void
 semb_write_buffer(struct ccb_ataio *ataio,
     u_int32_t retries, void (*cbfcnp)(struct cam_periph *, union ccb *),
     uint8_t tag_action, uint8_t *data_ptr, uint16_t length, uint32_t timeout)
 {
 	uint8_t features;
 
 	if (length > 1020)
 		length = 1020;
 	length = (length + 3) & ~3;
 
 	cam_fill_ataio(ataio, retries, cbfcnp,
 	    /*flags*/(length != 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
 	    tag_action, data_ptr, length, timeout);
 
 	if (length > 0)
 		features = data_ptr[0];
 	else
 		features = 0;
 	ata_28bit_cmd(ataio, ATA_SEP_ATTN, features, 0x80, length / 4);
 }
 
 
 /*
  * Fill in an ATA CCB for a ZAC management "out" operation.
  *
  * Three encodings are produced, selected by use_ncq and dxfer_len:
  *  - non-NCQ: ATA_ZAC_MANAGEMENT_OUT with the action/flags in the
  *    features value and the 512-byte block count as the sector count;
  *  - NCQ without data: ATA_NCQ_NON_DATA;
  *  - NCQ with data: ATA_SEND_FPDMA_QUEUED, where the block count goes in
  *    the features value and the action/flags go in the auxiliary word.
  *
  * The sector_count argument is accepted but not used by this function.
  */
 void
 ata_zac_mgmt_out(struct ccb_ataio *ataio, uint32_t retries, 
 		 void (*cbfcnp)(struct cam_periph *, union ccb *),
 		 int use_ncq, uint8_t zm_action, uint64_t zone_id,
 		 uint8_t zone_flags, uint16_t sector_count, uint8_t *data_ptr,
 		 uint32_t dxfer_len, uint32_t timeout)
 {
 	uint32_t zone_bits, aux;
 	uint16_t feat, count;
 	uint8_t cmd, xfer_flags;
 
 	/* Low nibble is the action, the zone flags sit in bits 8 and up. */
 	zone_bits = (zm_action & 0xf) | (zone_flags << 8);
 
 	if (use_ncq != 0) {
 		xfer_flags = CAM_ATAIO_FPDMA;
 		aux = zone_bits;
 		if (dxfer_len != 0) {
 			cmd = ATA_SEND_FPDMA_QUEUED;
 
 			/* Note that we're defaulting to normal priority */
 			count = ATA_SFPDMA_ZAC_MGMT_OUT << 8;
 
 			/*
 			 * SEND FPDMA QUEUED carries the transfer length in
 			 * the FEATURE register, where 0 stands for 65536
 			 * blocks of 512 bytes.  Callers are expected not to
 			 * ask for more than that; larger values are simply
 			 * masked to 16 bits.
 			 */
 			feat = (dxfer_len == (65536 * 512)) ? 0 :
 			    ((dxfer_len >> 9) & 0xffff);
 		} else {
 			cmd = ATA_NCQ_NON_DATA;
 			feat = ATA_NCQ_ZAC_MGMT_OUT;
 			count = 0;
 		}
 	} else {
 		cmd = ATA_ZAC_MANAGEMENT_OUT;
 		feat = zone_bits;
 		aux = 0;
 		if (dxfer_len != 0) {
 			xfer_flags = CAM_ATAIO_DMA;
 			/* XXX KDM use sector count? */
 			count = ((dxfer_len >> 9) & 0xffff);
 		} else {
 			xfer_flags = 0;
 			count = 0;
 		}
 	}
 
 	cam_fill_ataio(ataio,
 	    /*retries*/ retries,
 	    /*cbfcnp*/ cbfcnp,
 	    /*flags*/ (dxfer_len > 0) ? CAM_DIR_OUT : CAM_DIR_NONE,
 	    /*tag_action*/ 0,
 	    /*data_ptr*/ data_ptr,
 	    /*dxfer_len*/ dxfer_len,
 	    /*timeout*/ timeout);
 
 	ata_48bit_cmd(ataio,
 	    /*cmd*/ cmd,
 	    /*features*/ feat,
 	    /*lba*/ zone_id,
 	    /*sector_count*/ count);
 
 	ataio->cmd.flags |= xfer_flags;
 
 	/* The auxiliary word is only attached when it carries something. */
 	if (aux != 0) {
 		ataio->ata_flags |= ATA_FLAG_AUX;
 		ataio->aux = aux;
 	}
 }
 
 /*
  * Fill in an ATA CCB for a ZAC management "in" operation.
  *
  * Without NCQ the request is ATA_ZAC_MANAGEMENT_IN, with the
  * action/flags in the features value and the 512-byte block count as
  * the sector count.  With NCQ it is ATA_RECV_FPDMA_QUEUED, where the
  * block count moves to the features value and the action/flags travel
  * in the auxiliary word.  The transfer direction is always "in".
  */
 void
 ata_zac_mgmt_in(struct ccb_ataio *ataio, uint32_t retries, 
 		void (*cbfcnp)(struct cam_periph *, union ccb *),
 		int use_ncq, uint8_t zm_action, uint64_t zone_id,
 		uint8_t zone_flags, uint8_t *data_ptr, uint32_t dxfer_len,
 		uint32_t timeout)
 {
 	uint32_t zone_bits, aux;
 	uint16_t feat, count;
 	uint8_t cmd, xfer_flags;
 
 	/* XXX KDM put a macro here */
 	zone_bits = (zm_action & 0xf) | (zone_flags << 8);
 
 	if (use_ncq != 0) {
 		cmd = ATA_RECV_FPDMA_QUEUED;
 		xfer_flags = CAM_ATAIO_FPDMA;
 		count = ATA_RFPDMA_ZAC_MGMT_IN << 8;
 		aux = zone_bits;
 
 		/*
 		 * RECEIVE FPDMA QUEUED carries the transfer length in the
 		 * FEATURE register, where 0 stands for 65536 blocks of 512
 		 * bytes.  Callers are expected not to ask for more than
 		 * that; larger values are simply masked to 16 bits.
 		 */
 		feat = (dxfer_len == (65536 * 512)) ? 0 :
 		    ((dxfer_len >> 9) & 0xffff);
 	} else {
 		cmd = ATA_ZAC_MANAGEMENT_IN;
 		xfer_flags = CAM_ATAIO_DMA;
 		feat = zone_bits;
 		count = ((dxfer_len >> 9) & 0xffff);
 		aux = 0;
 	}
 
 	cam_fill_ataio(ataio,
 	    /*retries*/ retries,
 	    /*cbfcnp*/ cbfcnp,
 	    /*flags*/ CAM_DIR_IN,
 	    /*tag_action*/ 0,
 	    /*data_ptr*/ data_ptr,
 	    /*dxfer_len*/ dxfer_len,
 	    /*timeout*/ timeout);
 
 	ata_48bit_cmd(ataio,
 	    /*cmd*/ cmd,
 	    /*features*/ feat,
 	    /*lba*/ zone_id,
 	    /*sector_count*/ count);
 
 	ataio->cmd.flags |= xfer_flags;
 
 	/* The auxiliary word is only attached when it carries something. */
 	if (aux != 0) {
 		ataio->ata_flags |= ATA_FLAG_AUX;
 		ataio->aux = aux;
 	}
 }
 
 /*
  * Normalise an ATA identify buffer in place.
  *
  * Every 16-bit word of the structure is converted with le16toh().  The
  * model, revision and serial strings are then byte-swapped with
  * ata_bswap() unless the model starts with "FX", "NEC", "Pioneer" or
  * "SHARP", and finally each string is run through ata_btrim() and
  * ata_bpack().
  */
 void
 ata_param_fixup(struct ata_params *ident_buf)
 {
 	int16_t *word, *limit;
 	int keep_order;
 
 	word = (int16_t *)ident_buf;
 	limit = (int16_t *)ident_buf + sizeof(struct ata_params)/2;
 	while (word < limit) {
 		*word = le16toh(*word);
 		word++;
 	}
 
 	/*
 	 * Models with these prefixes are exempt from the string swap
 	 * (presumably their strings are already in the expected order).
 	 */
 	keep_order = strncmp(ident_buf->model, "FX", 2) == 0 ||
 	    strncmp(ident_buf->model, "NEC", 3) == 0 ||
 	    strncmp(ident_buf->model, "Pioneer", 7) == 0 ||
 	    strncmp(ident_buf->model, "SHARP", 5) == 0;
 	if (!keep_order) {
 		ata_bswap(ident_buf->model, sizeof(ident_buf->model));
 		ata_bswap(ident_buf->revision, sizeof(ident_buf->revision));
 		ata_bswap(ident_buf->serial, sizeof(ident_buf->serial));
 	}
 
 	ata_btrim(ident_buf->model, sizeof(ident_buf->model));
 	ata_bpack(ident_buf->model, ident_buf->model,
 	    sizeof(ident_buf->model));
 
 	ata_btrim(ident_buf->revision, sizeof(ident_buf->revision));
 	ata_bpack(ident_buf->revision, ident_buf->revision,
 	    sizeof(ident_buf->revision));
 
 	ata_btrim(ident_buf->serial, sizeof(ident_buf->serial));
 	ata_bpack(ident_buf->serial, ident_buf->serial,
 	    sizeof(ident_buf->serial));
 }
Index: projects/nfsv42/sys/cam/ctl/ctl.c
===================================================================
--- projects/nfsv42/sys/cam/ctl/ctl.c	(revision 350367)
+++ projects/nfsv42/sys/cam/ctl/ctl.c	(revision 350368)
@@ -1,13448 +1,13564 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003-2009 Silicon Graphics International Corp.
  * Copyright (c) 2012 The FreeBSD Foundation
  * Copyright (c) 2014-2017 Alexander Motin <mav@FreeBSD.org>
  * Copyright (c) 2017 Jakub Wojciech Klama <jceel@FreeBSD.org>
  * Copyright (c) 2018 Marcelo Araujo <araujo@FreeBSD.org>
  * All rights reserved.
  *
  * Portions of this software were developed by Edward Tomasz Napierala
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
  * $Id$
  */
 /*
  * CAM Target Layer, a SCSI device emulation subsystem.
  *
  * Author: Ken Merry <ken@FreeBSD.org>
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/ctype.h>
 #include <sys/kernel.h>
 #include <sys/types.h>
 #include <sys/kthread.h>
 #include <sys/bio.h>
 #include <sys/fcntl.h>
 #include <sys/lock.h>
 #include <sys/module.h>
 #include <sys/mutex.h>
 #include <sys/condvar.h>
 #include <sys/malloc.h>
 #include <sys/conf.h>
 #include <sys/ioccom.h>
 #include <sys/queue.h>
 #include <sys/sbuf.h>
 #include <sys/smp.h>
 #include <sys/endian.h>
 #include <sys/proc.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/nv.h>
 #include <sys/dnv.h>
 #include <vm/uma.h>
 
 #include <cam/cam.h>
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_cd.h>
 #include <cam/scsi/scsi_da.h>
 #include <cam/ctl/ctl_io.h>
 #include <cam/ctl/ctl.h>
 #include <cam/ctl/ctl_frontend.h>
 #include <cam/ctl/ctl_util.h>
 #include <cam/ctl/ctl_backend.h>
 #include <cam/ctl/ctl_ioctl.h>
 #include <cam/ctl/ctl_ha.h>
 #include <cam/ctl/ctl_private.h>
 #include <cam/ctl/ctl_debug.h>
 #include <cam/ctl/ctl_scsi_all.h>
 #include <cam/ctl/ctl_error.h>
 
 struct ctl_softc *control_softc = NULL;
 
 /*
  * Template mode pages.
  */
 
 /*
  * Note that these are default values only.  The actual values will be
  * filled in when the user does a mode sense.
  */
 /*
  * Read-write error recovery page templates.  The default sets AWRE and
  * ARRE in byte3 and LBPERE in byte8.  The "changeable" companion has PER
  * in byte3 and LBPERE in byte8 (presumably a mask of the bits an
  * initiator may modify -- confirm against the MODE SELECT code).
  */
 const static struct scsi_da_rw_recovery_page rw_er_page_default = {
 	/*page_code*/SMS_RW_ERROR_RECOVERY_PAGE,
 	/*page_length*/sizeof(struct scsi_da_rw_recovery_page) - 2,
 	/*byte3*/SMS_RWER_AWRE|SMS_RWER_ARRE,
 	/*read_retry_count*/0,
 	/*correction_span*/0,
 	/*head_offset_count*/0,
 	/*data_strobe_offset_cnt*/0,
 	/*byte8*/SMS_RWER_LBPERE,
 	/*write_retry_count*/0,
 	/*reserved2*/0,
 	/*recovery_time_limit*/{0, 0},
 };
 
 const static struct scsi_da_rw_recovery_page rw_er_page_changeable = {
 	/*page_code*/SMS_RW_ERROR_RECOVERY_PAGE,
 	/*page_length*/sizeof(struct scsi_da_rw_recovery_page) - 2,
 	/*byte3*/SMS_RWER_PER,
 	/*read_retry_count*/0,
 	/*correction_span*/0,
 	/*head_offset_count*/0,
 	/*data_strobe_offset_cnt*/0,
 	/*byte8*/SMS_RWER_LBPERE,
 	/*write_retry_count*/0,
 	/*reserved2*/0,
 	/*recovery_time_limit*/{0, 0},
 };
 
 /*
  * Format device page templates.  The default stores
  * CTL_DEFAULT_SECTORS_PER_TRACK as two bytes, high byte first, and sets
  * SFP_HSEC in flags; every other field is zero.  The "changeable"
  * companion is all zero apart from the page header.
  */
 const static struct scsi_format_page format_page_default = {
 	/*page_code*/SMS_FORMAT_DEVICE_PAGE,
 	/*page_length*/sizeof(struct scsi_format_page) - 2,
 	/*tracks_per_zone*/ {0, 0},
 	/*alt_sectors_per_zone*/ {0, 0},
 	/*alt_tracks_per_zone*/ {0, 0},
 	/*alt_tracks_per_lun*/ {0, 0},
 	/*sectors_per_track*/ {(CTL_DEFAULT_SECTORS_PER_TRACK >> 8) & 0xff,
 			        CTL_DEFAULT_SECTORS_PER_TRACK & 0xff},
 	/*bytes_per_sector*/ {0, 0},
 	/*interleave*/ {0, 0},
 	/*track_skew*/ {0, 0},
 	/*cylinder_skew*/ {0, 0},
 	/*flags*/ SFP_HSEC,
 	/*reserved*/ {0, 0, 0}
 };
 
 const static struct scsi_format_page format_page_changeable = {
 	/*page_code*/SMS_FORMAT_DEVICE_PAGE,
 	/*page_length*/sizeof(struct scsi_format_page) - 2,
 	/*tracks_per_zone*/ {0, 0},
 	/*alt_sectors_per_zone*/ {0, 0},
 	/*alt_tracks_per_zone*/ {0, 0},
 	/*alt_tracks_per_lun*/ {0, 0},
 	/*sectors_per_track*/ {0, 0},
 	/*bytes_per_sector*/ {0, 0},
 	/*interleave*/ {0, 0},
 	/*track_skew*/ {0, 0},
 	/*cylinder_skew*/ {0, 0},
 	/*flags*/ 0,
 	/*reserved*/ {0, 0, 0}
 };
 
 /*
  * Rigid disk geometry page templates.  The default fills in
  * CTL_DEFAULT_HEADS, SRDP_RPL_DISABLED and CTL_DEFAULT_ROTATION_RATE
  * (two bytes, high byte first); the cylinder count is left at zero here.
  * The "changeable" companion is all zero apart from the page header.
  */
 const static struct scsi_rigid_disk_page rigid_disk_page_default = {
 	/*page_code*/SMS_RIGID_DISK_PAGE,
 	/*page_length*/sizeof(struct scsi_rigid_disk_page) - 2,
 	/*cylinders*/ {0, 0, 0},
 	/*heads*/ CTL_DEFAULT_HEADS,
 	/*start_write_precomp*/ {0, 0, 0},
 	/*start_reduced_current*/ {0, 0, 0},
 	/*step_rate*/ {0, 0},
 	/*landing_zone_cylinder*/ {0, 0, 0},
 	/*rpl*/ SRDP_RPL_DISABLED,
 	/*rotational_offset*/ 0,
 	/*reserved1*/ 0,
 	/*rotation_rate*/ {(CTL_DEFAULT_ROTATION_RATE >> 8) & 0xff,
 			   CTL_DEFAULT_ROTATION_RATE & 0xff},
 	/*reserved2*/ {0, 0}
 };
 
 const static struct scsi_rigid_disk_page rigid_disk_page_changeable = {
 	/*page_code*/SMS_RIGID_DISK_PAGE,
 	/*page_length*/sizeof(struct scsi_rigid_disk_page) - 2,
 	/*cylinders*/ {0, 0, 0},
 	/*heads*/ 0,
 	/*start_write_precomp*/ {0, 0, 0},
 	/*start_reduced_current*/ {0, 0, 0},
 	/*step_rate*/ {0, 0},
 	/*landing_zone_cylinder*/ {0, 0, 0},
 	/*rpl*/ 0,
 	/*rotational_offset*/ 0,
 	/*reserved1*/ 0,
 	/*rotation_rate*/ {0, 0},
 	/*reserved2*/ {0, 0}
 };
 
 /*
  * Verify error recovery page templates.  The default is all zero apart
  * from the page header; the "changeable" companion has SMS_VER_PER in
  * byte3.
  */
 const static struct scsi_da_verify_recovery_page verify_er_page_default = {
 	/*page_code*/SMS_VERIFY_ERROR_RECOVERY_PAGE,
 	/*page_length*/sizeof(struct scsi_da_verify_recovery_page) - 2,
 	/*byte3*/0,
 	/*read_retry_count*/0,
 	/*reserved*/{ 0, 0, 0, 0, 0, 0 },
 	/*recovery_time_limit*/{0, 0},
 };
 
 const static struct scsi_da_verify_recovery_page verify_er_page_changeable = {
 	/*page_code*/SMS_VERIFY_ERROR_RECOVERY_PAGE,
 	/*page_length*/sizeof(struct scsi_da_verify_recovery_page) - 2,
 	/*byte3*/SMS_VER_PER,
 	/*read_retry_count*/0,
 	/*reserved*/{ 0, 0, 0, 0, 0, 0 },
 	/*recovery_time_limit*/{0, 0},
 };
 
 /*
  * Caching page templates.  The default sets SCP_DISC and SCP_WCE in
  * flags1 and 0xffff in the prefetch length/limit fields other than
  * min_prefetch.  The "changeable" companion has only SCP_WCE and SCP_RCD
  * in flags1.
  */
 const static struct scsi_caching_page caching_page_default = {
 	/*page_code*/SMS_CACHING_PAGE,
 	/*page_length*/sizeof(struct scsi_caching_page) - 2,
 	/*flags1*/ SCP_DISC | SCP_WCE,
 	/*ret_priority*/ 0,
 	/*disable_pf_transfer_len*/ {0xff, 0xff},
 	/*min_prefetch*/ {0, 0},
 	/*max_prefetch*/ {0xff, 0xff},
 	/*max_pf_ceiling*/ {0xff, 0xff},
 	/*flags2*/ 0,
 	/*cache_segments*/ 0,
 	/*cache_seg_size*/ {0, 0},
 	/*reserved*/ 0,
 	/*non_cache_seg_size*/ {0, 0, 0}
 };
 
 const static struct scsi_caching_page caching_page_changeable = {
 	/*page_code*/SMS_CACHING_PAGE,
 	/*page_length*/sizeof(struct scsi_caching_page) - 2,
 	/*flags1*/ SCP_WCE | SCP_RCD,
 	/*ret_priority*/ 0,
 	/*disable_pf_transfer_len*/ {0, 0},
 	/*min_prefetch*/ {0, 0},
 	/*max_prefetch*/ {0, 0},
 	/*max_pf_ceiling*/ {0, 0},
 	/*flags2*/ 0,
 	/*cache_segments*/ 0,
 	/*cache_seg_size*/ {0, 0},
 	/*reserved*/ 0,
 	/*non_cache_seg_size*/ {0, 0, 0}
 };
 
 /*
  * Control mode page templates.  The default selects
  * SCP_QUEUE_ALG_RESTRICTED and sets SCP_TAS.  The "changeable" companion
  * has SCP_DSENSE, the SCP_QUEUE_ALG_MASK bits plus SCP_NUAR, and SCP_SWP.
  */
 const static struct scsi_control_page control_page_default = {
 	/*page_code*/SMS_CONTROL_MODE_PAGE,
 	/*page_length*/sizeof(struct scsi_control_page) - 2,
 	/*rlec*/0,
 	/*queue_flags*/SCP_QUEUE_ALG_RESTRICTED,
 	/*eca_and_aen*/0,
 	/*flags4*/SCP_TAS,
 	/*aen_holdoff_period*/{0, 0},
 	/*busy_timeout_period*/{0, 0},
 	/*extended_selftest_completion_time*/{0, 0}
 };
 
 const static struct scsi_control_page control_page_changeable = {
 	/*page_code*/SMS_CONTROL_MODE_PAGE,
 	/*page_length*/sizeof(struct scsi_control_page) - 2,
 	/*rlec*/SCP_DSENSE,
 	/*queue_flags*/SCP_QUEUE_ALG_MASK | SCP_NUAR,
 	/*eca_and_aen*/SCP_SWP,
 	/*flags4*/0,
 	/*aen_holdoff_period*/{0, 0},
 	/*busy_timeout_period*/{0, 0},
 	/*extended_selftest_completion_time*/{0, 0}
 };
 
 /*
  * Control extension page templates (control mode page code with
  * SMPH_SPF set, subpage 0x01).  CTL_CEM_LEN is the structure size minus
  * 4 and is stored as the two-byte page length, high byte first.  The
  * default is otherwise zero; the "changeable" companion has 0xff in
  * max_sense.
  */
 #define CTL_CEM_LEN	(sizeof(struct scsi_control_ext_page) - 4)
 
 const static struct scsi_control_ext_page control_ext_page_default = {
 	/*page_code*/SMS_CONTROL_MODE_PAGE | SMPH_SPF,
 	/*subpage_code*/0x01,
 	/*page_length*/{CTL_CEM_LEN >> 8, CTL_CEM_LEN},
 	/*flags*/0,
 	/*prio*/0,
 	/*max_sense*/0
 };
 
 const static struct scsi_control_ext_page control_ext_page_changeable = {
 	/*page_code*/SMS_CONTROL_MODE_PAGE | SMPH_SPF,
 	/*subpage_code*/0x01,
 	/*page_length*/{CTL_CEM_LEN >> 8, CTL_CEM_LEN},
 	/*flags*/0,
 	/*prio*/0,
 	/*max_sense*/0xff
 };
 
 /*
  * Informational exceptions page templates.  The default sets
  * SIEP_FLAGS_EWASC, uses SIEP_MRIE_NO, a zero interval timer and a
  * report count whose last byte is 1.  The "changeable" companion has the
  * EWASC, DEXCPT, TEST and LOGERR flags, 0x0f in mrie and all-ones in
  * both multi-byte fields.
  */
 const static struct scsi_info_exceptions_page ie_page_default = {
 	/*page_code*/SMS_INFO_EXCEPTIONS_PAGE,
 	/*page_length*/sizeof(struct scsi_info_exceptions_page) - 2,
 	/*info_flags*/SIEP_FLAGS_EWASC,
 	/*mrie*/SIEP_MRIE_NO,
 	/*interval_timer*/{0, 0, 0, 0},
 	/*report_count*/{0, 0, 0, 1}
 };
 
 const static struct scsi_info_exceptions_page ie_page_changeable = {
 	/*page_code*/SMS_INFO_EXCEPTIONS_PAGE,
 	/*page_length*/sizeof(struct scsi_info_exceptions_page) - 2,
 	/*info_flags*/SIEP_FLAGS_EWASC | SIEP_FLAGS_DEXCPT | SIEP_FLAGS_TEST |
 	    SIEP_FLAGS_LOGERR,
 	/*mrie*/0x0f,
 	/*interval_timer*/{0xff, 0xff, 0xff, 0xff},
 	/*report_count*/{0xff, 0xff, 0xff, 0xff}
 };
 
 /*
  * Logical block provisioning page templates (informational exceptions
  * page code with SMPH_SPF set, subpage 0x02).  CTL_LBPM_LEN is the
  * structure size minus 4 and is stored as the two-byte page length, high
  * byte first.  Each template is a main page followed by four
  * descriptors: the default gives them resource codes 0x01, 0x02, 0xf1
  * and 0xf2 with zero flags and counts, while the "changeable" companion
  * has SLBPP_SITUA in the main flags and all-zero descriptors.
  */
 #define CTL_LBPM_LEN	(sizeof(struct ctl_logical_block_provisioning_page) - 4)
 
 const static struct ctl_logical_block_provisioning_page lbp_page_default = {{
 	/*page_code*/SMS_INFO_EXCEPTIONS_PAGE | SMPH_SPF,
 	/*subpage_code*/0x02,
 	/*page_length*/{CTL_LBPM_LEN >> 8, CTL_LBPM_LEN},
 	/*flags*/0,
 	/*reserved*/{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
 	/*descr*/{}},
 	{{/*flags*/0,
 	  /*resource*/0x01,
 	  /*reserved*/{0, 0},
 	  /*count*/{0, 0, 0, 0}},
 	 {/*flags*/0,
 	  /*resource*/0x02,
 	  /*reserved*/{0, 0},
 	  /*count*/{0, 0, 0, 0}},
 	 {/*flags*/0,
 	  /*resource*/0xf1,
 	  /*reserved*/{0, 0},
 	  /*count*/{0, 0, 0, 0}},
 	 {/*flags*/0,
 	  /*resource*/0xf2,
 	  /*reserved*/{0, 0},
 	  /*count*/{0, 0, 0, 0}}
 	}
 };
 
 const static struct ctl_logical_block_provisioning_page lbp_page_changeable = {{
 	/*page_code*/SMS_INFO_EXCEPTIONS_PAGE | SMPH_SPF,
 	/*subpage_code*/0x02,
 	/*page_length*/{CTL_LBPM_LEN >> 8, CTL_LBPM_LEN},
 	/*flags*/SLBPP_SITUA,
 	/*reserved*/{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},
 	/*descr*/{}},
 	{{/*flags*/0,
 	  /*resource*/0,
 	  /*reserved*/{0, 0},
 	  /*count*/{0, 0, 0, 0}},
 	 {/*flags*/0,
 	  /*resource*/0,
 	  /*reserved*/{0, 0},
 	  /*count*/{0, 0, 0, 0}},
 	 {/*flags*/0,
 	  /*resource*/0,
 	  /*reserved*/{0, 0},
 	  /*count*/{0, 0, 0, 0}},
 	 {/*flags*/0,
 	  /*resource*/0,
 	  /*reserved*/{0, 0},
 	  /*count*/{0, 0, 0, 0}}
 	}
 };
 
 /*
  * CD/DVD capabilities page templates.  The default sets the capability
  * bytes caps1=0x3f, caps3=0xf0 and caps5=0x29 and a buffer_size of
  * {8, 0}; everything else is zero.  The "changeable" companion is all
  * zero apart from the page header.
  */
 const static struct scsi_cddvd_capabilities_page cddvd_page_default = {
 	/*page_code*/SMS_CDDVD_CAPS_PAGE,
 	/*page_length*/sizeof(struct scsi_cddvd_capabilities_page) - 2,
 	/*caps1*/0x3f,
 	/*caps2*/0x00,
 	/*caps3*/0xf0,
 	/*caps4*/0x00,
 	/*caps5*/0x29,
 	/*caps6*/0x00,
 	/*obsolete*/{0, 0},
 	/*nvol_levels*/{0, 0},
 	/*buffer_size*/{8, 0},
 	/*obsolete2*/{0, 0},
 	/*reserved*/0,
 	/*digital*/0,
 	/*obsolete3*/0,
 	/*copy_management*/0,
 	/*reserved2*/0,
 	/*rotation_control*/0,
 	/*cur_write_speed*/0,
 	/*num_speed_descr*/0,
 };
 
 const static struct scsi_cddvd_capabilities_page cddvd_page_changeable = {
 	/*page_code*/SMS_CDDVD_CAPS_PAGE,
 	/*page_length*/sizeof(struct scsi_cddvd_capabilities_page) - 2,
 	/*caps1*/0,
 	/*caps2*/0,
 	/*caps3*/0,
 	/*caps4*/0,
 	/*caps5*/0,
 	/*caps6*/0,
 	/*obsolete*/{0, 0},
 	/*nvol_levels*/{0, 0},
 	/*buffer_size*/{0, 0},
 	/*obsolete2*/{0, 0},
 	/*reserved*/0,
 	/*digital*/0,
 	/*obsolete3*/0,
 	/*copy_management*/0,
 	/*reserved2*/0,
 	/*rotation_control*/0,
 	/*cur_write_speed*/0,
 	/*num_speed_descr*/0,
 };
 
 /* Root of the kern.cam.ctl sysctl subtree; the knobs below hang off it. */
 SYSCTL_NODE(_kern_cam, OID_AUTO, ctl, CTLFLAG_RD, 0, "CAM Target Layer");
 /* Worker thread count; starts at -1 and is exported read-only/tunable. */
 static int worker_threads = -1;
 SYSCTL_INT(_kern_cam_ctl, OID_AUTO, worker_threads, CTLFLAG_RDTUN,
     &worker_threads, 1, "Number of worker threads");
 /* Debug flag mask, writable at run time; starts at CTL_DEBUG_NONE. */
 static int ctl_debug = CTL_DEBUG_NONE;
 SYSCTL_INT(_kern_cam_ctl, OID_AUTO, debug, CTLFLAG_RWTUN,
     &ctl_debug, 0, "Enabled debug flags");
 /* Per-port LUN map size (max LUN + 1), writable at run time. */
 static int ctl_lun_map_size = 1024;
 SYSCTL_INT(_kern_cam_ctl, OID_AUTO, lun_map_size, CTLFLAG_RWTUN,
     &ctl_lun_map_size, 0, "Size of per-port LUN map (max LUN + 1)");
 #ifdef  CTL_TIME_IO
 /* Only built with CTL_TIME_IO: threshold in seconds for logging requests. */
 static int ctl_time_io_secs = CTL_TIME_IO_DEFAULT_SECS;
 SYSCTL_INT(_kern_cam_ctl, OID_AUTO, time_io_secs, CTLFLAG_RWTUN,
     &ctl_time_io_secs, 0, "Log requests taking more seconds");
 #endif
 
 /*
  * Maximum number of LUNs we support.  MUST be a power of 2.
  * Set through the kern.cam.ctl.max_luns tunable and exported as a
  * read-only sysctl of the same name.
  */
 #define	CTL_DEFAULT_MAX_LUNS	1024
 static int ctl_max_luns = CTL_DEFAULT_MAX_LUNS;
 TUNABLE_INT("kern.cam.ctl.max_luns", &ctl_max_luns);
 SYSCTL_INT(_kern_cam_ctl, OID_AUTO, max_luns, CTLFLAG_RDTUN,
     &ctl_max_luns, CTL_DEFAULT_MAX_LUNS, "Maximum number of LUNs");
 
 /*
  * Maximum number of ports registered at one time.
  * Set through the kern.cam.ctl.max_ports tunable and exported as a
  * read-only sysctl of the same name.  The sysctl's value argument names
  * the port default, matching the variable's initialiser.
  */
 #define	CTL_DEFAULT_MAX_PORTS		256
 static int ctl_max_ports = CTL_DEFAULT_MAX_PORTS;
 TUNABLE_INT("kern.cam.ctl.max_ports", &ctl_max_ports);
 SYSCTL_INT(_kern_cam_ctl, OID_AUTO, max_ports, CTLFLAG_RDTUN,
     &ctl_max_ports, CTL_DEFAULT_MAX_PORTS, "Maximum number of ports");
 
 /*
  * Maximum number of initiators we support.
  */
 #define	CTL_MAX_INITIATORS	(CTL_MAX_INIT_PER_PORT * ctl_max_ports)
 
 /*
  * Supported pages (0x00), Serial number (0x80), Device ID (0x83),
  * Extended INQUIRY Data (0x86), Mode Page Policy (0x87),
- * SCSI Ports (0x88), Third-party Copy (0x8F), Block limits (0xB0),
- * Block Device Characteristics (0xB1) and Logical Block Provisioning (0xB2)
+ * SCSI Ports (0x88), Third-party Copy (0x8F), SCSI Feature Sets (0x92),
+ * Block limits (0xB0), Block Device Characteristics (0xB1) and
+ * Logical Block Provisioning (0xB2)
  */
-#define SCSI_EVPD_NUM_SUPPORTED_PAGES	10
+#define SCSI_EVPD_NUM_SUPPORTED_PAGES	11
 
 static void ctl_isc_event_handler(ctl_ha_channel chanel, ctl_ha_event event,
 				  int param);
 static void ctl_copy_sense_data(union ctl_ha_msg *src, union ctl_io *dest);
 static void ctl_copy_sense_data_back(union ctl_io *src, union ctl_ha_msg *dest);
 static int ctl_init(void);
 static int ctl_shutdown(void);
 static int ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td);
 static int ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td);
 static void ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio);
 static void ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num,
 			      struct ctl_ooa *ooa_hdr,
 			      struct ctl_ooa_entry *kern_entries);
 static int ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
 		     struct thread *td);
 static int ctl_alloc_lun(struct ctl_softc *ctl_softc, struct ctl_lun *lun,
 			 struct ctl_be_lun *be_lun);
 static int ctl_free_lun(struct ctl_lun *lun);
 static void ctl_create_lun(struct ctl_be_lun *be_lun);
 
 static int ctl_do_mode_select(union ctl_io *io);
 static int ctl_pro_preempt(struct ctl_softc *softc, struct ctl_lun *lun,
 			   uint64_t res_key, uint64_t sa_res_key,
 			   uint8_t type, uint32_t residx,
 			   struct ctl_scsiio *ctsio,
 			   struct scsi_per_res_out *cdb,
 			   struct scsi_per_res_out_parms* param);
 static void ctl_pro_preempt_other(struct ctl_lun *lun,
 				  union ctl_ha_msg *msg);
 static void ctl_hndl_per_res_out_on_other_sc(union ctl_io *io);
 static int ctl_inquiry_evpd_supported(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_serial(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_devid(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_eid(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_mpp(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_scsi_ports(struct ctl_scsiio *ctsio,
 					 int alloc_len);
+static int ctl_inquiry_evpd_sfs(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_block_limits(struct ctl_scsiio *ctsio,
 					 int alloc_len);
 static int ctl_inquiry_evpd_bdc(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd_lbp(struct ctl_scsiio *ctsio, int alloc_len);
 static int ctl_inquiry_evpd(struct ctl_scsiio *ctsio);
 static int ctl_inquiry_std(struct ctl_scsiio *ctsio);
 static int ctl_get_lba_len(union ctl_io *io, uint64_t *lba, uint64_t *len);
 static ctl_action ctl_extent_check(union ctl_io *io1, union ctl_io *io2,
     bool seq);
 static ctl_action ctl_extent_check_seq(union ctl_io *io1, union ctl_io *io2);
 static ctl_action ctl_check_for_blockage(struct ctl_lun *lun,
     union ctl_io *pending_io, union ctl_io *ooa_io);
 static ctl_action ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io,
 				union ctl_io **starting_io);
 static void ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io,
     bool skip);
 static void ctl_try_unblock_others(struct ctl_lun *lun, union ctl_io *io,
     bool skip);
 static int ctl_scsiio_lun_check(struct ctl_lun *lun,
 				const struct ctl_cmd_entry *entry,
 				struct ctl_scsiio *ctsio);
 static void ctl_failover_lun(union ctl_io *io);
 static int ctl_scsiio_precheck(struct ctl_softc *ctl_softc,
 			       struct ctl_scsiio *ctsio);
 static int ctl_scsiio(struct ctl_scsiio *ctsio);
 
 static int ctl_target_reset(union ctl_io *io);
 static void ctl_do_lun_reset(struct ctl_lun *lun, uint32_t initidx,
 			 ctl_ua_type ua_type);
 static int ctl_lun_reset(union ctl_io *io);
 static int ctl_abort_task(union ctl_io *io);
 static int ctl_abort_task_set(union ctl_io *io);
 static int ctl_query_task(union ctl_io *io, int task_set);
 static void ctl_i_t_nexus_loss(struct ctl_softc *softc, uint32_t initidx,
 			      ctl_ua_type ua_type);
 static int ctl_i_t_nexus_reset(union ctl_io *io);
 static int ctl_query_async_event(union ctl_io *io);
 static void ctl_run_task(union ctl_io *io);
 #ifdef CTL_IO_DELAY
 static void ctl_datamove_timer_wakeup(void *arg);
 static void ctl_done_timer_wakeup(void *arg);
 #endif /* CTL_IO_DELAY */
 
 static void ctl_send_datamove_done(union ctl_io *io, int have_lock);
 static void ctl_datamove_remote_write_cb(struct ctl_ha_dt_req *rq);
 static int ctl_datamove_remote_dm_write_cb(union ctl_io *io);
 static void ctl_datamove_remote_write(union ctl_io *io);
 static int ctl_datamove_remote_dm_read_cb(union ctl_io *io);
 static void ctl_datamove_remote_read_cb(struct ctl_ha_dt_req *rq);
 static int ctl_datamove_remote_sgl_setup(union ctl_io *io);
 static int ctl_datamove_remote_xfer(union ctl_io *io, unsigned command,
 				    ctl_ha_dt_cb callback);
 static void ctl_datamove_remote_read(union ctl_io *io);
 static void ctl_datamove_remote(union ctl_io *io);
 static void ctl_process_done(union ctl_io *io);
 static void ctl_lun_thread(void *arg);
 static void ctl_thresh_thread(void *arg);
 static void ctl_work_thread(void *arg);
 static void ctl_enqueue_incoming(union ctl_io *io);
 static void ctl_enqueue_rtr(union ctl_io *io);
 static void ctl_enqueue_done(union ctl_io *io);
 static void ctl_enqueue_isc(union ctl_io *io);
 static const struct ctl_cmd_entry *
     ctl_get_cmd_entry(struct ctl_scsiio *ctsio, int *sa);
 static const struct ctl_cmd_entry *
     ctl_validate_command(struct ctl_scsiio *ctsio);
 static int ctl_cmd_applicable(uint8_t lun_type,
     const struct ctl_cmd_entry *entry);
 static int ctl_ha_init(void);
 static int ctl_ha_shutdown(void);
 
 static uint64_t ctl_get_prkey(struct ctl_lun *lun, uint32_t residx);
 static void ctl_clr_prkey(struct ctl_lun *lun, uint32_t residx);
 static void ctl_alloc_prkey(struct ctl_lun *lun, uint32_t residx);
 static void ctl_set_prkey(struct ctl_lun *lun, uint32_t residx, uint64_t key);
 
 /*
  * Load the serialization table.  This isn't very pretty, but is probably
  * the easiest way to do it.
  */
 #include "ctl_ser_table.c"
 
 /*
  * We only need to define open, close and ioctl routines for this driver.
  * The character device switch is named "ctl" and routes those three
  * entry points to ctl_open(), ctl_close() and ctl_ioctl().
  */
 static struct cdevsw ctl_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	0,
 	.d_open =	ctl_open,
 	.d_close =	ctl_close,
 	.d_ioctl =	ctl_ioctl,
 	.d_name =	"ctl",
 };
 
 
 MALLOC_DEFINE(M_CTL, "ctlmem", "Memory used for CTL");
 
 /* Module event callback, defined later in this file. */
 static int ctl_module_event_handler(module_t, int /*modeventtype_t*/, void *);
 
 /*
  * Module descriptor: name "ctl", events delivered to
  * ctl_module_event_handler(), no extra argument.
  */
 static moduledata_t ctl_moduledata = {
 	"ctl",
 	ctl_module_event_handler,
 	NULL
 };
 
 /* Register the module at SI_SUB_CONFIGURE / SI_ORDER_THIRD, version 1. */
 DECLARE_MODULE(ctl, ctl_moduledata, SI_SUB_CONFIGURE, SI_ORDER_THIRD);
 MODULE_VERSION(ctl, 1);
 
 /*
  * Frontend descriptor named "ha"; its init and shutdown hooks are
  * ctl_ha_init() and ctl_ha_shutdown() below.
  */
 static struct ctl_frontend ha_frontend =
 {
 	.name = "ha",
 	.init = ctl_ha_init,
 	.shutdown = ctl_ha_shutdown,
 };
 
 /*
  * Init hook of the "ha" frontend.
  *
  * Creates the "othersc" I/O pool, initialises HA messaging and registers
  * ctl_isc_event_handler() on CTL_HA_CHAN_CTL.  Each failure undoes the
  * steps already completed.
  *
  * Returns 0 on success, ENOMEM if the pool cannot be created, or EIO if
  * HA messaging cannot be initialised or the handler cannot be registered.
  */
 static int
 ctl_ha_init(void)
 {
 	struct ctl_softc *softc = control_softc;
 
 	if (ctl_pool_create(softc, "othersc", CTL_POOL_ENTRIES_OTHER_SC,
 	                    &softc->othersc_pool) != 0)
 		return (ENOMEM);
 	if (ctl_ha_msg_init(softc) != CTL_HA_STATUS_SUCCESS) {
 		ctl_pool_free(softc->othersc_pool);
 		return (EIO);
 	}
 	if (ctl_ha_msg_register(CTL_HA_CHAN_CTL, ctl_isc_event_handler)
 	    != CTL_HA_STATUS_SUCCESS) {
 		ctl_ha_msg_destroy(softc);
 		ctl_pool_free(softc->othersc_pool);
 		return (EIO);
 	}
 	return (0);
 }
 
 /*
  * Shutdown hook of the "ha" frontend.
  *
  * Shuts down HA messaging, deregisters the CTL_HA_CHAN_CTL handler,
  * destroys the messaging state, frees the "othersc" pool and finally
  * deregisters and frees every port still on the frontend's port list.
  *
  * Returns 0 on success.  If deregistering the handler or destroying the
  * messaging state fails, EIO is returned at once and the remaining
  * teardown steps are not performed.
  */
 static int
 ctl_ha_shutdown(void)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_port *port;
 
 	ctl_ha_msg_shutdown(softc);
 	if (ctl_ha_msg_deregister(CTL_HA_CHAN_CTL) != CTL_HA_STATUS_SUCCESS)
 		return (EIO);
 	if (ctl_ha_msg_destroy(softc) != CTL_HA_STATUS_SUCCESS)
 		return (EIO);
 	ctl_pool_free(softc->othersc_pool);
 	while ((port = STAILQ_FIRST(&ha_frontend.port_list)) != NULL) {
 		ctl_port_deregister(port);
 		free(port->port_name, M_CTL);
 		free(port, M_CTL);
 	}
 	return (0);
 }
 
 /*
  * Describe an I/O's data buffers to the peer controller over the HA
  * channel as one or more CTL_MSG_DATAMOVE messages.
  *
  * A flat buffer (kern_sg_entries == 0) is sent as a single S/G entry.
  * A real S/G list is split into chunks of at most as many entries as fit
  * in msg.dt.sg_list; each message carries its chunk in sg_list[0..],
  * the chunk size in cur_sg_entries and the number of entries sent before
  * it in sent_sg_entries.
  *
  * If a send fails, or the I/O is found flagged CTL_FLAG_FAILOVER once
  * everything has been sent, port_status is set (31341 and 31342
  * respectively) and be_move_done() is called.  Otherwise the I/O is
  * marked as having DMA in progress and no longer active.
  */
 static void
 ctl_ha_datamove(union ctl_io *io)
 {
 	struct ctl_lun *lun = CTL_LUN(io);
 	struct ctl_sg_entry *sgl;
 	union ctl_ha_msg msg;
 	uint32_t sg_entries_sent;
 	int do_sg_copy, i, j;
 
 	memset(&msg.dt, 0, sizeof(msg.dt));
 	msg.hdr.msg_type = CTL_MSG_DATAMOVE;
 	msg.hdr.original_sc = io->io_hdr.remote_io;
 	msg.hdr.serializing_sc = io;
 	msg.hdr.nexus = io->io_hdr.nexus;
 	msg.hdr.status = io->io_hdr.status;
 	msg.dt.flags = io->io_hdr.flags;
 
 	/*
 	 * We convert everything into a S/G list here.  We can't
 	 * pass by reference, only by value between controllers.
 	 * So we can't pass a pointer to the S/G list, only as many
 	 * S/G entries as we can fit in here.  If it's possible for
 	 * us to get more than CTL_HA_MAX_SG_ENTRIES S/G entries,
 	 * then we need to break this up into multiple transfers.
 	 */
 	if (io->scsiio.kern_sg_entries == 0) {
 		msg.dt.kern_sg_entries = 1;
 #if 0
 		if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
 			msg.dt.sg_list[0].addr = io->scsiio.kern_data_ptr;
 		} else {
 			/* XXX KDM use busdma here! */
 			msg.dt.sg_list[0].addr =
 			    (void *)vtophys(io->scsiio.kern_data_ptr);
 		}
 #else
 		KASSERT((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0,
 		    ("HA does not support BUS_ADDR"));
 		msg.dt.sg_list[0].addr = io->scsiio.kern_data_ptr;
 #endif
 		msg.dt.sg_list[0].len = io->scsiio.kern_data_len;
 		do_sg_copy = 0;
 	} else {
 		msg.dt.kern_sg_entries = io->scsiio.kern_sg_entries;
 		do_sg_copy = 1;
 	}
 
 	msg.dt.kern_data_len = io->scsiio.kern_data_len;
 	msg.dt.kern_total_len = io->scsiio.kern_total_len;
 	msg.dt.kern_data_resid = io->scsiio.kern_data_resid;
 	msg.dt.kern_rel_offset = io->scsiio.kern_rel_offset;
 	msg.dt.sg_sequence = 0;
 
 	/*
 	 * Loop until we've sent all of the S/G entries.  On the
 	 * other end, we'll recompose these S/G entries into one
 	 * contiguous list before processing.
 	 */
 	for (sg_entries_sent = 0; sg_entries_sent < msg.dt.kern_sg_entries;
 	    msg.dt.sg_sequence++) {
 		msg.dt.cur_sg_entries = MIN((sizeof(msg.dt.sg_list) /
 		    sizeof(msg.dt.sg_list[0])),
 		    msg.dt.kern_sg_entries - sg_entries_sent);
 		if (do_sg_copy != 0) {
 			sgl = (struct ctl_sg_entry *)io->scsiio.kern_data_ptr;
 			/*
 			 * i walks the source list from where the previous
 			 * message stopped; j walks this message's slots.
 			 * The bound must be on j: cur_sg_entries is a
 			 * per-message count, so bounding i by it would
 			 * copy nothing for any message after the first.
 			 */
 			for (i = sg_entries_sent, j = 0;
 			     j < msg.dt.cur_sg_entries; i++, j++) {
 #if 0
 				if (io->io_hdr.flags & CTL_FLAG_BUS_ADDR) {
 					msg.dt.sg_list[j].addr = sgl[i].addr;
 				} else {
 					/* XXX KDM use busdma here! */
 					msg.dt.sg_list[j].addr =
 					    (void *)vtophys(sgl[i].addr);
 				}
 #else
 				KASSERT((io->io_hdr.flags &
 				    CTL_FLAG_BUS_ADDR) == 0,
 				    ("HA does not support BUS_ADDR"));
 				msg.dt.sg_list[j].addr = sgl[i].addr;
 #endif
 				msg.dt.sg_list[j].len = sgl[i].len;
 			}
 		}
 
 		sg_entries_sent += msg.dt.cur_sg_entries;
 		msg.dt.sg_last = (sg_entries_sent >= msg.dt.kern_sg_entries);
 		/* Send only the header plus the entries actually in use. */
 		if (ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg,
 		    sizeof(msg.dt) - sizeof(msg.dt.sg_list) +
 		    sizeof(struct ctl_sg_entry) * msg.dt.cur_sg_entries,
 		    M_WAITOK) > CTL_HA_STATUS_SUCCESS) {
 			io->io_hdr.port_status = 31341;
 			io->scsiio.be_move_done(io);
 			return;
 		}
 		/* Updated after the send, so it counts earlier messages only. */
 		msg.dt.sent_sg_entries = sg_entries_sent;
 	}
 
 	/*
 	 * Officially hand over the request from us to the peer.
 	 * If failover has just happened, then we must return an error.
 	 * If failover happens just after, then it is not our problem.
 	 */
 	if (lun)
 		mtx_lock(&lun->lun_lock);
 	if (io->io_hdr.flags & CTL_FLAG_FAILOVER) {
 		if (lun)
 			mtx_unlock(&lun->lun_lock);
 		io->io_hdr.port_status = 31342;
 		io->scsiio.be_move_done(io);
 		return;
 	}
 	io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE;
 	io->io_hdr.flags |= CTL_FLAG_DMA_INPROG;
 	if (lun)
 		mtx_unlock(&lun->lun_lock);
 }
 
 /*
  * Completion hook for an I/O handled on behalf of the peer controller.
  *
  * For SCSI I/Os a CTL_MSG_FINISH_IO message carrying the status, tag
  * and sense data is sent to the peer; the message is trimmed so that
  * only sense_len bytes of sense data are transmitted.  The result of
  * the send is not checked.  The I/O is freed in every case.
  */
 static void
 ctl_ha_done(union ctl_io *io)
 {
 	union ctl_ha_msg msg;
 
 	/* Non-SCSI I/Os have nothing to report; just release them. */
 	if (io->io_hdr.io_type != CTL_IO_SCSI) {
 		ctl_free_io(io);
 		return;
 	}
 
 	memset(&msg, 0, sizeof(msg));
 
 	/* Common header. */
 	msg.hdr.msg_type = CTL_MSG_FINISH_IO;
 	msg.hdr.original_sc = io->io_hdr.remote_io;
 	msg.hdr.nexus = io->io_hdr.nexus;
 	msg.hdr.status = io->io_hdr.status;
 
 	/* SCSI completion details. */
 	msg.scsi.scsi_status = io->scsiio.scsi_status;
 	msg.scsi.tag_num = io->scsiio.tag_num;
 	msg.scsi.tag_type = io->scsiio.tag_type;
 	msg.scsi.sense_len = io->scsiio.sense_len;
 	memcpy(&msg.scsi.sense_data, &io->scsiio.sense_data,
 	    io->scsiio.sense_len);
 
 	ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg,
 	    sizeof(msg.scsi) - sizeof(msg.scsi.sense_data) +
 	    msg.scsi.sense_len, M_WAITOK);
 
 	ctl_free_io(io);
 }
 
 static void
 ctl_isc_handler_finish_xfer(struct ctl_softc *ctl_softc,
 			    union ctl_ha_msg *msg_info)
 {
 	struct ctl_scsiio *ctsio;
 
 	if (msg_info->hdr.original_sc == NULL) {
 		printf("%s: original_sc == NULL!\n", __func__);
 		/* XXX KDM now what? */
 		return;
 	}
 
 	ctsio = &msg_info->hdr.original_sc->scsiio;
 	ctsio->io_hdr.flags |= CTL_FLAG_IO_ACTIVE;
 	ctsio->io_hdr.msg_type = CTL_MSG_FINISH_IO;
 	ctsio->io_hdr.status = msg_info->hdr.status;
 	ctsio->scsi_status = msg_info->scsi.scsi_status;
 	ctsio->sense_len = msg_info->scsi.sense_len;
 	memcpy(&ctsio->sense_data, &msg_info->scsi.sense_data,
 	       msg_info->scsi.sense_len);
 	ctl_enqueue_isc((union ctl_io *)ctsio);
 }
 
 static void
 ctl_isc_handler_finish_ser_only(struct ctl_softc *ctl_softc,
 				union ctl_ha_msg *msg_info)
 {
 	struct ctl_scsiio *ctsio;
 
 	if (msg_info->hdr.serializing_sc == NULL) {
 		printf("%s: serializing_sc == NULL!\n", __func__);
 		/* XXX KDM now what? */
 		return;
 	}
 
 	ctsio = &msg_info->hdr.serializing_sc->scsiio;
 	ctsio->io_hdr.msg_type = CTL_MSG_FINISH_IO;
 	ctl_enqueue_isc((union ctl_io *)ctsio);
 }
 
 /*
  * Send a CTL_MSG_LUN_SYNC message describing this LUN to the HA peer:
  * its flags, persistent reservation state, device ID (if any) and one
  * ctl_ha_msg_lun_pr_key record per initiator with a non-zero key.
  * Does nothing unless the HA link is online.  If this node is primary
  * for the LUN, each mode page index entry is announced afterwards too.
  */
 void
 ctl_isc_announce_lun(struct ctl_lun *lun)
 {
 	struct ctl_softc *softc = lun->ctl_softc;
 	union ctl_ha_msg *msg;
 	struct ctl_ha_msg_lun_pr_key pr_key;
 	int i, k;
 
 	if (softc->ha_link != CTL_HA_LINK_ONLINE)
 		return;
 	mtx_lock(&lun->lun_lock);
 	i = sizeof(msg->lun);
 	if (lun->lun_devid)
 		i += lun->lun_devid->len;
 	i += sizeof(pr_key) * lun->pr_key_count;
 alloc:
 	/*
 	 * M_WAITOK allocation is done with the LUN lock dropped.  The
 	 * required size is recomputed after relocking and the allocation
 	 * is retried if the LUN grew in the meantime.
 	 */
 	mtx_unlock(&lun->lun_lock);
 	msg = malloc(i, M_CTL, M_WAITOK);
 	mtx_lock(&lun->lun_lock);
 	k = sizeof(msg->lun);
 	if (lun->lun_devid)
 		k += lun->lun_devid->len;
 	k += sizeof(pr_key) * lun->pr_key_count;
 	if (i < k) {
 		free(msg, M_CTL);
 		i = k;
 		goto alloc;
 	}
 	bzero(&msg->lun, sizeof(msg->lun));
 	msg->hdr.msg_type = CTL_MSG_LUN_SYNC;
 	msg->hdr.nexus.targ_lun = lun->lun;
 	msg->hdr.nexus.targ_mapped_lun = lun->lun;
 	msg->lun.flags = lun->flags;
 	msg->lun.pr_generation = lun->pr_generation;
 	msg->lun.pr_res_idx = lun->pr_res_idx;
 	msg->lun.pr_res_type = lun->pr_res_type;
 	msg->lun.pr_key_count = lun->pr_key_count;
 
 	/* From here on i is the number of payload bytes in lun.data[]. */
 	i = 0;
 	if (lun->lun_devid) {
 		msg->lun.lun_devid_len = lun->lun_devid->len;
 		memcpy(&msg->lun.data[i], lun->lun_devid->data,
 		    msg->lun.lun_devid_len);
 		i += msg->lun.lun_devid_len;
 	}
 	for (k = 0; k < CTL_MAX_INITIATORS; k++) {
 		if ((pr_key.pr_key = ctl_get_prkey(lun, k)) == 0)
 			continue;
 		pr_key.pr_iid = k;
 		memcpy(&msg->lun.data[i], &pr_key, sizeof(pr_key));
 		i += sizeof(pr_key);
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	/*
 	 * The buffer was sized and filled using the LUN message layout, so
 	 * the transmitted length must be based on sizeof(msg->lun) as well;
 	 * using the port message size here would send the wrong number of
 	 * bytes.
 	 */
 	ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg->lun, sizeof(msg->lun) + i,
 	    M_WAITOK);
 	free(msg, M_CTL);
 
 	if (lun->flags & CTL_LUN_PRIMARY_SC) {
 		for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 			ctl_isc_announce_mode(lun, -1,
 			    lun->mode_pages.index[i].page_code & SMPH_PC_MASK,
 			    lun->mode_pages.index[i].subpage);
 		}
 	}
 }
 
 /*
  * Send a CTL_MSG_PORT_SYNC message describing this port to the HA peer.
  * Only ports inside this node's [port_min, port_max) range are announced,
  * and only while the HA link is online.  The variable-length payload is
  * packed into port.data[] in this order: name ("<ha_id>:<port_name>",
  * NUL-terminated), LUN map, port device ID, target device ID, initiator
  * device ID; the matching *_len header fields give each piece's size.
  */
 void
 ctl_isc_announce_port(struct ctl_port *port)
 {
 	struct ctl_softc *softc = port->ctl_softc;
 	union ctl_ha_msg *msg;
 	int i, name_max;
 
 	if (port->targ_port < softc->port_min ||
 	    port->targ_port >= softc->port_max ||
 	    softc->ha_link != CTL_HA_LINK_ONLINE)
 		return;
 
 	/*
 	 * The name is transmitted with an "<ha_id>:" prefix, so the buffer
 	 * must hold more than just port_name: up to 11 characters for a
 	 * decimal int (including sign), the ':' separator, the name itself
 	 * and the terminating NUL.
 	 */
 	name_max = 11 + 1 + strlen(port->port_name) + 1;
 	i = sizeof(msg->port) + name_max;
 	if (port->lun_map)
 		i += port->lun_map_size * sizeof(uint32_t);
 	if (port->port_devid)
 		i += port->port_devid->len;
 	if (port->target_devid)
 		i += port->target_devid->len;
 	if (port->init_devid)
 		i += port->init_devid->len;
 	msg = malloc(i, M_CTL, M_WAITOK);
 	bzero(&msg->port, sizeof(msg->port));
 	msg->hdr.msg_type = CTL_MSG_PORT_SYNC;
 	msg->hdr.nexus.targ_port = port->targ_port;
 	msg->port.port_type = port->port_type;
 	msg->port.physical_port = port->physical_port;
 	msg->port.virtual_port = port->virtual_port;
 	msg->port.status = port->status;
 
 	/* From here on i is the number of payload bytes in port.data[]. */
 	i = 0;
 	/* name_len counts the terminating NUL, hence the "+ 1". */
 	msg->port.name_len = snprintf(&msg->port.data[i], name_max,
 	    "%d:%s", softc->ha_id, port->port_name) + 1;
 	i += msg->port.name_len;
 	if (port->lun_map) {
 		msg->port.lun_map_len = port->lun_map_size * sizeof(uint32_t);
 		memcpy(&msg->port.data[i], port->lun_map,
 		    msg->port.lun_map_len);
 		i += msg->port.lun_map_len;
 	}
 	if (port->port_devid) {
 		msg->port.port_devid_len = port->port_devid->len;
 		memcpy(&msg->port.data[i], port->port_devid->data,
 		    msg->port.port_devid_len);
 		i += msg->port.port_devid_len;
 	}
 	if (port->target_devid) {
 		msg->port.target_devid_len = port->target_devid->len;
 		memcpy(&msg->port.data[i], port->target_devid->data,
 		    msg->port.target_devid_len);
 		i += msg->port.target_devid_len;
 	}
 	if (port->init_devid) {
 		msg->port.init_devid_len = port->init_devid->len;
 		memcpy(&msg->port.data[i], port->init_devid->data,
 		    msg->port.init_devid_len);
 		i += msg->port.init_devid_len;
 	}
 	/* Only the bytes actually used are sent, not the whole buffer. */
 	ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg->port, sizeof(msg->port) + i,
 	    M_WAITOK);
 	free(msg, M_CTL);
 }
 
 /*
  * Send a CTL_MSG_IID_SYNC message for initiator slot 'iid' of this port
  * (in_use flag, WWPN and optional name) to the HA peer.  Only ports in
  * this node's [port_min, port_max) range are announced, and only while
  * the HA link is online.  The message is built under ctl_lock with a
  * non-sleeping allocation; if that allocation fails nothing is sent.
  */
 void
 ctl_isc_announce_iid(struct ctl_port *port, int iid)
 {
 	struct ctl_softc *softc = port->ctl_softc;
 	union ctl_ha_msg *msg;
 	int msgsz, namesz;
 
 	if (port->targ_port < softc->port_min ||
 	    port->targ_port >= softc->port_max ||
 	    softc->ha_link != CTL_HA_LINK_ONLINE)
 		return;
 
 	mtx_lock(&softc->ctl_lock);
 
 	/* Name length includes the terminating NUL; 0 means "no name". */
 	namesz = 0;
 	if (port->wwpn_iid[iid].name != NULL)
 		namesz = strlen(port->wwpn_iid[iid].name) + 1;
 	msgsz = sizeof(msg->iid);
 	msgsz += namesz;
 
 	msg = malloc(msgsz, M_CTL, M_NOWAIT);
 	if (msg == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 		return;
 	}
 
 	bzero(&msg->iid, sizeof(msg->iid));
 	msg->hdr.msg_type = CTL_MSG_IID_SYNC;
 	msg->hdr.nexus.initid = iid;
 	msg->hdr.nexus.targ_port = port->targ_port;
 	msg->iid.wwpn = port->wwpn_iid[iid].wwpn;
 	msg->iid.in_use = port->wwpn_iid[iid].in_use;
 	msg->iid.name_len = namesz;
 	if (port->wwpn_iid[iid].name != NULL)
 		strlcpy(msg->iid.data, port->wwpn_iid[iid].name, namesz);
 
 	mtx_unlock(&softc->ctl_lock);
 
 	ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg->iid, msgsz, M_NOWAIT);
 	free(msg, M_CTL);
 }
 
 /*
  * Send a CTL_MSG_MODE_SYNC message carrying the current contents of one
  * mode page of this LUN to the HA peer.  The page is looked up by
  * (page, subpage) in the LUN's mode page index; nothing is sent if the
  * link is not online, the page is not in the index, or it has no data.
  * 'initidx' is split into the port and initiator fields of the nexus.
  */
 void
 ctl_isc_announce_mode(struct ctl_lun *lun, uint32_t initidx,
     uint8_t page, uint8_t subpage)
 {
 	struct ctl_softc *softc = lun->ctl_softc;
 	union ctl_ha_msg msg;
 	u_int idx;
 
 	if (softc->ha_link != CTL_HA_LINK_ONLINE)
 		return;
 
 	/* Locate the index entry matching the requested page/subpage. */
 	idx = 0;
 	while (idx < CTL_NUM_MODE_PAGES) {
 		if ((lun->mode_pages.index[idx].page_code & SMPH_PC_MASK) ==
 		    page && lun->mode_pages.index[idx].subpage == subpage)
 			break;
 		idx++;
 	}
 
 	/*
 	 * Give up if the page is unknown, and don't try to replicate pages
 	 * not present on this device.
 	 */
 	if (idx == CTL_NUM_MODE_PAGES ||
 	    lun->mode_pages.index[idx].page_data == NULL)
 		return;
 
 	bzero(&msg.mode, sizeof(msg.mode));
 	msg.hdr.msg_type = CTL_MSG_MODE_SYNC;
 	msg.hdr.nexus.targ_lun = lun->lun;
 	msg.hdr.nexus.targ_mapped_lun = lun->lun;
 	msg.hdr.nexus.targ_port = initidx / CTL_MAX_INIT_PER_PORT;
 	msg.hdr.nexus.initid = initidx % CTL_MAX_INIT_PER_PORT;
 	msg.mode.page_code = page;
 	msg.mode.subpage = subpage;
 	msg.mode.page_len = lun->mode_pages.index[idx].page_len;
 	memcpy(msg.mode.data, lun->mode_pages.index[idx].page_data,
 	    msg.mode.page_len);
 
 	/*
 	 * NOTE(review): the length sent is sizeof(msg.mode); confirm that
 	 * this covers the page_len bytes just copied into msg.mode.data.
 	 */
 	ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg.mode, sizeof(msg.mode),
 	    M_WAITOK);
 }
 
 /*
  * Called when the HA link comes online: send a CTL_MSG_LOGIN message
  * with this node's version, mode, id and limits, then announce every
  * port (with its in-use initiators) and every LUN to the peer.
  */
 static void
 ctl_isc_ha_link_up(struct ctl_softc *softc)
 {
 	struct ctl_port *port;
 	struct ctl_lun *lun;
 	union ctl_ha_msg msg;
 	int iid;
 
 	/* Announce this node parameters to peer for validation. */
 	msg.login.msg_type = CTL_MSG_LOGIN;
 	msg.login.version = CTL_HA_VERSION;
 	msg.login.ha_id = softc->ha_id;
 	msg.login.ha_mode = softc->ha_mode;
 	msg.login.max_init_per_port = CTL_MAX_INIT_PER_PORT;
 	msg.login.max_ports = ctl_max_ports;
 	msg.login.max_luns = ctl_max_luns;
 	ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg.login, sizeof(msg.login),
 	    M_WAITOK);
 
 	/* Ports first, each followed by its active initiators. */
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		ctl_isc_announce_port(port);
 		for (iid = 0; iid < CTL_MAX_INIT_PER_PORT; iid++) {
 			if (!port->wwpn_iid[iid].in_use)
 				continue;
 			ctl_isc_announce_iid(port, iid);
 		}
 	}
 
 	/* Then the LUNs. */
 	STAILQ_FOREACH(lun, &softc->lun_list, links) {
 		ctl_isc_announce_lun(lun);
 	}
 }
 
 /*
  * Called when the HA link goes offline.  For every LUN: forget that the
  * peer was primary (raising an asymmetric-access-change unit attention
  * if it was) and queue a CTL_MSG_FAILOVER I/O.  For every port outside
  * this node's [port_min, port_max) range: mark it offline and drop all
  * of its initiator records.
  */
 static void
 ctl_isc_ha_link_down(struct ctl_softc *softc)
 {
 	struct ctl_port *port;
 	struct ctl_lun *lun;
 	union ctl_io *fo_io;
 	int iid;
 
 	mtx_lock(&softc->ctl_lock);
 
 	STAILQ_FOREACH(lun, &softc->lun_list, links) {
 		mtx_lock(&lun->lun_lock);
 		if ((lun->flags & CTL_LUN_PEER_SC_PRIMARY) != 0) {
 			lun->flags &= ~CTL_LUN_PEER_SC_PRIMARY;
 			ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE);
 		}
 		mtx_unlock(&lun->lun_lock);
 
 		/* ctl_lock is released around the I/O allocation. */
 		mtx_unlock(&softc->ctl_lock);
 		fo_io = ctl_alloc_io(softc->othersc_pool);
 		mtx_lock(&softc->ctl_lock);
 
 		ctl_zero_io(fo_io);
 		fo_io->io_hdr.msg_type = CTL_MSG_FAILOVER;
 		fo_io->io_hdr.nexus.targ_mapped_lun = lun->lun;
 		ctl_enqueue_isc(fo_io);
 	}
 
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		/* Only ports outside the local range are touched. */
 		if (port->targ_port < softc->port_min ||
 		    port->targ_port >= softc->port_max) {
 			port->status &= ~CTL_PORT_STATUS_ONLINE;
 			for (iid = 0; iid < CTL_MAX_INIT_PER_PORT; iid++) {
 				port->wwpn_iid[iid].in_use = 0;
 				free(port->wwpn_iid[iid].name, M_CTL);
 				port->wwpn_iid[iid].name = NULL;
 			}
 		}
 	}
 
 	mtx_unlock(&softc->ctl_lock);
 }
 
 /*
  * Handle CTL_MSG_UA from the HA peer: set or clear a unit attention on
  * the LUN named in the message, either for one initiator or for all
  * (ua_all).  Messages for unknown or out-of-range LUNs are ignored.
  */
 static void
 ctl_isc_ua(struct ctl_softc *softc, union ctl_ha_msg *msg, int len)
 {
 	struct ctl_lun *lun;
 	uint32_t iid = ctl_get_initindex(&msg->hdr.nexus);
 
 	/* Look the LUN up under ctl_lock, then switch to the LUN lock. */
 	mtx_lock(&softc->ctl_lock);
 	if (msg->hdr.nexus.targ_mapped_lun >= ctl_max_luns ||
 	    (lun = softc->ctl_luns[msg->hdr.nexus.targ_mapped_lun]) == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 		return;
 	}
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&softc->ctl_lock);
 
 	/* A thin-provisioning threshold UA being set carries extra info. */
 	if (msg->ua.ua_type == CTL_UA_THIN_PROV_THRES && msg->ua.ua_set)
 		memcpy(lun->ua_tpt_info, msg->ua.ua_info, 8);
 
 	if (msg->ua.ua_all && msg->ua.ua_set)
 		ctl_est_ua_all(lun, iid, msg->ua.ua_type);
 	else if (msg->ua.ua_all)
 		ctl_clr_ua_all(lun, iid, msg->ua.ua_type);
 	else if (msg->ua.ua_set)
 		ctl_est_ua(lun, iid, msg->ua.ua_type);
 	else
 		ctl_clr_ua(lun, iid, msg->ua.ua_type);
 
 	mtx_unlock(&lun->lun_lock);
 }
 
 /*
  * Handle CTL_MSG_LUN_SYNC from the HA peer.  The message is ignored for
  * unknown or disabled LUNs and rejected if the device ID does not match
  * the local one.  Otherwise the peer's primary/secondary role is
  * recorded; if the peer is primary and this node is not, the persistent
  * reservation state from the message replaces the local state.  If this
  * node is primary but the peer does not know it, the LUN is announced
  * back to the peer.
  */
 static void
 ctl_isc_lun_sync(struct ctl_softc *softc, union ctl_ha_msg *msg, int len)
 {
 	struct ctl_lun *lun;
 	struct ctl_ha_msg_lun_pr_key pr_key;
 	int off, n;
 	ctl_lun_flags prev_flags;
 	uint32_t targ_lun;
 
 	targ_lun = msg->hdr.nexus.targ_mapped_lun;
 
 	/* Look the LUN up under ctl_lock, then switch to the LUN lock. */
 	mtx_lock(&softc->ctl_lock);
 	if (targ_lun >= ctl_max_luns ||
 	    (lun = softc->ctl_luns[targ_lun]) == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 		return;
 	}
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&softc->ctl_lock);
 	if (lun->flags & CTL_LUN_DISABLED) {
 		mtx_unlock(&lun->lun_lock);
 		return;
 	}
 
 	/*
 	 * The device ID sits at the start of the payload; 'off' ends up
 	 * as the offset of the first reservation key record behind it.
 	 */
 	off = 0;
 	if (lun->lun_devid != NULL)
 		off = lun->lun_devid->len;
 	if (msg->lun.lun_devid_len != off || (off > 0 &&
 	    memcmp(&msg->lun.data[0], lun->lun_devid->data, off) != 0)) {
 		mtx_unlock(&lun->lun_lock);
 		printf("%s: Received conflicting HA LUN %d\n",
 		    __func__, targ_lun);
 		return;
 	}
 
 	/* Record whether peer is primary. */
 	prev_flags = lun->flags;
 	if ((msg->lun.flags & CTL_LUN_PRIMARY_SC) &&
 	    (msg->lun.flags & CTL_LUN_DISABLED) == 0)
 		lun->flags |= CTL_LUN_PEER_SC_PRIMARY;
 	else
 		lun->flags &= ~CTL_LUN_PEER_SC_PRIMARY;
 	if (prev_flags != lun->flags)
 		ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE);
 
 	/* If peer is primary and we are not -- use data */
 	if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 &&
 	    (lun->flags & CTL_LUN_PEER_SC_PRIMARY)) {
 		lun->pr_generation = msg->lun.pr_generation;
 		lun->pr_res_idx = msg->lun.pr_res_idx;
 		lun->pr_res_type = msg->lun.pr_res_type;
 		lun->pr_key_count = msg->lun.pr_key_count;
 
 		/* Drop every local key, then install the peer's keys. */
 		for (n = 0; n < CTL_MAX_INITIATORS; n++)
 			ctl_clr_prkey(lun, n);
 		for (n = 0; n < msg->lun.pr_key_count; n++) {
 			memcpy(&pr_key, &msg->lun.data[off], sizeof(pr_key));
 			ctl_alloc_prkey(lun, pr_key.pr_iid);
 			ctl_set_prkey(lun, pr_key.pr_iid, pr_key.pr_key);
 			off += sizeof(pr_key);
 		}
 	}
 
 	mtx_unlock(&lun->lun_lock);
 	CTL_DEBUG_PRINT(("%s: Known LUN %d, peer is %s\n",
 	    __func__, targ_lun,
 	    (msg->lun.flags & CTL_LUN_PRIMARY_SC) ?
 	    "primary" : "secondary"));
 
 	/* If we are primary but peer doesn't know -- notify */
 	if ((lun->flags & CTL_LUN_PRIMARY_SC) &&
 	    (msg->lun.flags & CTL_LUN_PEER_SC_PRIMARY) == 0)
 		ctl_isc_announce_lun(lun);
 }
 
 /*
  * Handle CTL_MSG_PORT_SYNC from the HA peer: create or update the local
  * shadow of a peer port.
  *
  * A port slot that is empty gets a newly allocated port bound to
  * ha_frontend; a slot already owned by ha_frontend is updated in place;
  * a slot owned by any other frontend is reported as a conflict and the
  * message is dropped.  The variable-length payload in port.data[] is
  * consumed in order: name, LUN map, port device ID, target device ID,
  * initiator device ID, each sized by its *_len field (0 means absent,
  * in which case the local copy is freed).  Finally an INQUIRY-data-
  * changed unit attention is raised on every LUN mapped to the port.
  *
  * NOTE(review): targ_port is taken from the message and used to index
  * ctl_ports[] without a range check in this function -- confirm it is
  * validated elsewhere.
  */
 static void
 ctl_isc_port_sync(struct ctl_softc *softc, union ctl_ha_msg *msg, int len)
 {
 	struct ctl_port *port;
 	struct ctl_lun *lun;
 	int i, new;
 
 	port = softc->ctl_ports[msg->hdr.nexus.targ_port];
 	if (port == NULL) {
 		CTL_DEBUG_PRINT(("%s: New port %d\n", __func__,
 		    msg->hdr.nexus.targ_port));
 		new = 1;
 		port = malloc(sizeof(*port), M_CTL, M_WAITOK | M_ZERO);
 		port->frontend = &ha_frontend;
 		port->targ_port = msg->hdr.nexus.targ_port;
 		port->fe_datamove = ctl_ha_datamove;
 		port->fe_done = ctl_ha_done;
 	} else if (port->frontend == &ha_frontend) {
 		CTL_DEBUG_PRINT(("%s: Updated port %d\n", __func__,
 		    msg->hdr.nexus.targ_port));
 		new = 0;
 	} else {
 		printf("%s: Received conflicting HA port %d\n",
 		    __func__, msg->hdr.nexus.targ_port);
 		return;
 	}
 	port->port_type = msg->port.port_type;
 	port->physical_port = msg->port.physical_port;
 	port->virtual_port = msg->port.virtual_port;
 	port->status = msg->port.status;
 	/* i is the running offset into the message payload. */
 	i = 0;
 	free(port->port_name, M_CTL);
 	port->port_name = strndup(&msg->port.data[i], msg->port.name_len,
 	    M_CTL);
 	i += msg->port.name_len;
 	if (msg->port.lun_map_len != 0) {
 		/* Reallocate only if there is no map or it is too small. */
 		if (port->lun_map == NULL ||
 		    port->lun_map_size * sizeof(uint32_t) <
 		    msg->port.lun_map_len) {
 			port->lun_map_size = 0;
 			free(port->lun_map, M_CTL);
 			port->lun_map = malloc(msg->port.lun_map_len,
 			    M_CTL, M_WAITOK);
 		}
 		memcpy(port->lun_map, &msg->port.data[i], msg->port.lun_map_len);
 		port->lun_map_size = msg->port.lun_map_len / sizeof(uint32_t);
 		i += msg->port.lun_map_len;
 	} else {
 		port->lun_map_size = 0;
 		free(port->lun_map, M_CTL);
 		port->lun_map = NULL;
 	}
 	/*
 	 * The three device IDs below follow the same pattern: reuse the
 	 * existing buffer when it is large enough, otherwise reallocate;
 	 * free the local copy when the peer sent none.
 	 */
 	if (msg->port.port_devid_len != 0) {
 		if (port->port_devid == NULL ||
 		    port->port_devid->len < msg->port.port_devid_len) {
 			free(port->port_devid, M_CTL);
 			port->port_devid = malloc(sizeof(struct ctl_devid) +
 			    msg->port.port_devid_len, M_CTL, M_WAITOK);
 		}
 		memcpy(port->port_devid->data, &msg->port.data[i],
 		    msg->port.port_devid_len);
 		port->port_devid->len = msg->port.port_devid_len;
 		i += msg->port.port_devid_len;
 	} else {
 		free(port->port_devid, M_CTL);
 		port->port_devid = NULL;
 	}
 	if (msg->port.target_devid_len != 0) {
 		if (port->target_devid == NULL ||
 		    port->target_devid->len < msg->port.target_devid_len) {
 			free(port->target_devid, M_CTL);
 			port->target_devid = malloc(sizeof(struct ctl_devid) +
 			    msg->port.target_devid_len, M_CTL, M_WAITOK);
 		}
 		memcpy(port->target_devid->data, &msg->port.data[i],
 		    msg->port.target_devid_len);
 		port->target_devid->len = msg->port.target_devid_len;
 		i += msg->port.target_devid_len;
 	} else {
 		free(port->target_devid, M_CTL);
 		port->target_devid = NULL;
 	}
 	if (msg->port.init_devid_len != 0) {
 		if (port->init_devid == NULL ||
 		    port->init_devid->len < msg->port.init_devid_len) {
 			free(port->init_devid, M_CTL);
 			port->init_devid = malloc(sizeof(struct ctl_devid) +
 			    msg->port.init_devid_len, M_CTL, M_WAITOK);
 		}
 		memcpy(port->init_devid->data, &msg->port.data[i],
 		    msg->port.init_devid_len);
 		port->init_devid->len = msg->port.init_devid_len;
 		i += msg->port.init_devid_len;
 	} else {
 		free(port->init_devid, M_CTL);
 		port->init_devid = NULL;
 	}
 	/* A newly created port is registered only once fully populated. */
 	if (new) {
 		if (ctl_port_register(port) != 0) {
 			printf("%s: ctl_port_register() failed with error\n",
 			    __func__);
 		}
 	}
 	/* Notify initiators of every LUN that is visible through the port. */
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_FOREACH(lun, &softc->lun_list, links) {
 		if (ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX)
 			continue;
 		mtx_lock(&lun->lun_lock);
 		ctl_est_ua_all(lun, -1, CTL_UA_INQ_CHANGE);
 		mtx_unlock(&lun->lun_lock);
 	}
 	mtx_unlock(&softc->ctl_lock);
 }
 
 /*
  * Handle CTL_MSG_IID_SYNC from the HA peer: update the in_use flag,
  * WWPN and name of one initiator slot on a known port.  If the slot
  * goes from in use to unused, an I_T nexus loss is signalled first.
  * Messages for ports with no local entry are logged and dropped.
  */
 static void
 ctl_isc_iid_sync(struct ctl_softc *softc, union ctl_ha_msg *msg, int len)
 {
 	struct ctl_port *port;
 	int iid;
 
 	port = softc->ctl_ports[msg->hdr.nexus.targ_port];
 	if (port == NULL) {
 		printf("%s: Received IID for unknown port %d\n",
 		    __func__, msg->hdr.nexus.targ_port);
 		return;
 	}
 
 	iid = msg->hdr.nexus.initid;
 
 	/* The initiator went away on the peer side. */
 	if (msg->iid.in_use == 0 && port->wwpn_iid[iid].in_use != 0)
 		ctl_i_t_nexus_loss(softc, iid, CTL_UA_POWERON);
 
 	port->wwpn_iid[iid].in_use = msg->iid.in_use;
 	port->wwpn_iid[iid].wwpn = msg->iid.wwpn;
 
 	/* Replace the stored name with a copy of the one received, if any. */
 	free(port->wwpn_iid[iid].name, M_CTL);
 	if (msg->iid.name_len == 0) {
 		port->wwpn_iid[iid].name = NULL;
 	} else {
 		port->wwpn_iid[iid].name = strndup(&msg->iid.data[0],
 		    msg->iid.name_len, M_CTL);
 	}
 }
 
 /*
  * Handle CTL_MSG_LOGIN from the HA peer: verify that both nodes run the
  * same HA protocol version and HA mode, have different ha_id values and
  * use the same LUN/port/initiator limits.  On any mismatch the reason is
  * logged and the HA channel is aborted.
  */
 static void
 ctl_isc_login(struct ctl_softc *softc, union ctl_ha_msg *msg, int len)
 {
 
 	if (msg->login.version != CTL_HA_VERSION) {
 		printf("CTL HA peers have different versions %d != %d\n",
 		    msg->login.version, CTL_HA_VERSION);
 		goto reject;
 	}
 	if (msg->login.ha_mode != softc->ha_mode) {
 		printf("CTL HA peers have different ha_mode %d != %d\n",
 		    msg->login.ha_mode, softc->ha_mode);
 		goto reject;
 	}
 	if (msg->login.ha_id == softc->ha_id) {
 		printf("CTL HA peers have same ha_id %d\n", msg->login.ha_id);
 		goto reject;
 	}
 	if (msg->login.max_luns != ctl_max_luns ||
 	    msg->login.max_ports != ctl_max_ports ||
 	    msg->login.max_init_per_port != CTL_MAX_INIT_PER_PORT) {
 		printf("CTL HA peers have different limits\n");
 		goto reject;
 	}
 
 	/* Peer parameters are compatible; nothing else to do. */
 	return;
 
 reject:
 	ctl_ha_msg_abort(CTL_HA_CHAN_CTL);
 }
 
 /*
  * Handle CTL_MSG_MODE_SYNC from the HA peer: overwrite the local copy of
  * one mode page of the addressed LUN with the data from the message and,
  * unless the message carries no valid initiator index, raise a
  * mode-change unit attention.  Messages for unknown or disabled LUNs, or
  * for a page/subpage that is not in the LUN's index, are ignored.
  */
 static void
 ctl_isc_mode_sync(struct ctl_softc *softc, union ctl_ha_msg *msg, int len)
 {
 	struct ctl_lun *lun;
 	u_int i;
 	uint32_t initidx, targ_lun;
 
 	targ_lun = msg->hdr.nexus.targ_mapped_lun;
 	/* Look the LUN up under ctl_lock, then switch to the LUN lock. */
 	mtx_lock(&softc->ctl_lock);
 	if (targ_lun >= ctl_max_luns ||
 	    (lun = softc->ctl_luns[targ_lun]) == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 		return;
 	}
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&softc->ctl_lock);
 	if (lun->flags & CTL_LUN_DISABLED) {
 		mtx_unlock(&lun->lun_lock);
 		return;
 	}
 	for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 		if ((lun->mode_pages.index[i].page_code & SMPH_PC_MASK) ==
 		    msg->mode.page_code &&
 		    lun->mode_pages.index[i].subpage == msg->mode.subpage)
 			break;
 	}
 	if (i == CTL_NUM_MODE_PAGES) {
 		mtx_unlock(&lun->lun_lock);
 		return;
 	}
 	/*
 	 * An index entry may have no backing data (the page is not present
 	 * on this device), so never copy through a NULL page_data.  Also
 	 * limit the copy to what the peer says it sent, so that a shorter
 	 * page from the peer cannot make us read past its data.
 	 */
 	if (lun->mode_pages.index[i].page_data != NULL) {
 		memcpy(lun->mode_pages.index[i].page_data, msg->mode.data,
 		    MIN(lun->mode_pages.index[i].page_len,
 		    msg->mode.page_len));
 	}
 	initidx = ctl_get_initindex(&msg->hdr.nexus);
 	/* -1 marks "no initiator"; in that case no UA is established. */
 	if (initidx != -1)
 		ctl_est_ua_all(lun, initidx, CTL_UA_MODE_CHANGE);
 	mtx_unlock(&lun->lun_lock);
 }
 
 /*
  * ISC (Inter Shelf Communication) event handler.  Events from the HA
  * subsystem come in here.
  *
  * For CTL_HA_EVT_MSG_RECV, 'param' is the size in bytes of the pending
  * message: the message is received (into a stack buffer when it fits,
  * otherwise into a temporary allocation) and dispatched on its type.
  * For CTL_HA_EVT_LINK_CHANGE, 'param' is the new link state, which is
  * stored in the softc before the link-up/link-down handling runs.
  * Any other event is logged and ignored.
  */
 static void
 ctl_isc_event_handler(ctl_ha_channel channel, ctl_ha_event event, int param)
 {
 	struct ctl_softc *softc = control_softc;
 	union ctl_io *io;
 	struct ctl_prio *presio;
 	ctl_ha_status isc_status;
 
 	CTL_DEBUG_PRINT(("CTL: Isc Msg event %d\n", event));
 	if (event == CTL_HA_EVT_MSG_RECV) {
 		union ctl_ha_msg *msg, msgbuf;
 
 		/* Use the on-stack buffer unless the message is larger. */
 		if (param > sizeof(msgbuf))
 			msg = malloc(param, M_CTL, M_WAITOK);
 		else
 			msg = &msgbuf;
 		isc_status = ctl_ha_msg_recv(CTL_HA_CHAN_CTL, msg, param,
 		    M_WAITOK);
 		if (isc_status != CTL_HA_STATUS_SUCCESS) {
 			printf("%s: Error receiving message: %d\n",
 			    __func__, isc_status);
 			if (msg != &msgbuf)
 				free(msg, M_CTL);
 			return;
 		}
 
 		CTL_DEBUG_PRINT(("CTL: msg_type %d\n", msg->msg_type));
 		switch (msg->hdr.msg_type) {
 		case CTL_MSG_SERIALIZE:
 			io = ctl_alloc_io(softc->othersc_pool);
 			ctl_zero_io(io);
 			/* Populate the new I/O from the message. */
 			io->io_hdr.io_type = CTL_IO_SCSI;
 			io->io_hdr.msg_type = CTL_MSG_SERIALIZE;
 			io->io_hdr.remote_io = msg->hdr.original_sc;
 			io->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC |
 					    CTL_FLAG_IO_ACTIVE;
 			/*
 			 * If we're in serialization-only mode, we don't
 			 * want to go through full done processing.  Thus
 			 * the COPY flag.
 			 *
 			 * XXX KDM add another flag that is more specific.
 			 */
 			if (softc->ha_mode != CTL_HA_MODE_XFER)
 				io->io_hdr.flags |= CTL_FLAG_INT_COPY;
 			io->io_hdr.nexus = msg->hdr.nexus;
 			io->scsiio.tag_num = msg->scsi.tag_num;
 			io->scsiio.tag_type = msg->scsi.tag_type;
 #ifdef CTL_TIME_IO
 			io->io_hdr.start_time = time_uptime;
 			getbinuptime(&io->io_hdr.start_bt);
 #endif /* CTL_TIME_IO */
 			io->scsiio.cdb_len = msg->scsi.cdb_len;
 			memcpy(io->scsiio.cdb, msg->scsi.cdb,
 			       CTL_MAX_CDBLEN);
 			if (softc->ha_mode == CTL_HA_MODE_XFER) {
 				const struct ctl_cmd_entry *entry;
 
 				/*
 				 * Take the data direction flags from the
 				 * command table entry for this CDB.
 				 */
 				entry = ctl_get_cmd_entry(&io->scsiio, NULL);
 				io->io_hdr.flags &= ~CTL_FLAG_DATA_MASK;
 				io->io_hdr.flags |=
 					entry->flags & CTL_FLAG_DATA_MASK;
 			}
 			ctl_enqueue_isc(io);
 			break;
 
 		/* Performed on the Originating SC, XFER mode only */
 		case CTL_MSG_DATAMOVE: {
 			struct ctl_sg_entry *sgl;
 			int i, j;
 
 			io = msg->hdr.original_sc;
 			if (io == NULL) {
 				printf("%s: original_sc == NULL!\n", __func__);
 				/* XXX KDM do something here */
 				break;
 			}
 			io->io_hdr.msg_type = CTL_MSG_DATAMOVE;
 			io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE;
 			/*
 			 * Keep track of this, we need to send it back over
 			 * when the datamove is complete.
 			 */
 			io->io_hdr.remote_io = msg->hdr.serializing_sc;
 			if (msg->hdr.status == CTL_SUCCESS)
 				io->io_hdr.status = msg->hdr.status;
 
 			/*
 			 * The first message of a transfer (sequence 0)
 			 * allocates one array holding the remote S/G list
 			 * followed by space for the local one, and copies
 			 * the transfer parameters into the I/O.  Later
 			 * messages reuse that array.
 			 */
 			if (msg->dt.sg_sequence == 0) {
 #ifdef CTL_TIME_IO
 				getbinuptime(&io->io_hdr.dma_start_bt);
 #endif
 				i = msg->dt.kern_sg_entries +
 				    msg->dt.kern_data_len /
 				    CTL_HA_DATAMOVE_SEGMENT + 1;
 				sgl = malloc(sizeof(*sgl) * i, M_CTL,
 				    M_WAITOK | M_ZERO);
 				CTL_RSGL(io) = sgl;
 				CTL_LSGL(io) = &sgl[msg->dt.kern_sg_entries];
 
 				io->scsiio.kern_data_ptr = (uint8_t *)sgl;
 
 				io->scsiio.kern_sg_entries =
 					msg->dt.kern_sg_entries;
 				io->scsiio.rem_sg_entries =
 					msg->dt.kern_sg_entries;
 				io->scsiio.kern_data_len =
 					msg->dt.kern_data_len;
 				io->scsiio.kern_total_len =
 					msg->dt.kern_total_len;
 				io->scsiio.kern_data_resid =
 					msg->dt.kern_data_resid;
 				io->scsiio.kern_rel_offset =
 					msg->dt.kern_rel_offset;
 				io->io_hdr.flags &= ~CTL_FLAG_BUS_ADDR;
 				io->io_hdr.flags |= msg->dt.flags &
 				    CTL_FLAG_BUS_ADDR;
 			} else
 				sgl = (struct ctl_sg_entry *)
 					io->scsiio.kern_data_ptr;
 
 			/*
 			 * Append this message's entries after the ones
 			 * received in earlier messages.
 			 */
 			for (i = msg->dt.sent_sg_entries, j = 0;
 			     i < (msg->dt.sent_sg_entries +
 			     msg->dt.cur_sg_entries); i++, j++) {
 				sgl[i].addr = msg->dt.sg_list[j].addr;
 				sgl[i].len = msg->dt.sg_list[j].len;
 			}
 
 			/*
 			 * If this is the last piece of the I/O, we've got
 			 * the full S/G list.  Queue processing in the thread.
 			 * Otherwise wait for the next piece.
 			 */
 			if (msg->dt.sg_last != 0)
 				ctl_enqueue_isc(io);
 			break;
 		}
 		/* Performed on the Serializing (primary) SC, XFER mode only */
 		case CTL_MSG_DATAMOVE_DONE: {
 			if (msg->hdr.serializing_sc == NULL) {
 				printf("%s: serializing_sc == NULL!\n",
 				       __func__);
 				/* XXX KDM now what? */
 				break;
 			}
 			/*
 			 * We grab the sense information here in case
 			 * there was a failure, so we can return status
 			 * back to the initiator.
 			 */
 			io = msg->hdr.serializing_sc;
 			io->io_hdr.msg_type = CTL_MSG_DATAMOVE_DONE;
 			io->io_hdr.flags &= ~CTL_FLAG_DMA_INPROG;
 			io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE;
 			io->io_hdr.port_status = msg->scsi.port_status;
 			io->scsiio.kern_data_resid = msg->scsi.kern_data_resid;
 			if (msg->hdr.status != CTL_STATUS_NONE) {
 				io->io_hdr.status = msg->hdr.status;
 				io->scsiio.scsi_status = msg->scsi.scsi_status;
 				io->scsiio.sense_len = msg->scsi.sense_len;
 				memcpy(&io->scsiio.sense_data,
 				    &msg->scsi.sense_data,
 				    msg->scsi.sense_len);
 				if (msg->hdr.status == CTL_SUCCESS)
 					io->io_hdr.flags |= CTL_FLAG_STATUS_SENT;
 			}
 			ctl_enqueue_isc(io);
 			break;
 		}
 
 		/* Performed on Originating SC, SER_ONLY mode */
 		case CTL_MSG_R2R:
 			io = msg->hdr.original_sc;
 			if (io == NULL) {
 				printf("%s: original_sc == NULL!\n",
 				    __func__);
 				break;
 			}
 			io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE;
 			io->io_hdr.msg_type = CTL_MSG_R2R;
 			io->io_hdr.remote_io = msg->hdr.serializing_sc;
 			ctl_enqueue_isc(io);
 			break;
 
 		/*
 		 * Performed on Serializing(i.e. primary SC) SC in SER_ONLY
 		 * mode.
 		 * Performed on the Originating (i.e. secondary) SC in XFER
 		 * mode
 		 */
 		case CTL_MSG_FINISH_IO:
 			if (softc->ha_mode == CTL_HA_MODE_XFER)
 				ctl_isc_handler_finish_xfer(softc, msg);
 			else
 				ctl_isc_handler_finish_ser_only(softc, msg);
 			break;
 
 		/* Performed on Originating SC */
 		case CTL_MSG_BAD_JUJU:
 			io = msg->hdr.original_sc;
 			if (io == NULL) {
 				printf("%s: Bad JUJU!, original_sc is NULL!\n",
 				       __func__);
 				break;
 			}
 			ctl_copy_sense_data(msg, io);
 			/*
 			 * IO should have already been cleaned up on other
 			 * SC so clear this flag so we won't send a message
 			 * back to finish the IO there.
 			 */
 			io->io_hdr.flags &= ~CTL_FLAG_SENT_2OTHER_SC;
 			io->io_hdr.flags |= CTL_FLAG_IO_ACTIVE;
 
 			/* io = msg->hdr.serializing_sc; */
 			io->io_hdr.msg_type = CTL_MSG_BAD_JUJU;
 			ctl_enqueue_isc(io);
 			break;
 
 		/* Handle resets sent from the other side */
 		case CTL_MSG_MANAGE_TASKS: {
 			struct ctl_taskio *taskio;
 			taskio = (struct ctl_taskio *)ctl_alloc_io(
 			    softc->othersc_pool);
 			ctl_zero_io((union ctl_io *)taskio);
 			taskio->io_hdr.io_type = CTL_IO_TASK;
 			taskio->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC;
 			taskio->io_hdr.nexus = msg->hdr.nexus;
 			taskio->task_action = msg->task.task_action;
 			taskio->tag_num = msg->task.tag_num;
 			taskio->tag_type = msg->task.tag_type;
 #ifdef CTL_TIME_IO
 			taskio->io_hdr.start_time = time_uptime;
 			getbinuptime(&taskio->io_hdr.start_bt);
 #endif /* CTL_TIME_IO */
 			/* Task I/O is run directly rather than queued. */
 			ctl_run_task((union ctl_io *)taskio);
 			break;
 		}
 		/* Persistent Reserve action which needs attention */
 		case CTL_MSG_PERS_ACTION:
 			presio = (struct ctl_prio *)ctl_alloc_io(
 			    softc->othersc_pool);
 			ctl_zero_io((union ctl_io *)presio);
 			presio->io_hdr.msg_type = CTL_MSG_PERS_ACTION;
 			presio->io_hdr.flags |= CTL_FLAG_FROM_OTHER_SC;
 			presio->io_hdr.nexus = msg->hdr.nexus;
 			presio->pr_msg = msg->pr;
 			ctl_enqueue_isc((union ctl_io *)presio);
 			break;
 		/* State synchronization messages have dedicated handlers. */
 		case CTL_MSG_UA:
 			ctl_isc_ua(softc, msg, param);
 			break;
 		case CTL_MSG_PORT_SYNC:
 			ctl_isc_port_sync(softc, msg, param);
 			break;
 		case CTL_MSG_LUN_SYNC:
 			ctl_isc_lun_sync(softc, msg, param);
 			break;
 		case CTL_MSG_IID_SYNC:
 			ctl_isc_iid_sync(softc, msg, param);
 			break;
 		case CTL_MSG_LOGIN:
 			ctl_isc_login(softc, msg, param);
 			break;
 		case CTL_MSG_MODE_SYNC:
 			ctl_isc_mode_sync(softc, msg, param);
 			break;
 		default:
 			printf("Received HA message of unknown type %d\n",
 			    msg->hdr.msg_type);
 			ctl_ha_msg_abort(CTL_HA_CHAN_CTL);
 			break;
 		}
 		/* Release the temporary buffer if one was allocated. */
 		if (msg != &msgbuf)
 			free(msg, M_CTL);
 	} else if (event == CTL_HA_EVT_LINK_CHANGE) {
 		printf("CTL: HA link status changed from %d to %d\n",
 		    softc->ha_link, param);
 		if (param == softc->ha_link)
 			return;
 		/*
 		 * Leaving the online state triggers link-down handling;
 		 * entering it triggers link-up handling.  Other transitions
 		 * only update the stored state.
 		 */
 		if (softc->ha_link == CTL_HA_LINK_ONLINE) {
 			softc->ha_link = param;
 			ctl_isc_ha_link_down(softc);
 		} else {
 			softc->ha_link = param;
 			if (softc->ha_link == CTL_HA_LINK_ONLINE)
 				ctl_isc_ha_link_up(softc);
 		}
 		return;
 	} else {
 		printf("ctl_isc_event_handler: Unknown event %d\n", event);
 		return;
 	}
 }
 
 /*
  * Copy completion status (I/O status, SCSI status, sense length and the
  * used part of the sense data) from an HA message into an I/O.
  */
 static void
 ctl_copy_sense_data(union ctl_ha_msg *src, union ctl_io *dest)
 {
 
 	dest->io_hdr.status = src->hdr.status;
 	dest->scsiio.scsi_status = src->scsi.scsi_status;
 	dest->scsiio.sense_len = src->scsi.sense_len;
 	memcpy(&dest->scsiio.sense_data, &src->scsi.sense_data,
 	    src->scsi.sense_len);
 }
 
 /*
  * Copy completion status (I/O status, SCSI status, sense length and the
  * used part of the sense data) from an I/O into an HA message.
  */
 static void
 ctl_copy_sense_data_back(union ctl_io *src, union ctl_ha_msg *dest)
 {
 
 	dest->hdr.status = src->io_hdr.status;
 	dest->scsi.scsi_status = src->scsiio.scsi_status;
 	dest->scsi.sense_len = src->scsiio.sense_len;
 	memcpy(&dest->scsi.sense_data, &src->scsiio.sense_data,
 	    src->scsiio.sense_len);
 }
 
 /*
  * Establish unit attention 'ua' for a single initiator on this LUN.
  * Initiator indexes outside [init_min, init_max) are silently ignored,
  * as are ports that have no pending-UA array.  The LUN lock must be
  * held.
  */
 void
 ctl_est_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua)
 {
 	struct ctl_softc *softc = lun->ctl_softc;
 	ctl_ua_type *port_ua;
 
 	if (initidx < softc->init_min || initidx >= softc->init_max)
 		return;
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	/* The initiator index selects a port and a slot within it. */
 	port_ua = lun->pending_ua[initidx / CTL_MAX_INIT_PER_PORT];
 	if (port_ua != NULL)
 		port_ua[initidx % CTL_MAX_INIT_PER_PORT] |= ua;
 }
 
 /*
  * Establish unit attention 'ua' for every initiator slot of one port on
  * this LUN, skipping the initiator whose global index equals 'except'.
  * Does nothing if the port has no pending-UA array.  The LUN lock must
  * be held.
  */
 void
 ctl_est_ua_port(struct ctl_lun *lun, int port, uint32_t except, ctl_ua_type ua)
 {
 	ctl_ua_type *port_ua;
 	int slot;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 	port_ua = lun->pending_ua[port];
 	if (port_ua == NULL)
 		return;
 	for (slot = 0; slot < CTL_MAX_INIT_PER_PORT; slot++) {
 		if (port * CTL_MAX_INIT_PER_PORT + slot != except)
 			port_ua[slot] |= ua;
 	}
 }
 
 /*
  * Establish unit attention 'ua' on this LUN for all initiators of every
  * port in [port_min, port_max), except the initiator index 'except'.
  * The LUN lock must be held.
  */
 void
 ctl_est_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua)
 {
 	struct ctl_softc *softc = lun->ctl_softc;
 	int port;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 	port = softc->port_min;
 	while (port < softc->port_max) {
 		ctl_est_ua_port(lun, port, except, ua);
 		port++;
 	}
 }
 
 /*
  * Clear unit attention 'ua' for a single initiator on this LUN.
  * Initiator indexes outside [init_min, init_max) are silently ignored,
  * as are ports that have no pending-UA array.  The LUN lock must be
  * held.
  */
 void
 ctl_clr_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua)
 {
 	struct ctl_softc *softc = lun->ctl_softc;
 	ctl_ua_type *port_ua;
 
 	if (initidx < softc->init_min || initidx >= softc->init_max)
 		return;
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	/* The initiator index selects a port and a slot within it. */
 	port_ua = lun->pending_ua[initidx / CTL_MAX_INIT_PER_PORT];
 	if (port_ua != NULL)
 		port_ua[initidx % CTL_MAX_INIT_PER_PORT] &= ~ua;
 }
 
 /*
  * Clear unit attention 'ua' on this LUN for all initiators of every port
  * in [port_min, port_max), except the initiator index 'except'.  Ports
  * without a pending-UA array are skipped.  The LUN lock must be held.
  */
 void
 ctl_clr_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua)
 {
 	struct ctl_softc *softc = lun->ctl_softc;
 	ctl_ua_type *port_ua;
 	int port, slot;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 	for (port = softc->port_min; port < softc->port_max; port++) {
 		port_ua = lun->pending_ua[port];
 		if (port_ua == NULL)
 			continue;
 		for (slot = 0; slot < CTL_MAX_INIT_PER_PORT; slot++) {
 			if (port * CTL_MAX_INIT_PER_PORT + slot != except)
 				port_ua[slot] &= ~ua;
 		}
 	}
 }
 
 /*
  * Clear unit attention 'ua_type' for one initiator on every LUN in the
  * softc's LUN list.  ctl_lock must be held by the caller; each LUN's own
  * lock is taken around the update.
  */
 void
 ctl_clr_ua_allluns(struct ctl_softc *ctl_softc, uint32_t initidx,
     ctl_ua_type ua_type)
 {
 	struct ctl_lun *cur;
 
 	mtx_assert(&ctl_softc->ctl_lock, MA_OWNED);
 	STAILQ_FOREACH(cur, &ctl_softc->lun_list, links) {
 		mtx_lock(&cur->lun_lock);
 		ctl_clr_ua(cur, initidx, ua_type);
 		mtx_unlock(&cur->lun_lock);
 	}
 }
 
 static int
 ctl_ha_role_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	struct ctl_softc *softc = (struct ctl_softc *)arg1;
 	struct ctl_lun *lun;
 	struct ctl_lun_req ireq;
 	int error, value;
 
 	value = (softc->flags & CTL_FLAG_ACTIVE_SHELF) ? 0 : 1;
 	error = sysctl_handle_int(oidp, &value, 0, req);
 	if ((error != 0) || (req->newptr == NULL))
 		return (error);
 
 	mtx_lock(&softc->ctl_lock);
 	if (value == 0)
 		softc->flags |= CTL_FLAG_ACTIVE_SHELF;
 	else
 		softc->flags &= ~CTL_FLAG_ACTIVE_SHELF;
 	STAILQ_FOREACH(lun, &softc->lun_list, links) {
 		mtx_unlock(&softc->ctl_lock);
 		bzero(&ireq, sizeof(ireq));
 		ireq.reqtype = CTL_LUNREQ_MODIFY;
 		ireq.reqdata.modify.lun_id = lun->lun;
 		lun->backend->ioctl(NULL, CTL_LUN_REQ, (caddr_t)&ireq, 0,
 		    curthread);
 		if (ireq.status != CTL_LUN_OK) {
 			printf("%s: CTL_LUNREQ_MODIFY returned %d '%s'\n",
 			    __func__, ireq.status, ireq.error_str);
 		}
 		mtx_lock(&softc->ctl_lock);
 	}
 	mtx_unlock(&softc->ctl_lock);
 	return (0);
 }
 
 /*
  * Module load-time initialization.
  *
  * Allocates the global softc, creates the "cam/ctl" device node and the
  * "ctl" sysctl node under kern.cam, sizes the LUN and port tables from
  * ctl_max_luns / ctl_max_ports, derives the local port and initiator
  * ranges from ha_id, and starts the worker, LUN and threshold threads.
  *
  * Returns 0 on success or an error number on failure.
  *
  * NOTE(review): the early failure paths (make_dev_s, sysctl node) free the
  * softc, but the thread-creation failure paths return the error directly
  * without releasing anything that was set up before them.
  */
 static int
 ctl_init(void)
 {
 	struct make_dev_args args;
 	struct ctl_softc *softc;
 	int i, error;
 
 	/* M_ZERO: every softc field starts out as zero. */
 	softc = control_softc = malloc(sizeof(*control_softc), M_DEVBUF,
 			       M_WAITOK | M_ZERO);
 
 	/* Create the control device node; si_drv1 carries the softc. */
 	make_dev_args_init(&args);
 	args.mda_devsw = &ctl_cdevsw;
 	args.mda_uid = UID_ROOT;
 	args.mda_gid = GID_OPERATOR;
 	args.mda_mode = 0600;
 	args.mda_si_drv1 = softc;
 	args.mda_si_drv2 = NULL;
 	error = make_dev_s(&args, &softc->dev, "cam/ctl");
 	if (error != 0) {
 		free(softc, M_DEVBUF);
 		control_softc = NULL;
 		return (error);
 	}
 
 	sysctl_ctx_init(&softc->sysctl_ctx);
 	softc->sysctl_tree = SYSCTL_ADD_NODE(&softc->sysctl_ctx,
 		SYSCTL_STATIC_CHILDREN(_kern_cam), OID_AUTO, "ctl",
 		CTLFLAG_RD, 0, "CAM Target Layer");
 
 	if (softc->sysctl_tree == NULL) {
 		printf("%s: unable to allocate sysctl tree\n", __func__);
 		destroy_dev(softc->dev);
 		free(softc, M_DEVBUF);
 		control_softc = NULL;
 		return (ENOMEM);
 	}
 
 	mtx_init(&softc->ctl_lock, "CTL mutex", NULL, MTX_DEF);
 	softc->io_zone = uma_zcreate("CTL IO", sizeof(union ctl_io),
 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
 	softc->flags = 0;
 
 	SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 	    OID_AUTO, "ha_mode", CTLFLAG_RDTUN, (int *)&softc->ha_mode, 0,
 	    "HA mode (0 - act/stby, 1 - serialize only, 2 - xfer)");
 
 	/* Fall back to the default when max_luns is not a positive power of 2. */
 	if (ctl_max_luns <= 0 || powerof2(ctl_max_luns) == 0) {
 		printf("Bad value %d for kern.cam.ctl.max_luns, must be a power of two, using %d\n",
 		    ctl_max_luns, CTL_DEFAULT_MAX_LUNS);
 		ctl_max_luns = CTL_DEFAULT_MAX_LUNS;
 	}
 	/* LUN pointer table plus a bitmask with one bit per LUN. */
 	softc->ctl_luns = malloc(sizeof(struct ctl_lun *) * ctl_max_luns,
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 	softc->ctl_lun_mask = malloc(sizeof(uint32_t) *
 	    ((ctl_max_luns + 31) / 32), M_DEVBUF, M_WAITOK | M_ZERO);
 	/* Same validation and fallback for max_ports. */
 	if (ctl_max_ports <= 0 || powerof2(ctl_max_ports) == 0) {
 		printf("Bad value %d for kern.cam.ctl.max_ports, must be a power of two, using %d\n",
 		    ctl_max_ports, CTL_DEFAULT_MAX_PORTS);
 		ctl_max_ports = CTL_DEFAULT_MAX_PORTS;
 	}
 	/* Port bitmask (one bit per port) and port pointer table. */
 	softc->ctl_port_mask = malloc(sizeof(uint32_t) *
 	  ((ctl_max_ports + 31) / 32), M_DEVBUF, M_WAITOK | M_ZERO);
 	softc->ctl_ports = malloc(sizeof(struct ctl_port *) * ctl_max_ports,
 	     M_DEVBUF, M_WAITOK | M_ZERO);
 
 
 	/*
 	 * In Copan's HA scheme, the "master" and "slave" roles are
 	 * figured out through the slot the controller is in.  Although it
 	 * is an active/active system, someone has to be in charge.
 	 */
 	SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 	    OID_AUTO, "ha_id", CTLFLAG_RDTUN, &softc->ha_id, 0,
 	    "HA head ID (0 - no HA)");
 	if (softc->ha_id == 0 || softc->ha_id > NUM_HA_SHELVES) {
 		/* No (or out-of-range) HA id: run single, owning every port. */
 		softc->flags |= CTL_FLAG_ACTIVE_SHELF;
 		softc->is_single = 1;
 		softc->port_cnt = ctl_max_ports;
 		softc->port_min = 0;
 	} else {
 		/* Each head gets an equal, ha_id-indexed slice of the ports. */
 		softc->port_cnt = ctl_max_ports / NUM_HA_SHELVES;
 		softc->port_min = (softc->ha_id - 1) * softc->port_cnt;
 	}
 	/* Local ranges are half-open: [port_min, port_max), [init_min, init_max). */
 	softc->port_max = softc->port_min + softc->port_cnt;
 	softc->init_min = softc->port_min * CTL_MAX_INIT_PER_PORT;
 	softc->init_max = softc->port_max * CTL_MAX_INIT_PER_PORT;
 
 	SYSCTL_ADD_INT(&softc->sysctl_ctx, SYSCTL_CHILDREN(softc->sysctl_tree),
 	    OID_AUTO, "ha_link", CTLFLAG_RD, (int *)&softc->ha_link, 0,
 	    "HA link state (0 - offline, 1 - unknown, 2 - online)");
 
 	STAILQ_INIT(&softc->lun_list);
 	STAILQ_INIT(&softc->pending_lun_queue);
 	STAILQ_INIT(&softc->fe_list);
 	STAILQ_INIT(&softc->port_list);
 	STAILQ_INIT(&softc->be_list);
 	ctl_tpc_init(softc);
 
 	/* Default to one worker per four CPUs (at least one), capped. */
 	if (worker_threads <= 0)
 		worker_threads = max(1, mp_ncpus / 4);
 	if (worker_threads > CTL_MAX_THREADS)
 		worker_threads = CTL_MAX_THREADS;
 
 	for (i = 0; i < worker_threads; i++) {
 		struct ctl_thread *thr = &softc->threads[i];
 
 		mtx_init(&thr->queue_lock, "CTL queue mutex", NULL, MTX_DEF);
 		thr->ctl_softc = softc;
 		STAILQ_INIT(&thr->incoming_queue);
 		STAILQ_INIT(&thr->rtr_queue);
 		STAILQ_INIT(&thr->done_queue);
 		STAILQ_INIT(&thr->isc_queue);
 
 		error = kproc_kthread_add(ctl_work_thread, thr,
 		    &softc->ctl_proc, &thr->thread, 0, 0, "ctl", "work%d", i);
 		if (error != 0) {
 			/* NOTE(review): returns without undoing earlier setup. */
 			printf("error creating CTL work thread!\n");
 			return (error);
 		}
 	}
 	error = kproc_kthread_add(ctl_lun_thread, softc,
 	    &softc->ctl_proc, &softc->lun_thread, 0, 0, "ctl", "lun");
 	if (error != 0) {
 		/* NOTE(review): returns without undoing earlier setup. */
 		printf("error creating CTL lun thread!\n");
 		return (error);
 	}
 	error = kproc_kthread_add(ctl_thresh_thread, softc,
 	    &softc->ctl_proc, &softc->thresh_thread, 0, 0, "ctl", "thresh");
 	if (error != 0) {
 		/* NOTE(review): returns without undoing earlier setup. */
 		printf("error creating CTL threshold thread!\n");
 		return (error);
 	}
 
 	SYSCTL_ADD_PROC(&softc->sysctl_ctx,SYSCTL_CHILDREN(softc->sysctl_tree),
 	    OID_AUTO, "ha_role", CTLTYPE_INT | CTLFLAG_RWTUN,
 	    softc, 0, ctl_ha_role_sysctl, "I", "HA role for this head");
 
 	/* In HA mode, fall back to single operation if the HA frontend fails. */
 	if (softc->is_single == 0) {
 		if (ctl_frontend_register(&ha_frontend) != 0)
 			softc->is_single = 1;
 	}
 	return (0);
 }
 
 /*
  * Module unload-time teardown.
  *
  * Deregisters the HA frontend when not running single, destroys the
  * device node, sets softc->shutdown and then, for each CTL thread, keeps
  * waking it and pausing until its thread pointer reads NULL.  After that
  * the remaining resources and the softc itself are released.
  *
  * Always returns 0.
  */
 static int
 ctl_shutdown(void)
 {
 	struct ctl_softc *softc = control_softc;
 	struct ctl_thread *worker;
 	int n;
 
 	if (softc->is_single == 0)
 		ctl_frontend_deregister(&ha_frontend);
 
 	destroy_dev(softc->dev);
 
 	/* Ask every CTL thread to exit and wait for each one in turn. */
 	softc->shutdown = 1;
 	for (n = 0; n < worker_threads; n++) {
 		worker = &softc->threads[n];
 		while (worker->thread != NULL) {
 			wakeup(worker);
 			if (worker->thread != NULL)
 				pause("CTL thr shutdown", 1);
 		}
 		mtx_destroy(&worker->queue_lock);
 	}
 	while (softc->lun_thread != NULL) {
 		wakeup(&softc->pending_lun_queue);
 		if (softc->lun_thread != NULL)
 			pause("CTL thr shutdown", 1);
 	}
 	while (softc->thresh_thread != NULL) {
 		wakeup(softc->thresh_thread);
 		if (softc->thresh_thread != NULL)
 			pause("CTL thr shutdown", 1);
 	}
 
 	/* Threads are gone; release what ctl_init() set up. */
 	ctl_tpc_shutdown(softc);
 	uma_zdestroy(softc->io_zone);
 	mtx_destroy(&softc->ctl_lock);
 
 	free(softc->ctl_luns, M_DEVBUF);
 	free(softc->ctl_lun_mask, M_DEVBUF);
 	free(softc->ctl_port_mask, M_DEVBUF);
 	free(softc->ctl_ports, M_DEVBUF);
 
 	sysctl_ctx_free(&softc->sysctl_ctx);
 
 	free(softc, M_DEVBUF);
 	control_softc = NULL;
 	return (0);
 }
 
 /*
  * Module event dispatcher: MOD_LOAD runs ctl_init(), MOD_UNLOAD runs
  * ctl_shutdown(), and every other event is rejected with EOPNOTSUPP.
  */
 static int
 ctl_module_event_handler(module_t mod, int what, void *arg)
 {
 
 	if (what == MOD_LOAD)
 		return (ctl_init());
 	if (what == MOD_UNLOAD)
 		return (ctl_shutdown());
 	return (EOPNOTSUPP);
 }
 
 /*
  * Open handler.  It ignores all of its arguments, performs no checks and
  * always returns 0.
  *
  * XXX KDM should we do some access checks here?  Bump a reference count to
  * prevent a CTL module from being unloaded while someone has it open?
  */
 static int
 ctl_open(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	return (0);
 }
 
 /*
  * Close handler.  It ignores all of its arguments, has nothing to release
  * and always returns 0.
  */
 static int
 ctl_close(struct cdev *dev, int flags, int fmt, struct thread *td)
 {
 	return (0);
 }
 
 /*
  * Remove an initiator by port number and initiator ID.
  *
  * Drops one use of the port's wwpn_iid[iid] slot and records the time of
  * the drop.  When the use count reaches zero, an I_T nexus loss is
  * signalled for the initiator.  The (possibly changed) slot is then
  * announced via ctl_isc_announce_iid().
  *
  * "iid" must be a valid index into port->wwpn_iid[], i.e. in the range
  * [0, CTL_MAX_INIT_PER_PORT); anything else is rejected before the array
  * is touched.
  *
  * Must be called without the CTL lock held.
  * Returns 0 for success, -1 for failure.
  */
 int
 ctl_remove_initiator(struct ctl_port *port, int iid)
 {
 	struct ctl_softc *softc = port->ctl_softc;
 	int last;
 
 	mtx_assert(&softc->ctl_lock, MA_NOTOWNED);
 
 	/*
 	 * Reject both negative IDs and IDs at or above the table size;
 	 * either would index outside wwpn_iid[] below.
 	 */
 	if (iid < 0 || iid >= CTL_MAX_INIT_PER_PORT) {
 		printf("%s: initiator ID %u > maximun %u!\n",
 		       __func__, iid, CTL_MAX_INIT_PER_PORT);
 		return (-1);
 	}
 
 	mtx_lock(&softc->ctl_lock);
 	last = (--port->wwpn_iid[iid].in_use == 0);
 	port->wwpn_iid[iid].last_use = time_uptime;
 	mtx_unlock(&softc->ctl_lock);
 	/*
 	 * NOTE(review): iid is the per-port index here; confirm that
 	 * ctl_i_t_nexus_loss() expects that rather than a global
 	 * initiator index.
 	 */
 	if (last)
 		ctl_i_t_nexus_loss(softc, iid, CTL_UA_POWERON);
 	ctl_isc_announce_iid(port, iid);
 
 	return (0);
 }
 
 /*
  * Add an initiator to the initiator map.
  *
  * When "iid" is negative a slot is chosen, in this order:
  *   1. a slot whose WWPN or name already matches the one supplied;
  *   2. the first slot that is completely unused (not in use, no WWPN and
  *      no name);
  *   3. the slot that is not in use and has the smallest last_use time.
  *
  * Ownership of "name" passes to this function: it is stored in the chosen
  * slot on success (replacing and freeing the slot's previous name) and
  * freed on failure.
  *
  * Must be called without the CTL lock held.
  * Returns iid for success, < 0 for failure: -1 when the supplied iid is
  * too large, -2 when no slot is available.
  */
 int
 ctl_add_initiator(struct ctl_port *port, int iid, uint64_t wwpn, char *name)
 {
 	struct ctl_softc *softc = port->ctl_softc;
 	time_t best_time;
 	int i, best;
 
 	mtx_assert(&softc->ctl_lock, MA_NOTOWNED);
 
 	if (iid >= CTL_MAX_INIT_PER_PORT) {
 		/* %jx takes a uintmax_t, so the WWPN is cast explicitly. */
 		printf("%s: WWPN %#jx initiator ID %u > maximum %u!\n",
 		       __func__, (uintmax_t)wwpn, iid, CTL_MAX_INIT_PER_PORT);
 		free(name, M_CTL);
 		return (-1);
 	}
 
 	mtx_lock(&softc->ctl_lock);
 
 	/* Pass 1: look for a slot already holding this WWPN or name. */
 	if (iid < 0 && (wwpn != 0 || name != NULL)) {
 		for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) {
 			if (wwpn != 0 && wwpn == port->wwpn_iid[i].wwpn) {
 				iid = i;
 				break;
 			}
 			if (name != NULL && port->wwpn_iid[i].name != NULL &&
 			    strcmp(name, port->wwpn_iid[i].name) == 0) {
 				iid = i;
 				break;
 			}
 		}
 	}
 
 	/* Pass 2: take the first slot that has never been populated. */
 	if (iid < 0) {
 		for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) {
 			if (port->wwpn_iid[i].in_use == 0 &&
 			    port->wwpn_iid[i].wwpn == 0 &&
 			    port->wwpn_iid[i].name == NULL) {
 				iid = i;
 				break;
 			}
 		}
 	}
 
 	/* Pass 3: recycle the idle slot with the oldest last_use time. */
 	if (iid < 0) {
 		best = -1;
 		best_time = INT32_MAX;
 		for (i = 0; i < CTL_MAX_INIT_PER_PORT; i++) {
 			if (port->wwpn_iid[i].in_use == 0) {
 				if (port->wwpn_iid[i].last_use < best_time) {
 					best = i;
 					best_time = port->wwpn_iid[i].last_use;
 				}
 			}
 		}
 		iid = best;
 	}
 
 	/* Every slot is in use. */
 	if (iid < 0) {
 		mtx_unlock(&softc->ctl_lock);
 		free(name, M_CTL);
 		return (-2);
 	}
 
 	if (port->wwpn_iid[iid].in_use > 0 && (wwpn != 0 || name != NULL)) {
 		/*
 		 * This is not an error yet.
 		 */
 		if (wwpn != 0 && wwpn == port->wwpn_iid[iid].wwpn) {
 #if 0
 			printf("%s: port %d iid %u WWPN %#jx arrived"
 			    " again\n", __func__, port->targ_port,
 			    iid, (uintmax_t)wwpn);
 #endif
 			goto take;
 		}
 		if (name != NULL && port->wwpn_iid[iid].name != NULL &&
 		    strcmp(name, port->wwpn_iid[iid].name) == 0) {
 #if 0
 			printf("%s: port %d iid %u name '%s' arrived"
 			    " again\n", __func__, port->targ_port,
 			    iid, name);
 #endif
 			goto take;
 		}
 
 		/*
 		 * This is an error, but what do we do about it?  The
 		 * driver is telling us we have a new WWPN for this
 		 * initiator ID, so we pretty much need to use it.
 		 */
 		printf("%s: port %d iid %u WWPN %#jx '%s' arrived,"
 		    " but WWPN %#jx '%s' is still at that address\n",
 		    __func__, port->targ_port, iid, (uintmax_t)wwpn, name,
 		    (uintmax_t)port->wwpn_iid[iid].wwpn,
 		    port->wwpn_iid[iid].name);
 	}
 take:
 	/* Install the new identity and count one more user of the slot. */
 	free(port->wwpn_iid[iid].name, M_CTL);
 	port->wwpn_iid[iid].name = name;
 	port->wwpn_iid[iid].wwpn = wwpn;
 	port->wwpn_iid[iid].in_use++;
 	mtx_unlock(&softc->ctl_lock);
 	ctl_isc_announce_iid(port, iid);
 
 	return (iid);
 }
 
 /*
  * Build a SCSI TransportID for initiator "iid" of "port" into "buf", in
  * the format selected by the port type (FC, iSCSI, SAS, or SPI for
  * anything else).
  *
  * Returns the number of bytes the TransportID occupies, or 0 when the
  * slot lacks the identity the format needs (no WWPN for FC/SAS, no name
  * for iSCSI).  The iSCSI case clears 256 bytes of "buf", so the buffer
  * must be at least that large.
  */
 static int
 ctl_create_iid(struct ctl_port *port, int iid, uint8_t *buf)
 {
 	int len;
 
 	switch (port->port_type) {
 	case CTL_PORT_FC:
 	{
 		struct scsi_transportid_fcp *fcp;
 
 		if (port->wwpn_iid[iid].wwpn == 0)
 			return (0);
 		fcp = (struct scsi_transportid_fcp *)buf;
 		memset(fcp, 0, sizeof(*fcp));
 		fcp->format_protocol = SCSI_PROTO_FC;
 		scsi_u64to8b(port->wwpn_iid[iid].wwpn, fcp->n_port_name);
 		return (sizeof(*fcp));
 	}
 	case CTL_PORT_ISCSI:
 	{
 		struct scsi_transportid_iscsi_port *iscsi;
 
 		if (port->wwpn_iid[iid].name == NULL)
 			return (0);
 		iscsi = (struct scsi_transportid_iscsi_port *)buf;
 		memset(iscsi, 0, 256);
 		iscsi->format_protocol = SCSI_TRN_ISCSI_FORMAT_PORT |
 		    SCSI_PROTO_ISCSI;
 		/* Name length including its NUL, capped and padded to 4. */
 		len = strlcpy(iscsi->iscsi_name, port->wwpn_iid[iid].name,
 		    252) + 1;
 		len = roundup2(min(len, 252), 4);
 		scsi_ulto2b(len, iscsi->additional_length);
 		return (sizeof(*iscsi) + len);
 	}
 	case CTL_PORT_SAS:
 	{
 		struct scsi_transportid_sas *sas;
 
 		if (port->wwpn_iid[iid].wwpn == 0)
 			return (0);
 		sas = (struct scsi_transportid_sas *)buf;
 		memset(sas, 0, sizeof(*sas));
 		sas->format_protocol = SCSI_PROTO_SAS;
 		scsi_u64to8b(port->wwpn_iid[iid].wwpn, sas->sas_address);
 		return (sizeof(*sas));
 	}
 	default:
 	{
 		struct scsi_transportid_spi *spi;
 
 		spi = (struct scsi_transportid_spi *)buf;
 		memset(spi, 0, sizeof(*spi));
 		spi->format_protocol = SCSI_PROTO_SPI;
 		scsi_ulto2b(iid, spi->scsi_addr);
 		scsi_ulto2b(port->targ_port, spi->rel_trgt_port_id);
 		return (sizeof(*spi));
 	}
 	}
 }
 
 /*
  * Serialize a command that went down the "wrong" side, and so was sent to
  * this controller for execution.  The logic is a little different than the
  * standard case in ctl_scsiio_precheck().  Errors in this case need to get
  * sent back to the other side, but in the success case, we execute the
  * command on this side (XFER mode) or tell the other side to execute it
  * (SER_ONLY mode).
  *
  * Every failure funnels into the "badjuju" label, which lives inside the
  * default arm of the switch below: it copies the sense data into a
  * CTL_MSG_BAD_JUJU message, sends it to the peer and frees the I/O.
  */
 static void
 ctl_serialize_other_sc_cmd(struct ctl_scsiio *ctsio)
 {
 	struct ctl_softc *softc = CTL_SOFTC(ctsio);
 	struct ctl_port *port = CTL_PORT(ctsio);
 	union ctl_ha_msg msg_info;
 	struct ctl_lun *lun;
 	const struct ctl_cmd_entry *entry;
 	union ctl_io *bio;
 	uint32_t targ_lun;
 
 	targ_lun = ctsio->io_hdr.nexus.targ_mapped_lun;
 
 	/* Make sure that we know about this port. */
 	if (port == NULL || (port->status & CTL_PORT_STATUS_ONLINE) == 0) {
 		ctl_set_internal_failure(ctsio, /*sks_valid*/ 0,
 					 /*retry_count*/ 1);
 		goto badjuju;
 	}
 
 	/* Make sure that we know about this LUN. */
 	mtx_lock(&softc->ctl_lock);
 	if (targ_lun >= ctl_max_luns ||
 	    (lun = softc->ctl_luns[targ_lun]) == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 
 		/*
 		 * The other node would not send this request to us unless
 		 * received announce that we are primary node for this LUN.
 		 * If this LUN does not exist now, it is probably result of
 		 * a race, so respond to initiator in the most opaque way.
 		 */
 		ctl_set_busy(ctsio);
 		goto badjuju;
 	}
 	/* Take the LUN lock before dropping the CTL lock. */
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&softc->ctl_lock);
 
 	/*
 	 * If the LUN is invalid, pretend that it doesn't exist.
 	 * It will go away as soon as all pending I/Os completed.
 	 */
 	if (lun->flags & CTL_LUN_DISABLED) {
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_busy(ctsio);
 		goto badjuju;
 	}
 
 	entry = ctl_get_cmd_entry(ctsio, NULL);
 	if (ctl_scsiio_lun_check(lun, entry, ctsio) != 0) {
 		mtx_unlock(&lun->lun_lock);
 		goto badjuju;
 	}
 
 	CTL_LUN(ctsio) = lun;
 	CTL_BACKEND_LUN(ctsio) = lun->be_lun;
 
 	/*
 	 * Every I/O goes into the OOA queue for a
 	 * particular LUN, and stays there until completion.
 	 */
 #ifdef CTL_TIME_IO
 	if (TAILQ_EMPTY(&lun->ooa_queue))
 		lun->idle_time += getsbinuptime() - lun->last_busy;
 #endif
 	TAILQ_INSERT_TAIL(&lun->ooa_queue, &ctsio->io_hdr, ooa_links);
 
 	/* Start the OOA check from the entry queued just before this one. */
 	bio = (union ctl_io *)TAILQ_PREV(&ctsio->io_hdr, ctl_ooaq, ooa_links);
 	switch (ctl_check_ooa(lun, (union ctl_io *)ctsio, &bio)) {
 	case CTL_ACTION_BLOCK:
 		/* Record the blocker and park this I/O on its blocked queue. */
 		ctsio->io_hdr.blocker = bio;
 		TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctsio->io_hdr,
 				  blocked_links);
 		mtx_unlock(&lun->lun_lock);
 		break;
 	case CTL_ACTION_PASS:
 	case CTL_ACTION_SKIP:
 		if (softc->ha_mode == CTL_HA_MODE_XFER) {
 			/* XFER mode: queue the command for execution here. */
 			ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
 			ctl_enqueue_rtr((union ctl_io *)ctsio);
 			mtx_unlock(&lun->lun_lock);
 		} else {
 			ctsio->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE;
 			mtx_unlock(&lun->lun_lock);
 
 			/* send msg back to other side */
 			msg_info.hdr.original_sc = ctsio->io_hdr.remote_io;
 			msg_info.hdr.serializing_sc = (union ctl_io *)ctsio;
 			msg_info.hdr.msg_type = CTL_MSG_R2R;
 			ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 			    sizeof(msg_info.hdr), M_WAITOK);
 		}
 		break;
 	case CTL_ACTION_OVERLAP:
 		/* Rejected: take the I/O back off the OOA queue first. */
 		TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links);
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_overlapped_cmd(ctsio);
 		goto badjuju;
 	case CTL_ACTION_OVERLAP_TAG:
 		TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links);
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_overlapped_tag(ctsio, ctsio->tag_num);
 		goto badjuju;
 	case CTL_ACTION_ERROR:
 	default:
 		TAILQ_REMOVE(&lun->ooa_queue, &ctsio->io_hdr, ooa_links);
 		mtx_unlock(&lun->lun_lock);
 
 		ctl_set_internal_failure(ctsio, /*sks_valid*/ 0,
 					 /*retry_count*/ 0);
 		/*
 		 * Common failure exit, reached by falling through or by
 		 * goto with no lock held: report the status already set on
 		 * ctsio to the other side and release the I/O.
 		 */
 badjuju:
 		ctl_copy_sense_data_back((union ctl_io *)ctsio, &msg_info);
 		msg_info.hdr.original_sc = ctsio->io_hdr.remote_io;
 		msg_info.hdr.serializing_sc = NULL;
 		msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU;
 		ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 		    sizeof(msg_info.scsi), M_WAITOK);
 		ctl_free_io((union ctl_io *)ctsio);
 		break;
 	}
 }
 
 /*
  * Walk one LUN's OOA queue under the LUN lock and describe each queued
  * I/O in kern_entries[], starting at index *cur_fill_num.
  *
  * *cur_fill_num is advanced once per queued I/O whether or not it was
  * stored, so on return it may exceed ooa_hdr->alloc_num; I/Os beyond the
  * allocated number of entries are only counted.  Flag bits are OR-ed into
  * each entry's cmd_flags.  Nothing is returned.
  */
 static void
 ctl_ioctl_fill_ooa(struct ctl_lun *lun, uint32_t *cur_fill_num,
 		   struct ctl_ooa *ooa_hdr, struct ctl_ooa_entry *kern_entries)
 {
 	struct ctl_ooa_entry *slot;
 	union ctl_io *cmd;
 
 	mtx_lock(&lun->lun_lock);
 	cmd = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue);
 	while (cmd != NULL) {
 		/* Only fill an entry while there is still room for one. */
 		if (*cur_fill_num < ooa_hdr->alloc_num) {
 			slot = &kern_entries[*cur_fill_num];
 
 			slot->tag_num = cmd->scsiio.tag_num;
 			slot->lun_num = lun->lun;
 #ifdef CTL_TIME_IO
 			slot->start_bt = cmd->io_hdr.start_bt;
 #endif
 			bcopy(cmd->scsiio.cdb, slot->cdb, cmd->scsiio.cdb_len);
 			slot->cdb_len = cmd->scsiio.cdb_len;
 
 			/* Translate the I/O's state into OOA command flags. */
 			if (cmd->io_hdr.blocker != NULL)
 				slot->cmd_flags |= CTL_OOACMD_FLAG_BLOCKED;
 			if (cmd->io_hdr.flags & CTL_FLAG_DMA_INPROG)
 				slot->cmd_flags |= CTL_OOACMD_FLAG_DMA;
 			if (cmd->io_hdr.flags & CTL_FLAG_ABORT)
 				slot->cmd_flags |= CTL_OOACMD_FLAG_ABORT;
 			if (cmd->io_hdr.flags & CTL_FLAG_IS_WAS_ON_RTR)
 				slot->cmd_flags |= CTL_OOACMD_FLAG_RTR;
 			if (cmd->io_hdr.flags & CTL_FLAG_DMA_QUEUED)
 				slot->cmd_flags |= CTL_OOACMD_FLAG_DMA_QUEUED;
 		}
 		(*cur_fill_num)++;
 		cmd = (union ctl_io *)TAILQ_NEXT(&cmd->io_hdr, ooa_links);
 	}
 	mtx_unlock(&lun->lun_lock);
 }
 
 /*
  * Escape characters that are illegal or not recommended in XML.
  *
  * Appends "str" to "sb", replacing '&', '>' and '<' with "&amp;", "&gt;"
  * and "&lt;".  At most "size" bytes of "str" are examined, and copying
  * stops early at the first NUL.
  *
  * Returns 0 on success, or the first non-zero value returned by the
  * sbuf routines, at which point copying stops.
  */
 int
 ctl_sbuf_printf_esc(struct sbuf *sb, char *str, int size)
 {
 	char *end = str + size;
 	int retval;
 
 	retval = 0;
 
 	/*
 	 * Check the bound before dereferencing so that a buffer with no
 	 * NUL in its first "size" bytes is never read past its end.
 	 */
 	for (; str < end && *str != '\0'; str++) {
 		switch (*str) {
 		case '&':
 			retval = sbuf_printf(sb, "&amp;");
 			break;
 		case '>':
 			retval = sbuf_printf(sb, "&gt;");
 			break;
 		case '<':
 			retval = sbuf_printf(sb, "&lt;");
 			break;
 		default:
 			retval = sbuf_putc(sb, *str);
 			break;
 		}
 
 		if (retval != 0)
 			break;
 
 	}
 
 	return (retval);
 }
 
 /*
  * Append a textual form of a device ID to "sb".
  *
  * The ID data is read as a single VPD identification descriptor.  A
  * "t10.", "eui." or "naa." prefix is written for those designator types
  * (SCSI name strings get no prefix), followed by the identifier: hex
  * digits for the binary code set, text for the ASCII and UTF-8 code sets.
  * Nothing is written when "id" is NULL or shorter than four bytes.
  *
  * All three code sets are bounded by the descriptor's length field, so a
  * text identifier that is not NUL-terminated is not read past its end.
  */
 static void
 ctl_id_sbuf(struct ctl_devid *id, struct sbuf *sb)
 {
 	struct scsi_vpd_id_descriptor *desc;
 	int i;
 
 	if (id == NULL || id->len < 4)
 		return;
 	desc = (struct scsi_vpd_id_descriptor *)id->data;
 	switch (desc->id_type & SVPD_ID_TYPE_MASK) {
 	case SVPD_ID_TYPE_T10:
 		sbuf_printf(sb, "t10.");
 		break;
 	case SVPD_ID_TYPE_EUI64:
 		sbuf_printf(sb, "eui.");
 		break;
 	case SVPD_ID_TYPE_NAA:
 		sbuf_printf(sb, "naa.");
 		break;
 	case SVPD_ID_TYPE_SCSI_NAME:
 		break;
 	}
 	switch (desc->proto_codeset & SVPD_ID_CODESET_MASK) {
 	case SVPD_ID_CODESET_BINARY:
 		for (i = 0; i < desc->length; i++)
 			sbuf_printf(sb, "%02x", desc->identifier[i]);
 		break;
 	case SVPD_ID_CODESET_ASCII:
 		sbuf_printf(sb, "%.*s", (int)desc->length,
 		    (char *)desc->identifier);
 		break;
 	case SVPD_ID_CODESET_UTF8:
 		/*
 		 * Printing still stops at the first NUL, so a properly
 		 * terminated (and NUL-padded) name prints as before; the
 		 * precision only caps the read at the descriptor length.
 		 */
 		sbuf_printf(sb, "%.*s", (int)desc->length,
 		    (char *)desc->identifier);
 		break;
 	}
 }
 
 static int
 ctl_ioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
 	  struct thread *td)
 {
 	struct ctl_softc *softc = dev->si_drv1;
 	struct ctl_port *port;
 	struct ctl_lun *lun;
 	int retval;
 
 	retval = 0;
 
 	switch (cmd) {
 	case CTL_IO:
 		retval = ctl_ioctl_io(dev, cmd, addr, flag, td);
 		break;
 	case CTL_ENABLE_PORT:
 	case CTL_DISABLE_PORT:
 	case CTL_SET_PORT_WWNS: {
 		struct ctl_port *port;
 		struct ctl_port_entry *entry;
 
 		entry = (struct ctl_port_entry *)addr;
 		
 		mtx_lock(&softc->ctl_lock);
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			int action, done;
 
 			if (port->targ_port < softc->port_min ||
 			    port->targ_port >= softc->port_max)
 				continue;
 
 			action = 0;
 			done = 0;
 			if ((entry->port_type == CTL_PORT_NONE)
 			 && (entry->targ_port == port->targ_port)) {
 				/*
 				 * If the user only wants to enable or
 				 * disable or set WWNs on a specific port,
 				 * do the operation and we're done.
 				 */
 				action = 1;
 				done = 1;
 			} else if (entry->port_type & port->port_type) {
 				/*
 				 * Compare the user's type mask with the
 				 * particular frontend type to see if we
 				 * have a match.
 				 */
 				action = 1;
 				done = 0;
 
 				/*
 				 * Make sure the user isn't trying to set
 				 * WWNs on multiple ports at the same time.
 				 */
 				if (cmd == CTL_SET_PORT_WWNS) {
 					printf("%s: Can't set WWNs on "
 					       "multiple ports\n", __func__);
 					retval = EINVAL;
 					break;
 				}
 			}
 			if (action == 0)
 				continue;
 
 			/*
 			 * XXX KDM we have to drop the lock here, because
 			 * the online/offline operations can potentially
 			 * block.  We need to reference count the frontends
 			 * so they can't go away,
 			 */
 			if (cmd == CTL_ENABLE_PORT) {
 				mtx_unlock(&softc->ctl_lock);
 				ctl_port_online(port);
 				mtx_lock(&softc->ctl_lock);
 			} else if (cmd == CTL_DISABLE_PORT) {
 				mtx_unlock(&softc->ctl_lock);
 				ctl_port_offline(port);
 				mtx_lock(&softc->ctl_lock);
 			} else if (cmd == CTL_SET_PORT_WWNS) {
 				ctl_port_set_wwns(port,
 				    (entry->flags & CTL_PORT_WWNN_VALID) ?
 				    1 : 0, entry->wwnn,
 				    (entry->flags & CTL_PORT_WWPN_VALID) ?
 				    1 : 0, entry->wwpn);
 			}
 			if (done != 0)
 				break;
 		}
 		mtx_unlock(&softc->ctl_lock);
 		break;
 	}
 	case CTL_GET_OOA: {
 		struct ctl_ooa *ooa_hdr;
 		struct ctl_ooa_entry *entries;
 		uint32_t cur_fill_num;
 
 		ooa_hdr = (struct ctl_ooa *)addr;
 
 		if ((ooa_hdr->alloc_len == 0)
 		 || (ooa_hdr->alloc_num == 0)) {
 			printf("%s: CTL_GET_OOA: alloc len %u and alloc num %u "
 			       "must be non-zero\n", __func__,
 			       ooa_hdr->alloc_len, ooa_hdr->alloc_num);
 			retval = EINVAL;
 			break;
 		}
 
 		if (ooa_hdr->alloc_len != (ooa_hdr->alloc_num *
 		    sizeof(struct ctl_ooa_entry))) {
 			printf("%s: CTL_GET_OOA: alloc len %u must be alloc "
 			       "num %d * sizeof(struct ctl_ooa_entry) %zd\n",
 			       __func__, ooa_hdr->alloc_len,
 			       ooa_hdr->alloc_num,sizeof(struct ctl_ooa_entry));
 			retval = EINVAL;
 			break;
 		}
 
 		entries = malloc(ooa_hdr->alloc_len, M_CTL, M_WAITOK | M_ZERO);
 		if (entries == NULL) {
 			printf("%s: could not allocate %d bytes for OOA "
 			       "dump\n", __func__, ooa_hdr->alloc_len);
 			retval = ENOMEM;
 			break;
 		}
 
 		mtx_lock(&softc->ctl_lock);
 		if ((ooa_hdr->flags & CTL_OOA_FLAG_ALL_LUNS) == 0 &&
 		    (ooa_hdr->lun_num >= ctl_max_luns ||
 		     softc->ctl_luns[ooa_hdr->lun_num] == NULL)) {
 			mtx_unlock(&softc->ctl_lock);
 			free(entries, M_CTL);
 			printf("%s: CTL_GET_OOA: invalid LUN %ju\n",
 			       __func__, (uintmax_t)ooa_hdr->lun_num);
 			retval = EINVAL;
 			break;
 		}
 
 		cur_fill_num = 0;
 
 		if (ooa_hdr->flags & CTL_OOA_FLAG_ALL_LUNS) {
 			STAILQ_FOREACH(lun, &softc->lun_list, links) {
 				ctl_ioctl_fill_ooa(lun, &cur_fill_num,
 				    ooa_hdr, entries);
 			}
 		} else {
 			lun = softc->ctl_luns[ooa_hdr->lun_num];
 			ctl_ioctl_fill_ooa(lun, &cur_fill_num, ooa_hdr,
 			    entries);
 		}
 		mtx_unlock(&softc->ctl_lock);
 
 		ooa_hdr->fill_num = min(cur_fill_num, ooa_hdr->alloc_num);
 		ooa_hdr->fill_len = ooa_hdr->fill_num *
 			sizeof(struct ctl_ooa_entry);
 		retval = copyout(entries, ooa_hdr->entries, ooa_hdr->fill_len);
 		if (retval != 0) {
 			printf("%s: error copying out %d bytes for OOA dump\n", 
 			       __func__, ooa_hdr->fill_len);
 		}
 
 		getbinuptime(&ooa_hdr->cur_bt);
 
 		if (cur_fill_num > ooa_hdr->alloc_num) {
 			ooa_hdr->dropped_num = cur_fill_num -ooa_hdr->alloc_num;
 			ooa_hdr->status = CTL_OOA_NEED_MORE_SPACE;
 		} else {
 			ooa_hdr->dropped_num = 0;
 			ooa_hdr->status = CTL_OOA_OK;
 		}
 
 		free(entries, M_CTL);
 		break;
 	}
 	case CTL_DELAY_IO: {
 		struct ctl_io_delay_info *delay_info;
 
 		delay_info = (struct ctl_io_delay_info *)addr;
 
 #ifdef CTL_IO_DELAY
 		mtx_lock(&softc->ctl_lock);
 		if (delay_info->lun_id >= ctl_max_luns ||
 		    (lun = softc->ctl_luns[delay_info->lun_id]) == NULL) {
 			mtx_unlock(&softc->ctl_lock);
 			delay_info->status = CTL_DELAY_STATUS_INVALID_LUN;
 			break;
 		}
 		mtx_lock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 		delay_info->status = CTL_DELAY_STATUS_OK;
 		switch (delay_info->delay_type) {
 		case CTL_DELAY_TYPE_CONT:
 		case CTL_DELAY_TYPE_ONESHOT:
 			break;
 		default:
 			delay_info->status = CTL_DELAY_STATUS_INVALID_TYPE;
 			break;
 		}
 		switch (delay_info->delay_loc) {
 		case CTL_DELAY_LOC_DATAMOVE:
 			lun->delay_info.datamove_type = delay_info->delay_type;
 			lun->delay_info.datamove_delay = delay_info->delay_secs;
 			break;
 		case CTL_DELAY_LOC_DONE:
 			lun->delay_info.done_type = delay_info->delay_type;
 			lun->delay_info.done_delay = delay_info->delay_secs;
 			break;
 		default:
 			delay_info->status = CTL_DELAY_STATUS_INVALID_LOC;
 			break;
 		}
 		mtx_unlock(&lun->lun_lock);
 #else
 		delay_info->status = CTL_DELAY_STATUS_NOT_IMPLEMENTED;
 #endif /* CTL_IO_DELAY */
 		break;
 	}
 	case CTL_ERROR_INJECT: {
 		struct ctl_error_desc *err_desc, *new_err_desc;
 
 		err_desc = (struct ctl_error_desc *)addr;
 
 		new_err_desc = malloc(sizeof(*new_err_desc), M_CTL,
 				      M_WAITOK | M_ZERO);
 		bcopy(err_desc, new_err_desc, sizeof(*new_err_desc));
 
 		mtx_lock(&softc->ctl_lock);
 		if (err_desc->lun_id >= ctl_max_luns ||
 		    (lun = softc->ctl_luns[err_desc->lun_id]) == NULL) {
 			mtx_unlock(&softc->ctl_lock);
 			free(new_err_desc, M_CTL);
 			printf("%s: CTL_ERROR_INJECT: invalid LUN %ju\n",
 			       __func__, (uintmax_t)err_desc->lun_id);
 			retval = EINVAL;
 			break;
 		}
 		mtx_lock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 
 		/*
 		 * We could do some checking here to verify the validity
 		 * of the request, but given the complexity of error
 		 * injection requests, the checking logic would be fairly
 		 * complex.
 		 *
 		 * For now, if the request is invalid, it just won't get
 		 * executed and might get deleted.
 		 */
 		STAILQ_INSERT_TAIL(&lun->error_list, new_err_desc, links);
 
 		/*
 		 * XXX KDM check to make sure the serial number is unique,
 		 * in case we somehow manage to wrap.  That shouldn't
 		 * happen for a very long time, but it's the right thing to
 		 * do.
 		 */
 		new_err_desc->serial = lun->error_serial;
 		err_desc->serial = lun->error_serial;
 		lun->error_serial++;
 
 		mtx_unlock(&lun->lun_lock);
 		break;
 	}
 	case CTL_ERROR_INJECT_DELETE: {
 		struct ctl_error_desc *delete_desc, *desc, *desc2;
 		int delete_done;
 
 		delete_desc = (struct ctl_error_desc *)addr;
 		delete_done = 0;
 
 		mtx_lock(&softc->ctl_lock);
 		if (delete_desc->lun_id >= ctl_max_luns ||
 		    (lun = softc->ctl_luns[delete_desc->lun_id]) == NULL) {
 			mtx_unlock(&softc->ctl_lock);
 			printf("%s: CTL_ERROR_INJECT_DELETE: invalid LUN %ju\n",
 			       __func__, (uintmax_t)delete_desc->lun_id);
 			retval = EINVAL;
 			break;
 		}
 		mtx_lock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 		STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) {
 			if (desc->serial != delete_desc->serial)
 				continue;
 
 			STAILQ_REMOVE(&lun->error_list, desc, ctl_error_desc,
 				      links);
 			free(desc, M_CTL);
 			delete_done = 1;
 		}
 		mtx_unlock(&lun->lun_lock);
 		if (delete_done == 0) {
 			printf("%s: CTL_ERROR_INJECT_DELETE: can't find "
 			       "error serial %ju on LUN %u\n", __func__, 
 			       delete_desc->serial, delete_desc->lun_id);
 			retval = EINVAL;
 			break;
 		}
 		break;
 	}
 	case CTL_DUMP_STRUCTS: {
 		int j, k;
 		struct ctl_port *port;
 		struct ctl_frontend *fe;
 
 		mtx_lock(&softc->ctl_lock);
 		printf("CTL Persistent Reservation information start:\n");
 		STAILQ_FOREACH(lun, &softc->lun_list, links) {
 			mtx_lock(&lun->lun_lock);
 			if ((lun->flags & CTL_LUN_DISABLED) != 0) {
 				mtx_unlock(&lun->lun_lock);
 				continue;
 			}
 
 			for (j = 0; j < ctl_max_ports; j++) {
 				if (lun->pr_keys[j] == NULL)
 					continue;
 				for (k = 0; k < CTL_MAX_INIT_PER_PORT; k++){
 					if (lun->pr_keys[j][k] == 0)
 						continue;
 					printf("  LUN %ju port %d iid %d key "
 					       "%#jx\n", lun->lun, j, k,
 					       (uintmax_t)lun->pr_keys[j][k]);
 				}
 			}
 			mtx_unlock(&lun->lun_lock);
 		}
 		printf("CTL Persistent Reservation information end\n");
 		printf("CTL Ports:\n");
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			printf("  Port %d '%s' Frontend '%s' Type %u pp %d vp %d WWNN "
 			       "%#jx WWPN %#jx\n", port->targ_port, port->port_name,
 			       port->frontend->name, port->port_type,
 			       port->physical_port, port->virtual_port,
 			       (uintmax_t)port->wwnn, (uintmax_t)port->wwpn);
 			for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) {
 				if (port->wwpn_iid[j].in_use == 0 &&
 				    port->wwpn_iid[j].wwpn == 0 &&
 				    port->wwpn_iid[j].name == NULL)
 					continue;
 
 				printf("    iid %u use %d WWPN %#jx '%s'\n",
 				    j, port->wwpn_iid[j].in_use,
 				    (uintmax_t)port->wwpn_iid[j].wwpn,
 				    port->wwpn_iid[j].name);
 			}
 		}
 		printf("CTL Port information end\n");
 		mtx_unlock(&softc->ctl_lock);
 		/*
 		 * XXX KDM calling this without a lock.  We'd likely want
 		 * to drop the lock before calling the frontend's dump
 		 * routine anyway.
 		 */
 		printf("CTL Frontends:\n");
 		STAILQ_FOREACH(fe, &softc->fe_list, links) {
 			printf("  Frontend '%s'\n", fe->name);
 			if (fe->fe_dump != NULL)
 				fe->fe_dump();
 		}
 		printf("CTL Frontend information end\n");
 		break;
 	}
 	case CTL_LUN_REQ: {
 		struct ctl_lun_req *lun_req;
 		struct ctl_backend_driver *backend;
 		void *packed;
 		nvlist_t *tmp_args_nvl;
 		size_t packed_len;
 
 		lun_req = (struct ctl_lun_req *)addr;
 		tmp_args_nvl = lun_req->args_nvl;
 
 		backend = ctl_backend_find(lun_req->backend);
 		if (backend == NULL) {
 			lun_req->status = CTL_LUN_ERROR;
 			snprintf(lun_req->error_str,
 				 sizeof(lun_req->error_str),
 				 "Backend \"%s\" not found.",
 				 lun_req->backend);
 			break;
 		}
 
 		if (lun_req->args != NULL) {
 			packed = malloc(lun_req->args_len, M_CTL, M_WAITOK);
 			if (copyin(lun_req->args, packed, lun_req->args_len) != 0) {
 				free(packed, M_CTL);
 				lun_req->status = CTL_LUN_ERROR;
 				snprintf(lun_req->error_str, sizeof(lun_req->error_str),
 				    "Cannot copyin args.");
 				break;
 			}
 			lun_req->args_nvl = nvlist_unpack(packed,
 			    lun_req->args_len, 0);
 			free(packed, M_CTL);
 
 			if (lun_req->args_nvl == NULL) {
 				lun_req->status = CTL_LUN_ERROR;
 				snprintf(lun_req->error_str, sizeof(lun_req->error_str),
 				    "Cannot unpack args nvlist.");
 				break;
 			}
 		} else
 			lun_req->args_nvl = nvlist_create(0);
 
 		retval = backend->ioctl(dev, cmd, addr, flag, td);
 		nvlist_destroy(lun_req->args_nvl);
 		lun_req->args_nvl = tmp_args_nvl;
 
 		if (lun_req->result_nvl != NULL) {
 			if (lun_req->result != NULL) {
 				packed = nvlist_pack(lun_req->result_nvl,
 				    &packed_len);
 				if (packed == NULL) {
 					lun_req->status = CTL_LUN_ERROR;
 					snprintf(lun_req->error_str,
 					    sizeof(lun_req->error_str),
 					    "Cannot pack result nvlist.");
 					break;
 				}
 
 				if (packed_len > lun_req->result_len) {
 					lun_req->status = CTL_LUN_ERROR;
 					snprintf(lun_req->error_str,
 					    sizeof(lun_req->error_str),
 					    "Result nvlist too large.");
 					free(packed, M_NVLIST);
 					break;
 				}
 
 				if (copyout(packed, lun_req->result, packed_len)) {
 					lun_req->status = CTL_LUN_ERROR;
 					snprintf(lun_req->error_str,
 					    sizeof(lun_req->error_str),
 					    "Cannot copyout() the result.");
 					free(packed, M_NVLIST);
 					break;
 				}
 
 				lun_req->result_len = packed_len;
 				free(packed, M_NVLIST);
 			}
 
 			nvlist_destroy(lun_req->result_nvl);
 		}
 		break;
 	}
 	case CTL_LUN_LIST: {
 		struct sbuf *sb;
 		struct ctl_lun_list *list;
 		const char *name, *value;
 		void *cookie;
 		int type;
 
 		list = (struct ctl_lun_list *)addr;
 
 		/*
 		 * Allocate a fixed length sbuf here, based on the length
 		 * of the user's buffer.  We could allocate an auto-extending
 		 * buffer, and then tell the user how much larger our
 		 * amount of data is than his buffer, but that presents
 		 * some problems:
 		 *
 		 * 1.  The sbuf(9) routines use a blocking malloc, and so
 		 *     we can't hold a lock while calling them with an
 		 *     auto-extending buffer.
  		 *
 		 * 2.  There is not currently a LUN reference counting
 		 *     mechanism, outside of outstanding transactions on
 		 *     the LUN's OOA queue.  So a LUN could go away on us
 		 *     while we're getting the LUN number, backend-specific
 		 *     information, etc.  Thus, given the way things
 		 *     currently work, we need to hold the CTL lock while
 		 *     grabbing LUN information.
 		 *
 		 * So, from the user's standpoint, the best thing to do is
 		 * allocate what he thinks is a reasonable buffer length,
 		 * and then if he gets a CTL_LUN_LIST_NEED_MORE_SPACE error,
 		 * double the buffer length and try again.  (And repeat
 		 * that until he succeeds.)
 		 */
 		sb = sbuf_new(NULL, NULL, list->alloc_len, SBUF_FIXEDLEN);
 		if (sb == NULL) {
 			list->status = CTL_LUN_LIST_ERROR;
 			snprintf(list->error_str, sizeof(list->error_str),
 				 "Unable to allocate %d bytes for LUN list",
 				 list->alloc_len);
 			break;
 		}
 
 		sbuf_printf(sb, "<ctllunlist>\n");
 
 		mtx_lock(&softc->ctl_lock);
 		STAILQ_FOREACH(lun, &softc->lun_list, links) {
 			mtx_lock(&lun->lun_lock);
 			retval = sbuf_printf(sb, "<lun id=\"%ju\">\n",
 					     (uintmax_t)lun->lun);
 
 			/*
 			 * Bail out as soon as we see that we've overfilled
 			 * the buffer.
 			 */
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<backend_type>%s"
 					     "</backend_type>\n",
 					     (lun->backend == NULL) ?  "none" :
 					     lun->backend->name);
 
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<lun_type>%d</lun_type>\n",
 					     lun->be_lun->lun_type);
 
 			if (retval != 0)
 				break;
 
 			if (lun->backend == NULL) {
 				retval = sbuf_printf(sb, "</lun>\n");
 				if (retval != 0)
 					break;
 				continue;
 			}
 
 			retval = sbuf_printf(sb, "\t<size>%ju</size>\n",
 					     (lun->be_lun->maxlba > 0) ?
 					     lun->be_lun->maxlba + 1 : 0);
 
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<blocksize>%u</blocksize>\n",
 					     lun->be_lun->blocksize);
 
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<serial_number>");
 
 			if (retval != 0)
 				break;
 
 			retval = ctl_sbuf_printf_esc(sb,
 			    lun->be_lun->serial_num,
 			    sizeof(lun->be_lun->serial_num));
 
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "</serial_number>\n");
 		
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<device_id>");
 
 			if (retval != 0)
 				break;
 
 			retval = ctl_sbuf_printf_esc(sb,
 			    lun->be_lun->device_id,
 			    sizeof(lun->be_lun->device_id));
 
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "</device_id>\n");
 
 			if (retval != 0)
 				break;
 
 			if (lun->backend->lun_info != NULL) {
 				retval = lun->backend->lun_info(lun->be_lun->be_lun, sb);
 				if (retval != 0)
 					break;
 			}
 
 			cookie = NULL;
 			while ((name = nvlist_next(lun->be_lun->options, &type,
 			    &cookie)) != NULL) {
 				sbuf_printf(sb, "\t<%s>", name);
 
 				if (type == NV_TYPE_STRING) {
 					value = dnvlist_get_string(
 					    lun->be_lun->options, name, NULL);
 					if (value != NULL)
 						sbuf_printf(sb, "%s", value);
 				}
 
 				sbuf_printf(sb, "</%s>\n", name);
 			}
 
 			retval = sbuf_printf(sb, "</lun>\n");
 
 			if (retval != 0)
 				break;
 			mtx_unlock(&lun->lun_lock);
 		}
 		if (lun != NULL)
 			mtx_unlock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 
 		if ((retval != 0)
 		 || ((retval = sbuf_printf(sb, "</ctllunlist>\n")) != 0)) {
 			retval = 0;
 			sbuf_delete(sb);
 			list->status = CTL_LUN_LIST_NEED_MORE_SPACE;
 			snprintf(list->error_str, sizeof(list->error_str),
 				 "Out of space, %d bytes is too small",
 				 list->alloc_len);
 			break;
 		}
 
 		sbuf_finish(sb);
 
 		retval = copyout(sbuf_data(sb), list->lun_xml,
 				 sbuf_len(sb) + 1);
 
 		list->fill_len = sbuf_len(sb) + 1;
 		list->status = CTL_LUN_LIST_OK;
 		sbuf_delete(sb);
 		break;
 	}
 	case CTL_ISCSI: {
 		struct ctl_iscsi *ci;
 		struct ctl_frontend *fe;
 
 		ci = (struct ctl_iscsi *)addr;
 
 		fe = ctl_frontend_find("iscsi");
 		if (fe == NULL) {
 			ci->status = CTL_ISCSI_ERROR;
 			snprintf(ci->error_str, sizeof(ci->error_str),
 			    "Frontend \"iscsi\" not found.");
 			break;
 		}
 
 		retval = fe->ioctl(dev, cmd, addr, flag, td);
 		break;
 	}
 	case CTL_PORT_REQ: {
 		struct ctl_req *req;
 		struct ctl_frontend *fe;
 		void *packed;
 		nvlist_t *tmp_args_nvl;
 		size_t packed_len;
 
 		req = (struct ctl_req *)addr;
 		tmp_args_nvl = req->args_nvl;
 
 		fe = ctl_frontend_find(req->driver);
 		if (fe == NULL) {
 			req->status = CTL_LUN_ERROR;
 			snprintf(req->error_str, sizeof(req->error_str),
 			    "Frontend \"%s\" not found.", req->driver);
 			break;
 		}
 
 		if (req->args != NULL) {
 			packed = malloc(req->args_len, M_CTL, M_WAITOK);
 			if (copyin(req->args, packed, req->args_len) != 0) {
 				free(packed, M_CTL);
 				req->status = CTL_LUN_ERROR;
 				snprintf(req->error_str, sizeof(req->error_str),
 				    "Cannot copyin args.");
 				break;
 			}
 			req->args_nvl = nvlist_unpack(packed,
 			    req->args_len, 0);
 			free(packed, M_CTL);
 
 			if (req->args_nvl == NULL) {
 				req->status = CTL_LUN_ERROR;
 				snprintf(req->error_str, sizeof(req->error_str),
 				    "Cannot unpack args nvlist.");
 				break;
 			}
 		} else
 			req->args_nvl = nvlist_create(0);
 
 		if (fe->ioctl)
 			retval = fe->ioctl(dev, cmd, addr, flag, td);
 		else
 			retval = ENODEV;
 
 		nvlist_destroy(req->args_nvl);
 		req->args_nvl = tmp_args_nvl;
 
 		if (req->result_nvl != NULL) {
 			if (req->result != NULL) {
 				packed = nvlist_pack(req->result_nvl,
 				    &packed_len);
 				if (packed == NULL) {
 					req->status = CTL_LUN_ERROR;
 					snprintf(req->error_str,
 					    sizeof(req->error_str),
 					    "Cannot pack result nvlist.");
 					break;
 				}
 
 				if (packed_len > req->result_len) {
 					req->status = CTL_LUN_ERROR;
 					snprintf(req->error_str,
 					    sizeof(req->error_str),
 					    "Result nvlist too large.");
 					free(packed, M_NVLIST);
 					break;
 				}
 
 				if (copyout(packed, req->result, packed_len)) {
 					req->status = CTL_LUN_ERROR;
 					snprintf(req->error_str,
 					    sizeof(req->error_str),
 					    "Cannot copyout() the result.");
 					free(packed, M_NVLIST);
 					break;
 				}
 
 				req->result_len = packed_len;
 				free(packed, M_NVLIST);
 			}
 
 			nvlist_destroy(req->result_nvl);
 		}
 		break;
 	}
 	case CTL_PORT_LIST: {
 		struct sbuf *sb;
 		struct ctl_port *port;
 		struct ctl_lun_list *list;
 		const char *name, *value;
 		void *cookie;
 		int j, type;
 		uint32_t plun;
 
 		list = (struct ctl_lun_list *)addr;
 
 		sb = sbuf_new(NULL, NULL, list->alloc_len, SBUF_FIXEDLEN);
 		if (sb == NULL) {
 			list->status = CTL_LUN_LIST_ERROR;
 			snprintf(list->error_str, sizeof(list->error_str),
 				 "Unable to allocate %d bytes for LUN list",
 				 list->alloc_len);
 			break;
 		}
 
 		sbuf_printf(sb, "<ctlportlist>\n");
 
 		mtx_lock(&softc->ctl_lock);
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			retval = sbuf_printf(sb, "<targ_port id=\"%ju\">\n",
 					     (uintmax_t)port->targ_port);
 
 			/*
 			 * Bail out as soon as we see that we've overfilled
 			 * the buffer.
 			 */
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<frontend_type>%s"
 			    "</frontend_type>\n", port->frontend->name);
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<port_type>%d</port_type>\n",
 					     port->port_type);
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<online>%s</online>\n",
 			    (port->status & CTL_PORT_STATUS_ONLINE) ? "YES" : "NO");
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<port_name>%s</port_name>\n",
 			    port->port_name);
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<physical_port>%d</physical_port>\n",
 			    port->physical_port);
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "\t<virtual_port>%d</virtual_port>\n",
 			    port->virtual_port);
 			if (retval != 0)
 				break;
 
 			if (port->target_devid != NULL) {
 				sbuf_printf(sb, "\t<target>");
 				ctl_id_sbuf(port->target_devid, sb);
 				sbuf_printf(sb, "</target>\n");
 			}
 
 			if (port->port_devid != NULL) {
 				sbuf_printf(sb, "\t<port>");
 				ctl_id_sbuf(port->port_devid, sb);
 				sbuf_printf(sb, "</port>\n");
 			}
 
 			if (port->port_info != NULL) {
 				retval = port->port_info(port->onoff_arg, sb);
 				if (retval != 0)
 					break;
 			}
 
 			cookie = NULL;
 			while ((name = nvlist_next(port->options, &type,
 			    &cookie)) != NULL) {
 				sbuf_printf(sb, "\t<%s>", name);
 
 				if (type == NV_TYPE_STRING) {
 					value = dnvlist_get_string(port->options,
 					    name, NULL);
 					if (value != NULL)
 						sbuf_printf(sb, "%s", value);
 				}
 
 				sbuf_printf(sb, "</%s>\n", name);
 			}
 
 			if (port->lun_map != NULL) {
 				sbuf_printf(sb, "\t<lun_map>on</lun_map>\n");
 				for (j = 0; j < port->lun_map_size; j++) {
 					plun = ctl_lun_map_from_port(port, j);
 					if (plun == UINT32_MAX)
 						continue;
 					sbuf_printf(sb,
 					    "\t<lun id=\"%u\">%u</lun>\n",
 					    j, plun);
 				}
 			}
 
 			for (j = 0; j < CTL_MAX_INIT_PER_PORT; j++) {
 				if (port->wwpn_iid[j].in_use == 0 ||
 				    (port->wwpn_iid[j].wwpn == 0 &&
 				     port->wwpn_iid[j].name == NULL))
 					continue;
 
 				if (port->wwpn_iid[j].name != NULL)
 					retval = sbuf_printf(sb,
 					    "\t<initiator id=\"%u\">%s</initiator>\n",
 					    j, port->wwpn_iid[j].name);
 				else
 					retval = sbuf_printf(sb,
 					    "\t<initiator id=\"%u\">naa.%08jx</initiator>\n",
 					    j, port->wwpn_iid[j].wwpn);
 				if (retval != 0)
 					break;
 			}
 			if (retval != 0)
 				break;
 
 			retval = sbuf_printf(sb, "</targ_port>\n");
 			if (retval != 0)
 				break;
 		}
 		mtx_unlock(&softc->ctl_lock);
 
 		if ((retval != 0)
 		 || ((retval = sbuf_printf(sb, "</ctlportlist>\n")) != 0)) {
 			retval = 0;
 			sbuf_delete(sb);
 			list->status = CTL_LUN_LIST_NEED_MORE_SPACE;
 			snprintf(list->error_str, sizeof(list->error_str),
 				 "Out of space, %d bytes is too small",
 				 list->alloc_len);
 			break;
 		}
 
 		sbuf_finish(sb);
 
 		retval = copyout(sbuf_data(sb), list->lun_xml,
 				 sbuf_len(sb) + 1);
 
 		list->fill_len = sbuf_len(sb) + 1;
 		list->status = CTL_LUN_LIST_OK;
 		sbuf_delete(sb);
 		break;
 	}
 	case CTL_LUN_MAP: {
 		struct ctl_lun_map *lm  = (struct ctl_lun_map *)addr;
 		struct ctl_port *port;
 
 		mtx_lock(&softc->ctl_lock);
 		if (lm->port < softc->port_min ||
 		    lm->port >= softc->port_max ||
 		    (port = softc->ctl_ports[lm->port]) == NULL) {
 			mtx_unlock(&softc->ctl_lock);
 			return (ENXIO);
 		}
 		if (port->status & CTL_PORT_STATUS_ONLINE) {
 			STAILQ_FOREACH(lun, &softc->lun_list, links) {
 				if (ctl_lun_map_to_port(port, lun->lun) ==
 				    UINT32_MAX)
 					continue;
 				mtx_lock(&lun->lun_lock);
 				ctl_est_ua_port(lun, lm->port, -1,
 				    CTL_UA_LUN_CHANGE);
 				mtx_unlock(&lun->lun_lock);
 			}
 		}
 		mtx_unlock(&softc->ctl_lock); // XXX: port_enable sleeps
 		if (lm->plun != UINT32_MAX) {
 			if (lm->lun == UINT32_MAX)
 				retval = ctl_lun_map_unset(port, lm->plun);
 			else if (lm->lun < ctl_max_luns &&
 			    softc->ctl_luns[lm->lun] != NULL)
 				retval = ctl_lun_map_set(port, lm->plun, lm->lun);
 			else
 				return (ENXIO);
 		} else {
 			if (lm->lun == UINT32_MAX)
 				retval = ctl_lun_map_deinit(port);
 			else
 				retval = ctl_lun_map_init(port);
 		}
 		if (port->status & CTL_PORT_STATUS_ONLINE)
 			ctl_isc_announce_port(port);
 		break;
 	}
 	case CTL_GET_LUN_STATS: {
 		struct ctl_get_io_stats *stats = (struct ctl_get_io_stats *)addr;
 		int i;
 
 		/*
 		 * XXX KDM no locking here.  If the LUN list changes,
 		 * things can blow up.
 		 */
 		i = 0;
 		stats->status = CTL_SS_OK;
 		stats->fill_len = 0;
 		STAILQ_FOREACH(lun, &softc->lun_list, links) {
 			if (lun->lun < stats->first_item)
 				continue;
 			if (stats->fill_len + sizeof(lun->stats) >
 			    stats->alloc_len) {
 				stats->status = CTL_SS_NEED_MORE_SPACE;
 				break;
 			}
 			retval = copyout(&lun->stats, &stats->stats[i++],
 					 sizeof(lun->stats));
 			if (retval != 0)
 				break;
 			stats->fill_len += sizeof(lun->stats);
 		}
 		stats->num_items = softc->num_luns;
 		stats->flags = CTL_STATS_FLAG_NONE;
 #ifdef CTL_TIME_IO
 		stats->flags |= CTL_STATS_FLAG_TIME_VALID;
 #endif
 		getnanouptime(&stats->timestamp);
 		break;
 	}
 	case CTL_GET_PORT_STATS: {
 		struct ctl_get_io_stats *stats = (struct ctl_get_io_stats *)addr;
 		int i;
 
 		/*
 		 * XXX KDM no locking here.  If the LUN list changes,
 		 * things can blow up.
 		 */
 		i = 0;
 		stats->status = CTL_SS_OK;
 		stats->fill_len = 0;
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			if (port->targ_port < stats->first_item)
 				continue;
 			if (stats->fill_len + sizeof(port->stats) >
 			    stats->alloc_len) {
 				stats->status = CTL_SS_NEED_MORE_SPACE;
 				break;
 			}
 			retval = copyout(&port->stats, &stats->stats[i++],
 					 sizeof(port->stats));
 			if (retval != 0)
 				break;
 			stats->fill_len += sizeof(port->stats);
 		}
 		stats->num_items = softc->num_ports;
 		stats->flags = CTL_STATS_FLAG_NONE;
 #ifdef CTL_TIME_IO
 		stats->flags |= CTL_STATS_FLAG_TIME_VALID;
 #endif
 		getnanouptime(&stats->timestamp);
 		break;
 	}
 	default: {
 		/* XXX KDM should we fix this? */
 #if 0
 		struct ctl_backend_driver *backend;
 		unsigned int type;
 		int found;
 
 		found = 0;
 
 		/*
 		 * We encode the backend type as the ioctl type for backend
 		 * ioctls.  So parse it out here, and then search for a
 		 * backend of this type.
 		 */
 		type = _IOC_TYPE(cmd);
 
 		STAILQ_FOREACH(backend, &softc->be_list, links) {
 			if (backend->type == type) {
 				found = 1;
 				break;
 			}
 		}
 		if (found == 0) {
 			printf("ctl: unknown ioctl command %#lx or backend "
 			       "%d\n", cmd, type);
 			retval = EINVAL;
 			break;
 		}
 		retval = backend->ioctl(dev, cmd, addr, flag, td);
 #endif
 		retval = ENOTTY;
 		break;
 	}
 	}
 	return (retval);
 }
 
 /*
  * Flatten a nexus into a single initiator index.  Each target port owns
  * a run of CTL_MAX_INIT_PER_PORT consecutive indices; the initiator ID
  * selects the slot within that run.
  */
 uint32_t
 ctl_get_initindex(struct ctl_nexus *nexus)
 {
 	uint32_t index;
 
 	index = nexus->initid + (nexus->targ_port * CTL_MAX_INIT_PER_PORT);
 	return (index);
 }
 
 /*
  * Give the port a LUN map in which every entry is unmapped (UINT32_MAX).
  *
  * An existing map is reused when it already holds at least
  * ctl_lun_map_size entries; otherwise it is freed and a new one is
  * allocated with M_NOWAIT.  Returns ENOMEM when no map is available,
  * 0 otherwise.  For an online port, the lun_disable callback (if set) is
  * invoked for every LUN on the softc list and the port is re-announced.
  */
 int
 ctl_lun_map_init(struct ctl_port *port)
 {
 	struct ctl_softc *softc;
 	struct ctl_lun *lun;
 	int nentries;
 	uint32_t idx;
 
 	softc = port->ctl_softc;
 	nentries = ctl_lun_map_size;
 
 	if (port->lun_map == NULL || port->lun_map_size < nentries) {
 		/* Advertise an empty map before the old array goes away. */
 		port->lun_map_size = 0;
 		free(port->lun_map, M_CTL);
 		port->lun_map = malloc(nentries * sizeof(uint32_t),
 		    M_CTL, M_NOWAIT);
 	}
 	if (port->lun_map == NULL)
 		return (ENOMEM);
 
 	idx = 0;
 	while (idx < nentries) {
 		port->lun_map[idx] = UINT32_MAX;
 		idx++;
 	}
 	port->lun_map_size = nentries;
 
 	/* Nothing to tell anyone about while the port is offline. */
 	if ((port->status & CTL_PORT_STATUS_ONLINE) == 0)
 		return (0);
 
 	if (port->lun_disable != NULL) {
 		STAILQ_FOREACH(lun, &softc->lun_list, links)
 			port->lun_disable(port->targ_lun_arg, lun->lun);
 	}
 	ctl_isc_announce_port(port);
 	return (0);
 }
 
 int
 ctl_lun_map_deinit(struct ctl_port *port)
 {
 	struct ctl_softc *softc = port->ctl_softc;
 	struct ctl_lun *lun;
 
 	if (port->lun_map == NULL)
 		return (0);
 	port->lun_map_size = 0;
 	free(port->lun_map, M_CTL);
 	port->lun_map = NULL;
 	if (port->status & CTL_PORT_STATUS_ONLINE) {
 		if (port->lun_enable != NULL) {
 			STAILQ_FOREACH(lun, &softc->lun_list, links)
 				port->lun_enable(port->targ_lun_arg, lun->lun);
 		}
 		ctl_isc_announce_port(port);
 	}
 	return (0);
 }
 
 int
 ctl_lun_map_set(struct ctl_port *port, uint32_t plun, uint32_t glun)
 {
 	int status;
 	uint32_t old;
 
 	if (port->lun_map == NULL) {
 		status = ctl_lun_map_init(port);
 		if (status != 0)
 			return (status);
 	}
 	if (plun >= port->lun_map_size)
 		return (EINVAL);
 	old = port->lun_map[plun];
 	port->lun_map[plun] = glun;
 	if ((port->status & CTL_PORT_STATUS_ONLINE) && old == UINT32_MAX) {
 		if (port->lun_enable != NULL)
 			port->lun_enable(port->targ_lun_arg, plun);
 		ctl_isc_announce_port(port);
 	}
 	return (0);
 }
 
 /*
  * Remove the mapping for port-visible LUN 'plun'.
  *
  * A missing map or an out-of-range 'plun' is silently accepted.  When the
  * port is online and the slot was actually mapped, the lun_disable
  * callback (if set) is invoked for 'plun' and the port is re-announced.
  * Always returns 0.
  */
 int
 ctl_lun_map_unset(struct ctl_port *port, uint32_t plun)
 {
 	uint32_t prev;
 
 	if (port->lun_map != NULL && plun < port->lun_map_size) {
 		prev = port->lun_map[plun];
 		port->lun_map[plun] = UINT32_MAX;
 
 		if ((port->status & CTL_PORT_STATUS_ONLINE) != 0 &&
 		    prev != UINT32_MAX) {
 			if (port->lun_disable != NULL)
 				port->lun_disable(port->targ_lun_arg, plun);
 			ctl_isc_announce_port(port);
 		}
 	}
 	return (0);
 }
 
 /*
  * Translate a port-visible LUN ID into the global LUN ID.
  *
  * Returns UINT32_MAX when 'port' is NULL or 'lun_id' lies outside the
  * port's LUN map.  A port without a LUN map maps every ID to itself.
  * Otherwise the map entry is returned (which is UINT32_MAX for a slot
  * that has been left unmapped).
  */
 uint32_t
 ctl_lun_map_from_port(struct ctl_port *port, uint32_t lun_id)
 {
 
 	if (port == NULL)
 		return (UINT32_MAX);
 	if (port->lun_map == NULL)
 		return (lun_id);
 	/*
 	 * Valid indices are 0 .. lun_map_size - 1; lun_id == lun_map_size
 	 * must be rejected too, or the read below runs one element past
 	 * the end of the array.
 	 */
 	if (lun_id >= port->lun_map_size)
 		return (UINT32_MAX);
 	return (port->lun_map[lun_id]);
 }
 
 /*
  * Translate a global LUN ID into the port-visible LUN ID.
  *
  * Returns UINT32_MAX when 'port' is NULL or no map slot refers to
  * 'lun_id'.  A port without a LUN map maps every ID to itself.  When
  * several slots hold 'lun_id', the lowest slot index wins.
  */
 uint32_t
 ctl_lun_map_to_port(struct ctl_port *port, uint32_t lun_id)
 {
 	uint32_t slot;
 
 	if (port == NULL)
 		return (UINT32_MAX);
 	if (port->lun_map == NULL)
 		return (lun_id);
 
 	slot = 0;
 	while (slot < port->lun_map_size) {
 		if (port->lun_map[slot] == lun_id)
 			return (slot);
 		slot++;
 	}
 	return (UINT32_MAX);
 }
 
 /*
  * Decode an 8-byte LUN field, passed as a 64-bit integer whose most
  * significant byte is the first byte of the field, into a flat 32-bit
  * LUN number.
  *
  * The address method is taken from the top bits of byte 0
  * (RPL_LUNDATA_ATYP_MASK).  Peripheral, flat and extended (method 0x02
  * with length codes 0x00, 0x10 and 0x20) encodings are recognized.
  * Returns 0xffffffff for anything else, or when bytes that must be zero
  * for the given encoding are not.
  */
 uint32_t
 ctl_decode_lun(uint64_t encoded)
 {
 	uint8_t lun[8];
 	uint32_t result = 0xffffffff;
 
 	be64enc(lun, encoded);
 	switch (lun[0] & RPL_LUNDATA_ATYP_MASK) {
 	case RPL_LUNDATA_ATYP_PERIPH:
 		if ((lun[0] & 0x3f) == 0 && lun[2] == 0 && lun[3] == 0 &&
 		    lun[4] == 0 && lun[5] == 0 && lun[6] == 0 && lun[7] == 0)
 			result = lun[1];
 		break;
 	case RPL_LUNDATA_ATYP_FLAT:
 		if (lun[2] == 0 && lun[3] == 0 && lun[4] == 0 && lun[5] == 0 &&
 		    lun[6] == 0 && lun[7] == 0)
 			result = ((uint32_t)(lun[0] & 0x3f) << 8) + lun[1];
 		break;
 	case RPL_LUNDATA_ATYP_EXTLUN:
 		switch (lun[0] & RPL_LUNDATA_EXT_EAM_MASK) {
 		case 0x02:
 			switch (lun[0] & RPL_LUNDATA_EXT_LEN_MASK) {
 			case 0x00:
 				result = lun[1];
 				break;
 			case 0x10:
 				result = ((uint32_t)lun[1] << 16) +
 				    ((uint32_t)lun[2] << 8) + lun[3];
 				break;
 			case 0x20:
 				/*
 				 * Shift in uint32_t: a uint8_t promotes to
 				 * int, and int << 24 overflows (undefined
 				 * behavior) once lun[2] >= 0x80.
 				 */
 				if (lun[1] == 0 && lun[6] == 0 && lun[7] == 0)
 					result = ((uint32_t)lun[2] << 24) +
 					    ((uint32_t)lun[3] << 16) +
 					    ((uint32_t)lun[4] << 8) +
 					    lun[5];
 				break;
 			}
 			break;
 		case RPL_LUNDATA_EXT_EAM_NOT_SPEC:
 			result = 0xffffffff;
 			break;
 		}
 		break;
 	}
 	return (result);
 }
 
 /*
  * Encode a flat 32-bit LUN number into a 64-bit LUN field, choosing the
  * smallest of the encodings below that fits.  The address-method byte
  * lands in bits 63..56:
  *
  *   <= 0xff      RPL_LUNDATA_ATYP_PERIPH,          value << 48
  *   <= 0x3fff    RPL_LUNDATA_ATYP_FLAT,            value << 48
  *   <= 0xffffff  RPL_LUNDATA_ATYP_EXTLUN | 0x12,   value << 32
  *   otherwise    RPL_LUNDATA_ATYP_EXTLUN | 0x22,   value << 16
  */
 uint64_t
 ctl_encode_lun(uint32_t decoded)
 {
 	uint64_t value = decoded;
 	uint64_t method;
 	unsigned int shift;
 
 	if (value <= 0xff) {
 		method = RPL_LUNDATA_ATYP_PERIPH;
 		shift = 48;
 	} else if (value <= 0x3fff) {
 		method = RPL_LUNDATA_ATYP_FLAT;
 		shift = 48;
 	} else if (value <= 0xffffff) {
 		method = RPL_LUNDATA_ATYP_EXTLUN | 0x12;
 		shift = 32;
 	} else {
 		method = RPL_LUNDATA_ATYP_EXTLUN | 0x22;
 		shift = 16;
 	}
 	return ((method << 56) | (value << shift));
 }
 
 /*
  * Find the first zero bit in a bitmap of 32-bit words.
  *
  * Bits 'first' (inclusive) through 'last' (exclusive) are examined; bit
  * N lives in word N / 32 at position N % 32.  Returns the index of the
  * first clear bit, or -1 when every bit in the range is set (or the
  * range is empty).
  */
 int
 ctl_ffz(uint32_t *mask, uint32_t first, uint32_t last)
 {
 	uint32_t i;
 
 	for (i = first; i < last; i++) {
 		/*
 		 * Build the probe in uint32_t: "1 << 31" on a plain int
 		 * shifts into the sign bit, which is undefined behavior.
 		 */
 		if ((mask[i / 32] & ((uint32_t)1 << (i % 32))) == 0)
 			return ((int)i);
 	}
 	return (-1);
 }
 
 /*
  * Set bit 'bit' in a bitmap of 32-bit words.
  *
  * Returns -1 (leaving the bitmap untouched) when the bit was already
  * set, 0 after setting it.
  */
 int
 ctl_set_mask(uint32_t *mask, uint32_t bit)
 {
 	uint32_t chunk, piece, flag;
 
 	chunk = bit >> 5;
 	piece = bit % (sizeof(uint32_t) * 8);
 	/*
 	 * Unsigned shift: "1 << 31" on a plain int shifts into the sign
 	 * bit, which is undefined behavior.
 	 */
 	flag = (uint32_t)1 << piece;
 
 	if ((mask[chunk] & flag) != 0)
 		return (-1);
 	mask[chunk] |= flag;
 
 	return (0);
 }
 
 /*
  * Clear bit 'bit' in a bitmap of 32-bit words.
  *
  * Returns -1 (leaving the bitmap untouched) when the bit was already
  * clear, 0 after clearing it.
  */
 int
 ctl_clear_mask(uint32_t *mask, uint32_t bit)
 {
 	uint32_t chunk, piece, flag;
 
 	chunk = bit >> 5;
 	piece = bit % (sizeof(uint32_t) * 8);
 	/*
 	 * Unsigned shift: "1 << 31" on a plain int shifts into the sign
 	 * bit, which is undefined behavior.
 	 */
 	flag = (uint32_t)1 << piece;
 
 	if ((mask[chunk] & flag) == 0)
 		return (-1);
 	mask[chunk] &= ~flag;
 
 	return (0);
 }
 
 /*
  * Test bit 'bit' in a bitmap of 32-bit words.
  *
  * Returns 1 when the bit is set, 0 when it is clear.
  */
 int
 ctl_is_set(uint32_t *mask, uint32_t bit)
 {
 	uint32_t chunk, piece;
 
 	chunk = bit >> 5;
 	piece = bit % (sizeof(uint32_t) * 8);
 
 	/*
 	 * Unsigned shift: "1 << 31" on a plain int shifts into the sign
 	 * bit, which is undefined behavior.
 	 */
 	if ((mask[chunk] & ((uint32_t)1 << piece)) == 0)
 		return (0);
 	return (1);
 }
 
 /*
  * Fetch the key stored for index 'residx' in the LUN's pr_keys table.
  * Keys are kept in per-port arrays of CTL_MAX_INIT_PER_PORT entries;
  * when the array covering 'residx' has not been allocated the key reads
  * as 0.
  */
 static uint64_t
 ctl_get_prkey(struct ctl_lun *lun, uint32_t residx)
 {
 	uint64_t *keys;
 
 	keys = lun->pr_keys[residx / CTL_MAX_INIT_PER_PORT];
 	return (keys == NULL ? 0 : keys[residx % CTL_MAX_INIT_PER_PORT]);
 }
 
 /*
  * Zero the key stored for index 'residx' in the LUN's pr_keys table.
  * Does nothing when the per-port array covering 'residx' has not been
  * allocated.
  */
 static void
 ctl_clr_prkey(struct ctl_lun *lun, uint32_t residx)
 {
 	uint64_t *keys;
 
 	keys = lun->pr_keys[residx / CTL_MAX_INIT_PER_PORT];
 	if (keys != NULL)
 		keys[residx % CTL_MAX_INIT_PER_PORT] = 0;
 }
 
 /*
  * Make sure the per-port key array covering index 'residx' exists.
  *
  * lun->lun_lock is released around the zeroed M_WAITOK allocation and
  * re-acquired afterwards, so this expects to be entered with the lock
  * held.  Because the lock was dropped, the slot is checked again before
  * the new array is installed; if it was filled in the meantime the new
  * array is freed instead.
  */
 static void
 ctl_alloc_prkey(struct ctl_lun *lun, uint32_t residx)
 {
 	uint64_t *fresh;
 	u_int slot;
 
 	slot = residx / CTL_MAX_INIT_PER_PORT;
 	if (lun->pr_keys[slot] != NULL)
 		return;
 
 	mtx_unlock(&lun->lun_lock);
 	fresh = malloc(sizeof(uint64_t) * CTL_MAX_INIT_PER_PORT, M_CTL,
 	    M_WAITOK | M_ZERO);
 	mtx_lock(&lun->lun_lock);
 
 	if (lun->pr_keys[slot] != NULL) {
 		/* The slot was populated while the lock was dropped. */
 		free(fresh, M_CTL);
 		return;
 	}
 	lun->pr_keys[slot] = fresh;
 }
 
 /*
  * Store 'key' for index 'residx' in the LUN's pr_keys table.  The
  * per-port array covering 'residx' must already exist; this is asserted
  * rather than handled.
  */
 static void
 ctl_set_prkey(struct ctl_lun *lun, uint32_t residx, uint64_t key)
 {
 	uint64_t *keys = lun->pr_keys[residx / CTL_MAX_INIT_PER_PORT];
 
 	KASSERT(keys != NULL, ("prkey %d is not allocated", residx));
 	keys[residx % CTL_MAX_INIT_PER_PORT] = key;
 }
 
 /*
  * Create an I/O pool descriptor.
  *
  * Inputs:  ctl_softc, pool_name, total_ctl_io (only referenced by the
  *          commented-out preallocation below).
  * Output:  *npool receives the new pool on success.
  *
  * The descriptor is allocated zeroed with M_NOWAIT and named
  * "CTL IO <pool_name>".  With IO_POOLS defined the pool gets its own
  * secondary UMA zone on top of the softc's io_zone; otherwise it shares
  * io_zone directly.  Returns ENOMEM when the descriptor cannot be
  * allocated, 0 otherwise.
  */
 int
 ctl_pool_create(struct ctl_softc *ctl_softc, const char *pool_name,
 		uint32_t total_ctl_io, void **npool)
 {
 	struct ctl_io_pool *new_pool;
 
 	new_pool = malloc(sizeof(*new_pool), M_CTL, M_NOWAIT | M_ZERO);
 	if (new_pool == NULL)
 		return (ENOMEM);
 
 	new_pool->ctl_softc = ctl_softc;
 	snprintf(new_pool->name, sizeof(new_pool->name), "CTL IO %s",
 	    pool_name);
 #ifdef IO_POOLS
 	new_pool->zone = uma_zsecond_create(new_pool->name, NULL,
 	    NULL, NULL, NULL, ctl_softc->io_zone);
 	/* uma_prealloc(new_pool->zone, total_ctl_io); */
 #else
 	new_pool->zone = ctl_softc->io_zone;
 #endif
 
 	*npool = new_pool;
 	return (0);
 }
 
 /*
  * Release an I/O pool descriptor.  A NULL pool is ignored.  With
  * IO_POOLS defined the pool's UMA zone is destroyed before the
  * descriptor itself is freed.
  */
 void
 ctl_pool_free(struct ctl_io_pool *pool)
 {
 
 	if (pool != NULL) {
 #ifdef IO_POOLS
 		uma_zdestroy(pool->zone);
 #endif
 		free(pool, M_CTL);
 	}
 }
 
 /*
  * Allocate a ctl_io from the given pool's zone using M_WAITOK.
  *
  * On success the I/O is tagged with its pool and softc and its
  * blocked_queue is initialized.  The zone allocator's result is
  * returned as-is, including NULL.
  */
 union ctl_io *
 ctl_alloc_io(void *pool_ref)
 {
 	struct ctl_io_pool *owner;
 	union ctl_io *io;
 
 	owner = (struct ctl_io_pool *)pool_ref;
 	io = uma_zalloc(owner->zone, M_WAITOK);
 	if (io == NULL)
 		return (NULL);
 
 	io->io_hdr.pool = pool_ref;
 	CTL_SOFTC(io) = owner->ctl_softc;
 	TAILQ_INIT(&io->io_hdr.blocked_queue);
 	return (io);
 }
 
 /*
  * Allocate a ctl_io from the given pool's zone using M_NOWAIT.
  *
  * On success the I/O is tagged with its pool and softc and its
  * blocked_queue is initialized.  Returns NULL when the zone allocator
  * returns NULL.
  */
 union ctl_io *
 ctl_alloc_io_nowait(void *pool_ref)
 {
 	struct ctl_io_pool *owner;
 	union ctl_io *io;
 
 	owner = (struct ctl_io_pool *)pool_ref;
 	io = uma_zalloc(owner->zone, M_NOWAIT);
 	if (io == NULL)
 		return (NULL);
 
 	io->io_hdr.pool = pool_ref;
 	CTL_SOFTC(io) = owner->ctl_softc;
 	TAILQ_INIT(&io->io_hdr.blocked_queue);
 	return (io);
 }
 
 /*
  * Return a ctl_io to the zone of the pool recorded in its header.
  * A NULL io is ignored.
  */
 void
 ctl_free_io(union ctl_io *io)
 {
 	struct ctl_io_pool *owner;
 
 	if (io != NULL) {
 		owner = (struct ctl_io_pool *)io->io_hdr.pool;
 		uma_zfree(owner->zone, io);
 	}
 }
 
 /*
  * Reset a ctl_io to all zeroes while keeping it usable: the pool
  * pointer is saved across the memset and restored, the softc pointer is
  * re-derived from the pool, and blocked_queue is re-initialized.
  * A NULL io is ignored.
  */
 void
 ctl_zero_io(union ctl_io *io)
 {
 	struct ctl_io_pool *owner;
 
 	if (io != NULL) {
 		/*
 		 * May need to preserve linked list pointers at some point
 		 * too.
 		 */
 		owner = io->io_hdr.pool;
 		memset(io, 0, sizeof(*io));
 		io->io_hdr.pool = owner;
 		CTL_SOFTC(io) = owner->ctl_softc;
 		TAILQ_INIT(&io->io_hdr.blocked_queue);
 	}
 }
 
 /*
  * Parse a number with an optional binary-unit suffix.
  *
  * The numeric part is read with strtoq() in base 0.  The character that
  * follows it selects the multiplier, case-insensitively: k, m, g, t, p
  * and e scale by 2^10 .. 2^60; 'b' or end of string means no scaling.
  * On success the value is stored in *num and 0 is returned.  An unknown
  * suffix, or a scaled value that does not fit in 64 bits, returns -1 and
  * leaves *num untouched.
  */
 int
 ctl_expand_number(const char *buf, uint64_t *num)
 {
 	/* Position in this list, plus one, times ten gives the shift. */
 	static const char suffixes[] = "kmgtpe";
 	char *endptr;
 	uint64_t number;
 	unsigned shift, idx;
 	int unit;
 
 	number = strtoq(buf, &endptr, 0);
 	unit = tolower((unsigned char)*endptr);
 
 	if (unit == 'b' || unit == '\0') {
 		/* No unit. */
 		*num = number;
 		return (0);
 	}
 
 	for (idx = 0; suffixes[idx] != '\0'; idx++) {
 		if (suffixes[idx] == unit)
 			break;
 	}
 	if (suffixes[idx] == '\0') {
 		/* Unrecognized unit. */
 		return (-1);
 	}
 	shift = 10 * (idx + 1);
 
 	if ((number << shift) >> shift != number) {
 		/* Overflow */
 		return (-1);
 	}
 	*num = number << shift;
 	return (0);
 }
 
 
 /*
  * This routine could be used in the future to load default and/or saved
  * mode page parameters for a particular LUN.
  */
 static int
 ctl_init_page_index(struct ctl_lun *lun)
 {
 	int i, page_code;
 	struct ctl_page_index *page_index;
 	const char *value;
 	uint64_t ival;
 
 	memcpy(&lun->mode_pages.index, page_index_template,
 	       sizeof(page_index_template));
 
 	for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 
 		page_index = &lun->mode_pages.index[i];
 		if (lun->be_lun->lun_type == T_DIRECT &&
 		    (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0)
 			continue;
 		if (lun->be_lun->lun_type == T_PROCESSOR &&
 		    (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0)
 			continue;
 		if (lun->be_lun->lun_type == T_CDROM &&
 		    (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0)
 			continue;
 
 		page_code = page_index->page_code & SMPH_PC_MASK;
 		switch (page_code) {
 		case SMS_RW_ERROR_RECOVERY_PAGE: {
 			KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0,
 			    ("subpage %#x for page %#x is incorrect!",
 			    page_index->subpage, page_code));
 			memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_CURRENT],
 			       &rw_er_page_default,
 			       sizeof(rw_er_page_default));
 			memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_CHANGEABLE],
 			       &rw_er_page_changeable,
 			       sizeof(rw_er_page_changeable));
 			memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_DEFAULT],
 			       &rw_er_page_default,
 			       sizeof(rw_er_page_default));
 			memcpy(&lun->mode_pages.rw_er_page[CTL_PAGE_SAVED],
 			       &rw_er_page_default,
 			       sizeof(rw_er_page_default));
 			page_index->page_data =
 				(uint8_t *)lun->mode_pages.rw_er_page;
 			break;
 		}
 		case SMS_FORMAT_DEVICE_PAGE: {
 			struct scsi_format_page *format_page;
 
 			KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0,
 			    ("subpage %#x for page %#x is incorrect!",
 			    page_index->subpage, page_code));
 
 			/*
 			 * Sectors per track are set above.  Bytes per
 			 * sector need to be set here on a per-LUN basis.
 			 */
 			memcpy(&lun->mode_pages.format_page[CTL_PAGE_CURRENT],
 			       &format_page_default,
 			       sizeof(format_page_default));
 			memcpy(&lun->mode_pages.format_page[
 			       CTL_PAGE_CHANGEABLE], &format_page_changeable,
 			       sizeof(format_page_changeable));
 			memcpy(&lun->mode_pages.format_page[CTL_PAGE_DEFAULT],
 			       &format_page_default,
 			       sizeof(format_page_default));
 			memcpy(&lun->mode_pages.format_page[CTL_PAGE_SAVED],
 			       &format_page_default,
 			       sizeof(format_page_default));
 
 			format_page = &lun->mode_pages.format_page[
 				CTL_PAGE_CURRENT];
 			scsi_ulto2b(lun->be_lun->blocksize,
 				    format_page->bytes_per_sector);
 
 			format_page = &lun->mode_pages.format_page[
 				CTL_PAGE_DEFAULT];
 			scsi_ulto2b(lun->be_lun->blocksize,
 				    format_page->bytes_per_sector);
 
 			format_page = &lun->mode_pages.format_page[
 				CTL_PAGE_SAVED];
 			scsi_ulto2b(lun->be_lun->blocksize,
 				    format_page->bytes_per_sector);
 
 			page_index->page_data =
 				(uint8_t *)lun->mode_pages.format_page;
 			break;
 		}
 		case SMS_RIGID_DISK_PAGE: {
 			struct scsi_rigid_disk_page *rigid_disk_page;
 			uint32_t sectors_per_cylinder;
 			uint64_t cylinders;
 #ifndef	__XSCALE__
 			int shift;
 #endif /* !__XSCALE__ */
 
 			KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0,
 			    ("subpage %#x for page %#x is incorrect!",
 			    page_index->subpage, page_code));
 
 			/*
 			 * Rotation rate and sectors per track are set
 			 * above.  We calculate the cylinders here based on
 			 * capacity.  Due to the number of heads and
 			 * sectors per track we're using, smaller arrays
 			 * may turn out to have 0 cylinders.  Linux and
 			 * FreeBSD don't pay attention to these mode pages
 			 * to figure out capacity, but Solaris does.  It
 			 * seems to deal with 0 cylinders just fine, and
 			 * works out a fake geometry based on the capacity.
 			 */
 			memcpy(&lun->mode_pages.rigid_disk_page[
 			       CTL_PAGE_DEFAULT], &rigid_disk_page_default,
 			       sizeof(rigid_disk_page_default));
 			memcpy(&lun->mode_pages.rigid_disk_page[
 			       CTL_PAGE_CHANGEABLE],&rigid_disk_page_changeable,
 			       sizeof(rigid_disk_page_changeable));
 
 			sectors_per_cylinder = CTL_DEFAULT_SECTORS_PER_TRACK *
 				CTL_DEFAULT_HEADS;
 
 			/*
 			 * The divide method here will be more accurate,
 			 * probably, but results in floating point being
 			 * used in the kernel on i386 (__udivdi3()).  On the
 			 * XScale, though, __udivdi3() is implemented in
 			 * software.
 			 *
 			 * The shift method for cylinder calculation is
 			 * accurate if sectors_per_cylinder is a power of
 			 * 2.  Otherwise it might be slightly off -- you
 			 * might have a bit of a truncation problem.
 			 */
 #ifdef	__XSCALE__
 			cylinders = (lun->be_lun->maxlba + 1) /
 				sectors_per_cylinder;
 #else
 			for (shift = 31; shift > 0; shift--) {
 				if (sectors_per_cylinder & (1 << shift))
 					break;
 			}
 			cylinders = (lun->be_lun->maxlba + 1) >> shift;
 #endif
 
 			/*
 			 * We've basically got 3 bytes, or 24 bits for the
 			 * cylinder size in the mode page.  If we're over,
 			 * just round down to 2^24.
 			 */
 			if (cylinders > 0xffffff)
 				cylinders = 0xffffff;
 
 			rigid_disk_page = &lun->mode_pages.rigid_disk_page[
 				CTL_PAGE_DEFAULT];
 			scsi_ulto3b(cylinders, rigid_disk_page->cylinders);
 
 			if ((value = dnvlist_get_string(lun->be_lun->options,
 			    "rpm", NULL)) != NULL) {
 				scsi_ulto2b(strtol(value, NULL, 0),
 				     rigid_disk_page->rotation_rate);
 			}
 
 			memcpy(&lun->mode_pages.rigid_disk_page[CTL_PAGE_CURRENT],
 			       &lun->mode_pages.rigid_disk_page[CTL_PAGE_DEFAULT],
 			       sizeof(rigid_disk_page_default));
 			memcpy(&lun->mode_pages.rigid_disk_page[CTL_PAGE_SAVED],
 			       &lun->mode_pages.rigid_disk_page[CTL_PAGE_DEFAULT],
 			       sizeof(rigid_disk_page_default));
 
 			page_index->page_data =
 				(uint8_t *)lun->mode_pages.rigid_disk_page;
 			break;
 		}
 		case SMS_VERIFY_ERROR_RECOVERY_PAGE: {
 			KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0,
 			    ("subpage %#x for page %#x is incorrect!",
 			    page_index->subpage, page_code));
 			memcpy(&lun->mode_pages.verify_er_page[CTL_PAGE_CURRENT],
 			       &verify_er_page_default,
 			       sizeof(verify_er_page_default));
 			memcpy(&lun->mode_pages.verify_er_page[CTL_PAGE_CHANGEABLE],
 			       &verify_er_page_changeable,
 			       sizeof(verify_er_page_changeable));
 			memcpy(&lun->mode_pages.verify_er_page[CTL_PAGE_DEFAULT],
 			       &verify_er_page_default,
 			       sizeof(verify_er_page_default));
 			memcpy(&lun->mode_pages.verify_er_page[CTL_PAGE_SAVED],
 			       &verify_er_page_default,
 			       sizeof(verify_er_page_default));
 			page_index->page_data =
 				(uint8_t *)lun->mode_pages.verify_er_page;
 			break;
 		}
 		case SMS_CACHING_PAGE: {
 			struct scsi_caching_page *caching_page;
 
 			KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0,
 			    ("subpage %#x for page %#x is incorrect!",
 			    page_index->subpage, page_code));
 			memcpy(&lun->mode_pages.caching_page[CTL_PAGE_DEFAULT],
 			       &caching_page_default,
 			       sizeof(caching_page_default));
 			memcpy(&lun->mode_pages.caching_page[
 			       CTL_PAGE_CHANGEABLE], &caching_page_changeable,
 			       sizeof(caching_page_changeable));
 			memcpy(&lun->mode_pages.caching_page[CTL_PAGE_SAVED],
 			       &caching_page_default,
 			       sizeof(caching_page_default));
 			caching_page = &lun->mode_pages.caching_page[
 			    CTL_PAGE_SAVED];
 			value = dnvlist_get_string(lun->be_lun->options,
 			    "writecache", NULL);
 			if (value != NULL && strcmp(value, "off") == 0)
 				caching_page->flags1 &= ~SCP_WCE;
 			value = dnvlist_get_string(lun->be_lun->options,
 			    "readcache", NULL);
 			if (value != NULL && strcmp(value, "off") == 0)
 				caching_page->flags1 |= SCP_RCD;
 			memcpy(&lun->mode_pages.caching_page[CTL_PAGE_CURRENT],
 			       &lun->mode_pages.caching_page[CTL_PAGE_SAVED],
 			       sizeof(caching_page_default));
 			page_index->page_data =
 				(uint8_t *)lun->mode_pages.caching_page;
 			break;
 		}
 		case SMS_CONTROL_MODE_PAGE: {
 			switch (page_index->subpage) {
 			case SMS_SUBPAGE_PAGE_0: {
 				struct scsi_control_page *control_page;
 
 				memcpy(&lun->mode_pages.control_page[
 				    CTL_PAGE_DEFAULT],
 				       &control_page_default,
 				       sizeof(control_page_default));
 				memcpy(&lun->mode_pages.control_page[
 				    CTL_PAGE_CHANGEABLE],
 				       &control_page_changeable,
 				       sizeof(control_page_changeable));
 				memcpy(&lun->mode_pages.control_page[
 				    CTL_PAGE_SAVED],
 				       &control_page_default,
 				       sizeof(control_page_default));
 				control_page = &lun->mode_pages.control_page[
 				    CTL_PAGE_SAVED];
 				value = dnvlist_get_string(lun->be_lun->options,
 				    "reordering", NULL);
 				if (value != NULL &&
 				    strcmp(value, "unrestricted") == 0) {
 					control_page->queue_flags &=
 					    ~SCP_QUEUE_ALG_MASK;
 					control_page->queue_flags |=
 					    SCP_QUEUE_ALG_UNRESTRICTED;
 				}
 				memcpy(&lun->mode_pages.control_page[
 				    CTL_PAGE_CURRENT],
 				       &lun->mode_pages.control_page[
 				    CTL_PAGE_SAVED],
 				       sizeof(control_page_default));
 				page_index->page_data =
 				    (uint8_t *)lun->mode_pages.control_page;
 				break;
 			}
 			case 0x01:
 				memcpy(&lun->mode_pages.control_ext_page[
 				    CTL_PAGE_DEFAULT],
 				       &control_ext_page_default,
 				       sizeof(control_ext_page_default));
 				memcpy(&lun->mode_pages.control_ext_page[
 				    CTL_PAGE_CHANGEABLE],
 				       &control_ext_page_changeable,
 				       sizeof(control_ext_page_changeable));
 				memcpy(&lun->mode_pages.control_ext_page[
 				    CTL_PAGE_SAVED],
 				       &control_ext_page_default,
 				       sizeof(control_ext_page_default));
 				memcpy(&lun->mode_pages.control_ext_page[
 				    CTL_PAGE_CURRENT],
 				       &lun->mode_pages.control_ext_page[
 				    CTL_PAGE_SAVED],
 				       sizeof(control_ext_page_default));
 				page_index->page_data =
 				    (uint8_t *)lun->mode_pages.control_ext_page;
 				break;
 			default:
 				panic("subpage %#x for page %#x is incorrect!",
 				      page_index->subpage, page_code);
 			}
 			break;
 		}
 		case SMS_INFO_EXCEPTIONS_PAGE: {
 			switch (page_index->subpage) {
 			case SMS_SUBPAGE_PAGE_0:
 				memcpy(&lun->mode_pages.ie_page[CTL_PAGE_CURRENT],
 				       &ie_page_default,
 				       sizeof(ie_page_default));
 				memcpy(&lun->mode_pages.ie_page[
 				       CTL_PAGE_CHANGEABLE], &ie_page_changeable,
 				       sizeof(ie_page_changeable));
 				memcpy(&lun->mode_pages.ie_page[CTL_PAGE_DEFAULT],
 				       &ie_page_default,
 				       sizeof(ie_page_default));
 				memcpy(&lun->mode_pages.ie_page[CTL_PAGE_SAVED],
 				       &ie_page_default,
 				       sizeof(ie_page_default));
 				page_index->page_data =
 					(uint8_t *)lun->mode_pages.ie_page;
 				break;
 			case 0x02: {
 				struct ctl_logical_block_provisioning_page *page;
 
 				memcpy(&lun->mode_pages.lbp_page[CTL_PAGE_DEFAULT],
 				       &lbp_page_default,
 				       sizeof(lbp_page_default));
 				memcpy(&lun->mode_pages.lbp_page[
 				       CTL_PAGE_CHANGEABLE], &lbp_page_changeable,
 				       sizeof(lbp_page_changeable));
 				memcpy(&lun->mode_pages.lbp_page[CTL_PAGE_SAVED],
 				       &lbp_page_default,
 				       sizeof(lbp_page_default));
 				page = &lun->mode_pages.lbp_page[CTL_PAGE_SAVED];
 				value = dnvlist_get_string(lun->be_lun->options,
 				    "avail-threshold", NULL);
 				if (value != NULL &&
 				    ctl_expand_number(value, &ival) == 0) {
 					page->descr[0].flags |= SLBPPD_ENABLED |
 					    SLBPPD_ARMING_DEC;
 					if (lun->be_lun->blocksize)
 						ival /= lun->be_lun->blocksize;
 					else
 						ival /= 512;
 					scsi_ulto4b(ival >> CTL_LBP_EXPONENT,
 					    page->descr[0].count);
 				}
 				value = dnvlist_get_string(lun->be_lun->options,
 				    "used-threshold", NULL);
 				if (value != NULL &&
 				    ctl_expand_number(value, &ival) == 0) {
 					page->descr[1].flags |= SLBPPD_ENABLED |
 					    SLBPPD_ARMING_INC;
 					if (lun->be_lun->blocksize)
 						ival /= lun->be_lun->blocksize;
 					else
 						ival /= 512;
 					scsi_ulto4b(ival >> CTL_LBP_EXPONENT,
 					    page->descr[1].count);
 				}
 				value = dnvlist_get_string(lun->be_lun->options,
 				    "pool-avail-threshold", NULL);
 				if (value != NULL &&
 				    ctl_expand_number(value, &ival) == 0) {
 					page->descr[2].flags |= SLBPPD_ENABLED |
 					    SLBPPD_ARMING_DEC;
 					if (lun->be_lun->blocksize)
 						ival /= lun->be_lun->blocksize;
 					else
 						ival /= 512;
 					scsi_ulto4b(ival >> CTL_LBP_EXPONENT,
 					    page->descr[2].count);
 				}
 				value = dnvlist_get_string(lun->be_lun->options,
 				    "pool-used-threshold", NULL);
 				if (value != NULL &&
 				    ctl_expand_number(value, &ival) == 0) {
 					page->descr[3].flags |= SLBPPD_ENABLED |
 					    SLBPPD_ARMING_INC;
 					if (lun->be_lun->blocksize)
 						ival /= lun->be_lun->blocksize;
 					else
 						ival /= 512;
 					scsi_ulto4b(ival >> CTL_LBP_EXPONENT,
 					    page->descr[3].count);
 				}
 				memcpy(&lun->mode_pages.lbp_page[CTL_PAGE_CURRENT],
 				       &lun->mode_pages.lbp_page[CTL_PAGE_SAVED],
 				       sizeof(lbp_page_default));
 				page_index->page_data =
 					(uint8_t *)lun->mode_pages.lbp_page;
 				break;
 			}
 			default:
 				panic("subpage %#x for page %#x is incorrect!",
 				      page_index->subpage, page_code);
 			}
 			break;
 		}
 		case SMS_CDDVD_CAPS_PAGE:{
 			KASSERT(page_index->subpage == SMS_SUBPAGE_PAGE_0,
 			    ("subpage %#x for page %#x is incorrect!",
 			    page_index->subpage, page_code));
 			memcpy(&lun->mode_pages.cddvd_page[CTL_PAGE_DEFAULT],
 			       &cddvd_page_default,
 			       sizeof(cddvd_page_default));
 			memcpy(&lun->mode_pages.cddvd_page[
 			       CTL_PAGE_CHANGEABLE], &cddvd_page_changeable,
 			       sizeof(cddvd_page_changeable));
 			memcpy(&lun->mode_pages.cddvd_page[CTL_PAGE_SAVED],
 			       &cddvd_page_default,
 			       sizeof(cddvd_page_default));
 			memcpy(&lun->mode_pages.cddvd_page[CTL_PAGE_CURRENT],
 			       &lun->mode_pages.cddvd_page[CTL_PAGE_SAVED],
 			       sizeof(cddvd_page_default));
 			page_index->page_data =
 				(uint8_t *)lun->mode_pages.cddvd_page;
 			break;
 		}
 		default:
 			panic("invalid page code value %#x", page_code);
 		}
 	}
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Build the per-LUN log page index from log_page_index_template.
  *
  * Every template entry is examined.  Entries whose page_flags lack the
  * flag matching this LUN's type (direct access, processor or CD-ROM) are
  * skipped, and so is the logical block provisioning page when
  * lun->backend->lun_attr is NULL.  For each remaining entry the page code
  * is appended to pages_page (once per run of equal codes) and the
  * page code/subpage pair is appended to subpages_page.  Afterwards the
  * data pointer and length of the fixed index slots are filled in.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_init_log_page_index(struct ctl_lun *lun)
 {
 	struct ctl_page_index *page_index;
 	int i, j, k, prev;
 
 	memcpy(&lun->log_pages.index, log_page_index_template,
 	       sizeof(log_page_index_template));
 
 	/* -1 means no page code has been recorded yet. */
 	prev = -1;
 	/* j counts entries in pages_page, k counts pairs in subpages_page. */
 	for (i = 0, j = 0, k = 0; i < CTL_NUM_LOG_PAGES; i++) {
 
 		page_index = &lun->log_pages.index[i];
 		if (lun->be_lun->lun_type == T_DIRECT &&
 		    (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0)
 			continue;
 		if (lun->be_lun->lun_type == T_PROCESSOR &&
 		    (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0)
 			continue;
 		if (lun->be_lun->lun_type == T_CDROM &&
 		    (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0)
 			continue;
 
 		if (page_index->page_code == SLS_LOGICAL_BLOCK_PROVISIONING &&
 		    lun->backend->lun_attr == NULL)
 			continue;
 
 		/* List each page code only once, however many subpages. */
 		if (page_index->page_code != prev) {
 			lun->log_pages.pages_page[j] = page_index->page_code;
 			prev = page_index->page_code;
 			j++;
 		}
 		lun->log_pages.subpages_page[k*2] = page_index->page_code;
 		lun->log_pages.subpages_page[k*2+1] = page_index->subpage;
 		k++;
 	}
 	/*
 	 * Slots 0 and 1 expose the two lists built above.  The remaining
 	 * slot numbers are hard-coded; presumably they have to match the
 	 * order of the entries in log_page_index_template -- confirm when
 	 * adding or reordering pages.
 	 */
 	lun->log_pages.index[0].page_data = &lun->log_pages.pages_page[0];
 	lun->log_pages.index[0].page_len = j;
 	lun->log_pages.index[1].page_data = &lun->log_pages.subpages_page[0];
 	lun->log_pages.index[1].page_len = k * 2;
-	lun->log_pages.index[2].page_data = &lun->log_pages.lbp_page[0];
-	lun->log_pages.index[2].page_len = 12*CTL_NUM_LBP_PARAMS;
-	lun->log_pages.index[3].page_data = (uint8_t *)&lun->log_pages.stat_page;
-	lun->log_pages.index[3].page_len = sizeof(lun->log_pages.stat_page);
-	lun->log_pages.index[4].page_data = (uint8_t *)&lun->log_pages.ie_page;
-	lun->log_pages.index[4].page_len = sizeof(lun->log_pages.ie_page);
+	lun->log_pages.index[2].page_data = (uint8_t *)&lun->log_pages.temp_page;
+	lun->log_pages.index[2].page_len = sizeof(lun->log_pages.temp_page);
+	lun->log_pages.index[3].page_data = &lun->log_pages.lbp_page[0];
+	lun->log_pages.index[3].page_len = 12*CTL_NUM_LBP_PARAMS;
+	lun->log_pages.index[4].page_data = (uint8_t *)&lun->log_pages.stat_page;
+	lun->log_pages.index[4].page_len = sizeof(lun->log_pages.stat_page);
+	lun->log_pages.index[5].page_data = (uint8_t *)&lun->log_pages.ie_page;
+	lun->log_pages.index[5].page_len = sizeof(lun->log_pages.ie_page);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Decode a hexadecimal string into binary.
  *
  * buf is cleared over buf_size bytes first.  Leading white space and an
  * optional "0x"/"0X" prefix are ignored, and dashes (as found in UUIDs)
  * are skipped without counting as digits.  Decoding stops at the end of
  * the string, at the first character that is not a hex digit, or once
  * buf_size bytes worth of digits have been stored.  The first digit of
  * each pair lands in the high nibble, so an odd trailing digit fills only
  * the high half of its byte.
  *
  * Returns the number of bytes that received at least one digit.
  */
 static int
 hex2bin(const char *str, uint8_t *buf, int buf_size)
 {
 	const char *p;
 	int nibble_limit, pos;
 	unsigned char digit;
 
 	memset(buf, 0, buf_size);
 
 	/* Step over leading blanks, then over an optional hex prefix. */
 	for (p = str; isspace(p[0]); p++)
 		continue;
 	if (p[0] == '0' && (p[1] == 'x' || p[1] == 'X'))
 		p += 2;
 
 	nibble_limit = buf_size * 2;
 	for (pos = 0; p[pos] != 0 && pos < nibble_limit; pos++) {
 		/* Advance the base pointer so dashes do not use up a slot. */
 		for (; p[pos] == '-'; p++)
 			continue;
 		digit = p[pos];
 		if (isdigit(digit))
 			digit -= '0';
 		else if (isalpha(digit)) {
 			if (isupper(digit))
 				digit -= 'A' - 10;
 			else
 				digit -= 'a' - 10;
 		} else
 			break;
 		/* Letters past 'f'/'F' are not hex digits. */
 		if (digit >= 16)
 			break;
 		if (pos & 1)
 			buf[pos / 2] |= digit;
 		else
 			buf[pos / 2] |= (digit << 4);
 	}
 	return ((pos + 1) / 2);
 }
 
 /*
  * LUN allocation.
  *
  * Requirements:
  * - caller provides LUN storage in ctl_lun, or passes in a NULL LUN if it
  *   wants us to allocate the LUN and it can block.  The storage is zeroed
  *   here in either case.
  * - ctl_softc is always set
  * - be_lun must be set; a NULL be_lun is rejected.
  *
  * The backend is told about the outcome through be_lun->lun_config_status()
  * (CTL_LUN_CONFIG_OK or CTL_LUN_CONFIG_FAILURE), except when be_lun is NULL.
  *
  * Returns 0 for success, non-zero (errno) for failure:
  * - EINVAL if be_lun is NULL or its LUN type is not supported,
  * - ENOSPC if the requested LUN number is out of range or in use, or if
  *   no LUN number is free.
  */
 static int
 ctl_alloc_lun(struct ctl_softc *ctl_softc, struct ctl_lun *ctl_lun,
 	      struct ctl_be_lun *const be_lun)
 {
 	struct ctl_lun *nlun, *lun;
 	struct scsi_vpd_id_descriptor *desc;
 	struct scsi_vpd_id_t10 *t10id;
 	const char *eui, *naa, *scsiname, *uuid, *vendor, *value;
 	int lun_number, lun_malloced;
 	int devidlen, idlen1, idlen2 = 0, len;
 
 	if (be_lun == NULL)
 		return (EINVAL);
 
 	/*
 	 * We currently only support Direct Access, Processor and CD-ROM
 	 * LUN types.  Anything else is reported to the backend and rejected
 	 * right away, before any resource is allocated; carrying on would
 	 * create the LUN and report success after having reported failure.
 	 */
 	switch (be_lun->lun_type) {
 	case T_DIRECT:
 	case T_PROCESSOR:
 	case T_CDROM:
 		break;
 	case T_SEQUENTIAL:
 	case T_CHANGER:
 	default:
 		be_lun->lun_config_status(be_lun->be_lun,
 					  CTL_LUN_CONFIG_FAILURE);
 		return (EINVAL);
 	}
 	if (ctl_lun == NULL) {
 		lun = malloc(sizeof(*lun), M_CTL, M_WAITOK);
 		lun_malloced = 1;
 	} else {
 		lun_malloced = 0;
 		lun = ctl_lun;
 	}
 
 	memset(lun, 0, sizeof(*lun));
 	if (lun_malloced)
 		lun->flags = CTL_LUN_MALLOCED;
 
 	/* Per-port pointer arrays; the entries start out NULL. */
 	lun->pending_sense = malloc(sizeof(struct scsi_sense_data *) *
 	    ctl_max_ports, M_DEVBUF, M_WAITOK | M_ZERO);
 	lun->pending_ua = malloc(sizeof(ctl_ua_type *) * ctl_max_ports,
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 	lun->pr_keys = malloc(sizeof(uint64_t *) * ctl_max_ports,
 	    M_DEVBUF, M_WAITOK | M_ZERO);
 
 	/*
 	 * Generate LUN ID.  First work out the total size: the mandatory
 	 * T10 descriptor plus one descriptor per optional identifier
 	 * ("scsiname", "eui", "naa", "uuid") found in the backend options.
 	 */
 	devidlen = max(CTL_DEVID_MIN_LEN,
 	    strnlen(be_lun->device_id, CTL_DEVID_LEN));
 	idlen1 = sizeof(*t10id) + devidlen;
 	len = sizeof(struct scsi_vpd_id_descriptor) + idlen1;
 	scsiname = dnvlist_get_string(be_lun->options, "scsiname", NULL);
 	if (scsiname != NULL) {
 		idlen2 = roundup2(strlen(scsiname) + 1, 4);
 		len += sizeof(struct scsi_vpd_id_descriptor) + idlen2;
 	}
 	eui = dnvlist_get_string(be_lun->options, "eui", NULL);
 	if (eui != NULL) {
 		len += sizeof(struct scsi_vpd_id_descriptor) + 16;
 	}
 	naa = dnvlist_get_string(be_lun->options, "naa", NULL);
 	if (naa != NULL) {
 		len += sizeof(struct scsi_vpd_id_descriptor) + 16;
 	}
 	uuid = dnvlist_get_string(be_lun->options, "uuid", NULL);
 	if (uuid != NULL) {
 		len += sizeof(struct scsi_vpd_id_descriptor) + 18;
 	}
 	lun->lun_devid = malloc(sizeof(struct ctl_devid) + len,
 	    M_CTL, M_WAITOK | M_ZERO);
 
 	/* T10 vendor ID descriptor: space-padded vendor + device ID. */
 	desc = (struct scsi_vpd_id_descriptor *)lun->lun_devid->data;
 	desc->proto_codeset = SVPD_ID_CODESET_ASCII;
 	desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN | SVPD_ID_TYPE_T10;
 	desc->length = idlen1;
 	t10id = (struct scsi_vpd_id_t10 *)&desc->identifier[0];
 	memset(t10id->vendor, ' ', sizeof(t10id->vendor));
 	if ((vendor = dnvlist_get_string(be_lun->options, "vendor", NULL)) == NULL) {
 		strncpy((char *)t10id->vendor, CTL_VENDOR, sizeof(t10id->vendor));
 	} else {
 		/* Copy without the terminator so the padding is kept. */
 		strncpy(t10id->vendor, vendor,
 		    min(sizeof(t10id->vendor), strlen(vendor)));
 	}
 	strncpy((char *)t10id->vendor_spec_id,
 	    (char *)be_lun->device_id, devidlen);
 
 	/* Each optional descriptor is placed right after the previous one. */
 	if (scsiname != NULL) {
 		desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 		    desc->length);
 		desc->proto_codeset = SVPD_ID_CODESET_UTF8;
 		desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN |
 		    SVPD_ID_TYPE_SCSI_NAME;
 		desc->length = idlen2;
 		strlcpy(desc->identifier, scsiname, idlen2);
 	}
 	if (eui != NULL) {
 		desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 		    desc->length);
 		desc->proto_codeset = SVPD_ID_CODESET_BINARY;
 		desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN |
 		    SVPD_ID_TYPE_EUI64;
 		desc->length = hex2bin(eui, desc->identifier, 16);
 		/* Round up to 8, 12 or 16 bytes and give back the rest. */
 		desc->length = desc->length > 12 ? 16 :
 		    (desc->length > 8 ? 12 : 8);
 		len -= 16 - desc->length;
 	}
 	if (naa != NULL) {
 		desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 		    desc->length);
 		desc->proto_codeset = SVPD_ID_CODESET_BINARY;
 		desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN |
 		    SVPD_ID_TYPE_NAA;
 		desc->length = hex2bin(naa, desc->identifier, 16);
 		/* Round up to 8 or 16 bytes and give back the rest. */
 		desc->length = desc->length > 8 ? 16 : 8;
 		len -= 16 - desc->length;
 	}
 	if (uuid != NULL) {
 		desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 		    desc->length);
 		desc->proto_codeset = SVPD_ID_CODESET_BINARY;
 		desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_LUN |
 		    SVPD_ID_TYPE_UUID;
 		desc->identifier[0] = 0x10;
 		hex2bin(uuid, &desc->identifier[2], 16);
 		desc->length = 18;
 	}
 	lun->lun_devid->len = len;
 
 	mtx_lock(&ctl_softc->ctl_lock);
 	/*
 	 * See if the caller requested a particular LUN number.  If so, see
 	 * if it is available.  Otherwise, allocate the first available LUN.
 	 */
 	if (be_lun->flags & CTL_LUN_FLAG_ID_REQ) {
 		if ((be_lun->req_lun_id > (ctl_max_luns - 1))
 		 || (ctl_is_set(ctl_softc->ctl_lun_mask, be_lun->req_lun_id))) {
 			mtx_unlock(&ctl_softc->ctl_lock);
 			if (be_lun->req_lun_id > (ctl_max_luns - 1)) {
 				printf("ctl: requested LUN ID %d is higher "
 				       "than ctl_max_luns - 1 (%d)\n",
 				       be_lun->req_lun_id, ctl_max_luns - 1);
 			} else {
 				/*
 				 * XXX KDM return an error, or just assign
 				 * another LUN ID in this case??
 				 */
 				printf("ctl: requested LUN ID %d is already "
 				       "in use\n", be_lun->req_lun_id);
 			}
 fail:
 			/*
 			 * Undo everything allocated so far, including the
 			 * per-port arrays, whose entries are still NULL.
 			 */
 			free(lun->lun_devid, M_CTL);
 			free(lun->pending_sense, M_DEVBUF);
 			free(lun->pending_ua, M_DEVBUF);
 			free(lun->pr_keys, M_DEVBUF);
 			if (lun->flags & CTL_LUN_MALLOCED)
 				free(lun, M_CTL);
 			be_lun->lun_config_status(be_lun->be_lun,
 						  CTL_LUN_CONFIG_FAILURE);
 			return (ENOSPC);
 		}
 		lun_number = be_lun->req_lun_id;
 	} else {
 		lun_number = ctl_ffz(ctl_softc->ctl_lun_mask, 0, ctl_max_luns);
 		if (lun_number == -1) {
 			mtx_unlock(&ctl_softc->ctl_lock);
 			printf("ctl: can't allocate LUN, out of LUNs\n");
 			goto fail;
 		}
 	}
 	ctl_set_mask(ctl_softc->ctl_lun_mask, lun_number);
 	mtx_unlock(&ctl_softc->ctl_lock);
 
 	mtx_init(&lun->lun_lock, "CTL LUN", NULL, MTX_DEF);
 	lun->lun = lun_number;
 	lun->be_lun = be_lun;
 	/*
 	 * LUNs come on line disabled, and must be enabled by the backend.
 	 */
 	lun->flags |= CTL_LUN_DISABLED;
 	lun->backend = be_lun->be;
 	be_lun->ctl_lun = lun;
 	be_lun->lun_id = lun_number;
 	atomic_add_int(&be_lun->be->num_luns, 1);
 
 	/* Carry the backend's initial state flags over to the LUN. */
 	if (be_lun->flags & CTL_LUN_FLAG_EJECTED)
 		lun->flags |= CTL_LUN_EJECTED;
 	if (be_lun->flags & CTL_LUN_FLAG_NO_MEDIA)
 		lun->flags |= CTL_LUN_NO_MEDIA;
 	if (be_lun->flags & CTL_LUN_FLAG_STOPPED)
 		lun->flags |= CTL_LUN_STOPPED;
 
 	if (be_lun->flags & CTL_LUN_FLAG_PRIMARY)
 		lun->flags |= CTL_LUN_PRIMARY_SC;
 
 	/* An explicit "removable" option wins; CD-ROMs default to on. */
 	value = dnvlist_get_string(be_lun->options, "removable", NULL);
 	if (value != NULL) {
 		if (strcmp(value, "on") == 0)
 			lun->flags |= CTL_LUN_REMOVABLE;
 	} else if (be_lun->lun_type == T_CDROM)
 		lun->flags |= CTL_LUN_REMOVABLE;
 
 	lun->ctl_softc = ctl_softc;
 #ifdef CTL_TIME_IO
 	lun->last_busy = getsbinuptime();
 #endif
 	TAILQ_INIT(&lun->ooa_queue);
 	STAILQ_INIT(&lun->error_list);
 	lun->ie_reported = 1;
 	callout_init_mtx(&lun->ie_callout, &lun->lun_lock, 0);
 	ctl_tpc_lun_init(lun);
 	if (lun->flags & CTL_LUN_REMOVABLE) {
 		/*
 		 * Judging by its size this is a mask with one bit per
 		 * initiator; zero it so it starts out consistent with
 		 * the zeroed rest of the LUN.
 		 */
 		lun->prevent = malloc((CTL_MAX_INITIATORS + 31) / 32 * 4,
 		    M_CTL, M_WAITOK | M_ZERO);
 	}
 
 	/*
 	 * Initialize the mode and log page index.
 	 */
 	ctl_init_page_index(lun);
 	ctl_init_log_page_index(lun);
 
 	/* Setup statistics gathering */
 	lun->stats.item = lun_number;
 
 	/*
 	 * Now, before we insert this lun on the lun list, set the lun
 	 * inventory changed UA for all other luns.
 	 */
 	mtx_lock(&ctl_softc->ctl_lock);
 	STAILQ_FOREACH(nlun, &ctl_softc->lun_list, links) {
 		mtx_lock(&nlun->lun_lock);
 		ctl_est_ua_all(nlun, -1, CTL_UA_LUN_CHANGE);
 		mtx_unlock(&nlun->lun_lock);
 	}
 	STAILQ_INSERT_TAIL(&ctl_softc->lun_list, lun, links);
 	ctl_softc->ctl_luns[lun_number] = lun;
 	ctl_softc->num_luns++;
 	mtx_unlock(&ctl_softc->ctl_lock);
 
 	lun->be_lun->lun_config_status(lun->be_lun->be_lun, CTL_LUN_CONFIG_OK);
 	return (0);
 }
 
 /*
  * Delete a LUN.
  * Assumptions:
  * - LUN has already been marked invalid and any pending I/O has been taken
  *   care of (asserted below: the OOA queue must be empty).
  *
  * The LUN is unlinked from the softc, every remaining LUN gets a
  * CTL_UA_LUN_CHANGE unit attention, the backend's lun_shutdown method is
  * called, and the LUN's resources are released.  Always returns 0.
  */
 static int
 ctl_free_lun(struct ctl_lun *lun)
 {
 	struct ctl_softc *softc = lun->ctl_softc;
 	struct ctl_lun *nlun;
 	int i;
 
 	KASSERT(TAILQ_EMPTY(&lun->ooa_queue),
 	    ("Freeing a LUN %p with outstanding I/O!\n", lun));
 
 	/* Unlink the LUN and release its number under the softc lock. */
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_REMOVE(&softc->lun_list, lun, ctl_lun, links);
 	ctl_clear_mask(softc->ctl_lun_mask, lun->lun);
 	softc->ctl_luns[lun->lun] = NULL;
 	softc->num_luns--;
 	/* Tell the LUNs that remain that the inventory changed. */
 	STAILQ_FOREACH(nlun, &softc->lun_list, links) {
 		mtx_lock(&nlun->lun_lock);
 		ctl_est_ua_all(nlun, -1, CTL_UA_LUN_CHANGE);
 		mtx_unlock(&nlun->lun_lock);
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	/*
 	 * Tell the backend to free resources, if this LUN has a backend.
 	 */
 	atomic_subtract_int(&lun->be_lun->be->num_luns, 1);
 	lun->be_lun->lun_shutdown(lun->be_lun->be_lun);
 
 	/*
 	 * Presumably UINT32_MAX keeps the informational exceptions callout
 	 * from rearming itself while it is drained -- confirm against the
 	 * callout handler.
 	 */
 	lun->ie_reportcnt = UINT32_MAX;
 	callout_drain(&lun->ie_callout);
 	ctl_tpc_lun_shutdown(lun);
 	mtx_destroy(&lun->lun_lock);
 	free(lun->lun_devid, M_CTL);
 	/*
 	 * The per-port arrays themselves were allocated with M_DEVBUF in
 	 * ctl_alloc_lun(); their entries are released with M_CTL.
 	 *
 	 * NOTE(review): lun->pending_sense is allocated alongside these
 	 * arrays in ctl_alloc_lun() but is not released here -- confirm
 	 * whether it (and its entries) is freed elsewhere or leaked.
 	 */
 	for (i = 0; i < ctl_max_ports; i++)
 		free(lun->pending_ua[i], M_CTL);
 	free(lun->pending_ua, M_DEVBUF);
 	for (i = 0; i < ctl_max_ports; i++)
 		free(lun->pr_keys[i], M_CTL);
 	free(lun->pr_keys, M_DEVBUF);
 	free(lun->write_buffer, M_CTL);
 	free(lun->prevent, M_CTL);
 	/* Only free the structure if ctl_alloc_lun() allocated it. */
 	if (lun->flags & CTL_LUN_MALLOCED)
 		free(lun, M_CTL);
 
 	return (0);
 }
 
 static void
 ctl_create_lun(struct ctl_be_lun *be_lun)
 {
 
 	/*
 	 * ctl_alloc_lun() should handle all potential failure cases.
 	 */
 	ctl_alloc_lun(control_softc, NULL, be_lun);
 }
 
 /*
  * Queue a backend LUN on the softc's pending LUN queue and wake up
  * whoever sleeps on that queue.  Always returns 0.
  */
 int
 ctl_add_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_softc *sc;
 
 	sc = control_softc;
 
 	/* The queue is protected by the softc lock. */
 	mtx_lock(&sc->ctl_lock);
 	STAILQ_INSERT_TAIL(&sc->pending_lun_queue, be_lun, links);
 	mtx_unlock(&sc->ctl_lock);
 
 	wakeup(&sc->pending_lun_queue);
 	return (0);
 }
 
 /*
  * Enable a LUN on behalf of its backend.
  *
  * Clears CTL_LUN_DISABLED and then calls the lun_enable method of every
  * port that is online, has no LUN map and provides such a method.  A
  * failure reported by a port is only printed.  Finally the LUN is
  * announced through ctl_isc_announce_lun().
  *
  * Always returns 0, also when the LUN was already enabled.
  */
 int
 ctl_enable_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_softc *softc;
 	struct ctl_port *port, *nport;
 	struct ctl_lun *lun;
 	int retval;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 	softc = lun->ctl_softc;
 
 	/* Lock order: softc lock first, then the LUN lock. */
 	mtx_lock(&softc->ctl_lock);
 	mtx_lock(&lun->lun_lock);
 	if ((lun->flags & CTL_LUN_DISABLED) == 0) {
 		/*
 		 * eh?  Why did we get called if the LUN is already
 		 * enabled?
 		 */
 		mtx_unlock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 		return (0);
 	}
 	lun->flags &= ~CTL_LUN_DISABLED;
 	mtx_unlock(&lun->lun_lock);
 
 	/* The softc lock is still held while walking the port list. */
 	STAILQ_FOREACH_SAFE(port, &softc->port_list, links, nport) {
 		if ((port->status & CTL_PORT_STATUS_ONLINE) == 0 ||
 		    port->lun_map != NULL || port->lun_enable == NULL)
 			continue;
 
 		/*
 		 * Drop the lock while we call the FETD's enable routine.
 		 * This can lead to a callback into CTL (at least in the
 		 * case of the internal initiator frontend).
 		 */
 		mtx_unlock(&softc->ctl_lock);
 		retval = port->lun_enable(port->targ_lun_arg, lun->lun);
 		mtx_lock(&softc->ctl_lock);
 		if (retval != 0) {
 			printf("%s: FETD %s port %d returned error "
 			       "%d for lun_enable on lun %jd\n",
 			       __func__, port->port_name, port->targ_port,
 			       retval, (intmax_t)lun->lun);
 		}
 	}
 
 	mtx_unlock(&softc->ctl_lock);
 	ctl_isc_announce_lun(lun);
 
 	return (0);
 }
 
 /*
  * Disable a LUN on behalf of its backend.
  *
  * Sets CTL_LUN_DISABLED and then calls the lun_disable method of every
  * port that is online, has no LUN map and provides such a method.  A
  * failure reported by a port is only printed.  Finally the LUN is
  * announced through ctl_isc_announce_lun().
  *
  * Always returns 0, also when the LUN was already disabled.
  */
 int
 ctl_disable_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_softc *softc;
 	struct ctl_port *port;
 	struct ctl_lun *lun;
 	int retval;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 	softc = lun->ctl_softc;
 
 	/* Lock order: softc lock first, then the LUN lock. */
 	mtx_lock(&softc->ctl_lock);
 	mtx_lock(&lun->lun_lock);
 	if (lun->flags & CTL_LUN_DISABLED) {
 		/* Nothing to do, the LUN is already disabled. */
 		mtx_unlock(&lun->lun_lock);
 		mtx_unlock(&softc->ctl_lock);
 		return (0);
 	}
 	lun->flags |= CTL_LUN_DISABLED;
 	mtx_unlock(&lun->lun_lock);
 
 	/* The softc lock is still held while walking the port list. */
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		if ((port->status & CTL_PORT_STATUS_ONLINE) == 0 ||
 		    port->lun_map != NULL || port->lun_disable == NULL)
 			continue;
 
 		/*
 		 * Drop the lock before we call the frontend's disable
 		 * routine, to avoid lock order reversals.
 		 *
 		 * XXX KDM what happens if the frontend list changes while
 		 * we're traversing it?  It's unlikely, but should be handled.
 		 */
 		mtx_unlock(&softc->ctl_lock);
 		retval = port->lun_disable(port->targ_lun_arg, lun->lun);
 		mtx_lock(&softc->ctl_lock);
 		if (retval != 0) {
 			printf("%s: FETD %s port %d returned error "
 			       "%d for lun_disable on lun %jd\n",
 			       __func__, port->port_name, port->targ_port,
 			       retval, (intmax_t)lun->lun);
 		}
 	}
 
 	mtx_unlock(&softc->ctl_lock);
 	ctl_isc_announce_lun(lun);
 
 	return (0);
 }
 
 /*
  * Clear the CTL_LUN_STOPPED flag of the LUN behind be_lun.
  * Always returns 0.
  */
 int
 ctl_start_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	/* The flags are changed under the LUN lock. */
 	mtx_lock(&lun->lun_lock);
 	lun->flags &= ~CTL_LUN_STOPPED;
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Set the CTL_LUN_STOPPED flag of the LUN behind be_lun.
  * Always returns 0.
  */
 int
 ctl_stop_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	/* The flags are changed under the LUN lock. */
 	mtx_lock(&lun->lun_lock);
 	lun->flags |= CTL_LUN_STOPPED;
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Set the CTL_LUN_NO_MEDIA flag of the LUN behind be_lun.
  * Always returns 0.
  */
 int
 ctl_lun_no_media(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	/* The flags are changed under the LUN lock. */
 	mtx_lock(&lun->lun_lock);
 	lun->flags |= CTL_LUN_NO_MEDIA;
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Record that the LUN behind be_lun has media again.
  *
  * Clears CTL_LUN_NO_MEDIA and CTL_LUN_EJECTED.  For a removable LUN a
  * CTL_UA_MEDIUM_CHANGE unit attention is established for all initiators
  * and, in CTL_HA_MODE_XFER mode, the same unit attention is sent over
  * the HA channel.  Always returns 0.
  */
 int
 ctl_lun_has_media(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 	union ctl_ha_msg msg;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&lun->lun_lock);
 	lun->flags &= ~(CTL_LUN_NO_MEDIA | CTL_LUN_EJECTED);
 	if (lun->flags & CTL_LUN_REMOVABLE)
 		ctl_est_ua_all(lun, -1, CTL_UA_MEDIUM_CHANGE);
 	mtx_unlock(&lun->lun_lock);
 
 	/* Nothing to forward unless removable and in XFER HA mode. */
 	if ((lun->flags & CTL_LUN_REMOVABLE) == 0 ||
 	    lun->ctl_softc->ha_mode != CTL_HA_MODE_XFER)
 		return (0);
 
 	/* Build a "set this UA for everybody" message for the peer. */
 	bzero(&msg.ua, sizeof(msg.ua));
 	msg.hdr.msg_type = CTL_MSG_UA;
 	msg.hdr.nexus.initid = -1;
 	msg.hdr.nexus.targ_port = -1;
 	msg.hdr.nexus.targ_lun = lun->lun;
 	msg.hdr.nexus.targ_mapped_lun = lun->lun;
 	msg.ua.ua_all = 1;
 	msg.ua.ua_set = 1;
 	msg.ua.ua_type = CTL_UA_MEDIUM_CHANGE;
 	ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.ua),
 	    M_WAITOK);
 
 	return (0);
 }
 
 /*
  * Set the CTL_LUN_EJECTED flag of the LUN behind be_lun.
  * Always returns 0.
  */
 int
 ctl_lun_ejected(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	/* The flags are changed under the LUN lock. */
 	mtx_lock(&lun->lun_lock);
 	lun->flags |= CTL_LUN_EJECTED;
 	mtx_unlock(&lun->lun_lock);
 
 	return (0);
 }
 
 /*
  * Set CTL_LUN_PRIMARY_SC on the LUN behind be_lun, establish a
  * CTL_UA_ASYM_ACC_CHANGE unit attention for all initiators and announce
  * the LUN through ctl_isc_announce_lun().  Always returns 0.
  */
 int
 ctl_lun_primary(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	/* Flag change and unit attention happen under the LUN lock. */
 	mtx_lock(&lun->lun_lock);
 	lun->flags |= CTL_LUN_PRIMARY_SC;
 	ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE);
 	mtx_unlock(&lun->lun_lock);
 
 	/* The announcement is made after the lock has been dropped. */
 	ctl_isc_announce_lun(lun);
 
 	return (0);
 }
 
 /*
  * Clear CTL_LUN_PRIMARY_SC on the LUN behind be_lun, establish a
  * CTL_UA_ASYM_ACC_CHANGE unit attention for all initiators and announce
  * the LUN through ctl_isc_announce_lun().  Always returns 0.
  */
 int
 ctl_lun_secondary(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	/* Flag change and unit attention happen under the LUN lock. */
 	mtx_lock(&lun->lun_lock);
 	lun->flags &= ~CTL_LUN_PRIMARY_SC;
 	ctl_est_ua_all(lun, -1, CTL_UA_ASYM_ACC_CHANGE);
 	mtx_unlock(&lun->lun_lock);
 
 	/* The announcement is made after the lock has been dropped. */
 	ctl_isc_announce_lun(lun);
 
 	return (0);
 }
 
 /*
  * Mark the LUN behind be_lun invalid and free it if it is idle.
  *
  * Returns -1 if the LUN has not been disabled first, 0 otherwise.
  */
 int
 ctl_invalidate_lun(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun = (struct ctl_lun *)be_lun->ctl_lun;
 	int idle;
 
 	mtx_lock(&lun->lun_lock);
 
 	/* Only a disabled LUN may be marked invalid. */
 	if ((lun->flags & CTL_LUN_DISABLED) == 0) {
 		mtx_unlock(&lun->lun_lock);
 		return (-1);
 	}
 
 	lun->flags |= CTL_LUN_INVALID;
 
 	/*
 	 * With nothing in the OOA queue the LUN can be freed right away.
 	 * Otherwise it is freed when the last I/O completes.
 	 */
 	idle = TAILQ_EMPTY(&lun->ooa_queue);
 	mtx_unlock(&lun->lun_lock);
 	if (idle)
 		ctl_free_lun(lun);
 
 	return (0);
 }
 
 /*
  * Establish a CTL_UA_CAPACITY_CHANGE unit attention for all initiators
  * of the LUN behind be_lun and, in CTL_HA_MODE_XFER mode, send the same
  * unit attention over the HA channel.
  */
 void
 ctl_lun_capacity_changed(struct ctl_be_lun *be_lun)
 {
 	struct ctl_lun *lun;
 	union ctl_ha_msg msg;
 
 	lun = (struct ctl_lun *)be_lun->ctl_lun;
 
 	mtx_lock(&lun->lun_lock);
 	ctl_est_ua_all(lun, -1, CTL_UA_CAPACITY_CHANGE);
 	mtx_unlock(&lun->lun_lock);
 
 	/* Only the XFER HA mode needs the other side to be told. */
 	if (lun->ctl_softc->ha_mode != CTL_HA_MODE_XFER)
 		return;
 
 	/* Send msg to other side. */
 	bzero(&msg.ua, sizeof(msg.ua));
 	msg.hdr.msg_type = CTL_MSG_UA;
 	msg.hdr.nexus.initid = -1;
 	msg.hdr.nexus.targ_port = -1;
 	msg.hdr.nexus.targ_lun = lun->lun;
 	msg.hdr.nexus.targ_mapped_lun = lun->lun;
 	msg.ua.ua_all = 1;
 	msg.ua.ua_set = 1;
 	msg.ua.ua_type = CTL_UA_CAPACITY_CHANGE;
 	ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg, sizeof(msg.ua),
 	    M_WAITOK);
 }
 
 /*
  * Backend "memory move is complete" callback for requests that never
  * make it down to say RAIDCore's configuration code.
  *
  * A transfer that ended with a port error, or a data-out transfer that
  * left a residual, is turned into an error status as long as no other
  * status has been set yet.  The I/O is then either completed here
  * (returning CTL_RETVAL_COMPLETE) or handed to ctl_scsiio() again, whose
  * return value is passed on.
  */
 int
 ctl_config_move_done(union ctl_io *io)
 {
 
 	CTL_DEBUG_PRINT(("ctl_config_move_done\n"));
 	KASSERT(io->io_hdr.io_type == CTL_IO_SCSI,
 	    ("Config I/O type isn't CTL_IO_SCSI (%d)!", io->io_hdr.io_type));
 
 	/* Only report a transfer problem if no status is set so far. */
 	if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
 	    (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS) {
 		if (io->io_hdr.port_status != 0) {
 			ctl_set_internal_failure(&io->scsiio, /*sks_valid*/ 1,
 			    /*retry_count*/ io->io_hdr.port_status);
 		} else if (io->scsiio.kern_data_resid != 0 &&
 		    (io->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
 		    CTL_FLAG_DATA_OUT) {
 			ctl_set_invalid_field_ciu(&io->scsiio);
 		}
 	}
 
 	if (ctl_debug & CTL_DEBUG_CDB_DATA)
 		ctl_data_print(io);
 
 	/*
 	 * A healthy, non-aborted transfer that is not data-in still has
 	 * to be processed.
 	 *
 	 * XXX KDM call ctl_scsiio() again for now, and check flag bits
 	 * to see whether we're allocated or not.  For config writes the
 	 * information about where the data goes is largely contained
 	 * within the write itself, so the data has to be parsed again.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) != CTL_FLAG_DATA_IN &&
 	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
 	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS) &&
 	    (io->io_hdr.flags & CTL_FLAG_ABORT) == 0)
 		return (ctl_scsiio(&io->scsiio));
 
 	/*
 	 * Data-in, failed or aborted: finish the I/O here.
 	 *
 	 * XXX KDM just assuming a single pointer here, and not a
 	 * S/G list.  If we start using S/G lists for config data,
 	 * we'll need to know how to clean them up here as well.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_ALLOCATED)
 		free(io->scsiio.kern_data_ptr, M_CTL);
 	ctl_done(io);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * This gets called by a backend driver when it is done with a
  * data_submit method.
  */
 void
 ctl_data_submit_done(union ctl_io *io)
 {
 	/*
 	 * Complete the I/O right away unless the IO_CONT flag asks for
 	 * further processing.  An aborted or failed I/O is never
 	 * continued; its status just goes back to the initiator.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_IO_CONT) == 0 ||
 	    (io->io_hdr.flags & CTL_FLAG_ABORT) != 0 ||
 	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
 	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
 		ctl_done(io);
 		return;
 	}
 
 	/* Hand the I/O to the supplied continuation function. */
 	io->scsiio.io_cont(io);
 }
 
 /*
  * This gets called by a backend driver when it is done with a
  * configuration write.
  */
 void
 ctl_config_write_done(union ctl_io *io)
 {
 	uint8_t *buf = NULL;
 
 	/*
 	 * With the IO_CONT flag set, a healthy and non-aborted I/O is
 	 * passed to the supplied function for further processing instead
 	 * of being completed.  On error we just send status back to the
 	 * initiator.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_IO_CONT) &&
 	    (io->io_hdr.flags & CTL_FLAG_ABORT) == 0 &&
 	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
 	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS)) {
 		io->scsiio.io_cont(io);
 		return;
 	}
 
 	/*
 	 * Configuration writes come with allocated data (write buffer)
 	 * or without any (start/stop unit), so check the flag.  The
 	 * pointer is picked up before ctl_done() and released after it.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_ALLOCATED)
 		buf = io->scsiio.kern_data_ptr;
 	ctl_done(io);
 	if (buf != NULL)
 		free(buf, M_CTL);
 }
 
 /*
  * Called when a configuration read has its data ready: complete the I/O
  * on error, continue processing if the IO_CONT flag is set, or start the
  * data transfer otherwise.
  */
 void
 ctl_config_read_done(union ctl_io *io)
 {
 	uint8_t *buf = NULL;
 
 	/*
 	 * If there is some error -- we are done, skip data transfer.
 	 * The data pointer is picked up before ctl_done() and released
 	 * after it.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0 ||
 	    ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
 	     (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS)) {
 		if (io->io_hdr.flags & CTL_FLAG_ALLOCATED)
 			buf = io->scsiio.kern_data_ptr;
 		ctl_done(io);
 		if (buf != NULL)
 			free(buf, M_CTL);
 		return;
 	}
 
 	/*
 	 * With the IO_CONT flag set the supplied function continues
 	 * processing the I/O; without it the data is moved now.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_IO_CONT)
 		io->scsiio.io_cont(io);
 	else
 		ctl_datamove(io);
 }
 
 /*
  * SCSI release command.
  *
  * Drops the LUN reservation if it is held by the issuing initiator and
  * completes the command with success either way.
  */
 int
 ctl_scsi_release(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 	uint32_t residx;
 
 	CTL_DEBUG_PRINT(("ctl_scsi_release\n"));
 
 	lun = CTL_LUN(ctsio);
 	residx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 
 	/*
 	 * XXX KDM only LUN reservations are supported right now, no 3rd
 	 * party or extent reservations, which might actually need the
 	 * parameter list.  Anything else got kicked out before we got
 	 * here, so, according to SPC, the length is ignored.
 	 */
 
 	mtx_lock(&lun->lun_lock);
 
 	/*
 	 * According to SPC, it is not an error for an initiator to try
 	 * to release a reservation on a LUN that isn't reserved, or that
 	 * is reserved by another initiator.  Only the initiator that
 	 * made the reservation (or one of several reset type events) can
 	 * actually release it, though.
 	 */
 	if ((lun->flags & CTL_LUN_RESERVED) != 0 && lun->res_idx == residx)
 		lun->flags &= ~CTL_LUN_RESERVED;
 
 	mtx_unlock(&lun->lun_lock);
 
 	ctl_set_success(ctsio);
 	ctl_done((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * SCSI reserve command.
  *
  * Fails with a reservation conflict if another initiator holds the LUN
  * reservation.  Otherwise the command succeeds; the reservation is only
  * recorded when no persistent reservation is in place.
  */
 int
 ctl_scsi_reserve(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 	uint32_t residx;
 
 	CTL_DEBUG_PRINT(("ctl_reserve\n"));
 
 	lun = CTL_LUN(ctsio);
 	residx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 
 	/*
 	 * XXX KDM only LUN reservations are supported right now, no 3rd
 	 * party or extent reservations, which might actually need the
 	 * parameter list.  Anything else got kicked out before we got
 	 * here, so, according to SPC, the length is ignored.
 	 */
 
 	mtx_lock(&lun->lun_lock);
 	if ((lun->flags & CTL_LUN_RESERVED) && (lun->res_idx != residx)) {
 		ctl_set_reservation_conflict(ctsio);
 	} else {
 		/*
 		 * SPC-3 exceptions to SPC-2 RESERVE and RELEASE behavior:
 		 * with a persistent reservation present the command
 		 * succeeds without changing anything.
 		 */
 		if ((lun->flags & CTL_LUN_PR_RESERVED) == 0) {
 			lun->flags |= CTL_LUN_RESERVED;
 			lun->res_idx = residx;
 		}
 		ctl_set_success(ctsio);
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	ctl_done((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * START STOP UNIT command handler.
  *
  * Requests with a non-zero power condition field are handed straight to
  * the backend.  Otherwise the request is first checked against persistent
  * reservations, against load/eject on non-removable LUNs, and against
  * medium removal prevention; any failed check completes the command here
  * with the matching error.  Requests that pass are forwarded to the
  * backend's config_write method and its return value is returned.
  */
 int
 ctl_start_stop(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_start_stop_unit *cdb;
 	uint32_t initidx;
 	int how;
 
 	CTL_DEBUG_PRINT(("ctl_start_stop\n"));
 
 	cdb = (struct scsi_start_stop_unit *)ctsio->cdb;
 	how = cdb->how;
 
 	/* Power condition requests skip all of the checks below. */
 	if ((how & SSS_PC_MASK) != 0)
 		return (lun->backend->config_write((union ctl_io *)ctsio));
 
 	/*
 	 * A stop request on a LUN with a persistent reservation is a
 	 * conflict if this initiator has no registered key, or if it is
 	 * not the reservation holder and the reservation type is below 4.
 	 */
 	if ((lun->flags & CTL_LUN_PR_RESERVED) != 0 &&
 	    (how & SSS_START) == 0) {
 		initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 		if (ctl_get_prkey(lun, initidx) == 0 ||
 		    (lun->pr_res_idx != initidx && lun->pr_res_type < 4)) {
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 	}
 
 	/* Load/eject only makes sense for removable media. */
 	if ((how & SSS_LOEJ) != 0 &&
 	    (lun->flags & CTL_LUN_REMOVABLE) == 0) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 4,
 				      /*bit_valid*/ 1,
 				      /*bit*/ 1);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* An eject is refused while any initiator prevents removal. */
 	if ((how & SSS_START) == 0 && (how & SSS_LOEJ) != 0 &&
 	    lun->prevent_count > 0) {
 		/* "Medium removal prevented" */
 		ctl_set_sense(ctsio, /*current_error*/ 1,
 		    /*sense_key*/(lun->flags & CTL_LUN_NO_MEDIA) ?
 		     SSD_KEY_NOT_READY : SSD_KEY_ILLEGAL_REQUEST,
 		    /*asc*/ 0x53, /*ascq*/ 0x02, SSD_ELEM_NONE);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	return (lun->backend->config_write((union ctl_io *)ctsio));
 }
 
 /*
  * PREVENT ALLOW MEDIUM REMOVAL command handler.
  *
  * Rejected as an invalid opcode on LUNs that are not removable or that
  * have no per-initiator prevent mask.  Otherwise the sending initiator's
  * bit in the mask and the LUN-wide prevent counter are brought in line
  * with the request, and the command is passed to the backend's
  * config_write method, whose return value is returned.
  */
 int
 ctl_prevent_allow(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_prevent *cdb;
 	uint32_t initidx;
 
 	CTL_DEBUG_PRINT(("ctl_prevent_allow\n"));
 
 	cdb = (struct scsi_prevent *)ctsio->cdb;
 
 	if ((lun->flags & CTL_LUN_REMOVABLE) == 0 || lun->prevent == NULL) {
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 
 	/*
 	 * The counter only moves when this initiator's bit actually changes
 	 * state, so repeated identical requests are harmless.
 	 */
 	mtx_lock(&lun->lun_lock);
 	if (cdb->how & PR_PREVENT) {
 		if (ctl_is_set(lun->prevent, initidx) == 0) {
 			ctl_set_mask(lun->prevent, initidx);
 			lun->prevent_count++;
 		}
 	} else if (ctl_is_set(lun->prevent, initidx)) {
 		ctl_clear_mask(lun->prevent, initidx);
 		lun->prevent_count--;
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	return (lun->backend->config_write((union ctl_io *)ctsio));
 }
 
 /*
  * We support the SYNCHRONIZE CACHE command (10 and 16 byte versions), but
  * we don't really do anything with the LBA and length fields if the user
  * passes them in.  Instead we'll just flush out the cache for the entire
  * LUN.
  *
  * The LBA range from the CDB is validated against the LUN size (including
  * 64-bit wrap-around), stored in the I/O's private LBA/length area, and the
  * command is handed to the backend's config_write method.  Returns the
  * backend's return value, or 0 when the command was completed here with an
  * error.
  */
 int
 ctl_sync_cache(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct ctl_lba_len_flags *lbalen;
 	uint64_t starting_lba;
 	uint32_t block_count;
 	int retval;
 	uint8_t byte2;
 
 	CTL_DEBUG_PRINT(("ctl_sync_cache\n"));
 
 	retval = 0;
 
 	switch (ctsio->cdb[0]) {
 	case SYNCHRONIZE_CACHE: {
 		struct scsi_sync_cache *cdb;
 		cdb = (struct scsi_sync_cache *)ctsio->cdb;
 
 		starting_lba = scsi_4btoul(cdb->begin_lba);
 		block_count = scsi_2btoul(cdb->lb_count);
 		byte2 = cdb->byte2;
 		break;
 	}
 	case SYNCHRONIZE_CACHE_16: {
 		struct scsi_sync_cache_16 *cdb;
 		cdb = (struct scsi_sync_cache_16 *)ctsio->cdb;
 
 		starting_lba = scsi_8btou64(cdb->begin_lba);
 		block_count = scsi_4btoul(cdb->lb_count);
 		byte2 = cdb->byte2;
 		break;
 	}
 	default:
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		goto bailout;
 		break; /* NOTREACHED */
 	}
 
 	/*
 	 * We check the LBA and length, but don't do anything with them.
 	 * A SYNCHRONIZE CACHE will cause the entire cache for this lun to
 	 * get flushed.  This check will just help satisfy anyone who wants
 	 * to see an error for an out of range LBA.
 	 *
 	 * The second test catches wrap-around: with a 64-bit starting LBA
 	 * the sum can overflow and look like a small, valid block number.
 	 * If lba + count is less than lba the range is invalid anyway.
 	 */
 	if (((starting_lba + block_count) > (lun->be_lun->maxlba + 1))
 	 || ((starting_lba + block_count) < starting_lba)) {
 		ctl_set_lba_out_of_range(ctsio,
 		    MAX(starting_lba, lun->be_lun->maxlba + 1));
 		ctl_done((union ctl_io *)ctsio);
 		goto bailout;
 	}
 
 	/* Pass the parsed range and flags byte on to the backend. */
 	lbalen = (struct ctl_lba_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	lbalen->lba = starting_lba;
 	lbalen->len = block_count;
 	lbalen->flags = byte2;
 	retval = lun->backend->config_write((union ctl_io *)ctsio);
 
 bailout:
 	return (retval);
 }
 
 /*
  * FORMAT command handler.
  *
  * The function is entered twice when the CDB announces format data: the
  * first pass allocates a buffer for the (short or long) parameter list
  * header and starts the data transfer; the second pass, with
  * CTL_FLAG_ALLOCATED set, inspects the header.  A non-zero defect list
  * length is rejected as an invalid field; anything else succeeds.  The
  * buffer is released before the command is completed.
  */
 int
 ctl_format(struct ctl_scsiio *ctsio)
 {
 	struct scsi_format *cdb;
 	int hdr_len, defect_list_len;
 
 	CTL_DEBUG_PRINT(("ctl_format\n"));
 
 	cdb = (struct scsi_format *)ctsio->cdb;
 
 	/* Size of the parameter list header we expect, if any. */
 	if ((cdb->byte2 & SF_FMTDATA) == 0)
 		hdr_len = 0;
 	else if (cdb->byte2 & SF_LONGLIST)
 		hdr_len = sizeof(struct scsi_format_header_long);
 	else
 		hdr_len = sizeof(struct scsi_format_header_short);
 
 	/* First pass: fetch the header from the initiator. */
 	if (hdr_len > 0 &&
 	    (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
 		ctsio->kern_data_ptr = malloc(hdr_len, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = hdr_len;
 		ctsio->kern_total_len = hdr_len;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Second pass (or no data at all): pull out the defect list length. */
 	defect_list_len = 0;
 	if (cdb->byte2 & SF_FMTDATA) {
 		if (cdb->byte2 & SF_LONGLIST) {
 			struct scsi_format_header_long *lhdr;
 
 			lhdr = (struct scsi_format_header_long *)
 				ctsio->kern_data_ptr;
 			defect_list_len = scsi_4btoul(lhdr->defect_list_len);
 		} else {
 			struct scsi_format_header_short *shdr;
 
 			shdr = (struct scsi_format_header_short *)
 				ctsio->kern_data_ptr;
 			defect_list_len = scsi_2btoul(shdr->defect_list_len);
 		}
 	}
 
 	/* Defect lists are not accepted. */
 	if (defect_list_len != 0) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 0,
 				      /*field*/ 2,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 	} else {
 		ctl_set_success(ctsio);
 	}
 
 	if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) {
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED;
 	}
 
 	ctl_done((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * READ BUFFER (10 and 16 byte versions) command handler.
  *
  * After checking that the requested offset/length fit inside
  * CTL_WRITE_BUFFER_SIZE, the data source is picked by mode: a 4-byte
  * buffer descriptor, a 4-byte all-zero echo buffer descriptor, or the
  * LUN's write buffer (allocated on first use) at the requested offset.
  * The transfer is then started with success status already set.
  */
 int
 ctl_read_buffer(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	uint64_t offset;
 	uint32_t xfer_len;
 	uint8_t mode_byte;
 	static uint8_t buf_descr[4];
 	static uint8_t echo_buf_descr[4] = { 0 };
 
 	CTL_DEBUG_PRINT(("ctl_read_buffer\n"));
 
 	switch (ctsio->cdb[0]) {
 	case READ_BUFFER: {
 		struct scsi_read_buffer *cdb;
 
 		cdb = (struct scsi_read_buffer *)ctsio->cdb;
 		offset = scsi_3btoul(cdb->offset);
 		xfer_len = scsi_3btoul(cdb->length);
 		mode_byte = cdb->byte2;
 		break;
 	}
 	case READ_BUFFER_16: {
 		struct scsi_read_buffer_16 *cdb;
 
 		cdb = (struct scsi_read_buffer_16 *)ctsio->cdb;
 		offset = scsi_8btou64(cdb->offset);
 		xfer_len = scsi_4btoul(cdb->length);
 		mode_byte = cdb->byte2;
 		break;
 	}
 	default: /* This shouldn't happen. */
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * The offset is tested on its own first so that the sum below
 	 * cannot wrap for a huge 64-bit offset.
 	 */
 	if (offset > CTL_WRITE_BUFFER_SIZE ||
 	    offset + xfer_len > CTL_WRITE_BUFFER_SIZE) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 6,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	switch (mode_byte & RWB_MODE) {
 	case RWB_MODE_DESCR:
 		/* Byte 0 is zero, bytes 1-3 carry the buffer capacity. */
 		buf_descr[0] = 0;
 		scsi_ulto3b(CTL_WRITE_BUFFER_SIZE, &buf_descr[1]);
 		ctsio->kern_data_ptr = buf_descr;
 		xfer_len = min(xfer_len, sizeof(buf_descr));
 		break;
 	case RWB_MODE_ECHO_DESCR:
 		ctsio->kern_data_ptr = echo_buf_descr;
 		xfer_len = min(xfer_len, sizeof(echo_buf_descr));
 		break;
 	default:
 		/* Every other mode reads back the LUN's write buffer. */
 		if (lun->write_buffer == NULL) {
 			lun->write_buffer = malloc(CTL_WRITE_BUFFER_SIZE,
 			    M_CTL, M_WAITOK);
 		}
 		ctsio->kern_data_ptr = lun->write_buffer + offset;
 		break;
 	}
 
 	ctsio->kern_data_len = xfer_len;
 	ctsio->kern_total_len = xfer_len;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 	ctl_set_success(ctsio);
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * WRITE BUFFER command handler.
  *
  * Entered twice per command.  On the first pass the data transfer is
  * pointed straight into the LUN's write buffer (allocated on first use)
  * at the requested offset and CTL_FLAG_ALLOCATED is set.  On the second
  * pass, with the flag set, the command is simply completed with success.
  * Requests that do not fit in CTL_WRITE_BUFFER_SIZE are rejected.
  */
 int
 ctl_write_buffer(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_write_buffer *cdb;
 	int offset, xfer_len;
 
 	CTL_DEBUG_PRINT(("ctl_write_buffer\n"));
 
 	cdb = (struct scsi_write_buffer *)ctsio->cdb;
 
 	xfer_len = scsi_3btoul(cdb->length);
 	offset = scsi_3btoul(cdb->offset);
 
 	if (offset + xfer_len > CTL_WRITE_BUFFER_SIZE) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 6,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Second pass: the data has already landed in the write buffer. */
 	if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) != 0) {
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * First pass: make sure the LUN has a write buffer and tell the
 	 * caller that the data goes there.
 	 */
 	if (lun->write_buffer == NULL) {
 		lun->write_buffer = malloc(CTL_WRITE_BUFFER_SIZE,
 		    M_CTL, M_WAITOK);
 	}
 	ctsio->kern_data_ptr = lun->write_buffer + offset;
 	ctsio->kern_data_len = xfer_len;
 	ctsio->kern_total_len = xfer_len;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * WRITE SAME (10 and 16 byte versions) command handler.
  *
  * Validates the flag bits and the LBA range, expands a zero block count
  * to "through the last logical block", and, unless the NDOB flag says no
  * data is coming, fetches one logical block of data from the initiator on
  * the first pass.  On the final pass the range and flags are stored in the
  * I/O's private LBA/length area and the command is handed to the backend's
  * config_write method, whose return value is returned.
  */
 int
 ctl_write_same(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct ctl_lba_len_flags *lbalen;
 	uint64_t lba, end_lba;
 	uint32_t num_blocks;
 	int blksize;
 	uint8_t byte2;
 
 	CTL_DEBUG_PRINT(("ctl_write_same\n"));
 
 	switch (ctsio->cdb[0]) {
 	case WRITE_SAME_10: {
 		struct scsi_write_same_10 *cdb;
 
 		cdb = (struct scsi_write_same_10 *)ctsio->cdb;
 
 		lba = scsi_4btoul(cdb->addr);
 		num_blocks = scsi_2btoul(cdb->length);
 		byte2 = cdb->byte2;
 		break;
 	}
 	case WRITE_SAME_16: {
 		struct scsi_write_same_16 *cdb;
 
 		cdb = (struct scsi_write_same_16 *)ctsio->cdb;
 
 		lba = scsi_8btou64(cdb->addr);
 		num_blocks = scsi_4btoul(cdb->length);
 		byte2 = cdb->byte2;
 		break;
 	}
 	default:
 		/*
 		 * Unsupported opcode.  This shouldn't happen; such commands
 		 * are supposed to be filtered out above us.
 		 */
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* ANCHOR flag can be used only together with UNMAP */
 	if ((byte2 & SWS_ANCHOR) != 0 && (byte2 & SWS_UNMAP) == 0) {
 		ctl_set_invalid_field(ctsio, /*sks_valid*/ 1,
 		    /*command*/ 1, /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Two range checks: the end of the range must not run past the
 	 * end of the LUN, and it must not have wrapped around (an end
 	 * below the start means the 64-bit sum overflowed).
 	 */
 	end_lba = lba + num_blocks;
 	if (end_lba > (lun->be_lun->maxlba + 1) || end_lba < lba) {
 		ctl_set_lba_out_of_range(ctsio,
 		    MAX(lba, lun->be_lun->maxlba + 1));
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Zero number of blocks means "to the last logical block" */
 	if (num_blocks == 0) {
 		/* The remaining span has to fit the 32-bit block count. */
 		if ((lun->be_lun->maxlba + 1) - lba > UINT32_MAX) {
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 0,
 					      /*command*/ 1,
 					      /*field*/ 0,
 					      /*bit_valid*/ 0,
 					      /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		num_blocks = (lun->be_lun->maxlba + 1) - lba;
 	}
 
 	/*
 	 * First pass with data expected: allocate room for one block and
 	 * tell the caller the data buffer is here.
 	 */
 	if ((byte2 & SWS_NDOB) == 0 &&
 	    (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
 		blksize = lun->be_lun->blocksize;
 		ctsio->kern_data_ptr = malloc(blksize, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = blksize;
 		ctsio->kern_total_len = blksize;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Hand the validated range and flags to the backend. */
 	lbalen = (struct ctl_lba_len_flags *)&ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	lbalen->lba = lba;
 	lbalen->len = num_blocks;
 	lbalen->flags = byte2;
 
 	return (lun->backend->config_write((union ctl_io *)ctsio));
 }
 
 int
 ctl_unmap(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_unmap *cdb;
 	struct ctl_ptr_len_flags *ptrlen;
 	struct scsi_unmap_header *hdr;
 	struct scsi_unmap_desc *buf, *end, *endnz, *range;
 	uint64_t lba;
 	uint32_t num_blocks;
 	int len, retval;
 	uint8_t byte2;
 
 	CTL_DEBUG_PRINT(("ctl_unmap\n"));
 
 	cdb = (struct scsi_unmap *)ctsio->cdb;
 	len = scsi_2btoul(cdb->length);
 	byte2 = cdb->byte2;
 
 	/*
 	 * If we've got a kernel request that hasn't been malloced yet,
 	 * malloc it and tell the caller the data buffer is here.
 	 */
 	if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
 		ctsio->kern_data_ptr = malloc(len, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = len;
 		ctsio->kern_total_len = len;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	len = ctsio->kern_total_len - ctsio->kern_data_resid;
 	hdr = (struct scsi_unmap_header *)ctsio->kern_data_ptr;
 	if (len < sizeof (*hdr) ||
 	    len < (scsi_2btoul(hdr->length) + sizeof(hdr->length)) ||
 	    len < (scsi_2btoul(hdr->desc_length) + sizeof (*hdr)) ||
 	    scsi_2btoul(hdr->desc_length) % sizeof(*buf) != 0) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 0,
 				      /*command*/ 0,
 				      /*field*/ 0,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		goto done;
 	}
 	len = scsi_2btoul(hdr->desc_length);
 	buf = (struct scsi_unmap_desc *)(hdr + 1);
 	end = buf + len / sizeof(*buf);
 
 	endnz = buf;
 	for (range = buf; range < end; range++) {
 		lba = scsi_8btou64(range->lba);
 		num_blocks = scsi_4btoul(range->length);
 		if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
 		 || ((lba + num_blocks) < lba)) {
 			ctl_set_lba_out_of_range(ctsio,
 			    MAX(lba, lun->be_lun->maxlba + 1));
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		if (num_blocks != 0)
 			endnz = range + 1;
 	}
 
 	/*
 	 * Block backend can not handle zero last range.
 	 * Filter it out and return if there is nothing left.
 	 */
 	len = (uint8_t *)endnz - (uint8_t *)buf;
 	if (len == 0) {
 		ctl_set_success(ctsio);
 		goto done;
 	}
 
 	mtx_lock(&lun->lun_lock);
 	ptrlen = (struct ctl_ptr_len_flags *)
 	    &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	ptrlen->ptr = (void *)buf;
 	ptrlen->len = len;
 	ptrlen->flags = byte2;
 	ctl_try_unblock_others(lun, (union ctl_io *)ctsio, FALSE);
 	mtx_unlock(&lun->lun_lock);
 
 	retval = lun->backend->config_write((union ctl_io *)ctsio);
 	return (retval);
 
 done:
 	if (ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) {
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctsio->io_hdr.flags &= ~CTL_FLAG_ALLOCATED;
 	}
 	ctl_done((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Default MODE SELECT handler for a mode page.
  *
  * Compares the page data supplied in page_ptr with the LUN's current copy
  * of the page.  When they differ, the current copy is overwritten, a mode
  * change unit attention is established through ctl_est_ua_all(), and, once
  * the LUN lock has been dropped, the change is announced through
  * ctl_isc_announce_mode().  Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_default_page_handler(struct ctl_scsiio *ctsio,
 			 struct ctl_page_index *page_index, uint8_t *page_ptr)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	uint8_t *cur_page;
 	uint32_t initidx;
 	int changed;
 
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 
 	/* Locate the "current values" copy of this page. */
 	cur_page = page_index->page_data +
 	    (page_index->page_len * CTL_PAGE_CURRENT);
 
 	mtx_lock(&lun->lun_lock);
 	changed = (memcmp(cur_page, page_ptr, page_index->page_len) != 0);
 	if (changed) {
 		memcpy(cur_page, page_ptr, page_index->page_len);
 		ctl_est_ua_all(lun, initidx, CTL_UA_MODE_CHANGE);
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	/* The announcement is made without the LUN lock held. */
 	if (changed) {
 		ctl_isc_announce_mode(lun,
 		    ctl_get_initindex(&ctsio->io_hdr.nexus),
 		    page_index->page_code, page_index->subpage);
 	}
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Callout handler for periodic informational exception reporting.
  *
  * Does nothing when no exception is pending (ie_asc is zero).  Otherwise
  * it either raises the IE unit attention or clears the "reported" marker,
  * depending on the MRIE setting of the mode page, and re-arms the callout
  * while the report count limit from the mode page has not been reached.
  */
 static void
 ctl_ie_timer(void *arg)
 {
 	struct ctl_lun *lun = arg;
 	uint64_t interval;
 
 	if (lun->ie_asc == 0)
 		return;
 
 	if (lun->MODE_IE.mrie != SIEP_MRIE_UA)
 		lun->ie_reported = 0;
 	else
 		ctl_est_ua_all(lun, -1, CTL_UA_IE);
 
 	/* Stop once the configured number of reports has been made. */
 	if (lun->ie_reportcnt >= scsi_4btoul(lun->MODE_IE.report_count))
 		return;
 
 	lun->ie_reportcnt++;
 
 	/*
 	 * The interval is converted to ticks as interval * hz / 10, i.e.
 	 * it counts tenths of a second.  0 and UINT32_MAX fall back to
 	 * 3000, which is 5 minutes.
 	 */
 	interval = scsi_4btoul(lun->MODE_IE.interval_timer);
 	if (interval == 0 || interval == UINT32_MAX)
 		interval = 3000;  /* 5 min */
 	callout_schedule(&lun->ie_callout, interval * hz / 10);
 }
 
 /*
  * MODE SELECT handler for the informational exceptions page.
  *
  * The new page contents are first stored by the default handler.  Then,
  * under the LUN lock, the informational exception state is updated: with
  * the TEST flag clear, any pending exception and the callout are cancelled;
  * with it set, a test exception (ASC 0x5d, ASCQ 0xff) is raised and, if the
  * page asks for more than one report, the reporting callout is armed.
  * Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_ie_page_handler(struct ctl_scsiio *ctsio,
 			 struct ctl_page_index *page_index, uint8_t *page_ptr)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_info_exceptions_page *iepage;
 	uint64_t interval;
 
 	(void)ctl_default_page_handler(ctsio, page_index, page_ptr);
 
 	iepage = (struct scsi_info_exceptions_page *)page_ptr;
 
 	mtx_lock(&lun->lun_lock);
 	if ((iepage->info_flags & SIEP_FLAGS_TEST) == 0) {
 		/* Test mode is off: drop everything that may be pending. */
 		lun->ie_asc = 0;
 		lun->ie_ascq = 0;
 		lun->ie_reported = 1;
 		ctl_clr_ua_all(lun, -1, CTL_UA_IE);
 		lun->ie_reportcnt = UINT32_MAX;
 		callout_stop(&lun->ie_callout);
 	} else {
 		lun->ie_asc = 0x5d;
 		lun->ie_ascq = 0xff;
 		if (iepage->mrie != SIEP_MRIE_UA) {
 			ctl_clr_ua_all(lun, -1, CTL_UA_IE);
 			lun->ie_reported = -1;
 		} else {
 			ctl_est_ua_all(lun, -1, CTL_UA_IE);
 			lun->ie_reported = 1;
 		}
 
 		/* This counts as the first report; schedule the rest. */
 		lun->ie_reportcnt = 1;
 		if (lun->ie_reportcnt < scsi_4btoul(iepage->report_count)) {
 			lun->ie_reportcnt++;
 			/*
 			 * Converted to ticks as interval * hz / 10; 0 and
 			 * UINT32_MAX fall back to 3000, i.e. 5 minutes.
 			 */
 			interval = scsi_4btoul(iepage->interval_timer);
 			if (interval == 0 || interval == UINT32_MAX)
 				interval = 3000;  /* 5 min */
 			callout_reset(&lun->ie_callout, interval * hz / 10,
 			    ctl_ie_timer, lun);
 		}
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 static int
 ctl_do_mode_select(union ctl_io *io)
 {
 	struct ctl_lun *lun = CTL_LUN(io);
 	struct scsi_mode_page_header *page_header;
 	struct ctl_page_index *page_index;
 	struct ctl_scsiio *ctsio;
 	int page_len, page_len_offset, page_len_size;
 	union ctl_modepage_info *modepage_info;
 	uint16_t *len_left, *len_used;
 	int retval, i;
 
 	ctsio = &io->scsiio;
 	page_index = NULL;
 	page_len = 0;
 
 	modepage_info = (union ctl_modepage_info *)
 		ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes;
 	len_left = &modepage_info->header.len_left;
 	len_used = &modepage_info->header.len_used;
 
 do_next_page:
 
 	page_header = (struct scsi_mode_page_header *)
 		(ctsio->kern_data_ptr + *len_used);
 
 	if (*len_left == 0) {
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	} else if (*len_left < sizeof(struct scsi_mode_page_header)) {
 
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_set_param_len_error(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 
 	} else if ((page_header->page_code & SMPH_SPF)
 		&& (*len_left < sizeof(struct scsi_mode_page_header_sp))) {
 
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_set_param_len_error(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 
 	/*
 	 * XXX KDM should we do something with the block descriptor?
 	 */
 	for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 		page_index = &lun->mode_pages.index[i];
 		if (lun->be_lun->lun_type == T_DIRECT &&
 		    (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0)
 			continue;
 		if (lun->be_lun->lun_type == T_PROCESSOR &&
 		    (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0)
 			continue;
 		if (lun->be_lun->lun_type == T_CDROM &&
 		    (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0)
 			continue;
 
 		if ((page_index->page_code & SMPH_PC_MASK) !=
 		    (page_header->page_code & SMPH_PC_MASK))
 			continue;
 
 		/*
 		 * If neither page has a subpage code, then we've got a
 		 * match.
 		 */
 		if (((page_index->page_code & SMPH_SPF) == 0)
 		 && ((page_header->page_code & SMPH_SPF) == 0)) {
 			page_len = page_header->page_length;
 			break;
 		}
 
 		/*
 		 * If both pages have subpages, then the subpage numbers
 		 * have to match.
 		 */
 		if ((page_index->page_code & SMPH_SPF)
 		  && (page_header->page_code & SMPH_SPF)) {
 			struct scsi_mode_page_header_sp *sph;
 
 			sph = (struct scsi_mode_page_header_sp *)page_header;
 			if (page_index->subpage == sph->subpage) {
 				page_len = scsi_2btoul(sph->page_length);
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If we couldn't find the page, or if we don't have a mode select
 	 * handler for it, send back an error to the user.
 	 */
 	if ((i >= CTL_NUM_MODE_PAGES)
 	 || (page_index->select_handler == NULL)) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 0,
 				      /*field*/ *len_used,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	if (page_index->page_code & SMPH_SPF) {
 		page_len_offset = 2;
 		page_len_size = 2;
 	} else {
 		page_len_size = 1;
 		page_len_offset = 1;
 	}
 
 	/*
 	 * If the length the initiator gives us isn't the one we specify in
 	 * the mode page header, or if they didn't specify enough data in
 	 * the CDB to avoid truncating this page, kick out the request.
 	 */
 	if (page_len != page_index->page_len - page_len_offset - page_len_size) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 0,
 				      /*field*/ *len_used + page_len_offset,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 	if (*len_left < page_index->page_len) {
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_set_param_len_error(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Run through the mode page, checking to make sure that the bits
 	 * the user changed are actually legal for him to change.
 	 */
 	for (i = 0; i < page_index->page_len; i++) {
 		uint8_t *user_byte, *change_mask, *current_byte;
 		int bad_bit;
 		int j;
 
 		user_byte = (uint8_t *)page_header + i;
 		change_mask = page_index->page_data +
 			      (page_index->page_len * CTL_PAGE_CHANGEABLE) + i;
 		current_byte = page_index->page_data +
 			       (page_index->page_len * CTL_PAGE_CURRENT) + i;
 
 		/*
 		 * Check to see whether the user set any bits in this byte
 		 * that he is not allowed to set.
 		 */
 		if ((*user_byte & ~(*change_mask)) ==
 		    (*current_byte & ~(*change_mask)))
 			continue;
 
 		/*
 		 * Go through bit by bit to determine which one is illegal.
 		 */
 		bad_bit = 0;
 		for (j = 7; j >= 0; j--) {
 			if ((((1 << i) & ~(*change_mask)) & *user_byte) !=
 			    (((1 << i) & ~(*change_mask)) & *current_byte)) {
 				bad_bit = i;
 				break;
 			}
 		}
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 0,
 				      /*field*/ *len_used + i,
 				      /*bit_valid*/ 1,
 				      /*bit*/ bad_bit);
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Decrement these before we call the page handler, since we may
 	 * end up getting called back one way or another before the handler
 	 * returns to this context.
 	 */
 	*len_left -= page_index->page_len;
 	*len_used += page_index->page_len;
 
 	retval = page_index->select_handler(ctsio, page_index,
 					    (uint8_t *)page_header);
 
 	/*
 	 * If the page handler returns CTL_RETVAL_QUEUED, then we need to
 	 * wait until this queued command completes to finish processing
 	 * the mode page.  If it returns anything other than
 	 * CTL_RETVAL_COMPLETE (e.g. CTL_RETVAL_ERROR), then it should have
 	 * already set the sense information, freed the data pointer, and
 	 * completed the io for us.
 	 */
 	if (retval != CTL_RETVAL_COMPLETE)
 		goto bailout_no_done;
 
 	/*
 	 * If the initiator sent us more than one page, parse the next one.
 	 */
 	if (*len_left > 0)
 		goto do_next_page;
 
 	ctl_set_success(ctsio);
 	free(ctsio->kern_data_ptr, M_CTL);
 	ctl_done((union ctl_io *)ctsio);
 
 bailout_no_done:
 
 	return (CTL_RETVAL_COMPLETE);
 
 }
 
 int
 ctl_mode_select(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	union ctl_modepage_info *modepage_info;
 	int bd_len, i, header_size, param_len, rtd;
 	uint32_t initidx;
 
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 	switch (ctsio->cdb[0]) {
 	case MODE_SELECT_6: {
 		struct scsi_mode_select_6 *cdb;
 
 		cdb = (struct scsi_mode_select_6 *)ctsio->cdb;
 
 		rtd = (cdb->byte2 & SMS_RTD) ? 1 : 0;
 		param_len = cdb->length;
 		header_size = sizeof(struct scsi_mode_header_6);
 		break;
 	}
 	case MODE_SELECT_10: {
 		struct scsi_mode_select_10 *cdb;
 
 		cdb = (struct scsi_mode_select_10 *)ctsio->cdb;
 
 		rtd = (cdb->byte2 & SMS_RTD) ? 1 : 0;
 		param_len = scsi_2btoul(cdb->length);
 		header_size = sizeof(struct scsi_mode_header_10);
 		break;
 	}
 	default:
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	if (rtd) {
 		if (param_len != 0) {
 			ctl_set_invalid_field(ctsio, /*sks_valid*/ 0,
 			    /*command*/ 1, /*field*/ 0,
 			    /*bit_valid*/ 0, /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 
 		/* Revert to defaults. */
 		ctl_init_page_index(lun);
 		mtx_lock(&lun->lun_lock);
 		ctl_est_ua_all(lun, initidx, CTL_UA_MODE_CHANGE);
 		mtx_unlock(&lun->lun_lock);
 		for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 			ctl_isc_announce_mode(lun, -1,
 			    lun->mode_pages.index[i].page_code & SMPH_PC_MASK,
 			    lun->mode_pages.index[i].subpage);
 		}
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * From SPC-3:
 	 * "A parameter list length of zero indicates that the Data-Out Buffer
 	 * shall be empty. This condition shall not be considered as an error."
 	 */
 	if (param_len == 0) {
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Since we'll hit this the first time through, prior to
 	 * allocation, we don't need to free a data buffer here.
 	 */
 	if (param_len < header_size) {
 		ctl_set_param_len_error(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Allocate the data buffer and grab the user's data.  In theory,
 	 * we shouldn't have to sanity check the parameter list length here
 	 * because the maximum size is 64K.  We should be able to malloc
 	 * that much without too many problems.
 	 */
 	if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
 		ctsio->kern_data_ptr = malloc(param_len, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = param_len;
 		ctsio->kern_total_len = param_len;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	switch (ctsio->cdb[0]) {
 	case MODE_SELECT_6: {
 		struct scsi_mode_header_6 *mh6;
 
 		mh6 = (struct scsi_mode_header_6 *)ctsio->kern_data_ptr;
 		bd_len = mh6->blk_desc_len;
 		break;
 	}
 	case MODE_SELECT_10: {
 		struct scsi_mode_header_10 *mh10;
 
 		mh10 = (struct scsi_mode_header_10 *)ctsio->kern_data_ptr;
 		bd_len = scsi_2btoul(mh10->blk_desc_len);
 		break;
 	}
 	default:
 		panic("%s: Invalid CDB type %#x", __func__, ctsio->cdb[0]);
 	}
 
 	if (param_len < (header_size + bd_len)) {
 		free(ctsio->kern_data_ptr, M_CTL);
 		ctl_set_param_len_error(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * Set the IO_CONT flag, so that if this I/O gets passed to
 	 * ctl_config_write_done(), it'll get passed back to
 	 * ctl_do_mode_select() for further processing, or completion if
 	 * we're all done.
 	 */
 	ctsio->io_hdr.flags |= CTL_FLAG_IO_CONT;
 	ctsio->io_cont = ctl_do_mode_select;
 
 	modepage_info = (union ctl_modepage_info *)
 		ctsio->io_hdr.ctl_private[CTL_PRIV_MODEPAGE].bytes;
 	memset(modepage_info, 0, sizeof(*modepage_info));
 	modepage_info->header.len_left = param_len - header_size - bd_len;
 	modepage_info->header.len_used = header_size + bd_len;
 
 	return (ctl_do_mode_select((union ctl_io *)ctsio));
 }
 
 int
 ctl_mode_sense(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
-	int pc, page_code, dbd, subpage;
-	int alloc_len, page_len, header_len, total_len;
-	struct scsi_mode_block_descr *block_desc;
+	int pc, page_code, llba, subpage;
+	int alloc_len, page_len, header_len, bd_len, total_len;
+	void *block_desc;
 	struct ctl_page_index *page_index;
 
-	dbd = 0;
-	block_desc = NULL;
+	llba = 0;
 
 	CTL_DEBUG_PRINT(("ctl_mode_sense\n"));
 
 	switch (ctsio->cdb[0]) {
 	case MODE_SENSE_6: {
 		struct scsi_mode_sense_6 *cdb;
 
 		cdb = (struct scsi_mode_sense_6 *)ctsio->cdb;
 
 		header_len = sizeof(struct scsi_mode_hdr_6);
 		if (cdb->byte2 & SMS_DBD)
-			dbd = 1;
+			bd_len = 0;
 		else
-			header_len += sizeof(struct scsi_mode_block_descr);
+			bd_len = sizeof(struct scsi_mode_block_descr);
+		header_len += bd_len;
 
 		pc = (cdb->page & SMS_PAGE_CTRL_MASK) >> 6;
 		page_code = cdb->page & SMS_PAGE_CODE;
 		subpage = cdb->subpage;
 		alloc_len = cdb->length;
 		break;
 	}
 	case MODE_SENSE_10: {
 		struct scsi_mode_sense_10 *cdb;
 
 		cdb = (struct scsi_mode_sense_10 *)ctsio->cdb;
 
 		header_len = sizeof(struct scsi_mode_hdr_10);
+		if (cdb->byte2 & SMS_DBD) {
+			bd_len = 0;
+		} else if (lun->be_lun->lun_type == T_DIRECT) {
+			if (cdb->byte2 & SMS10_LLBAA) {
+				llba = 1;
+				bd_len = sizeof(struct scsi_mode_block_descr_dlong);
+			} else
+				bd_len = sizeof(struct scsi_mode_block_descr_dshort);
+		} else
+			bd_len = sizeof(struct scsi_mode_block_descr);
+		header_len += bd_len;
 
-		if (cdb->byte2 & SMS_DBD)
-			dbd = 1;
-		else
-			header_len += sizeof(struct scsi_mode_block_descr);
 		pc = (cdb->page & SMS_PAGE_CTRL_MASK) >> 6;
 		page_code = cdb->page & SMS_PAGE_CODE;
 		subpage = cdb->subpage;
 		alloc_len = scsi_2btoul(cdb->length);
 		break;
 	}
 	default:
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 		break; /* NOTREACHED */
 	}
 
 	/*
 	 * We have to make a first pass through to calculate the size of
 	 * the pages that match the user's query.  Then we allocate enough
 	 * memory to hold it, and actually copy the data into the buffer.
 	 */
 	switch (page_code) {
 	case SMS_ALL_PAGES_PAGE: {
 		u_int i;
 
 		page_len = 0;
 
 		/*
 		 * At the moment, values other than 0 and 0xff here are
 		 * reserved according to SPC-3.
 		 */
 		if ((subpage != SMS_SUBPAGE_PAGE_0)
 		 && (subpage != SMS_SUBPAGE_ALL)) {
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 3,
 					      /*bit_valid*/ 0,
 					      /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 
 		for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 			page_index = &lun->mode_pages.index[i];
 
 			/* Make sure the page is supported for this dev type */
 			if (lun->be_lun->lun_type == T_DIRECT &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0)
 				continue;
 			if (lun->be_lun->lun_type == T_PROCESSOR &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0)
 				continue;
 			if (lun->be_lun->lun_type == T_CDROM &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0)
 				continue;
 
 			/*
 			 * We don't use this subpage if the user didn't
 			 * request all subpages.
 			 */
 			if ((page_index->subpage != 0)
 			 && (subpage == SMS_SUBPAGE_PAGE_0))
 				continue;
 
 			page_len += page_index->page_len;
 		}
 		break;
 	}
 	default: {
 		u_int i;
 
 		page_len = 0;
 
 		for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 			page_index = &lun->mode_pages.index[i];
 
 			/* Make sure the page is supported for this dev type */
 			if (lun->be_lun->lun_type == T_DIRECT &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0)
 				continue;
 			if (lun->be_lun->lun_type == T_PROCESSOR &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0)
 				continue;
 			if (lun->be_lun->lun_type == T_CDROM &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0)
 				continue;
 
 			/* Look for the right page code */
 			if ((page_index->page_code & SMPH_PC_MASK) != page_code)
 				continue;
 
 			/* Look for the right subpage or the subpage wildcard */
 			if ((page_index->subpage != subpage)
 			 && (subpage != SMS_SUBPAGE_ALL))
 				continue;
 
 			page_len += page_index->page_len;
 		}
 
 		if (page_len == 0) {
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 2,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 5);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		break;
 	}
 	}
 
 	total_len = header_len + page_len;
 
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(total_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	switch (ctsio->cdb[0]) {
 	case MODE_SENSE_6: {
 		struct scsi_mode_hdr_6 *header;
 
 		header = (struct scsi_mode_hdr_6 *)ctsio->kern_data_ptr;
 
 		header->datalen = MIN(total_len - 1, 254);
 		if (lun->be_lun->lun_type == T_DIRECT) {
 			header->dev_specific = 0x10; /* DPOFUA */
 			if ((lun->be_lun->flags & CTL_LUN_FLAG_READONLY) ||
 			    (lun->MODE_CTRL.eca_and_aen & SCP_SWP) != 0)
 				header->dev_specific |= 0x80; /* WP */
 		}
-		if (dbd)
-			header->block_descr_len = 0;
-		else
-			header->block_descr_len =
-				sizeof(struct scsi_mode_block_descr);
-		block_desc = (struct scsi_mode_block_descr *)&header[1];
+		header->block_descr_len = bd_len;
+		block_desc = &header[1];
 		break;
 	}
 	case MODE_SENSE_10: {
 		struct scsi_mode_hdr_10 *header;
 		int datalen;
 
 		header = (struct scsi_mode_hdr_10 *)ctsio->kern_data_ptr;
 
 		datalen = MIN(total_len - 2, 65533);
 		scsi_ulto2b(datalen, header->datalen);
 		if (lun->be_lun->lun_type == T_DIRECT) {
 			header->dev_specific = 0x10; /* DPOFUA */
 			if ((lun->be_lun->flags & CTL_LUN_FLAG_READONLY) ||
 			    (lun->MODE_CTRL.eca_and_aen & SCP_SWP) != 0)
 				header->dev_specific |= 0x80; /* WP */
 		}
-		if (dbd)
-			scsi_ulto2b(0, header->block_descr_len);
-		else
-			scsi_ulto2b(sizeof(struct scsi_mode_block_descr),
-				    header->block_descr_len);
-		block_desc = (struct scsi_mode_block_descr *)&header[1];
+		if (llba)
+			header->flags |= SMH_LONGLBA;
+		scsi_ulto2b(bd_len, header->block_descr_len);
+		block_desc = &header[1];
 		break;
 	}
 	default:
 		panic("%s: Invalid CDB type %#x", __func__, ctsio->cdb[0]);
 	}
 
 	/*
 	 * If we've got a disk, use its blocksize in the block
 	 * descriptor.  Otherwise, just set it to 0.
 	 */
-	if (dbd == 0) {
-		if (lun->be_lun->lun_type == T_DIRECT)
-			scsi_ulto3b(lun->be_lun->blocksize,
-				    block_desc->block_len);
-		else
-			scsi_ulto3b(0, block_desc->block_len);
+	if (bd_len > 0) {
+		if (lun->be_lun->lun_type == T_DIRECT) {
+			if (llba) {
+				struct scsi_mode_block_descr_dlong *bd = block_desc;
+				if (lun->be_lun->maxlba != 0)
+					scsi_u64to8b(lun->be_lun->maxlba + 1,
+					    bd->num_blocks);
+				scsi_ulto4b(lun->be_lun->blocksize,
+				    bd->block_len);
+			} else {
+				struct scsi_mode_block_descr_dshort *bd = block_desc;
+				if (lun->be_lun->maxlba != 0)
+					scsi_ulto4b(MIN(lun->be_lun->maxlba+1,
+					    UINT32_MAX), bd->num_blocks);
+				scsi_ulto3b(lun->be_lun->blocksize,
+				    bd->block_len);
+			}
+		} else {
+			struct scsi_mode_block_descr *bd = block_desc;
+			scsi_ulto3b(0, bd->block_len);
+		}
 	}
 
 	switch (page_code) {
 	case SMS_ALL_PAGES_PAGE: {
 		int i, data_used;
 
 		data_used = header_len;
 		for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 			struct ctl_page_index *page_index;
 
 			page_index = &lun->mode_pages.index[i];
 			if (lun->be_lun->lun_type == T_DIRECT &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0)
 				continue;
 			if (lun->be_lun->lun_type == T_PROCESSOR &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0)
 				continue;
 			if (lun->be_lun->lun_type == T_CDROM &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0)
 				continue;
 
 			/*
 			 * We don't use this subpage if the user didn't
 			 * request all subpages.  We already checked (above)
 			 * to make sure the user only specified a subpage
 			 * of 0 or 0xff in the SMS_ALL_PAGES_PAGE case.
 			 */
 			if ((page_index->subpage != 0)
 			 && (subpage == SMS_SUBPAGE_PAGE_0))
 				continue;
 
 			/*
 			 * Call the handler, if it exists, to update the
 			 * page to the latest values.
 			 */
 			if (page_index->sense_handler != NULL)
 				page_index->sense_handler(ctsio, page_index,pc);
 
 			memcpy(ctsio->kern_data_ptr + data_used,
 			       page_index->page_data +
 			       (page_index->page_len * pc),
 			       page_index->page_len);
 			data_used += page_index->page_len;
 		}
 		break;
 	}
 	default: {
 		int i, data_used;
 
 		data_used = header_len;
 
 		for (i = 0; i < CTL_NUM_MODE_PAGES; i++) {
 			struct ctl_page_index *page_index;
 
 			page_index = &lun->mode_pages.index[i];
 
 			/* Look for the right page code */
 			if ((page_index->page_code & SMPH_PC_MASK) != page_code)
 				continue;
 
 			/* Look for the right subpage or the subpage wildcard */
 			if ((page_index->subpage != subpage)
 			 && (subpage != SMS_SUBPAGE_ALL))
 				continue;
 
 			/* Make sure the page is supported for this dev type */
 			if (lun->be_lun->lun_type == T_DIRECT &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_DIRECT) == 0)
 				continue;
 			if (lun->be_lun->lun_type == T_PROCESSOR &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_PROC) == 0)
 				continue;
 			if (lun->be_lun->lun_type == T_CDROM &&
 			    (page_index->page_flags & CTL_PAGE_FLAG_CDROM) == 0)
 				continue;
 
 			/*
 			 * Call the handler, if it exists, to update the
 			 * page to the latest values.
 			 */
 			if (page_index->sense_handler != NULL)
 				page_index->sense_handler(ctsio, page_index,pc);
 
 			memcpy(ctsio->kern_data_ptr + data_used,
 			       page_index->page_data +
 			       (page_index->page_len * pc),
 			       page_index->page_len);
 			data_used += page_index->page_len;
 		}
 		break;
 	}
 	}
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Log sense handler for the temperature log page.
  *
  * Writes two consecutive struct scsi_log_temperature parameters into
  * page_index->page_data: SLP_TEMPERATURE first, then SLP_REFTEMPERATURE.
  * Each parameter gets SLP_LBIN as its control byte and a length equal to
  * the structure size minus the parameter header.  The value comes from the
  * LUN option "temperature" (respectively "reftemperature"), parsed with
  * strtol() using base 0; when the option is not set, 0xff is stored.
  * The strtol() result is stored as-is, with no syntax or range check.
  *
  * ctsio is only used to find the LUN and pc is not used.  Always returns 0.
  */
 int
+ctl_temp_log_sense_handler(struct ctl_scsiio *ctsio,
+			       struct ctl_page_index *page_index,
+			       int pc)
+{
+	struct ctl_lun *lun = CTL_LUN(ctsio);
+	struct scsi_log_temperature *data;
+	const char *value;
+
+	data = (struct scsi_log_temperature *)page_index->page_data;
+
+	scsi_ulto2b(SLP_TEMPERATURE, data->hdr.param_code);
+	data->hdr.param_control = SLP_LBIN;
+	data->hdr.param_len = sizeof(struct scsi_log_temperature) -
+	    sizeof(struct scsi_log_param_header);
+	if ((value = dnvlist_get_string(lun->be_lun->options, "temperature",
+	    NULL)) != NULL)
+		data->temperature = strtol(value, NULL, 0);
+	else
+		data->temperature = 0xff;
+	data++;
+
+	scsi_ulto2b(SLP_REFTEMPERATURE, data->hdr.param_code);
+	data->hdr.param_control = SLP_LBIN;
+	data->hdr.param_len = sizeof(struct scsi_log_temperature) -
+	    sizeof(struct scsi_log_param_header);
+	if ((value = dnvlist_get_string(lun->be_lun->options, "reftemperature",
+	    NULL)) != NULL)
+		data->temperature = strtol(value, NULL, 0);
+	else
+		data->temperature = 0xff;
+	return (0);
+}
+
 /*
  * Log sense handler for the logical block provisioning log page.
  *
  * Queries the backend's lun_attr() method for up to four attributes and,
  * for each one that returns something other than UINT64_MAX, appends one
  * log parameter to page_index->page_data:
  *
  *   attribute          param code   byte 4
  *   "blocksavail"      0x0001       0x02
  *   "blocksused"       0x0002       0x01
  *   "poolblocksavail"  0x00f1       0x02
  *   "poolblocksused"   0x00f2       0x02
  *
  * Every parameter consists of a struct scsi_log_param_header (control
  * SLP_LBIN | SLP_LP, length 8) followed by 8 bytes: a 4-byte count equal to
  * the attribute value shifted right by CTL_LBP_EXPONENT, then the byte
  * listed above at offset 4.  The remaining bytes are not written here.
  *
  * On return page_index->page_len holds the number of bytes produced, which
  * is 0 when the backend has no lun_attr() method or reports no attribute.
  * pc is not used.  Always returns 0.
  */
+int
 ctl_lbp_log_sense_handler(struct ctl_scsiio *ctsio,
 			       struct ctl_page_index *page_index,
 			       int pc)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_log_param_header *phdr;
 	uint8_t *data;
 	uint64_t val;
 
 	/* data is the write cursor; it only advances for emitted parameters. */
 	data = page_index->page_data;
 
 	if (lun->backend->lun_attr != NULL &&
 	    (val = lun->backend->lun_attr(lun->be_lun->be_lun, "blocksavail"))
 	     != UINT64_MAX) {
 		phdr = (struct scsi_log_param_header *)data;
 		scsi_ulto2b(0x0001, phdr->param_code);
 		phdr->param_control = SLP_LBIN | SLP_LP;
 		phdr->param_len = 8;
 		data = (uint8_t *)(phdr + 1);
 		scsi_ulto4b(val >> CTL_LBP_EXPONENT, data);
 		data[4] = 0x02; /* per-pool */
 		data += phdr->param_len;
 	}
 
 	if (lun->backend->lun_attr != NULL &&
 	    (val = lun->backend->lun_attr(lun->be_lun->be_lun, "blocksused"))
 	     != UINT64_MAX) {
 		phdr = (struct scsi_log_param_header *)data;
 		scsi_ulto2b(0x0002, phdr->param_code);
 		phdr->param_control = SLP_LBIN | SLP_LP;
 		phdr->param_len = 8;
 		data = (uint8_t *)(phdr + 1);
 		scsi_ulto4b(val >> CTL_LBP_EXPONENT, data);
 		data[4] = 0x01; /* per-LUN */
 		data += phdr->param_len;
 	}
 
 	if (lun->backend->lun_attr != NULL &&
 	    (val = lun->backend->lun_attr(lun->be_lun->be_lun, "poolblocksavail"))
 	     != UINT64_MAX) {
 		phdr = (struct scsi_log_param_header *)data;
 		scsi_ulto2b(0x00f1, phdr->param_code);
 		phdr->param_control = SLP_LBIN | SLP_LP;
 		phdr->param_len = 8;
 		data = (uint8_t *)(phdr + 1);
 		scsi_ulto4b(val >> CTL_LBP_EXPONENT, data);
 		data[4] = 0x02; /* per-pool */
 		data += phdr->param_len;
 	}
 
 	if (lun->backend->lun_attr != NULL &&
 	    (val = lun->backend->lun_attr(lun->be_lun->be_lun, "poolblocksused"))
 	     != UINT64_MAX) {
 		phdr = (struct scsi_log_param_header *)data;
 		scsi_ulto2b(0x00f2, phdr->param_code);
 		phdr->param_control = SLP_LBIN | SLP_LP;
 		phdr->param_len = 8;
 		data = (uint8_t *)(phdr + 1);
 		scsi_ulto4b(val >> CTL_LBP_EXPONENT, data);
 		data[4] = 0x02; /* per-pool */
 		data += phdr->param_len;
 	}
 
 	/* The page length varies with how many attributes were available. */
 	page_index->page_len = data - page_index->page_data;
 	return (0);
 }
 
 int
 ctl_sap_log_sense_handler(struct ctl_scsiio *ctsio,
 			       struct ctl_page_index *page_index,
 			       int pc)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct stat_page *data;
 	struct bintime *t;
 
 	data = (struct stat_page *)page_index->page_data;
 
 	scsi_ulto2b(SLP_SAP, data->sap.hdr.param_code);
 	data->sap.hdr.param_control = SLP_LBIN;
 	data->sap.hdr.param_len = sizeof(struct scsi_log_stat_and_perf) -
 	    sizeof(struct scsi_log_param_header);
 	scsi_u64to8b(lun->stats.operations[CTL_STATS_READ],
 	    data->sap.read_num);
 	scsi_u64to8b(lun->stats.operations[CTL_STATS_WRITE],
 	    data->sap.write_num);
 	if (lun->be_lun->blocksize > 0) {
 		scsi_u64to8b(lun->stats.bytes[CTL_STATS_WRITE] /
 		    lun->be_lun->blocksize, data->sap.recvieved_lba);
 		scsi_u64to8b(lun->stats.bytes[CTL_STATS_READ] /
 		    lun->be_lun->blocksize, data->sap.transmitted_lba);
 	}
 	t = &lun->stats.time[CTL_STATS_READ];
 	scsi_u64to8b((uint64_t)t->sec * 1000 + t->frac / (UINT64_MAX / 1000),
 	    data->sap.read_int);
 	t = &lun->stats.time[CTL_STATS_WRITE];
 	scsi_u64to8b((uint64_t)t->sec * 1000 + t->frac / (UINT64_MAX / 1000),
 	    data->sap.write_int);
 	scsi_u64to8b(0, data->sap.weighted_num);
 	scsi_u64to8b(0, data->sap.weighted_int);
 	scsi_ulto2b(SLP_IT, data->it.hdr.param_code);
 	data->it.hdr.param_control = SLP_LBIN;
 	data->it.hdr.param_len = sizeof(struct scsi_log_idle_time) -
 	    sizeof(struct scsi_log_param_header);
 #ifdef CTL_TIME_IO
 	scsi_u64to8b(lun->idle_time / SBT_1MS, data->it.idle_int);
 #endif
 	scsi_ulto2b(SLP_TI, data->ti.hdr.param_code);
 	data->it.hdr.param_control = SLP_LBIN;
 	data->ti.hdr.param_len = sizeof(struct scsi_log_time_interval) -
 	    sizeof(struct scsi_log_param_header);
 	scsi_ulto4b(3, data->ti.exponent);
 	scsi_ulto4b(1, data->ti.integer);
 	return (0);
 }
 
 /*
  * Log sense handler for the informational exceptions log page.
  *
  * Fills the struct scsi_log_informational_exceptions at
  * page_index->page_data: parameter code SLP_IE_GEN, control SLP_LBIN,
  * length equal to the structure size minus the parameter header, and the
  * LUN's current ie_asc / ie_ascq values.
  *
  * Temperature: the removed ('-') line always stored 0xff.  The added ('+')
  * lines take the value from the LUN option "temperature", parsed with
  * strtol() using base 0 and stored without a range check, and fall back to
  * 0xff only when the option is not set.
  *
  * ctsio is only used to find the LUN and pc is not used.  Always returns 0.
  */
 int
 ctl_ie_log_sense_handler(struct ctl_scsiio *ctsio,
 			       struct ctl_page_index *page_index,
 			       int pc)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_log_informational_exceptions *data;
+	const char *value;
 
 	data = (struct scsi_log_informational_exceptions *)page_index->page_data;
 
 	scsi_ulto2b(SLP_IE_GEN, data->hdr.param_code);
 	data->hdr.param_control = SLP_LBIN;
 	data->hdr.param_len = sizeof(struct scsi_log_informational_exceptions) -
 	    sizeof(struct scsi_log_param_header);
 	data->ie_asc = lun->ie_asc;
 	data->ie_ascq = lun->ie_ascq;
-	data->temperature = 0xff;
+	if ((value = dnvlist_get_string(lun->be_lun->options, "temperature",
+	    NULL)) != NULL)
+		data->temperature = strtol(value, NULL, 0);
+	else
+		data->temperature = 0xff;
 	return (0);
 }
 
 int
 ctl_log_sense(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	int i, pc, page_code, subpage;
 	int alloc_len, total_len;
 	struct ctl_page_index *page_index;
 	struct scsi_log_sense *cdb;
 	struct scsi_log_header *header;
 
 	CTL_DEBUG_PRINT(("ctl_log_sense\n"));
 
 	cdb = (struct scsi_log_sense *)ctsio->cdb;
 	pc = (cdb->page & SLS_PAGE_CTRL_MASK) >> 6;
 	page_code = cdb->page & SLS_PAGE_CODE;
 	subpage = cdb->subpage;
 	alloc_len = scsi_2btoul(cdb->length);
 
 	page_index = NULL;
 	for (i = 0; i < CTL_NUM_LOG_PAGES; i++) {
 		page_index = &lun->log_pages.index[i];
 
 		/* Look for the right page code */
 		if ((page_index->page_code & SL_PAGE_CODE) != page_code)
 			continue;
 
 		/* Look for the right subpage or the subpage wildcard*/
 		if (page_index->subpage != subpage)
 			continue;
 
 		break;
 	}
 	if (i >= CTL_NUM_LOG_PAGES) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	total_len = sizeof(struct scsi_log_header) + page_index->page_len;
 
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(total_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	header = (struct scsi_log_header *)ctsio->kern_data_ptr;
 	header->page = page_index->page_code;
 	if (page_index->page_code == SLS_LOGICAL_BLOCK_PROVISIONING)
 		header->page |= SL_DS;
 	if (page_index->subpage) {
 		header->page |= SL_SPF;
 		header->subpage = page_index->subpage;
 	}
 	scsi_ulto2b(page_index->page_len, header->datalen);
 
 	/*
 	 * Call the handler, if it exists, to update the
 	 * page to the latest values.
 	 */
 	if (page_index->sense_handler != NULL)
 		page_index->sense_handler(ctsio, page_index, pc);
 
 	memcpy(header + 1, page_index->page_data, page_index->page_len);
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * READ CAPACITY(10): report the last LBA and the block size of the LUN.
  *
  * A non-zero LBA in the CDB is rejected with "invalid field in CDB"
  * (byte 2) unless the PMI bit is set.  The response is always the full
  * struct scsi_read_capacity_data.  Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_read_capacity(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_read_capacity *cdb;
 	struct scsi_read_capacity_data *data;
 	uint32_t lba;
 
 	CTL_DEBUG_PRINT(("ctl_read_capacity\n"));
 
 	cdb = (struct scsi_read_capacity *)ctsio->cdb;
 	lba = scsi_4btoul(cdb->addr);
 
 	if (lba != 0 && (cdb->pmi & SRC_PMI) == 0) {
 		ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1,
 		    /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	ctsio->kern_data_ptr = malloc(sizeof(*data), M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_total_len = sizeof(*data);
 	ctsio->kern_data_len = sizeof(*data);
 	data = (struct scsi_read_capacity_data *)ctsio->kern_data_ptr;
 
 	/*
 	 * The 4-byte address field cannot describe a last LBA above
 	 * 0xfffffffe.  In that case 0xffffffff is returned and the user
 	 * must issue a SERVICE ACTION IN (16) command, with the read
 	 * capacity service action set, to get the real value.
 	 */
 	if (lun->be_lun->maxlba <= 0xfffffffe)
 		scsi_ulto4b(lun->be_lun->maxlba, data->addr);
 	else
 		scsi_ulto4b(0xffffffff, data->addr);
 
 	/* XXX KDM this may not be 512 bytes... */
 	scsi_ulto4b(lun->be_lun->blocksize, data->length);
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * READ CAPACITY(16): report the last LBA, block size, physical block
  * exponent, lowest aligned LBA and provisioning flags of the LUN.
  *
  * The command is rejected with "invalid field in CDB" (byte 2) when the
  * PMI bit is set together with a non-zero LBA.  The transfer length is
  * limited to the CDB's allocation length.  Always returns
  * CTL_RETVAL_COMPLETE.
  *
  * NOTE(review): ctl_read_capacity() rejects a non-zero LBA when PMI is
  * *clear*, whereas this function rejects it when PMI is *set*.  One of the
  * two tests looks inverted -- confirm against SBC before changing either.
  */
 int
 ctl_read_capacity_16(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_read_capacity_16 *cdb;
 	struct scsi_read_capacity_data_long *data;
 	uint64_t lba;
 	uint32_t alloc_len;
 
 	CTL_DEBUG_PRINT(("ctl_read_capacity_16\n"));
 
 	cdb = (struct scsi_read_capacity_16 *)ctsio->cdb;
 	lba = scsi_8btou64(cdb->addr);
 	alloc_len = scsi_4btoul(cdb->alloc_len);
 
 	if (lba != 0 && (cdb->reladr & SRC16_PMI)) {
 		ctl_set_invalid_field(/*ctsio*/ ctsio, /*sks_valid*/ 1,
 		    /*command*/ 1, /*field*/ 2, /*bit_valid*/ 0, /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	ctsio->kern_data_ptr = malloc(sizeof(*data), M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(sizeof(*data), alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 	data = (struct scsi_read_capacity_data_long *)ctsio->kern_data_ptr;
 
 	/* Capacity: last LBA and logical block size. */
 	scsi_u64to8b(lun->be_lun->maxlba, data->addr);
 	/* XXX KDM this may not be 512 bytes... */
 	scsi_ulto4b(lun->be_lun->blocksize, data->length);
 
 	/* Physical block geometry. */
 	data->prot_lbppbe = lun->be_lun->pblockexp & SRC16_LBPPBE;
 	scsi_ulto2b(lun->be_lun->pblockoff & SRC16_LALBA_A, data->lalba_lbp);
 
 	/* LUNs that support UNMAP also advertise LBPME and LBPRZ. */
 	if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP)
 		data->lalba_lbp[0] |= SRC16_LBPME | SRC16_LBPRZ;
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * GET LBA STATUS: prepare a one-descriptor response and pass the request
  * on to the backend.
  *
  * An LBA beyond the LUN's last block completes the command with an "LBA
  * out of range" error.  Otherwise a response with a single descriptor is
  * pre-filled (starting LBA, number of blocks up to the end of the LUN
  * capped at UINT32_MAX, status 0) so that there is an answer even if the
  * backend cannot tell anything, and the backend's config_read() method is
  * called.  Returns CTL_RETVAL_COMPLETE on the error path, otherwise
  * whatever config_read() returns.
  */
 int
 ctl_get_lba_status(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_get_lba_status *cdb;
 	struct scsi_get_lba_status_data *data;
 	struct ctl_lba_len_flags *lbalen;
 	uint64_t lba;
 	uint32_t alloc_len, total_len;
 
 	CTL_DEBUG_PRINT(("ctl_get_lba_status\n"));
 
 	cdb = (struct scsi_get_lba_status *)ctsio->cdb;
 	alloc_len = scsi_4btoul(cdb->alloc_len);
 	lba = scsi_8btou64(cdb->addr);
 
 	if (lba > lun->be_lun->maxlba) {
 		ctl_set_lba_out_of_range(ctsio, lba);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Header plus exactly one LBA status descriptor. */
 	total_len = sizeof(*data) + sizeof(data->descr[0]);
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(total_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 	data = (struct scsi_get_lba_status_data *)ctsio->kern_data_ptr;
 
 	/* Fill dummy data in case backend can't tell anything. */
 	scsi_ulto4b(4 + sizeof(data->descr[0]), data->length);
 	scsi_u64to8b(lba, data->descr[0].addr);
 	scsi_ulto4b(MIN(UINT32_MAX, lun->be_lun->maxlba + 1 - lba),
 	    data->descr[0].length);
 	data->descr[0].status = 0; /* Mapped or unknown. */
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 
 	/* Describe the request to the backend via the per-I/O private area. */
 	lbalen = (struct ctl_lba_len_flags *)
 	    &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	lbalen->lba = lba;
 	lbalen->len = total_len;
 	lbalen->flags = 0;
 
 	return (lun->backend->config_read((union ctl_io *)ctsio));
 }
 
 /*
  * READ DEFECT DATA (10) and (12): return a defect list header that
  * describes an empty list.
  *
  * The CDB opcode selects the 10- or 12-byte variant; anything other than
  * READ_DEFECT_DATA_10 is handled as the 12-byte form.  An allocation
  * length of zero completes the command successfully without transferring
  * data.  Otherwise only the header is returned: the requested format is
  * echoed back and the list length (and, for the 12-byte form, the
  * generation) is zero.  Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_read_defect(struct ctl_scsiio *ctsio)
 {
 	struct scsi_read_defect_data_10 *ccb10;
 	struct scsi_read_defect_data_12 *ccb12;
 	struct scsi_read_defect_data_hdr_10 *data10;
 	struct scsi_read_defect_data_hdr_12 *data12;
 	uint32_t alloc_len, data_len;
 	uint8_t format;
 	int is_10;
 
 	CTL_DEBUG_PRINT(("ctl_read_defect\n"));
 
 	is_10 = (ctsio->cdb[0] == READ_DEFECT_DATA_10);
 
 	/* Pull format and allocation length out of the matching CDB layout. */
 	if (is_10) {
 		ccb10 = (struct scsi_read_defect_data_10 *)&ctsio->cdb;
 		data_len = sizeof(*data10);
 		alloc_len = scsi_2btoul(ccb10->alloc_length);
 		format = ccb10->format;
 	} else {
 		ccb12 = (struct scsi_read_defect_data_12 *)&ctsio->cdb;
 		data_len = sizeof(*data12);
 		alloc_len = scsi_4btoul(ccb12->alloc_length);
 		format = ccb12->format;
 	}
 
 	/* Nothing to transfer. */
 	if (alloc_len == 0) {
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/* Header only: echo the format and report an empty defect list. */
 	if (is_10) {
 		data10 = (struct scsi_read_defect_data_hdr_10 *)
 		    ctsio->kern_data_ptr;
 		data10->format = format;
 		scsi_ulto2b(0, data10->length);
 	} else {
 		data12 = (struct scsi_read_defect_data_hdr_12 *)
 		    ctsio->kern_data_ptr;
 		data12->format = format;
 		scsi_ulto2b(0, data12->generation);
 		scsi_ulto4b(0, data12->length);
 	}
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * REPORT TARGET PORT GROUPS: describe the target port groups through which
  * this LUN is visible, together with their asymmetric access state.
  *
  * The parameter data format bits in cdb->byte2 select the length-only
  * header (STG_PDF_LENGTH) or the extended header (STG_PDF_EXTENDED); any
  * other value completes the command with "invalid field in CDB".
  *
  * The function works in two passes over softc->port_list, each under
  * softc->ctl_lock:
  *   1. count the online ports this LUN is mapped to, to size the buffer;
  *   2. after allocating, emit one group descriptor for the shared group
  *      (if any) and one per HA shelf, each followed by its port
  *      descriptors.
  *
  * NOTE(review): the lock is dropped between the two passes while the
  * buffer is allocated.  The buffer is sized from the first pass only, so
  * this relies on the set of matching ports not growing in between --
  * confirm that callers or other locking guarantee this.
  *
  * Always returns CTL_RETVAL_COMPLETE (via retval).
  */
 int
 ctl_report_tagret_port_groups(struct ctl_scsiio *ctsio)
 {
 	struct ctl_softc *softc = CTL_SOFTC(ctsio);
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_maintenance_in *cdb;
 	int retval;
 	int alloc_len, ext, total_len = 0, g, pc, pg, ts, os;
 	int num_ha_groups, num_target_ports, shared_group;
 	struct ctl_port *port;
 	struct scsi_target_group_data *rtg_ptr;
 	struct scsi_target_group_data_extended *rtg_ext_ptr;
 	struct scsi_target_port_group_descriptor *tpg_desc;
 
 	CTL_DEBUG_PRINT(("ctl_report_tagret_port_groups\n"));
 
 	cdb = (struct scsi_maintenance_in *)ctsio->cdb;
 	retval = CTL_RETVAL_COMPLETE;
 
 	/* Select the response header format. */
 	switch (cdb->byte2 & STG_PDF_MASK) {
 	case STG_PDF_LENGTH:
 		ext = 0;
 		break;
 	case STG_PDF_EXTENDED:
 		ext = 1;
 		break;
 	default:
 		ctl_set_invalid_field(/*ctsio*/ ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 1,
 				      /*bit*/ 5);
 		ctl_done((union ctl_io *)ctsio);
 		return(retval);
 	}
 
 	/*
 	 * Pass 1: count online ports that this LUN is mapped to.  A shared
 	 * group is reported when running single-headed or when at least one
 	 * of those ports is marked HA-shared.
 	 */
 	num_target_ports = 0;
 	shared_group = (softc->is_single != 0);
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		if ((port->status & CTL_PORT_STATUS_ONLINE) == 0)
 			continue;
 		if (ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX)
 			continue;
 		num_target_ports++;
 		if (port->status & CTL_PORT_STATUS_HA_SHARED)
 			shared_group = 1;
 	}
 	mtx_unlock(&softc->ctl_lock);
 	num_ha_groups = (softc->is_single) ? 0 : NUM_HA_SHELVES;
 
 	/* Header + one descriptor per group + one descriptor per port. */
 	if (ext)
 		total_len = sizeof(struct scsi_target_group_data_extended);
 	else
 		total_len = sizeof(struct scsi_target_group_data);
 	total_len += sizeof(struct scsi_target_port_group_descriptor) *
 		(shared_group + num_ha_groups) +
 	    sizeof(struct scsi_target_port_descriptor) * num_target_ports;
 
 	alloc_len = scsi_4btoul(cdb->length);
 
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(total_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/* The reported length excludes the 4-byte length field itself. */
 	if (ext) {
 		rtg_ext_ptr = (struct scsi_target_group_data_extended *)
 		    ctsio->kern_data_ptr;
 		scsi_ulto4b(total_len - 4, rtg_ext_ptr->length);
 		rtg_ext_ptr->format_type = 0x10;
 		rtg_ext_ptr->implicit_transition_time = 0;
 		tpg_desc = &rtg_ext_ptr->groups[0];
 	} else {
 		rtg_ptr = (struct scsi_target_group_data *)
 		    ctsio->kern_data_ptr;
 		scsi_ulto4b(total_len - 4, rtg_ptr->length);
 		tpg_desc = &rtg_ptr->groups[0];
 	}
 
 	/*
 	 * Work out the access states: "ts" is used for the shared group and
 	 * for HA group number pg, "os" for every other HA group.
 	 */
 	mtx_lock(&softc->ctl_lock);
 	pg = softc->port_min / softc->port_cnt;
 	if (lun->flags & (CTL_LUN_PRIMARY_SC | CTL_LUN_PEER_SC_PRIMARY)) {
 		/* Some shelf is known to be primary. */
 		if (softc->ha_link == CTL_HA_LINK_OFFLINE)
 			os = TPG_ASYMMETRIC_ACCESS_UNAVAILABLE;
 		else if (softc->ha_link == CTL_HA_LINK_UNKNOWN)
 			os = TPG_ASYMMETRIC_ACCESS_TRANSITIONING;
 		else if (softc->ha_mode == CTL_HA_MODE_ACT_STBY)
 			os = TPG_ASYMMETRIC_ACCESS_STANDBY;
 		else
 			os = TPG_ASYMMETRIC_ACCESS_NONOPTIMIZED;
 		if (lun->flags & CTL_LUN_PRIMARY_SC) {
 			ts = TPG_ASYMMETRIC_ACCESS_OPTIMIZED;
 		} else {
 			/* The peer is primary: swap the two roles. */
 			ts = os;
 			os = TPG_ASYMMETRIC_ACCESS_OPTIMIZED;
 		}
 	} else {
 		/* No known primary shelf. */
 		if (softc->ha_link == CTL_HA_LINK_OFFLINE) {
 			ts = TPG_ASYMMETRIC_ACCESS_UNAVAILABLE;
 			os = TPG_ASYMMETRIC_ACCESS_OPTIMIZED;
 		} else if (softc->ha_link == CTL_HA_LINK_UNKNOWN) {
 			ts = TPG_ASYMMETRIC_ACCESS_TRANSITIONING;
 			os = TPG_ASYMMETRIC_ACCESS_OPTIMIZED;
 		} else {
 			ts = os = TPG_ASYMMETRIC_ACCESS_TRANSITIONING;
 		}
 	}
 	/* Pass 2a: the shared group (group number 1), if present. */
 	if (shared_group) {
 		tpg_desc->pref_state = ts;
 		tpg_desc->support = TPG_AO_SUP | TPG_AN_SUP | TPG_S_SUP |
 		    TPG_U_SUP | TPG_T_SUP;
 		scsi_ulto2b(1, tpg_desc->target_port_group);
 		tpg_desc->status = TPG_IMPLICIT;
 		pc = 0;
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			if ((port->status & CTL_PORT_STATUS_ONLINE) == 0)
 				continue;
 			if (!softc->is_single &&
 			    (port->status & CTL_PORT_STATUS_HA_SHARED) == 0)
 				continue;
 			if (ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX)
 				continue;
 			scsi_ulto2b(port->targ_port, tpg_desc->descriptors[pc].
 			    relative_target_port_identifier);
 			pc++;
 		}
 		tpg_desc->target_port_count = pc;
 		/* The next group descriptor starts right after the ports. */
 		tpg_desc = (struct scsi_target_port_group_descriptor *)
 		    &tpg_desc->descriptors[pc];
 	}
 	/* Pass 2b: one group per HA shelf, numbered from 2. */
 	for (g = 0; g < num_ha_groups; g++) {
 		tpg_desc->pref_state = (g == pg) ? ts : os;
 		tpg_desc->support = TPG_AO_SUP | TPG_AN_SUP | TPG_S_SUP |
 		    TPG_U_SUP | TPG_T_SUP;
 		scsi_ulto2b(2 + g, tpg_desc->target_port_group);
 		tpg_desc->status = TPG_IMPLICIT;
 		pc = 0;
 		STAILQ_FOREACH(port, &softc->port_list, links) {
 			/* Only ports whose number falls in this shelf's range. */
 			if (port->targ_port < g * softc->port_cnt ||
 			    port->targ_port >= (g + 1) * softc->port_cnt)
 				continue;
 			if ((port->status & CTL_PORT_STATUS_ONLINE) == 0)
 				continue;
 			/* HA-shared ports were already listed in group 1. */
 			if (port->status & CTL_PORT_STATUS_HA_SHARED)
 				continue;
 			if (ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX)
 				continue;
 			scsi_ulto2b(port->targ_port, tpg_desc->descriptors[pc].
 			    relative_target_port_identifier);
 			pc++;
 		}
 		tpg_desc->target_port_count = pc;
 		tpg_desc = (struct scsi_target_port_group_descriptor *)
 		    &tpg_desc->descriptors[pc];
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return(retval);
 }
 
 /*
  * REPORT SUPPORTED OPERATION CODES: report which commands this LUN type
  * supports.
  *
  * The reporting option in cdb->options selects the response:
  *   RSO_OPTIONS_ALL    - one descriptor for every applicable command; for
  *                        opcodes flagged CTL_CMD_FLAG_SA5 each of the 32
  *                        service-action sub-entries is checked.
  *   RSO_OPTIONS_OC     - the requested opcode, which must not have service
  *                        actions.
  *   RSO_OPTIONS_OC_SA  - the requested opcode and service action; the
  *                        opcode must have service actions and the service
  *                        action must be below 32.
  *   RSO_OPTIONS_OC_ASA - the requested opcode with or without service
  *                        actions; a service action that cannot exist is
  *                        reported as not supported (support = 1).
  * Invalid combinations and unknown options complete the command with
  * "invalid field in CDB".
  *
  * The one-command formats report support = 3 plus the CDB length and usage
  * map when ctl_cmd_applicable() accepts the entry, otherwise support = 1.
  * The transfer is limited to the CDB's allocation length.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_report_supported_opcodes(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_report_supported_opcodes *cdb;
 	const struct ctl_cmd_entry *entry, *sentry;
 	struct scsi_report_supported_opcodes_all *all;
 	struct scsi_report_supported_opcodes_descr *descr;
 	struct scsi_report_supported_opcodes_one *one;
 	int retval;
 	int alloc_len, total_len;
 	int opcode, service_action, i, j, num;
 
 	CTL_DEBUG_PRINT(("ctl_report_supported_opcodes\n"));
 
 	cdb = (struct scsi_report_supported_opcodes *)ctsio->cdb;
 	retval = CTL_RETVAL_COMPLETE;
 
 	opcode = cdb->requested_opcode;
 	service_action = scsi_2btoul(cdb->requested_service_action);
 
 	/* First pass: validate the request and size the response. */
 	switch (cdb->options & RSO_OPTIONS_MASK) {
 	case RSO_OPTIONS_ALL:
 		num = 0;
 		for (i = 0; i < 256; i++) {
 			entry = &ctl_cmd_table[i];
 			if (entry->flags & CTL_CMD_FLAG_SA5) {
 				/* execute points at a 32-entry sub-table. */
 				for (j = 0; j < 32; j++) {
 					sentry = &((const struct ctl_cmd_entry *)
 					    entry->execute)[j];
 					if (ctl_cmd_applicable(
 					    lun->be_lun->lun_type, sentry))
 						num++;
 				}
 			} else {
 				if (ctl_cmd_applicable(lun->be_lun->lun_type,
 				    entry))
 					num++;
 			}
 		}
 		total_len = sizeof(struct scsi_report_supported_opcodes_all) +
 		    num * sizeof(struct scsi_report_supported_opcodes_descr);
 		break;
 	case RSO_OPTIONS_OC:
 		if (ctl_cmd_table[opcode].flags & CTL_CMD_FLAG_SA5) {
 			ctl_set_invalid_field(/*ctsio*/ ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 2,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 2);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		total_len = sizeof(struct scsi_report_supported_opcodes_one) + 32;
 		break;
 	case RSO_OPTIONS_OC_SA:
 		if ((ctl_cmd_table[opcode].flags & CTL_CMD_FLAG_SA5) == 0 ||
 		    service_action >= 32) {
 			ctl_set_invalid_field(/*ctsio*/ ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 2,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 2);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		/* FALLTHROUGH */
 	case RSO_OPTIONS_OC_ASA:
 		total_len = sizeof(struct scsi_report_supported_opcodes_one) + 32;
 		break;
 	default:
 		ctl_set_invalid_field(/*ctsio*/ ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 1,
 				      /*bit*/ 2);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	alloc_len = scsi_4btoul(cdb->length);
 
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(total_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/* Second pass: fill in the response. */
 	switch (cdb->options & RSO_OPTIONS_MASK) {
 	case RSO_OPTIONS_ALL:
 		all = (struct scsi_report_supported_opcodes_all *)
 		    ctsio->kern_data_ptr;
 		num = 0;
 		for (i = 0; i < 256; i++) {
 			entry = &ctl_cmd_table[i];
 			if (entry->flags & CTL_CMD_FLAG_SA5) {
 				for (j = 0; j < 32; j++) {
 					sentry = &((const struct ctl_cmd_entry *)
 					    entry->execute)[j];
 					if (!ctl_cmd_applicable(
 					    lun->be_lun->lun_type, sentry))
 						continue;
 					descr = &all->descr[num++];
 					descr->opcode = i;
 					scsi_ulto2b(j, descr->service_action);
 					descr->flags = RSO_SERVACTV;
 					scsi_ulto2b(sentry->length,
 					    descr->cdb_length);
 				}
 			} else {
 				if (!ctl_cmd_applicable(lun->be_lun->lun_type,
 				    entry))
 					continue;
 				descr = &all->descr[num++];
 				descr->opcode = i;
 				scsi_ulto2b(0, descr->service_action);
 				descr->flags = 0;
 				scsi_ulto2b(entry->length, descr->cdb_length);
 			}
 		}
 		scsi_ulto4b(
 		    num * sizeof(struct scsi_report_supported_opcodes_descr),
 		    all->length);
 		break;
 	case RSO_OPTIONS_OC:
 		one = (struct scsi_report_supported_opcodes_one *)
 		    ctsio->kern_data_ptr;
 		entry = &ctl_cmd_table[opcode];
 		goto fill_one;
 	case RSO_OPTIONS_OC_SA:
 		/* service_action < 32 was verified in the first pass. */
 		one = (struct scsi_report_supported_opcodes_one *)
 		    ctsio->kern_data_ptr;
 		entry = &ctl_cmd_table[opcode];
 		entry = &((const struct ctl_cmd_entry *)
 		    entry->execute)[service_action];
 fill_one:
 		if (ctl_cmd_applicable(lun->be_lun->lun_type, entry)) {
 			one->support = 3;
 			scsi_ulto2b(entry->length, one->cdb_length);
 			one->cdb_usage[0] = opcode;
 			memcpy(&one->cdb_usage[1], entry->usage,
 			    entry->length - 1);
 		} else
 			one->support = 1;
 		break;
 	case RSO_OPTIONS_OC_ASA:
 		one = (struct scsi_report_supported_opcodes_one *)
 		    ctsio->kern_data_ptr;
 		entry = &ctl_cmd_table[opcode];
 		if (entry->flags & CTL_CMD_FLAG_SA5) {
 			/*
 			 * Unlike RSO_OPTIONS_OC_SA, this option reaches here
 			 * without any check on service_action, which comes
 			 * straight from the CDB (0..65535).  The sub-table
 			 * only has 32 entries, so anything larger cannot be
 			 * a supported service action and must not be used
 			 * as an index.
 			 */
 			if (service_action >= 32) {
 				one->support = 1;
 				break;
 			}
 			entry = &((const struct ctl_cmd_entry *)
 			    entry->execute)[service_action];
 		} else if (service_action != 0) {
 			/* Opcode has no service actions at all. */
 			one->support = 1;
 			break;
 		}
 		goto fill_one;
 	}
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return(retval);
 }
 
 int
 ctl_report_supported_tmf(struct ctl_scsiio *ctsio)
 {
 	struct scsi_report_supported_tmf *cdb;
 	struct scsi_report_supported_tmf_ext_data *data;
 	int retval;
 	int alloc_len, total_len;
 
 	CTL_DEBUG_PRINT(("ctl_report_supported_tmf\n"));
 
 	cdb = (struct scsi_report_supported_tmf *)ctsio->cdb;
 
 	retval = CTL_RETVAL_COMPLETE;
 
 	if (cdb->options & RST_REPD)
 		total_len = sizeof(struct scsi_report_supported_tmf_ext_data);
 	else
 		total_len = sizeof(struct scsi_report_supported_tmf_data);
 	alloc_len = scsi_4btoul(cdb->length);
 
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(total_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	data = (struct scsi_report_supported_tmf_ext_data *)ctsio->kern_data_ptr;
 	data->byte1 |= RST_ATS | RST_ATSS | RST_CTSS | RST_LURS | RST_QTS |
 	    RST_TRS;
 	data->byte2 |= RST_QAES | RST_QTSS | RST_ITNRS;
 	data->length = total_len - 4;
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (retval);
 }
 
 int
 ctl_report_timestamp(struct ctl_scsiio *ctsio)
 {
 	struct scsi_report_timestamp *cdb;
 	struct scsi_report_timestamp_data *data;
 	struct timeval tv;
 	int64_t timestamp;
 	int retval;
 	int alloc_len, total_len;
 
 	CTL_DEBUG_PRINT(("ctl_report_timestamp\n"));
 
 	cdb = (struct scsi_report_timestamp *)ctsio->cdb;
 
 	retval = CTL_RETVAL_COMPLETE;
 
 	total_len = sizeof(struct scsi_report_timestamp_data);
 	alloc_len = scsi_4btoul(cdb->length);
 
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(total_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	data = (struct scsi_report_timestamp_data *)ctsio->kern_data_ptr;
 	scsi_ulto2b(sizeof(*data) - 2, data->length);
 	data->origin = RTS_ORIG_OUTSIDE;
 	getmicrotime(&tv);
 	timestamp = (int64_t)tv.tv_sec * 1000 + tv.tv_usec / 1000;
 	scsi_ulto4b(timestamp >> 16, data->timestamp);
 	scsi_ulto2b(timestamp & 0xffff, &data->timestamp[4]);
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (retval);
 }
 
 /*
  * Handle PERSISTENT RESERVE IN.
  *
  * The response is built in three steps:
  *   1. Under the LUN lock, work out how large the response for the
  *      requested service action (cdb->action) currently is.
  *   2. Drop the lock and allocate a zeroed buffer of that size with
  *      M_WAITOK.
  *   3. Retake the lock and fill the buffer in.  For the read keys, read
  *      reservation and read full status actions the size is checked again
  *      first; if the reservation state changed while the lock was dropped
  *      the buffer is freed and the whole sequence restarts at "retry".
  *
  * The amount handed to ctl_datamove() is capped at the allocation length
  * taken from the CDB.  A service action other than SPRI_RK, SPRI_RR,
  * SPRI_RC or SPRI_RS panics.  Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_persistent_reserve_in(struct ctl_scsiio *ctsio)
 {
 	struct ctl_softc *softc = CTL_SOFTC(ctsio);
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_per_res_in *cdb;
 	int alloc_len, total_len = 0;
 	/* struct scsi_per_res_in_rsrv in_data; */
 	uint64_t key;
 
 	CTL_DEBUG_PRINT(("ctl_persistent_reserve_in\n"));
 
 	cdb = (struct scsi_per_res_in *)ctsio->cdb;
 
 	alloc_len = scsi_2btoul(cdb->length);
 
 retry:
 	/* Step 1: size the response from the current reservation state. */
 	mtx_lock(&lun->lun_lock);
 	switch (cdb->action) {
 	case SPRI_RK: /* read keys */
 		total_len = sizeof(struct scsi_per_res_in_keys) +
 			lun->pr_key_count *
 			sizeof(struct scsi_per_res_key);
 		break;
 	case SPRI_RR: /* read reservation */
 		if (lun->flags & CTL_LUN_PR_RESERVED)
 			total_len = sizeof(struct scsi_per_res_in_rsrv);
 		else
 			total_len = sizeof(struct scsi_per_res_in_header);
 		break;
 	case SPRI_RC: /* report capabilities */
 		total_len = sizeof(struct scsi_per_res_cap);
 		break;
 	case SPRI_RS: /* read full status */
 		/*
 		 * Each descriptor gets 256 extra bytes beyond the fixed part;
 		 * presumably this is room for the variable-length transport
 		 * ID that ctl_create_iid() writes below.
 		 */
 		total_len = sizeof(struct scsi_per_res_in_header) +
 		    (sizeof(struct scsi_per_res_in_full_desc) + 256) *
 		    lun->pr_key_count;
 		break;
 	default:
 		panic("%s: Invalid PR type %#x", __func__, cdb->action);
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	/* Step 2: allocate with the lock dropped (M_WAITOK). */
 	ctsio->kern_data_ptr = malloc(total_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_data_len = min(total_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/* Step 3: fill the buffer in, re-validating its size where needed. */
 	mtx_lock(&lun->lun_lock);
 	switch (cdb->action) {
 	case SPRI_RK: { // read keys
         struct scsi_per_res_in_keys *res_keys;
 		int i, key_count;
 
 		res_keys = (struct scsi_per_res_in_keys*)ctsio->kern_data_ptr;
 
 		/*
 		 * We had to drop the lock to allocate our buffer, which
 		 * leaves time for someone to come in with another
 		 * persistent reservation.  (That is unlikely, though,
 		 * since this should be the only persistent reservation
 		 * command active right now.)
 		 */
 		if (total_len != (sizeof(struct scsi_per_res_in_keys) +
 		    (lun->pr_key_count *
 		     sizeof(struct scsi_per_res_key)))){
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			printf("%s: reservation length changed, retrying\n",
 			       __func__);
 			goto retry;
 		}
 
 		scsi_ulto4b(lun->pr_generation, res_keys->header.generation);
 
 		scsi_ulto4b(sizeof(struct scsi_per_res_key) *
 			     lun->pr_key_count, res_keys->header.length);
 
 		for (i = 0, key_count = 0; i < CTL_MAX_INITIATORS; i++) {
 			if ((key = ctl_get_prkey(lun, i)) == 0)
 				continue;
 
 			/*
 			 * We used lun->pr_key_count to calculate the
 			 * size to allocate.  If it turns out the number of
 			 * initiators with the registered flag set is
 			 * larger than that (i.e. they haven't been kept in
 			 * sync), we've got a problem.
 			 */
 			if (key_count >= lun->pr_key_count) {
 				/* Count it, but never write past the buffer. */
 				key_count++;
 				continue;
 			}
 			scsi_u64to8b(key, res_keys->keys[key_count].key);
 			key_count++;
 		}
 		break;
 	}
 	case SPRI_RR: { // read reservation
 		struct scsi_per_res_in_rsrv *res;
 		int tmp_len, header_only;
 
 		res = (struct scsi_per_res_in_rsrv *)ctsio->kern_data_ptr;
 
 		scsi_ulto4b(lun->pr_generation, res->header.generation);
 
 		if (lun->flags & CTL_LUN_PR_RESERVED)
 		{
 			tmp_len = sizeof(struct scsi_per_res_in_rsrv);
 			scsi_ulto4b(sizeof(struct scsi_per_res_in_rsrv_data),
 				    res->header.length);
 			header_only = 0;
 		} else {
 			tmp_len = sizeof(struct scsi_per_res_in_header);
 			scsi_ulto4b(0, res->header.length);
 			header_only = 1;
 		}
 
 		/*
 		 * We had to drop the lock to allocate our buffer, which
 		 * leaves time for someone to come in with another
 		 * persistent reservation.  (That is unlikely, though,
 		 * since this should be the only persistent reservation
 		 * command active right now.)
 		 */
 		if (tmp_len != total_len) {
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			printf("%s: reservation status changed, retrying\n",
 			       __func__);
 			goto retry;
 		}
 
 		/*
 		 * No reservation held, so we're done.
 		 */
 		if (header_only != 0)
 			break;
 
 		/*
 		 * If the registration is an All Registrants type, the key
 		 * is 0, since it doesn't really matter.
 		 */
 		if (lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) {
 			scsi_u64to8b(ctl_get_prkey(lun, lun->pr_res_idx),
 			    res->data.reservation);
 		}
 		res->data.scopetype = lun->pr_res_type;
 		break;
 	}
 	case SPRI_RC:     //report capabilities
 	{
 		/*
 		 * The capabilities response is fixed-size and does not depend
 		 * on the reservation state, so no size re-check is needed.
 		 */
 		struct scsi_per_res_cap *res_cap;
 		uint16_t type_mask;
 
 		res_cap = (struct scsi_per_res_cap *)ctsio->kern_data_ptr;
 		scsi_ulto2b(sizeof(*res_cap), res_cap->length);
 		res_cap->flags1 = SPRI_CRH;
 		res_cap->flags2 = SPRI_TMV | SPRI_ALLOW_5;
 		type_mask = SPRI_TM_WR_EX_AR |
 			    SPRI_TM_EX_AC_RO |
 			    SPRI_TM_WR_EX_RO |
 			    SPRI_TM_EX_AC |
 			    SPRI_TM_WR_EX |
 			    SPRI_TM_EX_AC_AR;
 		scsi_ulto2b(type_mask, res_cap->type_mask);
 		break;
 	}
 	case SPRI_RS: { // read full status
 		struct scsi_per_res_in_full *res_status;
 		struct scsi_per_res_in_full_desc *res_desc;
 		struct ctl_port *port;
 		int i, len;
 
 		res_status = (struct scsi_per_res_in_full*)ctsio->kern_data_ptr;
 
 		/*
 		 * We had to drop the lock to allocate our buffer, which
 		 * leaves time for someone to come in with another
 		 * persistent reservation.  (That is unlikely, though,
 		 * since this should be the only persistent reservation
 		 * command active right now.)
 		 */
 		/*
 		 * Unlike the read keys case, a buffer that is larger than
 		 * currently needed is accepted; only a too-small one retries.
 		 */
 		if (total_len < (sizeof(struct scsi_per_res_in_header) +
 		    (sizeof(struct scsi_per_res_in_full_desc) + 256) *
 		     lun->pr_key_count)){
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			printf("%s: reservation length changed, retrying\n",
 			       __func__);
 			goto retry;
 		}
 
 		scsi_ulto4b(lun->pr_generation, res_status->header.generation);
 
 		res_desc = &res_status->desc[0];
 		for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 			if ((key = ctl_get_prkey(lun, i)) == 0)
 				continue;
 
 			scsi_u64to8b(key, res_desc->res_key.key);
 			/*
 			 * Flag this registrant as a reservation holder if it
 			 * holds the reservation itself or the reservation is
 			 * an all-registrants one.
 			 */
 			if ((lun->flags & CTL_LUN_PR_RESERVED) &&
 			    (lun->pr_res_idx == i ||
 			     lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS)) {
 				res_desc->flags = SPRI_FULL_R_HOLDER;
 				res_desc->scopetype = lun->pr_res_type;
 			}
 			/*
 			 * The initiator index encodes both the port
 			 * (i / CTL_MAX_INIT_PER_PORT) and the initiator within
 			 * that port (i % CTL_MAX_INIT_PER_PORT).
 			 */
 			scsi_ulto2b(i / CTL_MAX_INIT_PER_PORT,
 			    res_desc->rel_trgt_port_id);
 			len = 0;
 			port = softc->ctl_ports[i / CTL_MAX_INIT_PER_PORT];
 			if (port != NULL)
 				len = ctl_create_iid(port,
 				    i % CTL_MAX_INIT_PER_PORT,
 				    res_desc->transport_id);
 			scsi_ulto4b(len, res_desc->additional_length);
 			/*
 			 * Descriptors are variable length: the next one starts
 			 * right after this one's transport ID.
 			 */
 			res_desc = (struct scsi_per_res_in_full_desc *)
 			    &res_desc->transport_id[len];
 		}
 		/* Header length = bytes of descriptors actually written. */
 		scsi_ulto4b((uint8_t *)res_desc - (uint8_t *)&res_status->desc[0],
 		    res_status->header.length);
 		break;
 	}
 	default:
 		panic("%s: Invalid PR type %#x", __func__, cdb->action);
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Perform the PREEMPT / PREEMPT AND ABORT service action on behalf of
  * ctl_persistent_reserve_out().
  *
  * Three situations are distinguished, all evaluated under the LUN lock:
  *   - sa_res_key == 0: only accepted while an all-registrants reservation
  *     is in place.  Every registrant other than the requester (residx) is
  *     unregistered and the reservation type is replaced by "type".
  *   - sa_res_key != 0 and the LUN is either unreserved or reserved
  *     all-registrants: every registrant whose key equals sa_res_key is
  *     unregistered; it is a reservation conflict if there is none.
  *   - sa_res_key != 0 and a single holder has the reservation: if
  *     sa_res_key is the holder's key the reservation itself is preempted
  *     and re-established for the requester, otherwise matching registrants
  *     are simply removed.
  * On success the change is forwarded to the other controller with a
  * CTL_PR_PREEMPT message after the lock has been dropped.
  *
  * Returns 0 if ctl_persistent_reserve_out() should continue, non-zero if
  * it should return.  On every non-zero return this function has already
  * freed the parameter buffer (ctsio->kern_data_ptr), set the status and
  * called ctl_done(), so the caller must not complete the I/O again.
  *
  * The softc argument is not used by this function.
  */
 static int
 ctl_pro_preempt(struct ctl_softc *softc, struct ctl_lun *lun, uint64_t res_key,
 		uint64_t sa_res_key, uint8_t type, uint32_t residx,
 		struct ctl_scsiio *ctsio, struct scsi_per_res_out *cdb,
 		struct scsi_per_res_out_parms* param)
 {
 	union ctl_ha_msg persis_io;
 	int i;
 
 	mtx_lock(&lun->lun_lock);
 	if (sa_res_key == 0) {
 		if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) {
 			/* validate scope and type */
 			if ((cdb->scope_type & SPR_SCOPE_MASK) !=
 			     SPR_LU_SCOPE) {
 				mtx_unlock(&lun->lun_lock);
 				/*
 				 * The parameter buffer is ours to release on
 				 * every error exit, as on the other paths.
 				 */
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_invalid_field(/*ctsio*/ ctsio,
 						      /*sks_valid*/ 1,
 						      /*command*/ 1,
 						      /*field*/ 2,
 						      /*bit_valid*/ 1,
 						      /*bit*/ 4);
 				ctl_done((union ctl_io *)ctsio);
 				return (1);
 			}
 
 			if (type>8 || type==2 || type==4 || type==0) {
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_invalid_field(/*ctsio*/ ctsio,
 						      /*sks_valid*/ 1,
 						      /*command*/ 1,
 						      /*field*/ 2,
 						      /*bit_valid*/ 1,
 						      /*bit*/ 0);
 				ctl_done((union ctl_io *)ctsio);
 				return (1);
 			}
 
 			/*
 			 * Unregister everybody else and build UA for
 			 * them
 			 */
 			for(i = 0; i < CTL_MAX_INITIATORS; i++) {
 				if (i == residx || ctl_get_prkey(lun, i) == 0)
 					continue;
 
 				ctl_clr_prkey(lun, i);
 				ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT);
 			}
 			/* Only the requester is left registered. */
 			lun->pr_key_count = 1;
 			lun->pr_res_type = type;
 			/*
 			 * An all-registrants type keeps pr_res_idx at
 			 * CTL_PR_ALL_REGISTRANTS; any other type makes the
 			 * requester the holder.
 			 */
 			if (lun->pr_res_type != SPR_TYPE_WR_EX_AR &&
 			    lun->pr_res_type != SPR_TYPE_EX_AC_AR)
 				lun->pr_res_idx = residx;
 			lun->pr_generation++;
 			mtx_unlock(&lun->lun_lock);
 
 			/* send msg to other side */
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_PREEMPT;
 			persis_io.pr.pr_info.residx = lun->pr_res_idx;
 			persis_io.pr.pr_info.res_type = type;
 			memcpy(persis_io.pr.pr_info.sa_res_key,
 			       param->serv_act_res_key,
 			       sizeof(param->serv_act_res_key));
 			ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io,
 			    sizeof(persis_io.pr), M_WAITOK);
 		} else {
 			/* not all registrants */
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 0,
 					      /*field*/ 8,
 					      /*bit_valid*/ 0,
 					      /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (1);
 		}
 	} else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS
 		|| !(lun->flags & CTL_LUN_PR_RESERVED)) {
 		int found = 0;
 
 		if (res_key == sa_res_key) {
 			/* special case */
 			/*
 			 * The spec implies this is not good but doesn't
 			 * say what to do. There are two choices either
 			 * generate a res conflict or check condition
 			 * with illegal field in parameter data. Since
 			 * that is what is done when the sa_res_key is
 			 * zero I'll take that approach since this has
 			 * to do with the sa_res_key.
 			 */
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 0,
 					      /*field*/ 8,
 					      /*bit_valid*/ 0,
 					      /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (1);
 		}
 
 		/* Remove every registration made with sa_res_key. */
 		for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 			if (ctl_get_prkey(lun, i) != sa_res_key)
 				continue;
 
 			found = 1;
 			ctl_clr_prkey(lun, i);
 			lun->pr_key_count--;
 			ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT);
 		}
 		if (!found) {
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			/*
 			 * The I/O is finished and its buffer is gone, so tell
 			 * the caller to stop (non-zero), exactly like every
 			 * other error exit.  Returning a value that compares
 			 * equal to 0 here would make the caller free the
 			 * buffer and complete the I/O a second time.
 			 */
 			return (1);
 		}
 		lun->pr_generation++;
 		mtx_unlock(&lun->lun_lock);
 
 		/* send msg to other side */
 		persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 		persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 		persis_io.pr.pr_info.action = CTL_PR_PREEMPT;
 		persis_io.pr.pr_info.residx = lun->pr_res_idx;
 		persis_io.pr.pr_info.res_type = type;
 		memcpy(persis_io.pr.pr_info.sa_res_key,
 		       param->serv_act_res_key,
 		       sizeof(param->serv_act_res_key));
 		ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io,
 		    sizeof(persis_io.pr), M_WAITOK);
 	} else {
 		/* Reserved but not all registrants */
 		/* sa_res_key is res holder */
 		if (sa_res_key == ctl_get_prkey(lun, lun->pr_res_idx)) {
 			/* validate scope and type */
 			if ((cdb->scope_type & SPR_SCOPE_MASK) !=
 			     SPR_LU_SCOPE) {
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_invalid_field(/*ctsio*/ ctsio,
 						      /*sks_valid*/ 1,
 						      /*command*/ 1,
 						      /*field*/ 2,
 						      /*bit_valid*/ 1,
 						      /*bit*/ 4);
 				ctl_done((union ctl_io *)ctsio);
 				return (1);
 			}
 
 			if (type>8 || type==2 || type==4 || type==0) {
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_invalid_field(/*ctsio*/ ctsio,
 						      /*sks_valid*/ 1,
 						      /*command*/ 1,
 						      /*field*/ 2,
 						      /*bit_valid*/ 1,
 						      /*bit*/ 0);
 				ctl_done((union ctl_io *)ctsio);
 				return (1);
 			}
 
 			/*
 			 * Do the following:
 			 * if sa_res_key != res_key remove all
 			 * registrants w/sa_res_key and generate UA
 			 * for these registrants(Registrations
 			 * Preempted) if it wasn't an exclusive
 			 * reservation generate UA(Reservations
 			 * Preempted) for all other registered nexuses
 			 * if the type has changed. Establish the new
 			 * reservation and holder. If res_key and
 			 * sa_res_key are the same do the above
 			 * except don't unregister the res holder.
 			 */
 
 			for(i = 0; i < CTL_MAX_INITIATORS; i++) {
 				if (i == residx || ctl_get_prkey(lun, i) == 0)
 					continue;
 
 				if (sa_res_key == ctl_get_prkey(lun, i)) {
 					ctl_clr_prkey(lun, i);
 					lun->pr_key_count--;
 					ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT);
 				} else if (type != lun->pr_res_type &&
 				    (lun->pr_res_type == SPR_TYPE_WR_EX_RO ||
 				     lun->pr_res_type == SPR_TYPE_EX_AC_RO)) {
 					ctl_est_ua(lun, i, CTL_UA_RES_RELEASE);
 				}
 			}
 			lun->pr_res_type = type;
 			if (lun->pr_res_type != SPR_TYPE_WR_EX_AR &&
 			    lun->pr_res_type != SPR_TYPE_EX_AC_AR)
 				lun->pr_res_idx = residx;
 			else
 				lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS;
 			lun->pr_generation++;
 			mtx_unlock(&lun->lun_lock);
 
 			/* send msg to other side */
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_PREEMPT;
 			persis_io.pr.pr_info.residx = lun->pr_res_idx;
 			persis_io.pr.pr_info.res_type = type;
 			memcpy(persis_io.pr.pr_info.sa_res_key,
 			       param->serv_act_res_key,
 			       sizeof(param->serv_act_res_key));
 			ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io,
 			    sizeof(persis_io.pr), M_WAITOK);
 		} else {
 			/*
 			 * sa_res_key is not the res holder just
 			 * remove registrants
 			 */
 			int found=0;
 
 			for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 				if (sa_res_key != ctl_get_prkey(lun, i))
 					continue;
 
 				found = 1;
 				ctl_clr_prkey(lun, i);
 				lun->pr_key_count--;
 				ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT);
 			}
 
 			if (!found) {
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_reservation_conflict(ctsio);
 				ctl_done((union ctl_io *)ctsio);
 				return (1);
 			}
 			lun->pr_generation++;
 			mtx_unlock(&lun->lun_lock);
 
 			/* send msg to other side */
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_PREEMPT;
 			persis_io.pr.pr_info.residx = lun->pr_res_idx;
 			persis_io.pr.pr_info.res_type = type;
 			memcpy(persis_io.pr.pr_info.sa_res_key,
 			       param->serv_act_res_key,
 			       sizeof(param->serv_act_res_key));
 			ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io,
 			    sizeof(persis_io.pr), M_WAITOK);
 		}
 	}
 	return (0);
 }
 
 /*
  * Apply a preempt described by a CTL_PR_PREEMPT message (built by
  * ctl_pro_preempt() on the originating controller) to this controller's
  * copy of the LUN's persistent reservation state, so that both sides end
  * up with the same registrations, reservation holder and generation.
  *
  * msg->pr.pr_info supplies the preempted key (sa_res_key), the new
  * reservation type (res_type) and the reservation index (residx).  No
  * validation is done here and no status is reported.
  *
  * This function takes no locks itself.
  * NOTE(review): the caller is presumably expected to hold lun->lun_lock;
  * confirm at the call site.
  */
 static void
 ctl_pro_preempt_other(struct ctl_lun *lun, union ctl_ha_msg *msg)
 {
 	uint64_t sa_res_key;
 	int i;
 
 	sa_res_key = scsi_8btou64(msg->pr.pr_info.sa_res_key);
 
 	if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS
 	 || lun->pr_res_idx == CTL_PR_NO_RESERVATION
 	 || sa_res_key != ctl_get_prkey(lun, lun->pr_res_idx)) {
 		if (sa_res_key == 0) {
 			/*
 			 * Unregister everybody else and build UA for
 			 * them
 			 */
 			for(i = 0; i < CTL_MAX_INITIATORS; i++) {
 				if (i == msg->pr.pr_info.residx ||
 				    ctl_get_prkey(lun, i) == 0)
 					continue;
 
 				ctl_clr_prkey(lun, i);
 				ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT);
 			}
 
 			lun->pr_key_count = 1;
 			lun->pr_res_type = msg->pr.pr_info.res_type;
 			if (lun->pr_res_type != SPR_TYPE_WR_EX_AR &&
 			    lun->pr_res_type != SPR_TYPE_EX_AC_AR)
 				lun->pr_res_idx = msg->pr.pr_info.residx;
 		} else {
 			/*
 			 * Remove only the registrations made with
 			 * sa_res_key, exactly as ctl_pro_preempt() did on
 			 * the originating side.  Slots holding any other key
 			 * (including unregistered slots, whose key is 0) must
 			 * be left alone, otherwise the registrations and
 			 * pr_key_count of the two controllers diverge.
 			 */
 			for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 				if (sa_res_key != ctl_get_prkey(lun, i))
 					continue;
 
 				ctl_clr_prkey(lun, i);
 				lun->pr_key_count--;
 				ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT);
 			}
 		}
 	} else {
 		/*
 		 * sa_res_key is the current holder's key: the reservation
 		 * itself is being preempted.
 		 */
 		for (i = 0; i < CTL_MAX_INITIATORS; i++) {
 			if (i == msg->pr.pr_info.residx ||
 			    ctl_get_prkey(lun, i) == 0)
 				continue;
 
 			if (sa_res_key == ctl_get_prkey(lun, i)) {
 				ctl_clr_prkey(lun, i);
 				lun->pr_key_count--;
 				ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT);
 			} else if (msg->pr.pr_info.res_type != lun->pr_res_type
 			    && (lun->pr_res_type == SPR_TYPE_WR_EX_RO ||
 			     lun->pr_res_type == SPR_TYPE_EX_AC_RO)) {
 				ctl_est_ua(lun, i, CTL_UA_RES_RELEASE);
 			}
 		}
 		lun->pr_res_type = msg->pr.pr_info.res_type;
 		if (lun->pr_res_type != SPR_TYPE_WR_EX_AR &&
 		    lun->pr_res_type != SPR_TYPE_EX_AC_AR)
 			lun->pr_res_idx = msg->pr.pr_info.residx;
 		else
 			lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS;
 	}
 	lun->pr_generation++;
 
 }
 
 
 int
 ctl_persistent_reserve_out(struct ctl_scsiio *ctsio)
 {
 	struct ctl_softc *softc = CTL_SOFTC(ctsio);
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	int retval;
 	u_int32_t param_len;
 	struct scsi_per_res_out *cdb;
 	struct scsi_per_res_out_parms* param;
 	uint32_t residx;
 	uint64_t res_key, sa_res_key, key;
 	uint8_t type;
 	union ctl_ha_msg persis_io;
 	int    i;
 
 	CTL_DEBUG_PRINT(("ctl_persistent_reserve_out\n"));
 
 	cdb = (struct scsi_per_res_out *)ctsio->cdb;
 	retval = CTL_RETVAL_COMPLETE;
 
 	/*
 	 * We only support whole-LUN scope.  The scope & type are ignored for
 	 * register, register and ignore existing key and clear.
 	 * We sometimes ignore scope and type on preempts too!!
 	 * Verify reservation type here as well.
 	 */
 	type = cdb->scope_type & SPR_TYPE_MASK;
 	if ((cdb->action == SPRO_RESERVE)
 	 || (cdb->action == SPRO_RELEASE)) {
 		if ((cdb->scope_type & SPR_SCOPE_MASK) != SPR_LU_SCOPE) {
 			ctl_set_invalid_field(/*ctsio*/ ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 2,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 4);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 
 		if (type>8 || type==2 || type==4 || type==0) {
 			ctl_set_invalid_field(/*ctsio*/ ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 2,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 	}
 
 	param_len = scsi_4btoul(cdb->length);
 
 	if ((ctsio->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0) {
 		ctsio->kern_data_ptr = malloc(param_len, M_CTL, M_WAITOK);
 		ctsio->kern_data_len = param_len;
 		ctsio->kern_total_len = param_len;
 		ctsio->kern_rel_offset = 0;
 		ctsio->kern_sg_entries = 0;
 		ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 		ctsio->be_move_done = ctl_config_move_done;
 		ctl_datamove((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	param = (struct scsi_per_res_out_parms *)ctsio->kern_data_ptr;
 
 	residx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 	res_key = scsi_8btou64(param->res_key.key);
 	sa_res_key = scsi_8btou64(param->serv_act_res_key);
 
 	/*
 	 * Validate the reservation key here except for SPRO_REG_IGNO
 	 * This must be done for all other service actions
 	 */
 	if ((cdb->action & SPRO_ACTION_MASK) != SPRO_REG_IGNO) {
 		mtx_lock(&lun->lun_lock);
 		if ((key = ctl_get_prkey(lun, residx)) != 0) {
 			if (res_key != key) {
 				/*
 				 * The current key passed in doesn't match
 				 * the one the initiator previously
 				 * registered.
 				 */
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_reservation_conflict(ctsio);
 				ctl_done((union ctl_io *)ctsio);
 				return (CTL_RETVAL_COMPLETE);
 			}
 		} else if ((cdb->action & SPRO_ACTION_MASK) != SPRO_REGISTER) {
 			/*
 			 * We are not registered
 			 */
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		} else if (res_key != 0) {
 			/*
 			 * We are not registered and trying to register but
 			 * the register key isn't zero.
 			 */
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_reservation_conflict(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		mtx_unlock(&lun->lun_lock);
 	}
 
 	switch (cdb->action & SPRO_ACTION_MASK) {
 	case SPRO_REGISTER:
 	case SPRO_REG_IGNO: {
 
 		/*
 		 * We don't support any of these options, as we report in
 		 * the read capabilities request (see
 		 * ctl_persistent_reserve_in(), above).
 		 */
 		if ((param->flags & SPR_SPEC_I_PT)
 		 || (param->flags & SPR_ALL_TG_PT)
 		 || (param->flags & SPR_APTPL)) {
 			int bit_ptr;
 
 			if (param->flags & SPR_APTPL)
 				bit_ptr = 0;
 			else if (param->flags & SPR_ALL_TG_PT)
 				bit_ptr = 2;
 			else /* SPR_SPEC_I_PT */
 				bit_ptr = 3;
 
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 0,
 					      /*field*/ 20,
 					      /*bit_valid*/ 1,
 					      /*bit*/ bit_ptr);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 
 		mtx_lock(&lun->lun_lock);
 
 		/*
 		 * The initiator wants to clear the
 		 * key/unregister.
 		 */
 		if (sa_res_key == 0) {
 			if ((res_key == 0
 			  && (cdb->action & SPRO_ACTION_MASK) == SPRO_REGISTER)
 			 || ((cdb->action & SPRO_ACTION_MASK) == SPRO_REG_IGNO
 			  && ctl_get_prkey(lun, residx) == 0)) {
 				mtx_unlock(&lun->lun_lock);
 				goto done;
 			}
 
 			ctl_clr_prkey(lun, residx);
 			lun->pr_key_count--;
 
 			if (residx == lun->pr_res_idx) {
 				lun->flags &= ~CTL_LUN_PR_RESERVED;
 				lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 
 				if ((lun->pr_res_type == SPR_TYPE_WR_EX_RO ||
 				     lun->pr_res_type == SPR_TYPE_EX_AC_RO) &&
 				    lun->pr_key_count) {
 					/*
 					 * If the reservation is a registrants
 					 * only type we need to generate a UA
 					 * for other registered inits.  The
 					 * sense code should be RESERVATIONS
 					 * RELEASED
 					 */
 
 					for (i = softc->init_min; i < softc->init_max; i++){
 						if (ctl_get_prkey(lun, i) == 0)
 							continue;
 						ctl_est_ua(lun, i,
 						    CTL_UA_RES_RELEASE);
 					}
 				}
 				lun->pr_res_type = 0;
 			} else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) {
 				if (lun->pr_key_count==0) {
 					lun->flags &= ~CTL_LUN_PR_RESERVED;
 					lun->pr_res_type = 0;
 					lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 				}
 			}
 			lun->pr_generation++;
 			mtx_unlock(&lun->lun_lock);
 
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_UNREG_KEY;
 			persis_io.pr.pr_info.residx = residx;
 			ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io,
 			    sizeof(persis_io.pr), M_WAITOK);
 		} else /* sa_res_key != 0 */ {
 
 			/*
 			 * If we aren't registered currently then increment
 			 * the key count and set the registered flag.
 			 */
 			ctl_alloc_prkey(lun, residx);
 			if (ctl_get_prkey(lun, residx) == 0)
 				lun->pr_key_count++;
 			ctl_set_prkey(lun, residx, sa_res_key);
 			lun->pr_generation++;
 			mtx_unlock(&lun->lun_lock);
 
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_REG_KEY;
 			persis_io.pr.pr_info.residx = residx;
 			memcpy(persis_io.pr.pr_info.sa_res_key,
 			       param->serv_act_res_key,
 			       sizeof(param->serv_act_res_key));
 			ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io,
 			    sizeof(persis_io.pr), M_WAITOK);
 		}
 
 		break;
 	}
 	case SPRO_RESERVE:
 		mtx_lock(&lun->lun_lock);
 		if (lun->flags & CTL_LUN_PR_RESERVED) {
 			/*
 			 * if this isn't the reservation holder and it's
 			 * not a "all registrants" type or if the type is
 			 * different then we have a conflict
 			 */
 			if ((lun->pr_res_idx != residx
 			  && lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS)
 			 || lun->pr_res_type != type) {
 				mtx_unlock(&lun->lun_lock);
 				free(ctsio->kern_data_ptr, M_CTL);
 				ctl_set_reservation_conflict(ctsio);
 				ctl_done((union ctl_io *)ctsio);
 				return (CTL_RETVAL_COMPLETE);
 			}
 			mtx_unlock(&lun->lun_lock);
 		} else /* create a reservation */ {
 			/*
 			 * If it's not an "all registrants" type record
 			 * reservation holder
 			 */
 			if (type != SPR_TYPE_WR_EX_AR
 			 && type != SPR_TYPE_EX_AC_AR)
 				lun->pr_res_idx = residx; /* Res holder */
 			else
 				lun->pr_res_idx = CTL_PR_ALL_REGISTRANTS;
 
 			lun->flags |= CTL_LUN_PR_RESERVED;
 			lun->pr_res_type = type;
 
 			mtx_unlock(&lun->lun_lock);
 
 			/* send msg to other side */
 			persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 			persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 			persis_io.pr.pr_info.action = CTL_PR_RESERVE;
 			persis_io.pr.pr_info.residx = lun->pr_res_idx;
 			persis_io.pr.pr_info.res_type = type;
 			ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io,
 			    sizeof(persis_io.pr), M_WAITOK);
 		}
 		break;
 
 	case SPRO_RELEASE:
 		mtx_lock(&lun->lun_lock);
 		if ((lun->flags & CTL_LUN_PR_RESERVED) == 0) {
 			/* No reservation exists return good status */
 			mtx_unlock(&lun->lun_lock);
 			goto done;
 		}
 		/*
 		 * Is this nexus a reservation holder?
 		 */
 		if (lun->pr_res_idx != residx
 		 && lun->pr_res_idx != CTL_PR_ALL_REGISTRANTS) {
 			/*
 			 * not a res holder return good status but
 			 * do nothing
 			 */
 			mtx_unlock(&lun->lun_lock);
 			goto done;
 		}
 
 		if (lun->pr_res_type != type) {
 			mtx_unlock(&lun->lun_lock);
 			free(ctsio->kern_data_ptr, M_CTL);
 			ctl_set_illegal_pr_release(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 
 		/* okay to release */
 		lun->flags &= ~CTL_LUN_PR_RESERVED;
 		lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 		lun->pr_res_type = 0;
 
 		/*
 		 * If this isn't an exclusive access reservation and NUAR
 		 * is not set, generate UA for all other registrants.
 		 */
 		if (type != SPR_TYPE_EX_AC && type != SPR_TYPE_WR_EX &&
 		    (lun->MODE_CTRL.queue_flags & SCP_NUAR) == 0) {
 			for (i = softc->init_min; i < softc->init_max; i++) {
 				if (i == residx || ctl_get_prkey(lun, i) == 0)
 					continue;
 				ctl_est_ua(lun, i, CTL_UA_RES_RELEASE);
 			}
 		}
 		mtx_unlock(&lun->lun_lock);
 
 		/* Send msg to other side */
 		persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 		persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 		persis_io.pr.pr_info.action = CTL_PR_RELEASE;
 		ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io,
 		     sizeof(persis_io.pr), M_WAITOK);
 		break;
 
 	case SPRO_CLEAR:
 		/* send msg to other side */
 
 		mtx_lock(&lun->lun_lock);
 		lun->flags &= ~CTL_LUN_PR_RESERVED;
 		lun->pr_res_type = 0;
 		lun->pr_key_count = 0;
 		lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 
 		ctl_clr_prkey(lun, residx);
 		for (i = 0; i < CTL_MAX_INITIATORS; i++)
 			if (ctl_get_prkey(lun, i) != 0) {
 				ctl_clr_prkey(lun, i);
 				ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT);
 			}
 		lun->pr_generation++;
 		mtx_unlock(&lun->lun_lock);
 
 		persis_io.hdr.nexus = ctsio->io_hdr.nexus;
 		persis_io.hdr.msg_type = CTL_MSG_PERS_ACTION;
 		persis_io.pr.pr_info.action = CTL_PR_CLEAR;
 		ctl_ha_msg_send(CTL_HA_CHAN_CTL, &persis_io,
 		     sizeof(persis_io.pr), M_WAITOK);
 		break;
 
 	case SPRO_PREEMPT:
 	case SPRO_PRE_ABO: {
 		int nretval;
 
 		nretval = ctl_pro_preempt(softc, lun, res_key, sa_res_key, type,
 					  residx, ctsio, cdb, param);
 		if (nretval != 0)
 			return (CTL_RETVAL_COMPLETE);
 		break;
 	}
 	default:
 		panic("%s: Invalid PR type %#x", __func__, cdb->action);
 	}
 
 done:
 	free(ctsio->kern_data_ptr, M_CTL);
 	ctl_set_success(ctsio);
 	ctl_done((union ctl_io *)ctsio);
 
 	return (retval);
 }
 
 /*
  * This routine is for handling a message from the other SC pertaining to
  * persistent reserve out. All the error checking will have been done
  * so only performing the action need be done here to keep the two
  * in sync.
  */
 static void
 ctl_hndl_per_res_out_on_other_sc(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	union ctl_ha_msg *msg = (union ctl_ha_msg *)&io->presio.pr_msg;
 	struct ctl_lun *lun;
 	int i;
 	uint32_t residx, targ_lun;
 
 	targ_lun = msg->hdr.nexus.targ_mapped_lun;
 	mtx_lock(&softc->ctl_lock);
 	if (targ_lun >= ctl_max_luns ||
 	    (lun = softc->ctl_luns[targ_lun]) == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 		return;
 	}
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&softc->ctl_lock);
 	if (lun->flags & CTL_LUN_DISABLED) {
 		mtx_unlock(&lun->lun_lock);
 		return;
 	}
 	residx = ctl_get_initindex(&msg->hdr.nexus);
 	switch(msg->pr.pr_info.action) {
 	case CTL_PR_REG_KEY:
 		ctl_alloc_prkey(lun, msg->pr.pr_info.residx);
 		if (ctl_get_prkey(lun, msg->pr.pr_info.residx) == 0)
 			lun->pr_key_count++;
 		ctl_set_prkey(lun, msg->pr.pr_info.residx,
 		    scsi_8btou64(msg->pr.pr_info.sa_res_key));
 		lun->pr_generation++;
 		break;
 
 	case CTL_PR_UNREG_KEY:
 		ctl_clr_prkey(lun, msg->pr.pr_info.residx);
 		lun->pr_key_count--;
 
 		/* XXX Need to see if the reservation has been released */
 		/* if so do we need to generate UA? */
 		if (msg->pr.pr_info.residx == lun->pr_res_idx) {
 			lun->flags &= ~CTL_LUN_PR_RESERVED;
 			lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 
 			if ((lun->pr_res_type == SPR_TYPE_WR_EX_RO ||
 			     lun->pr_res_type == SPR_TYPE_EX_AC_RO) &&
 			    lun->pr_key_count) {
 				/*
 				 * If the reservation is a registrants
 				 * only type we need to generate a UA
 				 * for other registered inits.  The
 				 * sense code should be RESERVATIONS
 				 * RELEASED
 				 */
 
 				for (i = softc->init_min; i < softc->init_max; i++) {
 					if (ctl_get_prkey(lun, i) == 0)
 						continue;
 
 					ctl_est_ua(lun, i, CTL_UA_RES_RELEASE);
 				}
 			}
 			lun->pr_res_type = 0;
 		} else if (lun->pr_res_idx == CTL_PR_ALL_REGISTRANTS) {
 			if (lun->pr_key_count==0) {
 				lun->flags &= ~CTL_LUN_PR_RESERVED;
 				lun->pr_res_type = 0;
 				lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 			}
 		}
 		lun->pr_generation++;
 		break;
 
 	case CTL_PR_RESERVE:
 		lun->flags |= CTL_LUN_PR_RESERVED;
 		lun->pr_res_type = msg->pr.pr_info.res_type;
 		lun->pr_res_idx = msg->pr.pr_info.residx;
 
 		break;
 
 	case CTL_PR_RELEASE:
 		/*
 		 * If this isn't an exclusive access reservation and NUAR
 		 * is not set, generate UA for all other registrants.
 		 */
 		if (lun->pr_res_type != SPR_TYPE_EX_AC &&
 		    lun->pr_res_type != SPR_TYPE_WR_EX &&
 		    (lun->MODE_CTRL.queue_flags & SCP_NUAR) == 0) {
 			for (i = softc->init_min; i < softc->init_max; i++) {
 				if (i == residx || ctl_get_prkey(lun, i) == 0)
 					continue;
 				ctl_est_ua(lun, i, CTL_UA_RES_RELEASE);
 			}
 		}
 
 		lun->flags &= ~CTL_LUN_PR_RESERVED;
 		lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 		lun->pr_res_type = 0;
 		break;
 
 	case CTL_PR_PREEMPT:
 		ctl_pro_preempt_other(lun, msg);
 		break;
 	case CTL_PR_CLEAR:
 		lun->flags &= ~CTL_LUN_PR_RESERVED;
 		lun->pr_res_type = 0;
 		lun->pr_key_count = 0;
 		lun->pr_res_idx = CTL_PR_NO_RESERVATION;
 
 		for (i=0; i < CTL_MAX_INITIATORS; i++) {
 			if (ctl_get_prkey(lun, i) == 0)
 				continue;
 			ctl_clr_prkey(lun, i);
 			ctl_est_ua(lun, i, CTL_UA_REG_PREEMPT);
 		}
 		lun->pr_generation++;
 		break;
 	}
 
 	mtx_unlock(&lun->lun_lock);
 }
 
 /*
  * Common handler for the READ_*, WRITE_*, WRITE_VERIFY_* and
  * WRITE_ATOMIC_16 opcodes.
  *
  * Decodes the LBA, block count and FUA/DPO bits from the CDB, validates
  * the block range against the backend LUN's maxlba, records the request in
  * ctl_private[CTL_PRIV_LBA_LEN] and submits it to the backend.
  *
  * Returns CTL_RETVAL_COMPLETE when the command is finished here (bad
  * opcode, bad field, range error, or zero-length transfer); otherwise
  * returns whatever the backend's data_submit() returns.
  */
 int
 ctl_read_write(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct ctl_lba_len_flags *lbalen;
 	uint64_t lba;
 	uint32_t num_blocks;
 	int flags, retval;
 	int isread;
 
 	CTL_DEBUG_PRINT(("ctl_read_write: command: %#x\n", ctsio->cdb[0]));
 
 	flags = 0;
 	/* Every opcode that is not one of the four READs is a write. */
 	isread = ctsio->cdb[0] == READ_6  || ctsio->cdb[0] == READ_10
 	      || ctsio->cdb[0] == READ_12 || ctsio->cdb[0] == READ_16;
 	switch (ctsio->cdb[0]) {
 	case READ_6:
 	case WRITE_6: {
 		struct scsi_rw_6 *cdb;
 
 		cdb = (struct scsi_rw_6 *)ctsio->cdb;
 
 		lba = scsi_3btoul(cdb->addr);
 		/* only 5 bits are valid in the most significant address byte */
 		lba &= 0x1fffff;
 		num_blocks = cdb->length;
 		/*
 		 * This is correct according to SBC-2.
 		 */
 		if (num_blocks == 0)
 			num_blocks = 256;
 		break;
 	}
 	case READ_10:
 	case WRITE_10: {
 		struct scsi_rw_10 *cdb;
 
 		cdb = (struct scsi_rw_10 *)ctsio->cdb;
 		if (cdb->byte2 & SRW10_FUA)
 			flags |= CTL_LLF_FUA;
 		if (cdb->byte2 & SRW10_DPO)
 			flags |= CTL_LLF_DPO;
 		lba = scsi_4btoul(cdb->addr);
 		num_blocks = scsi_2btoul(cdb->length);
 		break;
 	}
 	case WRITE_VERIFY_10: {
 		struct scsi_write_verify_10 *cdb;
 
 		cdb = (struct scsi_write_verify_10 *)ctsio->cdb;
 		/* The write-verify variants always set FUA. */
 		flags |= CTL_LLF_FUA;
 		if (cdb->byte2 & SWV_DPO)
 			flags |= CTL_LLF_DPO;
 		lba = scsi_4btoul(cdb->addr);
 		num_blocks = scsi_2btoul(cdb->length);
 		break;
 	}
 	case READ_12:
 	case WRITE_12: {
 		struct scsi_rw_12 *cdb;
 
 		cdb = (struct scsi_rw_12 *)ctsio->cdb;
 		if (cdb->byte2 & SRW12_FUA)
 			flags |= CTL_LLF_FUA;
 		if (cdb->byte2 & SRW12_DPO)
 			flags |= CTL_LLF_DPO;
 		lba = scsi_4btoul(cdb->addr);
 		num_blocks = scsi_4btoul(cdb->length);
 		break;
 	}
 	case WRITE_VERIFY_12: {
 		struct scsi_write_verify_12 *cdb;
 
 		cdb = (struct scsi_write_verify_12 *)ctsio->cdb;
 		flags |= CTL_LLF_FUA;
 		if (cdb->byte2 & SWV_DPO)
 			flags |= CTL_LLF_DPO;
 		lba = scsi_4btoul(cdb->addr);
 		num_blocks = scsi_4btoul(cdb->length);
 		break;
 	}
 	case READ_16:
 	case WRITE_16: {
 		struct scsi_rw_16 *cdb;
 
 		cdb = (struct scsi_rw_16 *)ctsio->cdb;
 		/*
 		 * NOTE(review): the 16-byte CDB is tested with the SRW12_*
 		 * masks; presumably the bit positions are shared with the
 		 * 12-byte CDB -- confirm against the header.
 		 */
 		if (cdb->byte2 & SRW12_FUA)
 			flags |= CTL_LLF_FUA;
 		if (cdb->byte2 & SRW12_DPO)
 			flags |= CTL_LLF_DPO;
 		lba = scsi_8btou64(cdb->addr);
 		num_blocks = scsi_4btoul(cdb->length);
 		break;
 	}
 	case WRITE_ATOMIC_16: {
 		struct scsi_write_atomic_16 *cdb;
 
 		/* A backend atomicblock of 0 rejects the opcode outright. */
 		if (lun->be_lun->atomicblock == 0) {
 			ctl_set_invalid_opcode(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 
 		cdb = (struct scsi_write_atomic_16 *)ctsio->cdb;
 		if (cdb->byte2 & SRW12_FUA)
 			flags |= CTL_LLF_FUA;
 		if (cdb->byte2 & SRW12_DPO)
 			flags |= CTL_LLF_DPO;
 		lba = scsi_8btou64(cdb->addr);
 		num_blocks = scsi_2btoul(cdb->length);
 		/* Transfers larger than atomicblock blocks are refused. */
 		if (num_blocks > lun->be_lun->atomicblock) {
 			ctl_set_invalid_field(ctsio, /*sks_valid*/ 1,
 			    /*command*/ 1, /*field*/ 12, /*bit_valid*/ 0,
 			    /*bit*/ 0);
 			ctl_done((union ctl_io *)ctsio);
 			return (CTL_RETVAL_COMPLETE);
 		}
 		break;
 	}
 	case WRITE_VERIFY_16: {
 		struct scsi_write_verify_16 *cdb;
 
 		cdb = (struct scsi_write_verify_16 *)ctsio->cdb;
 		flags |= CTL_LLF_FUA;
 		if (cdb->byte2 & SWV_DPO)
 			flags |= CTL_LLF_DPO;
 		lba = scsi_8btou64(cdb->addr);
 		num_blocks = scsi_4btoul(cdb->length);
 		break;
 	}
 	default:
 		/*
 		 * We got a command we don't support.  This shouldn't
 		 * happen, commands should be filtered out above us.
 		 */
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 
 		return (CTL_RETVAL_COMPLETE);
 		break; /* NOTREACHED */
 	}
 
 	/*
 	 * The first check is to make sure we're in bounds, the second
 	 * check is to catch wrap-around problems.  If the lba + num blocks
 	 * is less than the lba, then we've wrapped around and the block
 	 * range is invalid anyway.
 	 */
 	if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
 	 || ((lba + num_blocks) < lba)) {
 		/* Report the larger of the start LBA and the first LBA past the end. */
 		ctl_set_lba_out_of_range(ctsio,
 		    MAX(lba, lun->be_lun->maxlba + 1));
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * According to SBC-3, a transfer length of 0 is not an error.
 	 * Note that this cannot happen with WRITE(6) or READ(6), since 0
 	 * translates to 256 blocks for those commands.
 	 */
 	if (num_blocks == 0) {
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Set FUA and/or DPO if caches are disabled. */
 	if (isread) {
 		if ((lun->MODE_CACHING.flags1 & SCP_RCD) != 0)
 			flags |= CTL_LLF_FUA | CTL_LLF_DPO;
 	} else {
 		if ((lun->MODE_CACHING.flags1 & SCP_WCE) == 0)
 			flags |= CTL_LLF_FUA;
 	}
 
 	/* Stash the decoded request in the I/O's private area. */
 	lbalen = (struct ctl_lba_len_flags *)
 	    &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	lbalen->lba = lba;
 	lbalen->len = num_blocks;
 	lbalen->flags = (isread ? CTL_LLF_READ : CTL_LLF_WRITE) | flags;
 
 	/*
 	 * NOTE(review): the byte count is the plain product of the block
 	 * count and the backend block size; confirm it cannot overflow the
 	 * destination field for large transfers.
 	 */
 	ctsio->kern_total_len = num_blocks * lun->be_lun->blocksize;
 	ctsio->kern_rel_offset = 0;
 
 	CTL_DEBUG_PRINT(("ctl_read_write: calling data_submit()\n"));
 
 	retval = lun->backend->data_submit((union ctl_io *)ctsio);
 	return (retval);
 }
 
 /*
  * Continuation for ctl_cnw(): resets the I/O status, clears the IO_CONT
  * flag, turns the stored request from a compare into a write and hands the
  * I/O to the backend again.  Returns the backend's data_submit() result.
  */
 static int
 ctl_cnw_cont(union ctl_io *io)
 {
 	struct ctl_scsiio *ctsio = &io->scsiio;
 	struct ctl_lun *lun = CTL_LUN(io);
 	struct ctl_lba_len_flags *llf;
 
 	/* This I/O must not be routed back here a second time. */
 	ctsio->io_hdr.flags &= ~CTL_FLAG_IO_CONT;
 	ctsio->io_hdr.status = CTL_STATUS_NONE;
 
 	/* Swap the COMPARE operation for a WRITE in the saved flags. */
 	llf = (struct ctl_lba_len_flags *)
 	    &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	llf->flags = (llf->flags & ~CTL_LLF_COMPARE) | CTL_LLF_WRITE;
 
 	CTL_DEBUG_PRINT(("ctl_cnw_cont: calling data_submit()\n"));
 	return (lun->backend->data_submit((union ctl_io *)ctsio));
 }
 
 /*
  * Handler for the COMPARE_AND_WRITE opcode.
  *
  * Validates the block range, records the request as a compare operation
  * and arranges for ctl_cnw_cont() to run as the continuation.  Returns
  * CTL_RETVAL_COMPLETE when the command is finished here, otherwise the
  * backend's data_submit() result.
  */
 int
 ctl_cnw(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_compare_and_write *cdb;
 	struct ctl_lba_len_flags *llf;
 	uint64_t lba, end;
 	uint32_t num_blocks;
 	int flags;
 
 	CTL_DEBUG_PRINT(("ctl_cnw: command: %#x\n", ctsio->cdb[0]));
 
 	/*
 	 * Anything other than COMPARE AND WRITE is unsupported here.  It
 	 * should have been filtered out before reaching this function.
 	 */
 	if (ctsio->cdb[0] != COMPARE_AND_WRITE) {
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	cdb = (struct scsi_compare_and_write *)ctsio->cdb;
 	flags = 0;
 	if (cdb->byte2 & SRW10_FUA)
 		flags |= CTL_LLF_FUA;
 	if (cdb->byte2 & SRW10_DPO)
 		flags |= CTL_LLF_DPO;
 	lba = scsi_8btou64(cdb->addr);
 	num_blocks = cdb->length;
 
 	/*
 	 * Reject ranges that run past the end of the LUN, as well as ranges
 	 * whose end wrapped around below the starting LBA.
 	 */
 	end = lba + num_blocks;
 	if (end > (lun->be_lun->maxlba + 1) || end < lba) {
 		ctl_set_lba_out_of_range(ctsio,
 		    MAX(lba, lun->be_lun->maxlba + 1));
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* Per SBC-3 a zero transfer length is a successful no-op. */
 	if (num_blocks == 0) {
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/* With the write cache disabled, force unit access. */
 	if ((lun->MODE_CACHING.flags1 & SCP_WCE) == 0)
 		flags |= CTL_LLF_FUA;
 
 	/* Save the request, marked as a compare. */
 	llf = (struct ctl_lba_len_flags *)
 	    &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	llf->lba = lba;
 	llf->len = num_blocks;
 	llf->flags = CTL_LLF_COMPARE | flags;
 
 	/* The transfer is sized at twice the block range. */
 	ctsio->kern_total_len = 2 * num_blocks * lun->be_lun->blocksize;
 	ctsio->kern_rel_offset = 0;
 
 	/*
 	 * Mark the I/O as having a continuation so that, once it reaches
 	 * ctl_data_submit_done(), it is handed to ctl_cnw_cont() for the
 	 * write half of the operation.
 	 */
 	ctsio->io_cont = ctl_cnw_cont;
 	ctsio->io_hdr.flags |= CTL_FLAG_IO_CONT;
 
 	CTL_DEBUG_PRINT(("ctl_cnw: calling data_submit()\n"));
 	return (lun->backend->data_submit((union ctl_io *)ctsio));
 }
 
 int
 ctl_verify(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct ctl_lba_len_flags *lbalen;
 	uint64_t lba;
 	uint32_t num_blocks;
 	int bytchk, flags;
 	int retval;
 
 	CTL_DEBUG_PRINT(("ctl_verify: command: %#x\n", ctsio->cdb[0]));
 
 	bytchk = 0;
 	flags = CTL_LLF_FUA;
 	switch (ctsio->cdb[0]) {
 	case VERIFY_10: {
 		struct scsi_verify_10 *cdb;
 
 		cdb = (struct scsi_verify_10 *)ctsio->cdb;
 		if (cdb->byte2 & SVFY_BYTCHK)
 			bytchk = 1;
 		if (cdb->byte2 & SVFY_DPO)
 			flags |= CTL_LLF_DPO;
 		lba = scsi_4btoul(cdb->addr);
 		num_blocks = scsi_2btoul(cdb->length);
 		break;
 	}
 	case VERIFY_12: {
 		struct scsi_verify_12 *cdb;
 
 		cdb = (struct scsi_verify_12 *)ctsio->cdb;
 		if (cdb->byte2 & SVFY_BYTCHK)
 			bytchk = 1;
 		if (cdb->byte2 & SVFY_DPO)
 			flags |= CTL_LLF_DPO;
 		lba = scsi_4btoul(cdb->addr);
 		num_blocks = scsi_4btoul(cdb->length);
 		break;
 	}
 	case VERIFY_16: {
 		struct scsi_rw_16 *cdb;
 
 		cdb = (struct scsi_rw_16 *)ctsio->cdb;
 		if (cdb->byte2 & SVFY_BYTCHK)
 			bytchk = 1;
 		if (cdb->byte2 & SVFY_DPO)
 			flags |= CTL_LLF_DPO;
 		lba = scsi_8btou64(cdb->addr);
 		num_blocks = scsi_4btoul(cdb->length);
 		break;
 	}
 	default:
 		/*
 		 * We got a command we don't support.  This shouldn't
 		 * happen, commands should be filtered out above us.
 		 */
 		ctl_set_invalid_opcode(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * The first check is to make sure we're in bounds, the second
 	 * check is to catch wrap-around problems.  If the lba + num blocks
 	 * is less than the lba, then we've wrapped around and the block
 	 * range is invalid anyway.
 	 */
 	if (((lba + num_blocks) > (lun->be_lun->maxlba + 1))
 	 || ((lba + num_blocks) < lba)) {
 		ctl_set_lba_out_of_range(ctsio,
 		    MAX(lba, lun->be_lun->maxlba + 1));
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	/*
 	 * According to SBC-3, a transfer length of 0 is not an error.
 	 */
 	if (num_blocks == 0) {
 		ctl_set_success(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	lbalen = (struct ctl_lba_len_flags *)
 	    &ctsio->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	lbalen->lba = lba;
 	lbalen->len = num_blocks;
 	if (bytchk) {
 		lbalen->flags = CTL_LLF_COMPARE | flags;
 		ctsio->kern_total_len = num_blocks * lun->be_lun->blocksize;
 	} else {
 		lbalen->flags = CTL_LLF_VERIFY | flags;
 		ctsio->kern_total_len = 0;
 	}
 	ctsio->kern_rel_offset = 0;
 
 	CTL_DEBUG_PRINT(("ctl_verify: calling data_submit()\n"));
 	retval = lun->backend->data_submit((union ctl_io *)ctsio);
 	return (retval);
 }
 
 /*
  * Handler for REPORT LUNS.
  *
  * Counts the LUNs mapped on the requesting port, allocates a response
  * buffer for that many entries, fills it in, and starts the data move.
  * For every LUN reported, the CTL_UA_LUN_CHANGE unit attention of the
  * requesting initiator is cleared, provided the command was addressed to
  * an existing LUN.  Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_report_luns(struct ctl_scsiio *ctsio)
 {
 	struct ctl_softc *softc = CTL_SOFTC(ctsio);
 	struct ctl_port *port = CTL_PORT(ctsio);
 	struct ctl_lun *lun, *request_lun = CTL_LUN(ctsio);
 	struct scsi_report_luns *cdb;
 	struct scsi_report_luns_data *lun_data;
 	int num_filled, num_luns, num_port_luns, retval;
 	uint32_t alloc_len, lun_datalen;
 	uint32_t initidx, targ_lun_id, lun_id;
 
 	retval = CTL_RETVAL_COMPLETE;
 	cdb = (struct scsi_report_luns *)ctsio->cdb;
 
 	CTL_DEBUG_PRINT(("ctl_report_luns\n"));
 
 	/*
 	 * First pass: count the LUN ids that map to something on this port,
 	 * to size the response buffer.  Without a port LUN map the whole
 	 * ctl_max_luns range is scanned.
 	 */
 	num_luns = 0;
 	num_port_luns = port->lun_map ? port->lun_map_size : ctl_max_luns;
 	mtx_lock(&softc->ctl_lock);
 	for (targ_lun_id = 0; targ_lun_id < num_port_luns; targ_lun_id++) {
 		if (ctl_lun_map_from_port(port, targ_lun_id) != UINT32_MAX)
 			num_luns++;
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	switch (cdb->select_report) {
 	case RPL_REPORT_DEFAULT:
 	case RPL_REPORT_ALL:
 	case RPL_REPORT_NONSUBSID:
 		break;
 	case RPL_REPORT_WELLKNOWN:
 	case RPL_REPORT_ADMIN:
 	case RPL_REPORT_CONGLOM:
 		/* These report types yield an empty LUN list. */
 		num_luns = 0;
 		break;
 	default:
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (retval);
 		break; /* NOTREACHED */
 	}
 
 	alloc_len = scsi_4btoul(cdb->length);
 	/*
 	 * The initiator has to allocate at least 16 bytes for this request,
 	 * so he can at least get the header and the first LUN.  Otherwise
 	 * we reject the request (per SPC-3 rev 14, section 6.21).
 	 */
 	if (alloc_len < (sizeof(struct scsi_report_luns_data) +
 	    sizeof(struct scsi_report_luns_lundata))) {
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 6,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (retval);
 	}
 
 	/* Header plus one entry per LUN counted above. */
 	lun_datalen = sizeof(*lun_data) +
 		(num_luns * sizeof(struct scsi_report_luns_lundata));
 
 	ctsio->kern_data_ptr = malloc(lun_datalen, M_CTL, M_WAITOK | M_ZERO);
 	lun_data = (struct scsi_report_luns_data *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 
 	/*
 	 * Second pass: fill in the entries.  The softc lock was dropped
 	 * since the count was taken, so the loop is additionally bounded by
 	 * num_filled < num_luns to stay within the allocated buffer.
 	 */
 	mtx_lock(&softc->ctl_lock);
 	for (targ_lun_id = 0, num_filled = 0;
 	    targ_lun_id < num_port_luns && num_filled < num_luns;
 	    targ_lun_id++) {
 		lun_id = ctl_lun_map_from_port(port, targ_lun_id);
 		if (lun_id == UINT32_MAX)
 			continue;
 		/* Mapped ids without a LUN behind them are not reported. */
 		lun = softc->ctl_luns[lun_id];
 		if (lun == NULL)
 			continue;
 
 		be64enc(lun_data->luns[num_filled++].lundata,
 		    ctl_encode_lun(targ_lun_id));
 
 		/*
 		 * According to SPC-3, rev 14 section 6.21:
 		 *
 		 * "The execution of a REPORT LUNS command to any valid and
 		 * installed logical unit shall clear the REPORTED LUNS DATA
 		 * HAS CHANGED unit attention condition for all logical
 		 * units of that target with respect to the requesting
 		 * initiator. A valid and installed logical unit is one
 		 * having a PERIPHERAL QUALIFIER of 000b in the standard
 		 * INQUIRY data (see 6.4.2)."
 		 *
 		 * If request_lun is NULL, the LUN this report luns command
 		 * was issued to is either disabled or doesn't exist. In that
 		 * case, we shouldn't clear any pending lun change unit
 		 * attention.
 		 */
 		if (request_lun != NULL) {
 			mtx_lock(&lun->lun_lock);
 			ctl_clr_ua(lun, initidx, CTL_UA_LUN_CHANGE);
 			mtx_unlock(&lun->lun_lock);
 		}
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	/*
 	 * It's quite possible that we've returned fewer LUNs than we allocated
 	 * space for.  Trim it.
 	 */
 	lun_datalen = sizeof(*lun_data) +
 		(num_filled * sizeof(struct scsi_report_luns_lundata));
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 	/* Never transfer more than the initiator allocated. */
 	ctsio->kern_data_len = min(lun_datalen, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * We set this to the actual data length, regardless of how much
 	 * space we actually have to return results.  If the user looks at
 	 * this value, he'll know whether or not he allocated enough space
 	 * and reissue the command if necessary.  We don't support well
 	 * known logical units, so if the user asks for that, return none.
 	 *
 	 * NOTE(review): the 8 subtracted here is presumably the size of the
 	 * response header, which the length field does not count -- confirm
 	 * against struct scsi_report_luns_data.
 	 */
 	scsi_ulto4b(lun_datalen - 8, lun_data->length);
 
 	/*
 	 * We can only return SCSI_STATUS_CHECK_COND when we can't satisfy
 	 * this request.
 	 */
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (retval);
 }
 
 int
 ctl_request_sense(struct ctl_scsiio *ctsio)
 {
 	struct ctl_softc *softc = CTL_SOFTC(ctsio);
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_request_sense *cdb;
 	struct scsi_sense_data *sense_ptr, *ps;
 	uint32_t initidx;
 	int have_error;
 	u_int sense_len = SSD_FULL_SIZE;
 	scsi_sense_data_type sense_format;
 	ctl_ua_type ua_type;
 	uint8_t asc = 0, ascq = 0;
 
 	cdb = (struct scsi_request_sense *)ctsio->cdb;
 
 	CTL_DEBUG_PRINT(("ctl_request_sense\n"));
 
 	/*
 	 * Determine which sense format the user wants.
 	 */
 	if (cdb->byte2 & SRS_DESC)
 		sense_format = SSD_TYPE_DESC;
 	else
 		sense_format = SSD_TYPE_FIXED;
 
 	ctsio->kern_data_ptr = malloc(sizeof(*sense_ptr), M_CTL, M_WAITOK);
 	sense_ptr = (struct scsi_sense_data *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 
 	/*
 	 * struct scsi_sense_data, which is currently set to 256 bytes, is
 	 * larger than the largest allowed value for the length field in the
 	 * REQUEST SENSE CDB, which is 252 bytes as of SPC-4.
 	 */
 	ctsio->kern_data_len = cdb->length;
 	ctsio->kern_total_len = cdb->length;
 
 	/*
 	 * If we don't have a LUN, we don't have any pending sense.
 	 */
 	if (lun == NULL ||
 	    ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 &&
 	     softc->ha_link < CTL_HA_LINK_UNKNOWN)) {
 		/* "Logical unit not supported" */
 		ctl_set_sense_data(sense_ptr, &sense_len, NULL, sense_format,
 		    /*current_error*/ 1,
 		    /*sense_key*/ SSD_KEY_ILLEGAL_REQUEST,
 		    /*asc*/ 0x25,
 		    /*ascq*/ 0x00,
 		    SSD_ELEM_NONE);
 		goto send;
 	}
 
 	have_error = 0;
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 	/*
 	 * Check for pending sense, and then for pending unit attentions.
 	 * Pending sense gets returned first, then pending unit attentions.
 	 */
 	mtx_lock(&lun->lun_lock);
 	ps = lun->pending_sense[initidx / CTL_MAX_INIT_PER_PORT];
 	if (ps != NULL)
 		ps += initidx % CTL_MAX_INIT_PER_PORT;
 	if (ps != NULL && ps->error_code != 0) {
 		scsi_sense_data_type stored_format;
 
 		/*
 		 * Check to see which sense format was used for the stored
 		 * sense data.
 		 */
 		stored_format = scsi_sense_type(ps);
 
 		/*
 		 * If the user requested a different sense format than the
 		 * one we stored, then we need to convert it to the other
 		 * format.  If we're going from descriptor to fixed format
 		 * sense data, we may lose things in translation, depending
 		 * on what options were used.
 		 *
 		 * If the stored format is SSD_TYPE_NONE (i.e. invalid),
 		 * for some reason we'll just copy it out as-is.
 		 */
 		if ((stored_format == SSD_TYPE_FIXED)
 		 && (sense_format == SSD_TYPE_DESC))
 			ctl_sense_to_desc((struct scsi_sense_data_fixed *)
 			    ps, (struct scsi_sense_data_desc *)sense_ptr);
 		else if ((stored_format == SSD_TYPE_DESC)
 		      && (sense_format == SSD_TYPE_FIXED))
 			ctl_sense_to_fixed((struct scsi_sense_data_desc *)
 			    ps, (struct scsi_sense_data_fixed *)sense_ptr);
 		else
 			memcpy(sense_ptr, ps, sizeof(*sense_ptr));
 
 		ps->error_code = 0;
 		have_error = 1;
 	} else {
 		ua_type = ctl_build_ua(lun, initidx, sense_ptr, &sense_len,
 		    sense_format);
 		if (ua_type != CTL_UA_NONE)
 			have_error = 1;
 	}
 	if (have_error == 0) {
 		/*
 		 * Report informational exception if have one and allowed.
 		 */
 		if (lun->MODE_IE.mrie != SIEP_MRIE_NO) {
 			asc = lun->ie_asc;
 			ascq = lun->ie_ascq;
 		}
 		ctl_set_sense_data(sense_ptr, &sense_len, lun, sense_format,
 		    /*current_error*/ 1,
 		    /*sense_key*/ SSD_KEY_NO_SENSE,
 		    /*asc*/ asc,
 		    /*ascq*/ ascq,
 		    SSD_ELEM_NONE);
 	}
 	mtx_unlock(&lun->lun_lock);
 
 send:
 	/*
 	 * We report the SCSI status as OK, since the status of the command
 	 * itself is OK.  We're reporting sense as parameter data.
 	 */
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Completes the command immediately with success status; no other work
  * is done.  Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_tur(struct ctl_scsiio *ctsio)
 {
 	union ctl_io *io = (union ctl_io *)ctsio;
 
 	CTL_DEBUG_PRINT(("ctl_tur\n"));
 
 	/* Mark the I/O successful and finish it. */
 	ctl_set_success(ctsio);
 	ctl_done(io);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * SCSI VPD page 0x00, the Supported VPD Pages page.
  *
  * Builds the list of VPD page codes this target answers and starts the
  * data move.  The three block-device pages are listed only when a LUN is
  * present and its type is T_DIRECT.  alloc_len caps the number of bytes
  * transferred.  Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_inquiry_evpd_supported(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_vpd_supported_pages *pages;
 	int sup_page_size;
 	int p;
 
 	/*
 	 * NOTE(review): the buffer is sized as a whole structure per
 	 * supported page, while the code below writes one page_list entry
 	 * per page; SCSI_EVPD_NUM_SUPPORTED_PAGES presumably has to be at
 	 * least the number of entries written below -- confirm when adding
 	 * pages.
 	 */
 	sup_page_size = sizeof(struct scsi_vpd_supported_pages) *
 	    SCSI_EVPD_NUM_SUPPORTED_PAGES;
 	ctsio->kern_data_ptr = malloc(sup_page_size, M_CTL, M_WAITOK | M_ZERO);
 	pages = (struct scsi_vpd_supported_pages *)ctsio->kern_data_ptr;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_data_len = min(sup_page_size, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		pages->device = (SID_QUAL_LU_CONNECTED << 5) |
 				lun->be_lun->lun_type;
 	else
 		pages->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 
 	/* p counts the entries written and becomes the page length. */
 	p = 0;
 	/* Supported VPD pages */
 	pages->page_list[p++] = SVPD_SUPPORTED_PAGES;
 	/* Serial Number */
 	pages->page_list[p++] = SVPD_UNIT_SERIAL_NUMBER;
 	/* Device Identification */
 	pages->page_list[p++] = SVPD_DEVICE_ID;
 	/* Extended INQUIRY Data */
 	pages->page_list[p++] = SVPD_EXTENDED_INQUIRY_DATA;
 	/* Mode Page Policy */
 	pages->page_list[p++] = SVPD_MODE_PAGE_POLICY;
 	/* SCSI Ports */
 	pages->page_list[p++] = SVPD_SCSI_PORTS;
 	/* Third-party Copy */
 	pages->page_list[p++] = SVPD_SCSI_TPC;
+	/* SCSI Feature Sets */
+	pages->page_list[p++] = SVPD_SCSI_SFS;
 	if (lun != NULL && lun->be_lun->lun_type == T_DIRECT) {
 		/* Block limits */
 		pages->page_list[p++] = SVPD_BLOCK_LIMITS;
 		/* Block Device Characteristics */
 		pages->page_list[p++] = SVPD_BDC;
 		/* Logical Block Provisioning */
 		pages->page_list[p++] = SVPD_LBP;
 	}
 	pages->length = p;
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * SCSI VPD page 0x80, the Unit Serial Number page.
  */
 static int
 ctl_inquiry_evpd_serial(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_vpd_unit_serial_number *sn_ptr;
 	int data_len;
 
 	data_len = 4 + CTL_SN_LEN;
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	sn_ptr = (struct scsi_vpd_unit_serial_number *)ctsio->kern_data_ptr;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		sn_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else
 		sn_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 
 	sn_ptr->page_code = SVPD_UNIT_SERIAL_NUMBER;
 	sn_ptr->length = CTL_SN_LEN;
 	/*
 	 * If we don't have a LUN, we just leave the serial number as
 	 * all spaces.
 	 */
 	if (lun != NULL) {
 		strncpy((char *)sn_ptr->serial_num,
 			(char *)lun->be_lun->serial_num, CTL_SN_LEN);
 	} else
 		memset(sn_ptr->serial_num, 0x20, CTL_SN_LEN);
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 
 /*
  * SCSI VPD page 0x86, the Extended INQUIRY Data page.
  */
 static int
 ctl_inquiry_evpd_eid(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_vpd_extended_inquiry_data *eid_ptr;
 	int data_len;
 
 	data_len = sizeof(struct scsi_vpd_extended_inquiry_data);
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	eid_ptr = (struct scsi_vpd_extended_inquiry_data *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.
 	 */
 	if (lun != NULL)
 		eid_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				     lun->be_lun->lun_type;
 	else
 		eid_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 	eid_ptr->page_code = SVPD_EXTENDED_INQUIRY_DATA;
 	scsi_ulto2b(data_len - 4, eid_ptr->page_length);
 	/*
 	 * We support head of queue, ordered and simple tags.
 	 */
 	eid_ptr->flags2 = SVPD_EID_HEADSUP | SVPD_EID_ORDSUP | SVPD_EID_SIMPSUP;
 	/*
 	 * Volatile cache supported.
 	 */
 	eid_ptr->flags3 = SVPD_EID_V_SUP;
 
 	/*
 	 * This means that we clear the REPORTED LUNS DATA HAS CHANGED unit
 	 * attention for a particular IT nexus on all LUNs once we report
 	 * it to that nexus once.  This bit is required as of SPC-4.
 	 */
 	eid_ptr->flags4 = SVPD_EID_LUICLR;
 
 	/*
 	 * We support revert to defaults (RTD) bit in MODE SELECT.
 	 */
 	eid_ptr->flags5 = SVPD_EID_RTD_SUP;
 
 	/*
 	 * XXX KDM in order to correctly answer this, we would need
 	 * information from the SIM to determine how much sense data it
 	 * can send.  So this would really be a path inquiry field, most
 	 * likely.  This can be set to a maximum of 252 according to SPC-4,
 	 * but the hardware may or may not be able to support that much.
 	 * 0 just means that the maximum sense data length is not reported.
 	 */
 	eid_ptr->max_sense_length = 0;
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 static int
 ctl_inquiry_evpd_mpp(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_vpd_mode_page_policy *mpp_ptr;
 	int data_len;
 
 	data_len = sizeof(struct scsi_vpd_mode_page_policy) +
 	    sizeof(struct scsi_vpd_mode_page_policy_descr);
 
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	mpp_ptr = (struct scsi_vpd_mode_page_policy *)ctsio->kern_data_ptr;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.
 	 */
 	if (lun != NULL)
 		mpp_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				     lun->be_lun->lun_type;
 	else
 		mpp_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 	mpp_ptr->page_code = SVPD_MODE_PAGE_POLICY;
 	scsi_ulto2b(data_len - 4, mpp_ptr->page_length);
 	mpp_ptr->descr[0].page_code = 0x3f;
 	mpp_ptr->descr[0].subpage_code = 0xff;
 	mpp_ptr->descr[0].policy = SVPD_MPP_SHARED;
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * SCSI VPD page 0x83, the Device Identification page.
  *
  * The page is assembled from up to five descriptors, written back to back
  * in this order: the LUN's device id (if any), the port's device id (if
  * any), a relative target port id, a target port group id, and the port's
  * target device id (if any).  alloc_len caps the number of bytes
  * transferred.  Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_inquiry_evpd_devid(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct ctl_softc *softc = CTL_SOFTC(ctsio);
 	struct ctl_port *port = CTL_PORT(ctsio);
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_vpd_device_id *devid_ptr;
 	struct scsi_vpd_id_descriptor *desc;
 	int data_len, g;
 	uint8_t proto;
 
 	/*
 	 * Page header plus the two descriptors that are always present;
 	 * the optional descriptors are added below.  The terms summed here
 	 * must match what is actually written further down.
 	 */
 	data_len = sizeof(struct scsi_vpd_device_id) +
 	    sizeof(struct scsi_vpd_id_descriptor) +
 		sizeof(struct scsi_vpd_id_rel_trgt_port_id) +
 	    sizeof(struct scsi_vpd_id_descriptor) +
 		sizeof(struct scsi_vpd_id_trgt_port_grp_id);
 	if (lun && lun->lun_devid)
 		data_len += lun->lun_devid->len;
 	if (port && port->port_devid)
 		data_len += port->port_devid->len;
 	if (port && port->target_devid)
 		data_len += port->target_devid->len;
 
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	devid_ptr = (struct scsi_vpd_device_id *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	/* Redundant: kern_sg_entries was already cleared two lines up. */
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.
 	 */
 	if (lun != NULL)
 		devid_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				     lun->be_lun->lun_type;
 	else
 		devid_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 	devid_ptr->page_code = SVPD_DEVICE_ID;
 	scsi_ulto2b(data_len - 4, devid_ptr->length);
 
 	/*
 	 * Protocol identifier for the two descriptors built here, already
 	 * shifted into the upper nibble.  SPI is used when there is no port
 	 * or the port type is not one of the three checked.
 	 */
 	if (port && port->port_type == CTL_PORT_FC)
 		proto = SCSI_PROTO_FC << 4;
 	else if (port && port->port_type == CTL_PORT_SAS)
 		proto = SCSI_PROTO_SAS << 4;
 	else if (port && port->port_type == CTL_PORT_ISCSI)
 		proto = SCSI_PROTO_ISCSI << 4;
 	else
 		proto = SCSI_PROTO_SPI << 4;
 	/* desc walks forward through the buffer as descriptors are added. */
 	desc = (struct scsi_vpd_id_descriptor *)devid_ptr->desc_list;
 
 	/*
 	 * We're using a LUN association here.  i.e., this device ID is a
 	 * per-LUN identifier.
 	 */
 	if (lun && lun->lun_devid) {
 		memcpy(desc, lun->lun_devid->data, lun->lun_devid->len);
 		desc = (struct scsi_vpd_id_descriptor *)((uint8_t *)desc +
 		    lun->lun_devid->len);
 	}
 
 	/*
 	 * This is for the WWPN which is a port association.
 	 */
 	if (port && port->port_devid) {
 		memcpy(desc, port->port_devid->data, port->port_devid->len);
 		desc = (struct scsi_vpd_id_descriptor *)((uint8_t *)desc +
 		    port->port_devid->len);
 	}
 
 	/*
 	 * This is for the Relative Target Port(type 4h) identifier
 	 */
 	desc->proto_codeset = proto | SVPD_ID_CODESET_BINARY;
 	desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_PORT |
 	    SVPD_ID_TYPE_RELTARG;
 	desc->length = 4;
 	/* The port number goes in the last two of the four identifier bytes. */
 	scsi_ulto2b(ctsio->io_hdr.nexus.targ_port, &desc->identifier[2]);
 	desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 	    sizeof(struct scsi_vpd_id_rel_trgt_port_id));
 
 	/*
 	 * This is for the Target Port Group(type 5h) identifier
 	 */
 	desc->proto_codeset = proto | SVPD_ID_CODESET_BINARY;
 	desc->id_type = SVPD_ID_PIV | SVPD_ID_ASSOC_PORT |
 	    SVPD_ID_TYPE_TPORTGRP;
 	desc->length = 4;
 	/*
 	 * Group 1 for a single-node setup or an HA-shared port; otherwise
 	 * the group is derived from the target port number divided by
 	 * softc->port_cnt, offset by 2.
 	 */
 	if (softc->is_single ||
 	    (port && port->status & CTL_PORT_STATUS_HA_SHARED))
 		g = 1;
 	else
 		g = 2 + ctsio->io_hdr.nexus.targ_port / softc->port_cnt;
 	scsi_ulto2b(g, &desc->identifier[2]);
 	desc = (struct scsi_vpd_id_descriptor *)(&desc->identifier[0] +
 	    sizeof(struct scsi_vpd_id_trgt_port_grp_id));
 
 	/*
 	 * This is for the Target identifier
 	 */
 	if (port && port->target_devid) {
 		memcpy(desc, port->target_devid->data, port->target_devid->len);
 	}
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Build the SCSI Ports VPD page (SVPD_SCSI_PORTS) for an EVPD INQUIRY.
  *
  * softc->port_list is walked twice, each time under ctl_lock: the first
  * pass sizes the response, the second fills it in.  A port is reported
  * only if it is online and, when the request has a LUN, the LUN is mapped
  * to that port.  The whole page is built, but no more than alloc_len bytes
  * are handed to ctl_datamove().
  *
  * NOTE(review): ctl_lock is dropped between the two passes, so the fill
  * pass relies on the set of reportable ports and their devid lengths being
  * the same as when the buffer was sized -- confirm.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_inquiry_evpd_scsi_ports(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct ctl_softc *softc = CTL_SOFTC(ctsio);
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_vpd_scsi_ports *sp;
 	struct scsi_vpd_port_designation *pd;
 	struct scsi_vpd_port_designation_cont *pdc;
 	struct ctl_port *port;
 	int data_len, num_target_ports, iid_len, id_len;
 
 	/* Pass 1: count reportable ports and total their identifier lengths. */
 	num_target_ports = 0;
 	iid_len = 0;
 	id_len = 0;
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		if ((port->status & CTL_PORT_STATUS_ONLINE) == 0)
 			continue;
 		if (lun != NULL &&
 		    ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX)
 			continue;
 		num_target_ports++;
 		if (port->init_devid)
 			iid_len += port->init_devid->len;
 		if (port->port_devid)
 			id_len += port->port_devid->len;
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	/*
 	 * Page header, plus a designation and its continuation per port,
 	 * plus all of the variable-length identifiers counted above.
 	 */
 	data_len = sizeof(struct scsi_vpd_scsi_ports) +
 	    num_target_ports * (sizeof(struct scsi_vpd_port_designation) +
 	     sizeof(struct scsi_vpd_port_designation_cont)) + iid_len + id_len;
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	sp = (struct scsi_vpd_scsi_ports *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;	/* repeats the store two lines up */
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		sp->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else
 		sp->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 
 	sp->page_code = SVPD_SCSI_PORTS;
 	scsi_ulto2b(data_len - sizeof(struct scsi_vpd_scsi_ports),
 	    sp->page_length);
 	pd = &sp->design[0];
 
 	/* Pass 2: emit one variable-length designation per reportable port. */
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_FOREACH(port, &softc->port_list, links) {
 		if ((port->status & CTL_PORT_STATUS_ONLINE) == 0)
 			continue;
 		if (lun != NULL &&
 		    ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX)
 			continue;
 		scsi_ulto2b(port->targ_port, pd->relative_port_id);
 		if (port->init_devid) {
 			iid_len = port->init_devid->len;
 			memcpy(pd->initiator_transportid,
 			    port->init_devid->data, port->init_devid->len);
 		} else
 			iid_len = 0;
 		scsi_ulto2b(iid_len, pd->initiator_transportid_length);
 		/* The continuation starts right after the initiator ID bytes. */
 		pdc = (struct scsi_vpd_port_designation_cont *)
 		    (&pd->initiator_transportid[iid_len]);
 		if (port->port_devid) {
 			id_len = port->port_devid->len;
 			memcpy(pdc->target_port_descriptors,
 			    port->port_devid->data, port->port_devid->len);
 		} else
 			id_len = 0;
 		scsi_ulto2b(id_len, pdc->target_port_descriptors_length);
 		/* The next designation follows this port's target descriptors. */
 		pd = (struct scsi_vpd_port_designation *)
 		    ((uint8_t *)pdc->target_port_descriptors + id_len);
 	}
 	mtx_unlock(&softc->ctl_lock);
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Build the VPD page identified by SVPD_SCSI_SFS for an EVPD INQUIRY.
  *
  * The page carries a list of 2-byte codes.  Code 0x0001 is always
  * reported; a T_DIRECT LUN additionally reports 0x0101 and 0x0102, plus
  * 0x0103 when the backend sets CTL_LUN_FLAG_UNMAP.  The buffer is sized
  * for five codes, and kern_data_len is derived from that full allocation
  * rather than from the number of codes actually written; the unused tail
  * stays zeroed (M_ZERO).
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
+ctl_inquiry_evpd_sfs(struct ctl_scsiio *ctsio, int alloc_len)
+{
+	struct ctl_lun *lun = CTL_LUN(ctsio);
+	struct scsi_vpd_sfs *sfs_ptr;
+	int sfs_page_size, n;
+
+	sfs_page_size = sizeof(*sfs_ptr) + 5 * 2;	/* room for 5 codes */
+	ctsio->kern_data_ptr = malloc(sfs_page_size, M_CTL, M_WAITOK | M_ZERO);
+	sfs_ptr = (struct scsi_vpd_sfs *)ctsio->kern_data_ptr;
+	ctsio->kern_sg_entries = 0;
+	ctsio->kern_rel_offset = 0;
+	ctsio->kern_sg_entries = 0;	/* repeats the store two lines up */
+	ctsio->kern_data_len = min(sfs_page_size, alloc_len);
+	ctsio->kern_total_len = ctsio->kern_data_len;
+
+	/*
+	 * The control device is always connected.  The disk device, on the
+	 * other hand, may not be online all the time.  Need to change this
+	 * to figure out whether the disk device is actually online or not.
+	 */
+	if (lun != NULL)
+		sfs_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
+				  lun->be_lun->lun_type;
+	else
+		sfs_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
+
+	sfs_ptr->page_code = SVPD_SCSI_SFS;
+	n = 0;		/* number of 2-byte codes written so far */
+	/* Discovery 2016 */
+	scsi_ulto2b(0x0001, &sfs_ptr->codes[2 * n++]);
+	if (lun != NULL && lun->be_lun->lun_type == T_DIRECT) {
+		 /* SBC Base 2016 */
+		scsi_ulto2b(0x0101, &sfs_ptr->codes[2 * n++]);
+		 /* SBC Base 2010 */
+		scsi_ulto2b(0x0102, &sfs_ptr->codes[2 * n++]);
+		if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) {
+			/* Basic Provisioning 2016 */
+			scsi_ulto2b(0x0103, &sfs_ptr->codes[2 * n++]);
+		}
+		/* Drive Maintenance 2016 */
+		//scsi_ulto2b(0x0104, &sfs_ptr->codes[2 * n++]);
+	}
+	/*
+	 * NOTE(review): the constant 4 presumably covers the bytes that sit
+	 * between page_length and codes[] in struct scsi_vpd_sfs -- confirm
+	 * against the structure definition.
+	 */
+	scsi_ulto2b(4 + 2 * n, sfs_ptr->page_length);
+
+	ctl_set_success(ctsio);
+	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
+	ctsio->be_move_done = ctl_config_move_done;
+	ctl_datamove((union ctl_io *)ctsio);
+	return (CTL_RETVAL_COMPLETE);
+}
+
 /*
  * Build the Block Limits VPD page (SVPD_BLOCK_LIMITS) for an EVPD INQUIRY.
  *
  * Fixed values are reported for the maximum compare-and-write length
  * (0xff) and maximum transfer length (0xffffffff).  When a LUN is present
  * the optimal transfer length, atomic write limit and write-same limit
  * come from the backend LUN; UNMAP limits are filled in only if the
  * backend sets CTL_LUN_FLAG_UNMAP.  The "unmap_max_lba", "unmap_max_descr"
  * and "write_same_max_lba" LUN options override the all-ones defaults.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
+static int
 ctl_inquiry_evpd_block_limits(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_vpd_block_limits *bl_ptr;
 	const char *val;
 	uint64_t ival;
 
 	ctsio->kern_data_ptr = malloc(sizeof(*bl_ptr), M_CTL, M_WAITOK | M_ZERO);
 	bl_ptr = (struct scsi_vpd_block_limits *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_sg_entries = 0;	/* repeats the store two lines up */
 	ctsio->kern_data_len = min(sizeof(*bl_ptr), alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		bl_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else
 		bl_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 
 	bl_ptr->page_code = SVPD_BLOCK_LIMITS;
 	scsi_ulto2b(sizeof(*bl_ptr) - 4, bl_ptr->page_length);
 	bl_ptr->max_cmp_write_len = 0xff;
 	scsi_ulto4b(0xffffffff, bl_ptr->max_txfer_len);
 	if (lun != NULL) {
 		scsi_ulto4b(lun->be_lun->opttxferlen, bl_ptr->opt_txfer_len);
 		if (lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) {
 			/*
 			 * ival keeps its default when the option is absent.
 			 * NOTE(review): ival is 64-bit but is stored in a
 			 * 4-byte field; larger option values are presumably
 			 * truncated by scsi_ulto4b() -- confirm.
 			 */
 			ival = 0xffffffff;
 			val = dnvlist_get_string(lun->be_lun->options,
 			    "unmap_max_lba", NULL);
 			if (val != NULL)
 				ctl_expand_number(val, &ival);
 			scsi_ulto4b(ival, bl_ptr->max_unmap_lba_cnt);
 			ival = 0xffffffff;
 			val = dnvlist_get_string(lun->be_lun->options,
 			    "unmap_max_descr", NULL);
 			if (val != NULL)
 				ctl_expand_number(val, &ival);
 			scsi_ulto4b(ival, bl_ptr->max_unmap_blk_cnt);
 			/*
 			 * Granularity is 2^ublockexp.  NOTE(review): the
 			 * 0x80000000 bit presumably marks the alignment
 			 * field as valid -- confirm against SBC.
 			 */
 			if (lun->be_lun->ublockexp != 0) {
 				scsi_ulto4b((1 << lun->be_lun->ublockexp),
 				    bl_ptr->opt_unmap_grain);
 				scsi_ulto4b(0x80000000 | lun->be_lun->ublockoff,
 				    bl_ptr->unmap_grain_align);
 			}
 		}
 		scsi_ulto4b(lun->be_lun->atomicblock,
 		    bl_ptr->max_atomic_transfer_length);
 		scsi_ulto4b(0, bl_ptr->atomic_alignment);
 		scsi_ulto4b(0, bl_ptr->atomic_transfer_length_granularity);
 		scsi_ulto4b(0, bl_ptr->max_atomic_transfer_length_with_atomic_boundary);
 		scsi_ulto4b(0, bl_ptr->max_atomic_boundary_size);
 		ival = UINT64_MAX;
 		val = dnvlist_get_string(lun->be_lun->options,
 		    "write_same_max_lba", NULL);
 		if (val != NULL)
 			ctl_expand_number(val, &ival);
 		scsi_u64to8b(ival, bl_ptr->max_write_same_length);
 	}
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Build the Block Device Characteristics VPD page (SVPD_BDC) for an EVPD
  * INQUIRY.
  *
  * The medium rotation rate comes from the LUN's "rpm" option, falling back
  * to CTL_DEFAULT_ROTATION_RATE; the form factor comes from the
  * "formfactor" option, falling back to 0.  Both options are parsed with
  * strtol() using base 0, with no check for trailing garbage.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_inquiry_evpd_bdc(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_vpd_block_device_characteristics *bdc_ptr;
 	const char *value;
 	u_int i;
 
 	ctsio->kern_data_ptr = malloc(sizeof(*bdc_ptr), M_CTL, M_WAITOK | M_ZERO);
 	bdc_ptr = (struct scsi_vpd_block_device_characteristics *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(sizeof(*bdc_ptr), alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		bdc_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else
 		bdc_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 	bdc_ptr->page_code = SVPD_BDC;
 	scsi_ulto2b(sizeof(*bdc_ptr) - 4, bdc_ptr->page_length);
 	if (lun != NULL &&
 	    (value = dnvlist_get_string(lun->be_lun->options, "rpm", NULL)) != NULL)
 		i = strtol(value, NULL, 0);
 	else
 		i = CTL_DEFAULT_ROTATION_RATE;
 	scsi_ulto2b(i, bdc_ptr->medium_rotation_rate);
 	if (lun != NULL &&
 	    (value = dnvlist_get_string(lun->be_lun->options, "formfactor", NULL)) != NULL)
 		i = strtol(value, NULL, 0);
 	else
 		i = 0;
 	/* Only the low four bits of the parsed form factor are reported. */
 	bdc_ptr->wab_wac_ff = (i & 0x0f);
-	bdc_ptr->flags = SVPD_FUAB | SVPD_VBULS;
+	bdc_ptr->flags = SVPD_RBWZ | SVPD_FUAB | SVPD_VBULS;
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Build the Logical Block Provisioning VPD page (SVPD_LBP) for an EVPD
  * INQUIRY.
  *
  * The threshold exponent is always CTL_LBP_EXPONENT.  The provisioning
  * flags and type are filled in only when a LUN is present and its backend
  * sets CTL_LUN_FLAG_UNMAP.  The "provisioning_type" LUN option selects
  * "resource" or "thin"; an absent option means thin, and any other string
  * leaves prov_type at the zero it got from M_ZERO.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_inquiry_evpd_lbp(struct ctl_scsiio *ctsio, int alloc_len)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_vpd_logical_block_prov *lbp_ptr;
 	const char *value;
 
 	ctsio->kern_data_ptr = malloc(sizeof(*lbp_ptr), M_CTL, M_WAITOK | M_ZERO);
 	lbp_ptr = (struct scsi_vpd_logical_block_prov *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(sizeof(*lbp_ptr), alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * The control device is always connected.  The disk device, on the
 	 * other hand, may not be online all the time.  Need to change this
 	 * to figure out whether the disk device is actually online or not.
 	 */
 	if (lun != NULL)
 		lbp_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 				  lun->be_lun->lun_type;
 	else
 		lbp_ptr->device = (SID_QUAL_LU_OFFLINE << 5) | T_DIRECT;
 
 	lbp_ptr->page_code = SVPD_LBP;
 	scsi_ulto2b(sizeof(*lbp_ptr) - 4, lbp_ptr->page_length);
 	lbp_ptr->threshold_exponent = CTL_LBP_EXPONENT;
 	if (lun != NULL && lun->be_lun->flags & CTL_LUN_FLAG_UNMAP) {
 		lbp_ptr->flags = SVPD_LBP_UNMAP | SVPD_LBP_WS16 |
 		    SVPD_LBP_WS10 | SVPD_LBP_RZ | SVPD_LBP_ANC_SUP;
 		value = dnvlist_get_string(lun->be_lun->options,
 		    "provisioning_type", NULL);
 		if (value != NULL) {
 			/* Unrecognized strings fall through with no type set. */
 			if (strcmp(value, "resource") == 0)
 				lbp_ptr->prov_type = SVPD_LBP_RESOURCE;
 			else if (strcmp(value, "thin") == 0)
 				lbp_ptr->prov_type = SVPD_LBP_THIN;
 		} else
 			lbp_ptr->prov_type = SVPD_LBP_THIN;
 	}
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * INQUIRY with the EVPD bit set.
  *
  * Reads the allocation length from the CDB and dispatches on the requested
  * page code to the matching page builder.  The Block Limits, Block Device
  * Characteristics and Logical Block Provisioning pages are served only for
  * a T_DIRECT LUN.  An unknown page code, or one of those three pages
  * requested without such a LUN, completes the I/O with an invalid-field
  * error against CDB field 2.
  *
  * Returns the page builder's result, or CTL_RETVAL_COMPLETE on the error
  * path.
  */
 static int
 ctl_inquiry_evpd(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_inquiry *cdb;
 	int alloc_len, retval;
 
 	cdb = (struct scsi_inquiry *)ctsio->cdb;
 	alloc_len = scsi_2btoul(cdb->length);
 
 	switch (cdb->page_code) {
 	case SVPD_SUPPORTED_PAGES:
 		retval = ctl_inquiry_evpd_supported(ctsio, alloc_len);
 		break;
 	case SVPD_UNIT_SERIAL_NUMBER:
 		retval = ctl_inquiry_evpd_serial(ctsio, alloc_len);
 		break;
 	case SVPD_DEVICE_ID:
 		retval = ctl_inquiry_evpd_devid(ctsio, alloc_len);
 		break;
 	case SVPD_EXTENDED_INQUIRY_DATA:
 		retval = ctl_inquiry_evpd_eid(ctsio, alloc_len);
 		break;
 	case SVPD_MODE_PAGE_POLICY:
 		retval = ctl_inquiry_evpd_mpp(ctsio, alloc_len);
 		break;
 	case SVPD_SCSI_PORTS:
 		retval = ctl_inquiry_evpd_scsi_ports(ctsio, alloc_len);
 		break;
 	case SVPD_SCSI_TPC:
 		retval = ctl_inquiry_evpd_tpc(ctsio, alloc_len);
+		break;
+	case SVPD_SCSI_SFS:
+		retval = ctl_inquiry_evpd_sfs(ctsio, alloc_len);
 		break;
 	/* The remaining pages are block-device specific. */
 	case SVPD_BLOCK_LIMITS:
 		if (lun == NULL || lun->be_lun->lun_type != T_DIRECT)
 			goto err;
 		retval = ctl_inquiry_evpd_block_limits(ctsio, alloc_len);
 		break;
 	case SVPD_BDC:
 		if (lun == NULL || lun->be_lun->lun_type != T_DIRECT)
 			goto err;
 		retval = ctl_inquiry_evpd_bdc(ctsio, alloc_len);
 		break;
 	case SVPD_LBP:
 		if (lun == NULL || lun->be_lun->lun_type != T_DIRECT)
 			goto err;
 		retval = ctl_inquiry_evpd_lbp(ctsio, alloc_len);
 		break;
 	default:
 	/* Shared by the default case and the goto err guards above. */
 err:
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		retval = CTL_RETVAL_COMPLETE;
 		break;
 	}
 
 	return (retval);
 }
 
 /*
  * Standard INQUIRY data.
  *
  * Allocates a zeroed struct scsi_inquiry_data up to (but not including)
  * vendor_specific1, fills it in from the port type and the LUN (if any),
  * and hands at most the CDB's allocation length to ctl_datamove().
  * The vendor, product and revision strings can be overridden per LUN with
  * the "vendor", "product" and "revision" options.
  *
  * NOTE(review): port is dereferenced without a NULL check here -- confirm
  * that CTL_PORT() cannot return NULL on this path.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 static int
 ctl_inquiry_std(struct ctl_scsiio *ctsio)
 {
 	struct ctl_softc *softc = CTL_SOFTC(ctsio);
 	struct ctl_port *port = CTL_PORT(ctsio);
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_inquiry_data *inq_ptr;
 	struct scsi_inquiry *cdb;
 	const char *val;
 	uint32_t alloc_len, data_len;
 	ctl_port_type port_type;
 
 	/* ioctl and internal ports are treated as CTL_PORT_SCSI below. */
 	port_type = port->port_type;
 	if (port_type == CTL_PORT_IOCTL || port_type == CTL_PORT_INTERNAL)
 		port_type = CTL_PORT_SCSI;
 
 	cdb = (struct scsi_inquiry *)ctsio->cdb;
 	alloc_len = scsi_2btoul(cdb->length);
 
 	/*
 	 * We malloc the full inquiry data size here and fill it
 	 * in.  If the user only asks for less, we'll give him
 	 * that much.
 	 */
 	data_len = offsetof(struct scsi_inquiry_data, vendor_specific1);
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	inq_ptr = (struct scsi_inquiry_data *)ctsio->kern_data_ptr;
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	/*
 	 * A LUN is reported as connected when it is the primary SC or the
 	 * HA link state is at least CTL_HA_LINK_UNKNOWN, and as offline
 	 * otherwise.  Without a LUN the qualifier is SID_QUAL_BAD_LU.
 	 */
 	if (lun != NULL) {
 		if ((lun->flags & CTL_LUN_PRIMARY_SC) ||
 		    softc->ha_link >= CTL_HA_LINK_UNKNOWN) {
 			inq_ptr->device = (SID_QUAL_LU_CONNECTED << 5) |
 			    lun->be_lun->lun_type;
 		} else {
 			inq_ptr->device = (SID_QUAL_LU_OFFLINE << 5) |
 			    lun->be_lun->lun_type;
 		}
 		if (lun->flags & CTL_LUN_REMOVABLE)
 			inq_ptr->dev_qual2 |= SID_RMB;
 	} else
 		inq_ptr->device = (SID_QUAL_BAD_LU << 5) | T_NODEVICE;
 
 	/* RMB in byte 2 is 0 */
 	inq_ptr->version = SCSI_REV_SPC5;
 
 	/*
 	 * According to SAM-3, even if a device only supports a single
 	 * level of LUN addressing, it should still set the HISUP bit:
 	 *
 	 * 4.9.1 Logical unit numbers overview
 	 *
 	 * All logical unit number formats described in this standard are
 	 * hierarchical in structure even when only a single level in that
 	 * hierarchy is used. The HISUP bit shall be set to one in the
 	 * standard INQUIRY data (see SPC-2) when any logical unit number
 	 * format described in this standard is used.  Non-hierarchical
 	 * formats are outside the scope of this standard.
 	 *
 	 * Therefore we set the HiSup bit here.
 	 *
 	 * The response format is 2, per SPC-3.
 	 */
 	inq_ptr->response_format = SID_HiSup | 2;
 
 	/* Counts the bytes that follow the additional_length field itself. */
 	inq_ptr->additional_length = data_len -
 	    (offsetof(struct scsi_inquiry_data, additional_length) + 1);
 	CTL_DEBUG_PRINT(("additional_length = %d\n",
 			 inq_ptr->additional_length));
 
 	inq_ptr->spc3_flags = SPC3_SID_3PC | SPC3_SID_TPGS_IMPLICIT;
 	if (port_type == CTL_PORT_SCSI)
 		inq_ptr->spc2_flags = SPC2_SID_ADDR16;
 	inq_ptr->spc2_flags |= SPC2_SID_MultiP;
 	inq_ptr->flags = SID_CmdQue;
 	if (port_type == CTL_PORT_SCSI)
 		inq_ptr->flags |= SID_WBus16 | SID_Sync;
 
 	/*
 	 * Per SPC-3, unused bytes in ASCII strings are filled with spaces.
 	 * We have 8 bytes for the vendor name, and 16 bytes for the device
 	 * name and 4 bytes for the revision.
 	 *
 	 * Option-supplied strings are copied over a space-filled field with
 	 * a length capped at strlen(), so no NUL terminator is stored.  The
 	 * built-in defaults are copied with a plain strncpy() instead.
 	 */
 	if (lun == NULL || (val = dnvlist_get_string(lun->be_lun->options,
 	    "vendor", NULL)) == NULL) {
 		strncpy(inq_ptr->vendor, CTL_VENDOR, sizeof(inq_ptr->vendor));
 	} else {
 		memset(inq_ptr->vendor, ' ', sizeof(inq_ptr->vendor));
 		strncpy(inq_ptr->vendor, val,
 		    min(sizeof(inq_ptr->vendor), strlen(val)));
 	}
 	if (lun == NULL) {
 		strncpy(inq_ptr->product, CTL_DIRECT_PRODUCT,
 		    sizeof(inq_ptr->product));
 	} else if ((val = dnvlist_get_string(lun->be_lun->options, "product",
 	    NULL)) == NULL) {
 		/* No override: pick the default product name by LUN type. */
 		switch (lun->be_lun->lun_type) {
 		case T_DIRECT:
 			strncpy(inq_ptr->product, CTL_DIRECT_PRODUCT,
 			    sizeof(inq_ptr->product));
 			break;
 		case T_PROCESSOR:
 			strncpy(inq_ptr->product, CTL_PROCESSOR_PRODUCT,
 			    sizeof(inq_ptr->product));
 			break;
 		case T_CDROM:
 			strncpy(inq_ptr->product, CTL_CDROM_PRODUCT,
 			    sizeof(inq_ptr->product));
 			break;
 		default:
 			strncpy(inq_ptr->product, CTL_UNKNOWN_PRODUCT,
 			    sizeof(inq_ptr->product));
 			break;
 		}
 	} else {
 		memset(inq_ptr->product, ' ', sizeof(inq_ptr->product));
 		strncpy(inq_ptr->product, val,
 		    min(sizeof(inq_ptr->product), strlen(val)));
 	}
 
 	/*
 	 * XXX make this a macro somewhere so it automatically gets
 	 * incremented when we make changes.
 	 */
 	if (lun == NULL || (val = dnvlist_get_string(lun->be_lun->options,
 	    "revision", NULL)) == NULL) {
 		strncpy(inq_ptr->revision, "0001", sizeof(inq_ptr->revision));
 	} else {
 		memset(inq_ptr->revision, ' ', sizeof(inq_ptr->revision));
 		strncpy(inq_ptr->revision, val,
 		    min(sizeof(inq_ptr->revision), strlen(val)));
 	}
 
 	/*
 	 * For parallel SCSI, we support double transition and single
 	 * transition clocking.  We also support QAS (Quick Arbitration
 	 * and Selection) and Information Unit transfers on both the
 	 * control and array devices.
 	 */
 	if (port_type == CTL_PORT_SCSI)
 		inq_ptr->spi3data = SID_SPI_CLOCK_DT_ST | SID_SPI_QAS |
 				    SID_SPI_IUS;
 
 	/*
 	 * Version descriptors: version1 and version2 are fixed, version3
 	 * depends on the port type and version4 on the LUN type.
 	 */
 	/* SAM-6 (no version claimed) */
 	scsi_ulto2b(0x00C0, inq_ptr->version1);
 	/* SPC-5 (no version claimed) */
 	scsi_ulto2b(0x05C0, inq_ptr->version2);
 	if (port_type == CTL_PORT_FC) {
 		/* FCP-2 ANSI INCITS.350:2003 */
 		scsi_ulto2b(0x0917, inq_ptr->version3);
 	} else if (port_type == CTL_PORT_SCSI) {
 		/* SPI-4 ANSI INCITS.362:200x */
 		scsi_ulto2b(0x0B56, inq_ptr->version3);
 	} else if (port_type == CTL_PORT_ISCSI) {
 		/* iSCSI (no version claimed) */
 		scsi_ulto2b(0x0960, inq_ptr->version3);
 	} else if (port_type == CTL_PORT_SAS) {
 		/* SAS (no version claimed) */
 		scsi_ulto2b(0x0BE0, inq_ptr->version3);
 	} else if (port_type == CTL_PORT_UMASS) {
 		/* USB Mass Storage Class Bulk-Only Transport, Revision 1.0 */
 		scsi_ulto2b(0x1730, inq_ptr->version3);
 	}
 
 	if (lun == NULL) {
 		/* SBC-4 (no version claimed) */
 		scsi_ulto2b(0x0600, inq_ptr->version4);
 	} else {
 		switch (lun->be_lun->lun_type) {
 		case T_DIRECT:
 			/* SBC-4 (no version claimed) */
 			scsi_ulto2b(0x0600, inq_ptr->version4);
 			break;
 		case T_PROCESSOR:
 			/* version4 is left zeroed for processor LUNs. */
 			break;
 		case T_CDROM:
 			/* MMC-6 (no version claimed) */
 			scsi_ulto2b(0x04E0, inq_ptr->version4);
 			break;
 		default:
 			break;
 		}
 	}
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * INQUIRY entry point.
  *
  * With SI_EVPD set in byte2 the request goes to ctl_inquiry_evpd().
  * Otherwise a page code of zero selects the standard INQUIRY data, and a
  * non-zero page code completes the I/O with an invalid-field error against
  * CDB field 2.
  *
  * Returns the handler's result, or CTL_RETVAL_COMPLETE on the error path.
  */
 int
 ctl_inquiry(struct ctl_scsiio *ctsio)
 {
 	struct scsi_inquiry *cdb;
 	int retval;
 
 	CTL_DEBUG_PRINT(("ctl_inquiry\n"));
 
 	cdb = (struct scsi_inquiry *)ctsio->cdb;
 	if (cdb->byte2 & SI_EVPD)
 		retval = ctl_inquiry_evpd(ctsio);
 	else if (cdb->page_code == 0)
 		retval = ctl_inquiry_std(ctsio);
 	else {
 		/* A page code without EVPD is not a valid combination here. */
 		ctl_set_invalid_field(ctsio,
 				      /*sks_valid*/ 1,
 				      /*command*/ 1,
 				      /*field*/ 2,
 				      /*bit_valid*/ 0,
 				      /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	return (retval);
 }
 
 /*
  * GET CONFIGURATION: report a fixed list of feature descriptors.
  *
  * The buffer is sized for the header plus all twelve features.  Features
  * are written in ascending feature-code order by a chain of labelled
  * sections that fall through into each other; the CDB's starting feature
  * number picks the first section to run, so every feature with a lower
  * code is skipped.  The reported length is then derived from how far the
  * write cursor advanced.
  *
  * Request type handling:
  *  - SGC_RT_CURRENT with CTL_LUN_NO_MEDIA set stops after the Removable
  *    Medium feature.
  *  - SGC_RT_SPECIFIC trims the response to the header plus the first
  *    emitted feature when its code equals the starting feature, and to
  *    the header alone otherwise.
  *
  * NOTE(review): lun is dereferenced unconditionally -- confirm that this
  * command is only dispatched with a valid LUN.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_get_config(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_get_config_header *hdr;
 	struct scsi_get_config_feature *feature;
 	struct scsi_get_config *cdb;
 	uint32_t alloc_len, data_len;
 	int rt, starting;
 
 	cdb = (struct scsi_get_config *)ctsio->cdb;
 	rt = (cdb->rt & SGC_RT_MASK);
 	starting = scsi_2btoul(cdb->starting_feature);
 	alloc_len = scsi_2btoul(cdb->length);
 
 	/*
 	 * One term per feature below, in the same order; the constant is
 	 * that feature's add_length.
 	 */
 	data_len = sizeof(struct scsi_get_config_header) +
 	    sizeof(struct scsi_get_config_feature) + 8 +
 	    sizeof(struct scsi_get_config_feature) + 8 +
 	    sizeof(struct scsi_get_config_feature) + 4 +
 	    sizeof(struct scsi_get_config_feature) + 4 +
 	    sizeof(struct scsi_get_config_feature) + 8 +
 	    sizeof(struct scsi_get_config_feature) +
 	    sizeof(struct scsi_get_config_feature) + 4 +
 	    sizeof(struct scsi_get_config_feature) + 4 +
 	    sizeof(struct scsi_get_config_feature) + 4 +
 	    sizeof(struct scsi_get_config_feature) + 4 +
 	    sizeof(struct scsi_get_config_feature) + 4 +
 	    sizeof(struct scsi_get_config_feature) + 4;
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 
 	hdr = (struct scsi_get_config_header *)ctsio->kern_data_ptr;
 	if (lun->flags & CTL_LUN_NO_MEDIA)
 		scsi_ulto2b(0x0000, hdr->current_profile);
 	else
 		scsi_ulto2b(0x0010, hdr->current_profile);
 	/* Write cursor: the first feature goes right after the header. */
 	feature = (struct scsi_get_config_feature *)(hdr + 1);
 
 	/*
 	 * Jump to the first feature whose code is >= starting.  The tests
 	 * run from the highest code down, so the first match wins.
 	 */
 	if (starting > 0x003b)
 		goto done;
 	if (starting > 0x003a)
 		goto f3b;
 	if (starting > 0x002b)
 		goto f3a;
 	if (starting > 0x002a)
 		goto f2b;
 	if (starting > 0x001f)
 		goto f2a;
 	if (starting > 0x001e)
 		goto f1f;
 	if (starting > 0x001d)
 		goto f1e;
 	if (starting > 0x0010)
 		goto f1d;
 	if (starting > 0x0003)
 		goto f10;
 	if (starting > 0x0002)
 		goto f3;
 	if (starting > 0x0001)
 		goto f2;
 	if (starting > 0x0000)
 		goto f1;
 
 	/* Profile List */
 	scsi_ulto2b(0x0000, feature->feature_code);
 	feature->flags = SGC_F_PERSISTENT | SGC_F_CURRENT;
 	feature->add_length = 8;
 	scsi_ulto2b(0x0008, &feature->feature_data[0]);	/* CD-ROM */
 	feature->feature_data[2] = 0x00;
 	scsi_ulto2b(0x0010, &feature->feature_data[4]);	/* DVD-ROM */
 	feature->feature_data[6] = 0x01;
 	/* Advance the cursor past this feature's data; repeated below. */
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 f1:	/* Core */
 	scsi_ulto2b(0x0001, feature->feature_code);
 	feature->flags = 0x08 | SGC_F_PERSISTENT | SGC_F_CURRENT;
 	feature->add_length = 8;
 	scsi_ulto4b(0x00000000, &feature->feature_data[0]);
 	feature->feature_data[4] = 0x03;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 f2:	/* Morphing */
 	scsi_ulto2b(0x0002, feature->feature_code);
 	feature->flags = 0x04 | SGC_F_PERSISTENT | SGC_F_CURRENT;
 	feature->add_length = 4;
 	feature->feature_data[0] = 0x02;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 f3:	/* Removable Medium */
 	scsi_ulto2b(0x0003, feature->feature_code);
 	feature->flags = 0x04 | SGC_F_PERSISTENT | SGC_F_CURRENT;
 	feature->add_length = 4;
 	feature->feature_data[0] = 0x39;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 	/*
 	 * The features below are marked current only when media is
 	 * present, so a "current only" request ends here without media.
 	 * This check is bypassed when starting jumps directly past it.
 	 */
 	if (rt == SGC_RT_CURRENT && (lun->flags & CTL_LUN_NO_MEDIA))
 		goto done;
 
 f10:	/* Random Read */
 	scsi_ulto2b(0x0010, feature->feature_code);
 	feature->flags = 0x00;
 	if ((lun->flags & CTL_LUN_NO_MEDIA) == 0)
 		feature->flags |= SGC_F_CURRENT;
 	feature->add_length = 8;
 	scsi_ulto4b(lun->be_lun->blocksize, &feature->feature_data[0]);
 	scsi_ulto2b(1, &feature->feature_data[4]);
 	feature->feature_data[6] = 0x00;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 f1d:	/* Multi-Read */
 	scsi_ulto2b(0x001D, feature->feature_code);
 	feature->flags = 0x00;
 	if ((lun->flags & CTL_LUN_NO_MEDIA) == 0)
 		feature->flags |= SGC_F_CURRENT;
 	feature->add_length = 0;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 f1e:	/* CD Read */
 	scsi_ulto2b(0x001E, feature->feature_code);
 	feature->flags = 0x00;
 	if ((lun->flags & CTL_LUN_NO_MEDIA) == 0)
 		feature->flags |= SGC_F_CURRENT;
 	feature->add_length = 4;
 	feature->feature_data[0] = 0x00;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 f1f:	/* DVD Read */
 	scsi_ulto2b(0x001F, feature->feature_code);
 	feature->flags = 0x08;
 	if ((lun->flags & CTL_LUN_NO_MEDIA) == 0)
 		feature->flags |= SGC_F_CURRENT;
 	feature->add_length = 4;
 	feature->feature_data[0] = 0x01;
 	feature->feature_data[2] = 0x03;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 f2a:	/* DVD+RW */
 	scsi_ulto2b(0x002A, feature->feature_code);
 	feature->flags = 0x04;
 	if ((lun->flags & CTL_LUN_NO_MEDIA) == 0)
 		feature->flags |= SGC_F_CURRENT;
 	feature->add_length = 4;
 	feature->feature_data[0] = 0x00;
 	feature->feature_data[1] = 0x00;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 f2b:	/* DVD+R */
 	scsi_ulto2b(0x002B, feature->feature_code);
 	feature->flags = 0x00;
 	if ((lun->flags & CTL_LUN_NO_MEDIA) == 0)
 		feature->flags |= SGC_F_CURRENT;
 	feature->add_length = 4;
 	feature->feature_data[0] = 0x00;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 f3a:	/* DVD+RW Dual Layer */
 	scsi_ulto2b(0x003A, feature->feature_code);
 	feature->flags = 0x00;
 	if ((lun->flags & CTL_LUN_NO_MEDIA) == 0)
 		feature->flags |= SGC_F_CURRENT;
 	feature->add_length = 4;
 	feature->feature_data[0] = 0x00;
 	feature->feature_data[1] = 0x00;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 f3b:	/* DVD+R Dual Layer */
 	scsi_ulto2b(0x003B, feature->feature_code);
 	feature->flags = 0x00;
 	if ((lun->flags & CTL_LUN_NO_MEDIA) == 0)
 		feature->flags |= SGC_F_CURRENT;
 	feature->add_length = 4;
 	feature->feature_data[0] = 0x00;
 	feature = (struct scsi_get_config_feature *)
 	    &feature->feature_data[feature->add_length];
 
 done:
 	/* Bytes actually produced: the distance the cursor has travelled. */
 	data_len = (uint8_t *)feature - (uint8_t *)hdr;
 	if (rt == SGC_RT_SPECIFIC && data_len > 4) {
 		/*
 		 * Keep only the first emitted feature, and only if it is
 		 * exactly the one asked for; otherwise keep just the header.
 		 */
 		feature = (struct scsi_get_config_feature *)(hdr + 1);
 		if (scsi_2btoul(feature->feature_code) == starting)
 			feature = (struct scsi_get_config_feature *)
 			    &feature->feature_data[feature->add_length];
 		data_len = (uint8_t *)feature - (uint8_t *)hdr;
 	}
 	/* The length field does not count its own four bytes. */
 	scsi_ulto4b(data_len - 4, hdr->data_length);
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * GET EVENT STATUS NOTIFICATION.
  *
  * A CDB without SGESN_POLLED set in byte2 is rejected with an
  * invalid-field error (field 1, bit 0).  Otherwise the response is a bare
  * header: zero descriptor length, SGESN_NEA in nea_class and no supported
  * event classes.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_get_event_status(struct ctl_scsiio *ctsio)
 {
 	struct scsi_get_event_status_header *hdr;
 	struct scsi_get_event_status *cdb;
 	uint32_t alloc_len, data_len;
 
 	cdb = (struct scsi_get_event_status *)ctsio->cdb;
 	if ((cdb->byte2 & SGESN_POLLED) == 0) {
 		ctl_set_invalid_field(ctsio, /*sks_valid*/ 1, /*command*/ 1,
 		    /*field*/ 1, /*bit_valid*/ 1, /*bit*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 	alloc_len = scsi_2btoul(cdb->length);
 
 	data_len = sizeof(struct scsi_get_event_status_header);
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	hdr = (struct scsi_get_event_status_header *)ctsio->kern_data_ptr;
 	scsi_ulto2b(0, hdr->descr_length);
 	hdr->nea_class = SGESN_NEA;
 	hdr->supported_class = 0;
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * MECHANISM STATUS: return a fixed header with no slot table.
  *
  * state1 is 0x00, state2 is 0xe0, the LBA is zero, and both the slot count
  * and the slot table length are zero.  At most the CDB's allocation length
  * is transferred.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_mechanism_status(struct ctl_scsiio *ctsio)
 {
 	struct scsi_mechanism_status_header *hdr;
 	struct scsi_mechanism_status *cdb;
 	uint32_t alloc_len, data_len;
 
 	cdb = (struct scsi_mechanism_status *)ctsio->cdb;
 	alloc_len = scsi_2btoul(cdb->length);
 
 	data_len = sizeof(struct scsi_mechanism_status_header);
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	hdr = (struct scsi_mechanism_status_header *)ctsio->kern_data_ptr;
 	hdr->state1 = 0x00;
 	hdr->state2 = 0xe0;
 	scsi_ulto3b(0, hdr->lba);
 	hdr->slots_num = 0;
 	scsi_ulto2b(0, hdr->slots_length);
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Convert a logical block address into a 4-byte minute/second/frame
  * address, written to buf[0..3].
  *
  * The LBA is first offset by 150, then split at 75 units per second and
  * 60 seconds per minute.  buf[0] is always zero; minutes, seconds and
  * frames go to buf[1], buf[2] and buf[3], each passed through bin2bcd().
  */
 static void
 ctl_ultomsf(uint32_t lba, uint8_t *buf)
 {
 
 	lba += 150;
 	buf[0] = 0;
 	buf[1] = bin2bcd((lba / 75) / 60);
 	buf[2] = bin2bcd((lba / 75) % 60);
 	buf[3] = bin2bcd(lba % 75);
 }
 
 /*
  * READ TOC: report a single track covering the whole LUN.
  *
  * Format 0 returns two descriptors: track 1 starting at address 0, and
  * track number 0xaa starting at maxlba + 1.  Every other format value
  * returns only the track 1 descriptor.  Start addresses are encoded with
  * ctl_ultomsf() when the CDB sets CD_MSF, and as 4-byte block addresses
  * otherwise.
  *
  * Always returns CTL_RETVAL_COMPLETE.
  */
 int
 ctl_read_toc(struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun = CTL_LUN(ctsio);
 	struct scsi_read_toc_hdr *hdr;
 	struct scsi_read_toc_type01_descr *descr;
 	struct scsi_read_toc *cdb;
 	uint32_t alloc_len, data_len;
 	int format, msf;
 
 	cdb = (struct scsi_read_toc *)ctsio->cdb;
 	msf = (cdb->byte2 & CD_MSF) != 0;
 	format = cdb->format;
 	alloc_len = scsi_2btoul(cdb->data_len);
 
 	/* Header plus two descriptors for format 0, one otherwise. */
 	data_len = sizeof(struct scsi_read_toc_hdr);
 	if (format == 0)
 		data_len += 2 * sizeof(struct scsi_read_toc_type01_descr);
 	else
 		data_len += sizeof(struct scsi_read_toc_type01_descr);
 	ctsio->kern_data_ptr = malloc(data_len, M_CTL, M_WAITOK | M_ZERO);
 	ctsio->kern_sg_entries = 0;
 	ctsio->kern_rel_offset = 0;
 	ctsio->kern_data_len = min(data_len, alloc_len);
 	ctsio->kern_total_len = ctsio->kern_data_len;
 
 	hdr = (struct scsi_read_toc_hdr *)ctsio->kern_data_ptr;
 	if (format == 0) {
 		scsi_ulto2b(0x12, hdr->data_length);
 		hdr->first = 1;
 		hdr->last = 1;
 		descr = (struct scsi_read_toc_type01_descr *)(hdr + 1);
 		descr->addr_ctl = 0x14;
 		descr->track_number = 1;
 		if (msf)
 			ctl_ultomsf(0, descr->track_start);
 		else
 			scsi_ulto4b(0, descr->track_start);
 		/* Second descriptor: the address just past the last block. */
 		descr++;
 		descr->addr_ctl = 0x14;
 		descr->track_number = 0xaa;
 		if (msf)
 			ctl_ultomsf(lun->be_lun->maxlba+1, descr->track_start);
 		else
 			scsi_ulto4b(lun->be_lun->maxlba+1, descr->track_start);
 	} else {
 		scsi_ulto2b(0x0a, hdr->data_length);
 		hdr->first = 1;
 		hdr->last = 1;
 		descr = (struct scsi_read_toc_type01_descr *)(hdr + 1);
 		descr->addr_ctl = 0x14;
 		descr->track_number = 1;
 		if (msf)
 			ctl_ultomsf(0, descr->track_start);
 		else
 			scsi_ulto4b(0, descr->track_start);
 	}
 
 	ctl_set_success(ctsio);
 	ctsio->io_hdr.flags |= CTL_FLAG_ALLOCATED;
 	ctsio->be_move_done = ctl_config_move_done;
 	ctl_datamove((union ctl_io *)ctsio);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * For known CDB types, parse the LBA and length.
  *
  * On success stores the starting LBA in *lba and the number of blocks in
  * *len and returns 0.  Returns 1, leaving *lba and *len untouched, when
  * the I/O is not a SCSI I/O or the opcode is not one of the commands
  * handled below.
  *
  * UNMAP is reported as covering the whole LBA space (0 .. UINT64_MAX) and
  * SERVICE ACTION IN (GET LBA STATUS) as UINT32_MAX blocks from the
  * requested LBA, since their real ranges are not encoded in the CDB
  * length field.
  */
 static int
 ctl_get_lba_len(union ctl_io *io, uint64_t *lba, uint64_t *len)
 {
 	if (io->io_hdr.io_type != CTL_IO_SCSI)
 		return (1);
 
 	switch (io->scsiio.cdb[0]) {
 	case COMPARE_AND_WRITE: {
 		struct scsi_compare_and_write *cdb;
 
 		cdb = (struct scsi_compare_and_write *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = cdb->length;
 		break;
 	}
 	case READ_6:
 	case WRITE_6: {
 		struct scsi_rw_6 *cdb;
 
 		cdb = (struct scsi_rw_6 *)io->scsiio.cdb;
 
 		*lba = scsi_3btoul(cdb->addr);
 		/* only 5 bits are valid in the most significant address byte */
 		*lba &= 0x1fffff;
 		*len = cdb->length;
 		/*
 		 * In the 6-byte READ/WRITE CDBs a transfer length of 0
 		 * means 256 blocks, not "no blocks".  Report the real
 		 * extent so that overlap checks see the whole range.
 		 */
 		if (*len == 0)
 			*len = 256;
 		break;
 	}
 	case READ_10:
 	case WRITE_10: {
 		struct scsi_rw_10 *cdb;
 
 		cdb = (struct scsi_rw_10 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_2btoul(cdb->length);
 		break;
 	}
 	case WRITE_VERIFY_10: {
 		struct scsi_write_verify_10 *cdb;
 
 		cdb = (struct scsi_write_verify_10 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_2btoul(cdb->length);
 		break;
 	}
 	case READ_12:
 	case WRITE_12: {
 		struct scsi_rw_12 *cdb;
 
 		cdb = (struct scsi_rw_12 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case WRITE_VERIFY_12: {
 		struct scsi_write_verify_12 *cdb;
 
 		cdb = (struct scsi_write_verify_12 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case READ_16:
 	case WRITE_16: {
 		struct scsi_rw_16 *cdb;
 
 		cdb = (struct scsi_rw_16 *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case WRITE_ATOMIC_16: {
 		struct scsi_write_atomic_16 *cdb;
 
 		cdb = (struct scsi_write_atomic_16 *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = scsi_2btoul(cdb->length);
 		break;
 	}
 	case WRITE_VERIFY_16: {
 		struct scsi_write_verify_16 *cdb;
 
 		cdb = (struct scsi_write_verify_16 *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case WRITE_SAME_10: {
 		struct scsi_write_same_10 *cdb;
 
 		cdb = (struct scsi_write_same_10 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_2btoul(cdb->length);
 		break;
 	}
 	case WRITE_SAME_16: {
 		struct scsi_write_same_16 *cdb;
 
 		cdb = (struct scsi_write_same_16 *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case VERIFY_10: {
 		struct scsi_verify_10 *cdb;
 
 		cdb = (struct scsi_verify_10 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_2btoul(cdb->length);
 		break;
 	}
 	case VERIFY_12: {
 		struct scsi_verify_12 *cdb;
 
 		cdb = (struct scsi_verify_12 *)io->scsiio.cdb;
 
 		*lba = scsi_4btoul(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case VERIFY_16: {
 		struct scsi_verify_16 *cdb;
 
 		cdb = (struct scsi_verify_16 *)io->scsiio.cdb;
 
 		*lba = scsi_8btou64(cdb->addr);
 		*len = scsi_4btoul(cdb->length);
 		break;
 	}
 	case UNMAP: {
 		/* Ranges live in the data-out buffer; claim everything. */
 		*lba = 0;
 		*len = UINT64_MAX;
 		break;
 	}
 	case SERVICE_ACTION_IN: {	/* GET LBA STATUS */
 		struct scsi_get_lba_status *cdb;
 
 		cdb = (struct scsi_get_lba_status *)io->scsiio.cdb;
 		*lba = scsi_8btou64(cdb->addr);
 		*len = UINT32_MAX;
 		break;
 	}
 	default:
 		return (1);
 		break; /* NOTREACHED */
 	}
 
 	return (0);
 }
 
 /*
  * Decide whether two LBA extents collide.
  *
  * Each extent is given as a start LBA and a block count.  The inclusive
  * end of the first extent is lba1 + len1 - 1, or lba1 + len1 when seq is
  * true (which makes an immediately following extent count as a collision
  * too).  Returns CTL_ACTION_BLOCK when the extents intersect and
  * CTL_ACTION_PASS otherwise.
  */
 static ctl_action
 ctl_extent_check_lba(uint64_t lba1, uint64_t len1, uint64_t lba2, uint64_t len2,
     bool seq)
 {
 	uint64_t last1, last2;
 
 	last1 = lba1 + len1;
 	if (!seq)
 		last1 -= 1;
 	last2 = lba2 + len2 - 1;
 
 	/* They intersect only if neither one ends before the other starts. */
 	if (last1 >= lba2 && last2 >= lba1)
 		return (CTL_ACTION_BLOCK);
 	return (CTL_ACTION_PASS);
 }
 
 /*
  * Check an UNMAP command's descriptor list against the extent
  * [lba2, lba2 + len2).
  *
  * Returns CTL_ACTION_ERROR when io is not a SCSI UNMAP (the caller then
  * falls back to the generic check), CTL_ACTION_BLOCK when the UNMAP data
  * is not available yet or any descriptor intersects the extent, and
  * CTL_ACTION_PASS when no descriptor intersects it.
  */
 static int
 ctl_extent_check_unmap(union ctl_io *io, uint64_t lba2, uint64_t len2)
 {
 	struct ctl_ptr_len_flags *priv;
 	struct scsi_unmap_desc *cur, *stop;
 	uint64_t start;
 	uint32_t count;
 
 	/* Not an UNMAP: let the caller handle it another way. */
 	if (io->io_hdr.io_type != CTL_IO_SCSI)
 		return (CTL_ACTION_ERROR);
 	if (io->scsiio.cdb[0] != UNMAP)
 		return (CTL_ACTION_ERROR);
 
 	/* Descriptors have not arrived yet: block until they do. */
 	priv = (struct ctl_ptr_len_flags *)
 	    &io->io_hdr.ctl_private[CTL_PRIV_LBA_LEN];
 	if ((io->io_hdr.flags & CTL_FLAG_ALLOCATED) == 0)
 		return (CTL_ACTION_BLOCK);
 	if (priv->ptr == NULL)
 		return (CTL_ACTION_BLOCK);
 
 	/* Walk every descriptor looking for an intersection. */
 	cur = (struct scsi_unmap_desc *)priv->ptr;
 	stop = cur + priv->len / sizeof(*cur);
 	while (cur < stop) {
 		start = scsi_8btou64(cur->lba);
 		count = scsi_4btoul(cur->length);
 		if ((start < lba2 + len2) && (start + count > lba2))
 			return (CTL_ACTION_BLOCK);
 		cur++;
 	}
 	return (CTL_ACTION_PASS);
 }
 
 /*
  * Extent collision check between io1 and io2.
  *
  * io2's extent is parsed first; if that fails the result is
  * CTL_ACTION_ERROR.  If io1 is an UNMAP its descriptor list is compared
  * against io2's extent and that verdict is returned.  Otherwise io1's
  * extent is parsed and the two extents are compared; the seq hint is
  * dropped when io1 already has CTL_FLAG_SERSEQ_DONE set.
  */
 static ctl_action
 ctl_extent_check(union ctl_io *io1, union ctl_io *io2, bool seq)
 {
 	uint64_t start1, count1;
 	uint64_t start2, count2;
 	int unmap_verdict;
 
 	if (ctl_get_lba_len(io2, &start2, &count2) != 0)
 		return (CTL_ACTION_ERROR);
 
 	/* CTL_ACTION_ERROR here only means "io1 is not an UNMAP". */
 	unmap_verdict = ctl_extent_check_unmap(io1, start2, count2);
 	if (unmap_verdict != CTL_ACTION_ERROR)
 		return (unmap_verdict);
 
 	if (ctl_get_lba_len(io1, &start1, &count1) != 0)
 		return (CTL_ACTION_ERROR);
 
 	if ((io1->io_hdr.flags & CTL_FLAG_SERSEQ_DONE) != 0)
 		seq = FALSE;
 	return (ctl_extent_check_lba(start1, count1, start2, count2, seq));
 }
 
 /*
  * Sequential-access check: block io2 when it starts exactly where io1
  * ends, unless io1 already has CTL_FLAG_SERSEQ_DONE set.  Returns
  * CTL_ACTION_ERROR when either extent cannot be parsed.
  */
 static ctl_action
 ctl_extent_check_seq(union ctl_io *io1, union ctl_io *io2)
 {
 	uint64_t start1, count1;
 	uint64_t start2, count2;
 
 	if ((io1->io_hdr.flags & CTL_FLAG_SERSEQ_DONE) != 0)
 		return (CTL_ACTION_PASS);
 
 	if (ctl_get_lba_len(io1, &start1, &count1) != 0 ||
 	    ctl_get_lba_len(io2, &start2, &count2) != 0)
 		return (CTL_ACTION_ERROR);
 
 	return ((start1 + count1 == start2) ?
 	    CTL_ACTION_BLOCK : CTL_ACTION_PASS);
 }
 
 static ctl_action
 ctl_check_for_blockage(struct ctl_lun *lun, union ctl_io *pending_io,
     union ctl_io *ooa_io)
 {
 	const struct ctl_cmd_entry *pending_entry, *ooa_entry;
 	const ctl_serialize_action *serialize_row;
 
 	/*
 	 * Aborted commands are not going to be executed and may even
 	 * not report completion, so we don't care about their order.
 	 * Let them complete ASAP to clean the OOA queue.
 	 */
 	if (pending_io->io_hdr.flags & CTL_FLAG_ABORT)
 		return (CTL_ACTION_SKIP);
 
 	/*
 	 * The initiator attempted multiple untagged commands at the same
 	 * time.  Can't do that.
 	 */
 	if ((pending_io->scsiio.tag_type == CTL_TAG_UNTAGGED)
 	 && (ooa_io->scsiio.tag_type == CTL_TAG_UNTAGGED)
 	 && ((pending_io->io_hdr.nexus.targ_port ==
 	      ooa_io->io_hdr.nexus.targ_port)
 	  && (pending_io->io_hdr.nexus.initid ==
 	      ooa_io->io_hdr.nexus.initid))
 	 && ((ooa_io->io_hdr.flags & (CTL_FLAG_ABORT |
 	      CTL_FLAG_STATUS_SENT)) == 0))
 		return (CTL_ACTION_OVERLAP);
 
 	/*
 	 * The initiator attempted to send multiple tagged commands with
 	 * the same ID.  (It's fine if different initiators have the same
 	 * tag ID.)
 	 *
 	 * Even if all of those conditions are true, we don't kill the I/O
 	 * if the command ahead of us has been aborted.  We won't end up
 	 * sending it to the FETD, and it's perfectly legal to resend a
 	 * command with the same tag number as long as the previous
 	 * instance of this tag number has been aborted somehow.
 	 */
 	if ((pending_io->scsiio.tag_type != CTL_TAG_UNTAGGED)
 	 && (ooa_io->scsiio.tag_type != CTL_TAG_UNTAGGED)
 	 && (pending_io->scsiio.tag_num == ooa_io->scsiio.tag_num)
 	 && ((pending_io->io_hdr.nexus.targ_port ==
 	      ooa_io->io_hdr.nexus.targ_port)
 	  && (pending_io->io_hdr.nexus.initid ==
 	      ooa_io->io_hdr.nexus.initid))
 	 && ((ooa_io->io_hdr.flags & (CTL_FLAG_ABORT |
 	      CTL_FLAG_STATUS_SENT)) == 0))
 		return (CTL_ACTION_OVERLAP_TAG);
 
 	/*
 	 * If we get a head of queue tag, SAM-3 says that we should
 	 * immediately execute it.
 	 *
 	 * What happens if this command would normally block for some other
 	 * reason?  e.g. a request sense with a head of queue tag
 	 * immediately after a write.  Normally that would block, but this
 	 * will result in its getting executed immediately...
 	 *
 	 * We currently return "pass" instead of "skip", so we'll end up
 	 * going through the rest of the queue to check for overlapped tags.
 	 *
 	 * XXX KDM check for other types of blockage first??
 	 */
 	if (pending_io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE)
 		return (CTL_ACTION_PASS);
 
 	/*
 	 * Ordered tags have to block until all items ahead of them
 	 * have completed.  If we get called with an ordered tag, we always
 	 * block, if something else is ahead of us in the queue.
 	 */
 	if (pending_io->scsiio.tag_type == CTL_TAG_ORDERED)
 		return (CTL_ACTION_BLOCK);
 
 	/*
 	 * Simple tags get blocked until all head of queue and ordered tags
 	 * ahead of them have completed.  I'm lumping untagged commands in
 	 * with simple tags here.  XXX KDM is that the right thing to do?
 	 */
 	if (((pending_io->scsiio.tag_type == CTL_TAG_UNTAGGED)
 	  || (pending_io->scsiio.tag_type == CTL_TAG_SIMPLE))
 	 && ((ooa_io->scsiio.tag_type == CTL_TAG_HEAD_OF_QUEUE)
 	  || (ooa_io->scsiio.tag_type == CTL_TAG_ORDERED)))
 		return (CTL_ACTION_BLOCK);
 
 	pending_entry = ctl_get_cmd_entry(&pending_io->scsiio, NULL);
 	KASSERT(pending_entry->seridx < CTL_SERIDX_COUNT,
 	    ("%s: Invalid seridx %d for pending CDB %02x %02x @ %p",
 	     __func__, pending_entry->seridx, pending_io->scsiio.cdb[0],
 	     pending_io->scsiio.cdb[1], pending_io));
 	ooa_entry = ctl_get_cmd_entry(&ooa_io->scsiio, NULL);
 	if (ooa_entry->seridx == CTL_SERIDX_INVLD)
 		return (CTL_ACTION_PASS); /* Unsupported command in OOA queue */
 	KASSERT(ooa_entry->seridx < CTL_SERIDX_COUNT,
 	    ("%s: Invalid seridx %d for ooa CDB %02x %02x @ %p",
 	     __func__, ooa_entry->seridx, ooa_io->scsiio.cdb[0],
 	     ooa_io->scsiio.cdb[1], ooa_io));
 
 	serialize_row = ctl_serialize_table[ooa_entry->seridx];
 
 	switch (serialize_row[pending_entry->seridx]) {
 	case CTL_SER_BLOCK:
 		return (CTL_ACTION_BLOCK);
 	case CTL_SER_EXTENT:
 		return (ctl_extent_check(ooa_io, pending_io,
 		    (lun->be_lun && lun->be_lun->serseq == CTL_LUN_SERSEQ_ON)));
 	case CTL_SER_EXTENTOPT:
 		if ((lun->MODE_CTRL.queue_flags & SCP_QUEUE_ALG_MASK) !=
 		    SCP_QUEUE_ALG_UNRESTRICTED)
 			return (ctl_extent_check(ooa_io, pending_io,
 			    (lun->be_lun &&
 			     lun->be_lun->serseq == CTL_LUN_SERSEQ_ON)));
 		return (CTL_ACTION_PASS);
 	case CTL_SER_EXTENTSEQ:
 		if (lun->be_lun && lun->be_lun->serseq != CTL_LUN_SERSEQ_OFF)
 			return (ctl_extent_check_seq(ooa_io, pending_io));
 		return (CTL_ACTION_PASS);
 	case CTL_SER_PASS:
 		return (CTL_ACTION_PASS);
 	case CTL_SER_BLOCKOPT:
 		if ((lun->MODE_CTRL.queue_flags & SCP_QUEUE_ALG_MASK) !=
 		    SCP_QUEUE_ALG_UNRESTRICTED)
 			return (CTL_ACTION_BLOCK);
 		return (CTL_ACTION_PASS);
 	case CTL_SER_SKIP:
 		return (CTL_ACTION_SKIP);
 	default:
 		panic("%s: Invalid serialization value %d for %d => %d",
 		    __func__, serialize_row[pending_entry->seridx],
 		    pending_entry->seridx, ooa_entry->seridx);
 	}
 
 	return (CTL_ACTION_ERROR);
 }
 
 /*
  * Check pending_io for blockage or overlaps against the OOA (Order Of
  * Arrival) queue.
  *
  * The walk starts at *starting_io and moves towards the head of the
  * queue.  pending_io is normally either a newly arrived I/O or one that
  * sits on a blocked queue.  On return *starting_io points at the I/O that
  * produced a non-PASS verdict, or is NULL when every I/O passed (which is
  * also the outcome when it was NULL on entry).  The LUN lock must be held.
  */
 static ctl_action
 ctl_check_ooa(struct ctl_lun *lun, union ctl_io *pending_io,
 	      union ctl_io **starting_io)
 {
 	union ctl_io *cur;
 	ctl_action verdict;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	cur = *starting_io;
 	while (cur != NULL) {
 		verdict = ctl_check_for_blockage(lun, pending_io, cur);
 		if (verdict != CTL_ACTION_PASS) {
 			/* Report which I/O stopped the walk. */
 			*starting_io = cur;
 			return (verdict);
 		}
 		cur = (union ctl_io *)TAILQ_PREV(&cur->io_hdr, ctl_ooaq,
 		    ooa_links);
 	}
 
 	*starting_io = NULL;
 	return (CTL_ACTION_PASS);
 }
 
 /*
  * Try to unblock the specified I/O.
  *
  * skip parameter allows explicitly skip present blocker of the I/O,
  * starting from the previous one on OOA queue.  It can be used when
  * we know for sure that the blocker I/O does no longer count.
  *
  * Does nothing if the I/O has no blocker.  Otherwise the OOA queue is
  * re-checked from the blocker (or the entry before it) backwards.  If the
  * I/O is still blocked it is moved to the new blocker's blocked queue when
  * the blocker changed.  If it is no longer blocked it is removed from the
  * old blocker's queue and then either queued to run, completed with an
  * error status, or handed back to the other SC, depending on the verdict.
  * The LUN lock must be held by the caller.
  */
 static void
 ctl_try_unblock_io(struct ctl_lun *lun, union ctl_io *io, bool skip)
 {
 	struct ctl_softc *softc = lun->ctl_softc;
 	union ctl_io *bio, *obio;
 	const struct ctl_cmd_entry *entry;
 	union ctl_ha_msg msg_info;
 	ctl_action action;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	if (io->io_hdr.blocker == NULL)
 		return;
 
 	/* obio remembers the original blocker; bio is where the walk starts. */
 	obio = bio = io->io_hdr.blocker;
 	if (skip)
 		bio = (union ctl_io *)TAILQ_PREV(&bio->io_hdr, ctl_ooaq,
 		    ooa_links);
 	/* ctl_check_ooa() updates bio to the I/O that stopped the walk. */
 	action = ctl_check_ooa(lun, io, &bio);
 	if (action == CTL_ACTION_BLOCK) {
 		/* Still blocked, but may be by different I/O now. */
 		if (bio != obio) {
 			TAILQ_REMOVE(&obio->io_hdr.blocked_queue,
 			    &io->io_hdr, blocked_links);
 			TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue,
 			    &io->io_hdr, blocked_links);
 			io->io_hdr.blocker = bio;
 		}
 		return;
 	}
 
 	/* No longer blocked, one way or another. */
 	TAILQ_REMOVE(&obio->io_hdr.blocked_queue, &io->io_hdr, blocked_links);
 	io->io_hdr.blocker = NULL;
 
 	switch (action) {
 	case CTL_ACTION_OVERLAP:
 		ctl_set_overlapped_cmd(&io->scsiio);
 		goto error;
 	case CTL_ACTION_OVERLAP_TAG:
 		ctl_set_overlapped_tag(&io->scsiio,
 		    io->scsiio.tag_num & 0xff);
 		goto error;
 	case CTL_ACTION_PASS:
 	case CTL_ACTION_SKIP:
 
 		/* Serializing commands from the other SC retire there. */
 		if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) &&
 		    (softc->ha_mode != CTL_HA_MODE_XFER)) {
 			io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE;
 			msg_info.hdr.original_sc = io->io_hdr.remote_io;
 			msg_info.hdr.serializing_sc = io;
 			msg_info.hdr.msg_type = CTL_MSG_R2R;
 			ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 			    sizeof(msg_info.hdr), M_NOWAIT);
 			break;
 		}
 
 		/*
 		 * Check this I/O for LUN state changes that may have happened
 		 * while this command was blocked. The LUN state may have been
 		 * changed by a command ahead of us in the queue.
 		 */
 		entry = ctl_get_cmd_entry(&io->scsiio, NULL);
 		if (ctl_scsiio_lun_check(lun, entry, &io->scsiio) != 0) {
 			ctl_done(io);
 			break;
 		}
 
 		io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
 		ctl_enqueue_rtr(io);
 		break;
 	case CTL_ACTION_ERROR:
 	default:
 		ctl_set_internal_failure(&io->scsiio,
 					 /*sks_valid*/ 0,
 					 /*retry_count*/ 0);
 
 		/*
 		 * The overlap cases above jump here after setting their own
 		 * status, sharing the completion path below.
 		 */
 error:
 		/* Serializing commands from the other SC are done here. */
 		if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) &&
 		    (softc->ha_mode != CTL_HA_MODE_XFER)) {
 			/* Release anything this I/O blocks before dropping it. */
 			ctl_try_unblock_others(lun, io, TRUE);
 			TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links);
 
 			ctl_copy_sense_data_back(io, &msg_info);
 			msg_info.hdr.original_sc = io->io_hdr.remote_io;
 			msg_info.hdr.serializing_sc = NULL;
 			msg_info.hdr.msg_type = CTL_MSG_BAD_JUJU;
 			/*
 			 * NOTE(review): M_WAITOK is passed here while
 			 * lun_lock is asserted held at the top of this
 			 * function, whereas the R2R send above uses
 			 * M_NOWAIT -- confirm ctl_ha_msg_send() cannot
 			 * sleep on this path.
 			 */
 			ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 			    sizeof(msg_info.scsi), M_WAITOK);
 			ctl_free_io(io);
 			break;
 		}
 
 		ctl_done(io);
 		break;
 	}
 }
 
 /*
  * Try to unblock every I/O that is blocked by the specified I/O.
  *
  * When skip is set, the specified I/O itself is not considered a blocker
  * any more and the re-check starts from the entry before it on the OOA
  * queue.  Use that when the specified I/O is known to no longer count
  * (done).  It must still be on the OOA queue so that the starting point
  * is known.  The LUN lock must be held by the caller.
  */
 static void
 ctl_try_unblock_others(struct ctl_lun *lun, union ctl_io *bio, bool skip)
 {
 	union ctl_io *cur, *following;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	cur = (union ctl_io *)TAILQ_FIRST(&bio->io_hdr.blocked_queue);
 	while (cur != NULL) {
 		/* Fetch the successor first: cur may leave this queue. */
 		following = (union ctl_io *)TAILQ_NEXT(&cur->io_hdr,
 		    blocked_links);
 
 		KASSERT(cur->io_hdr.blocker != NULL,
 		    ("I/O %p on blocked list without blocker", cur));
 		ctl_try_unblock_io(lun, cur, skip);
 		cur = following;
 	}
 	KASSERT(!skip || TAILQ_EMPTY(&bio->io_hdr.blocked_queue),
 	    ("blocked_queue is not empty after skipping %p", bio));
 }
 
 /*
  * Check LUN state that commands ahead of us in the OOA queue may have
  * changed (with one exception).  The checks run both when a command first
  * arrives and again when it comes off the blocked queue, because the LUN
  * state can change while the command waits there.
  *
  * Returns 0 when the command may proceed and 1 when it was rejected, in
  * which case the matching status has already been set on ctsio.  The LUN
  * lock must be held by the caller.
  *
  * The order of the checks matters; place any new check with care.
  */
 static int
 ctl_scsiio_lun_check(struct ctl_lun *lun,
     const struct ctl_cmd_entry *entry, struct ctl_scsiio *ctsio)
 {
 	struct ctl_softc *softc = lun->ctl_softc;
 	uint32_t residx;
 	int wr_excl_ok;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	/*
 	 * On a secondary shelf controller some commands have to be
 	 * rejected depending on HA mode and link state.
 	 */
 	if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0) {
 		if (softc->ha_link == CTL_HA_LINK_OFFLINE &&
 		    (entry->flags & CTL_CMD_FLAG_OK_ON_UNAVAIL) == 0) {
 			ctl_set_lun_unavail(ctsio);
 			return (1);
 		}
 		if ((lun->flags & CTL_LUN_PEER_SC_PRIMARY) == 0 &&
 		    (entry->flags & CTL_CMD_FLAG_OK_ON_UNAVAIL) == 0) {
 			ctl_set_lun_transit(ctsio);
 			return (1);
 		}
 		if (softc->ha_mode == CTL_HA_MODE_ACT_STBY &&
 		    (entry->flags & CTL_CMD_FLAG_OK_ON_STANDBY) == 0) {
 			ctl_set_lun_standby(ctsio);
 			return (1);
 		}
 
 		/* The remaining checks belong to the executing side. */
 		if (softc->ha_mode == CTL_HA_MODE_XFER)
 			return (0);
 	}
 
 	/* Write-type commands: hardware and software write protection. */
 	if (entry->pattern & CTL_LUN_PAT_WRITE) {
 		if (lun->be_lun &&
 		    lun->be_lun->flags & CTL_LUN_FLAG_READONLY) {
 			ctl_set_hw_write_protected(ctsio);
 			return (1);
 		}
 		if ((lun->MODE_CTRL.eca_and_aen & SCP_SWP) != 0) {
 			ctl_set_sense(ctsio, /*current_error*/ 1,
 			    /*sense_key*/ SSD_KEY_DATA_PROTECT,
 			    /*asc*/ 0x27, /*ascq*/ 0x02, SSD_ELEM_NONE);
 			return (1);
 		}
 	}
 
 	/*
 	 * Reservation conflict: the LUN is reserved, the command is not
 	 * one that is allowed on reserved LUNs, and the reservation is
 	 * held by a different initiator.
 	 */
 	residx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 	if ((lun->flags & CTL_LUN_RESERVED)
 	 && ((entry->flags & CTL_CMD_FLAG_ALLOW_ON_RESV) == 0)
 	 && (lun->res_idx != residx)) {
 		ctl_set_reservation_conflict(ctsio);
 		return (1);
 	}
 
 	/*
 	 * Persistent reservation: nothing to do unless one is in place
 	 * and the command is not unconditionally allowed under it.
 	 */
 	if ((lun->flags & CTL_LUN_PR_RESERVED) != 0 &&
 	    (entry->flags & CTL_CMD_FLAG_ALLOW_ON_PR_RESV) == 0) {
 		/* Is the command allowed under a Write Exclusive resv? */
 		wr_excl_ok = (entry->flags & CTL_CMD_FLAG_ALLOW_ON_PR_WRESV) &&
 		    (lun->pr_res_type == SPR_TYPE_WR_EX ||
 		     lun->pr_res_type == SPR_TYPE_WR_EX_RO ||
 		     lun->pr_res_type == SPR_TYPE_WR_EX_AR);
 
 		/*
 		 * Otherwise it conflicts if we aren't registered, or if
 		 * it's a res holder type reservation and this isn't the
 		 * res holder.
 		 */
 		if (!wr_excl_ok &&
 		    (ctl_get_prkey(lun, residx) == 0 ||
 		    (residx != lun->pr_res_idx && lun->pr_res_type < 4))) {
 			ctl_set_reservation_conflict(ctsio);
 			return (1);
 		}
 	}
 
 	/* Commands that are fine without media need no further checks. */
 	if ((entry->flags & CTL_CMD_FLAG_OK_ON_NO_MEDIA) != 0)
 		return (0);
 
 	if (lun->flags & CTL_LUN_EJECTED) {
 		ctl_set_lun_ejected(ctsio);
 		return (1);
 	}
 	if (lun->flags & CTL_LUN_NO_MEDIA) {
 		if (lun->flags & CTL_LUN_REMOVABLE)
 			ctl_set_lun_no_media(ctsio);
 		else
 			ctl_set_lun_int_reqd(ctsio);
 		return (1);
 	}
 	if (lun->flags & CTL_LUN_STOPPED) {
 		ctl_set_lun_stopped(ctsio);
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Complete an I/O with busy status.  The have_lock argument is accepted
  * but not used.
  */
 static void
 ctl_failover_io(union ctl_io *io, int have_lock)
 {
 	struct ctl_scsiio *ctsio = &io->scsiio;
 
 	ctl_set_busy(ctsio);
 	ctl_done(io);
 }
 
 /*
  * Handle failover for the LUN addressed by rio's nexus.
  *
  * Looks up and locks the LUN (doing nothing if the LUN number is out of
  * range, the LUN does not exist, or it is disabled) and then walks its
  * OOA queue:
  *
  * - In XFER mode, active I/Os that came from the other SC are marked
  *   aborted/failed-over and re-checked for unblocking; inactive ones are
  *   completed through the ISC queue as a finished data move.  I/Os that
  *   were sent to the other SC are either flagged as failed over (if
  *   active) or completed with busy status.
  * - In the serialize modes, I/Os from the other SC are unlinked from
  *   their blocker, whatever they block is re-checked, and they are
  *   removed from the OOA queue and freed.  Inactive I/Os that were sent
  *   to the other SC are completed with busy status.
  */
 static void
 ctl_failover_lun(union ctl_io *rio)
 {
 	struct ctl_softc *softc = CTL_SOFTC(rio);
 	struct ctl_lun *lun;
 	struct ctl_io_hdr *io, *next_io;
 	uint32_t targ_lun;
 
 	targ_lun = rio->io_hdr.nexus.targ_mapped_lun;
 	/* %ju needs a uintmax_t argument; targ_lun is only 32 bits wide. */
 	CTL_DEBUG_PRINT(("FAILOVER for lun %ju\n", (uintmax_t)targ_lun));
 
 	/* Find and lock the LUN. */
 	mtx_lock(&softc->ctl_lock);
 	/*
 	 * Valid LUN numbers are 0 .. ctl_max_luns - 1, so the value
 	 * ctl_max_luns itself must be rejected before indexing ctl_luns[].
 	 */
 	if (targ_lun >= ctl_max_luns ||
 	    (lun = softc->ctl_luns[targ_lun]) == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 		return;
 	}
 	/* Take the LUN lock before dropping the softc lock. */
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&softc->ctl_lock);
 	if (lun->flags & CTL_LUN_DISABLED) {
 		mtx_unlock(&lun->lun_lock);
 		return;
 	}
 
 	if (softc->ha_mode == CTL_HA_MODE_XFER) {
 		TAILQ_FOREACH_SAFE(io, &lun->ooa_queue, ooa_links, next_io) {
 			/* We are master */
 			if (io->flags & CTL_FLAG_FROM_OTHER_SC) {
 				if (io->flags & CTL_FLAG_IO_ACTIVE) {
 					io->flags |= CTL_FLAG_ABORT;
 					io->flags |= CTL_FLAG_FAILOVER;
 					ctl_try_unblock_io(lun,
 					    (union ctl_io *)io, FALSE);
 				} else { /* This can be only due to DATAMOVE */
 					io->msg_type = CTL_MSG_DATAMOVE_DONE;
 					io->flags &= ~CTL_FLAG_DMA_INPROG;
 					io->flags |= CTL_FLAG_IO_ACTIVE;
 					io->port_status = 31340;
 					ctl_enqueue_isc((union ctl_io *)io);
 				}
 			} else
 			/* We are slave */
 			if (io->flags & CTL_FLAG_SENT_2OTHER_SC) {
 				io->flags &= ~CTL_FLAG_SENT_2OTHER_SC;
 				if (io->flags & CTL_FLAG_IO_ACTIVE) {
 					io->flags |= CTL_FLAG_FAILOVER;
 				} else {
 					ctl_set_busy(&((union ctl_io *)io)->
 					    scsiio);
 					ctl_done((union ctl_io *)io);
 				}
 			}
 		}
 	} else { /* SERIALIZE modes */
 		TAILQ_FOREACH_SAFE(io, &lun->ooa_queue, ooa_links, next_io) {
 			/* We are master */
 			if (io->flags & CTL_FLAG_FROM_OTHER_SC) {
 				/* Detach from whatever was blocking us. */
 				if (io->blocker != NULL) {
 					TAILQ_REMOVE(&io->blocker->io_hdr.blocked_queue,
 					    io, blocked_links);
 					io->blocker = NULL;
 				}
 				/* Re-check what we block before going away. */
 				ctl_try_unblock_others(lun, (union ctl_io *)io,
 				    TRUE);
 				TAILQ_REMOVE(&lun->ooa_queue, io, ooa_links);
 				ctl_free_io((union ctl_io *)io);
 			} else
 			/* We are slave */
 			if (io->flags & CTL_FLAG_SENT_2OTHER_SC) {
 				io->flags &= ~CTL_FLAG_SENT_2OTHER_SC;
 				if (!(io->flags & CTL_FLAG_IO_ACTIVE)) {
 					ctl_set_busy(&((union ctl_io *)io)->
 					    scsiio);
 					ctl_done((union ctl_io *)io);
 				}
 			}
 		}
 	}
 	mtx_unlock(&lun->lun_lock);
 }
 
 /*
  * Run the checks a newly arrived SCSI I/O must pass before it can be
  * executed, and dispose of it accordingly.
  *
  * The I/O is looked up against its mapped LUN (a disabled LUN is treated
  * as nonexistent), put on that LUN's OOA queue, validated, checked for
  * pending unit attentions and LUN state, and then either forwarded to the
  * peer SC, placed on a blocker's blocked queue, queued to run, or
  * completed with an error status.  The LUN lock is taken here and dropped
  * on every exit path.
  *
  * retval is initialized to 0 and never changed, so this always returns 0.
  */
 static int
 ctl_scsiio_precheck(struct ctl_softc *softc, struct ctl_scsiio *ctsio)
 {
 	struct ctl_lun *lun;
 	const struct ctl_cmd_entry *entry;
 	union ctl_io *bio;
 	uint32_t initidx, targ_lun;
 	int retval = 0;
 
 	lun = NULL;
 	targ_lun = ctsio->io_hdr.nexus.targ_mapped_lun;
 	if (targ_lun < ctl_max_luns)
 		lun = softc->ctl_luns[targ_lun];
 	if (lun) {
 		/*
 		 * If the LUN is invalid, pretend that it doesn't exist.
 		 * It will go away as soon as all pending I/O has been
 		 * completed.
 		 */
 		mtx_lock(&lun->lun_lock);
 		if (lun->flags & CTL_LUN_DISABLED) {
 			mtx_unlock(&lun->lun_lock);
 			lun = NULL;
 		}
 	}
 	/* From here on, lun != NULL implies lun_lock is held. */
 	CTL_LUN(ctsio) = lun;
 	if (lun) {
 		CTL_BACKEND_LUN(ctsio) = lun->be_lun;
 
 		/*
 		 * Every I/O goes into the OOA queue for a particular LUN,
 		 * and stays there until completion.
 		 */
 #ifdef CTL_TIME_IO
 		if (TAILQ_EMPTY(&lun->ooa_queue))
 			lun->idle_time += getsbinuptime() - lun->last_busy;
 #endif
 		TAILQ_INSERT_TAIL(&lun->ooa_queue, &ctsio->io_hdr, ooa_links);
 	}
 
 	/* Get command entry and return error if it is unsupported. */
 	entry = ctl_validate_command(ctsio);
 	if (entry == NULL) {
 		if (lun)
 			mtx_unlock(&lun->lun_lock);
 		return (retval);
 	}
 
 	/* Replace the I/O's data direction flags with the command's. */
 	ctsio->io_hdr.flags &= ~CTL_FLAG_DATA_MASK;
 	ctsio->io_hdr.flags |= entry->flags & CTL_FLAG_DATA_MASK;
 
 	/*
 	 * Check to see whether we can send this command to LUNs that don't
 	 * exist.  This should pretty much only be the case for inquiry
 	 * and request sense.  Further checks, below, really require having
 	 * a LUN, so we can't really check the command anymore.  Just put
 	 * it on the rtr queue.
 	 */
 	if (lun == NULL) {
 		if (entry->flags & CTL_CMD_FLAG_OK_ON_NO_LUN) {
 			ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
 			ctl_enqueue_rtr((union ctl_io *)ctsio);
 			return (retval);
 		}
 
 		ctl_set_unsupported_lun(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		CTL_DEBUG_PRINT(("ctl_scsiio_precheck: bailing out due to invalid LUN\n"));
 		return (retval);
 	} else {
 		/*
 		 * Make sure we support this particular command on this LUN.
 		 * e.g., we don't support writes to the control LUN.
 		 */
 		if (!ctl_cmd_applicable(lun->be_lun->lun_type, entry)) {
 			mtx_unlock(&lun->lun_lock);
 			ctl_set_invalid_opcode(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (retval);
 		}
 	}
 
 	initidx = ctl_get_initindex(&ctsio->io_hdr.nexus);
 
 	/*
 	 * If we've got a request sense, it'll clear the contingent
 	 * allegiance condition.  Otherwise, if we have a CA condition for
 	 * this initiator, clear it, because it sent down a command other
 	 * than request sense.
 	 */
 	if (ctsio->cdb[0] != REQUEST_SENSE) {
 		struct scsi_sense_data *ps;
 
 		/* pending_sense is a per-port array of per-initiator slots. */
 		ps = lun->pending_sense[initidx / CTL_MAX_INIT_PER_PORT];
 		if (ps != NULL)
 			ps[initidx % CTL_MAX_INIT_PER_PORT].error_code = 0;
 	}
 
 	/*
 	 * If the command has this flag set, it handles its own unit
 	 * attention reporting, we shouldn't do anything.  Otherwise we
 	 * check for any pending unit attentions, and send them back to the
 	 * initiator.  We only do this when a command initially comes in,
 	 * not when we pull it off the blocked queue.
 	 *
 	 * According to SAM-3, section 5.3.2, the order that things get
 	 * presented back to the host is basically unit attentions caused
 	 * by some sort of reset event, busy status, reservation conflicts
 	 * or task set full, and finally any other status.
 	 *
 	 * One issue here is that some of the unit attentions we report
 	 * don't fall into the "reset" category (e.g. "reported luns data
 	 * has changed").  So reporting it here, before the reservation
 	 * check, may be technically wrong.  I guess the only thing to do
 	 * would be to check for and report the reset events here, and then
 	 * check for the other unit attention types after we check for a
 	 * reservation conflict.
 	 *
 	 * XXX KDM need to fix this
 	 */
 	if ((entry->flags & CTL_CMD_FLAG_NO_SENSE) == 0) {
 		ctl_ua_type ua_type;
 		u_int sense_len = 0;
 
 		ua_type = ctl_build_ua(lun, initidx, &ctsio->sense_data,
 		    &sense_len, SSD_TYPE_NONE);
 		if (ua_type != CTL_UA_NONE) {
 			mtx_unlock(&lun->lun_lock);
 			ctsio->scsi_status = SCSI_STATUS_CHECK_COND;
 			ctsio->io_hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE;
 			ctsio->sense_len = sense_len;
 			ctl_done((union ctl_io *)ctsio);
 			return (retval);
 		}
 	}
 
 
 	/* On failure the check has already set the status on ctsio. */
 	if (ctl_scsiio_lun_check(lun, entry, ctsio) != 0) {
 		mtx_unlock(&lun->lun_lock);
 		ctl_done((union ctl_io *)ctsio);
 		return (retval);
 	}
 
 	/*
 	 * XXX CHD this is where we want to send IO to other side if
 	 * this LUN is secondary on this SC. We will need to make a copy
 	 * of the IO and flag the IO on this side as SENT_2OTHER and the flag
 	 * the copy we send as FROM_OTHER.
 	 * We also need to stuff the address of the original IO so we can
 	 * find it easily. Something similar will need be done on the other
 	 * side so when we are done we can find the copy.
 	 */
 	if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 &&
 	    (lun->flags & CTL_LUN_PEER_SC_PRIMARY) != 0 &&
 	    (entry->flags & CTL_CMD_FLAG_RUN_HERE) == 0) {
 		union ctl_ha_msg msg_info;
 		int isc_retval;
 
 		ctsio->io_hdr.flags |= CTL_FLAG_SENT_2OTHER_SC;
 		ctsio->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE;
 		/* Drop the LUN lock before the M_WAITOK send below. */
 		mtx_unlock(&lun->lun_lock);
 
 		msg_info.hdr.msg_type = CTL_MSG_SERIALIZE;
 		msg_info.hdr.original_sc = (union ctl_io *)ctsio;
 		msg_info.hdr.serializing_sc = NULL;
 		msg_info.hdr.nexus = ctsio->io_hdr.nexus;
 		msg_info.scsi.tag_num = ctsio->tag_num;
 		msg_info.scsi.tag_type = ctsio->tag_type;
 		msg_info.scsi.cdb_len = ctsio->cdb_len;
 		memcpy(msg_info.scsi.cdb, ctsio->cdb, CTL_MAX_CDBLEN);
 
 		/* The sense data at the end of the message is not sent. */
 		if ((isc_retval = ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 		    sizeof(msg_info.scsi) - sizeof(msg_info.scsi.sense_data),
 		    M_WAITOK)) > CTL_HA_STATUS_SUCCESS) {
 			ctl_set_busy(ctsio);
 			ctl_done((union ctl_io *)ctsio);
 			return (retval);
 		}
 		return (retval);
 	}
 
 	/* Check against everything ahead of us on the OOA queue. */
 	bio = (union ctl_io *)TAILQ_PREV(&ctsio->io_hdr, ctl_ooaq, ooa_links);
 	switch (ctl_check_ooa(lun, (union ctl_io *)ctsio, &bio)) {
 	case CTL_ACTION_BLOCK:
 		/* bio now points at the I/O that blocks us. */
 		ctsio->io_hdr.blocker = bio;
 		TAILQ_INSERT_TAIL(&bio->io_hdr.blocked_queue, &ctsio->io_hdr,
 				  blocked_links);
 		mtx_unlock(&lun->lun_lock);
 		return (retval);
 	case CTL_ACTION_PASS:
 	case CTL_ACTION_SKIP:
 		ctsio->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
 		mtx_unlock(&lun->lun_lock);
 		ctl_enqueue_rtr((union ctl_io *)ctsio);
 		break;
 	case CTL_ACTION_OVERLAP:
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_overlapped_cmd(ctsio);
 		ctl_done((union ctl_io *)ctsio);
 		break;
 	case CTL_ACTION_OVERLAP_TAG:
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_overlapped_tag(ctsio, ctsio->tag_num & 0xff);
 		ctl_done((union ctl_io *)ctsio);
 		break;
 	case CTL_ACTION_ERROR:
 	default:
 		mtx_unlock(&lun->lun_lock);
 		ctl_set_internal_failure(ctsio,
 					 /*sks_valid*/ 0,
 					 /*retry_count*/ 0);
 		ctl_done((union ctl_io *)ctsio);
 		break;
 	}
 	return (retval);
 }
 
 /*
  * Look up the command table entry for the CDB in ctsio.
  *
  * The opcode (cdb[0]) indexes ctl_cmd_table.  If that entry has
  * CTL_CMD_FLAG_SA5 set, its execute pointer is treated as a sub-table
  * that is indexed by the service action bits of cdb[1].  When sa is not
  * NULL, *sa is set to 1 if the opcode uses a service action and 0
  * otherwise.
  */
 const struct ctl_cmd_entry *
 ctl_get_cmd_entry(struct ctl_scsiio *ctsio, int *sa)
 {
 	const struct ctl_cmd_entry *ent;
 	int uses_sa;
 
 	ent = &ctl_cmd_table[ctsio->cdb[0]];
 	uses_sa = ((ent->flags & CTL_CMD_FLAG_SA5) != 0);
 	if (sa != NULL)
 		*sa = uses_sa;
 	if (!uses_sa)
 		return (ent);
 
 	/* Second-level lookup by service action. */
 	return (&((const struct ctl_cmd_entry *)
 	    ent->execute)[ctsio->cdb[1] & SERVICE_ACTION_MASK]);
 }
 
 /*
  * Validate the CDB in ctsio against the command table.
  *
  * Returns the matching command entry, or NULL after completing the I/O
  * with an error status when the command has no handler or when a CDB byte
  * has a bit set that the entry's usage mask does not allow.  In the latter
  * case the sense data points at the offending byte and its highest
  * offending bit.
  */
 const struct ctl_cmd_entry *
 ctl_validate_command(struct ctl_scsiio *ctsio)
 {
 	const struct ctl_cmd_entry *entry;
 	int idx, has_sa;
 	uint8_t extra;
 
 	entry = ctl_get_cmd_entry(ctsio, &has_sa);
 	if (entry->execute == NULL) {
 		/* No handler: blame the opcode or the service action. */
 		if (!has_sa)
 			ctl_set_invalid_opcode(ctsio);
 		else
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ 1,
 					      /*bit_valid*/ 1,
 					      /*bit*/ 4);
 		ctl_done((union ctl_io *)ctsio);
 		return (NULL);
 	}
 	KASSERT(entry->length > 0,
 	    ("Not defined length for command 0x%02x/0x%02x",
 	     ctsio->cdb[0], ctsio->cdb[1]));
 
 	/* usage[] starts at CDB byte 1, hence the idx - 1. */
 	for (idx = 1; idx < entry->length; idx++) {
 		extra = ctsio->cdb[idx] & ~entry->usage[idx - 1];
 		if (extra != 0) {
 			ctl_set_invalid_field(ctsio,
 					      /*sks_valid*/ 1,
 					      /*command*/ 1,
 					      /*field*/ idx,
 					      /*bit_valid*/ 1,
 					      /*bit*/ fls(extra) - 1);
 			ctl_done((union ctl_io *)ctsio);
 			return (NULL);
 		}
 	}
 	return (entry);
 }
 
 /*
  * Report whether a command may be used on a LUN of the given device type.
  *
  * Returns 1 when the entry carries the OK_ON flag that matches the LUN
  * type (direct access, processor, or CD-ROM) and 0 otherwise, including
  * for any other LUN type.
  */
 static int
 ctl_cmd_applicable(uint8_t lun_type, const struct ctl_cmd_entry *entry)
 {
 
 	switch (lun_type) {
 	case T_DIRECT:
 		return ((entry->flags & CTL_CMD_FLAG_OK_ON_DIRECT) != 0);
 	case T_PROCESSOR:
 		return ((entry->flags & CTL_CMD_FLAG_OK_ON_PROC) != 0);
 	case T_CDROM:
 		return ((entry->flags & CTL_CMD_FLAG_OK_ON_CDROM) != 0);
 	default:
 		break;
 	}
 	return (0);
 }
 
 /*
  * Execute a SCSI I/O.
  *
  * An I/O that has been aborted is completed right away without being
  * executed, and CTL_RETVAL_COMPLETE is returned.  Otherwise the command's
  * execute handler runs and its return value is passed back.  All
  * validation is expected to have been done by ctl_scsiio_precheck().
  */
 static int
 ctl_scsiio(struct ctl_scsiio *ctsio)
 {
 	const struct ctl_cmd_entry *cmd;
 
 	CTL_DEBUG_PRINT(("ctl_scsiio cdb[0]=%02X\n", ctsio->cdb[0]));
 
 	cmd = ctl_get_cmd_entry(ctsio, NULL);
 
 	if ((ctsio->io_hdr.flags & CTL_FLAG_ABORT) != 0) {
 		ctl_done((union ctl_io *)ctsio);
 		return (CTL_RETVAL_COMPLETE);
 	}
 
 	return (cmd->execute(ctsio));
 }
 
 static int
 ctl_target_reset(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	struct ctl_port *port = CTL_PORT(io);
 	struct ctl_lun *lun;
 	uint32_t initidx;
 	ctl_ua_type ua_type;
 
 	if (!(io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) {
 		union ctl_ha_msg msg_info;
 
 		msg_info.hdr.nexus = io->io_hdr.nexus;
 		msg_info.task.task_action = io->taskio.task_action;
 		msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS;
 		msg_info.hdr.original_sc = NULL;
 		msg_info.hdr.serializing_sc = NULL;
 		ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 		    sizeof(msg_info.task), M_WAITOK);
 	}
 
 	initidx = ctl_get_initindex(&io->io_hdr.nexus);
 	if (io->taskio.task_action == CTL_TASK_TARGET_RESET)
 		ua_type = CTL_UA_TARG_RESET;
 	else
 		ua_type = CTL_UA_BUS_RESET;
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_FOREACH(lun, &softc->lun_list, links) {
 		if (port != NULL &&
 		    ctl_lun_map_to_port(port, lun->lun) == UINT32_MAX)
 			continue;
 		ctl_do_lun_reset(lun, initidx, ua_type);
 	}
 	mtx_unlock(&softc->ctl_lock);
 	io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE;
 	return (0);
 }
 
 /*
  * Reset a single LUN.  Under the LUN lock this aborts every I/O on the
  * OOA queue, frees the pending sense data of all ports, drops the
  * reservation flag, clears all "prevent media removal" bits, clears TPC
  * state and finally establishes the ua_type unit attention.
  *
  * initidx identifies the initiator that asked for the reset.  It is only
  * consumed by the disabled (#if 0) variant below; the active code passes
  * -1 to ctl_est_ua_all() instead, so the unit attention is set for all
  * initiators.
  *
  * SAM-3 is vague on this point.  It does say that a unit attention should
  * be established for other initiators when a LUN is reset (see section
  * 5.7.3), but it doesn't specifically say that the unit attention should
  * be established for this particular initiator when a LUN is reset.  Here
  * is the relevant text, from SAM-3 rev 8:
  *
  * 5.7.2 When a SCSI initiator port aborts its own tasks
  *
  * When a SCSI initiator port causes its own task(s) to be aborted, no
  * notification that the task(s) have been aborted shall be returned to
  * the SCSI initiator port other than the completion response for the
  * command or task management function action that caused the task(s) to
  * be aborted and notification(s) associated with related effects of the
  * action (e.g., a reset unit attention condition).
  *
  * XXX KDM for now, we're setting unit attention for all initiators.
  */
 static void
 ctl_do_lun_reset(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua_type)
 {
 	union ctl_io *cur;
 	int idx;
 
 	mtx_lock(&lun->lun_lock);
 
 	/* Abort tasks: flag each queued I/O and let it unblock if it can. */
 	cur = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue);
 	while (cur != NULL) {
 		cur->io_hdr.flags |= CTL_FLAG_ABORT | CTL_FLAG_ABORT_STATUS;
 		ctl_try_unblock_io(lun, cur, FALSE);
 		cur = (union ctl_io *)TAILQ_NEXT(&cur->io_hdr, ooa_links);
 	}
 
 	/* Clear CA: release the per-port pending sense arrays. */
 	for (idx = 0; idx < ctl_max_ports; idx++) {
 		free(lun->pending_sense[idx], M_CTL);
 		lun->pending_sense[idx] = NULL;
 	}
 
 	/* Clear reservation. */
 	lun->flags &= ~CTL_LUN_RESERVED;
 
 	/* Clear prevent media removal for every initiator. */
 	if (lun->prevent) {
 		for (idx = 0; idx < CTL_MAX_INITIATORS; idx++)
 			ctl_clear_mask(lun->prevent, idx);
 		lun->prevent_count = 0;
 	}
 
 	/* Clear TPC status. */
 	ctl_tpc_lun_clear(lun, -1);
 
 	/* Establish UA. */
 #if 0
 	ctl_est_ua_all(lun, initidx, ua_type);
 #else
 	ctl_est_ua_all(lun, -1, ua_type);
 #endif
 
 	mtx_unlock(&lun->lun_lock);
 }
 
 static int
 ctl_lun_reset(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	struct ctl_lun *lun;
 	uint32_t targ_lun, initidx;
 
 	targ_lun = io->io_hdr.nexus.targ_mapped_lun;
 	initidx = ctl_get_initindex(&io->io_hdr.nexus);
 	mtx_lock(&softc->ctl_lock);
 	if (targ_lun >= ctl_max_luns ||
 	    (lun = softc->ctl_luns[targ_lun]) == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 		io->taskio.task_status = CTL_TASK_LUN_DOES_NOT_EXIST;
 		return (1);
 	}
 	ctl_do_lun_reset(lun, initidx, CTL_UA_LUN_RESET);
 	mtx_unlock(&softc->ctl_lock);
 	io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE;
 
 	if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) == 0) {
 		union ctl_ha_msg msg_info;
 
 		msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS;
 		msg_info.hdr.nexus = io->io_hdr.nexus;
 		msg_info.task.task_action = CTL_TASK_LUN_RESET;
 		msg_info.hdr.original_sc = NULL;
 		msg_info.hdr.serializing_sc = NULL;
 		ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 		    sizeof(msg_info.task), M_WAITOK);
 	}
 	return (0);
 }
 
 /*
  * Abort every I/O on the LUN's OOA queue that belongs to the given
  * target port / initiator pair.  UINT32_MAX in targ_port or init_id acts
  * as a wildcard for that component.
  *
  * Each matching I/O gets CTL_FLAG_ABORT.  CTL_FLAG_ABORT_STATUS is added
  * as well whenever the I/O's own port or initiator differs from the
  * values passed in (which includes every wildcard match).  When other_sc
  * is zero and this LUN is not flagged CTL_LUN_PRIMARY_SC, an ABORT TASK
  * message for the I/O is also sent to the HA peer.
  *
  * The caller must hold lun->lun_lock.
  */
 static void
 ctl_abort_tasks_lun(struct ctl_lun *lun, uint32_t targ_port, uint32_t init_id,
     int other_sc)
 {
 	union ctl_io *cur;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	cur = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue);
 	for (; cur != NULL;
 	    cur = (union ctl_io *)TAILQ_NEXT(&cur->io_hdr, ooa_links)) {
 
 		/* Skip I/Os from a different port or initiator. */
 		if (targ_port != UINT32_MAX &&
 		    targ_port != cur->io_hdr.nexus.targ_port)
 			continue;
 		if (init_id != UINT32_MAX &&
 		    init_id != cur->io_hdr.nexus.initid)
 			continue;
 
 		if (targ_port != cur->io_hdr.nexus.targ_port ||
 		    init_id != cur->io_hdr.nexus.initid)
 			cur->io_hdr.flags |= CTL_FLAG_ABORT_STATUS;
 		cur->io_hdr.flags |= CTL_FLAG_ABORT;
 
 		if (!other_sc && !(lun->flags & CTL_LUN_PRIMARY_SC)) {
 			union ctl_ha_msg peer_msg;
 
 			peer_msg.hdr.msg_type = CTL_MSG_MANAGE_TASKS;
 			peer_msg.hdr.nexus = cur->io_hdr.nexus;
 			peer_msg.hdr.original_sc = NULL;
 			peer_msg.hdr.serializing_sc = NULL;
 			peer_msg.task.task_action = CTL_TASK_ABORT_TASK;
 			peer_msg.task.tag_num = cur->scsiio.tag_num;
 			peer_msg.task.tag_type = cur->scsiio.tag_type;
 			ctl_ha_msg_send(CTL_HA_CHAN_CTL, &peer_msg,
 			    sizeof(peer_msg.task), M_NOWAIT);
 		}
 		ctl_try_unblock_io(lun, cur, FALSE);
 	}
 }
 
 /*
  * Task management: ABORT TASK SET and CLEAR TASK SET.
  *
  * ABORT TASK SET aborts only the I/Os of the requesting port/initiator;
  * any other task action handled here (CLEAR TASK SET) aborts the I/Os of
  * every port and initiator on the LUN.
  *
  * Returns 1 with CTL_TASK_LUN_DOES_NOT_EXIST when the LUN cannot be
  * found, otherwise 0 with CTL_TASK_FUNCTION_COMPLETE.
  */
 static int
 ctl_abort_task_set(union ctl_io *io)
 {
 	struct ctl_softc *sc = CTL_SOFTC(io);
 	struct ctl_lun *lun;
 	uint32_t lun_id, port, initid;
 	int from_peer;
 
 	/* Look up the LUN. */
 	lun_id = io->io_hdr.nexus.targ_mapped_lun;
 	mtx_lock(&sc->ctl_lock);
 	lun = (lun_id < ctl_max_luns) ? sc->ctl_luns[lun_id] : NULL;
 	if (lun == NULL) {
 		mtx_unlock(&sc->ctl_lock);
 		io->taskio.task_status = CTL_TASK_LUN_DOES_NOT_EXIST;
 		return (1);
 	}
 
 	/* Take the LUN lock before letting go of the softc lock. */
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&sc->ctl_lock);
 
 	from_peer = (io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) != 0;
 	if (io->taskio.task_action == CTL_TASK_ABORT_TASK_SET) {
 		port = io->io_hdr.nexus.targ_port;
 		initid = io->io_hdr.nexus.initid;
 	} else {
 		/* CTL_TASK_CLEAR_TASK_SET: match everything. */
 		port = UINT32_MAX;
 		initid = UINT32_MAX;
 	}
 	ctl_abort_tasks_lun(lun, port, initid, from_peer);
 
 	mtx_unlock(&lun->lun_lock);
 	io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE;
 	return (0);
 }
 
 /*
  * Clean up after the loss of one I_T nexus, identified by initidx, on
  * every LUN: abort that nexus' tasks, clear its pending sense, drop a
  * reservation held by it, clear its "prevent media removal" bit, clear
  * its TPC state and establish the ua_type unit attention for it.
  *
  * initidx is split into a port index (initidx / CTL_MAX_INIT_PER_PORT)
  * and an initiator index within that port (the remainder).
  */
 static void
 ctl_i_t_nexus_loss(struct ctl_softc *softc, uint32_t initidx,
     ctl_ua_type ua_type)
 {
 	struct ctl_lun *cur;
 	struct scsi_sense_data *sense;
 	uint32_t port_idx, init_off;
 
 	port_idx = initidx / CTL_MAX_INIT_PER_PORT;
 	init_off = initidx % CTL_MAX_INIT_PER_PORT;
 
 	mtx_lock(&softc->ctl_lock);
 	STAILQ_FOREACH(cur, &softc->lun_list, links) {
 		mtx_lock(&cur->lun_lock);
 
 		/* Abort tasks (no HA notification is sent from here). */
 		ctl_abort_tasks_lun(cur, port_idx, init_off, 1);
 
 		/* Clear CA. */
 		sense = cur->pending_sense[port_idx];
 		if (sense != NULL)
 			sense[init_off].error_code = 0;
 
 		/* Clear reservation, but only if this nexus holds it. */
 		if ((cur->flags & CTL_LUN_RESERVED) &&
 		    (cur->res_idx == initidx))
 			cur->flags &= ~CTL_LUN_RESERVED;
 
 		/* Clear prevent media removal. */
 		if (cur->prevent && ctl_is_set(cur->prevent, initidx)) {
 			ctl_clear_mask(cur->prevent, initidx);
 			cur->prevent_count--;
 		}
 
 		/* Clear TPC status. */
 		ctl_tpc_lun_clear(cur, initidx);
 
 		/* Establish UA. */
 		ctl_est_ua(cur, initidx, ua_type);
 
 		mtx_unlock(&cur->lun_lock);
 	}
 	mtx_unlock(&softc->ctl_lock);
 }
 
 static int
 ctl_i_t_nexus_reset(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	uint32_t initidx;
 
 	if (!(io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC)) {
 		union ctl_ha_msg msg_info;
 
 		msg_info.hdr.nexus = io->io_hdr.nexus;
 		msg_info.task.task_action = CTL_TASK_I_T_NEXUS_RESET;
 		msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS;
 		msg_info.hdr.original_sc = NULL;
 		msg_info.hdr.serializing_sc = NULL;
 		ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 		    sizeof(msg_info.task), M_WAITOK);
 	}
 
 	initidx = ctl_get_initindex(&io->io_hdr.nexus);
 	ctl_i_t_nexus_loss(softc, initidx, CTL_UA_I_T_NEXUS_LOSS);
 	io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE;
 	return (0);
 }
 
 /*
  * Task management: ABORT TASK.
  *
  * Flags as aborted the queued I/O(s) on the addressed LUN that come from
  * the same target port and initiator as the request and carry the same
  * tag number.  I/Os that are already flagged CTL_FLAG_ABORT are skipped.
  * For each I/O aborted, when the request did not come from the other
  * controller and this LUN is not flagged CTL_LUN_PRIMARY_SC, the abort is
  * also forwarded to the HA peer.
  *
  * Returns 1 with CTL_TASK_LUN_DOES_NOT_EXIST when the LUN cannot be
  * found, otherwise 0 with CTL_TASK_FUNCTION_COMPLETE (whether or not any
  * I/O matched).
  */
 static int
 ctl_abort_task(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	union ctl_io *xio;
 	struct ctl_lun *lun;
 	uint32_t targ_lun;
 
 	/*
 	 * Look up the LUN.
 	 */
 	targ_lun = io->io_hdr.nexus.targ_mapped_lun;
 	mtx_lock(&softc->ctl_lock);
 	if (targ_lun >= ctl_max_luns ||
 	    (lun = softc->ctl_luns[targ_lun]) == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 		io->taskio.task_status = CTL_TASK_LUN_DOES_NOT_EXIST;
 		return (1);
 	}
 
 	/* Take the LUN lock before dropping the softc lock. */
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&softc->ctl_lock);
 	/*
 	 * Run through the OOA queue and attempt to find the given I/O.
 	 * The target port, initiator ID and tag number have to match the
 	 * values that we got from the initiator.  The tag type is not
 	 * compared by the active code (see below), and the scan does not
 	 * stop at the first match.
 	 */
 	for (xio = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue); xio != NULL;
 	     xio = (union ctl_io *)TAILQ_NEXT(&xio->io_hdr, ooa_links)) {
 
 		/* Different nexus, or already aborted: not a candidate. */
 		if ((xio->io_hdr.nexus.targ_port != io->io_hdr.nexus.targ_port)
 		 || (xio->io_hdr.nexus.initid != io->io_hdr.nexus.initid)
 		 || (xio->io_hdr.flags & CTL_FLAG_ABORT))
 			continue;
 
 		/*
 		 * If the abort says that the task is untagged, the
 		 * task in the queue must be untagged.  Otherwise,
 		 * we just check to see whether the tag numbers
 		 * match.  This is because the QLogic firmware
 		 * doesn't pass back the tag type in an abort
 		 * request.
 		 *
 		 * Note that the opening brace of the match body lives
 		 * inside the #if/#else below; exactly one of the two
 		 * "if" lines is compiled.
 		 */
 #if 0
 		if (((xio->scsiio.tag_type == CTL_TAG_UNTAGGED)
 		  && (io->taskio.tag_type == CTL_TAG_UNTAGGED))
 		 || (xio->scsiio.tag_num == io->taskio.tag_num)) {
 #else
 		/*
 		 * XXX KDM we've got problems with FC, because it
 		 * doesn't send down a tag type with aborts.  So we
 		 * can only really go by the tag number...
 		 * This may cause problems with parallel SCSI.
 		 * Need to figure that out!!
 		 */
 		if (xio->scsiio.tag_num == io->taskio.tag_num) {
 #endif
 			xio->io_hdr.flags |= CTL_FLAG_ABORT;
 			/* Mirror the abort to the HA peer when required. */
 			if ((io->io_hdr.flags & CTL_FLAG_FROM_OTHER_SC) == 0 &&
 			    !(lun->flags & CTL_LUN_PRIMARY_SC)) {
 				union ctl_ha_msg msg_info;
 
 				msg_info.hdr.nexus = io->io_hdr.nexus;
 				msg_info.task.task_action = CTL_TASK_ABORT_TASK;
 				msg_info.task.tag_num = io->taskio.tag_num;
 				msg_info.task.tag_type = io->taskio.tag_type;
 				msg_info.hdr.msg_type = CTL_MSG_MANAGE_TASKS;
 				msg_info.hdr.original_sc = NULL;
 				msg_info.hdr.serializing_sc = NULL;
 				ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg_info,
 				    sizeof(msg_info.task), M_NOWAIT);
 			}
 			ctl_try_unblock_io(lun, xio, FALSE);
 		}
 	}
 	mtx_unlock(&lun->lun_lock);
 	io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE;
 	return (0);
 }
 
 /*
  * Task management: QUERY TASK (task_set == 0) and QUERY TASK SET
  * (task_set != 0).
  *
  * Scans the LUN's OOA queue for a not-yet-aborted I/O from the same
  * target port and initiator as the request.  With task_set set any such
  * I/O counts; otherwise its tag number must equal the requested one.
  *
  * Returns 1 with CTL_TASK_LUN_DOES_NOT_EXIST when the LUN cannot be
  * found.  Otherwise returns 0 with CTL_TASK_FUNCTION_SUCCEEDED when a
  * matching I/O exists and CTL_TASK_FUNCTION_COMPLETE when none does.
  */
 static int
 ctl_query_task(union ctl_io *io, int task_set)
 {
 	struct ctl_softc *sc = CTL_SOFTC(io);
 	struct ctl_lun *lun;
 	union ctl_io *cur;
 	uint32_t lun_id;
 	int hit;
 
 	lun_id = io->io_hdr.nexus.targ_mapped_lun;
 	mtx_lock(&sc->ctl_lock);
 	lun = (lun_id < ctl_max_luns) ? sc->ctl_luns[lun_id] : NULL;
 	if (lun == NULL) {
 		mtx_unlock(&sc->ctl_lock);
 		io->taskio.task_status = CTL_TASK_LUN_DOES_NOT_EXIST;
 		return (1);
 	}
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&sc->ctl_lock);
 
 	hit = 0;
 	cur = (union ctl_io *)TAILQ_FIRST(&lun->ooa_queue);
 	while (cur != NULL) {
 		if (cur->io_hdr.nexus.targ_port == io->io_hdr.nexus.targ_port &&
 		    cur->io_hdr.nexus.initid == io->io_hdr.nexus.initid &&
 		    (cur->io_hdr.flags & CTL_FLAG_ABORT) == 0 &&
 		    (task_set || cur->scsiio.tag_num == io->taskio.tag_num)) {
 			hit = 1;
 			break;
 		}
 		cur = (union ctl_io *)TAILQ_NEXT(&cur->io_hdr, ooa_links);
 	}
 	mtx_unlock(&lun->lun_lock);
 
 	io->taskio.task_status = hit ? CTL_TASK_FUNCTION_SUCCEEDED :
 	    CTL_TASK_FUNCTION_COMPLETE;
 	return (0);
 }
 
 /*
  * Task management: QUERY ASYNCHRONOUS EVENT.
  *
  * Lets ctl_build_qae() fill io->taskio.task_resp for the requesting
  * initiator while holding the LUN lock.
  *
  * Returns 1 with CTL_TASK_LUN_DOES_NOT_EXIST when the LUN cannot be
  * found.  Otherwise returns 0 with CTL_TASK_FUNCTION_SUCCEEDED when
  * ctl_build_qae() reported something other than CTL_UA_NONE, and
  * CTL_TASK_FUNCTION_COMPLETE when it reported CTL_UA_NONE.
  */
 static int
 ctl_query_async_event(union ctl_io *io)
 {
 	struct ctl_softc *sc = CTL_SOFTC(io);
 	struct ctl_lun *lun;
 	ctl_ua_type pending;
 	uint32_t lun_id;
 
 	lun_id = io->io_hdr.nexus.targ_mapped_lun;
 	mtx_lock(&sc->ctl_lock);
 	lun = (lun_id < ctl_max_luns) ? sc->ctl_luns[lun_id] : NULL;
 	if (lun == NULL) {
 		mtx_unlock(&sc->ctl_lock);
 		io->taskio.task_status = CTL_TASK_LUN_DOES_NOT_EXIST;
 		return (1);
 	}
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&sc->ctl_lock);
 
 	pending = ctl_build_qae(lun, ctl_get_initindex(&io->io_hdr.nexus),
 	    io->taskio.task_resp);
 	mtx_unlock(&lun->lun_lock);
 
 	if (pending == CTL_UA_NONE)
 		io->taskio.task_status = CTL_TASK_FUNCTION_COMPLETE;
 	else
 		io->taskio.task_status = CTL_TASK_FUNCTION_SUCCEEDED;
 	return (0);
 }
 
 /*
  * Dispatch a task management I/O to the handler for its task_action and
  * complete it.
  *
  * The task status starts out as CTL_TASK_FUNCTION_NOT_SUPPORTED and the
  * handler result as "failed", so actions without a handler here
  * (CLEAR ACA, PORT LOGIN, PORT LOGOUT and unknown values) complete with
  * CTL_ERROR.  A handler returning 0 yields CTL_SUCCESS; any other return
  * value yields CTL_ERROR.  The I/O is always passed to ctl_done().
  */
 static void
 ctl_run_task(union ctl_io *io)
 {
 	int retval = 1;
 
 	CTL_DEBUG_PRINT(("ctl_run_task\n"));
 	KASSERT(io->io_hdr.io_type == CTL_IO_TASK,
 	    ("ctl_run_task: Unexpected io_type %d\n", io->io_hdr.io_type));
 	io->taskio.task_status = CTL_TASK_FUNCTION_NOT_SUPPORTED;
 	/* Start from an empty response; handlers fill it in as needed. */
 	bzero(io->taskio.task_resp, sizeof(io->taskio.task_resp));
 	switch (io->taskio.task_action) {
 	case CTL_TASK_ABORT_TASK:
 		retval = ctl_abort_task(io);
 		break;
 	case CTL_TASK_ABORT_TASK_SET:
 	case CTL_TASK_CLEAR_TASK_SET:
 		retval = ctl_abort_task_set(io);
 		break;
 	case CTL_TASK_CLEAR_ACA:
 		/* Not implemented: reported as not supported. */
 		break;
 	case CTL_TASK_I_T_NEXUS_RESET:
 		retval = ctl_i_t_nexus_reset(io);
 		break;
 	case CTL_TASK_LUN_RESET:
 		retval = ctl_lun_reset(io);
 		break;
 	case CTL_TASK_TARGET_RESET:
 	case CTL_TASK_BUS_RESET:
 		retval = ctl_target_reset(io);
 		break;
 	case CTL_TASK_PORT_LOGIN:
 		/* Not implemented: reported as not supported. */
 		break;
 	case CTL_TASK_PORT_LOGOUT:
 		/* Not implemented: reported as not supported. */
 		break;
 	case CTL_TASK_QUERY_TASK:
 		retval = ctl_query_task(io, 0);
 		break;
 	case CTL_TASK_QUERY_TASK_SET:
 		retval = ctl_query_task(io, 1);
 		break;
 	case CTL_TASK_QUERY_ASYNC_EVENT:
 		retval = ctl_query_async_event(io);
 		break;
 	default:
 		printf("%s: got unknown task management event %d\n",
 		       __func__, io->taskio.task_action);
 		break;
 	}
 	if (retval == 0)
 		io->io_hdr.status = CTL_SUCCESS;
 	else
 		io->io_hdr.status = CTL_ERROR;
 	ctl_done(io);
 }
 
 /*
  * For HA operation.  Handle commands that come in from the other
  * controller.
  *
  * The action is selected by io->io_hdr.msg_type.  Depending on the
  * message the I/O is handed on to another handler, completed with
  * ctl_done(), or released with ctl_free_io(); see the individual cases.
  */
 static void
 ctl_handle_isc(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	struct ctl_lun *lun;
 	const struct ctl_cmd_entry *entry;
 	uint32_t targ_lun;
 
 	targ_lun = io->io_hdr.nexus.targ_mapped_lun;
 	switch (io->io_hdr.msg_type) {
 	case CTL_MSG_SERIALIZE:
 		ctl_serialize_other_sc_cmd(&io->scsiio);
 		break;
 	case CTL_MSG_R2R:		/* Only used in SER_ONLY mode. */
 		entry = ctl_get_cmd_entry(&io->scsiio, NULL);
 		/* The LUN is gone: just complete the I/O. */
 		if (targ_lun >= ctl_max_luns ||
 		    (lun = softc->ctl_luns[targ_lun]) == NULL) {
 			ctl_done(io);
 			break;
 		}
 		mtx_lock(&lun->lun_lock);
 		/* A failed LUN check completes the I/O instead of queueing. */
 		if (ctl_scsiio_lun_check(lun, entry, &io->scsiio) != 0) {
 			mtx_unlock(&lun->lun_lock);
 			ctl_done(io);
 			break;
 		}
 		io->io_hdr.flags |= CTL_FLAG_IS_WAS_ON_RTR;
 		mtx_unlock(&lun->lun_lock);
 		ctl_enqueue_rtr(io);
 		break;
 	case CTL_MSG_FINISH_IO:
 		if (softc->ha_mode == CTL_HA_MODE_XFER) {
 			ctl_done(io);
 			break;
 		}
 		if (targ_lun >= ctl_max_luns ||
 		    (lun = softc->ctl_luns[targ_lun]) == NULL) {
 			ctl_free_io(io);
 			break;
 		}
 		/*
 		 * Give I/Os blocked behind this one a chance to run, then
 		 * take it off the OOA queue and release it.
 		 */
 		mtx_lock(&lun->lun_lock);
 		ctl_try_unblock_others(lun, io, TRUE);
 		TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links);
 		mtx_unlock(&lun->lun_lock);
 		ctl_free_io(io);
 		break;
 	case CTL_MSG_PERS_ACTION:
 		ctl_hndl_per_res_out_on_other_sc(io);
 		ctl_free_io(io);
 		break;
 	case CTL_MSG_BAD_JUJU:
 		ctl_done(io);
 		break;
 	case CTL_MSG_DATAMOVE:		/* Only used in XFER mode */
 		ctl_datamove_remote(io);
 		break;
 	case CTL_MSG_DATAMOVE_DONE:	/* Only used in XFER mode */
 		io->scsiio.be_move_done(io);
 		break;
 	case CTL_MSG_FAILOVER:
 		ctl_failover_lun(io);
 		ctl_free_io(io);
 		break;
 	default:
 		/* Unknown message: log it and drop the I/O. */
 		printf("%s: Invalid message type %d\n",
 		       __func__, io->io_hdr.msg_type);
 		ctl_free_io(io);
 		break;
 	}
 
 }
 
 
 /*
  * Decide whether the command in ctsio matches the error injection
  * descriptor desc.
  *
  * Returns the match type in the case of a match, or CTL_LUN_PAT_NONE if
  * there is no match.
  */
 static ctl_lun_error_pattern
 ctl_cmd_pattern_match(struct ctl_scsiio *ctsio, struct ctl_error_desc *desc)
 {
 	const struct ctl_cmd_entry *cmd;
 	ctl_lun_error_pattern wanted, common;
 	uint64_t cmd_lba, cmd_len;
 
 	wanted = desc->error_pattern;
 
 	/*
 	 * XXX KDM we need more data passed into this function to match a
 	 * custom pattern, and we actually need to implement custom pattern
 	 * matching.
 	 */
 	if (wanted & CTL_LUN_PAT_CMD)
 		return (CTL_LUN_PAT_CMD);
 
 	if ((wanted & CTL_LUN_PAT_MASK) == CTL_LUN_PAT_ANY)
 		return (CTL_LUN_PAT_ANY);
 
 	cmd = ctl_get_cmd_entry(ctsio, NULL);
 	common = cmd->pattern & wanted;
 
 	/*
 	 * Any flag bits the user asked for (e.g. CTL_LUN_PAT_RANGE) must
 	 * all be supported by the command.  When the user asked for no
 	 * flags, whatever the command supports is irrelevant.
 	 */
 	if ((common & ~CTL_LUN_PAT_MASK) != (wanted & ~CTL_LUN_PAT_MASK))
 		return (CTL_LUN_PAT_NONE);
 
 	/* No range check requested: the pattern bits alone decide. */
 	if ((common & CTL_LUN_PAT_RANGE) == 0)
 		return (common);
 
 	/*
 	 * Range check: the command's LBA range must overlap the one in the
 	 * descriptor.
 	 */
 	if (ctl_get_lba_len((union ctl_io *)ctsio, &cmd_lba, &cmd_len) != 0)
 		return (CTL_LUN_PAT_NONE);
 
 	/*
 	 * A "pass" means that the LBA ranges don't overlap, so this
 	 * doesn't match the user's range criteria.
 	 */
 	if (ctl_extent_check_lba(cmd_lba, cmd_len, desc->lba_range.lba,
 	    desc->lba_range.len, FALSE) == CTL_ACTION_PASS)
 		return (CTL_LUN_PAT_NONE);
 
 	return (common);
 }
 
 static void
 ctl_inject_error(struct ctl_lun *lun, union ctl_io *io)
 {
 	struct ctl_error_desc *desc, *desc2;
 
 	mtx_assert(&lun->lun_lock, MA_OWNED);
 
 	STAILQ_FOREACH_SAFE(desc, &lun->error_list, links, desc2) {
 		ctl_lun_error_pattern pattern;
 		/*
 		 * Check to see whether this particular command matches
 		 * the pattern in the descriptor.
 		 */
 		pattern = ctl_cmd_pattern_match(&io->scsiio, desc);
 		if ((pattern & CTL_LUN_PAT_MASK) == CTL_LUN_PAT_NONE)
 			continue;
 
 		switch (desc->lun_error & CTL_LUN_INJ_TYPE) {
 		case CTL_LUN_INJ_ABORTED:
 			ctl_set_aborted(&io->scsiio);
 			break;
 		case CTL_LUN_INJ_MEDIUM_ERR:
 			ctl_set_medium_error(&io->scsiio,
 			    (io->io_hdr.flags & CTL_FLAG_DATA_MASK) !=
 			     CTL_FLAG_DATA_OUT);
 			break;
 		case CTL_LUN_INJ_UA:
 			/* 29h/00h  POWER ON, RESET, OR BUS DEVICE RESET
 			 * OCCURRED */
 			ctl_set_ua(&io->scsiio, 0x29, 0x00);
 			break;
 		case CTL_LUN_INJ_CUSTOM:
 			/*
 			 * We're assuming the user knows what he is doing.
 			 * Just copy the sense information without doing
 			 * checks.
 			 */
 			bcopy(&desc->custom_sense, &io->scsiio.sense_data,
 			      MIN(sizeof(desc->custom_sense),
 				  sizeof(io->scsiio.sense_data)));
 			io->scsiio.scsi_status = SCSI_STATUS_CHECK_COND;
 			io->scsiio.sense_len = SSD_FULL_SIZE;
 			io->io_hdr.status = CTL_SCSI_ERROR | CTL_AUTOSENSE;
 			break;
 		case CTL_LUN_INJ_NONE:
 		default:
 			/*
 			 * If this is an error injection type we don't know
 			 * about, clear the continuous flag (if it is set)
 			 * so it will get deleted below.
 			 */
 			desc->lun_error &= ~CTL_LUN_INJ_CONTINUOUS;
 			break;
 		}
 		/*
 		 * By default, each error injection action is a one-shot
 		 */
 		if (desc->lun_error & CTL_LUN_INJ_CONTINUOUS)
 			continue;
 
 		STAILQ_REMOVE(&lun->error_list, desc, ctl_error_desc, links);
 
 		free(desc, M_CTL);
 	}
 }
 
 #ifdef CTL_IO_DELAY
 /*
  * Callout handler for a delayed data move: arg is the I/O, which is
  * simply passed back into ctl_datamove().
  */
 static void
 ctl_datamove_timer_wakeup(void *arg)
 {
 
 	ctl_datamove((union ctl_io *)arg);
 }
 #endif /* CTL_IO_DELAY */
 
 /*
  * Start the data phase of an I/O by handing it to the frontend port's
  * fe_datamove method.
  *
  * Before doing so this function:
  *  - asserts that the softc's ctl_lock is not held by the caller;
  *  - resets kern_data_resid to the full kern_data_len;
  *  - with CTL_TIME_IO, logs I/Os that have been outstanding for more
  *    than ctl_time_io_secs seconds;
  *  - with CTL_IO_DELAY, may postpone the move by arming a callout that
  *    re-enters ctl_datamove() later;
  *  - short-circuits aborted and zero-length moves by calling the I/O's
  *    be_move_done callback directly instead of the frontend.
  */
 void
 ctl_datamove(union ctl_io *io)
 {
 	void (*fe_datamove)(union ctl_io *io);
 
 	mtx_assert(&((struct ctl_softc *)CTL_SOFTC(io))->ctl_lock, MA_NOTOWNED);
 
 	CTL_DEBUG_PRINT(("ctl_datamove\n"));
 
 	/* No data transferred yet.  Frontend must update this when done. */
 	io->scsiio.kern_data_resid = io->scsiio.kern_data_len;
 
 #ifdef CTL_TIME_IO
 	/* Report I/Os that have been in flight for too long. */
 	if ((time_uptime - io->io_hdr.start_time) > ctl_time_io_secs) {
 		char str[256];
 		char path_str[64];
 		struct sbuf sb;
 
 		ctl_scsi_path_string(io, path_str, sizeof(path_str));
 		sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN);
 
 		sbuf_cat(&sb, path_str);
 		switch (io->io_hdr.io_type) {
 		case CTL_IO_SCSI:
 			ctl_scsi_command_string(&io->scsiio, NULL, &sb);
 			sbuf_printf(&sb, "\n");
 			sbuf_cat(&sb, path_str);
 			sbuf_printf(&sb, "Tag: 0x%04x, type %d\n",
 				    io->scsiio.tag_num, io->scsiio.tag_type);
 			break;
 		case CTL_IO_TASK:
 			sbuf_printf(&sb, "Task I/O type: %d, Tag: 0x%04x, "
 				    "Tag Type: %d\n", io->taskio.task_action,
 				    io->taskio.tag_num, io->taskio.tag_type);
 			break;
 		default:
 			panic("%s: Invalid CTL I/O type %d\n",
 			    __func__, io->io_hdr.io_type);
 		}
 		sbuf_cat(&sb, path_str);
 		sbuf_printf(&sb, "ctl_datamove: %jd seconds\n",
 			    (intmax_t)time_uptime - io->io_hdr.start_time);
 		sbuf_finish(&sb);
 		printf("%s", sbuf_data(&sb));
 	}
 #endif /* CTL_TIME_IO */
 
 #ifdef CTL_IO_DELAY
 	/*
 	 * CTL_FLAG_DELAY_DONE marks an I/O that has already been through
 	 * the delay below; clear it and carry on with the move.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_DELAY_DONE) {
 		io->io_hdr.flags &= ~CTL_FLAG_DELAY_DONE;
 	} else {
 		struct ctl_lun *lun;
 
 		lun = CTL_LUN(io);
 		if ((lun != NULL)
 		 && (lun->delay_info.datamove_delay > 0)) {
 
 			/*
 			 * Arm a callout that calls back into
 			 * ctl_datamove() after datamove_delay seconds.
 			 */
 			callout_init(&io->io_hdr.delay_callout, /*mpsafe*/ 1);
 			io->io_hdr.flags |= CTL_FLAG_DELAY_DONE;
 			callout_reset(&io->io_hdr.delay_callout,
 				      lun->delay_info.datamove_delay * hz,
 				      ctl_datamove_timer_wakeup, io);
 			/* A one-shot delay is consumed by its first use. */
 			if (lun->delay_info.datamove_type ==
 			    CTL_DELAY_TYPE_ONESHOT)
 				lun->delay_info.datamove_delay = 0;
 			return;
 		}
 	}
 #endif
 
 	/*
 	 * This command has been aborted.  Set the port status, so we fail
 	 * the data move.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_ABORT) {
 		printf("ctl_datamove: tag 0x%04x on (%u:%u:%u) aborted\n",
 		       io->scsiio.tag_num, io->io_hdr.nexus.initid,
 		       io->io_hdr.nexus.targ_port,
 		       io->io_hdr.nexus.targ_lun);
 		/*
 		 * NOTE(review): 31337 looks like an arbitrary non-zero
 		 * marker value; confirm how consumers of port_status
 		 * interpret it.
 		 */
 		io->io_hdr.port_status = 31337;
 		/*
 		 * Note that the backend, in this case, will get the
 		 * callback in its context.  In other cases it may get
 		 * called in the frontend's interrupt thread context.
 		 */
 		io->scsiio.be_move_done(io);
 		return;
 	}
 
 	/* Don't confuse frontend with zero length data move. */
 	if (io->scsiio.kern_data_len == 0) {
 		io->scsiio.be_move_done(io);
 		return;
 	}
 
 	fe_datamove = CTL_PORT(io)->fe_datamove;
 	fe_datamove(io);
 }
 
 static void
 ctl_send_datamove_done(union ctl_io *io, int have_lock)
 {
 	union ctl_ha_msg msg;
 #ifdef CTL_TIME_IO
 	struct bintime cur_bt;
 #endif
 
 	memset(&msg, 0, sizeof(msg));
 	msg.hdr.msg_type = CTL_MSG_DATAMOVE_DONE;
 	msg.hdr.original_sc = io;
 	msg.hdr.serializing_sc = io->io_hdr.remote_io;
 	msg.hdr.nexus = io->io_hdr.nexus;
 	msg.hdr.status = io->io_hdr.status;
 	msg.scsi.kern_data_resid = io->scsiio.kern_data_resid;
 	msg.scsi.tag_num = io->scsiio.tag_num;
 	msg.scsi.tag_type = io->scsiio.tag_type;
 	msg.scsi.scsi_status = io->scsiio.scsi_status;
 	memcpy(&msg.scsi.sense_data, &io->scsiio.sense_data,
 	       io->scsiio.sense_len);
 	msg.scsi.sense_len = io->scsiio.sense_len;
 	msg.scsi.port_status = io->io_hdr.port_status;
 	io->io_hdr.flags &= ~CTL_FLAG_IO_ACTIVE;
 	if (io->io_hdr.flags & CTL_FLAG_FAILOVER) {
 		ctl_failover_io(io, /*have_lock*/ have_lock);
 		return;
 	}
 	ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg,
 	    sizeof(msg.scsi) - sizeof(msg.scsi.sense_data) +
 	    msg.scsi.sense_len, M_WAITOK);
 
 #ifdef CTL_TIME_IO
 	getbinuptime(&cur_bt);
 	bintime_sub(&cur_bt, &io->io_hdr.dma_start_bt);
 	bintime_add(&io->io_hdr.dma_bt, &cur_bt);
 #endif
 	io->io_hdr.num_dmas++;
 }
 
 /*
  * The DMA to the remote side is done, now we need to tell the other side
  * we're done so it can continue with its data movement.
  */
 static void
 ctl_datamove_remote_write_cb(struct ctl_ha_dt_req *rq)
 {
 	union ctl_io *io;
 	uint32_t i;
 
 	io = rq->context;
 
 	if (rq->ret != CTL_HA_STATUS_SUCCESS) {
 		printf("%s: ISC DMA write failed with error %d", __func__,
 		       rq->ret);
 		ctl_set_internal_failure(&io->scsiio,
 					 /*sks_valid*/ 1,
 					 /*retry_count*/ rq->ret);
 	}
 
 	ctl_dt_req_free(rq);
 
 	for (i = 0; i < io->scsiio.kern_sg_entries; i++)
 		free(CTL_LSGLT(io)[i].addr, M_CTL);
 	free(CTL_RSGL(io), M_CTL);
 	CTL_RSGL(io) = NULL;
 	CTL_LSGL(io) = NULL;
 
 	/*
 	 * The data is in local and remote memory, so now we need to send
 	 * status (good or back) back to the other side.
 	 */
 	ctl_send_datamove_done(io, /*have_lock*/ 0);
 }
 
 /*
  * We've moved the data from the host/controller into local memory.  Now we
  * need to push it over to the remote controller's memory.
  *
  * Returns the result of ctl_datamove_remote_xfer().
  *
  * NOTE(review): on a non-zero result the local S/G buffers are not freed
  * here, whereas ctl_datamove_remote_read() frees them in that case;
  * confirm whether the write path leaks them.
  */
 static int
 ctl_datamove_remote_dm_write_cb(union ctl_io *io)
 {
 
 	return (ctl_datamove_remote_xfer(io, CTL_HA_DT_CMD_WRITE,
 	    ctl_datamove_remote_write_cb));
 }
 
 /*
  * Service a write-direction datamove request from the other controller:
  *
  * - Get the data from the host/HBA into local memory.
  * - DMA memory from the local controller to the remote controller.
  * - Send status back to the remote controller.
  *
  * Only the first step is started here; the rest is driven by the custom
  * be_move_done callback installed below.
  */
 static void
 ctl_datamove_remote_write(union ctl_io *io)
 {
 
 	if (ctl_datamove_remote_sgl_setup(io) != 0)
 		return;
 
 	/* Switch the pointer over so the FETD knows what to do */
 	io->scsiio.kern_data_ptr = (uint8_t *)CTL_LSGL(io);
 
 	/*
 	 * Completion has to go back to the other controller rather than to
 	 * a backend on this side, hence the custom move done callback.
 	 */
 	io->scsiio.be_move_done = ctl_datamove_remote_dm_write_cb;
 
 	CTL_PORT(io)->fe_datamove(io);
 }
 
 /*
  * Move-done callback for a remote read: the frontend has finished with
  * the local buffers, so release them together with the S/G list
  * allocation and report the result to the other controller.
  * Always returns 0.
  */
 static int
 ctl_datamove_remote_dm_read_cb(union ctl_io *io)
 {
 	uint32_t seg;
 
 	seg = 0;
 	while (seg < io->scsiio.kern_sg_entries) {
 		free(CTL_LSGLT(io)[seg].addr, M_CTL);
 		seg++;
 	}
 	free(CTL_RSGL(io), M_CTL);
 	CTL_RSGL(io) = NULL;
 	CTL_LSGL(io) = NULL;
 
 	/*
 	 * The read is done, now we need to send status (good or bad) back
 	 * to the other side.
 	 */
 	ctl_send_datamove_done(io, /*have_lock*/ 0);
 
 	return (0);
 }
 
 /*
  * Completion callback for the DMA that pulled read data from the remote
  * controller into local memory.  rq->context carries the I/O.  A failed
  * transfer is logged and recorded on the I/O as an internal failure; in
  * either case the request is freed and the I/O is passed to the frontend
  * port's fe_datamove method with a custom move-done callback.
  */
 static void
 ctl_datamove_remote_read_cb(struct ctl_ha_dt_req *rq)
 {
 	union ctl_io *io = rq->context;
 
 	if (rq->ret != CTL_HA_STATUS_SUCCESS) {
 		printf("%s: ISC DMA read failed with error %d\n", __func__,
 		       rq->ret);
 		ctl_set_internal_failure(&io->scsiio,
 		    /*sks_valid*/ 1,
 		    /*retry_count*/ rq->ret);
 	}
 
 	ctl_dt_req_free(rq);
 
 	/* Switch the pointer over so the FETD knows what to do */
 	io->scsiio.kern_data_ptr = (uint8_t *)CTL_LSGL(io);
 
 	/*
 	 * Completion has to go back to the other controller rather than to
 	 * a backend on this side, hence the custom move done callback.
 	 */
 	io->scsiio.be_move_done = ctl_datamove_remote_dm_read_cb;
 
 	/* XXX KDM add checks like the ones in ctl_datamove? */
 
 	CTL_PORT(io)->fe_datamove(io);
 }
 
 /*
  * Allocate local buffers for a remote data move and describe them in the
  * I/O's local S/G list.
  *
  * kern_data_len bytes are split into segments of at most
  * CTL_HA_DATAMOVE_SEGMENT bytes, each backed by its own M_WAITOK
  * allocation.  kern_sg_entries is overwritten with the number of local
  * segments created.  Always returns 0.
  */
 static int
 ctl_datamove_remote_sgl_setup(union ctl_io *io)
 {
 	struct ctl_sg_entry *sgl, *seg;
 	uint32_t remaining;
 	int nsegs;
 
 	sgl = CTL_LSGL(io);
 	remaining = io->scsiio.kern_data_len;
 
 	/*
 	 * The difficult thing here is that the size of the various
 	 * S/G segments may be different than the size from the
 	 * remote controller.  That'll make it harder when DMAing
 	 * the data back to the other side.
 	 */
 	nsegs = 0;
 	while (remaining > 0) {
 		seg = &sgl[nsegs];
 		seg->len = MIN(remaining, CTL_HA_DATAMOVE_SEGMENT);
 		seg->addr = malloc(seg->len, M_CTL, M_WAITOK);
 		remaining -= seg->len;
 		nsegs++;
 	}
 
 	/*
 	 * Reset the number of S/G entries accordingly.  The original
 	 * number of S/G entries is available in rem_sg_entries.
 	 */
 	io->scsiio.kern_sg_entries = nsegs;
 
 	return (0);
 }
 
 /*
  * Transfer the I/O's data between the local and the remote S/G lists
  * over the HA data channel.
  *
  * command is the transfer direction passed through to each request
  * (callers in this file use CTL_HA_DT_CMD_READ and CTL_HA_DT_CMD_WRITE).
  * callback is attached to the request of the final chunk only, and is
  * also invoked directly from here when the last ctl_dt_single() call
  * returned anything other than CTL_HA_STATUS_WAIT.
  *
  * Returns 1, after sending the datamove-done message to the other
  * controller, when the I/O already carries a failure status or when the
  * transfer request could not be allocated.  Returns 0 otherwise.
  */
 static int
 ctl_datamove_remote_xfer(union ctl_io *io, unsigned command,
 			 ctl_ha_dt_cb callback)
 {
 	struct ctl_ha_dt_req *rq;
 	struct ctl_sg_entry *remote_sglist, *local_sglist;
 	uint32_t local_used, remote_used, total_used;
 	int i, j, isc_ret;
 
 	rq = ctl_dt_req_alloc();
 
 	/*
 	 * If we failed to allocate the request, and if the DMA didn't fail
 	 * anyway, set busy status.  This is just a resource allocation
 	 * failure.  ctl_set_busy() is expected to leave a failure status
 	 * on the I/O, so the check below then takes the bail-out path
 	 * instead of falling through to the loop with rq == NULL.
 	 */
 	if ((rq == NULL)
 	 && ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_STATUS_NONE ||
 	     (io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS))
 		ctl_set_busy(&io->scsiio);
 
 	if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_STATUS_NONE &&
 	    (io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS) {
 
 		if (rq != NULL)
 			ctl_dt_req_free(rq);
 
 		/*
 		 * The data move failed.  We need to return status back
 		 * to the other controller.  No point in trying to DMA
 		 * data to the remote controller.
 		 */
 
 		ctl_send_datamove_done(io, /*have_lock*/ 0);
 
 		return (1);
 	}
 
 	local_sglist = CTL_LSGL(io);
 	remote_sglist = CTL_RSGL(io);
 	local_used = 0;
 	remote_used = 0;
 	total_used = 0;
 
 	/*
 	 * Pull/push the data over the wire from/to the other controller.
 	 * This takes into account the possibility that the local and
 	 * remote sglists may not be identical in terms of the size of
 	 * the elements and the number of elements.
 	 *
 	 * One fundamental assumption here is that the length allocated for
 	 * both the local and remote sglists is identical.  Otherwise, we've
 	 * essentially got a coding error of some sort.
 	 */
 	isc_ret = CTL_HA_STATUS_SUCCESS;
 	for (i = 0, j = 0; total_used < io->scsiio.kern_data_len; ) {
 		uint32_t cur_len;
 		uint8_t *tmp_ptr;
 
 		rq->command = command;
 		rq->context = io;
 
 		/*
 		 * Both pointers should be aligned.  But it is possible
 		 * that the allocation length is not.  They should both
 		 * also have enough slack left over at the end, though,
 		 * to round up to the next 8 byte boundary.
 		 *
 		 * Each chunk is as large as what is left in whichever of
 		 * the current local and remote segments is smaller.
 		 */
 		cur_len = MIN(local_sglist[i].len - local_used,
 			      remote_sglist[j].len - remote_used);
 		rq->size = cur_len;
 
 		tmp_ptr = (uint8_t *)local_sglist[i].addr;
 		tmp_ptr += local_used;
 
 #if 0
 		/* Use physical addresses when talking to ISC hardware */
 		if ((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0) {
 			/* XXX KDM use busdma */
 			rq->local = vtophys(tmp_ptr);
 		} else
 			rq->local = tmp_ptr;
 #else
 		KASSERT((io->io_hdr.flags & CTL_FLAG_BUS_ADDR) == 0,
 		    ("HA does not support BUS_ADDR"));
 		rq->local = tmp_ptr;
 #endif
 
 		tmp_ptr = (uint8_t *)remote_sglist[j].addr;
 		tmp_ptr += remote_used;
 		rq->remote = tmp_ptr;
 
 		rq->callback = NULL;
 
 		/* Advance to the next segment once the current one is used. */
 		local_used += cur_len;
 		if (local_used >= local_sglist[i].len) {
 			i++;
 			local_used = 0;
 		}
 
 		remote_used += cur_len;
 		if (remote_used >= remote_sglist[j].len) {
 			j++;
 			remote_used = 0;
 		}
 		total_used += cur_len;
 
 		/* Only the request for the final chunk carries the callback. */
 		if (total_used >= io->scsiio.kern_data_len)
 			rq->callback = callback;
 
 		isc_ret = ctl_dt_single(rq);
 		if (isc_ret > CTL_HA_STATUS_SUCCESS)
 			break;
 	}
 	/*
 	 * Anything but "wait" means no completion will arrive later, so
 	 * run the callback from here with the final status.
 	 */
 	if (isc_ret != CTL_HA_STATUS_WAIT) {
 		rq->ret = isc_ret;
 		callback(rq);
 	}
 
 	return (0);
 }
 
 static void
 ctl_datamove_remote_read(union ctl_io *io)
 {
 	uint32_t entry;
 
 	/*
 	 * Build the S/G lists.  On failure the setup routine has already
 	 * sent an error to the other controller, so there is nothing more
 	 * to do here.
 	 */
 	if (ctl_datamove_remote_sgl_setup(io) != 0)
 		return;
 
 	/* Start the read from the peer; done if the transfer was accepted. */
 	if (ctl_datamove_remote_xfer(io, CTL_HA_DT_CMD_READ,
 	    ctl_datamove_remote_read_cb) == 0)
 		return;
 
 	/*
 	 * The transfer could not be started.  ctl_datamove_remote_xfer()
 	 * sends the datamove done message or invokes the callback with an
 	 * error in that case, so all that is left for us is to release the
 	 * local buffers and the S/G list allocation.
 	 */
 	entry = 0;
 	while (entry < io->scsiio.kern_sg_entries) {
 		free(CTL_LSGLT(io)[entry].addr, M_CTL);
 		entry++;
 	}
 	free(CTL_RSGL(io), M_CTL);
 	CTL_RSGL(io) = NULL;
 	CTL_LSGL(io) = NULL;
 }
 
 /*
  * Handle a datamove request that came from the other controller.  This
  * path is used in XFER mode only, never in SER_ONLY mode.  Writes are
  * DMAed into local memory first and, once that completes, on into the
  * remote controller's memory.  Reads are DMAed from the remote
  * controller's memory into ours and then moved out to the FETD.
  */
 static void
 ctl_datamove_remote(union ctl_io *io)
 {
 
 	mtx_assert(&((struct ctl_softc *)CTL_SOFTC(io))->ctl_lock, MA_NOTOWNED);
 
 	/* An I/O caught by a failover is handed to the failover code. */
 	if ((io->io_hdr.flags & CTL_FLAG_FAILOVER) != 0) {
 		ctl_failover_io(io, /*have_lock*/ 0);
 		return;
 	}
 
 	/*
 	 * Of the checks ctl_datamove() normally performs, only the abort
 	 * check is repeated here.  The datamove delay code is skipped
 	 * because it should already have run, if needed, on the other
 	 * controller.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_ABORT) != 0) {
 		printf("%s: tag 0x%04x on (%u:%u:%u) aborted\n", __func__,
 		       io->scsiio.tag_num, io->io_hdr.nexus.initid,
 		       io->io_hdr.nexus.targ_port,
 		       io->io_hdr.nexus.targ_lun);
 		io->io_hdr.port_status = 31338;
 		ctl_send_datamove_done(io, /*have_lock*/ 0);
 		return;
 	}
 
 	/* Dispatch on the data direction recorded in the I/O flags. */
 	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_OUT) {
 		ctl_datamove_remote_write(io);
 		return;
 	}
 	if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) == CTL_FLAG_DATA_IN) {
 		ctl_datamove_remote_read(io);
 		return;
 	}
 
 	/* Neither data-in nor data-out: complete the datamove with an error. */
 	io->io_hdr.port_status = 31339;
 	ctl_send_datamove_done(io, /*have_lock*/ 0);
 }
 
 /*
  * Final completion processing for an I/O taken off a worker's done queue.
  *
  * Task I/Os are passed straight to the front end's fe_done callback.  For
  * SCSI I/Os with a LUN this, under the LUN lock, optionally attaches
  * informational exception sense data, runs error injection, updates the
  * LUN and port statistics, unblocks I/Os waiting on this one, removes the
  * I/O from the OOA queue and frees the LUN if it was invalidated and is
  * now idle.  Afterwards (also for SCSI I/Os without a LUN) the aborted
  * status is applied, errors are optionally printed, a FINISH_IO message is
  * sent to the other controller when required, and fe_done is called.
  */
 static void
 ctl_process_done(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	struct ctl_port *port = CTL_PORT(io);
 	struct ctl_lun *lun = CTL_LUN(io);
 	void (*fe_done)(union ctl_io *io);
 	union ctl_ha_msg msg;
 
 	CTL_DEBUG_PRINT(("ctl_process_done\n"));
 	/* Fetched once; used by both the task and the SCSI exit paths. */
 	fe_done = port->fe_done;
 
 #ifdef CTL_TIME_IO
 	/* Log I/Os that took longer than ctl_time_io_secs to complete. */
 	if ((time_uptime - io->io_hdr.start_time) > ctl_time_io_secs) {
 		char str[256];
 		char path_str[64];
 		struct sbuf sb;
 
 		ctl_scsi_path_string(io, path_str, sizeof(path_str));
 		sbuf_new(&sb, str, sizeof(str), SBUF_FIXEDLEN);
 
 		sbuf_cat(&sb, path_str);
 		switch (io->io_hdr.io_type) {
 		case CTL_IO_SCSI:
 			ctl_scsi_command_string(&io->scsiio, NULL, &sb);
 			sbuf_printf(&sb, "\n");
 			sbuf_cat(&sb, path_str);
 			sbuf_printf(&sb, "Tag: 0x%04x, type %d\n",
 				    io->scsiio.tag_num, io->scsiio.tag_type);
 			break;
 		case CTL_IO_TASK:
 			sbuf_printf(&sb, "Task I/O type: %d, Tag: 0x%04x, "
 				    "Tag Type: %d\n", io->taskio.task_action,
 				    io->taskio.tag_num, io->taskio.tag_type);
 			break;
 		default:
 			panic("%s: Invalid CTL I/O type %d\n",
 			    __func__, io->io_hdr.io_type);
 		}
 		sbuf_cat(&sb, path_str);
 		sbuf_printf(&sb, "ctl_process_done: %jd seconds\n",
 			    (intmax_t)time_uptime - io->io_hdr.start_time);
 		sbuf_finish(&sb);
 		printf("%s", sbuf_data(&sb));
 	}
 #endif /* CTL_TIME_IO */
 
 	/*
 	 * Task I/Os finish here: no LUN accounting, just the front end
 	 * callback.  Only SCSI I/Os continue past this switch.
 	 */
 	switch (io->io_hdr.io_type) {
 	case CTL_IO_SCSI:
 		break;
 	case CTL_IO_TASK:
 		if (ctl_debug & CTL_DEBUG_INFO)
 			ctl_io_error_print(io, NULL);
 		fe_done(io);
 		return;
 	default:
 		panic("%s: Invalid CTL I/O type %d\n",
 		    __func__, io->io_hdr.io_type);
 	}
 
 	/* Without a LUN all per-LUN work below is skipped. */
 	if (lun == NULL) {
 		CTL_DEBUG_PRINT(("NULL LUN for lun %d\n",
 				 io->io_hdr.nexus.targ_mapped_lun));
 		goto bailout;
 	}
 
 	mtx_lock(&lun->lun_lock);
 
 	/*
 	 * Check to see if we have any informational exception and status
 	 * of this command can be modified to report it in form of either
 	 * RECOVERED ERROR or NO SENSE, depending on MRIE mode page field.
 	 */
 	if (lun->ie_reported == 0 && lun->ie_asc != 0 &&
 	    io->io_hdr.status == CTL_SUCCESS &&
 	    (io->io_hdr.flags & CTL_FLAG_STATUS_SENT) == 0) {
 		uint8_t mrie = lun->MODE_IE.mrie;
 		/* Nonzero if the PER bit is set in either the RWER or VER page. */
 		uint8_t per = ((lun->MODE_RWER.byte3 & SMS_RWER_PER) ||
 		    (lun->MODE_VER.byte3 & SMS_VER_PER));
 		if (((mrie == SIEP_MRIE_REC_COND && per) ||
 		     mrie == SIEP_MRIE_REC_UNCOND ||
 		     mrie == SIEP_MRIE_NO_SENSE) &&
 		    (ctl_get_cmd_entry(&io->scsiio, NULL)->flags &
 		     CTL_CMD_FLAG_NO_SENSE) == 0) {
 			ctl_set_sense(&io->scsiio,
 			      /*current_error*/ 1,
 			      /*sense_key*/ (mrie == SIEP_MRIE_NO_SENSE) ?
 			        SSD_KEY_NO_SENSE : SSD_KEY_RECOVERED_ERROR,
 			      /*asc*/ lun->ie_asc,
 			      /*ascq*/ lun->ie_ascq,
 			      SSD_ELEM_NONE);
 			lun->ie_reported = 1;
 		}
 	} else if (lun->ie_reported < 0)
 		/* A negative ie_reported is reset to 0 here. */
 		lun->ie_reported = 0;
 
 	/*
 	 * Check to see if we have any errors to inject here.  We only
 	 * inject errors for commands that don't already have errors set.
 	 */
 	if (!STAILQ_EMPTY(&lun->error_list) &&
 	    ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS) &&
 	    ((io->io_hdr.flags & CTL_FLAG_STATUS_SENT) == 0))
 		ctl_inject_error(lun, io);
 
 	/*
 	 * XXX KDM how do we treat commands that aren't completed
 	 * successfully?
 	 *
 	 * XXX KDM should we also track I/O latency?
 	 */
 	if ((io->io_hdr.status & CTL_STATUS_MASK) == CTL_SUCCESS &&
 	    io->io_hdr.io_type == CTL_IO_SCSI) {
 		int type;
 #ifdef CTL_TIME_IO
 		struct bintime bt;
 
 		/* bt = time elapsed since the I/O's start_bt timestamp. */
 		getbinuptime(&bt);
 		bintime_sub(&bt, &io->io_hdr.start_bt);
 #endif
 		/* Pick the statistics bucket from the data direction. */
 		if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
 		    CTL_FLAG_DATA_IN)
 			type = CTL_STATS_READ;
 		else if ((io->io_hdr.flags & CTL_FLAG_DATA_MASK) ==
 		    CTL_FLAG_DATA_OUT)
 			type = CTL_STATS_WRITE;
 		else
 			type = CTL_STATS_NO_IO;
 
 		/* LUN statistics are updated under lun_lock (held here). */
 		lun->stats.bytes[type] += io->scsiio.kern_total_len;
 		lun->stats.operations[type] ++;
 		lun->stats.dmas[type] += io->io_hdr.num_dmas;
 #ifdef CTL_TIME_IO
 		bintime_add(&lun->stats.dma_time[type], &io->io_hdr.dma_bt);
 		bintime_add(&lun->stats.time[type], &bt);
 #endif
 
 		/* Port statistics take port_lock, nested inside lun_lock. */
 		mtx_lock(&port->port_lock);
 		port->stats.bytes[type] += io->scsiio.kern_total_len;
 		port->stats.operations[type] ++;
 		port->stats.dmas[type] += io->io_hdr.num_dmas;
 #ifdef CTL_TIME_IO
 		bintime_add(&port->stats.dma_time[type], &io->io_hdr.dma_bt);
 		bintime_add(&port->stats.time[type], &bt);
 #endif
 		mtx_unlock(&port->port_lock);
 	}
 
 	/*
 	 * Run through the blocked queue of this I/O and see if anything
 	 * can be unblocked, now that this I/O is done and will be removed.
 	 * We need to do it before removal to have OOA position to start.
 	 */
 	ctl_try_unblock_others(lun, io, TRUE);
 
 	/*
 	 * Remove this from the OOA queue.
 	 */
 	TAILQ_REMOVE(&lun->ooa_queue, &io->io_hdr, ooa_links);
 #ifdef CTL_TIME_IO
 	if (TAILQ_EMPTY(&lun->ooa_queue))
 		lun->last_busy = getsbinuptime();
 #endif
 
 	/*
 	 * If the LUN has been invalidated, free it if there is nothing
 	 * left on its OOA queue.
 	 */
 	if ((lun->flags & CTL_LUN_INVALID)
 	 && TAILQ_EMPTY(&lun->ooa_queue)) {
 		/* lun_lock is released before the LUN is freed. */
 		mtx_unlock(&lun->lun_lock);
 		ctl_free_lun(lun);
 	} else
 		mtx_unlock(&lun->lun_lock);
 
 bailout:
 
 	/*
 	 * If this command has been aborted, make sure we set the status
 	 * properly.  The FETD is responsible for freeing the I/O and doing
 	 * whatever it needs to do to clean up its state.
 	 */
 	if (io->io_hdr.flags & CTL_FLAG_ABORT)
 		ctl_set_task_aborted(&io->scsiio);
 
 	/*
 	 * If enabled, print command error status.
 	 */
 	if ((io->io_hdr.status & CTL_STATUS_MASK) != CTL_SUCCESS &&
 	    (ctl_debug & CTL_DEBUG_INFO) != 0)
 		ctl_io_error_print(io, NULL);
 
 	/*
 	 * Tell the FETD or the other shelf controller we're done with this
 	 * command.  Note that only SCSI commands get to this point.  Task
 	 * management commands are completed above.
 	 */
 	if ((softc->ha_mode != CTL_HA_MODE_XFER) &&
 	    (io->io_hdr.flags & CTL_FLAG_SENT_2OTHER_SC)) {
 		memset(&msg, 0, sizeof(msg));
 		msg.hdr.msg_type = CTL_MSG_FINISH_IO;
 		msg.hdr.serializing_sc = io->io_hdr.remote_io;
 		msg.hdr.nexus = io->io_hdr.nexus;
 		/* The message is sent without the trailing sense_data. */
 		ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg,
 		    sizeof(msg.scsi) - sizeof(msg.scsi.sense_data),
 		    M_WAITOK);
 	}
 
 	fe_done(io);
 }
 
 /*
  * Front end should call this if it doesn't do autosense.  When the request
  * sense comes back in from the initiator, we'll dequeue this and send it.
  *
  * The sense data carried by the I/O is stored in the pending_sense slot of
  * the mapped LUN that belongs to the I/O's initiator.  If the LUN does not
  * exist, or the per-port slot array cannot be allocated, the sense data is
  * dropped.  In every case the I/O is freed and CTL_RETVAL_COMPLETE is
  * returned.
  */
 int
 ctl_queue_sense(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	struct ctl_port *port = CTL_PORT(io);
 	struct ctl_lun *lun;
 	struct scsi_sense_data *ps;
 	uint32_t initidx, p, targ_lun;
 
 	CTL_DEBUG_PRINT(("ctl_queue_sense\n"));
 
 	/* Translate the front end's LUN number through the port's LUN map. */
 	targ_lun = ctl_lun_map_from_port(port, io->io_hdr.nexus.targ_lun);
 
 	/*
 	 * LUN lookup will likely move to the ctl_work_thread() once we
 	 * have our new queueing infrastructure (that doesn't put things on
 	 * a per-LUN queue initially).  That is so that we can handle
 	 * things like an INQUIRY to a LUN that we don't have enabled.  We
 	 * can't deal with that right now.
 	 * If we don't have a LUN for this, just toss the sense information.
 	 */
 	mtx_lock(&softc->ctl_lock);
 	if (targ_lun >= ctl_max_luns ||
 	    (lun = softc->ctl_luns[targ_lun]) == NULL) {
 		mtx_unlock(&softc->ctl_lock);
 		goto bailout;
 	}
 	/* Take the LUN lock before dropping ctl_lock (hand-over-hand). */
 	mtx_lock(&lun->lun_lock);
 	mtx_unlock(&softc->ctl_lock);
 
 	/*
 	 * pending_sense[] holds one lazily allocated array per port, each
 	 * with CTL_MAX_INIT_PER_PORT entries.  The allocation is M_NOWAIT,
 	 * so it may fail; in that case the sense data is simply not saved.
 	 */
 	initidx = ctl_get_initindex(&io->io_hdr.nexus);
 	p = initidx / CTL_MAX_INIT_PER_PORT;
 	if (lun->pending_sense[p] == NULL) {
 		lun->pending_sense[p] = malloc(sizeof(*ps) * CTL_MAX_INIT_PER_PORT,
 		    M_CTL, M_NOWAIT | M_ZERO);
 	}
 	if ((ps = lun->pending_sense[p]) != NULL) {
 		ps += initidx % CTL_MAX_INIT_PER_PORT;
 		memset(ps, 0, sizeof(*ps));
 		/*
 		 * sense_len comes from the front end; never copy more than
 		 * one sense structure, whatever length was supplied.
 		 */
 		memcpy(ps, &io->scsiio.sense_data,
 		    MIN(io->scsiio.sense_len, sizeof(*ps)));
 	}
 	mtx_unlock(&lun->lun_lock);
 
 bailout:
 	ctl_free_io(io);
 	return (CTL_RETVAL_COMPLETE);
 }
 
 /*
  * Primary command inlet from frontend ports.  Every SCSI and task I/O
  * request has to enter CTL through this function.
  *
  * Returns CTL_RETVAL_COMPLETE once the I/O has been placed on a worker's
  * incoming queue, or EINVAL if the I/O type is neither SCSI nor task.
  */
 int
 ctl_queue(union ctl_io *io)
 {
 	struct ctl_port *fe_port = CTL_PORT(io);
 
 	CTL_DEBUG_PRINT(("ctl_queue cdb[0]=%02X\n", io->scsiio.cdb[0]));
 
 #ifdef CTL_TIME_IO
 	/* Record when the I/O entered CTL. */
 	io->io_hdr.start_time = time_uptime;
 	getbinuptime(&io->io_hdr.start_bt);
 #endif /* CTL_TIME_IO */
 
 	/* Translate the frontend-specific LUN ID into the global one. */
 	io->io_hdr.nexus.targ_mapped_lun =
 	    ctl_lun_map_from_port(fe_port, io->io_hdr.nexus.targ_lun);
 
 	/* Anything other than a SCSI or task I/O is rejected. */
 	if (io->io_hdr.io_type != CTL_IO_SCSI &&
 	    io->io_hdr.io_type != CTL_IO_TASK) {
 		printf("ctl_queue: unknown I/O type %d\n", io->io_hdr.io_type);
 		return (EINVAL);
 	}
 
 	if ((ctl_debug & CTL_DEBUG_CDB) != 0)
 		ctl_io_print(io);
 	ctl_enqueue_incoming(io);
 
 	return (CTL_RETVAL_COMPLETE);
 }
 
 #ifdef CTL_IO_DELAY
 static void
 ctl_done_timer_wakeup(void *arg)
 {
 
 	/*
 	 * Callout handler armed by ctl_done(): the argument is the delayed
 	 * I/O, which is now passed to ctl_done() again.
 	 */
 	ctl_done((union ctl_io *)arg);
 }
 #endif /* CTL_IO_DELAY */
 
 /*
  * Mark an I/O as done with respect to sequential serialization and give
  * I/Os blocked behind it a chance to run.  Does nothing when the LUN has
  * no backend LUN or its serseq setting is CTL_LUN_SERSEQ_OFF.
  */
 void
 ctl_serseq_done(union ctl_io *io)
 {
 	struct ctl_lun *lun = CTL_LUN(io);
 
 	if (lun->be_lun == NULL ||
 	    lun->be_lun->serseq == CTL_LUN_SERSEQ_OFF)
 		return;
 	/* The flag update and the unblock pass both run under lun_lock. */
 	mtx_lock(&lun->lun_lock);
 	io->io_hdr.flags |= CTL_FLAG_SERSEQ_DONE;
 	ctl_try_unblock_others(lun, io, FALSE);
 	mtx_unlock(&lun->lun_lock);
 }
 
 /*
  * Report completion of an I/O to CTL.  Internal copies are ignored; all
  * other I/Os are put on a worker's done queue, optionally after a
  * configured completion delay when CTL_IO_DELAY is compiled in.
  */
 void
 ctl_done(union ctl_io *io)
 {
 
 	/*
 	 * Enable this to catch duplicate completion issues.
 	 */
 #if 0
 	if (io->io_hdr.flags & CTL_FLAG_ALREADY_DONE) {
 		printf("%s: type %d msg %d cdb %x iptl: "
 		       "%u:%u:%u tag 0x%04x "
 		       "flag %#x status %x\n",
 			__func__,
 			io->io_hdr.io_type,
 			io->io_hdr.msg_type,
 			io->scsiio.cdb[0],
 			io->io_hdr.nexus.initid,
 			io->io_hdr.nexus.targ_port,
 			io->io_hdr.nexus.targ_lun,
 			(io->io_hdr.io_type ==
 			CTL_IO_TASK) ?
 			io->taskio.tag_num :
 			io->scsiio.tag_num,
 		        io->io_hdr.flags,
 			io->io_hdr.status);
 	} else
 		io->io_hdr.flags |= CTL_FLAG_ALREADY_DONE;
 #endif
 
 	/*
 	 * Internal copies of an I/O bypass the normal done processing
 	 * entirely.
 	 */
 	if ((io->io_hdr.flags & CTL_FLAG_INT_COPY) != 0)
 		return;
 
 #ifdef CTL_IO_DELAY
 	if ((io->io_hdr.flags & CTL_FLAG_DELAY_DONE) == 0) {
 		struct ctl_lun *lun = CTL_LUN(io);
 
 		/*
 		 * First pass: if the LUN has a completion delay configured,
 		 * arm a callout that calls ctl_done() again later and stop
 		 * here.  A one-shot delay is cleared once it has been used.
 		 */
 		if (lun != NULL && lun->delay_info.done_delay > 0) {
 			callout_init(&io->io_hdr.delay_callout, /*mpsafe*/ 1);
 			io->io_hdr.flags |= CTL_FLAG_DELAY_DONE;
 			callout_reset(&io->io_hdr.delay_callout,
 				      lun->delay_info.done_delay * hz,
 				      ctl_done_timer_wakeup, io);
 			if (lun->delay_info.done_type == CTL_DELAY_TYPE_ONESHOT)
 				lun->delay_info.done_delay = 0;
 			return;
 		}
 	} else {
 		/* Second pass, after the delay: clear the marker and go on. */
 		io->io_hdr.flags &= ~CTL_FLAG_DELAY_DONE;
 	}
 #endif /* CTL_IO_DELAY */
 
 	ctl_enqueue_done(io);
 }
 
 /*
  * Main loop of a CTL worker thread.  Until the softc is marked for
  * shutdown it drains this worker's four queues, one I/O per iteration
  * and always rescanning from the highest priority queue, then sleeps
  * when all of them are empty.
  */
 static void
 ctl_work_thread(void *arg)
 {
 	struct ctl_thread *worker = (struct ctl_thread *)arg;
 	struct ctl_softc *softc = worker->ctl_softc;
 	union ctl_io *io;
 
 	CTL_DEBUG_PRINT(("ctl_work_thread starting\n"));
 	thread_lock(curthread);
 	sched_prio(curthread, PUSER - 1);
 	thread_unlock(curthread);
 
 	while (!softc->shutdown) {
 		/*
 		 * Queue priority, highest first:
 		 * - ISC
 		 * - done queue (to free up resources, unblock other commands)
 		 * - incoming queue
 		 * - RtR queue
 		 *
 		 * The queue lock is dropped before an I/O is processed.  If
 		 * all queues are empty we go to sleep.
 		 */
 		mtx_lock(&worker->queue_lock);
 
 		io = (union ctl_io *)STAILQ_FIRST(&worker->isc_queue);
 		if (io != NULL) {
 			STAILQ_REMOVE_HEAD(&worker->isc_queue, links);
 			mtx_unlock(&worker->queue_lock);
 			ctl_handle_isc(io);
 			continue;
 		}
 
 		io = (union ctl_io *)STAILQ_FIRST(&worker->done_queue);
 		if (io != NULL) {
 			STAILQ_REMOVE_HEAD(&worker->done_queue, links);
 			mtx_unlock(&worker->queue_lock);
 			/* Clears any blocked commands and calls fe_done. */
 			ctl_process_done(io);
 			continue;
 		}
 
 		io = (union ctl_io *)STAILQ_FIRST(&worker->incoming_queue);
 		if (io != NULL) {
 			STAILQ_REMOVE_HEAD(&worker->incoming_queue, links);
 			mtx_unlock(&worker->queue_lock);
 			if (io->io_hdr.io_type != CTL_IO_TASK)
 				ctl_scsiio_precheck(softc, &io->scsiio);
 			else
 				ctl_run_task(io);
 			continue;
 		}
 
 		io = (union ctl_io *)STAILQ_FIRST(&worker->rtr_queue);
 		if (io != NULL) {
 			STAILQ_REMOVE_HEAD(&worker->rtr_queue, links);
 			mtx_unlock(&worker->queue_lock);
 			if (ctl_scsiio(&io->scsiio) != CTL_RETVAL_COMPLETE)
 				CTL_DEBUG_PRINT(("ctl_scsiio failed\n"));
 			continue;
 		}
 
 		/*
 		 * Nothing to do: sleep until an enqueue routine wakes us.
 		 * PDROP leaves the queue lock released on return; the next
 		 * iteration takes it again.
 		 */
 		mtx_sleep(worker, &worker->queue_lock, PDROP, "-", 0);
 	}
 	worker->thread = NULL;
 	kthread_exit();
 }
 
 /*
  * Thread that creates LUNs queued on the softc's pending_lun_queue.  It
  * runs until the softc is marked for shutdown.
  */
 static void
 ctl_lun_thread(void *arg)
 {
 	struct ctl_softc *softc = (struct ctl_softc *)arg;
 	struct ctl_be_lun *pending;
 
 	CTL_DEBUG_PRINT(("ctl_lun_thread starting\n"));
 	thread_lock(curthread);
 	sched_prio(curthread, PUSER - 1);
 	thread_unlock(curthread);
 
 	while (!softc->shutdown) {
 		mtx_lock(&softc->ctl_lock);
 		pending = STAILQ_FIRST(&softc->pending_lun_queue);
 		if (pending == NULL) {
 			/*
 			 * Queue is empty: sleep until there is something to
 			 * do.  PDROP leaves ctl_lock released on return.
 			 */
 			mtx_sleep(&softc->pending_lun_queue, &softc->ctl_lock,
 			    PDROP, "-", 0);
 			continue;
 		}
 
 		/* Dequeue under the lock, create the LUN without it. */
 		STAILQ_REMOVE_HEAD(&softc->pending_lun_queue, links);
 		mtx_unlock(&softc->ctl_lock);
 		ctl_create_lun(pending);
 	}
 	softc->lun_thread = NULL;
 	kthread_exit();
 }
 
 /*
  * Thread that periodically evaluates the logical block provisioning
  * thresholds of all LUNs.  For each eligible LUN it queries the backend
  * for the attribute named by each enabled threshold descriptor, compares
  * the value against the threshold, and then establishes or clears the
  * CTL_UA_THIN_PROV_THRES unit attention accordingly.  In XFER HA mode a
  * change is also reported to the other controller.  After each pass the
  * thread sleeps for up to CTL_LBP_PERIOD seconds.
  */
 static void
 ctl_thresh_thread(void *arg)
 {
 	struct ctl_softc *softc = (struct ctl_softc *)arg;
 	struct ctl_lun *lun;
 	struct ctl_logical_block_provisioning_page *page;
 	const char *attr;
 	union ctl_ha_msg msg;
 	uint64_t thres, val;
 	int i, e, set;
 
 	CTL_DEBUG_PRINT(("ctl_thresh_thread starting\n"));
 	thread_lock(curthread);
 	sched_prio(curthread, PUSER - 1);
 	thread_unlock(curthread);
 
 	while (!softc->shutdown) {
 		mtx_lock(&softc->ctl_lock);
 		STAILQ_FOREACH(lun, &softc->lun_list, links) {
 			/*
 			 * Skip LUNs that are disabled, have no media, or
 			 * whose backend cannot report attributes.
 			 */
 			if ((lun->flags & CTL_LUN_DISABLED) ||
 			    (lun->flags & CTL_LUN_NO_MEDIA) ||
 			    lun->backend->lun_attr == NULL)
 				continue;
 			/* In XFER mode only LUNs with PRIMARY_SC set are checked. */
 			if ((lun->flags & CTL_LUN_PRIMARY_SC) == 0 &&
 			    softc->ha_mode == CTL_HA_MODE_XFER)
 				continue;
 			/* Skip LUNs whose RWER page has LBPERE clear. */
 			if ((lun->MODE_RWER.byte8 & SMS_RWER_LBPERE) == 0)
 				continue;
 			/* e becomes nonzero once a threshold is found crossed. */
 			e = 0;
 			page = &lun->MODE_LBP;
 			for (i = 0; i < CTL_NUM_LBP_THRESH; i++) {
 				if ((page->descr[i].flags & SLBPPD_ENABLED) == 0)
 					continue;
 				thres = scsi_4btoul(page->descr[i].count);
 				thres <<= CTL_LBP_EXPONENT;
 				/*
 				 * Map the descriptor's resource code to a
 				 * backend attribute name.  The "continue" in
 				 * the default case moves on to the next
 				 * descriptor.
 				 */
 				switch (page->descr[i].resource) {
 				case 0x01:
 					attr = "blocksavail";
 					break;
 				case 0x02:
 					attr = "blocksused";
 					break;
 				case 0xf1:
 					attr = "poolblocksavail";
 					break;
 				case 0xf2:
 					attr = "poolblocksused";
 					break;
 				default:
 					continue;
 				}
 				/* ctl_lock is dropped across the backend call. */
 				mtx_unlock(&softc->ctl_lock); // XXX
 				val = lun->backend->lun_attr(
 				    lun->be_lun->be_lun, attr);
 				mtx_lock(&softc->ctl_lock);
 				/* UINT64_MAX is treated as "no value available". */
 				if (val == UINT64_MAX)
 					continue;
 				/*
 				 * An "increasing" arming fires at or above
 				 * the threshold, any other arming at or
 				 * below it.
 				 */
 				if ((page->descr[i].flags & SLBPPD_ARMING_MASK)
 				    == SLBPPD_ARMING_INC)
 					e = (val >= thres);
 				else
 					e = (val <= thres);
 				/* Stop at the first crossed threshold; i identifies it. */
 				if (e)
 					break;
 			}
 			mtx_lock(&lun->lun_lock);
 			/*
 			 * set records what happened to the UA: 1 = established,
 			 * 0 = left alone (last UA was less than
 			 * CTL_LBP_UA_PERIOD ago), -1 = cleared.
 			 */
 			if (e) {
 				/*
 				 * Store the byte offset of the triggering
 				 * descriptor within the page in ua_tpt_info.
 				 */
 				scsi_u64to8b((uint8_t *)&page->descr[i] -
 				    (uint8_t *)page, lun->ua_tpt_info);
 				if (lun->lasttpt == 0 ||
 				    time_uptime - lun->lasttpt >= CTL_LBP_UA_PERIOD) {
 					lun->lasttpt = time_uptime;
 					ctl_est_ua_all(lun, -1, CTL_UA_THIN_PROV_THRES);
 					set = 1;
 				} else
 					set = 0;
 			} else {
 				lun->lasttpt = 0;
 				ctl_clr_ua_all(lun, -1, CTL_UA_THIN_PROV_THRES);
 				set = -1;
 			}
 			mtx_unlock(&lun->lun_lock);
 			if (set != 0 &&
 			    lun->ctl_softc->ha_mode == CTL_HA_MODE_XFER) {
 				/* Send msg to other side. */
 				bzero(&msg.ua, sizeof(msg.ua));
 				msg.hdr.msg_type = CTL_MSG_UA;
 				msg.hdr.nexus.initid = -1;
 				msg.hdr.nexus.targ_port = -1;
 				msg.hdr.nexus.targ_lun = lun->lun;
 				msg.hdr.nexus.targ_mapped_lun = lun->lun;
 				msg.ua.ua_all = 1;
 				msg.ua.ua_set = (set > 0);
 				msg.ua.ua_type = CTL_UA_THIN_PROV_THRES;
 				memcpy(msg.ua.ua_info, lun->ua_tpt_info, 8);
 				/* ctl_lock is dropped around the M_WAITOK send. */
 				mtx_unlock(&softc->ctl_lock); // XXX
 				ctl_ha_msg_send(CTL_HA_CHAN_CTL, &msg,
 				    sizeof(msg.ua), M_WAITOK);
 				mtx_lock(&softc->ctl_lock);
 			}
 		}
 		/* Sleep with a timeout of CTL_LBP_PERIOD * hz ticks; PDROP is set. */
 		mtx_sleep(&softc->thresh_thread, &softc->ctl_lock,
 		    PDROP, "-", CTL_LBP_PERIOD * hz);
 	}
 	softc->thresh_thread = NULL;
 	kthread_exit();
 }
 
 /*
  * Append a newly received I/O to the incoming queue of a worker thread
  * and wake that worker.  The worker is selected from the I/O's target
  * port and initiator ID.
  */
 static void
 ctl_enqueue_incoming(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	struct ctl_thread *worker;
 	u_int slot;
 
 	slot = (io->io_hdr.nexus.targ_port * 127 +
 	    io->io_hdr.nexus.initid) % worker_threads;
 	worker = softc->threads + slot;
 
 	mtx_lock(&worker->queue_lock);
 	STAILQ_INSERT_TAIL(&worker->incoming_queue, &io->io_hdr, links);
 	mtx_unlock(&worker->queue_lock);
 
 	wakeup(worker);
 }
 
 /*
  * Append an I/O to the RtR queue of the worker thread selected by the
  * I/O's mapped LUN number, then wake that worker.
  */
 static void
 ctl_enqueue_rtr(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	struct ctl_thread *worker;
 
 	worker = softc->threads +
 	    (io->io_hdr.nexus.targ_mapped_lun % worker_threads);
 
 	mtx_lock(&worker->queue_lock);
 	STAILQ_INSERT_TAIL(&worker->rtr_queue, &io->io_hdr, links);
 	mtx_unlock(&worker->queue_lock);
 
 	wakeup(worker);
 }
 
 /*
  * Append a completed I/O to the done queue of the worker thread selected
  * by the I/O's mapped LUN number, then wake that worker.
  */
 static void
 ctl_enqueue_done(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	struct ctl_thread *worker;
 
 	worker = softc->threads +
 	    (io->io_hdr.nexus.targ_mapped_lun % worker_threads);
 
 	mtx_lock(&worker->queue_lock);
 	STAILQ_INSERT_TAIL(&worker->done_queue, &io->io_hdr, links);
 	mtx_unlock(&worker->queue_lock);
 
 	wakeup(worker);
 }
 
 /*
  * Append an I/O to the ISC queue of the worker thread selected by the
  * I/O's mapped LUN number, then wake that worker.
  */
 static void
 ctl_enqueue_isc(union ctl_io *io)
 {
 	struct ctl_softc *softc = CTL_SOFTC(io);
 	struct ctl_thread *worker;
 
 	worker = softc->threads +
 	    (io->io_hdr.nexus.targ_mapped_lun % worker_threads);
 
 	mtx_lock(&worker->queue_lock);
 	STAILQ_INSERT_TAIL(&worker->isc_queue, &io->io_hdr, links);
 	mtx_unlock(&worker->queue_lock);
 
 	wakeup(worker);
 }
 
 /*
  *  vim: ts=8
  */
Index: projects/nfsv42/sys/cam/ctl/ctl.h
===================================================================
--- projects/nfsv42/sys/cam/ctl/ctl.h	(revision 350367)
+++ projects/nfsv42/sys/cam/ctl/ctl.h	(revision 350368)
@@ -1,207 +1,210 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003 Silicon Graphics International Corp.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
  * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl.h#5 $
  * $FreeBSD$
  */
 /*
  * Function definitions used both within CTL and potentially in various CTL
  * clients.
  *
  * Author: Ken Merry <ken@FreeBSD.org>
  */
 
 #ifndef	_CTL_H_
 #define	_CTL_H_
 
 #define	CTL_RETVAL_COMPLETE	0
 #define	CTL_RETVAL_QUEUED	1
 #define	CTL_RETVAL_ALLOCATED	2
 #define	CTL_RETVAL_ERROR	3
 
 /*
  * Port types.  Every type has its own bit.  CTL_PORT_ALL (0xff) covers all
  * of the types listed before it; CTL_PORT_ISC (0x100) lies outside that
  * mask.
  */
 typedef enum {
 	CTL_PORT_NONE		= 0x00,
 	CTL_PORT_FC		= 0x01,
 	CTL_PORT_SCSI		= 0x02,
 	CTL_PORT_IOCTL		= 0x04,
 	CTL_PORT_INTERNAL	= 0x08,
 	CTL_PORT_ISCSI		= 0x10,
 	CTL_PORT_SAS		= 0x20,
 	CTL_PORT_UMASS		= 0x40,
 	CTL_PORT_ALL		= 0xff,
 	CTL_PORT_ISC		= 0x100 // FC port for inter-shelf communication
 } ctl_port_type;
 
 /*
  * Description of one port: its type and name, its target port number, its
  * physical and virtual port numbers, a flags word, world wide node and
  * port names, and an online indicator.
  *
  * NOTE(review): CTL_PORT_WWNN_VALID and CTL_PORT_WWPN_VALID are defined
  * next to the flags field and presumably say whether wwnn / wwpn hold
  * valid values -- confirm against the users of this structure.
  */
 struct ctl_port_entry {
 	ctl_port_type		port_type;
 	char			port_name[64];
 	int32_t			targ_port;
 	int			physical_port;
 	int			virtual_port;
 	u_int			flags;
 #define	CTL_PORT_WWNN_VALID	0x01
 #define	CTL_PORT_WWPN_VALID	0x02
 	uint64_t		wwnn;
 	uint64_t		wwpn;
 	int			online;
 };
 
 /*
  * Header identifying a mode page by page code and subpage, followed by
  * two 16-bit length counters.
  *
  * NOTE(review): the exact meaning of len_used and len_left is not visible
  * from this header -- confirm against the code that fills them in.
  */
 struct ctl_modepage_header {
 	uint8_t			page_code;
 	uint8_t			subpage;
 	uint16_t		len_used;
 	uint16_t		len_left;
 };
 
 /* Mode page information; the only member is the common header. */
 union ctl_modepage_info {
 	struct ctl_modepage_header header;
 };
 
 /*
  * Serial number length, for VPD page 0x80.
  */
 #define	CTL_SN_LEN	16
 
 /*
  * Device ID length, for VPD page 0x83.
  */
 #define	CTL_DEVID_LEN	64
 #define	CTL_DEVID_MIN_LEN	16
 /*
  * WWPN length, for VPD page 0x83.
  */
 #define CTL_WWPN_LEN   8
 
 #define	CTL_DRIVER_NAME_LEN	32
 
 /*
  * Unit attention types. ASC/ASCQ values for these should be placed in
  * ctl_build_ua.  These are also listed in order of reporting priority.
  * i.e. a poweron UA is reported first, bus reset second, etc.
  */
 /*
  * Each unit attention type below is a distinct single bit.  Bit 0x0200 is
  * not assigned in this list.
  */
 typedef enum {
 	CTL_UA_NONE		= 0x0000,
 	CTL_UA_POWERON		= 0x0001,
 	CTL_UA_BUS_RESET	= 0x0002,
 	CTL_UA_TARG_RESET	= 0x0004,
 	CTL_UA_I_T_NEXUS_LOSS	= 0x0008,
 	CTL_UA_LUN_RESET	= 0x0010,
 	CTL_UA_LUN_CHANGE	= 0x0020,
 	CTL_UA_MODE_CHANGE	= 0x0040,
 	CTL_UA_LOG_CHANGE	= 0x0080,
 	CTL_UA_INQ_CHANGE	= 0x0100,
 	CTL_UA_RES_PREEMPT	= 0x0400,
 	CTL_UA_RES_RELEASE	= 0x0800,
 	CTL_UA_REG_PREEMPT	= 0x1000,
 	CTL_UA_ASYM_ACC_CHANGE	= 0x2000,
 	CTL_UA_CAPACITY_CHANGE	= 0x4000,
 	CTL_UA_THIN_PROV_THRES	= 0x8000,
 	CTL_UA_MEDIUM_CHANGE	= 0x10000,
 	CTL_UA_IE		= 0x20000
 } ctl_ua_type;
 
 #ifdef	_KERNEL
 
 MALLOC_DECLARE(M_CTL);
 
 struct ctl_page_index;
 
 #ifdef SYSCTL_DECL	/* from sysctl.h */
 SYSCTL_DECL(_kern_cam_ctl);
 #endif
 
 struct ctl_lun;
 struct ctl_port;
 struct ctl_softc;
 
 /*
  * Put a string into an sbuf, escaping characters that are illegal or not
  * recommended in XML.  Note this doesn't escape everything, just > < and &.
  */
 int ctl_sbuf_printf_esc(struct sbuf *sb, char *str, int size);
 
 int ctl_ffz(uint32_t *mask, uint32_t first, uint32_t last);
 int ctl_set_mask(uint32_t *mask, uint32_t bit);
 int ctl_clear_mask(uint32_t *mask, uint32_t bit);
 int ctl_is_set(uint32_t *mask, uint32_t bit);
 int ctl_default_page_handler(struct ctl_scsiio *ctsio,
 			     struct ctl_page_index *page_index,
 			     uint8_t *page_ptr);
 int ctl_ie_page_handler(struct ctl_scsiio *ctsio,
 			struct ctl_page_index *page_index,
 			uint8_t *page_ptr);
+int ctl_temp_log_sense_handler(struct ctl_scsiio *ctsio,
+				   struct ctl_page_index *page_index,
+				   int pc);
 int ctl_lbp_log_sense_handler(struct ctl_scsiio *ctsio,
 				   struct ctl_page_index *page_index,
 				   int pc);
 int ctl_sap_log_sense_handler(struct ctl_scsiio *ctsio,
 				   struct ctl_page_index *page_index,
 				   int pc);
 int ctl_ie_log_sense_handler(struct ctl_scsiio *ctsio,
 				   struct ctl_page_index *page_index,
 				   int pc);
 int ctl_config_move_done(union ctl_io *io);
 void ctl_datamove(union ctl_io *io);
 void ctl_serseq_done(union ctl_io *io);
 void ctl_done(union ctl_io *io);
 void ctl_data_submit_done(union ctl_io *io);
 void ctl_config_read_done(union ctl_io *io);
 void ctl_config_write_done(union ctl_io *io);
 void ctl_portDB_changed(int portnum);
 int ctl_ioctl_io(struct cdev *dev, u_long cmd, caddr_t addr, int flag,
 		 struct thread *td);
 
 void ctl_est_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua);
 void ctl_est_ua_port(struct ctl_lun *lun, int port, uint32_t except,
     ctl_ua_type ua);
 void ctl_est_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua);
 void ctl_clr_ua(struct ctl_lun *lun, uint32_t initidx, ctl_ua_type ua);
 void ctl_clr_ua_all(struct ctl_lun *lun, uint32_t except, ctl_ua_type ua);
 void ctl_clr_ua_allluns(struct ctl_softc *ctl_softc, uint32_t initidx,
     ctl_ua_type ua_type);
 
 uint32_t ctl_decode_lun(uint64_t encoded);
 uint64_t ctl_encode_lun(uint32_t decoded);
 
 void ctl_isc_announce_lun(struct ctl_lun *lun);
 void ctl_isc_announce_port(struct ctl_port *port);
 void ctl_isc_announce_iid(struct ctl_port *port, int iid);
 void ctl_isc_announce_mode(struct ctl_lun *lun, uint32_t initidx,
     uint8_t page, uint8_t subpage);
 
 int ctl_expand_number(const char *buf, uint64_t *num);
 
 #endif	/* _KERNEL */
 
 #endif	/* _CTL_H_ */
 
 /*
  * vim: ts=8
  */
Index: projects/nfsv42/sys/cam/ctl/ctl_private.h
===================================================================
--- projects/nfsv42/sys/cam/ctl/ctl_private.h	(revision 350367)
+++ projects/nfsv42/sys/cam/ctl/ctl_private.h	(revision 350368)
@@ -1,550 +1,553 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2003, 2004, 2005, 2008 Silicon Graphics International Corp.
  * Copyright (c) 2014-2017 Alexander Motin <mav@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification.
  * 2. Redistributions in binary form must reproduce at minimum a disclaimer
  *    substantially similar to the "NO WARRANTY" disclaimer below
  *    ("Disclaimer") and any redistribution must be conditioned upon
  *    including a substantially similar Disclaimer requirement for further
  *    binary redistribution.
  *
  * NO WARRANTY
  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR
  * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
  * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
  * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
  * POSSIBILITY OF SUCH DAMAGES.
  *
  * $Id: //depot/users/kenm/FreeBSD-test2/sys/cam/ctl/ctl_private.h#7 $
  * $FreeBSD$
  */
 /*
  * CAM Target Layer driver private data structures/definitions.
  *
  * Author: Ken Merry <ken@FreeBSD.org>
  */
 
 #ifndef	_CTL_PRIVATE_H_
 #define	_CTL_PRIVATE_H_
 
 #include <cam/scsi/scsi_all.h>
 #include <cam/scsi/scsi_cd.h>
 #include <cam/scsi/scsi_da.h>
 
 /*
  * SCSI vendor and product names.
  */
 #define	CTL_VENDOR		"FREEBSD "
 #define	CTL_DIRECT_PRODUCT	"CTLDISK         "
 #define	CTL_PROCESSOR_PRODUCT	"CTLPROCESSOR    "
 #define	CTL_CDROM_PRODUCT	"CTLCDROM        "
 #define	CTL_UNKNOWN_PRODUCT	"CTLDEVICE       "
 
 #define CTL_POOL_ENTRIES_OTHER_SC   200
 
 struct ctl_io_pool {
 	char			name[64];
 	uint32_t		id;
 	struct ctl_softc	*ctl_softc;
 	struct uma_zone		*zone;
 };
 
 typedef enum {
 	CTL_SER_BLOCK,
 	CTL_SER_BLOCKOPT,
 	CTL_SER_EXTENT,
 	CTL_SER_EXTENTOPT,
 	CTL_SER_EXTENTSEQ,
 	CTL_SER_PASS,
 	CTL_SER_SKIP
 } ctl_serialize_action;
 
 typedef enum {
 	CTL_ACTION_BLOCK,
 	CTL_ACTION_OVERLAP,
 	CTL_ACTION_OVERLAP_TAG,
 	CTL_ACTION_PASS,
 	CTL_ACTION_SKIP,
 	CTL_ACTION_ERROR
 } ctl_action;
 
 /*
  * Per-command flag bits; this enum is the type of ctl_cmd_entry.flags.
  *
  * WARNING:  Keep the bottom nibble here free, we OR in the data direction
  * flags for each command.
  *
  * Note:  "OK_ON_NO_LUN"   == we don't have to have a lun configured
  *        "OK_ON_BOTH"     == we have to have a lun configured; its value
  *                            (0x0700) is the OR of the OK_ON_PROC,
  *                            OK_ON_DIRECT and OK_ON_CDROM bits
  *        "SA5"            == command has 5-bit service action at byte 1
  */
 typedef enum {
 	CTL_CMD_FLAG_NONE		= 0x0000,
 	CTL_CMD_FLAG_NO_SENSE		= 0x0010,
 	CTL_CMD_FLAG_ALLOW_ON_RESV	= 0x0020,
 	CTL_CMD_FLAG_ALLOW_ON_PR_RESV	= 0x0040,
 	CTL_CMD_FLAG_ALLOW_ON_PR_WRESV	= 0x0080,
 	CTL_CMD_FLAG_OK_ON_PROC		= 0x0100,
 	CTL_CMD_FLAG_OK_ON_DIRECT	= 0x0200,
 	CTL_CMD_FLAG_OK_ON_CDROM	= 0x0400,
 	/* Mask: PROC | DIRECT | CDROM, not an independent bit. */
 	CTL_CMD_FLAG_OK_ON_BOTH		= 0x0700,
 	CTL_CMD_FLAG_OK_ON_NO_LUN	= 0x0800,
 	CTL_CMD_FLAG_OK_ON_NO_MEDIA	= 0x1000,
 	CTL_CMD_FLAG_OK_ON_STANDBY	= 0x2000,
 	CTL_CMD_FLAG_OK_ON_UNAVAIL	= 0x4000,
 	CTL_CMD_FLAG_SA5		= 0x8000,
 	/* First flag that does not fit in 16 bits. */
 	CTL_CMD_FLAG_RUN_HERE		= 0x10000
 } ctl_cmd_flags;
 
 /*
  * Index codes stored in ctl_cmd_entry.seridx.  The codes are dense and
  * start at 0, so CTL_SERIDX_COUNT equals the number of real codes and can
  * size a table indexed by them.  CTL_SERIDX_INVLD has the same value as
  * CTL_SERIDX_COUNT, i.e. one past the last real code.
  */
 typedef enum {
 	CTL_SERIDX_TUR	= 0,
 	CTL_SERIDX_READ,
 	CTL_SERIDX_WRITE,
 	CTL_SERIDX_UNMAP,
 	CTL_SERIDX_SYNC,
 	CTL_SERIDX_MD_SNS,
 	CTL_SERIDX_MD_SEL,
 	CTL_SERIDX_RQ_SNS,
 	CTL_SERIDX_INQ,
 	CTL_SERIDX_RD_CAP,
 	CTL_SERIDX_RES,
 	CTL_SERIDX_LOG_SNS,
 	CTL_SERIDX_FORMAT,
 	CTL_SERIDX_START,
 	/* TBD: others to be filled in as needed */
 	CTL_SERIDX_COUNT, /* LAST, not a normal code, provides # codes */
 	CTL_SERIDX_INVLD = CTL_SERIDX_COUNT
 } ctl_seridx;
 
 typedef int	ctl_opfunc(struct ctl_scsiio *ctsio);
 
 /*
  * Description of one SCSI command.  ctl_cmd_table[256], declared later in
  * this header, is an array of these entries.
  */
 struct ctl_cmd_entry {
 	ctl_opfunc		*execute;	/* handler, takes the ctl_scsiio */
 	ctl_seridx		seridx;		/* one of the CTL_SERIDX_* codes */
 	ctl_cmd_flags		flags;		/* CTL_CMD_FLAG_* bits */
 	ctl_lun_error_pattern	pattern;
 	uint8_t			length;		/* CDB length */
 	uint8_t			usage[15];	/* Mask of allowed CDB bits
 						 * after the opcode byte. */
 };
 
 /*
  * Flag bits held in ctl_lun.flags.  Each value is a distinct single bit.
  * Note that 0x400 is not assigned: the values go from 0x200 straight to
  * 0x800.
  */
 typedef enum {
 	CTL_LUN_NONE		= 0x000,
 	CTL_LUN_CONTROL		= 0x001,
 	CTL_LUN_RESERVED	= 0x002,
 	CTL_LUN_INVALID		= 0x004,
 	CTL_LUN_DISABLED	= 0x008,
 	CTL_LUN_MALLOCED	= 0x010,
 	CTL_LUN_STOPPED		= 0x020,
 	CTL_LUN_NO_MEDIA	= 0x040,
 	CTL_LUN_EJECTED		= 0x080,
 	CTL_LUN_PR_RESERVED	= 0x100,
 	CTL_LUN_PRIMARY_SC	= 0x200,
 	CTL_LUN_READONLY	= 0x800,
 	CTL_LUN_PEER_SC_PRIMARY	= 0x1000,
 	CTL_LUN_REMOVABLE	= 0x2000
 } ctl_lun_flags;
 
 typedef enum {
 	CTLBLOCK_FLAG_NONE	= 0x00,
 	CTLBLOCK_FLAG_INVALID	= 0x01
 } ctlblock_flags;
 
 union ctl_softcs {
 	struct ctl_softc	*ctl_softc;
 	struct ctlblock_softc	*ctlblock_softc;
 };
 
 /*
  * Mode page defaults.
  */
 #if 0
 /*
  * These values make Solaris trim off some of the capacity.
  */
 #define	CTL_DEFAULT_SECTORS_PER_TRACK	63
 #define	CTL_DEFAULT_HEADS		255
 /*
  * These values seem to work okay.
  */
 #define	CTL_DEFAULT_SECTORS_PER_TRACK	63
 #define	CTL_DEFAULT_HEADS		16
 /*
  * These values work reasonably well.
  */
 #define	CTL_DEFAULT_SECTORS_PER_TRACK	512
 #define	CTL_DEFAULT_HEADS		64
 #endif
 
 /*
  * Solaris is somewhat picky about how many heads and sectors per track you
  * have defined in mode pages 3 and 4.  These values seem to cause Solaris
  * to get the capacity more or less right when you run the format tool.
  * They still have problems when dealing with devices larger than 1TB,
  * but there isn't anything we can do about that.
  *
  * For smaller LUN sizes, this ends up causing the number of cylinders to
  * work out to 0.  Solaris actually recognizes that and comes up with its
  * own bogus geometry to fit the actual capacity of the drive.  They really
  * should just give up on geometry and stick to the read capacity
  * information alone for modern disk drives.
  *
  * One thing worth mentioning about Solaris' mkfs command is that it
  * doesn't like sectors per track values larger than 256.  512 seems to
  * work okay for format, but causes problems when you try to make a
  * filesystem.
  *
  * Another caveat about these values:  the product of these two values
  * really should be a power of 2.  This is because of the simplistic
  * shift-based calculation that we have to use on the i386 platform to
  * calculate the number of cylinders here.  (If you use a divide, you end
  * up calling __udivdi3(), which is a hardware FP call on the PC.  On the
  * XScale, it is done in software, so you can do that from inside the
  * kernel.)
  *
  * So for the current values (256 S/T, 128 H), we get 32768, which works
  * very nicely for calculating cylinders.
  *
  * If you want to change these values so that their product is no longer a
  * power of 2, re-visit the calculation in ctl_init_page_index().  You may
  * need to make it a bit more complicated to get the number of cylinders
  * right.
  */
 #define	CTL_DEFAULT_SECTORS_PER_TRACK	256
 #define	CTL_DEFAULT_HEADS		128
 
 #define	CTL_DEFAULT_ROTATION_RATE	SVPD_NON_ROTATING
 
 struct ctl_page_index;
 
 typedef int	ctl_modesen_handler(struct ctl_scsiio *ctsio,
 				    struct ctl_page_index *page_index,
 				    int pc);
 typedef int	ctl_modesel_handler(struct ctl_scsiio *ctsio,
 				    struct ctl_page_index *page_index,
 				    uint8_t *page_ptr);
 
 /*
  * Bits for ctl_page_index.page_flags.  The template tables in this header
  * OR the DIRECT, PROC and CDROM bits together per page; CTL_PAGE_FLAG_ALL
  * (0x07) is the OR of all three.
  */
 typedef enum {
 	CTL_PAGE_FLAG_NONE	 = 0x00,
 	CTL_PAGE_FLAG_DIRECT	 = 0x01,
 	CTL_PAGE_FLAG_PROC	 = 0x02,
 	CTL_PAGE_FLAG_CDROM	 = 0x04,
 	CTL_PAGE_FLAG_ALL	 = 0x07
 } ctl_page_flags;
 
 /*
  * Descriptor for one mode page or log page.
  *
  * page_index_template[] and log_page_index_template[] initialize this
  * structure positionally, so the member order here must not change
  * without updating both tables.
  */
 struct ctl_page_index {
 	uint8_t			page_code;
 	uint8_t			subpage;
 	uint16_t		page_len;	/* sizeof(page struct) in the mode
 						 * template, 0 in the log one */
 	uint8_t			*page_data;	/* NULL in both templates */
 	ctl_page_flags		page_flags;	/* CTL_PAGE_FLAG_* bits */
 	ctl_modesen_handler	*sense_handler;	/* set by log template entries */
 	ctl_modesel_handler	*select_handler; /* set by mode template entries */
 };
 
 #define	CTL_PAGE_CURRENT	0x00
 #define	CTL_PAGE_CHANGEABLE	0x01
 #define	CTL_PAGE_DEFAULT	0x02
 #define	CTL_PAGE_SAVED		0x03
 
 #define CTL_NUM_LBP_PARAMS	4
 #define CTL_NUM_LBP_THRESH	4
 #define CTL_LBP_EXPONENT	11	/* 2048 sectors */
 #define CTL_LBP_PERIOD		10	/* 10 seconds */
 #define CTL_LBP_UA_PERIOD	300	/* 5 minutes */
 
 struct ctl_logical_block_provisioning_page {
 	struct scsi_logical_block_provisioning_page	main;
 	struct scsi_logical_block_provisioning_page_descr descr[CTL_NUM_LBP_THRESH];
 };
 
 static const struct ctl_page_index page_index_template[] = {
 	{SMS_RW_ERROR_RECOVERY_PAGE, 0, sizeof(struct scsi_da_rw_recovery_page), NULL,
 	 CTL_PAGE_FLAG_DIRECT | CTL_PAGE_FLAG_CDROM, NULL, ctl_default_page_handler},
 	{SMS_FORMAT_DEVICE_PAGE, 0, sizeof(struct scsi_format_page), NULL,
 	 CTL_PAGE_FLAG_DIRECT, NULL, NULL},
 	{SMS_RIGID_DISK_PAGE, 0, sizeof(struct scsi_rigid_disk_page), NULL,
 	 CTL_PAGE_FLAG_DIRECT, NULL, NULL},
 	{SMS_VERIFY_ERROR_RECOVERY_PAGE, 0, sizeof(struct scsi_da_verify_recovery_page), NULL,
 	 CTL_PAGE_FLAG_DIRECT | CTL_PAGE_FLAG_CDROM, NULL, ctl_default_page_handler},
 	{SMS_CACHING_PAGE, 0, sizeof(struct scsi_caching_page), NULL,
 	 CTL_PAGE_FLAG_DIRECT | CTL_PAGE_FLAG_CDROM,
 	 NULL, ctl_default_page_handler},
 	{SMS_CONTROL_MODE_PAGE, 0, sizeof(struct scsi_control_page), NULL,
 	 CTL_PAGE_FLAG_ALL, NULL, ctl_default_page_handler},
 	{SMS_CONTROL_MODE_PAGE | SMPH_SPF, 0x01,
 	 sizeof(struct scsi_control_ext_page), NULL,
 	 CTL_PAGE_FLAG_ALL, NULL, ctl_default_page_handler},
 	{SMS_INFO_EXCEPTIONS_PAGE, 0, sizeof(struct scsi_info_exceptions_page), NULL,
 	 CTL_PAGE_FLAG_ALL, NULL, ctl_ie_page_handler},
 	{SMS_INFO_EXCEPTIONS_PAGE | SMPH_SPF, 0x02,
 	 sizeof(struct ctl_logical_block_provisioning_page), NULL,
 	 CTL_PAGE_FLAG_DIRECT, NULL, ctl_default_page_handler},
 	{SMS_CDDVD_CAPS_PAGE, 0,
 	 sizeof(struct scsi_cddvd_capabilities_page), NULL,
 	 CTL_PAGE_FLAG_CDROM, NULL, NULL},
 };
 
 /*
  * Number of entries in page_index_template[].  The expansion is fully
  * parenthesized so that it stays a single operand when used inside a
  * larger expression (e.g. "x / CTL_NUM_MODE_PAGES").
  */
 #define	CTL_NUM_MODE_PAGES (sizeof(page_index_template) /   \
 			    sizeof(page_index_template[0]))
 
 /*
  * Mode page storage embedded in struct ctl_lun (member "mode_pages").
  *
  * Every page type is kept as an array of four copies; the MODE_* macros
  * that follow select the CTL_PAGE_CURRENT copy.
  * NOTE(review): the four slots presumably correspond to
  * CTL_PAGE_CURRENT/CHANGEABLE/DEFAULT/SAVED (0-3) -- confirm against
  * ctl_init_page_index().
  */
 struct ctl_mode_pages {
 	struct scsi_da_rw_recovery_page	rw_er_page[4];
 	struct scsi_format_page		format_page[4];
 	struct scsi_rigid_disk_page	rigid_disk_page[4];
 	struct scsi_da_verify_recovery_page	verify_er_page[4];
 	struct scsi_caching_page	caching_page[4];
 	struct scsi_control_page	control_page[4];
 	struct scsi_control_ext_page	control_ext_page[4];
 	struct scsi_info_exceptions_page ie_page[4];
 	struct ctl_logical_block_provisioning_page lbp_page[4];
 	struct scsi_cddvd_capabilities_page cddvd_page[4];
 	/* Same number of entries as page_index_template[]. */
 	struct ctl_page_index		index[CTL_NUM_MODE_PAGES];
 };
 
 #define	MODE_RWER	mode_pages.rw_er_page[CTL_PAGE_CURRENT]
 #define	MODE_FMT	mode_pages.format_page[CTL_PAGE_CURRENT]
 #define	MODE_RDISK	mode_pages.rigid_disk_page[CTL_PAGE_CURRENT]
 #define	MODE_VER	mode_pages.verify_er_page[CTL_PAGE_CURRENT]
 #define	MODE_CACHING	mode_pages.caching_page[CTL_PAGE_CURRENT]
 #define	MODE_CTRL	mode_pages.control_page[CTL_PAGE_CURRENT]
 #define	MODE_CTRLE	mode_pages.control_ext_page[CTL_PAGE_CURRENT]
 #define	MODE_IE		mode_pages.ie_page[CTL_PAGE_CURRENT]
 #define	MODE_LBP	mode_pages.lbp_page[CTL_PAGE_CURRENT]
 #define	MODE_CDDVD	mode_pages.cddvd_page[CTL_PAGE_CURRENT]
 
 static const struct ctl_page_index log_page_index_template[] = {
 	{SLS_SUPPORTED_PAGES_PAGE, 0, 0, NULL,
 	 CTL_PAGE_FLAG_ALL, NULL, NULL},
 	{SLS_SUPPORTED_PAGES_PAGE, SLS_SUPPORTED_SUBPAGES_SUBPAGE, 0, NULL,
 	 CTL_PAGE_FLAG_ALL, NULL, NULL},
+	{SLS_TEMPERATURE, 0, 0, NULL,
+	 CTL_PAGE_FLAG_DIRECT, ctl_temp_log_sense_handler, NULL},
 	{SLS_LOGICAL_BLOCK_PROVISIONING, 0, 0, NULL,
 	 CTL_PAGE_FLAG_DIRECT, ctl_lbp_log_sense_handler, NULL},
 	{SLS_STAT_AND_PERF, 0, 0, NULL,
 	 CTL_PAGE_FLAG_ALL, ctl_sap_log_sense_handler, NULL},
 	{SLS_IE_PAGE, 0, 0, NULL,
 	 CTL_PAGE_FLAG_ALL, ctl_ie_log_sense_handler, NULL},
 };
 
 /*
  * Number of entries in log_page_index_template[].  The expansion is fully
  * parenthesized so that it stays a single operand when used inside a
  * larger expression (e.g. "x / CTL_NUM_LOG_PAGES").
  */
 #define	CTL_NUM_LOG_PAGES (sizeof(log_page_index_template) /   \
 			   sizeof(log_page_index_template[0]))
 
 /*
  * Log page storage embedded in struct ctl_lun (member "log_pages").
  * Buffer sizes are derived from the number of entries in
  * log_page_index_template[] and from CTL_NUM_LBP_PARAMS.
  */
 struct ctl_log_pages {
 	/* One byte per log_page_index_template[] entry. */
 	uint8_t				pages_page[CTL_NUM_LOG_PAGES];
 	/* Two bytes per log_page_index_template[] entry. */
 	uint8_t				subpages_page[CTL_NUM_LOG_PAGES * 2];
 	/* 12 bytes for each of the CTL_NUM_LBP_PARAMS parameters. */
 	uint8_t				lbp_page[12*CTL_NUM_LBP_PARAMS];
 	/* Three SCSI log structures laid out back to back. */
 	struct stat_page {
 		struct scsi_log_stat_and_perf sap;
 		struct scsi_log_idle_time it;
 		struct scsi_log_time_interval ti;
 	} stat_page;
+	struct scsi_log_temperature	temp_page[2];
 	struct scsi_log_informational_exceptions	ie_page;
 	/* Same number of entries as log_page_index_template[]. */
 	struct ctl_page_index		index[CTL_NUM_LOG_PAGES];
 };
 
 struct ctl_lun_delay_info {
 	ctl_delay_type		datamove_type;
 	uint32_t		datamove_delay;
 	ctl_delay_type		done_type;
 	uint32_t		done_delay;
 };
 
 #define CTL_PR_ALL_REGISTRANTS  0xFFFFFFFF
 #define CTL_PR_NO_RESERVATION   0xFFFFFFF0
 
 struct ctl_devid {
 	int		len;
 	uint8_t		data[];
 };
 
 #define NUM_HA_SHELVES		2
 
 #define CTL_WRITE_BUFFER_SIZE	262144
 
 struct tpc_list;
 struct ctl_lun {
 	struct mtx			lun_lock;
 	uint64_t			lun;
 	ctl_lun_flags			flags;
 	STAILQ_HEAD(,ctl_error_desc)	error_list;
 	uint64_t			error_serial;
 	struct ctl_softc		*ctl_softc;
 	struct ctl_be_lun		*be_lun;
 	struct ctl_backend_driver	*backend;
 	struct ctl_lun_delay_info	delay_info;
 #ifdef CTL_TIME_IO
 	sbintime_t			idle_time;
 	sbintime_t			last_busy;
 #endif
 	TAILQ_HEAD(ctl_ooaq, ctl_io_hdr)  ooa_queue;
 	STAILQ_ENTRY(ctl_lun)		links;
 	struct scsi_sense_data		**pending_sense;
 	ctl_ua_type			**pending_ua;
 	uint8_t				ua_tpt_info[8];
 	time_t				lasttpt;
 	uint8_t				ie_asc;	/* Informational exceptions */
 	uint8_t				ie_ascq;
 	int				ie_reported;	/* Already reported */
 	uint32_t			ie_reportcnt;	/* REPORT COUNT */
 	struct callout			ie_callout;	/* INTERVAL TIMER */
 	struct ctl_mode_pages		mode_pages;
 	struct ctl_log_pages		log_pages;
 	struct ctl_io_stats		stats;
 	uint32_t			res_idx;
 	uint32_t			pr_generation;
 	uint64_t			**pr_keys;
 	int				pr_key_count;
 	uint32_t			pr_res_idx;
 	uint8_t				pr_res_type;
 	int				prevent_count;
 	uint32_t			*prevent;
 	uint8_t				*write_buffer;
 	struct ctl_devid		*lun_devid;
 	TAILQ_HEAD(tpc_lists, tpc_list) tpc_lists;
 };
 
 typedef enum {
 	CTL_FLAG_ACTIVE_SHELF	= 0x04
 } ctl_gen_flags;
 
 #define CTL_MAX_THREADS		16
 
 struct ctl_thread {
 	struct mtx_padalign queue_lock;
 	struct ctl_softc	*ctl_softc;
 	struct thread		*thread;
 	STAILQ_HEAD(, ctl_io_hdr) incoming_queue;
 	STAILQ_HEAD(, ctl_io_hdr) rtr_queue;
 	STAILQ_HEAD(, ctl_io_hdr) done_queue;
 	STAILQ_HEAD(, ctl_io_hdr) isc_queue;
 };
 
 struct tpc_token;
 struct ctl_softc {
 	struct mtx		ctl_lock;
 	struct cdev		*dev;
 	int			num_luns;
 	ctl_gen_flags		flags;
 	ctl_ha_mode		ha_mode;
 	int			ha_id;
 	int			is_single;
 	ctl_ha_link_state	ha_link;
 	int			port_min;
 	int			port_max;
 	int			port_cnt;
 	int			init_min;
 	int			init_max;
 	struct sysctl_ctx_list	sysctl_ctx;
 	struct sysctl_oid	*sysctl_tree;
 	void			*othersc_pool;
 	struct proc		*ctl_proc;
 	uint32_t		*ctl_lun_mask;
 	struct ctl_lun		**ctl_luns;
 	uint32_t		*ctl_port_mask;
 	STAILQ_HEAD(, ctl_lun)	lun_list;
 	STAILQ_HEAD(, ctl_be_lun)	pending_lun_queue;
 	uint32_t		num_frontends;
 	STAILQ_HEAD(, ctl_frontend)	fe_list;
 	uint32_t		num_ports;
 	STAILQ_HEAD(, ctl_port)	port_list;
 	struct ctl_port		**ctl_ports;
 	uint32_t		num_backends;
 	STAILQ_HEAD(, ctl_backend_driver)	be_list;
 	struct uma_zone		*io_zone;
 	uint32_t		cur_pool_id;
 	int			shutdown;
 	struct ctl_thread	threads[CTL_MAX_THREADS];
 	struct thread		*lun_thread;
 	struct thread		*thresh_thread;
 	TAILQ_HEAD(tpc_tokens, tpc_token)	tpc_tokens;
 	struct callout		tpc_timeout;
 	struct mtx		tpc_lock;
 };
 
 #ifdef _KERNEL
 
 extern const struct ctl_cmd_entry ctl_cmd_table[256];
 
 uint32_t ctl_get_initindex(struct ctl_nexus *nexus);
 int ctl_lun_map_init(struct ctl_port *port);
 int ctl_lun_map_deinit(struct ctl_port *port);
 int ctl_lun_map_set(struct ctl_port *port, uint32_t plun, uint32_t glun);
 int ctl_lun_map_unset(struct ctl_port *port, uint32_t plun);
 uint32_t ctl_lun_map_from_port(struct ctl_port *port, uint32_t plun);
 uint32_t ctl_lun_map_to_port(struct ctl_port *port, uint32_t glun);
 int ctl_pool_create(struct ctl_softc *ctl_softc, const char *pool_name,
 		    uint32_t total_ctl_io, void **npool);
 void ctl_pool_free(struct ctl_io_pool *pool);
 int ctl_scsi_release(struct ctl_scsiio *ctsio);
 int ctl_scsi_reserve(struct ctl_scsiio *ctsio);
 int ctl_start_stop(struct ctl_scsiio *ctsio);
 int ctl_prevent_allow(struct ctl_scsiio *ctsio);
 int ctl_sync_cache(struct ctl_scsiio *ctsio);
 int ctl_format(struct ctl_scsiio *ctsio);
 int ctl_read_buffer(struct ctl_scsiio *ctsio);
 int ctl_write_buffer(struct ctl_scsiio *ctsio);
 int ctl_write_same(struct ctl_scsiio *ctsio);
 int ctl_unmap(struct ctl_scsiio *ctsio);
 int ctl_mode_select(struct ctl_scsiio *ctsio);
 int ctl_mode_sense(struct ctl_scsiio *ctsio);
 int ctl_log_sense(struct ctl_scsiio *ctsio);
 int ctl_read_capacity(struct ctl_scsiio *ctsio);
 int ctl_read_capacity_16(struct ctl_scsiio *ctsio);
 int ctl_read_defect(struct ctl_scsiio *ctsio);
 int ctl_read_toc(struct ctl_scsiio *ctsio);
 int ctl_read_write(struct ctl_scsiio *ctsio);
 int ctl_cnw(struct ctl_scsiio *ctsio);
 int ctl_report_luns(struct ctl_scsiio *ctsio);
 int ctl_request_sense(struct ctl_scsiio *ctsio);
 int ctl_tur(struct ctl_scsiio *ctsio);
 int ctl_verify(struct ctl_scsiio *ctsio);
 int ctl_inquiry(struct ctl_scsiio *ctsio);
 int ctl_get_config(struct ctl_scsiio *ctsio);
 int ctl_get_event_status(struct ctl_scsiio *ctsio);
 int ctl_mechanism_status(struct ctl_scsiio *ctsio);
 int ctl_persistent_reserve_in(struct ctl_scsiio *ctsio);
 int ctl_persistent_reserve_out(struct ctl_scsiio *ctsio);
 int ctl_report_tagret_port_groups(struct ctl_scsiio *ctsio);
 int ctl_report_supported_opcodes(struct ctl_scsiio *ctsio);
 int ctl_report_supported_tmf(struct ctl_scsiio *ctsio);
 int ctl_report_timestamp(struct ctl_scsiio *ctsio);
 int ctl_get_lba_status(struct ctl_scsiio *ctsio);
 
 void ctl_tpc_init(struct ctl_softc *softc);
 void ctl_tpc_shutdown(struct ctl_softc *softc);
 void ctl_tpc_lun_init(struct ctl_lun *lun);
 void ctl_tpc_lun_clear(struct ctl_lun *lun, uint32_t initidx);
 void ctl_tpc_lun_shutdown(struct ctl_lun *lun);
 int ctl_inquiry_evpd_tpc(struct ctl_scsiio *ctsio, int alloc_len);
 int ctl_receive_copy_status_lid1(struct ctl_scsiio *ctsio);
 int ctl_receive_copy_failure_details(struct ctl_scsiio *ctsio);
 int ctl_receive_copy_status_lid4(struct ctl_scsiio *ctsio);
 int ctl_receive_copy_operating_parameters(struct ctl_scsiio *ctsio);
 int ctl_extended_copy_lid1(struct ctl_scsiio *ctsio);
 int ctl_extended_copy_lid4(struct ctl_scsiio *ctsio);
 int ctl_copy_operation_abort(struct ctl_scsiio *ctsio);
 int ctl_populate_token(struct ctl_scsiio *ctsio);
 int ctl_write_using_token(struct ctl_scsiio *ctsio);
 int ctl_receive_rod_token_information(struct ctl_scsiio *ctsio);
 int ctl_report_all_rod_tokens(struct ctl_scsiio *ctsio);
 
 #endif	/* _KERNEL */
 
 #endif	/* _CTL_PRIVATE_H_ */
 
 /*
  * vim: ts=8
  */
Index: projects/nfsv42/sys/cam/scsi/scsi_all.c
===================================================================
--- projects/nfsv42/sys/cam/scsi/scsi_all.c	(revision 350367)
+++ projects/nfsv42/sys/cam/scsi/scsi_all.c	(revision 350368)
@@ -1,9251 +1,9252 @@
 /*-
  * Implementation of Utility functions for all SCSI device types.
  *
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 1997, 1998, 1999 Justin T. Gibbs.
  * Copyright (c) 1997, 1998, 2003 Kenneth D. Merry.
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions, and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. The name of the author may not be used to endorse or promote products
  *    derived from this software without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE FOR
  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/types.h>
 #include <sys/stdint.h>
 
 #ifdef _KERNEL
 #include "opt_scsi.h"
 
 #include <sys/systm.h>
 #include <sys/libkern.h>
 #include <sys/kernel.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
 #include <sys/ctype.h>
 #else
 #include <errno.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <ctype.h>
 #endif
 
 #include <cam/cam.h>
 #include <cam/cam_ccb.h>
 #include <cam/cam_queue.h>
 #include <cam/cam_xpt.h>
 #include <cam/scsi/scsi_all.h>
 #include <sys/ata.h>
 #include <sys/sbuf.h>
 
 #ifdef _KERNEL
 #include <cam/cam_periph.h>
 #include <cam/cam_xpt_sim.h>
 #include <cam/cam_xpt_periph.h>
 #include <cam/cam_xpt_internal.h>
 #else
 #include <camlib.h>
 #include <stddef.h>
 
 #ifndef FALSE
 #define FALSE   0
 #endif /* FALSE */
 #ifndef TRUE
 #define TRUE    1
 #endif /* TRUE */
 #define ERESTART        -1              /* restart syscall */
 #define EJUSTRETURN     -2              /* don't modify regs, just return */
 #endif /* !_KERNEL */
 
 /*
  * This is the default number of milliseconds we wait for devices to settle
  * after a SCSI bus reset.
  */
 #ifndef SCSI_DELAY
 #define SCSI_DELAY 2000
 #endif
 /*
  * All devices need _some_ sort of bus settle delay, so we'll set it to
  * a minimum value of 100ms.  Note that this is pertinent only for SPI,
  * not for transports like Fibre Channel or iSCSI, where 'delay' is
  * completely meaningless.
  */
 #ifndef SCSI_MIN_DELAY
 #define SCSI_MIN_DELAY 100
 #endif
 /*
  * Make sure the user isn't using seconds instead of milliseconds.
  */
 #if (SCSI_DELAY < SCSI_MIN_DELAY && SCSI_DELAY != 0)
 #error "SCSI_DELAY is in milliseconds, not seconds!  Please use a larger value"
 #endif
 
 int scsi_delay;
 
 static int	ascentrycomp(const void *key, const void *member);
 static int	senseentrycomp(const void *key, const void *member);
 static void	fetchtableentries(int sense_key, int asc, int ascq,
 				  struct scsi_inquiry_data *,
 				  const struct sense_key_table_entry **,
 				  const struct asc_table_entry **);
 
 #ifdef _KERNEL
 static void	init_scsi_delay(void);
 static int	sysctl_scsi_delay(SYSCTL_HANDLER_ARGS);
 static int	set_scsi_delay(int delay);
 #endif
 
 #if !defined(SCSI_NO_OP_STRINGS)
 
 /*
  * One-bit device-type masks, each built by shifting 1 by a T_* constant.
  * They form the second field of every op_table_entry initializer in this
  * file, and the letters D..F match the device column key in the
  * scsi_op_codes[] header comment.  S (scanner) and C (comm) have no column
  * in that key; the table comment says they were carried over from the
  * previous version of the list.
  */
 #define	D	(1 << T_DIRECT)
 #define	T	(1 << T_SEQUENTIAL)
 #define	L	(1 << T_PRINTER)
 #define	P	(1 << T_PROCESSOR)
 #define	W	(1 << T_WORM)
 #define	R	(1 << T_CDROM)
 #define	O	(1 << T_OPTICAL)
 #define	M	(1 << T_CHANGER)
 #define	A	(1 << T_STORARRAY)
 #define	E	(1 << T_ENCLOSURE)
 #define	B	(1 << T_RBC)
 #define	K	(1 << T_OCRW)
 #define	V	(1 << T_ADC)
 #define	F	(1 << T_OSD)
 #define	S	(1 << T_SCANNER)
 #define	C	(1 << T_COMM)
 
 /* OR of all sixteen masks above: the command applies to every type. */
 #define ALL	(D | T | L | P | W | R | O | M | A | E | B | K | V | F | S | C)
 
 static struct op_table_entry plextor_cd_ops[] = {
 	{ 0xD8, R, "CD-DA READ" }
 };
 
 static struct scsi_op_quirk_entry scsi_op_quirk_table[] = {
 	{
 		/*
 		 * I believe that 0xD8 is the Plextor proprietary command
 		 * to read CD-DA data.  I'm not sure which Plextor CDROM
 		 * models support the command, though.  I know for sure
 		 * that the 4X, 8X, and 12X models do, and presumably the
 		 * 12-20X does.  I don't know about any earlier models,
 		 * though.  If anyone has any more complete information,
 		 * feel free to change this quirk entry.
 		 */
 		{T_CDROM, SIP_MEDIA_REMOVABLE, "PLEXTOR", "CD-ROM PX*", "*"},
 		nitems(plextor_cd_ops),
 		plextor_cd_ops
 	}
 };
 
 static struct op_table_entry scsi_op_codes[] = {
 	/*
 	 * From: http://www.t10.org/lists/op-num.txt
 	 * Modifications by Kenneth Merry (ken@FreeBSD.ORG)
 	 *              and Jung-uk Kim (jkim@FreeBSD.org)
 	 *
 	 * Note:  order is important in this table, scsi_op_desc() currently
 	 * depends on the opcodes in the table being in order to save
 	 * search time.
 	 * Note:  scanner and comm. devices are carried over from the previous
 	 * version because they were removed in the latest spec.
 	 */
 	/* File: OP-NUM.TXT
 	 *
 	 * SCSI Operation Codes
 	 * Numeric Sorted Listing
 	 * as of  5/26/15
 	 *
 	 *     D - DIRECT ACCESS DEVICE (SBC-2)                device column key
 	 *     .T - SEQUENTIAL ACCESS DEVICE (SSC-2)           -----------------
 	 *     . L - PRINTER DEVICE (SSC)                      M = Mandatory
 	 *     .  P - PROCESSOR DEVICE (SPC)                   O = Optional
 	 *     .  .W - WRITE ONCE READ MULTIPLE DEVICE (SBC-2) V = Vendor spec.
 	 *     .  .W - WRITE ONCE READ MULTIPLE DEVICE (SBC-2) V = Vendor spec.
 	 *     .  .  O - OPTICAL MEMORY DEVICE (SBC-2)
 	 *     .  .  .M - MEDIA CHANGER DEVICE (SMC-2)
 	 *     .  .  . A - STORAGE ARRAY DEVICE (SCC-2)
 	 *     .  .  . .E - ENCLOSURE SERVICES DEVICE (SES)
 	 *     .  .  .  .B - SIMPLIFIED DIRECT-ACCESS DEVICE (RBC)
 	 *     .  .  .  . K - OPTICAL CARD READER/WRITER DEVICE (OCRW)
 	 *     .  .  .  .  V - AUTOMATION/DRIVE INTERFACE (ADC)
 	 *     .  .  .  .  .F - OBJECT-BASED STORAGE (OSD)
 	 * OP  DTLPWROMAEBKVF  Description
 	 * --  --------------  ---------------------------------------------- */
 	/* 00  MMMMMMMMMMMMMM  TEST UNIT READY */
 	{ 0x00,	ALL, "TEST UNIT READY" },
 	/* 01   M              REWIND */
 	{ 0x01,	T, "REWIND" },
 	/* 01  Z V ZZZZ        REZERO UNIT */
 	{ 0x01,	D | W | R | O | M, "REZERO UNIT" },
 	/* 02  VVVVVV V */
 	/* 03  MMMMMMMMMMOMMM  REQUEST SENSE */
 	{ 0x03,	ALL, "REQUEST SENSE" },
 	/* 04  M    OO         FORMAT UNIT */
 	{ 0x04,	D | R | O, "FORMAT UNIT" },
 	/* 04   O              FORMAT MEDIUM */
 	{ 0x04,	T, "FORMAT MEDIUM" },
 	/* 04    O             FORMAT */
 	{ 0x04,	L, "FORMAT" },
 	/* 05  VMVVVV V        READ BLOCK LIMITS */
 	{ 0x05,	T, "READ BLOCK LIMITS" },
 	/* 06  VVVVVV V */
 	/* 07  OVV O OV        REASSIGN BLOCKS */
 	{ 0x07,	D | W | O, "REASSIGN BLOCKS" },
 	/* 07         O        INITIALIZE ELEMENT STATUS */
 	{ 0x07,	M, "INITIALIZE ELEMENT STATUS" },
 	/* 08  MOV O OV        READ(6) */
 	{ 0x08,	D | T | W | O, "READ(6)" },
 	/* 08     O            RECEIVE */
 	{ 0x08,	P, "RECEIVE" },
 	/* 08                  GET MESSAGE(6) */
 	{ 0x08, C, "GET MESSAGE(6)" },
 	/* 09  VVVVVV V */
 	/* 0A  OO  O OV        WRITE(6) */
 	{ 0x0A,	D | T | W | O, "WRITE(6)" },
 	/* 0A     M            SEND(6) */
 	{ 0x0A,	P, "SEND(6)" },
 	/* 0A                  SEND MESSAGE(6) */
 	{ 0x0A, C, "SEND MESSAGE(6)" },
 	/* 0A    M             PRINT */
 	{ 0x0A,	L, "PRINT" },
 	/* 0B  Z   ZOZV        SEEK(6) */
 	{ 0x0B,	D | W | R | O, "SEEK(6)" },
 	/* 0B   O              SET CAPACITY */
 	{ 0x0B,	T, "SET CAPACITY" },
 	/* 0B    O             SLEW AND PRINT */
 	{ 0x0B,	L, "SLEW AND PRINT" },
 	/* 0C  VVVVVV V */
 	/* 0D  VVVVVV V */
 	/* 0E  VVVVVV V */
 	/* 0F  VOVVVV V        READ REVERSE(6) */
 	{ 0x0F,	T, "READ REVERSE(6)" },
 	/* 10  VM VVV          WRITE FILEMARKS(6) */
 	{ 0x10,	T, "WRITE FILEMARKS(6)" },
 	/* 10    O             SYNCHRONIZE BUFFER */
 	{ 0x10,	L, "SYNCHRONIZE BUFFER" },
 	/* 11  VMVVVV          SPACE(6) */
 	{ 0x11,	T, "SPACE(6)" },
 	/* 12  MMMMMMMMMMMMMM  INQUIRY */
 	{ 0x12,	ALL, "INQUIRY" },
 	/* 13  V VVVV */
 	/* 13   O              VERIFY(6) */
 	{ 0x13,	T, "VERIFY(6)" },
 	/* 14  VOOVVV          RECOVER BUFFERED DATA */
 	{ 0x14,	T | L, "RECOVER BUFFERED DATA" },
 	/* 15  OMO O OOOO OO   MODE SELECT(6) */
 	{ 0x15,	ALL & ~(P | R | B | F), "MODE SELECT(6)" },
 	/* 16  ZZMZO OOOZ O    RESERVE(6) */
 	{ 0x16,	ALL & ~(R | B | V | F | C), "RESERVE(6)" },
 	/* 16         Z        RESERVE ELEMENT(6) */
 	{ 0x16,	M, "RESERVE ELEMENT(6)" },
 	/* 17  ZZMZO OOOZ O    RELEASE(6) */
 	{ 0x17,	ALL & ~(R | B | V | F | C), "RELEASE(6)" },
 	/* 17         Z        RELEASE ELEMENT(6) */
 	{ 0x17,	M, "RELEASE ELEMENT(6)" },
 	/* 18  ZZZZOZO    Z    COPY */
 	{ 0x18,	D | T | L | P | W | R | O | K | S, "COPY" },
 	/* 19  VMVVVV          ERASE(6) */
 	{ 0x19,	T, "ERASE(6)" },
 	/* 1A  OMO O OOOO OO   MODE SENSE(6) */
 	{ 0x1A,	ALL & ~(P | R | B | F), "MODE SENSE(6)" },
 	/* 1B  O   OOO O MO O  START STOP UNIT */
 	{ 0x1B,	D | W | R | O | A | B | K | F, "START STOP UNIT" },
 	/* 1B   O          M   LOAD UNLOAD */
 	{ 0x1B,	T | V, "LOAD UNLOAD" },
 	/* 1B                  SCAN */
 	{ 0x1B, S, "SCAN" },
 	/* 1B    O             STOP PRINT */
 	{ 0x1B,	L, "STOP PRINT" },
 	/* 1B         O        OPEN/CLOSE IMPORT/EXPORT ELEMENT */
 	{ 0x1B,	M, "OPEN/CLOSE IMPORT/EXPORT ELEMENT" },
 	/* 1C  OOOOO OOOM OOO  RECEIVE DIAGNOSTIC RESULTS */
 	{ 0x1C,	ALL & ~(R | B), "RECEIVE DIAGNOSTIC RESULTS" },
 	/* 1D  MMMMM MMOM MMM  SEND DIAGNOSTIC */
 	{ 0x1D,	ALL & ~(R | B), "SEND DIAGNOSTIC" },
 	/* 1E  OO  OOOO   O O  PREVENT ALLOW MEDIUM REMOVAL */
 	{ 0x1E,	D | T | W | R | O | M | K | F, "PREVENT ALLOW MEDIUM REMOVAL" },
 	/* 1F */
 	/* 20  V   VVV    V */
 	/* 21  V   VVV    V */
 	/* 22  V   VVV    V */
 	/* 23  V   V V    V */
 	/* 23       O          READ FORMAT CAPACITIES */
 	{ 0x23,	R, "READ FORMAT CAPACITIES" },
 	/* 24  V   VV          SET WINDOW */
 	{ 0x24, S, "SET WINDOW" },
 	/* 25  M   M M   M     READ CAPACITY(10) */
 	{ 0x25,	D | W | O | B, "READ CAPACITY(10)" },
 	/* 25       O          READ CAPACITY */
 	{ 0x25,	R, "READ CAPACITY" },
 	/* 25             M    READ CARD CAPACITY */
 	{ 0x25,	K, "READ CARD CAPACITY" },
 	/* 25                  GET WINDOW */
 	{ 0x25, S, "GET WINDOW" },
 	/* 26  V   VV */
 	/* 27  V   VV */
 	/* 28  M   MOM   MM    READ(10) */
 	{ 0x28,	D | W | R | O | B | K | S, "READ(10)" },
 	/* 28                  GET MESSAGE(10) */
 	{ 0x28, C, "GET MESSAGE(10)" },
 	/* 29  V   VVO         READ GENERATION */
 	{ 0x29,	O, "READ GENERATION" },
 	/* 2A  O   MOM   MO    WRITE(10) */
 	{ 0x2A,	D | W | R | O | B | K, "WRITE(10)" },
 	/* 2A                  SEND(10) */
 	{ 0x2A, S, "SEND(10)" },
 	/* 2A                  SEND MESSAGE(10) */
 	{ 0x2A, C, "SEND MESSAGE(10)" },
 	/* 2B  Z   OOO    O    SEEK(10) */
 	{ 0x2B,	D | W | R | O | K, "SEEK(10)" },
 	/* 2B   O              LOCATE(10) */
 	{ 0x2B,	T, "LOCATE(10)" },
 	/* 2B         O        POSITION TO ELEMENT */
 	{ 0x2B,	M, "POSITION TO ELEMENT" },
 	/* 2C  V    OO         ERASE(10) */
 	{ 0x2C,	R | O, "ERASE(10)" },
 	/* 2D        O         READ UPDATED BLOCK */
 	{ 0x2D,	O, "READ UPDATED BLOCK" },
 	/* 2D  V */
 	/* 2E  O   OOO   MO    WRITE AND VERIFY(10) */
 	{ 0x2E,	D | W | R | O | B | K, "WRITE AND VERIFY(10)" },
 	/* 2F  O   OOO         VERIFY(10) */
 	{ 0x2F,	D | W | R | O, "VERIFY(10)" },
 	/* 30  Z   ZZZ         SEARCH DATA HIGH(10) */
 	{ 0x30,	D | W | R | O, "SEARCH DATA HIGH(10)" },
 	/* 31  Z   ZZZ         SEARCH DATA EQUAL(10) */
 	{ 0x31,	D | W | R | O, "SEARCH DATA EQUAL(10)" },
 	/* 31                  OBJECT POSITION */
 	{ 0x31, S, "OBJECT POSITION" },
 	/* 32  Z   ZZZ         SEARCH DATA LOW(10) */
 	{ 0x32,	D | W | R | O, "SEARCH DATA LOW(10)" },
 	/* 33  Z   OZO         SET LIMITS(10) */
 	{ 0x33,	D | W | R | O, "SET LIMITS(10)" },
 	/* 34  O   O O    O    PRE-FETCH(10) */
 	{ 0x34,	D | W | O | K, "PRE-FETCH(10)" },
 	/* 34   M              READ POSITION */
 	{ 0x34,	T, "READ POSITION" },
 	/* 34                  GET DATA BUFFER STATUS */
 	{ 0x34, S, "GET DATA BUFFER STATUS" },
 	/* 35  O   OOO   MO    SYNCHRONIZE CACHE(10) */
 	{ 0x35,	D | W | R | O | B | K, "SYNCHRONIZE CACHE(10)" },
 	/* 36  Z   O O    O    LOCK UNLOCK CACHE(10) */
 	{ 0x36,	D | W | O | K, "LOCK UNLOCK CACHE(10)" },
 	/* 37  O     O         READ DEFECT DATA(10) */
 	{ 0x37,	D | O, "READ DEFECT DATA(10)" },
 	/* 37         O        INITIALIZE ELEMENT STATUS WITH RANGE */
 	{ 0x37,	M, "INITIALIZE ELEMENT STATUS WITH RANGE" },
 	/* 38      O O    O    MEDIUM SCAN */
 	{ 0x38,	W | O | K, "MEDIUM SCAN" },
 	/* 39  ZZZZOZO    Z    COMPARE */
 	{ 0x39,	D | T | L | P | W | R | O | K | S, "COMPARE" },
 	/* 3A  ZZZZOZO    Z    COPY AND VERIFY */
 	{ 0x3A,	D | T | L | P | W | R | O | K | S, "COPY AND VERIFY" },
 	/* 3B  OOOOOOOOOOMOOO  WRITE BUFFER */
 	{ 0x3B,	ALL, "WRITE BUFFER" },
 	/* 3C  OOOOOOOOOO OOO  READ BUFFER */
 	{ 0x3C,	ALL & ~(B), "READ BUFFER" },
 	/* 3D        O         UPDATE BLOCK */
 	{ 0x3D,	O, "UPDATE BLOCK" },
 	/* 3E  O   O O         READ LONG(10) */
 	{ 0x3E,	D | W | O, "READ LONG(10)" },
 	/* 3F  O   O O         WRITE LONG(10) */
 	{ 0x3F,	D | W | O, "WRITE LONG(10)" },
 	/* 40  ZZZZOZOZ        CHANGE DEFINITION */
 	{ 0x40,	D | T | L | P | W | R | O | M | S | C, "CHANGE DEFINITION" },
 	/* 41  O               WRITE SAME(10) */
 	{ 0x41,	D, "WRITE SAME(10)" },
-	/* 42       O          UNMAP */
+	/* 42  O               UNMAP */
 	{ 0x42,	D, "UNMAP" },
 	/* 42       O          READ SUB-CHANNEL */
 	{ 0x42,	R, "READ SUB-CHANNEL" },
 	/* 43       O          READ TOC/PMA/ATIP */
 	{ 0x43,	R, "READ TOC/PMA/ATIP" },
 	/* 44   M          M   REPORT DENSITY SUPPORT */
 	{ 0x44,	T | V, "REPORT DENSITY SUPPORT" },
 	/* 44                  READ HEADER */
 	/* 45       O          PLAY AUDIO(10) */
 	{ 0x45,	R, "PLAY AUDIO(10)" },
 	/* 46       M          GET CONFIGURATION */
 	{ 0x46,	R, "GET CONFIGURATION" },
 	/* 47       O          PLAY AUDIO MSF */
 	{ 0x47,	R, "PLAY AUDIO MSF" },
-	/* 48 */
+	/* 48  O               SANITIZE */
+	{ 0x48,	D, "SANITIZE" },
 	/* 49 */
 	/* 4A       M          GET EVENT STATUS NOTIFICATION */
 	{ 0x4A,	R, "GET EVENT STATUS NOTIFICATION" },
 	/* 4B       O          PAUSE/RESUME */
 	{ 0x4B,	R, "PAUSE/RESUME" },
 	/* 4C  OOOOO OOOO OOO  LOG SELECT */
 	{ 0x4C,	ALL & ~(R | B), "LOG SELECT" },
 	/* 4D  OOOOO OOOO OMO  LOG SENSE */
 	{ 0x4D,	ALL & ~(R | B), "LOG SENSE" },
 	/* 4E       O          STOP PLAY/SCAN */
 	{ 0x4E,	R, "STOP PLAY/SCAN" },
 	/* 4F */
 	/* 50  O               XDWRITE(10) */
 	{ 0x50,	D, "XDWRITE(10)" },
 	/* 51  O               XPWRITE(10) */
 	{ 0x51,	D, "XPWRITE(10)" },
 	/* 51       O          READ DISC INFORMATION */
 	{ 0x51,	R, "READ DISC INFORMATION" },
 	/* 52  O               XDREAD(10) */
 	{ 0x52,	D, "XDREAD(10)" },
 	/* 52       O          READ TRACK INFORMATION */
 	{ 0x52,	R, "READ TRACK INFORMATION" },
 	/* 53       O          RESERVE TRACK */
 	{ 0x53,	R, "RESERVE TRACK" },
 	/* 54       O          SEND OPC INFORMATION */
 	{ 0x54,	R, "SEND OPC INFORMATION" },
 	/* 55  OOO OMOOOOMOMO  MODE SELECT(10) */
 	{ 0x55,	ALL & ~(P), "MODE SELECT(10)" },
 	/* 56  ZZMZO OOOZ      RESERVE(10) */
 	{ 0x56,	ALL & ~(R | B | K | V | F | C), "RESERVE(10)" },
 	/* 56         Z        RESERVE ELEMENT(10) */
 	{ 0x56,	M, "RESERVE ELEMENT(10)" },
 	/* 57  ZZMZO OOOZ      RELEASE(10) */
 	{ 0x57,	ALL & ~(R | B | K | V | F | C), "RELEASE(10)" },
 	/* 57         Z        RELEASE ELEMENT(10) */
 	{ 0x57,	M, "RELEASE ELEMENT(10)" },
 	/* 58       O          REPAIR TRACK */
 	{ 0x58,	R, "REPAIR TRACK" },
 	/* 59 */
 	/* 5A  OOO OMOOOOMOMO  MODE SENSE(10) */
 	{ 0x5A,	ALL & ~(P), "MODE SENSE(10)" },
 	/* 5B       O          CLOSE TRACK/SESSION */
 	{ 0x5B,	R, "CLOSE TRACK/SESSION" },
 	/* 5C       O          READ BUFFER CAPACITY */
 	{ 0x5C,	R, "READ BUFFER CAPACITY" },
 	/* 5D       O          SEND CUE SHEET */
 	{ 0x5D,	R, "SEND CUE SHEET" },
 	/* 5E  OOOOO OOOO   M  PERSISTENT RESERVE IN */
 	{ 0x5E,	ALL & ~(R | B | K | V | C), "PERSISTENT RESERVE IN" },
 	/* 5F  OOOOO OOOO   M  PERSISTENT RESERVE OUT */
 	{ 0x5F,	ALL & ~(R | B | K | V | C), "PERSISTENT RESERVE OUT" },
 	/* 7E  OO   O OOOO O   extended CDB */
 	{ 0x7E,	D | T | R | M | A | E | B | V, "extended CDB" },
 	/* 7F  O            M  variable length CDB (more than 16 bytes) */
 	{ 0x7F,	D | F, "variable length CDB (more than 16 bytes)" },
 	/* 80  Z               XDWRITE EXTENDED(16) */
 	{ 0x80,	D, "XDWRITE EXTENDED(16)" },
 	/* 80   M              WRITE FILEMARKS(16) */
 	{ 0x80,	T, "WRITE FILEMARKS(16)" },
 	/* 81  Z               REBUILD(16) */
 	{ 0x81,	D, "REBUILD(16)" },
 	/* 81   O              READ REVERSE(16) */
 	{ 0x81,	T, "READ REVERSE(16)" },
 	/* 82  Z               REGENERATE(16) */
 	{ 0x82,	D, "REGENERATE(16)" },
 	/* 83  OOOOO O    OO   EXTENDED COPY */
 	{ 0x83,	D | T | L | P | W | O | K | V, "EXTENDED COPY" },
 	/* 84  OOOOO O    OO   RECEIVE COPY RESULTS */
 	{ 0x84,	D | T | L | P | W | O | K | V, "RECEIVE COPY RESULTS" },
 	/* 85  O    O    O     ATA COMMAND PASS THROUGH(16) */
 	{ 0x85,	D | R | B, "ATA COMMAND PASS THROUGH(16)" },
 	/* 86  OO OO OOOOOOO   ACCESS CONTROL IN */
 	{ 0x86,	ALL & ~(L | R | F), "ACCESS CONTROL IN" },
 	/* 87  OO OO OOOOOOO   ACCESS CONTROL OUT */
 	{ 0x87,	ALL & ~(L | R | F), "ACCESS CONTROL OUT" },
 	/* 88  MM  O O   O     READ(16) */
 	{ 0x88,	D | T | W | O | B, "READ(16)" },
 	/* 89  O               COMPARE AND WRITE*/
 	{ 0x89,	D, "COMPARE AND WRITE" },
 	/* 8A  OM  O O   O     WRITE(16) */
 	{ 0x8A,	D | T | W | O | B, "WRITE(16)" },
 	/* 8B  O               ORWRITE */
 	{ 0x8B,	D, "ORWRITE" },
 	/* 8C  OO  O OO  O M   READ ATTRIBUTE */
 	{ 0x8C,	D | T | W | O | M | B | V, "READ ATTRIBUTE" },
 	/* 8D  OO  O OO  O O   WRITE ATTRIBUTE */
 	{ 0x8D,	D | T | W | O | M | B | V, "WRITE ATTRIBUTE" },
 	/* 8E  O   O O   O     WRITE AND VERIFY(16) */
 	{ 0x8E,	D | W | O | B, "WRITE AND VERIFY(16)" },
 	/* 8F  OO  O O   O     VERIFY(16) */
 	{ 0x8F,	D | T | W | O | B, "VERIFY(16)" },
 	/* 90  O   O O   O     PRE-FETCH(16) */
 	{ 0x90,	D | W | O | B, "PRE-FETCH(16)" },
 	/* 91  O   O O   O     SYNCHRONIZE CACHE(16) */
 	{ 0x91,	D | W | O | B, "SYNCHRONIZE CACHE(16)" },
 	/* 91   O              SPACE(16) */
 	{ 0x91,	T, "SPACE(16)" },
 	/* 92  Z   O O         LOCK UNLOCK CACHE(16) */
 	{ 0x92,	D | W | O, "LOCK UNLOCK CACHE(16)" },
 	/* 92   O              LOCATE(16) */
 	{ 0x92,	T, "LOCATE(16)" },
 	/* 93  O               WRITE SAME(16) */
 	{ 0x93,	D, "WRITE SAME(16)" },
 	/* 93   M              ERASE(16) */
 	{ 0x93,	T, "ERASE(16)" },
 	/* 94  O               ZBC OUT */
 	{ 0x94,	ALL, "ZBC OUT" },
 	/* 95  O               ZBC IN */
 	{ 0x95,	ALL, "ZBC IN" },
 	/* 96 */
 	/* 97 */
 	/* 98 */
 	/* 99 */
 	/* 9A  O               WRITE STREAM(16) */
 	{ 0x9A,	D, "WRITE STREAM(16)" },
 	/* 9B  OOOOOOOOOO OOO  READ BUFFER(16) */
 	{ 0x9B,	ALL & ~(B) , "READ BUFFER(16)" },
 	/* 9C  O              WRITE ATOMIC(16) */
 	{ 0x9C, D, "WRITE ATOMIC(16)" },
 	/* 9D                  SERVICE ACTION BIDIRECTIONAL */
 	{ 0x9D, ALL, "SERVICE ACTION BIDIRECTIONAL" },
 	/* XXX KDM ALL for this?  op-num.txt defines it for none.. */
 	/* 9E                  SERVICE ACTION IN(16) */
 	{ 0x9E, ALL, "SERVICE ACTION IN(16)" },
 	/* 9F              M   SERVICE ACTION OUT(16) */
 	{ 0x9F,	ALL, "SERVICE ACTION OUT(16)" },
 	/* A0  MMOOO OMMM OMO  REPORT LUNS */
 	{ 0xA0,	ALL & ~(R | B), "REPORT LUNS" },
 	/* A1       O          BLANK */
 	{ 0xA1,	R, "BLANK" },
 	/* A1  O         O     ATA COMMAND PASS THROUGH(12) */
 	{ 0xA1,	D | B, "ATA COMMAND PASS THROUGH(12)" },
 	/* A2  OO   O      O   SECURITY PROTOCOL IN */
 	{ 0xA2,	D | T | R | V, "SECURITY PROTOCOL IN" },
 	/* A3  OOO O OOMOOOM   MAINTENANCE (IN) */
 	{ 0xA3,	ALL & ~(P | R | F), "MAINTENANCE (IN)" },
 	/* A3       O          SEND KEY */
 	{ 0xA3,	R, "SEND KEY" },
 	/* A4  OOO O OOOOOOO   MAINTENANCE (OUT) */
 	{ 0xA4,	ALL & ~(P | R | F), "MAINTENANCE (OUT)" },
 	/* A4       O          REPORT KEY */
 	{ 0xA4,	R, "REPORT KEY" },
 	/* A5   O  O OM        MOVE MEDIUM */
 	{ 0xA5,	T | W | O | M, "MOVE MEDIUM" },
 	/* A5       O          PLAY AUDIO(12) */
 	{ 0xA5,	R, "PLAY AUDIO(12)" },
 	/* A6         O        EXCHANGE MEDIUM */
 	{ 0xA6,	M, "EXCHANGE MEDIUM" },
 	/* A6       O          LOAD/UNLOAD C/DVD */
 	{ 0xA6,	R, "LOAD/UNLOAD C/DVD" },
 	/* A7  ZZ  O O         MOVE MEDIUM ATTACHED */
 	{ 0xA7,	D | T | W | O, "MOVE MEDIUM ATTACHED" },
 	/* A7       O          SET READ AHEAD */
 	{ 0xA7,	R, "SET READ AHEAD" },
 	/* A8  O   OOO         READ(12) */
 	{ 0xA8,	D | W | R | O, "READ(12)" },
 	/* A8                  GET MESSAGE(12) */
 	{ 0xA8, C, "GET MESSAGE(12)" },
 	/* A9              O   SERVICE ACTION OUT(12) */
 	{ 0xA9,	V, "SERVICE ACTION OUT(12)" },
 	/* AA  O   OOO         WRITE(12) */
 	{ 0xAA,	D | W | R | O, "WRITE(12)" },
 	/* AA                  SEND MESSAGE(12) */
 	{ 0xAA, C, "SEND MESSAGE(12)" },
 	/* AB       O      O   SERVICE ACTION IN(12) */
 	{ 0xAB,	R | V, "SERVICE ACTION IN(12)" },
 	/* AC        O         ERASE(12) */
 	{ 0xAC,	O, "ERASE(12)" },
 	/* AC       O          GET PERFORMANCE */
 	{ 0xAC,	R, "GET PERFORMANCE" },
 	/* AD       O          READ DVD STRUCTURE */
 	{ 0xAD,	R, "READ DVD STRUCTURE" },
 	/* AE  O   O O         WRITE AND VERIFY(12) */
 	{ 0xAE,	D | W | O, "WRITE AND VERIFY(12)" },
 	/* AF  O   OZO         VERIFY(12) */
 	{ 0xAF,	D | W | R | O, "VERIFY(12)" },
 	/* B0      ZZZ         SEARCH DATA HIGH(12) */
 	{ 0xB0,	W | R | O, "SEARCH DATA HIGH(12)" },
 	/* B1      ZZZ         SEARCH DATA EQUAL(12) */
 	{ 0xB1,	W | R | O, "SEARCH DATA EQUAL(12)" },
 	/* B2      ZZZ         SEARCH DATA LOW(12) */
 	{ 0xB2,	W | R | O, "SEARCH DATA LOW(12)" },
 	/* B3  Z   OZO         SET LIMITS(12) */
 	{ 0xB3,	D | W | R | O, "SET LIMITS(12)" },
 	/* B4  ZZ  OZO         READ ELEMENT STATUS ATTACHED */
 	{ 0xB4,	D | T | W | R | O, "READ ELEMENT STATUS ATTACHED" },
 	/* B5  OO   O      O   SECURITY PROTOCOL OUT */
 	{ 0xB5,	D | T | R | V, "SECURITY PROTOCOL OUT" },
 	/* B5         O        REQUEST VOLUME ELEMENT ADDRESS */
 	{ 0xB5,	M, "REQUEST VOLUME ELEMENT ADDRESS" },
 	/* B6         O        SEND VOLUME TAG */
 	{ 0xB6,	M, "SEND VOLUME TAG" },
 	/* B6       O          SET STREAMING */
 	{ 0xB6,	R, "SET STREAMING" },
 	/* B7  O     O         READ DEFECT DATA(12) */
 	{ 0xB7,	D | O, "READ DEFECT DATA(12)" },
 	/* B8   O  OZOM        READ ELEMENT STATUS */
 	{ 0xB8,	T | W | R | O | M, "READ ELEMENT STATUS" },
 	/* B9       O          READ CD MSF */
 	{ 0xB9,	R, "READ CD MSF" },
 	/* BA  O   O OOMO      REDUNDANCY GROUP (IN) */
 	{ 0xBA,	D | W | O | M | A | E, "REDUNDANCY GROUP (IN)" },
 	/* BA       O          SCAN */
 	{ 0xBA,	R, "SCAN" },
 	/* BB  O   O OOOO      REDUNDANCY GROUP (OUT) */
 	{ 0xBB,	D | W | O | M | A | E, "REDUNDANCY GROUP (OUT)" },
 	/* BB       O          SET CD SPEED */
 	{ 0xBB,	R, "SET CD SPEED" },
 	/* BC  O   O OOMO      SPARE (IN) */
 	{ 0xBC,	D | W | O | M | A | E, "SPARE (IN)" },
 	/* BD  O   O OOOO      SPARE (OUT) */
 	{ 0xBD,	D | W | O | M | A | E, "SPARE (OUT)" },
 	/* BD       O          MECHANISM STATUS */
 	{ 0xBD,	R, "MECHANISM STATUS" },
 	/* BE  O   O OOMO      VOLUME SET (IN) */
 	{ 0xBE,	D | W | O | M | A | E, "VOLUME SET (IN)" },
 	/* BE       O          READ CD */
 	{ 0xBE,	R, "READ CD" },
 	/* BF  O   O OOOO      VOLUME SET (OUT) */
 	{ 0xBF,	D | W | O | M | A | E, "VOLUME SET (OUT)" },
 	/* BF       O          SEND DVD STRUCTURE */
 	{ 0xBF,	R, "SEND DVD STRUCTURE" }
 };
 
 /*
  * Return a human-readable description of a SCSI operation code.
  *
  * opcode   - operation code to describe.
  * inq_data - inquiry data of the device the command is for, or NULL.  With
  *            NULL the device is treated as direct access and no quirk
  *            lookup is performed.
  *
  * Returns the description of the first table entry whose opcode matches
  * and whose device-type mask includes the device's type, or the literal
  * "Vendor Specific Command" when no entry applies.
  */
 const char *
 scsi_op_desc(u_int16_t opcode, struct scsi_inquiry_data *inq_data)
 {
 	caddr_t match;
 	int i, j;
 	u_int32_t opmask;
 	u_int16_t pd_type;
 	int num_ops[2];
 	struct op_table_entry *table[2];
 	int num_tables;
 
 	/*
 	 * If we've got inquiry data, use it to determine what type of
 	 * device we're dealing with here.  Otherwise, assume direct
 	 * access.
 	 */
 	if (inq_data == NULL) {
 		pd_type = T_DIRECT;
 		match = NULL;
 	} else {
 		pd_type = SID_TYPE(inq_data);
 
 		match = cam_quirkmatch((caddr_t)inq_data,
 				       (caddr_t)scsi_op_quirk_table,
 				       nitems(scsi_op_quirk_table),
 				       sizeof(*scsi_op_quirk_table),
 				       scsi_inquiry_match);
 	}
 
 	if (match != NULL) {
 		/* The quirk's own table is searched before the generic one. */
 		table[0] = ((struct scsi_op_quirk_entry *)match)->op_table;
 		num_ops[0] = ((struct scsi_op_quirk_entry *)match)->num_ops;
 		table[1] = scsi_op_codes;
 		num_ops[1] = nitems(scsi_op_codes);
 		num_tables = 2;
 	} else {
 		/*
 		 * Without a quirk table, opcodes above 0xBF and in the
 		 * range 0x60-0x7D have no entry in scsi_op_codes, so they
 		 * are reported as vendor specific without searching.
 		 * 0x7E (extended CDB) and 0x7F (variable length CDB) do
 		 * have entries and must reach the table search below.
 		 */
 		if ((opcode > 0xBF) || ((opcode > 0x5F) && (opcode < 0x7E)))
 			return("Vendor Specific Command");
 
 		table[0] = scsi_op_codes;
 		num_ops[0] = nitems(scsi_op_codes);
 		num_tables = 1;
 	}
 
 	/* RBC is 'Simplified' Direct Access Device */
 	if (pd_type == T_RBC)
 		pd_type = T_DIRECT;
 
 	/*
 	 * Host managed drives are direct access for the most part.
 	 */
 	if (pd_type == T_ZBC_HM)
 		pd_type = T_DIRECT;
 
 	/* Map NODEVICE to Direct Access Device to handle REPORT LUNS, etc. */
 	if (pd_type == T_NODEVICE)
 		pd_type = T_DIRECT;
 
 	/* Shift an unsigned one so that a high type bit cannot overflow int. */
 	opmask = (u_int32_t)1 << pd_type;
 
 	/*
 	 * The inner scan stops at the first entry whose opcode is greater
 	 * than the one sought, so each table is expected to be ordered by
 	 * ascending opcode.
 	 */
 	for (j = 0; j < num_tables; j++) {
 		for (i = 0; i < num_ops[j] && table[j][i].opcode <= opcode;
 		    i++) {
 			if ((table[j][i].opcode == opcode)
 			 && ((table[j][i].opmask & opmask) != 0))
 				return(table[j][i].desc);
 		}
 	}
 
 	/*
 	 * If we can't find a match for the command in the table, we just
 	 * assume it's a vendor specific command.
 	 */
 	return("Vendor Specific Command");
 }
 
 #else /* SCSI_NO_OP_STRINGS */
 
 /*
  * Variant built when opcode description strings are compiled out
  * (SCSI_NO_OP_STRINGS): both arguments are ignored and the empty string
  * is returned for every opcode.
  */
 const char *
 scsi_op_desc(u_int16_t opcode, struct scsi_inquiry_data *inq_data)
 {
 	(void)opcode;
 	(void)inq_data;
 
 	return ("");
 }
 
 #endif
 
 
 /*
  * SST() expands to the four comma-separated initializer values of a sense
  * table entry: asc, ascq, action and a description.  When
  * SCSI_NO_SENSE_STRINGS is defined the description argument is discarded
  * and every entry refers to the shared empty_string instead.
  */
 #if !defined(SCSI_NO_SENSE_STRINGS)
 #define SST(asc, ascq, action, desc) \
 	asc, ascq, action, desc
 #else 
 /* Single empty description shared by all entries when strings are off. */
 const char empty_string[] = "";
 
 #define SST(asc, ascq, action, desc) \
 	asc, ascq, action, empty_string
 #endif 
 
 /*
  * Sense key table.  Each entry lists an SSD_KEY_* sense key, an SS_* code
  * (optionally OR'd with an SSQ_* flag or an errno value), and the sense
  * key's name as a string.
  * NOTE(review): the second field is presumably the default action for that
  * sense key; the struct definition and the lookup code are not visible
  * here -- confirm.
  */
 const struct sense_key_table_entry sense_key_table[] = 
 {
 	{ SSD_KEY_NO_SENSE, SS_NOP, "NO SENSE" },
 	{ SSD_KEY_RECOVERED_ERROR, SS_NOP|SSQ_PRINT_SENSE, "RECOVERED ERROR" },
 	{ SSD_KEY_NOT_READY, SS_RDEF, "NOT READY" },
 	{ SSD_KEY_MEDIUM_ERROR, SS_RDEF, "MEDIUM ERROR" },
 	{ SSD_KEY_HARDWARE_ERROR, SS_RDEF, "HARDWARE FAILURE" },
 	{ SSD_KEY_ILLEGAL_REQUEST, SS_FATAL|EINVAL, "ILLEGAL REQUEST" },
 	{ SSD_KEY_UNIT_ATTENTION, SS_FATAL|ENXIO, "UNIT ATTENTION" },
 	{ SSD_KEY_DATA_PROTECT, SS_FATAL|EACCES, "DATA PROTECT" },
 	{ SSD_KEY_BLANK_CHECK, SS_FATAL|ENOSPC, "BLANK CHECK" },
 	{ SSD_KEY_Vendor_Specific, SS_FATAL|EIO, "Vendor Specific" },
 	{ SSD_KEY_COPY_ABORTED, SS_FATAL|EIO, "COPY ABORTED" },
 	{ SSD_KEY_ABORTED_COMMAND, SS_RDEF, "ABORTED COMMAND" },
 	{ SSD_KEY_EQUAL, SS_NOP, "EQUAL" },
 	{ SSD_KEY_VOLUME_OVERFLOW, SS_FATAL|EIO, "VOLUME OVERFLOW" },
 	{ SSD_KEY_MISCOMPARE, SS_NOP, "MISCOMPARE" },
 	{ SSD_KEY_COMPLETED, SS_NOP, "COMPLETED" }
 };
 
 /*
  * ASC/ASCQ entries used by the QUANTUM FIREBALL quirk in
  * sense_quirk_table.  The single entry covers ASC 0x04 / ASCQ 0x0b with
  * the action SS_START | SSQ_DECREMENT_COUNT | ENXIO.
  */
 static struct asc_table_entry quantum_fireball_entries[] = {
 	{ SST(0x04, 0x0b, SS_START | SSQ_DECREMENT_COUNT | ENXIO, 
 	     "Logical unit not ready, initializing cmd. required") }
 };
 
 /*
  * ASC/ASCQ entries used by the SONY SMO-* quirk in sense_quirk_table.
  * The single entry covers ASC 0x04 / ASCQ 0x00 with the action
  * SS_START | SSQ_DECREMENT_COUNT | ENXIO.
  */
 static struct asc_table_entry sony_mo_entries[] = {
 	{ SST(0x04, 0x00, SS_START | SSQ_DECREMENT_COUNT | ENXIO,
 	     "Logical unit not ready, cause not reportable") }
 };
 
 /*
  * ASC/ASCQ entries used by the HGST quirk in sense_quirk_table.  Each
  * entry is SST(asc, ascq, action, description); every entry here uses the
  * SS_RDEF action.
  */
 static struct asc_table_entry hgst_entries[] = {
 	{ SST(0x04, 0xF0, SS_RDEF,
 	    "Vendor Unique - Logical Unit Not Ready") },
 	{ SST(0x0A, 0x01, SS_RDEF,
 	    "Unrecovered Super Certification Log Write Error") },
 	{ SST(0x0A, 0x02, SS_RDEF,
 	    "Unrecovered Super Certification Log Read Error") },
 	{ SST(0x15, 0x03, SS_RDEF,
 	    "Unrecovered Sector Error") },
 	{ SST(0x3E, 0x04, SS_RDEF,
 	    "Unrecovered Self-Test Hard-Cache Test Fail") },
 	{ SST(0x3E, 0x05, SS_RDEF,
 	    "Unrecovered Self-Test OTF-Cache Fail") },
 	{ SST(0x40, 0x00, SS_RDEF,
 	    "Unrecovered SAT No Buffer Overflow Error") },
 	{ SST(0x40, 0x01, SS_RDEF,
 	    "Unrecovered SAT Buffer Overflow Error") },
 	{ SST(0x40, 0x02, SS_RDEF,
 	    "Unrecovered SAT No Buffer Overflow With ECS Fault") },
 	{ SST(0x40, 0x03, SS_RDEF,
 	    "Unrecovered SAT Buffer Overflow With ECS Fault") },
 	{ SST(0x40, 0x81, SS_RDEF,
 	    "DRAM Failure") },
 	{ SST(0x44, 0x0B, SS_RDEF,
 	    "Vendor Unique - Internal Target Failure") },
 	{ SST(0x44, 0xF2, SS_RDEF,
 	    "Vendor Unique - Internal Target Failure") },
 	{ SST(0x44, 0xF6, SS_RDEF,
 	    "Vendor Unique - Internal Target Failure") },
 	{ SST(0x44, 0xF9, SS_RDEF,
 	    "Vendor Unique - Internal Target Failure") },
 	{ SST(0x44, 0xFA, SS_RDEF,
 	    "Vendor Unique - Internal Target Failure") },
 	{ SST(0x5D, 0x22, SS_RDEF,
 	    "Extreme Over-Temperature Warning") },
 	{ SST(0x5D, 0x50, SS_RDEF,
 	    "Load/Unload cycle Count Warning") },
 	{ SST(0x81, 0x00, SS_RDEF,
 	    "Vendor Unique - Internal Logic Error") },
 	{ SST(0x85, 0x00, SS_RDEF,
 	    "Vendor Unique - Internal Key Seed Error") },
 };
 
 /*
  * ASC/ASCQ entries used by the SEAGATE quirk in sense_quirk_table.  Each
  * entry is SST(asc, ascq, action, description); every entry here uses the
  * SS_RDEF action.
  */
 static struct asc_table_entry seagate_entries[] = {
 	{ SST(0x04, 0xF0, SS_RDEF,
 	    "Logical Unit Not Ready, super certify in Progress") },
 	{ SST(0x08, 0x86, SS_RDEF,
 	    "Write Fault Data Corruption") },
 	{ SST(0x09, 0x0D, SS_RDEF,
 	    "Tracking Failure") },
 	{ SST(0x09, 0x0E, SS_RDEF,
 	    "ETF Failure") },
 	{ SST(0x0B, 0x5D, SS_RDEF,
 	    "Pre-SMART Warning") },
 	{ SST(0x0B, 0x85, SS_RDEF,
 	    "5V Voltage Warning") },
 	{ SST(0x0B, 0x8C, SS_RDEF,
 	    "12V Voltage Warning") },
 	{ SST(0x0C, 0xFF, SS_RDEF,
 	    "Write Error - Too many error recovery revs") },
 	{ SST(0x11, 0xFF, SS_RDEF,
 	    "Unrecovered Read Error - Too many error recovery revs") },
 	{ SST(0x19, 0x0E, SS_RDEF,
 	    "Fewer than 1/2 defect list copies") },
 	{ SST(0x20, 0xF3, SS_RDEF,
 	    "Illegal CDB linked to skip mask cmd") },
 	{ SST(0x24, 0xF0, SS_RDEF,
 	    "Illegal byte in CDB, LBA not matching") },
 	{ SST(0x24, 0xF1, SS_RDEF,
 	    "Illegal byte in CDB, LEN not matching") },
 	{ SST(0x24, 0xF2, SS_RDEF,
 	    "Mask not matching transfer length") },
 	{ SST(0x24, 0xF3, SS_RDEF,
 	    "Drive formatted without plist") },
 	{ SST(0x26, 0x95, SS_RDEF,
 	    "Invalid Field Parameter - CAP File") },
 	{ SST(0x26, 0x96, SS_RDEF,
 	    "Invalid Field Parameter - RAP File") },
 	{ SST(0x26, 0x97, SS_RDEF,
 	    "Invalid Field Parameter - TMS Firmware Tag") },
 	{ SST(0x26, 0x98, SS_RDEF,
 	    "Invalid Field Parameter - Check Sum") },
 	{ SST(0x26, 0x99, SS_RDEF,
 	    "Invalid Field Parameter - Firmware Tag") },
 	{ SST(0x29, 0x08, SS_RDEF,
 	    "Write Log Dump data") },
 	{ SST(0x29, 0x09, SS_RDEF,
 	    "Write Log Dump data") },
 	{ SST(0x29, 0x0A, SS_RDEF,
 	    "Reserved disk space") },
 	{ SST(0x29, 0x0B, SS_RDEF,
 	    "SDBP") },
 	{ SST(0x29, 0x0C, SS_RDEF,
 	    "SDBP") },
 	{ SST(0x31, 0x91, SS_RDEF,
 	    "Format Corrupted World Wide Name (WWN) is Invalid") },
 	{ SST(0x32, 0x03, SS_RDEF,
 	    "Defect List - Length exceeds Command Allocated Length") },
 	{ SST(0x33, 0x00, SS_RDEF,
 	    "Flash not ready for access") },
 	{ SST(0x3F, 0x70, SS_RDEF,
 	    "Invalid RAP block") },
 	{ SST(0x3F, 0x71, SS_RDEF,
 	    "RAP/ETF mismatch") },
 	{ SST(0x3F, 0x90, SS_RDEF,
 	    "Invalid CAP block") },
 	{ SST(0x3F, 0x91, SS_RDEF,
 	    "World Wide Name (WWN) Mismatch") },
 	{ SST(0x40, 0x01, SS_RDEF,
 	    "DRAM Parity Error") },
 	{ SST(0x40, 0x02, SS_RDEF,
 	    "DRAM Parity Error") },
 	{ SST(0x42, 0x0A, SS_RDEF,
 	    "Loopback Test") },
 	{ SST(0x42, 0x0B, SS_RDEF,
 	    "Loopback Test") },
 	{ SST(0x44, 0xF2, SS_RDEF,
 	    "Compare error during data integrity check") },
 	{ SST(0x44, 0xF6, SS_RDEF,
 	    "Unrecoverable error during data integrity check") },
 	{ SST(0x47, 0x80, SS_RDEF,
 	    "Fibre Channel Sequence Error") },
 	{ SST(0x4E, 0x01, SS_RDEF,
 	    "Information Unit Too Short") },
 	{ SST(0x80, 0x00, SS_RDEF,
 	    "General Firmware Error / Command Timeout") },
 	{ SST(0x80, 0x01, SS_RDEF,
 	    "Command Timeout") },
 	{ SST(0x80, 0x02, SS_RDEF,
 	    "Command Timeout") },
 	{ SST(0x80, 0x80, SS_RDEF,
 	    "FC FIFO Error During Read Transfer") },
 	{ SST(0x80, 0x81, SS_RDEF,
 	    "FC FIFO Error During Write Transfer") },
 	{ SST(0x80, 0x82, SS_RDEF,
 	    "DISC FIFO Error During Read Transfer") },
 	{ SST(0x80, 0x83, SS_RDEF,
 	    "DISC FIFO Error During Write Transfer") },
 	{ SST(0x80, 0x84, SS_RDEF,
 	    "LBA Seeded LRC Error on Read") },
 	{ SST(0x80, 0x85, SS_RDEF,
 	    "LBA Seeded LRC Error on Write") },
 	{ SST(0x80, 0x86, SS_RDEF,
 	    "IOEDC Error on Read") },
 	{ SST(0x80, 0x87, SS_RDEF,
 	    "IOEDC Error on Write") },
 	{ SST(0x80, 0x88, SS_RDEF,
 	    "Host Parity Check Failed") },
 	{ SST(0x80, 0x89, SS_RDEF,
 	    "IOEDC error on read detected by formatter") },
 	{ SST(0x80, 0x8A, SS_RDEF,
 	    "Host Parity Errors / Host FIFO Initialization Failed") },
 	{ SST(0x80, 0x8B, SS_RDEF,
 	    "Host Parity Errors") },
 	{ SST(0x80, 0x8C, SS_RDEF,
 	    "Host Parity Errors") },
 	{ SST(0x80, 0x8D, SS_RDEF,
 	    "Host Parity Errors") },
 	{ SST(0x81, 0x00, SS_RDEF,
 	    "LA Check Failed") },
 	{ SST(0x82, 0x00, SS_RDEF,
 	    "Internal client detected insufficient buffer") },
 	{ SST(0x84, 0x00, SS_RDEF,
 	    "Scheduled Diagnostic And Repair") },
 };
 
 /*
  * Per-device sense quirks.  Each entry holds, in order: an inquiry match
  * pattern (device type, media type, vendor, product and revision
  * patterns), the number of sense key entries, the number of ASC/ASCQ
  * entries, a pointer to the sense key entries, and a pointer to the
  * ASC/ASCQ entries.  None of the quirks below supplies sense key entries
  * (count 0, pointer NULL); each supplies only an ASC/ASCQ table.
  */
 static struct scsi_sense_quirk_entry sense_quirk_table[] = {
 	{
 		/*
 		 * XXX The Quantum Fireball ST and SE like to return 0x04 0x0b
 		 * when they really should return 0x04 0x02.
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "QUANTUM", "FIREBALL S*", "*"},
 		/*num_sense_keys*/0,
 		nitems(quantum_fireball_entries),
 		/*sense key entries*/NULL,
 		quantum_fireball_entries
 	},
 	{
 		/*
 		 * This Sony MO drive likes to return 0x04, 0x00 when it
 		 * isn't spun up.
 		 */
 		{T_DIRECT, SIP_MEDIA_REMOVABLE, "SONY", "SMO-*", "*"},
 		/*num_sense_keys*/0,
 		nitems(sony_mo_entries),
 		/*sense key entries*/NULL,
 		sony_mo_entries
 	},
 	{
 		/*
 		 * HGST vendor-specific error codes
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "HGST", "*", "*"},
 		/*num_sense_keys*/0,
 		nitems(hgst_entries),
 		/*sense key entries*/NULL,
 		hgst_entries
 	},
 	{
 		/*
 		 * SEAGATE vendor-specific error codes
 		 */
 		{T_DIRECT, SIP_MEDIA_FIXED, "SEAGATE", "*", "*"},
 		/*num_sense_keys*/0,
 		nitems(seagate_entries),
 		/*sense key entries*/NULL,
 		seagate_entries
 	}
 };
 
 /* Number of entries in sense_quirk_table. */
 const u_int sense_quirk_table_size = nitems(sense_quirk_table);
 
 static struct asc_table_entry asc_table[] = {
 	/*
 	 * From: http://www.t10.org/lists/asc-num.txt
 	 * Modifications by Jung-uk Kim (jkim@FreeBSD.org)
 	 */
 	/*
 	 * File: ASC-NUM.TXT
 	 *
 	 * SCSI ASC/ASCQ Assignments
 	 * Numeric Sorted Listing
 	 * as of  8/12/15
 	 *
 	 * D - DIRECT ACCESS DEVICE (SBC-2)                   device column key
 	 * .T - SEQUENTIAL ACCESS DEVICE (SSC)               -------------------
 	 * . L - PRINTER DEVICE (SSC)                           blank = reserved
 	 * .  P - PROCESSOR DEVICE (SPC)                     not blank = allowed
 	 * .  .W - WRITE ONCE READ MULTIPLE DEVICE (SBC-2)
 	 * .  . R - CD DEVICE (MMC)
 	 * .  .  O - OPTICAL MEMORY DEVICE (SBC-2)
 	 * .  .  .M - MEDIA CHANGER DEVICE (SMC)
 	 * .  .  . A - STORAGE ARRAY DEVICE (SCC)
 	 * .  .  .  E - ENCLOSURE SERVICES DEVICE (SES)
 	 * .  .  .  .B - SIMPLIFIED DIRECT-ACCESS DEVICE (RBC)
 	 * .  .  .  . K - OPTICAL CARD READER/WRITER DEVICE (OCRW)
 	 * .  .  .  .  V - AUTOMATION/DRIVE INTERFACE (ADC)
 	 * .  .  .  .  .F - OBJECT-BASED STORAGE (OSD)
 	 * DTLPWROMAEBKVF
 	 * ASC      ASCQ  Action
 	 * Description
 	 */
 	/* DTLPWROMAEBKVF */
 	{ SST(0x00, 0x00, SS_NOP,
 	    "No additional sense information") },
 	/*  T             */
 	{ SST(0x00, 0x01, SS_RDEF,
 	    "Filemark detected") },
 	/*  T             */
 	{ SST(0x00, 0x02, SS_RDEF,
 	    "End-of-partition/medium detected") },
 	/*  T             */
 	{ SST(0x00, 0x03, SS_RDEF,
 	    "Setmark detected") },
 	/*  T             */
 	{ SST(0x00, 0x04, SS_RDEF,
 	    "Beginning-of-partition/medium detected") },
 	/*  TL            */
 	{ SST(0x00, 0x05, SS_RDEF,
 	    "End-of-data detected") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x00, 0x06, SS_RDEF,
 	    "I/O process terminated") },
 	/*  T             */
 	{ SST(0x00, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Programmable early warning detected") },
 	/*      R         */
 	{ SST(0x00, 0x11, SS_FATAL | EBUSY,
 	    "Audio play operation in progress") },
 	/*      R         */
 	{ SST(0x00, 0x12, SS_NOP,
 	    "Audio play operation paused") },
 	/*      R         */
 	{ SST(0x00, 0x13, SS_NOP,
 	    "Audio play operation successfully completed") },
 	/*      R         */
 	{ SST(0x00, 0x14, SS_RDEF,
 	    "Audio play operation stopped due to error") },
 	/*      R         */
 	{ SST(0x00, 0x15, SS_NOP,
 	    "No current audio status to return") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x00, 0x16, SS_FATAL | EBUSY,
 	    "Operation in progress") },
 	/* DTL WROMAEBKVF */
 	{ SST(0x00, 0x17, SS_RDEF,
 	    "Cleaning requested") },
 	/*  T             */
 	{ SST(0x00, 0x18, SS_RDEF,	/* XXX TBD */
 	    "Erase operation in progress") },
 	/*  T             */
 	{ SST(0x00, 0x19, SS_RDEF,	/* XXX TBD */
 	    "Locate operation in progress") },
 	/*  T             */
 	{ SST(0x00, 0x1A, SS_RDEF,	/* XXX TBD */
 	    "Rewind operation in progress") },
 	/*  T             */
 	{ SST(0x00, 0x1B, SS_RDEF,	/* XXX TBD */
 	    "Set capacity operation in progress") },
 	/*  T             */
 	{ SST(0x00, 0x1C, SS_RDEF,	/* XXX TBD */
 	    "Verify operation in progress") },
 	/* DT        B    */
 	{ SST(0x00, 0x1D, SS_NOP,
 	    "ATA pass through information available") },
 	/* DT   R MAEBKV  */
 	{ SST(0x00, 0x1E, SS_RDEF,	/* XXX TBD */
 	    "Conflicting SA creation request") },
 	/* DT        B    */
 	{ SST(0x00, 0x1F, SS_RDEF,	/* XXX TBD */
 	    "Logical unit transitioning to another power condition") },
 	/* DT P      B    */
 	{ SST(0x00, 0x20, SS_NOP,
 	    "Extended copy information available") },
 	/* D              */
 	{ SST(0x00, 0x21, SS_RDEF,	/* XXX TBD */
 	    "Atomic command aborted due to ACA") },
 	/* D   W O   BK   */
 	{ SST(0x01, 0x00, SS_RDEF,
 	    "No index/sector signal") },
 	/* D   WRO   BK   */
 	{ SST(0x02, 0x00, SS_RDEF,
 	    "No seek complete") },
 	/* DTL W O   BK   */
 	{ SST(0x03, 0x00, SS_RDEF,
 	    "Peripheral device write fault") },
 	/*  T             */
 	{ SST(0x03, 0x01, SS_RDEF,
 	    "No write current") },
 	/*  T             */
 	{ SST(0x03, 0x02, SS_RDEF,
 	    "Excessive write errors") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x00, SS_RDEF,
 	    "Logical unit not ready, cause not reportable") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x01, SS_WAIT | EBUSY,
 	    "Logical unit is in process of becoming ready") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x02, SS_START | SSQ_DECREMENT_COUNT | ENXIO,
 	    "Logical unit not ready, initializing command required") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x03, SS_FATAL | ENXIO,
 	    "Logical unit not ready, manual intervention required") },
 	/* DTL  RO   B    */
 	{ SST(0x04, 0x04, SS_FATAL | EBUSY,
 	    "Logical unit not ready, format in progress") },
 	/* DT  W O A BK F */
 	{ SST(0x04, 0x05, SS_FATAL | EBUSY,
 	    "Logical unit not ready, rebuild in progress") },
 	/* DT  W O A BK   */
 	{ SST(0x04, 0x06, SS_FATAL | EBUSY,
 	    "Logical unit not ready, recalculation in progress") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x07, SS_FATAL | EBUSY,
 	    "Logical unit not ready, operation in progress") },
 	/*      R         */
 	{ SST(0x04, 0x08, SS_FATAL | EBUSY,
 	    "Logical unit not ready, long write in progress") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x09, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, self-test in progress") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x0A, SS_WAIT | ENXIO,
 	    "Logical unit not accessible, asymmetric access state transition")},
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x0B, SS_FATAL | ENXIO,
 	    "Logical unit not accessible, target port in standby state") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x0C, SS_FATAL | ENXIO,
 	    "Logical unit not accessible, target port in unavailable state") },
 	/*              F */
 	{ SST(0x04, 0x0D, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, structure check required") },
 	/* DTL WR MAEBKVF */
 	{ SST(0x04, 0x0E, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, security session in progress") },
 	/* DT  WROM  B    */
 	{ SST(0x04, 0x10, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, auxiliary memory not accessible") },
 	/* DT  WRO AEB VF */
 	{ SST(0x04, 0x11, SS_WAIT | EBUSY,
 	    "Logical unit not ready, notify (enable spinup) required") },
 	/*        M    V  */
 	{ SST(0x04, 0x12, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, offline") },
 	/* DT   R MAEBKV  */
 	{ SST(0x04, 0x13, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, SA creation in progress") },
 	/* D         B    */
 	{ SST(0x04, 0x14, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, space allocation in progress") },
 	/*        M       */
 	{ SST(0x04, 0x15, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, robotics disabled") },
 	/*        M       */
 	{ SST(0x04, 0x16, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, configuration required") },
 	/*        M       */
 	{ SST(0x04, 0x17, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, calibration required") },
 	/*        M       */
 	{ SST(0x04, 0x18, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, a door is open") },
 	/*        M       */
 	{ SST(0x04, 0x19, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, operating in sequential mode") },
 	/* DT        B    */
 	{ SST(0x04, 0x1A, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, START/STOP UNIT command in progress") },
 	/* D         B    */
-	{ SST(0x04, 0x1B, SS_RDEF,	/* XXX TBD */
+	{ SST(0x04, 0x1B, SS_WAIT | EBUSY,
 	    "Logical unit not ready, sanitize in progress") },
 	/* DT     MAEB    */
 	{ SST(0x04, 0x1C, SS_START | SSQ_DECREMENT_COUNT | ENXIO,
 	    "Logical unit not ready, additional power use not yet granted") },
 	/* D              */
 	{ SST(0x04, 0x1D, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, configuration in progress") },
 	/* D              */
 	{ SST(0x04, 0x1E, SS_FATAL | ENXIO,
 	    "Logical unit not ready, microcode activation required") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x1F, SS_FATAL | ENXIO,
 	    "Logical unit not ready, microcode download required") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x20, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, logical unit reset required") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x21, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, hard reset required") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x04, 0x22, SS_RDEF,	/* XXX TBD */
 	    "Logical unit not ready, power cycle required") },
 	/* DTL WROMAEBKVF */
 	{ SST(0x05, 0x00, SS_RDEF,
 	    "Logical unit does not respond to selection") },
 	/* D   WROM  BK   */
 	{ SST(0x06, 0x00, SS_RDEF,
 	    "No reference position found") },
 	/* DTL WROM  BK   */
 	{ SST(0x07, 0x00, SS_RDEF,
 	    "Multiple peripheral devices selected") },
 	/* DTL WROMAEBKVF */
 	{ SST(0x08, 0x00, SS_RDEF,
 	    "Logical unit communication failure") },
 	/* DTL WROMAEBKVF */
 	{ SST(0x08, 0x01, SS_RDEF,
 	    "Logical unit communication time-out") },
 	/* DTL WROMAEBKVF */
 	{ SST(0x08, 0x02, SS_RDEF,
 	    "Logical unit communication parity error") },
 	/* DT   ROM  BK   */
 	{ SST(0x08, 0x03, SS_RDEF,
 	    "Logical unit communication CRC error (Ultra-DMA/32)") },
 	/* DTLPWRO    K   */
 	{ SST(0x08, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Unreachable copy target") },
 	/* DT  WRO   B    */
 	{ SST(0x09, 0x00, SS_RDEF,
 	    "Track following error") },
 	/*     WRO    K   */
 	{ SST(0x09, 0x01, SS_RDEF,
 	    "Tracking servo failure") },
 	/*     WRO    K   */
 	{ SST(0x09, 0x02, SS_RDEF,
 	    "Focus servo failure") },
 	/*     WRO        */
 	{ SST(0x09, 0x03, SS_RDEF,
 	    "Spindle servo failure") },
 	/* DT  WRO   B    */
 	{ SST(0x09, 0x04, SS_RDEF,
 	    "Head select fault") },
 	/* DT   RO   B    */
 	{ SST(0x09, 0x05, SS_RDEF,
 	    "Vibration induced tracking error") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0A, 0x00, SS_FATAL | ENOSPC,
 	    "Error log overflow") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x00, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x01, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - specified temperature exceeded") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x02, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - enclosure degraded") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x03, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - background self-test failed") },
 	/* DTLPWRO AEBKVF */
 	{ SST(0x0B, 0x04, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - background pre-scan detected medium error") },
 	/* DTLPWRO AEBKVF */
 	{ SST(0x0B, 0x05, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - background medium scan detected medium error") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x06, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - non-volatile cache now volatile") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x07, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - degraded power to non-volatile cache") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x08, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - power loss expected") },
 	/* D              */
 	{ SST(0x0B, 0x09, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - device statistics notification available") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x0A, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - High critical temperature limit exceeded") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x0B, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - Low critical temperature limit exceeded") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x0C, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - High operating temperature limit exceeded") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x0D, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - Low operating temperature limit exceeded") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x0E, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - High citical humidity limit exceeded") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x0F, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - Low citical humidity limit exceeded") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x10, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - High operating humidity limit exceeded") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x0B, 0x11, SS_NOP | SSQ_PRINT_SENSE,
 	    "Warning - Low operating humidity limit exceeded") },
 	/*  T   R         */
 	{ SST(0x0C, 0x00, SS_RDEF,
 	    "Write error") },
 	/*            K   */
 	{ SST(0x0C, 0x01, SS_NOP | SSQ_PRINT_SENSE,
 	    "Write error - recovered with auto reallocation") },
 	/* D   W O   BK   */
 	{ SST(0x0C, 0x02, SS_RDEF,
 	    "Write error - auto reallocation failed") },
 	/* D   W O   BK   */
 	{ SST(0x0C, 0x03, SS_RDEF,
 	    "Write error - recommend reassignment") },
 	/* DT  W O   B    */
 	{ SST(0x0C, 0x04, SS_RDEF,
 	    "Compression check miscompare error") },
 	/* DT  W O   B    */
 	{ SST(0x0C, 0x05, SS_RDEF,
 	    "Data expansion occurred during compression") },
 	/* DT  W O   B    */
 	{ SST(0x0C, 0x06, SS_RDEF,
 	    "Block not compressible") },
 	/*      R         */
 	{ SST(0x0C, 0x07, SS_RDEF,
 	    "Write error - recovery needed") },
 	/*      R         */
 	{ SST(0x0C, 0x08, SS_RDEF,
 	    "Write error - recovery failed") },
 	/*      R         */
 	{ SST(0x0C, 0x09, SS_RDEF,
 	    "Write error - loss of streaming") },
 	/*      R         */
 	{ SST(0x0C, 0x0A, SS_RDEF,
 	    "Write error - padding blocks added") },
 	/* DT  WROM  B    */
 	{ SST(0x0C, 0x0B, SS_RDEF,	/* XXX TBD */
 	    "Auxiliary memory write error") },
 	/* DTLPWRO AEBKVF */
 	{ SST(0x0C, 0x0C, SS_RDEF,	/* XXX TBD */
 	    "Write error - unexpected unsolicited data") },
 	/* DTLPWRO AEBKVF */
 	{ SST(0x0C, 0x0D, SS_RDEF,	/* XXX TBD */
 	    "Write error - not enough unsolicited data") },
 	/* DT  W O   BK   */
 	{ SST(0x0C, 0x0E, SS_RDEF,	/* XXX TBD */
 	    "Multiple write errors") },
 	/*      R         */
 	{ SST(0x0C, 0x0F, SS_RDEF,	/* XXX TBD */
 	    "Defects in error window") },
 	/* D              */
 	{ SST(0x0C, 0x10, SS_RDEF,	/* XXX TBD */
 	    "Incomplete multiple atomic write operations") },
 	/* D              */
 	{ SST(0x0C, 0x11, SS_RDEF,	/* XXX TBD */
 	    "Write error - recovery scan needed") },
 	/* D              */
 	{ SST(0x0C, 0x12, SS_RDEF,	/* XXX TBD */
 	    "Write error - insufficient zone resources") },
 	/* DTLPWRO A  K   */
 	{ SST(0x0D, 0x00, SS_RDEF,	/* XXX TBD */
 	    "Error detected by third party temporary initiator") },
 	/* DTLPWRO A  K   */
 	{ SST(0x0D, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Third party device failure") },
 	/* DTLPWRO A  K   */
 	{ SST(0x0D, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Copy target device not reachable") },
 	/* DTLPWRO A  K   */
 	{ SST(0x0D, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Incorrect copy target device type") },
 	/* DTLPWRO A  K   */
 	{ SST(0x0D, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Copy target device data underrun") },
 	/* DTLPWRO A  K   */
 	{ SST(0x0D, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Copy target device data overrun") },
 	/* DT PWROMAEBK F */
 	{ SST(0x0E, 0x00, SS_RDEF,	/* XXX TBD */
 	    "Invalid information unit") },
 	/* DT PWROMAEBK F */
 	{ SST(0x0E, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Information unit too short") },
 	/* DT PWROMAEBK F */
 	{ SST(0x0E, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Information unit too long") },
 	/* DT P R MAEBK F */
 	{ SST(0x0E, 0x03, SS_FATAL | EINVAL,
 	    "Invalid field in command information unit") },
 	/* D   W O   BK   */
 	{ SST(0x10, 0x00, SS_RDEF,
 	    "ID CRC or ECC error") },
 	/* DT  W O        */
 	{ SST(0x10, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Logical block guard check failed") },
 	/* DT  W O        */
 	{ SST(0x10, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Logical block application tag check failed") },
 	/* DT  W O        */
 	{ SST(0x10, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Logical block reference tag check failed") },
 	/*  T             */
 	{ SST(0x10, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Logical block protection error on recovered buffer data") },
 	/*  T             */
 	{ SST(0x10, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Logical block protection method error") },
 	/* DT  WRO   BK   */
 	{ SST(0x11, 0x00, SS_FATAL|EIO,
 	    "Unrecovered read error") },
 	/* DT  WRO   BK   */
 	{ SST(0x11, 0x01, SS_FATAL|EIO,
 	    "Read retries exhausted") },
 	/* DT  WRO   BK   */
 	{ SST(0x11, 0x02, SS_FATAL|EIO,
 	    "Error too long to correct") },
 	/* DT  W O   BK   */
 	{ SST(0x11, 0x03, SS_FATAL|EIO,
 	    "Multiple read errors") },
 	/* D   W O   BK   */
 	{ SST(0x11, 0x04, SS_FATAL|EIO,
 	    "Unrecovered read error - auto reallocate failed") },
 	/*     WRO   B    */
 	{ SST(0x11, 0x05, SS_FATAL|EIO,
 	    "L-EC uncorrectable error") },
 	/*     WRO   B    */
 	{ SST(0x11, 0x06, SS_FATAL|EIO,
 	    "CIRC unrecovered error") },
 	/*     W O   B    */
 	{ SST(0x11, 0x07, SS_RDEF,
 	    "Data re-synchronization error") },
 	/*  T             */
 	{ SST(0x11, 0x08, SS_RDEF,
 	    "Incomplete block read") },
 	/*  T             */
 	{ SST(0x11, 0x09, SS_RDEF,
 	    "No gap found") },
 	/* DT    O   BK   */
 	{ SST(0x11, 0x0A, SS_RDEF,
 	    "Miscorrected error") },
 	/* D   W O   BK   */
 	{ SST(0x11, 0x0B, SS_FATAL|EIO,
 	    "Unrecovered read error - recommend reassignment") },
 	/* D   W O   BK   */
 	{ SST(0x11, 0x0C, SS_FATAL|EIO,
 	    "Unrecovered read error - recommend rewrite the data") },
 	/* DT  WRO   B    */
 	{ SST(0x11, 0x0D, SS_RDEF,
 	    "De-compression CRC error") },
 	/* DT  WRO   B    */
 	{ SST(0x11, 0x0E, SS_RDEF,
 	    "Cannot decompress using declared algorithm") },
 	/*      R         */
 	{ SST(0x11, 0x0F, SS_RDEF,
 	    "Error reading UPC/EAN number") },
 	/*      R         */
 	{ SST(0x11, 0x10, SS_RDEF,
 	    "Error reading ISRC number") },
 	/*      R         */
 	{ SST(0x11, 0x11, SS_RDEF,
 	    "Read error - loss of streaming") },
 	/* DT  WROM  B    */
 	{ SST(0x11, 0x12, SS_RDEF,	/* XXX TBD */
 	    "Auxiliary memory read error") },
 	/* DTLPWRO AEBKVF */
 	{ SST(0x11, 0x13, SS_RDEF,	/* XXX TBD */
 	    "Read error - failed retransmission request") },
 	/* D              */
 	{ SST(0x11, 0x14, SS_RDEF,	/* XXX TBD */
 	    "Read error - LBA marked bad by application client") },
 	/* D              */
-	{ SST(0x11, 0x15, SS_RDEF,	/* XXX TBD */
+	{ SST(0x11, 0x15, SS_FATAL | EIO,
 	    "Write after sanitize required") },
 	/* D   W O   BK   */
 	{ SST(0x12, 0x00, SS_RDEF,
 	    "Address mark not found for ID field") },
 	/* D   W O   BK   */
 	{ SST(0x13, 0x00, SS_RDEF,
 	    "Address mark not found for data field") },
 	/* DTL WRO   BK   */
 	{ SST(0x14, 0x00, SS_RDEF,
 	    "Recorded entity not found") },
 	/* DT  WRO   BK   */
 	{ SST(0x14, 0x01, SS_RDEF,
 	    "Record not found") },
 	/*  T             */
 	{ SST(0x14, 0x02, SS_RDEF,
 	    "Filemark or setmark not found") },
 	/*  T             */
 	{ SST(0x14, 0x03, SS_RDEF,
 	    "End-of-data not found") },
 	/*  T             */
 	{ SST(0x14, 0x04, SS_RDEF,
 	    "Block sequence error") },
 	/* DT  W O   BK   */
 	{ SST(0x14, 0x05, SS_RDEF,
 	    "Record not found - recommend reassignment") },
 	/* DT  W O   BK   */
 	{ SST(0x14, 0x06, SS_RDEF,
 	    "Record not found - data auto-reallocated") },
 	/*  T             */
 	{ SST(0x14, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Locate operation failure") },
 	/* DTL WROM  BK   */
 	{ SST(0x15, 0x00, SS_RDEF,
 	    "Random positioning error") },
 	/* DTL WROM  BK   */
 	{ SST(0x15, 0x01, SS_RDEF,
 	    "Mechanical positioning error") },
 	/* DT  WRO   BK   */
 	{ SST(0x15, 0x02, SS_RDEF,
 	    "Positioning error detected by read of medium") },
 	/* D   W O   BK   */
 	{ SST(0x16, 0x00, SS_RDEF,
 	    "Data synchronization mark error") },
 	/* D   W O   BK   */
 	{ SST(0x16, 0x01, SS_RDEF,
 	    "Data sync error - data rewritten") },
 	/* D   W O   BK   */
 	{ SST(0x16, 0x02, SS_RDEF,
 	    "Data sync error - recommend rewrite") },
 	/* D   W O   BK   */
 	{ SST(0x16, 0x03, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data sync error - data auto-reallocated") },
 	/* D   W O   BK   */
 	{ SST(0x16, 0x04, SS_RDEF,
 	    "Data sync error - recommend reassignment") },
 	/* DT  WRO   BK   */
 	{ SST(0x17, 0x00, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data with no error correction applied") },
 	/* DT  WRO   BK   */
 	{ SST(0x17, 0x01, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data with retries") },
 	/* DT  WRO   BK   */
 	{ SST(0x17, 0x02, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data with positive head offset") },
 	/* DT  WRO   BK   */
 	{ SST(0x17, 0x03, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data with negative head offset") },
 	/*     WRO   B    */
 	{ SST(0x17, 0x04, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data with retries and/or CIRC applied") },
 	/* D   WRO   BK   */
 	{ SST(0x17, 0x05, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data using previous sector ID") },
 	/* D   W O   BK   */
 	{ SST(0x17, 0x06, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data without ECC - data auto-reallocated") },
 	/* D   WRO   BK   */
 	{ SST(0x17, 0x07, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data without ECC - recommend reassignment") },
 	/* D   WRO   BK   */
 	{ SST(0x17, 0x08, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data without ECC - recommend rewrite") },
 	/* D   WRO   BK   */
 	{ SST(0x17, 0x09, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data without ECC - data rewritten") },
 	/* DT  WRO   BK   */
 	{ SST(0x18, 0x00, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data with error correction applied") },
 	/* D   WRO   BK   */
 	{ SST(0x18, 0x01, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data with error corr. & retries applied") },
 	/* D   WRO   BK   */
 	{ SST(0x18, 0x02, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data - data auto-reallocated") },
 	/*      R         */
 	{ SST(0x18, 0x03, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data with CIRC") },
 	/*      R         */
 	{ SST(0x18, 0x04, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data with L-EC") },
 	/* D   WRO   BK   */
 	{ SST(0x18, 0x05, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data - recommend reassignment") },
 	/* D   WRO   BK   */
 	{ SST(0x18, 0x06, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data - recommend rewrite") },
 	/* D   W O   BK   */
 	{ SST(0x18, 0x07, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered data with ECC - data rewritten") },
 	/*      R         */
 	{ SST(0x18, 0x08, SS_RDEF,	/* XXX TBD */
 	    "Recovered data with linking") },
 	/* D     O    K   */
 	{ SST(0x19, 0x00, SS_RDEF,
 	    "Defect list error") },
 	/* D     O    K   */
 	{ SST(0x19, 0x01, SS_RDEF,
 	    "Defect list not available") },
 	/* D     O    K   */
 	{ SST(0x19, 0x02, SS_RDEF,
 	    "Defect list error in primary list") },
 	/* D     O    K   */
 	{ SST(0x19, 0x03, SS_RDEF,
 	    "Defect list error in grown list") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x1A, 0x00, SS_RDEF,
 	    "Parameter list length error") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x1B, 0x00, SS_RDEF,
 	    "Synchronous data transfer error") },
 	/* D     O   BK   */
 	{ SST(0x1C, 0x00, SS_RDEF,
 	    "Defect list not found") },
 	/* D     O   BK   */
 	{ SST(0x1C, 0x01, SS_RDEF,
 	    "Primary defect list not found") },
 	/* D     O   BK   */
 	{ SST(0x1C, 0x02, SS_RDEF,
 	    "Grown defect list not found") },
 	/* DT  WRO   BK   */
 	{ SST(0x1D, 0x00, SS_FATAL,
 	    "Miscompare during verify operation") },
 	/* D         B    */
 	{ SST(0x1D, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Miscomparable verify of unmapped LBA") },
 	/* D   W O   BK   */
 	{ SST(0x1E, 0x00, SS_NOP | SSQ_PRINT_SENSE,
 	    "Recovered ID with ECC correction") },
 	/* D     O    K   */
 	{ SST(0x1F, 0x00, SS_RDEF,
 	    "Partial defect list transfer") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x20, 0x00, SS_FATAL | EINVAL,
 	    "Invalid command operation code") },
 	/* DT PWROMAEBK   */
 	{ SST(0x20, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Access denied - initiator pending-enrolled") },
 	/* DT PWROMAEBK   */
 	{ SST(0x20, 0x02, SS_FATAL | EPERM,
 	    "Access denied - no access rights") },
 	/* DT PWROMAEBK   */
 	{ SST(0x20, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Access denied - invalid mgmt ID key") },
 	/*  T             */
 	{ SST(0x20, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Illegal command while in write capable state") },
 	/*  T             */
 	{ SST(0x20, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Obsolete") },
 	/*  T             */
 	{ SST(0x20, 0x06, SS_RDEF,	/* XXX TBD */
 	    "Illegal command while in explicit address mode") },
 	/*  T             */
 	{ SST(0x20, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Illegal command while in implicit address mode") },
 	/* DT PWROMAEBK   */
 	{ SST(0x20, 0x08, SS_RDEF,	/* XXX TBD */
 	    "Access denied - enrollment conflict") },
 	/* DT PWROMAEBK   */
 	{ SST(0x20, 0x09, SS_RDEF,	/* XXX TBD */
 	    "Access denied - invalid LU identifier") },
 	/* DT PWROMAEBK   */
 	{ SST(0x20, 0x0A, SS_RDEF,	/* XXX TBD */
 	    "Access denied - invalid proxy token") },
 	/* DT PWROMAEBK   */
 	{ SST(0x20, 0x0B, SS_RDEF,	/* XXX TBD */
 	    "Access denied - ACL LUN conflict") },
 	/*  T             */
 	{ SST(0x20, 0x0C, SS_FATAL | EINVAL,
 	    "Illegal command when not in append-only mode") },
 	/* DT  WRO   BK   */
 	{ SST(0x21, 0x00, SS_FATAL | EINVAL,
 	    "Logical block address out of range") },
 	/* DT  WROM  BK   */
 	{ SST(0x21, 0x01, SS_FATAL | EINVAL,
 	    "Invalid element address") },
 	/*      R         */
 	{ SST(0x21, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Invalid address for write") },
 	/*      R         */
 	{ SST(0x21, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Invalid write crossing layer jump") },
 	/* D              */
 	{ SST(0x21, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Unaligned write command") },
 	/* D              */
 	{ SST(0x21, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Write boundary violation") },
 	/* D              */
 	{ SST(0x21, 0x06, SS_RDEF,	/* XXX TBD */
 	    "Attempt to read invalid data") },
 	/* D              */
 	{ SST(0x21, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Read boundary violation") },
 	/* D              */
 	{ SST(0x22, 0x00, SS_FATAL | EINVAL,
 	    "Illegal function (use 20 00, 24 00, or 26 00)") },
 	/* DT P      B    */
 	{ SST(0x23, 0x00, SS_FATAL | EINVAL,
 	    "Invalid token operation, cause not reportable") },
 	/* DT P      B    */
 	{ SST(0x23, 0x01, SS_FATAL | EINVAL,
 	    "Invalid token operation, unsupported token type") },
 	/* DT P      B    */
 	{ SST(0x23, 0x02, SS_FATAL | EINVAL,
 	    "Invalid token operation, remote token usage not supported") },
 	/* DT P      B    */
 	{ SST(0x23, 0x03, SS_FATAL | EINVAL,
 	    "Invalid token operation, remote ROD token creation not supported") },
 	/* DT P      B    */
 	{ SST(0x23, 0x04, SS_FATAL | EINVAL,
 	    "Invalid token operation, token unknown") },
 	/* DT P      B    */
 	{ SST(0x23, 0x05, SS_FATAL | EINVAL,
 	    "Invalid token operation, token corrupt") },
 	/* DT P      B    */
 	{ SST(0x23, 0x06, SS_FATAL | EINVAL,
 	    "Invalid token operation, token revoked") },
 	/* DT P      B    */
 	{ SST(0x23, 0x07, SS_FATAL | EINVAL,
 	    "Invalid token operation, token expired") },
 	/* DT P      B    */
 	{ SST(0x23, 0x08, SS_FATAL | EINVAL,
 	    "Invalid token operation, token cancelled") },
 	/* DT P      B    */
 	{ SST(0x23, 0x09, SS_FATAL | EINVAL,
 	    "Invalid token operation, token deleted") },
 	/* DT P      B    */
 	{ SST(0x23, 0x0A, SS_FATAL | EINVAL,
 	    "Invalid token operation, invalid token length") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x24, 0x00, SS_FATAL | EINVAL,
 	    "Invalid field in CDB") },
 	/* DTLPWRO AEBKVF */
 	{ SST(0x24, 0x01, SS_RDEF,	/* XXX TBD */
 	    "CDB decryption error") },
 	/*  T             */
 	{ SST(0x24, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Obsolete") },
 	/*  T             */
 	{ SST(0x24, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Obsolete") },
 	/*              F */
 	{ SST(0x24, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Security audit value frozen") },
 	/*              F */
 	{ SST(0x24, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Security working key frozen") },
 	/*              F */
 	{ SST(0x24, 0x06, SS_RDEF,	/* XXX TBD */
 	    "NONCE not unique") },
 	/*              F */
 	{ SST(0x24, 0x07, SS_RDEF,	/* XXX TBD */
 	    "NONCE timestamp out of range") },
 	/* DT   R MAEBKV  */
 	{ SST(0x24, 0x08, SS_RDEF,	/* XXX TBD */
 	    "Invalid XCDB") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x25, 0x00, SS_FATAL | ENXIO | SSQ_LOST,
 	    "Logical unit not supported") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x26, 0x00, SS_FATAL | EINVAL,
 	    "Invalid field in parameter list") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x26, 0x01, SS_FATAL | EINVAL,
 	    "Parameter not supported") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x26, 0x02, SS_FATAL | EINVAL,
 	    "Parameter value invalid") },
 	/* DTLPWROMAE K   */
 	{ SST(0x26, 0x03, SS_FATAL | EINVAL,
 	    "Threshold parameters not supported") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x26, 0x04, SS_FATAL | EINVAL,
 	    "Invalid release of persistent reservation") },
 	/* DTLPWRO A BK   */
 	{ SST(0x26, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Data decryption error") },
 	/* DTLPWRO    K   */
 	{ SST(0x26, 0x06, SS_FATAL | EINVAL,
 	    "Too many target descriptors") },
 	/* DTLPWRO    K   */
 	{ SST(0x26, 0x07, SS_FATAL | EINVAL,
 	    "Unsupported target descriptor type code") },
 	/* DTLPWRO    K   */
 	{ SST(0x26, 0x08, SS_FATAL | EINVAL,
 	    "Too many segment descriptors") },
 	/* DTLPWRO    K   */
 	{ SST(0x26, 0x09, SS_FATAL | EINVAL,
 	    "Unsupported segment descriptor type code") },
 	/* DTLPWRO    K   */
 	{ SST(0x26, 0x0A, SS_FATAL | EINVAL,
 	    "Unexpected inexact segment") },
 	/* DTLPWRO    K   */
 	{ SST(0x26, 0x0B, SS_FATAL | EINVAL,
 	    "Inline data length exceeded") },
 	/* DTLPWRO    K   */
 	{ SST(0x26, 0x0C, SS_FATAL | EINVAL,
 	    "Invalid operation for copy source or destination") },
 	/* DTLPWRO    K   */
 	{ SST(0x26, 0x0D, SS_FATAL | EINVAL,
 	    "Copy segment granularity violation") },
 	/* DT PWROMAEBK   */
 	{ SST(0x26, 0x0E, SS_RDEF,	/* XXX TBD */
 	    "Invalid parameter while port is enabled") },
 	/*              F */
 	{ SST(0x26, 0x0F, SS_RDEF,	/* XXX TBD */
 	    "Invalid data-out buffer integrity check value") },
 	/*  T             */
 	{ SST(0x26, 0x10, SS_RDEF,	/* XXX TBD */
 	    "Data decryption key fail limit reached") },
 	/*  T             */
 	{ SST(0x26, 0x11, SS_RDEF,	/* XXX TBD */
 	    "Incomplete key-associated data set") },
 	/*  T             */
 	{ SST(0x26, 0x12, SS_RDEF,	/* XXX TBD */
 	    "Vendor specific key reference not found") },
 	/* D              */
 	{ SST(0x26, 0x13, SS_RDEF,	/* XXX TBD */
 	    "Application tag mode page is invalid") },
 	/* DT  WRO   BK   */
 	{ SST(0x27, 0x00, SS_FATAL | EACCES,
 	    "Write protected") },
 	/* DT  WRO   BK   */
 	{ SST(0x27, 0x01, SS_FATAL | EACCES,
 	    "Hardware write protected") },
 	/* DT  WRO   BK   */
 	{ SST(0x27, 0x02, SS_FATAL | EACCES,
 	    "Logical unit software write protected") },
 	/*  T   R         */
 	{ SST(0x27, 0x03, SS_FATAL | EACCES,
 	    "Associated write protect") },
 	/*  T   R         */
 	{ SST(0x27, 0x04, SS_FATAL | EACCES,
 	    "Persistent write protect") },
 	/*  T   R         */
 	{ SST(0x27, 0x05, SS_FATAL | EACCES,
 	    "Permanent write protect") },
 	/*      R       F */
 	{ SST(0x27, 0x06, SS_RDEF,	/* XXX TBD */
 	    "Conditional write protect") },
 	/* D         B    */
 	{ SST(0x27, 0x07, SS_FATAL | ENOSPC,
 	    "Space allocation failed write protect") },
 	/* D              */
 	{ SST(0x27, 0x08, SS_FATAL | EACCES,
 	    "Zone is read only") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x28, 0x00, SS_FATAL | ENXIO,
 	    "Not ready to ready change, medium may have changed") },
 	/* DT  WROM  B    */
 	{ SST(0x28, 0x01, SS_FATAL | ENXIO,
 	    "Import or export element accessed") },
 	/*      R         */
 	{ SST(0x28, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Format-layer may have changed") },
 	/*        M       */
 	{ SST(0x28, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Import/export element accessed, medium changed") },
 	/*
 	 * XXX JGibbs - All of these should use the same errno, but I don't
 	 * think ENXIO is the correct choice.  Should we borrow from
 	 * the networking errnos?  ECONNRESET anyone?
 	 */
 	/* DTLPWROMAEBKVF */
 	{ SST(0x29, 0x00, SS_FATAL | ENXIO,
 	    "Power on, reset, or bus device reset occurred") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x29, 0x01, SS_RDEF,
 	    "Power on occurred") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x29, 0x02, SS_RDEF,
 	    "SCSI bus reset occurred") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x29, 0x03, SS_RDEF,
 	    "Bus device reset function occurred") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x29, 0x04, SS_RDEF,
 	    "Device internal reset") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x29, 0x05, SS_RDEF,
 	    "Transceiver mode changed to single-ended") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x29, 0x06, SS_RDEF,
 	    "Transceiver mode changed to LVD") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x29, 0x07, SS_RDEF,	/* XXX TBD */
 	    "I_T nexus loss occurred") },
 	/* DTL WROMAEBKVF */
 	{ SST(0x2A, 0x00, SS_RDEF,
 	    "Parameters changed") },
 	/* DTL WROMAEBKVF */
 	{ SST(0x2A, 0x01, SS_RDEF,
 	    "Mode parameters changed") },
 	/* DTL WROMAE K   */
 	{ SST(0x2A, 0x02, SS_RDEF,
 	    "Log parameters changed") },
 	/* DTLPWROMAE K   */
 	{ SST(0x2A, 0x03, SS_RDEF,
 	    "Reservations preempted") },
 	/* DTLPWROMAE     */
 	{ SST(0x2A, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Reservations released") },
 	/* DTLPWROMAE     */
 	{ SST(0x2A, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Registrations preempted") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x2A, 0x06, SS_RDEF,	/* XXX TBD */
 	    "Asymmetric access state changed") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x2A, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Implicit asymmetric access state transition failed") },
 	/* DT  WROMAEBKVF */
 	{ SST(0x2A, 0x08, SS_RDEF,	/* XXX TBD */
 	    "Priority changed") },
 	/* D              */
 	{ SST(0x2A, 0x09, SS_RDEF,	/* XXX TBD */
 	    "Capacity data has changed") },
 	/* DT             */
 	{ SST(0x2A, 0x0A, SS_RDEF,	/* XXX TBD */
 	    "Error history I_T nexus cleared") },
 	/* DT             */
 	{ SST(0x2A, 0x0B, SS_RDEF,	/* XXX TBD */
 	    "Error history snapshot released") },
 	/*              F */
 	{ SST(0x2A, 0x0C, SS_RDEF,	/* XXX TBD */
 	    "Error recovery attributes have changed") },
 	/*  T             */
 	{ SST(0x2A, 0x0D, SS_RDEF,	/* XXX TBD */
 	    "Data encryption capabilities changed") },
 	/* DT     M E  V  */
 	{ SST(0x2A, 0x10, SS_RDEF,	/* XXX TBD */
 	    "Timestamp changed") },
 	/*  T             */
 	{ SST(0x2A, 0x11, SS_RDEF,	/* XXX TBD */
 	    "Data encryption parameters changed by another I_T nexus") },
 	/*  T             */
 	{ SST(0x2A, 0x12, SS_RDEF,	/* XXX TBD */
 	    "Data encryption parameters changed by vendor specific event") },
 	/*  T             */
 	{ SST(0x2A, 0x13, SS_RDEF,	/* XXX TBD */
 	    "Data encryption key instance counter has changed") },
 	/* DT   R MAEBKV  */
 	{ SST(0x2A, 0x14, SS_RDEF,	/* XXX TBD */
 	    "SA creation capabilities data has changed") },
 	/*  T     M    V  */
 	{ SST(0x2A, 0x15, SS_RDEF,	/* XXX TBD */
 	    "Medium removal prevention preempted") },
 	/* DTLPWRO    K   */
 	{ SST(0x2B, 0x00, SS_RDEF,
 	    "Copy cannot execute since host cannot disconnect") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x2C, 0x00, SS_RDEF,
 	    "Command sequence error") },
 	/*                */
 	{ SST(0x2C, 0x01, SS_RDEF,
 	    "Too many windows specified") },
 	/*                */
 	{ SST(0x2C, 0x02, SS_RDEF,
 	    "Invalid combination of windows specified") },
 	/*      R         */
 	{ SST(0x2C, 0x03, SS_RDEF,
 	    "Current program area is not empty") },
 	/*      R         */
 	{ SST(0x2C, 0x04, SS_RDEF,
 	    "Current program area is empty") },
 	/*           B    */
 	{ SST(0x2C, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Illegal power condition request") },
 	/*      R         */
 	{ SST(0x2C, 0x06, SS_RDEF,	/* XXX TBD */
 	    "Persistent prevent conflict") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x2C, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Previous busy status") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x2C, 0x08, SS_RDEF,	/* XXX TBD */
 	    "Previous task set full status") },
 	/* DTLPWROM EBKVF */
 	{ SST(0x2C, 0x09, SS_RDEF,	/* XXX TBD */
 	    "Previous reservation conflict status") },
 	/*              F */
 	{ SST(0x2C, 0x0A, SS_RDEF,	/* XXX TBD */
 	    "Partition or collection contains user objects") },
 	/*  T             */
 	{ SST(0x2C, 0x0B, SS_RDEF,	/* XXX TBD */
 	    "Not reserved") },
 	/* D              */
 	{ SST(0x2C, 0x0C, SS_RDEF,	/* XXX TBD */
 	    "ORWRITE generation does not match") },
 	/* D              */
 	{ SST(0x2C, 0x0D, SS_RDEF,	/* XXX TBD */
 	    "Reset write pointer not allowed") },
 	/* D              */
 	{ SST(0x2C, 0x0E, SS_RDEF,	/* XXX TBD */
 	    "Zone is offline") },
 	/* D              */
 	{ SST(0x2C, 0x0F, SS_RDEF,	/* XXX TBD */
 	    "Stream not open") },
 	/* D              */
 	{ SST(0x2C, 0x10, SS_RDEF,	/* XXX TBD */
 	    "Unwritten data in zone") },
 	/*  T             */
 	{ SST(0x2D, 0x00, SS_RDEF,
 	    "Overwrite error on update in place") },
 	/*      R         */
 	{ SST(0x2E, 0x00, SS_RDEF,	/* XXX TBD */
 	    "Insufficient time for operation") },
 	/* D              */
 	{ SST(0x2E, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Command timeout before processing") },
 	/* D              */
 	{ SST(0x2E, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Command timeout during processing") },
 	/* D              */
 	{ SST(0x2E, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Command timeout during processing due to error recovery") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x2F, 0x00, SS_RDEF,
 	    "Commands cleared by another initiator") },
 	/* D              */
 	{ SST(0x2F, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Commands cleared by power loss notification") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x2F, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Commands cleared by device server") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x2F, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Some commands cleared by queuing layer event") },
 	/* DT  WROM  BK   */
 	{ SST(0x30, 0x00, SS_RDEF,
 	    "Incompatible medium installed") },
 	/* DT  WRO   BK   */
 	{ SST(0x30, 0x01, SS_RDEF,
 	    "Cannot read medium - unknown format") },
 	/* DT  WRO   BK   */
 	{ SST(0x30, 0x02, SS_RDEF,
 	    "Cannot read medium - incompatible format") },
 	/* DT   R     K   */
 	{ SST(0x30, 0x03, SS_RDEF,
 	    "Cleaning cartridge installed") },
 	/* DT  WRO   BK   */
 	{ SST(0x30, 0x04, SS_RDEF,
 	    "Cannot write medium - unknown format") },
 	/* DT  WRO   BK   */
 	{ SST(0x30, 0x05, SS_RDEF,
 	    "Cannot write medium - incompatible format") },
 	/* DT  WRO   B    */
 	{ SST(0x30, 0x06, SS_RDEF,
 	    "Cannot format medium - incompatible medium") },
 	/* DTL WROMAEBKVF */
 	{ SST(0x30, 0x07, SS_RDEF,
 	    "Cleaning failure") },
 	/*      R         */
 	{ SST(0x30, 0x08, SS_RDEF,
 	    "Cannot write - application code mismatch") },
 	/*      R         */
 	{ SST(0x30, 0x09, SS_RDEF,
 	    "Current session not fixated for append") },
 	/* DT  WRO AEBK   */
 	{ SST(0x30, 0x0A, SS_RDEF,	/* XXX TBD */
 	    "Cleaning request rejected") },
 	/*  T             */
 	{ SST(0x30, 0x0C, SS_RDEF,	/* XXX TBD */
 	    "WORM medium - overwrite attempted") },
 	/*  T             */
 	{ SST(0x30, 0x0D, SS_RDEF,	/* XXX TBD */
 	    "WORM medium - integrity check") },
 	/*      R         */
 	{ SST(0x30, 0x10, SS_RDEF,	/* XXX TBD */
 	    "Medium not formatted") },
 	/*        M       */
 	{ SST(0x30, 0x11, SS_RDEF,	/* XXX TBD */
 	    "Incompatible volume type") },
 	/*        M       */
 	{ SST(0x30, 0x12, SS_RDEF,	/* XXX TBD */
 	    "Incompatible volume qualifier") },
 	/*        M       */
 	{ SST(0x30, 0x13, SS_RDEF,	/* XXX TBD */
 	    "Cleaning volume expired") },
 	/* DT  WRO   BK   */
 	{ SST(0x31, 0x00, SS_RDEF,
 	    "Medium format corrupted") },
 	/* D L  RO   B    */
 	{ SST(0x31, 0x01, SS_RDEF,
 	    "Format command failed") },
 	/*      R         */
 	{ SST(0x31, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Zoned formatting failed due to spare linking") },
 	/* D         B    */
-	{ SST(0x31, 0x03, SS_RDEF,	/* XXX TBD */
+	{ SST(0x31, 0x03, SS_FATAL | EIO,
 	    "SANITIZE command failed") },
 	/* D   W O   BK   */
 	{ SST(0x32, 0x00, SS_RDEF,
 	    "No defect spare location available") },
 	/* D   W O   BK   */
 	{ SST(0x32, 0x01, SS_RDEF,
 	    "Defect list update failure") },
 	/*  T             */
 	{ SST(0x33, 0x00, SS_RDEF,
 	    "Tape length error") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x34, 0x00, SS_RDEF,
 	    "Enclosure failure") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x35, 0x00, SS_RDEF,
 	    "Enclosure services failure") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x35, 0x01, SS_RDEF,
 	    "Unsupported enclosure function") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x35, 0x02, SS_RDEF,
 	    "Enclosure services unavailable") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x35, 0x03, SS_RDEF,
 	    "Enclosure services transfer failure") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x35, 0x04, SS_RDEF,
 	    "Enclosure services transfer refused") },
 	/* DTL WROMAEBKVF */
 	{ SST(0x35, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Enclosure services checksum error") },
 	/*   L            */
 	{ SST(0x36, 0x00, SS_RDEF,
 	    "Ribbon, ink, or toner failure") },
 	/* DTL WROMAEBKVF */
 	{ SST(0x37, 0x00, SS_RDEF,
 	    "Rounded parameter") },
 	/*           B    */
 	{ SST(0x38, 0x00, SS_RDEF,	/* XXX TBD */
 	    "Event status notification") },
 	/*           B    */
 	{ SST(0x38, 0x02, SS_RDEF,	/* XXX TBD */
 	    "ESN - power management class event") },
 	/*           B    */
 	{ SST(0x38, 0x04, SS_RDEF,	/* XXX TBD */
 	    "ESN - media class event") },
 	/*           B    */
 	{ SST(0x38, 0x06, SS_RDEF,	/* XXX TBD */
 	    "ESN - device busy class event") },
 	/* D              */
 	{ SST(0x38, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Thin provisioning soft threshold reached") },
 	/* DTL WROMAE K   */
 	{ SST(0x39, 0x00, SS_RDEF,
 	    "Saving parameters not supported") },
 	/* DTL WROM  BK   */
 	{ SST(0x3A, 0x00, SS_FATAL | ENXIO,
 	    "Medium not present") },
 	/* DT  WROM  BK   */
 	{ SST(0x3A, 0x01, SS_FATAL | ENXIO,
 	    "Medium not present - tray closed") },
 	/* DT  WROM  BK   */
 	{ SST(0x3A, 0x02, SS_FATAL | ENXIO,
 	    "Medium not present - tray open") },
 	/* DT  WROM  B    */
 	{ SST(0x3A, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Medium not present - loadable") },
 	/* DT  WRO   B    */
 	{ SST(0x3A, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Medium not present - medium auxiliary memory accessible") },
 	/*  TL            */
 	{ SST(0x3B, 0x00, SS_RDEF,
 	    "Sequential positioning error") },
 	/*  T             */
 	{ SST(0x3B, 0x01, SS_RDEF,
 	    "Tape position error at beginning-of-medium") },
 	/*  T             */
 	{ SST(0x3B, 0x02, SS_RDEF,
 	    "Tape position error at end-of-medium") },
 	/*   L            */
 	{ SST(0x3B, 0x03, SS_RDEF,
 	    "Tape or electronic vertical forms unit not ready") },
 	/*   L            */
 	{ SST(0x3B, 0x04, SS_RDEF,
 	    "Slew failure") },
 	/*   L            */
 	{ SST(0x3B, 0x05, SS_RDEF,
 	    "Paper jam") },
 	/*   L            */
 	{ SST(0x3B, 0x06, SS_RDEF,
 	    "Failed to sense top-of-form") },
 	/*   L            */
 	{ SST(0x3B, 0x07, SS_RDEF,
 	    "Failed to sense bottom-of-form") },
 	/*  T             */
 	{ SST(0x3B, 0x08, SS_RDEF,
 	    "Reposition error") },
 	/*                */
 	{ SST(0x3B, 0x09, SS_RDEF,
 	    "Read past end of medium") },
 	/*                */
 	{ SST(0x3B, 0x0A, SS_RDEF,
 	    "Read past beginning of medium") },
 	/*                */
 	{ SST(0x3B, 0x0B, SS_RDEF,
 	    "Position past end of medium") },
 	/*  T             */
 	{ SST(0x3B, 0x0C, SS_RDEF,
 	    "Position past beginning of medium") },
 	/* DT  WROM  BK   */
 	{ SST(0x3B, 0x0D, SS_FATAL | ENOSPC,
 	    "Medium destination element full") },
 	/* DT  WROM  BK   */
 	{ SST(0x3B, 0x0E, SS_RDEF,
 	    "Medium source element empty") },
 	/*      R         */
 	{ SST(0x3B, 0x0F, SS_RDEF,
 	    "End of medium reached") },
 	/* DT  WROM  BK   */
 	{ SST(0x3B, 0x11, SS_RDEF,
 	    "Medium magazine not accessible") },
 	/* DT  WROM  BK   */
 	{ SST(0x3B, 0x12, SS_RDEF,
 	    "Medium magazine removed") },
 	/* DT  WROM  BK   */
 	{ SST(0x3B, 0x13, SS_RDEF,
 	    "Medium magazine inserted") },
 	/* DT  WROM  BK   */
 	{ SST(0x3B, 0x14, SS_RDEF,
 	    "Medium magazine locked") },
 	/* DT  WROM  BK   */
 	{ SST(0x3B, 0x15, SS_RDEF,
 	    "Medium magazine unlocked") },
 	/*      R         */
 	{ SST(0x3B, 0x16, SS_RDEF,	/* XXX TBD */
 	    "Mechanical positioning or changer error") },
 	/*              F */
 	{ SST(0x3B, 0x17, SS_RDEF,	/* XXX TBD */
 	    "Read past end of user object") },
 	/*        M       */
 	{ SST(0x3B, 0x18, SS_RDEF,	/* XXX TBD */
 	    "Element disabled") },
 	/*        M       */
 	{ SST(0x3B, 0x19, SS_RDEF,	/* XXX TBD */
 	    "Element enabled") },
 	/*        M       */
 	{ SST(0x3B, 0x1A, SS_RDEF,	/* XXX TBD */
 	    "Data transfer device removed") },
 	/*        M       */
 	{ SST(0x3B, 0x1B, SS_RDEF,	/* XXX TBD */
 	    "Data transfer device inserted") },
 	/*  T             */
 	{ SST(0x3B, 0x1C, SS_RDEF,	/* XXX TBD */
 	    "Too many logical objects on partition to support operation") },
 	/* DTLPWROMAE K   */
 	{ SST(0x3D, 0x00, SS_RDEF,
 	    "Invalid bits in IDENTIFY message") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x3E, 0x00, SS_RDEF,
 	    "Logical unit has not self-configured yet") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x3E, 0x01, SS_RDEF,
 	    "Logical unit failure") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x3E, 0x02, SS_RDEF,
 	    "Timeout on logical unit") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x3E, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Logical unit failed self-test") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x3E, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Logical unit unable to update self-test log") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x3F, 0x00, SS_RDEF,
 	    "Target operating conditions have changed") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x3F, 0x01, SS_RDEF,
 	    "Microcode has been changed") },
 	/* DTLPWROM  BK   */
 	{ SST(0x3F, 0x02, SS_RDEF,
 	    "Changed operating definition") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x3F, 0x03, SS_RDEF,
 	    "INQUIRY data has changed") },
 	/* DT  WROMAEBK   */
 	{ SST(0x3F, 0x04, SS_RDEF,
 	    "Component device attached") },
 	/* DT  WROMAEBK   */
 	{ SST(0x3F, 0x05, SS_RDEF,
 	    "Device identifier changed") },
 	/* DT  WROMAEB    */
 	{ SST(0x3F, 0x06, SS_RDEF,
 	    "Redundancy group created or modified") },
 	/* DT  WROMAEB    */
 	{ SST(0x3F, 0x07, SS_RDEF,
 	    "Redundancy group deleted") },
 	/* DT  WROMAEB    */
 	{ SST(0x3F, 0x08, SS_RDEF,
 	    "Spare created or modified") },
 	/* DT  WROMAEB    */
 	{ SST(0x3F, 0x09, SS_RDEF,
 	    "Spare deleted") },
 	/* DT  WROMAEBK   */
 	{ SST(0x3F, 0x0A, SS_RDEF,
 	    "Volume set created or modified") },
 	/* DT  WROMAEBK   */
 	{ SST(0x3F, 0x0B, SS_RDEF,
 	    "Volume set deleted") },
 	/* DT  WROMAEBK   */
 	{ SST(0x3F, 0x0C, SS_RDEF,
 	    "Volume set deassigned") },
 	/* DT  WROMAEBK   */
 	{ SST(0x3F, 0x0D, SS_RDEF,
 	    "Volume set reassigned") },
 	/* DTLPWROMAE     */
 	{ SST(0x3F, 0x0E, SS_RDEF | SSQ_RESCAN ,
 	    "Reported LUNs data has changed") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x3F, 0x0F, SS_RDEF,	/* XXX TBD */
 	    "Echo buffer overwritten") },
 	/* DT  WROM  B    */
 	{ SST(0x3F, 0x10, SS_RDEF,	/* XXX TBD */
 	    "Medium loadable") },
 	/* DT  WROM  B    */
 	{ SST(0x3F, 0x11, SS_RDEF,	/* XXX TBD */
 	    "Medium auxiliary memory accessible") },
 	/* DTLPWR MAEBK F */
 	{ SST(0x3F, 0x12, SS_RDEF,	/* XXX TBD */
 	    "iSCSI IP address added") },
 	/* DTLPWR MAEBK F */
 	{ SST(0x3F, 0x13, SS_RDEF,	/* XXX TBD */
 	    "iSCSI IP address removed") },
 	/* DTLPWR MAEBK F */
 	{ SST(0x3F, 0x14, SS_RDEF,	/* XXX TBD */
 	    "iSCSI IP address changed") },
 	/* DTLPWR MAEBK   */
 	{ SST(0x3F, 0x15, SS_RDEF,	/* XXX TBD */
 	    "Inspect referrals sense descriptors") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x3F, 0x16, SS_RDEF,	/* XXX TBD */
 	    "Microcode has been changed without reset") },
 	/* D              */
 	{ SST(0x3F, 0x17, SS_RDEF,	/* XXX TBD */
 	    "Zone transition to full") },
 	/* D              */
 	{ SST(0x40, 0x00, SS_RDEF,
 	    "RAM failure") },		/* deprecated - use 40 NN instead */
 	/* DTLPWROMAEBKVF */
 	{ SST(0x40, 0x80, SS_RDEF,
 	    "Diagnostic failure: ASCQ = Component ID") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x40, 0xFF, SS_RDEF | SSQ_RANGE,
 	    NULL) },			/* Range 0x80->0xFF */
 	/* D              */
 	{ SST(0x41, 0x00, SS_RDEF,
 	    "Data path failure") },	/* deprecated - use 40 NN instead */
 	/* D              */
 	{ SST(0x42, 0x00, SS_RDEF,
 	    "Power-on or self-test failure") },
 					/* deprecated - use 40 NN instead */
 	/* DTLPWROMAEBKVF */
 	{ SST(0x43, 0x00, SS_RDEF,
 	    "Message error") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x44, 0x00, SS_FATAL | EIO,
 	    "Internal target failure") },
 	/* DT P   MAEBKVF */
 	{ SST(0x44, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Persistent reservation information lost") },
 	/* DT        B    */
 	{ SST(0x44, 0x71, SS_RDEF,	/* XXX TBD */
 	    "ATA device failed set features") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x45, 0x00, SS_RDEF,
 	    "Select or reselect failure") },
 	/* DTLPWROM  BK   */
 	{ SST(0x46, 0x00, SS_RDEF,
 	    "Unsuccessful soft reset") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x47, 0x00, SS_RDEF,
 	    "SCSI parity error") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x47, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Data phase CRC error detected") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x47, 0x02, SS_RDEF,	/* XXX TBD */
 	    "SCSI parity error detected during ST data phase") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x47, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Information unit iuCRC error detected") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x47, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Asynchronous information protection error detected") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x47, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Protocol service CRC error") },
 	/* DT     MAEBKVF */
 	{ SST(0x47, 0x06, SS_RDEF,	/* XXX TBD */
 	    "PHY test function in progress") },
 	/* DT PWROMAEBK   */
 	{ SST(0x47, 0x7F, SS_RDEF,	/* XXX TBD */
 	    "Some commands cleared by iSCSI protocol event") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x48, 0x00, SS_RDEF,
 	    "Initiator detected error message received") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x49, 0x00, SS_RDEF,
 	    "Invalid message error") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x4A, 0x00, SS_RDEF,
 	    "Command phase error") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x4B, 0x00, SS_RDEF,
 	    "Data phase error") },
 	/* DT PWROMAEBK   */
 	{ SST(0x4B, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Invalid target port transfer tag received") },
 	/* DT PWROMAEBK   */
 	{ SST(0x4B, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Too much write data") },
 	/* DT PWROMAEBK   */
 	{ SST(0x4B, 0x03, SS_RDEF,	/* XXX TBD */
 	    "ACK/NAK timeout") },
 	/* DT PWROMAEBK   */
 	{ SST(0x4B, 0x04, SS_RDEF,	/* XXX TBD */
 	    "NAK received") },
 	/* DT PWROMAEBK   */
 	{ SST(0x4B, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Data offset error") },
 	/* DT PWROMAEBK   */
 	{ SST(0x4B, 0x06, SS_RDEF,	/* XXX TBD */
 	    "Initiator response timeout") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Connection lost") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x08, SS_RDEF,	/* XXX TBD */
 	    "Data-in buffer overflow - data buffer size") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x09, SS_RDEF,	/* XXX TBD */
 	    "Data-in buffer overflow - data buffer descriptor area") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x0A, SS_RDEF,	/* XXX TBD */
 	    "Data-in buffer error") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x0B, SS_RDEF,	/* XXX TBD */
 	    "Data-out buffer overflow - data buffer size") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x0C, SS_RDEF,	/* XXX TBD */
 	    "Data-out buffer overflow - data buffer descriptor area") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x0D, SS_RDEF,	/* XXX TBD */
 	    "Data-out buffer error") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x0E, SS_RDEF,	/* XXX TBD */
 	    "PCIe fabric error") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x0F, SS_RDEF,	/* XXX TBD */
 	    "PCIe completion timeout") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x10, SS_RDEF,	/* XXX TBD */
 	    "PCIe completer abort") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x11, SS_RDEF,	/* XXX TBD */
 	    "PCIe poisoned TLP received") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x12, SS_RDEF,	/* XXX TBD */
 	    "PCIe ECRC check failed") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x13, SS_RDEF,	/* XXX TBD */
 	    "PCIe unsupported request") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x14, SS_RDEF,	/* XXX TBD */
 	    "PCIe ACS violation") },
 	/* DT PWROMAEBK F */
 	{ SST(0x4B, 0x15, SS_RDEF,	/* XXX TBD */
 	    "PCIe TLP prefix blocket") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x4C, 0x00, SS_RDEF,
 	    "Logical unit failed self-configuration") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x4D, 0x00, SS_RDEF,
 	    "Tagged overlapped commands: ASCQ = Queue tag ID") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x4D, 0xFF, SS_RDEF | SSQ_RANGE,
 	    NULL) },			/* Range 0x00->0xFF */
 	/* DTLPWROMAEBKVF */
 	{ SST(0x4E, 0x00, SS_RDEF,
 	    "Overlapped commands attempted") },
 	/*  T             */
 	{ SST(0x50, 0x00, SS_RDEF,
 	    "Write append error") },
 	/*  T             */
 	{ SST(0x50, 0x01, SS_RDEF,
 	    "Write append position error") },
 	/*  T             */
 	{ SST(0x50, 0x02, SS_RDEF,
 	    "Position error related to timing") },
 	/*  T   RO        */
 	{ SST(0x51, 0x00, SS_RDEF,
 	    "Erase failure") },
 	/*      R         */
 	{ SST(0x51, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Erase failure - incomplete erase operation detected") },
 	/*  T             */
 	{ SST(0x52, 0x00, SS_RDEF,
 	    "Cartridge fault") },
 	/* DTL WROM  BK   */
 	{ SST(0x53, 0x00, SS_RDEF,
 	    "Media load or eject failed") },
 	/*  T             */
 	{ SST(0x53, 0x01, SS_RDEF,
 	    "Unload tape failure") },
 	/* DT  WROM  BK   */
 	{ SST(0x53, 0x02, SS_RDEF,
 	    "Medium removal prevented") },
 	/*        M       */
 	{ SST(0x53, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Medium removal prevented by data transfer element") },
 	/*  T             */
 	{ SST(0x53, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Medium thread or unthread failure") },
 	/*        M       */
 	{ SST(0x53, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Volume identifier invalid") },
 	/*  T             */
 	{ SST(0x53, 0x06, SS_RDEF,	/* XXX TBD */
 	    "Volume identifier missing") },
 	/*        M       */
 	{ SST(0x53, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Duplicate volume identifier") },
 	/*        M       */
 	{ SST(0x53, 0x08, SS_RDEF,	/* XXX TBD */
 	    "Element status unknown") },
 	/*        M       */
 	{ SST(0x53, 0x09, SS_RDEF,	/* XXX TBD */
 	    "Data transfer device error - load failed") },
 	/*        M       */
 	{ SST(0x53, 0x0A, SS_RDEF,	/* XXX TBD */
 	    "Data transfer device error - unload failed") },
 	/*        M       */
 	{ SST(0x53, 0x0B, SS_RDEF,	/* XXX TBD */
 	    "Data transfer device error - unload missing") },
 	/*        M       */
 	{ SST(0x53, 0x0C, SS_RDEF,	/* XXX TBD */
 	    "Data transfer device error - eject failed") },
 	/*        M       */
 	{ SST(0x53, 0x0D, SS_RDEF,	/* XXX TBD */
 	    "Data transfer device error - library communication failed") },
 	/*    P           */
 	{ SST(0x54, 0x00, SS_RDEF,
 	    "SCSI to host system interface failure") },
 	/*    P           */
 	{ SST(0x55, 0x00, SS_RDEF,
 	    "System resource failure") },
 	/* D     O   BK   */
 	{ SST(0x55, 0x01, SS_FATAL | ENOSPC,
 	    "System buffer full") },
 	/* DTLPWROMAE K   */
 	{ SST(0x55, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Insufficient reservation resources") },
 	/* DTLPWROMAE K   */
 	{ SST(0x55, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Insufficient resources") },
 	/* DTLPWROMAE K   */
 	{ SST(0x55, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Insufficient registration resources") },
 	/* DT PWROMAEBK   */
 	{ SST(0x55, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Insufficient access control resources") },
 	/* DT  WROM  B    */
 	{ SST(0x55, 0x06, SS_RDEF,	/* XXX TBD */
 	    "Auxiliary memory out of space") },
 	/*              F */
 	{ SST(0x55, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Quota error") },
 	/*  T             */
 	{ SST(0x55, 0x08, SS_RDEF,	/* XXX TBD */
 	    "Maximum number of supplemental decryption keys exceeded") },
 	/*        M       */
 	{ SST(0x55, 0x09, SS_RDEF,	/* XXX TBD */
 	    "Medium auxiliary memory not accessible") },
 	/*        M       */
 	{ SST(0x55, 0x0A, SS_RDEF,	/* XXX TBD */
 	    "Data currently unavailable") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x55, 0x0B, SS_RDEF,	/* XXX TBD */
 	    "Insufficient power for operation") },
 	/* DT P      B    */
 	{ SST(0x55, 0x0C, SS_RDEF,	/* XXX TBD */
 	    "Insufficient resources to create ROD") },
 	/* DT P      B    */
 	{ SST(0x55, 0x0D, SS_RDEF,	/* XXX TBD */
 	    "Insufficient resources to create ROD token") },
 	/* D              */
 	{ SST(0x55, 0x0E, SS_RDEF,	/* XXX TBD */
 	    "Insufficient zone resources") },
 	/* D              */
 	{ SST(0x55, 0x0F, SS_RDEF,	/* XXX TBD */
 	    "Insufficient zone resources to complete write") },
 	/* D              */
 	{ SST(0x55, 0x10, SS_RDEF,	/* XXX TBD */
 	    "Maximum number of streams open") },
 	/*      R         */
 	{ SST(0x57, 0x00, SS_RDEF,
 	    "Unable to recover table-of-contents") },
 	/*       O        */
 	{ SST(0x58, 0x00, SS_RDEF,
 	    "Generation does not exist") },
 	/*       O        */
 	{ SST(0x59, 0x00, SS_RDEF,
 	    "Updated block read") },
 	/* DTLPWRO   BK   */
 	{ SST(0x5A, 0x00, SS_RDEF,
 	    "Operator request or state change input") },
 	/* DT  WROM  BK   */
 	{ SST(0x5A, 0x01, SS_RDEF,
 	    "Operator medium removal request") },
 	/* DT  WRO A BK   */
 	{ SST(0x5A, 0x02, SS_RDEF,
 	    "Operator selected write protect") },
 	/* DT  WRO A BK   */
 	{ SST(0x5A, 0x03, SS_RDEF,
 	    "Operator selected write permit") },
 	/* DTLPWROM   K   */
 	{ SST(0x5B, 0x00, SS_RDEF,
 	    "Log exception") },
 	/* DTLPWROM   K   */
 	{ SST(0x5B, 0x01, SS_RDEF,
 	    "Threshold condition met") },
 	/* DTLPWROM   K   */
 	{ SST(0x5B, 0x02, SS_RDEF,
 	    "Log counter at maximum") },
 	/* DTLPWROM   K   */
 	{ SST(0x5B, 0x03, SS_RDEF,
 	    "Log list codes exhausted") },
 	/* D     O        */
 	{ SST(0x5C, 0x00, SS_RDEF,
 	    "RPL status change") },
 	/* D     O        */
 	{ SST(0x5C, 0x01, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindles synchronized") },
 	/* D     O        */
 	{ SST(0x5C, 0x02, SS_RDEF,
 	    "Spindles not synchronized") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x5D, 0x00, SS_NOP | SSQ_PRINT_SENSE,
 	    "Failure prediction threshold exceeded") },
 	/*      R    B    */
 	{ SST(0x5D, 0x01, SS_NOP | SSQ_PRINT_SENSE,
 	    "Media failure prediction threshold exceeded") },
 	/*      R         */
 	{ SST(0x5D, 0x02, SS_NOP | SSQ_PRINT_SENSE,
 	    "Logical unit failure prediction threshold exceeded") },
 	/*      R         */
 	{ SST(0x5D, 0x03, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spare area exhaustion prediction threshold exceeded") },
 	/* D         B    */
 	{ SST(0x5D, 0x10, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure general hard drive failure") },
 	/* D         B    */
 	{ SST(0x5D, 0x11, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure drive error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x12, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure data error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x13, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure seek error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x14, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure too many block reassigns") },
 	/* D         B    */
 	{ SST(0x5D, 0x15, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure access times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x16, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure start unit times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x17, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure channel parametrics") },
 	/* D         B    */
 	{ SST(0x5D, 0x18, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure controller detected") },
 	/* D         B    */
 	{ SST(0x5D, 0x19, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure throughput performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x1A, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure seek time performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x1B, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure spin-up retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x1C, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure drive calibration retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x1D, SS_NOP | SSQ_PRINT_SENSE,
 	    "Hardware impending failure power loss protection circuit") },
 	/* D         B    */
 	{ SST(0x5D, 0x20, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure general hard drive failure") },
 	/* D         B    */
 	{ SST(0x5D, 0x21, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure drive error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x22, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure data error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x23, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure seek error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x24, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure too many block reassigns") },
 	/* D         B    */
 	{ SST(0x5D, 0x25, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure access times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x26, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure start unit times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x27, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure channel parametrics") },
 	/* D         B    */
 	{ SST(0x5D, 0x28, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure controller detected") },
 	/* D         B    */
 	{ SST(0x5D, 0x29, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure throughput performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x2A, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure seek time performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x2B, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure spin-up retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x2C, SS_NOP | SSQ_PRINT_SENSE,
 	    "Controller impending failure drive calibration retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x30, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure general hard drive failure") },
 	/* D         B    */
 	{ SST(0x5D, 0x31, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure drive error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x32, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure data error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x33, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure seek error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x34, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure too many block reassigns") },
 	/* D         B    */
 	{ SST(0x5D, 0x35, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure access times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x36, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure start unit times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x37, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure channel parametrics") },
 	/* D         B    */
 	{ SST(0x5D, 0x38, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure controller detected") },
 	/* D         B    */
 	{ SST(0x5D, 0x39, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure throughput performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x3A, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure seek time performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x3B, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure spin-up retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x3C, SS_NOP | SSQ_PRINT_SENSE,
 	    "Data channel impending failure drive calibration retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x40, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure general hard drive failure") },
 	/* D         B    */
 	{ SST(0x5D, 0x41, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure drive error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x42, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure data error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x43, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure seek error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x44, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure too many block reassigns") },
 	/* D         B    */
 	{ SST(0x5D, 0x45, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure access times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x46, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure start unit times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x47, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure channel parametrics") },
 	/* D         B    */
 	{ SST(0x5D, 0x48, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure controller detected") },
 	/* D         B    */
 	{ SST(0x5D, 0x49, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure throughput performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x4A, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure seek time performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x4B, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure spin-up retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x4C, SS_NOP | SSQ_PRINT_SENSE,
 	    "Servo impending failure drive calibration retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x50, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure general hard drive failure") },
 	/* D         B    */
 	{ SST(0x5D, 0x51, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure drive error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x52, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure data error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x53, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure seek error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x54, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure too many block reassigns") },
 	/* D         B    */
 	{ SST(0x5D, 0x55, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure access times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x56, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure start unit times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x57, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure channel parametrics") },
 	/* D         B    */
 	{ SST(0x5D, 0x58, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure controller detected") },
 	/* D         B    */
 	{ SST(0x5D, 0x59, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure throughput performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x5A, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure seek time performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x5B, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure spin-up retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x5C, SS_NOP | SSQ_PRINT_SENSE,
 	    "Spindle impending failure drive calibration retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x60, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure general hard drive failure") },
 	/* D         B    */
 	{ SST(0x5D, 0x61, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure drive error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x62, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure data error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x63, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure seek error rate too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x64, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure too many block reassigns") },
 	/* D         B    */
 	{ SST(0x5D, 0x65, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure access times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x66, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure start unit times too high") },
 	/* D         B    */
 	{ SST(0x5D, 0x67, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure channel parametrics") },
 	/* D         B    */
 	{ SST(0x5D, 0x68, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure controller detected") },
 	/* D         B    */
 	{ SST(0x5D, 0x69, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure throughput performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x6A, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure seek time performance") },
 	/* D         B    */
 	{ SST(0x5D, 0x6B, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure spin-up retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x6C, SS_NOP | SSQ_PRINT_SENSE,
 	    "Firmware impending failure drive calibration retry count") },
 	/* D         B    */
 	{ SST(0x5D, 0x73, SS_NOP | SSQ_PRINT_SENSE,
 	    "Media impending failure endurance limit met") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x5D, 0xFF, SS_NOP | SSQ_PRINT_SENSE,
 	    "Failure prediction threshold exceeded (false)") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x00, SS_RDEF,
 	    "Low power condition on") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x01, SS_RDEF,
 	    "Idle condition activated by timer") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x02, SS_RDEF,
 	    "Standby condition activated by timer") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x03, SS_RDEF,
 	    "Idle condition activated by command") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x04, SS_RDEF,
 	    "Standby condition activated by command") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x05, SS_RDEF,
 	    "Idle-B condition activated by timer") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x06, SS_RDEF,
 	    "Idle-B condition activated by command") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x07, SS_RDEF,
 	    "Idle-C condition activated by timer") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x08, SS_RDEF,
 	    "Idle-C condition activated by command") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x09, SS_RDEF,
 	    "Standby-Y condition activated by timer") },
 	/* DTLPWRO A  K   */
 	{ SST(0x5E, 0x0A, SS_RDEF,
 	    "Standby-Y condition activated by command") },
 	/*           B    */
 	{ SST(0x5E, 0x41, SS_RDEF,	/* XXX TBD */
 	    "Power state change to active") },
 	/*           B    */
 	{ SST(0x5E, 0x42, SS_RDEF,	/* XXX TBD */
 	    "Power state change to idle") },
 	/*           B    */
 	{ SST(0x5E, 0x43, SS_RDEF,	/* XXX TBD */
 	    "Power state change to standby") },
 	/*           B    */
 	{ SST(0x5E, 0x45, SS_RDEF,	/* XXX TBD */
 	    "Power state change to sleep") },
 	/*           BK   */
 	{ SST(0x5E, 0x47, SS_RDEF,	/* XXX TBD */
 	    "Power state change to device control") },
 	/*                */
 	{ SST(0x60, 0x00, SS_RDEF,
 	    "Lamp failure") },
 	/*                */
 	{ SST(0x61, 0x00, SS_RDEF,
 	    "Video acquisition error") },
 	/*                */
 	{ SST(0x61, 0x01, SS_RDEF,
 	    "Unable to acquire video") },
 	/*                */
 	{ SST(0x61, 0x02, SS_RDEF,
 	    "Out of focus") },
 	/*                */
 	{ SST(0x62, 0x00, SS_RDEF,
 	    "Scan head positioning error") },
 	/*      R         */
 	{ SST(0x63, 0x00, SS_RDEF,
 	    "End of user area encountered on this track") },
 	/*      R         */
 	{ SST(0x63, 0x01, SS_FATAL | ENOSPC,
 	    "Packet does not fit in available space") },
 	/*      R         */
 	{ SST(0x64, 0x00, SS_FATAL | ENXIO,
 	    "Illegal mode for this track") },
 	/*      R         */
 	{ SST(0x64, 0x01, SS_RDEF,
 	    "Invalid packet size") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x65, 0x00, SS_RDEF,
 	    "Voltage fault") },
 	/*                */
 	{ SST(0x66, 0x00, SS_RDEF,
 	    "Automatic document feeder cover up") },
 	/*                */
 	{ SST(0x66, 0x01, SS_RDEF,
 	    "Automatic document feeder lift up") },
 	/*                */
 	{ SST(0x66, 0x02, SS_RDEF,
 	    "Document jam in automatic document feeder") },
 	/*                */
 	{ SST(0x66, 0x03, SS_RDEF,
 	    "Document miss feed automatic in document feeder") },
 	/*         A      */
 	{ SST(0x67, 0x00, SS_RDEF,
 	    "Configuration failure") },
 	/*         A      */
 	{ SST(0x67, 0x01, SS_RDEF,
 	    "Configuration of incapable logical units failed") },
 	/*         A      */
 	{ SST(0x67, 0x02, SS_RDEF,
 	    "Add logical unit failed") },
 	/*         A      */
 	{ SST(0x67, 0x03, SS_RDEF,
 	    "Modification of logical unit failed") },
 	/*         A      */
 	{ SST(0x67, 0x04, SS_RDEF,
 	    "Exchange of logical unit failed") },
 	/*         A      */
 	{ SST(0x67, 0x05, SS_RDEF,
 	    "Remove of logical unit failed") },
 	/*         A      */
 	{ SST(0x67, 0x06, SS_RDEF,
 	    "Attachment of logical unit failed") },
 	/*         A      */
 	{ SST(0x67, 0x07, SS_RDEF,
 	    "Creation of logical unit failed") },
 	/*         A      */
 	{ SST(0x67, 0x08, SS_RDEF,	/* XXX TBD */
 	    "Assign failure occurred") },
 	/*         A      */
 	{ SST(0x67, 0x09, SS_RDEF,	/* XXX TBD */
 	    "Multiply assigned logical unit") },
 	/* DTLPWROMAEBKVF */
 	{ SST(0x67, 0x0A, SS_RDEF,	/* XXX TBD */
 	    "Set target port groups command failed") },
 	/* DT        B    */
 	{ SST(0x67, 0x0B, SS_RDEF,	/* XXX TBD */
 	    "ATA device feature not enabled") },
 	/*         A      */
 	{ SST(0x68, 0x00, SS_RDEF,
 	    "Logical unit not configured") },
 	/* D              */
 	{ SST(0x68, 0x01, SS_RDEF,
 	    "Subsidiary logical unit not configured") },
 	/*         A      */
 	{ SST(0x69, 0x00, SS_RDEF,
 	    "Data loss on logical unit") },
 	/*         A      */
 	{ SST(0x69, 0x01, SS_RDEF,
 	    "Multiple logical unit failures") },
 	/*         A      */
 	{ SST(0x69, 0x02, SS_RDEF,
 	    "Parity/data mismatch") },
 	/*         A      */
 	{ SST(0x6A, 0x00, SS_RDEF,
 	    "Informational, refer to log") },
 	/*         A      */
 	{ SST(0x6B, 0x00, SS_RDEF,
 	    "State change has occurred") },
 	/*         A      */
 	{ SST(0x6B, 0x01, SS_RDEF,
 	    "Redundancy level got better") },
 	/*         A      */
 	{ SST(0x6B, 0x02, SS_RDEF,
 	    "Redundancy level got worse") },
 	/*         A      */
 	{ SST(0x6C, 0x00, SS_RDEF,
 	    "Rebuild failure occurred") },
 	/*         A      */
 	{ SST(0x6D, 0x00, SS_RDEF,
 	    "Recalculate failure occurred") },
 	/*         A      */
 	{ SST(0x6E, 0x00, SS_RDEF,
 	    "Command to logical unit failed") },
 	/*      R         */
 	{ SST(0x6F, 0x00, SS_RDEF,	/* XXX TBD */
 	    "Copy protection key exchange failure - authentication failure") },
 	/*      R         */
 	{ SST(0x6F, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Copy protection key exchange failure - key not present") },
 	/*      R         */
 	{ SST(0x6F, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Copy protection key exchange failure - key not established") },
 	/*      R         */
 	{ SST(0x6F, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Read of scrambled sector without authentication") },
 	/*      R         */
 	{ SST(0x6F, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Media region code is mismatched to logical unit region") },
 	/*      R         */
 	{ SST(0x6F, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Drive region must be permanent/region reset count error") },
 	/*      R         */
 	{ SST(0x6F, 0x06, SS_RDEF,	/* XXX TBD */
 	    "Insufficient block count for binding NONCE recording") },
 	/*      R         */
 	{ SST(0x6F, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Conflict in binding NONCE recording") },
 	/*  T             */
 	{ SST(0x70, 0x00, SS_RDEF,
 	    "Decompression exception short: ASCQ = Algorithm ID") },
 	/*  T             */
 	{ SST(0x70, 0xFF, SS_RDEF | SSQ_RANGE,
 	    NULL) },			/* Range 0x00 -> 0xFF */
 	/*  T             */
 	{ SST(0x71, 0x00, SS_RDEF,
 	    "Decompression exception long: ASCQ = Algorithm ID") },
 	/*  T             */
 	{ SST(0x71, 0xFF, SS_RDEF | SSQ_RANGE,
 	    NULL) },			/* Range 0x00 -> 0xFF */
 	/*      R         */
 	{ SST(0x72, 0x00, SS_RDEF,
 	    "Session fixation error") },
 	/*      R         */
 	{ SST(0x72, 0x01, SS_RDEF,
 	    "Session fixation error writing lead-in") },
 	/*      R         */
 	{ SST(0x72, 0x02, SS_RDEF,
 	    "Session fixation error writing lead-out") },
 	/*      R         */
 	{ SST(0x72, 0x03, SS_RDEF,
 	    "Session fixation error - incomplete track in session") },
 	/*      R         */
 	{ SST(0x72, 0x04, SS_RDEF,
 	    "Empty or partially written reserved track") },
 	/*      R         */
 	{ SST(0x72, 0x05, SS_RDEF,	/* XXX TBD */
 	    "No more track reservations allowed") },
 	/*      R         */
 	{ SST(0x72, 0x06, SS_RDEF,	/* XXX TBD */
 	    "RMZ extension is not allowed") },
 	/*      R         */
 	{ SST(0x72, 0x07, SS_RDEF,	/* XXX TBD */
 	    "No more test zone extensions are allowed") },
 	/*      R         */
 	{ SST(0x73, 0x00, SS_RDEF,
 	    "CD control error") },
 	/*      R         */
 	{ SST(0x73, 0x01, SS_RDEF,
 	    "Power calibration area almost full") },
 	/*      R         */
 	{ SST(0x73, 0x02, SS_FATAL | ENOSPC,
 	    "Power calibration area is full") },
 	/*      R         */
 	{ SST(0x73, 0x03, SS_RDEF,
 	    "Power calibration area error") },
 	/*      R         */
 	{ SST(0x73, 0x04, SS_RDEF,
 	    "Program memory area update failure") },
 	/*      R         */
 	{ SST(0x73, 0x05, SS_RDEF,
 	    "Program memory area is full") },
 	/*      R         */
 	{ SST(0x73, 0x06, SS_RDEF,	/* XXX TBD */
 	    "RMA/PMA is almost full") },
 	/*      R         */
 	{ SST(0x73, 0x10, SS_RDEF,	/* XXX TBD */
 	    "Current power calibration area almost full") },
 	/*      R         */
 	{ SST(0x73, 0x11, SS_RDEF,	/* XXX TBD */
 	    "Current power calibration area is full") },
 	/*      R         */
 	{ SST(0x73, 0x17, SS_RDEF,	/* XXX TBD */
 	    "RDZ is full") },
 	/*  T             */
 	{ SST(0x74, 0x00, SS_RDEF,	/* XXX TBD */
 	    "Security error") },
 	/*  T             */
 	{ SST(0x74, 0x01, SS_RDEF,	/* XXX TBD */
 	    "Unable to decrypt data") },
 	/*  T             */
 	{ SST(0x74, 0x02, SS_RDEF,	/* XXX TBD */
 	    "Unencrypted data encountered while decrypting") },
 	/*  T             */
 	{ SST(0x74, 0x03, SS_RDEF,	/* XXX TBD */
 	    "Incorrect data encryption key") },
 	/*  T             */
 	{ SST(0x74, 0x04, SS_RDEF,	/* XXX TBD */
 	    "Cryptographic integrity validation failed") },
 	/*  T             */
 	{ SST(0x74, 0x05, SS_RDEF,	/* XXX TBD */
 	    "Error decrypting data") },
 	/*  T             */
 	{ SST(0x74, 0x06, SS_RDEF,	/* XXX TBD */
 	    "Unknown signature verification key") },
 	/*  T             */
 	{ SST(0x74, 0x07, SS_RDEF,	/* XXX TBD */
 	    "Encryption parameters not useable") },
 	/* DT   R M E  VF */
 	{ SST(0x74, 0x08, SS_RDEF,	/* XXX TBD */
 	    "Digital signature validation failure") },
 	/*  T             */
 	{ SST(0x74, 0x09, SS_RDEF,	/* XXX TBD */
 	    "Encryption mode mismatch on read") },
 	/*  T             */
 	{ SST(0x74, 0x0A, SS_RDEF,	/* XXX TBD */
 	    "Encrypted block not raw read enabled") },
 	/*  T             */
 	{ SST(0x74, 0x0B, SS_RDEF,	/* XXX TBD */
 	    "Incorrect encryption parameters") },
 	/* DT   R MAEBKV  */
 	{ SST(0x74, 0x0C, SS_RDEF,	/* XXX TBD */
 	    "Unable to decrypt parameter list") },
 	/*  T             */
 	{ SST(0x74, 0x0D, SS_RDEF,	/* XXX TBD */
 	    "Encryption algorithm disabled") },
 	/* DT   R MAEBKV  */
 	{ SST(0x74, 0x10, SS_RDEF,	/* XXX TBD */
 	    "SA creation parameter value invalid") },
 	/* DT   R MAEBKV  */
 	{ SST(0x74, 0x11, SS_RDEF,	/* XXX TBD */
 	    "SA creation parameter value rejected") },
 	/* DT   R MAEBKV  */
 	{ SST(0x74, 0x12, SS_RDEF,	/* XXX TBD */
 	    "Invalid SA usage") },
 	/*  T             */
 	{ SST(0x74, 0x21, SS_RDEF,	/* XXX TBD */
 	    "Data encryption configuration prevented") },
 	/* DT   R MAEBKV  */
 	{ SST(0x74, 0x30, SS_RDEF,	/* XXX TBD */
 	    "SA creation parameter not supported") },
 	/* DT   R MAEBKV  */
 	{ SST(0x74, 0x40, SS_RDEF,	/* XXX TBD */
 	    "Authentication failed") },
 	/*             V  */
 	{ SST(0x74, 0x61, SS_RDEF,	/* XXX TBD */
 	    "External data encryption key manager access error") },
 	/*             V  */
 	{ SST(0x74, 0x62, SS_RDEF,	/* XXX TBD */
 	    "External data encryption key manager error") },
 	/*             V  */
 	{ SST(0x74, 0x63, SS_RDEF,	/* XXX TBD */
 	    "External data encryption key not found") },
 	/*             V  */
 	{ SST(0x74, 0x64, SS_RDEF,	/* XXX TBD */
 	    "External data encryption request not authorized") },
 	/*  T             */
 	{ SST(0x74, 0x6E, SS_RDEF,	/* XXX TBD */
 	    "External data encryption control timeout") },
 	/*  T             */
 	{ SST(0x74, 0x6F, SS_RDEF,	/* XXX TBD */
 	    "External data encryption control error") },
 	/* DT   R M E  V  */
 	{ SST(0x74, 0x71, SS_FATAL | EACCES,
 	    "Logical unit access not authorized") },
 	/* D              */
 	{ SST(0x74, 0x79, SS_FATAL | EACCES,
 	    "Security conflict in translated device") }
 };
 
 /* Number of elements in asc_table, as computed by nitems(). */
 const u_int asc_table_size = nitems(asc_table);
 
 /*
  * Lookup key for the ASC/ASCQ tables.  fetchtableentries() fills one in
  * and hands it to bsearch(), where ascentrycomp() compares it against
  * struct asc_table_entry members.
  */
 struct asc_key
 {
 	int asc;	/* additional sense code to look up */
 	int ascq;	/* additional sense code qualifier to look up */
 };
 
 /*
  * bsearch() comparison callback for the ASC/ASCQ tables.
  *
  * key points at a struct asc_key and member at a struct asc_table_entry.
  * Returns 0 on a match, -1 when the key sorts before the member and 1
  * when it sorts after it.  A member flagged SSQ_RANGE additionally
  * matches every ASCQ from the preceding table entry's ASCQ up to its own,
  * which is why the entry before it is dereferenced here.
  */
 static int
 ascentrycomp(const void *key, const void *member)
 {
 	const struct asc_key *wanted;
 	const struct asc_table_entry *entry;
 
 	wanted = (const struct asc_key *)key;
 	entry = (const struct asc_table_entry *)member;
 
 	/* The ASC is the primary sort key. */
 	if (wanted->asc < entry->asc)
 		return (-1);
 	if (wanted->asc > entry->asc)
 		return (1);
 
 	/* Same ASC; order by ASCQ. */
 	if (wanted->ascq > entry->ascq)
 		return (1);
 	if (wanted->ascq == entry->ascq)
 		return (0);
 
 	/*
 	 * The wanted ASCQ is below this entry's.  That is still a match
 	 * when the entry closes a range that the previous entry opened
 	 * and the wanted ASCQ is at or above the start of that range.
 	 */
 	if ((entry->action & SSQ_RANGE) != 0 &&
 	    wanted->ascq >= (entry - 1)->ascq)
 		return (0);
 
 	return (-1);
 }
 
 /*
  * bsearch() comparison callback for the sense key tables.
  *
  * key points at an int sense key and member at a
  * struct sense_key_table_entry.  Returns -1, 0 or 1 when the key is
  * smaller than, equal to or greater than the member's sense key.
  */
 static int
 senseentrycomp(const void *key, const void *member)
 {
 	const struct sense_key_table_entry *entry;
 	int wanted;
 
 	entry = (const struct sense_key_table_entry *)member;
 	wanted = *((const int *)key);
 
 	if (wanted < entry->sense_key)
 		return (-1);
 	if (wanted > entry->sense_key)
 		return (1);
 	return (0);
 }
 
 static void
 fetchtableentries(int sense_key, int asc, int ascq,
 		  struct scsi_inquiry_data *inq_data,
 		  const struct sense_key_table_entry **sense_entry,
 		  const struct asc_table_entry **asc_entry)
 {
 	caddr_t match;
 	const struct asc_table_entry *asc_tables[2];
 	const struct sense_key_table_entry *sense_tables[2];
 	struct asc_key asc_ascq;
 	size_t asc_tables_size[2];
 	size_t sense_tables_size[2];
 	int num_asc_tables;
 	int num_sense_tables;
 	int i;
 
 	/* Default to failure */
 	*sense_entry = NULL;
 	*asc_entry = NULL;
 	match = NULL;
 	if (inq_data != NULL)
 		match = cam_quirkmatch((caddr_t)inq_data,
 				       (caddr_t)sense_quirk_table,
 				       sense_quirk_table_size,
 				       sizeof(*sense_quirk_table),
 				       scsi_inquiry_match);
 
 	if (match != NULL) {
 		struct scsi_sense_quirk_entry *quirk;
 
 		quirk = (struct scsi_sense_quirk_entry *)match;
 		asc_tables[0] = quirk->asc_info;
 		asc_tables_size[0] = quirk->num_ascs;
 		asc_tables[1] = asc_table;
 		asc_tables_size[1] = asc_table_size;
 		num_asc_tables = 2;
 		sense_tables[0] = quirk->sense_key_info;
 		sense_tables_size[0] = quirk->num_sense_keys;
 		sense_tables[1] = sense_key_table;
 		sense_tables_size[1] = nitems(sense_key_table);
 		num_sense_tables = 2;
 	} else {
 		asc_tables[0] = asc_table;
 		asc_tables_size[0] = asc_table_size;
 		num_asc_tables = 1;
 		sense_tables[0] = sense_key_table;
 		sense_tables_size[0] = nitems(sense_key_table);
 		num_sense_tables = 1;
 	}
 
 	asc_ascq.asc = asc;
 	asc_ascq.ascq = ascq;
 	for (i = 0; i < num_asc_tables; i++) {
 		void *found_entry;
 
 		found_entry = bsearch(&asc_ascq, asc_tables[i],
 				      asc_tables_size[i],
 				      sizeof(**asc_tables),
 				      ascentrycomp);
 
 		if (found_entry) {
 			*asc_entry = (struct asc_table_entry *)found_entry;
 			break;
 		}
 	}
 
 	for (i = 0; i < num_sense_tables; i++) {
 		void *found_entry;
 
 		found_entry = bsearch(&sense_key, sense_tables[i],
 				      sense_tables_size[i],
 				      sizeof(**sense_tables),
 				      senseentrycomp);
 
 		if (found_entry) {
 			*sense_entry =
 			    (struct sense_key_table_entry *)found_entry;
 			break;
 		}
 	}
 }
 
 void
 scsi_sense_desc(int sense_key, int asc, int ascq,
 		struct scsi_inquiry_data *inq_data,
 		const char **sense_key_desc, const char **asc_desc)
 {
 	const struct asc_table_entry *asc_entry;
 	const struct sense_key_table_entry *sense_entry;
 
 	fetchtableentries(sense_key, asc, ascq,
 			  inq_data,
 			  &sense_entry,
 			  &asc_entry);
 
 	if (sense_entry != NULL)
 		*sense_key_desc = sense_entry->desc;
 	else
 		*sense_key_desc = "Invalid Sense Key";
 
 	if (asc_entry != NULL)
 		*asc_desc = asc_entry->desc;
 	else if (asc >= 0x80 && asc <= 0xff)
 		*asc_desc = "Vendor Specific ASC";
 	else if (ascq >= 0x80 && ascq <= 0xff)
 		*asc_desc = "Vendor Specific ASCQ";
 	else
 		*asc_desc = "Reserved ASC/ASCQ pair";
 }
 
 /*
  * Work out what to do about the sense data attached to a SCSI I/O CCB.
  *
  * The action comes from the ASC/ASCQ table entry when one matches (other
  * than the 0/0 pair), otherwise from the sense key table entry, otherwise
  * a counted retry with sense printing is used.  The result is then
  * adjusted according to the sense key and to the SF_* bits in sense_flags.
  * inq_data is handed to the table lookup to select quirk tables.
  */
 scsi_sense_action
 scsi_error_action(struct ccb_scsiio *csio, struct scsi_inquiry_data *inq_data,
 		  u_int32_t sense_flags)
 {
 	const struct asc_table_entry *asc_match;
 	const struct sense_key_table_entry *key_match;
 	scsi_sense_action action;
 	int error_code, sense_key, asc, ascq;
 
 	if (!scsi_extract_sense_ccb((union ccb *)csio,
 	    &error_code, &sense_key, &asc, &ascq)) {
 		/* No usable sense data could be extracted. */
 		action = SS_RETRY | SSQ_DECREMENT_COUNT | SSQ_PRINT_SENSE | EIO;
 	} else if (error_code == SSD_DEFERRED_ERROR ||
 	    error_code == SSD_DESC_DEFERRED_ERROR) {
 		/*
 		 * XXX dufault@FreeBSD.org
 		 * A deferred error does not belong to the command this
 		 * sense data came back with; it reports a failure of a
 		 * command that already completed with GOOD status (see
 		 * SCSI2 8.2.14.2).
 		 *
 		 * That section reads as if the current command has been
 		 * cancelled, so things should be cleaned up (hopefully
 		 * recovering any lost data) and the current command
 		 * retried.  The two easy choices are both wrong:
 		 *
 		 * 1. Drop through (the previous behavior), treating the
 		 *    error as if it were for the current command and
 		 *    stopping the current command.
 		 *
 		 * 2. Issue a retry (the present behavior), hopefully
 		 *    recovering the current transfer while ignoring the
 		 *    fact that a command was dropped.
 		 *
 		 * This should probably be handled in a device specific
 		 * sense handler or punted up to a user mode daemon.
 		 */
 		action = SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE;
 	} else {
 		fetchtableentries(sense_key, asc, ascq, inq_data,
 		    &key_match, &asc_match);
 
 		/*
 		 * The 'No additional Sense' pair (0,0) carries no useful
 		 * action of its own, so the sense key decides in that case.
 		 */
 		if (asc_match != NULL && (asc != 0 || ascq != 0))
 			action = asc_match->action;
 		else if (key_match != NULL)
 			action = key_match->action;
 		else
 			action = SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE;
 
 		if (sense_key == SSD_KEY_RECOVERED_ERROR) {
 			/*
 			 * The command worked, but the device wants it
 			 * known that recovery was needed: do nothing
 			 * beyond printing the sense data.
 			 */
 			action &= ~(SS_MASK|SSQ_MASK|SS_ERRMASK);
 			action |= SS_NOP|SSQ_PRINT_SENSE;
 		} else if (sense_key == SSD_KEY_ILLEGAL_REQUEST) {
 			/* The caller may ask for quiet illegal requests. */
 			if ((sense_flags & SF_QUIET_IR) != 0)
 				action &= ~SSQ_PRINT_SENSE;
 		} else if (sense_key == SSD_KEY_UNIT_ATTENTION) {
 			/* Optionally turn a failing UA into a retry. */
 			if ((sense_flags & SF_RETRY_UA) != 0 &&
 			    (action & SS_MASK) == SS_FAIL) {
 				action &= ~(SS_MASK|SSQ_MASK);
 				action |= SS_RETRY|SSQ_DECREMENT_COUNT|
 				    SSQ_PRINT_SENSE;
 			}
 			action |= SSQ_UA;
 		}
 	}
 
 	/* Recovery or retry may be forbidden by the caller. */
 	if (((action & SS_MASK) >= SS_START &&
 	    (sense_flags & SF_NO_RECOVERY)) ||
 	    ((action & SS_MASK) == SS_RETRY &&
 	    (sense_flags & SF_NO_RETRY))) {
 		action &= ~SS_MASK;
 		action |= SS_FAIL;
 	}
 
 	/* SF_PRINT_ALWAYS takes precedence over SF_NO_PRINT. */
 	if ((sense_flags & SF_PRINT_ALWAYS) != 0)
 		action |= SSQ_PRINT_SENSE;
 	else if ((sense_flags & SF_NO_PRINT) != 0)
 		action &= ~SSQ_PRINT_SENSE;
 
 	return (action);
 }
 
 /*
  * Format a CDB as hex bytes into the caller's buffer.
  *
  * cdb_string/len describe the destination buffer, which backs a fixed
  * length sbuf filled in by scsi_cdb_sbuf().  Returns the sbuf data on
  * success, or an empty string literal when len is 0 or sbuf_finish()
  * fails with anything other than ENOMEM.
  */
 char *
 scsi_cdb_string(u_int8_t *cdb_ptr, char *cdb_string, size_t len)
 {
 	struct sbuf sb;
 	int error;
 
 	if (len == 0)
 		return ("");
 
 	sbuf_new(&sb, cdb_string, len, SBUF_FIXEDLEN);
 	scsi_cdb_sbuf(cdb_ptr, &sb);
 	error = sbuf_finish(&sb);
 
 	/* ENOMEM only says the fixed buffer filled up; that is fine. */
 	if (error == 0 || error == ENOMEM)
 		return (sbuf_data(&sb));
 
 	return ("");
 }
 
 /*
  * Append the bytes of a CDB to sb as two-digit hex values, each followed
  * by a space.  Nothing is written when cdb_ptr is NULL.
  *
  * The CDB length is derived from the opcode's group code (its top three
  * bits), per the SCSI-3 draft spec (T10/1157D revision 0.3).
  */
 void
 scsi_cdb_sbuf(u_int8_t *cdb_ptr, struct sbuf *sb)
 {
 	/* Bytes to print, indexed by the opcode group code. */
 	static const u_int8_t group_cdb_len[8] = {
 		6,	/* Group 0:  six byte commands */
 		10,	/* Group 1:  ten byte commands */
 		10,	/* Group 2:  ten byte commands */
 		1,	/* Group 3:  reserved, opcode only */
 		16,	/* Group 4:  sixteen byte commands */
 		12,	/* Group 5:  twelve byte commands */
 		1,	/* Group 6:  vendor specific, opcode only */
 		1	/* Group 7:  vendor specific, opcode only */
 	};
 	u_int8_t nbytes;
 	int pos;
 
 	if (cdb_ptr == NULL)
 		return;
 
 	nbytes = group_cdb_len[(*cdb_ptr >> 5) & 0x7];
 	for (pos = 0; pos < nbytes; pos++)
 		sbuf_printf(sb, "%02hhx ", cdb_ptr[pos]);
 }
 
 /*
  * Return a printable name for the SCSI status stored in csio.
  *
  * Known status codes map to string literals.  Any other value is
  * formatted into a single static buffer shared by all callers, so that
  * result is overwritten by the next call that takes the same path.
  */
 const char *
 scsi_status_string(struct ccb_scsiio *csio)
 {
 	const char *name;
 
 	switch(csio->scsi_status) {
 	case SCSI_STATUS_OK:
 		name = "OK";
 		break;
 	case SCSI_STATUS_CHECK_COND:
 		name = "Check Condition";
 		break;
 	case SCSI_STATUS_BUSY:
 		name = "Busy";
 		break;
 	case SCSI_STATUS_INTERMED:
 		name = "Intermediate";
 		break;
 	case SCSI_STATUS_INTERMED_COND_MET:
 		name = "Intermediate-Condition Met";
 		break;
 	case SCSI_STATUS_RESERV_CONFLICT:
 		name = "Reservation Conflict";
 		break;
 	case SCSI_STATUS_CMD_TERMINATED:
 		name = "Command Terminated";
 		break;
 	case SCSI_STATUS_QUEUE_FULL:
 		name = "Queue Full";
 		break;
 	case SCSI_STATUS_ACA_ACTIVE:
 		name = "ACA Active";
 		break;
 	case SCSI_STATUS_TASK_ABORTED:
 		name = "Task Aborted";
 		break;
 	default: {
 		static char unkstr[64];
 
 		snprintf(unkstr, sizeof(unkstr), "Unknown %#x",
 			 csio->scsi_status);
 		name = unkstr;
 		break;
 	}
 	}
 
 	return (name);
 }
 
 /*
  * Append "<operation description>. CDB: <hex bytes>" for the command in
  * csio to sb.  Returns 0 on success and -1 on failure; the only failure
  * is the kernel build being unable to allocate a CCB.
  *
  * The kernel build queries the device's inquiry data through an
  * XPT_GDEV_TYPE CCB on the command's path; the userland build takes it
  * from the supplied cam_device.
  */
 #ifdef _KERNEL
 int
 scsi_command_string(struct ccb_scsiio *csio, struct sbuf *sb)
 #else /* !_KERNEL */
 int
 scsi_command_string(struct cam_device *device, struct ccb_scsiio *csio,
 		    struct sbuf *sb)
 #endif /* _KERNEL/!_KERNEL */
 {
 	struct scsi_inquiry_data *inq_data;
 #ifdef _KERNEL
 	struct ccb_getdev *cgd;
 
 	cgd = (struct ccb_getdev *)xpt_alloc_ccb_nowait();
 	if (cgd == NULL)
 		return (-1);
 
 	/* Ask the transport layer for the device information. */
 	xpt_setup_ccb(&cgd->ccb_h, csio->ccb_h.path, CAM_PRIORITY_NORMAL);
 	cgd->ccb_h.func_code = XPT_GDEV_TYPE;
 	xpt_action((union ccb *)cgd);
 
 	/*
 	 * scsi_op_desc() needs a device type, so an unconfigured device
 	 * is treated as if it were a hard drive.
 	 */
 	if (cgd->ccb_h.status == CAM_DEV_NOT_THERE)
 		cgd->inq_data.device = T_DIRECT;
 
 	inq_data = &cgd->inq_data;
 #else /* !_KERNEL */
 	inq_data = &device->inq_data;
 #endif /* _KERNEL/!_KERNEL */
 
 	sbuf_printf(sb, "%s. CDB: ",
 	    scsi_op_desc(scsiio_cdb_ptr(csio)[0], inq_data));
 	scsi_cdb_sbuf(scsiio_cdb_ptr(csio), sb);
 
 #ifdef _KERNEL
 	/* inq_data pointed into the CCB; it is no longer needed. */
 	xpt_free_ccb((union ccb *)cgd);
 #endif
 
 	return (0);
 }
 
 /*
  * Walk the descriptors in descriptor-format sense data, handing each one
  * to iter_func() together with the sense data, its length and arg.
  * Traversal continues while iter_func() returns 0 and stops as soon as
  * it returns non-zero.  Descriptors that do not fit completely inside
  * the available data are never passed to iter_func().
  */
 void
 scsi_desc_iterate(struct scsi_sense_data_desc *sense, u_int sense_len,
 		  int (*iter_func)(struct scsi_sense_data_desc *sense,
 				   u_int, struct scsi_sense_desc_header *,
 				   void *), void *arg)
 {
 	struct scsi_sense_desc_header *header;
 	int limit;
 	int pos;
 
 	/* Without the extra length field there is nothing to walk. */
 	if (SSD_DESC_IS_PRESENT(sense, sense_len, extra_len) == 0)
 		return;
 
 	/*
 	 * Start from the number of descriptor bytes actually returned,
 	 * which may differ from the extra_len recorded in the structure.
 	 */
 	limit = sense_len - offsetof(struct scsi_sense_data_desc, sense_desc);
 
 	/*
 	 * Never trust more than the reported extra length, nor more than
 	 * the maximum extra length allowed.
 	 */
 	limit = MIN(limit, MIN(sense->extra_len, SSD_EXTRA_MAX));
 
 	/*
 	 * Reserve room for one header up front so the loop does not have
 	 * to check for it on every pass.  The result may be negative, in
 	 * which case the loop body never runs.
 	 */
 	limit -= sizeof(struct scsi_sense_desc_header);
 
 	pos = 0;
 	while (pos < limit) {
 		header = (struct scsi_sense_desc_header *)
 		    &sense->sense_desc[pos];
 
 		/*
 		 * header->length excludes the header itself and limit
 		 * already has one header subtracted, so comparing the
 		 * two directly tells whether the whole descriptor is
 		 * present.  Stop at the first truncated descriptor, or
 		 * when the callback asks to stop.
 		 */
 		if (header->length > (limit - pos) ||
 		    iter_func(sense, sense_len, header, arg) != 0)
 			break;
 
 		pos += sizeof(*header) + header->length;
 	}
 }
 
 /*
  * State shared between scsi_find_desc() and its iterator callback
  * scsi_find_desc_func().
  */
 struct scsi_find_desc_info {
 	uint8_t desc_type;	/* descriptor type being searched for */
 	struct scsi_sense_desc_header *header;	/* match, NULL if none */
 };
 
 /*
  * scsi_desc_iterate() callback used by scsi_find_desc().  arg points at
  * a struct scsi_find_desc_info.  Records the header and returns 1 (stop
  * iterating) when its descriptor type is the wanted one; returns 0 (keep
  * going) otherwise.
  */
 static int
 scsi_find_desc_func(struct scsi_sense_data_desc *sense, u_int sense_len,
 		    struct scsi_sense_desc_header *header, void *arg)
 {
 	struct scsi_find_desc_info *search;
 
 	search = (struct scsi_find_desc_info *)arg;
 
 	if (header->desc_type != search->desc_type)
 		return (0);
 
 	/* Found it; remember where and tell the iterator to stop. */
 	search->header = header;
 	return (1);
 }
 
 /*
  * Return a pointer to the first descriptor of type desc_type in the
  * given descriptor-format sense data, or NULL when there is none.  Only
  * descriptors that scsi_desc_iterate() considers complete are examined,
  * so the caller does not have to deal with a truncated descriptor.
  */
 uint8_t *
 scsi_find_desc(struct scsi_sense_data_desc *sense, u_int sense_len,
 	       uint8_t desc_type)
 {
 	struct scsi_find_desc_info search = {
 		.desc_type = desc_type,
 		.header = NULL
 	};
 
 	scsi_desc_iterate(sense, sense_len, scsi_find_desc_func, &search);
 
 	return ((uint8_t *)search.header);
 }
 
 /*
  * Fill in SCSI descriptor sense data with the specified parameters.
  *
  * sense_data	buffer to fill; the whole structure is zeroed first
  * sense_len	in: bytes the caller allows; out: bytes actually used
  * sense_format	not used by this routine
  * current_error	non-zero for a current error, zero for a deferred one
  * sense_key, asc, ascq
  *		values stored in the sense header
  * ap		list of (scsi_sense_elem_type, int len, uint8_t *data)
  *		triples terminated by SSD_ELEM_NONE
  *
  * An element that does not fit in the remaining space is dropped and
  * SSDD_SDAT_OVFL is set.  An element whose len exceeds the size of its
  * field (or is negative) is skipped without setting the flag.
  */
 static void
 scsi_set_sense_data_desc_va(struct scsi_sense_data *sense_data,
     u_int *sense_len, scsi_sense_data_type sense_format, int current_error,
     int sense_key, int asc, int ascq, va_list ap)
 {
 	struct scsi_sense_data_desc *sense;
 	scsi_sense_elem_type elem_type;
 	int space, len;
 	uint8_t *desc, *data;
 
 	memset(sense_data, 0, sizeof(*sense_data));
 	sense = (struct scsi_sense_data_desc *)sense_data;
 	if (current_error != 0)
 		sense->error_code = SSD_DESC_CURRENT_ERROR;
 	else
 		sense->error_code = SSD_DESC_DEFERRED_ERROR;
 	sense->sense_key = sense_key;
 	sense->add_sense_code = asc;
 	sense->add_sense_code_qual = ascq;
 	sense->flags = 0;
 
 	desc = &sense->sense_desc[0];
 
 	/*
 	 * Bytes available for descriptors.  If the caller's length does
 	 * not even reach the descriptor area there is no room at all.
 	 * The subtraction must not be allowed to go negative: space is
 	 * compared against sizeof() values below, and a negative int
 	 * would be converted to a huge unsigned value and pass every
 	 * "does it fit" check.
 	 */
 	if (*sense_len > offsetof(struct scsi_sense_data_desc, sense_desc))
 		space = *sense_len -
 		    offsetof(struct scsi_sense_data_desc, sense_desc);
 	else
 		space = 0;
 
 	while ((elem_type = va_arg(ap, scsi_sense_elem_type)) !=
 	    SSD_ELEM_NONE) {
 		if (elem_type >= SSD_ELEM_MAX) {
 			printf("%s: invalid sense type %d\n", __func__,
 			       elem_type);
 			break;
 		}
 		/* Every element carries a length and a data pointer. */
 		len = va_arg(ap, int);
 		data = va_arg(ap, uint8_t *);
 
 		switch (elem_type) {
 		case SSD_ELEM_SKIP:
 			break;
 		case SSD_ELEM_DESC:
 			/*
 			 * A caller-built descriptor is copied verbatim.
 			 * A negative length is rejected like the
 			 * oversized lengths of the other element types;
 			 * it must never reach bcopy().
 			 */
 			if (len < 0)
 				break;
 			if (space < len) {
 				sense->flags |= SSDD_SDAT_OVFL;
 				break;
 			}
 			bcopy(data, desc, len);
 			desc += len;
 			space -= len;
 			break;
 		case SSD_ELEM_SKS: {
 			struct scsi_sense_sks *sks = (void *)desc;
 
 			if (len > sizeof(sks->sense_key_spec))
 				break;
 			if (space < sizeof(*sks)) {
 				sense->flags |= SSDD_SDAT_OVFL;
 				break;
 			}
 			sks->desc_type = SSD_DESC_SKS;
 			/* Length counts the bytes after the length field. */
 			sks->length = sizeof(*sks) -
 			    (offsetof(struct scsi_sense_sks, length) + 1);
 			bcopy(data, &sks->sense_key_spec, len);
 			desc += sizeof(*sks);
 			space -= sizeof(*sks);
 			break;
 		}
 		case SSD_ELEM_COMMAND: {
 			struct scsi_sense_command *cmd = (void *)desc;
 
 			if (len > sizeof(cmd->command_info))
 				break;
 			if (space < sizeof(*cmd)) {
 				sense->flags |= SSDD_SDAT_OVFL;
 				break;
 			}
 			cmd->desc_type = SSD_DESC_COMMAND;
 			cmd->length = sizeof(*cmd) -
 			    (offsetof(struct scsi_sense_command, length) + 1);
 			/* Right-justify short data in the field. */
 			bcopy(data, &cmd->command_info[
 			    sizeof(cmd->command_info) - len], len);
 			desc += sizeof(*cmd);
 			space -= sizeof(*cmd);
 			break;
 		}
 		case SSD_ELEM_INFO: {
 			struct scsi_sense_info *info = (void *)desc;
 
 			if (len > sizeof(info->info))
 				break;
 			if (space < sizeof(*info)) {
 				sense->flags |= SSDD_SDAT_OVFL;
 				break;
 			}
 			info->desc_type = SSD_DESC_INFO;
 			info->length = sizeof(*info) -
 			    (offsetof(struct scsi_sense_info, length) + 1);
 			info->byte2 = SSD_INFO_VALID;
 			/* Right-justify short data in the field. */
 			bcopy(data, &info->info[sizeof(info->info) - len], len);
 			desc += sizeof(*info);
 			space -= sizeof(*info);
 			break;
 		}
 		case SSD_ELEM_FRU: {
 			struct scsi_sense_fru *fru = (void *)desc;
 
 			if (len > sizeof(fru->fru))
 				break;
 			if (space < sizeof(*fru)) {
 				sense->flags |= SSDD_SDAT_OVFL;
 				break;
 			}
 			fru->desc_type = SSD_DESC_FRU;
 			fru->length = sizeof(*fru) -
 			    (offsetof(struct scsi_sense_fru, length) + 1);
 			fru->fru = *data;
 			desc += sizeof(*fru);
 			space -= sizeof(*fru);
 			break;
 		}
 		case SSD_ELEM_STREAM: {
 			struct scsi_sense_stream *stream = (void *)desc;
 
 			if (len > sizeof(stream->byte3))
 				break;
 			if (space < sizeof(*stream)) {
 				sense->flags |= SSDD_SDAT_OVFL;
 				break;
 			}
 			stream->desc_type = SSD_DESC_STREAM;
 			stream->length = sizeof(*stream) -
 			    (offsetof(struct scsi_sense_stream, length) + 1);
 			stream->byte3 = *data;
 			desc += sizeof(*stream);
 			space -= sizeof(*stream);
 			break;
 		}
 		default:
 			/*
 			 * We shouldn't get here, but if we do, do nothing.
 			 * We've already consumed the arguments above.
 			 */
 			break;
 		}
 	}
 	/* Report how many descriptor bytes, and total bytes, were used. */
 	sense->extra_len = desc - &sense->sense_desc[0];
 	*sense_len = offsetof(struct scsi_sense_data_desc, extra_len) + 1 +
 	    sense->extra_len;
 }
 
 /*
  * Fill in SCSI fixed sense data with the specified parameters.
  *
  * sense_data	buffer to fill; the whole structure is zeroed first
  * sense_len	in: bytes the caller allows; out: bytes actually used
  * sense_format	not used by this routine
  * current_error	non-zero for a current error, zero for a deferred one
  * sense_key, asc, ascq
  *		values stored in the sense data
  * ap		list of (scsi_sense_elem_type, int len, uint8_t *data)
  *		triples terminated by SSD_ELEM_NONE
  *
  * Fields that lie beyond the caller's length are left out and
  * SSD_SDAT_OVFL is set.  Element types with no fixed-format field are
  * ignored.
  */
 static void
 scsi_set_sense_data_fixed_va(struct scsi_sense_data *sense_data,
     u_int *sense_len, scsi_sense_data_type sense_format, int current_error,
     int sense_key, int asc, int ascq, va_list ap)
 {
 	struct scsi_sense_data_fixed *sense;
 	scsi_sense_elem_type elem_type;
 	uint8_t *data;
 	int len;
 
 	memset(sense_data, 0, sizeof(*sense_data));
 	sense = (struct scsi_sense_data_fixed *)sense_data;
 	sense->error_code = (current_error != 0) ?
 	    SSD_CURRENT_ERROR : SSD_DEFERRED_ERROR;
 	sense->flags = sense_key & SSD_KEY;
 	sense->extra_len = 0;
 
 	/* The ASC needs at least 13 bytes, the ASCQ at least 14. */
 	if (*sense_len < 13)
 		sense->flags |= SSD_SDAT_OVFL;
 	else {
 		sense->add_sense_code = asc;
 		sense->extra_len = MAX(sense->extra_len, 5);
 	}
 	if (*sense_len < 14)
 		sense->flags |= SSD_SDAT_OVFL;
 	else {
 		sense->add_sense_code_qual = ascq;
 		sense->extra_len = MAX(sense->extra_len, 6);
 	}
 
 	for (;;) {
 		elem_type = va_arg(ap, scsi_sense_elem_type);
 		if (elem_type == SSD_ELEM_NONE)
 			break;
 		if (elem_type >= SSD_ELEM_MAX) {
 			printf("%s: invalid sense type %d\n", __func__,
 			       elem_type);
 			break;
 		}
 		/* Every element carries a length and a data pointer. */
 		len = va_arg(ap, int);
 		data = va_arg(ap, uint8_t *);
 
 		switch (elem_type) {
 		case SSD_ELEM_SKIP:
 			break;
 		case SSD_ELEM_SKS:
 			if (len > sizeof(sense->sense_key_spec))
 				break;
 			if (*sense_len < 18) {
 				sense->flags |= SSD_SDAT_OVFL;
 				break;
 			}
 			bcopy(data, &sense->sense_key_spec[0], len);
 			sense->extra_len = MAX(sense->extra_len, 10);
 			break;
 		case SSD_ELEM_COMMAND:
 			if (*sense_len < 12) {
 				sense->flags |= SSD_SDAT_OVFL;
 				break;
 			}
 			/* Keep only the trailing bytes of oversized data. */
 			if (len > sizeof(sense->cmd_spec_info)) {
 				data += len - sizeof(sense->cmd_spec_info);
 				len = sizeof(sense->cmd_spec_info);
 			}
 			/* Right-justify short data in the field. */
 			bcopy(data, &sense->cmd_spec_info[
 			    sizeof(sense->cmd_spec_info) - len], len);
 			sense->extra_len = MAX(sense->extra_len, 4);
 			break;
 		case SSD_ELEM_INFO:
 			/*
 			 * The VALID bit survives only if every leading
 			 * byte that does not fit in the field is zero.
 			 */
 			sense->error_code |= SSD_ERRCODE_VALID;
 			for (; len > sizeof(sense->info); data++, len--) {
 				if (data[0] != 0)
 					sense->error_code &= ~SSD_ERRCODE_VALID;
 			}
 			bcopy(data, &sense->info[sizeof(sense->info) - len], len);
 			break;
 		case SSD_ELEM_FRU:
 			if (*sense_len < 15) {
 				sense->flags |= SSD_SDAT_OVFL;
 				break;
 			}
 			sense->fru = *data;
 			sense->extra_len = MAX(sense->extra_len, 7);
 			break;
 		case SSD_ELEM_STREAM:
 			/* Only the stream bits are merged into flags. */
 			sense->flags |= *data &
 			    (SSD_ILI | SSD_EOM | SSD_FILEMARK);
 			break;
 		default:
 			/*
 			 * Fixed format has no place for this element, so
 			 * it is skipped.
 			 */
 			break;
 		}
 	}
 
 	*sense_len = offsetof(struct scsi_sense_data_fixed, extra_len) + 1 +
 	    sense->extra_len;
 }
 
 /*
  * Fill in SCSI sense data with the specified parameters, in descriptor
  * format when sense_format is SSD_TYPE_DESC and in fixed format for any
  * other value.  *sense_len is capped at SSD_FULL_SIZE on entry and is
  * updated by the format-specific routine.
  */
 void
 scsi_set_sense_data_va(struct scsi_sense_data *sense_data, u_int *sense_len,
 		      scsi_sense_data_type sense_format, int current_error,
 		      int sense_key, int asc, int ascq, va_list ap)
 {
 
 	if (*sense_len > SSD_FULL_SIZE)
 		*sense_len = SSD_FULL_SIZE;
 
 	if (sense_format != SSD_TYPE_DESC) {
 		scsi_set_sense_data_fixed_va(sense_data, sense_len,
 		    sense_format, current_error, sense_key, asc, ascq, ap);
 		return;
 	}
 
 	scsi_set_sense_data_desc_va(sense_data, sense_len,
 	    sense_format, current_error, sense_key, asc, ascq, ap);
 }
 
 /*
  * Variadic front end to scsi_set_sense_data_va() for callers that own a
  * full-size sense buffer: SSD_FULL_SIZE is passed as the available
  * length and the resulting length is discarded.
  */
 void
 scsi_set_sense_data(struct scsi_sense_data *sense_data,
 		    scsi_sense_data_type sense_format, int current_error,
 		    int sense_key, int asc, int ascq, ...)
 {
 	u_int full_len;
 	va_list elems;
 
 	full_len = SSD_FULL_SIZE;
 
 	va_start(elems, ascq);
 	scsi_set_sense_data_va(sense_data, &full_len, sense_format,
 	    current_error, sense_key, asc, ascq, elems);
 	va_end(elems);
 }
 
 /*
  * Variadic front end to scsi_set_sense_data_va() for callers that supply
  * their own length: *sense_len is passed through and holds the updated
  * length on return.
  */
 void
 scsi_set_sense_data_len(struct scsi_sense_data *sense_data, u_int *sense_len,
 		    scsi_sense_data_type sense_format, int current_error,
 		    int sense_key, int asc, int ascq, ...)
 {
 	va_list elems;
 
 	va_start(elems, ascq);
 	scsi_set_sense_data_va(sense_data, sense_len, sense_format,
 	    current_error, sense_key, asc, ascq, elems);
 	va_end(elems);
 }
 
 /*
  * Get sense information for three similar sense data types: the
  * information field (SSD_DESC_INFO), the command-specific information
  * field (SSD_DESC_COMMAND) and the field replaceable unit code
  * (SSD_DESC_FRU), from either descriptor or fixed format sense data.
  *
  * On success 0 is returned, *info holds the value and, when signed_info
  * is not NULL, *signed_info holds it as a signed quantity.  1 is
  * returned when the value is absent, not valid, or info_type or the
  * sense format is not handled here.
  */
 int
 scsi_get_sense_info(struct scsi_sense_data *sense_data, u_int sense_len,
 		    uint8_t info_type, uint64_t *info, int64_t *signed_info)
 {
 
 	if (sense_len == 0)
 		return (1);
 
 	switch (scsi_sense_type(sense_data)) {
 	case SSD_TYPE_DESC: {
 		struct scsi_sense_data_desc *sense;
 		uint8_t *desc;
 
 		sense = (struct scsi_sense_data_desc *)sense_data;
 
 		/* The value lives in a descriptor of the same type. */
 		desc = scsi_find_desc(sense, sense_len, info_type);
 		if (desc == NULL)
 			return (1);
 
 		switch (info_type) {
 		case SSD_DESC_INFO: {
 			struct scsi_sense_info *info_desc;
 
 			info_desc = (struct scsi_sense_info *)desc;
 			if ((info_desc->byte2 & SSD_INFO_VALID) == 0)
 				return (1);
 
 			*info = scsi_8btou64(info_desc->info);
 			if (signed_info != NULL)
 				*signed_info = *info;
 			break;
 		}
 		case SSD_DESC_COMMAND: {
 			struct scsi_sense_command *cmd_desc;
 
 			cmd_desc = (struct scsi_sense_command *)desc;
 
 			*info = scsi_8btou64(cmd_desc->command_info);
 			if (signed_info != NULL)
 				*signed_info = *info;
 			break;
 		}
 		case SSD_DESC_FRU: {
 			struct scsi_sense_fru *fru_desc;
 
 			fru_desc = (struct scsi_sense_fru *)desc;
 			/* A zero FRU code is reported as "not there". */
 			if (fru_desc->fru == 0)
 				return (1);
 
 			*info = fru_desc->fru;
 			if (signed_info != NULL)
 				*signed_info = (int8_t)fru_desc->fru;
 			break;
 		}
 		default:
 			return (1);
 		}
 		break;
 	}
 	case SSD_TYPE_FIXED: {
 		struct scsi_sense_data_fixed *sense;
 
 		sense = (struct scsi_sense_data_fixed *)sense_data;
 
 		switch (info_type) {
 		case SSD_DESC_INFO: {
 			uint32_t info_val;
 
 			if ((sense->error_code & SSD_ERRCODE_VALID) == 0)
 				return (1);
 			if (SSD_FIXED_IS_PRESENT(sense, sense_len, info) == 0)
 				return (1);
 
 			info_val = scsi_4btoul(sense->info);
 
 			*info = info_val;
 			if (signed_info != NULL)
 				*signed_info = (int32_t)info_val;
 			break;
 		}
 		case SSD_DESC_COMMAND: {
 			uint32_t cmd_val;
 
 			if (SSD_FIXED_IS_PRESENT(sense, sense_len,
 			    cmd_spec_info) == 0)
 				return (1);
 			if (SSD_FIXED_IS_FILLED(sense, cmd_spec_info) == 0)
 				return (1);
 
 			/* A zero value is reported as "not there". */
 			cmd_val = scsi_4btoul(sense->cmd_spec_info);
 			if (cmd_val == 0)
 				return (1);
 
 			*info = cmd_val;
 			if (signed_info != NULL)
 				*signed_info = (int32_t)cmd_val;
 			break;
 		}
 		case SSD_DESC_FRU:
 			if (SSD_FIXED_IS_PRESENT(sense, sense_len, fru) == 0)
 				return (1);
 			if (SSD_FIXED_IS_FILLED(sense, fru) == 0)
 				return (1);
 
 			/* A zero FRU code is reported as "not there". */
 			if (sense->fru == 0)
 				return (1);
 
 			*info = sense->fru;
 			if (signed_info != NULL)
 				*signed_info = (int8_t)sense->fru;
 			break;
 		default:
 			return (1);
 		}
 		break;
 	}
 	default:
 		return (1);
 	}
 
 	return (0);
 }
 
 /*
  * Copy the sense-key-specific (SKS) bytes out of descriptor or fixed
  * format sense data into the caller-supplied buffer.
  *
  * Returns 0 when the SKS field was located, has its valid bit set and has
  * been copied (sizeof(sense_key_spec) bytes) into sks.  Returns 1 when
  * sense_len is 0, the sense format is not recognized, the field is not
  * present, or its valid bit is clear.
  */
 int
 scsi_get_sks(struct scsi_sense_data *sense_data, u_int sense_len, uint8_t *sks)
 {
 	if (sense_len == 0)
 		return (1);
 
 	switch (scsi_sense_type(sense_data)) {
 	case SSD_TYPE_DESC: {
 		struct scsi_sense_sks *sks_desc;
 
 		/* Descriptor format: look up the SKS descriptor. */
 		sks_desc = (struct scsi_sense_sks *)scsi_find_desc(
 		    (struct scsi_sense_data_desc *)sense_data, sense_len,
 		    SSD_DESC_SKS);
 		if (sks_desc == NULL)
 			return (1);
 		if ((sks_desc->sense_key_spec[0] & SSD_SKS_VALID) == 0)
 			return (1);
 
 		bcopy(sks_desc->sense_key_spec, sks,
 		    sizeof(sks_desc->sense_key_spec));
 		return (0);
 	}
 	case SSD_TYPE_FIXED: {
 		struct scsi_sense_data_fixed *fixed;
 
 		fixed = (struct scsi_sense_data_fixed *)sense_data;
 
 		/* The field must be both transferred and filled in. */
 		if (SSD_FIXED_IS_PRESENT(fixed, sense_len, sense_key_spec) == 0)
 			return (1);
 		if (SSD_FIXED_IS_FILLED(fixed, sense_key_spec) == 0)
 			return (1);
 		if ((fixed->sense_key_spec[0] & SSD_SCS_VALID) == 0)
 			return (1);
 
 		bcopy(fixed->sense_key_spec, sks,
 		    sizeof(fixed->sense_key_spec));
 		return (0);
 	}
 	default:
 		return (1);
 	}
 }
 
 /*
  * Provide a common interface for fixed and descriptor sense to detect
  * whether we have block-specific sense information.  It is clear by the
  * presence of the block descriptor in descriptor mode, but we have to
  * infer from the inquiry data and ILI bit in fixed mode.
  *
  * If inq_data is non-NULL, only T_DIRECT, T_RBC and T_ZBC_HM devices are
  * accepted.  On success the block bits are stored in *block_bits and 0 is
  * returned.  1 is returned when no block information is available; this
  * includes sense_len == 0, in which case sense_data is not examined.
  */
 int
 scsi_get_block_info(struct scsi_sense_data *sense_data, u_int sense_len,
 		    struct scsi_inquiry_data *inq_data, uint8_t *block_bits)
 {
 	scsi_sense_data_type sense_type;
 
 	/*
 	 * With no valid sense bytes there is nothing to decode.  Bail out
 	 * before looking at the error code, as scsi_get_sks() does.
 	 */
 	if (sense_len == 0)
 		goto bailout;
 
 	if (inq_data != NULL) {
 		switch (SID_TYPE(inq_data)) {
 		case T_DIRECT:
 		case T_RBC:
 		case T_ZBC_HM:
 			break;
 		default:
 			goto bailout;
 			break;
 		}
 	}
 
 	sense_type = scsi_sense_type(sense_data);
 
 	switch (sense_type) {
 	case SSD_TYPE_DESC: {
 		struct scsi_sense_data_desc *sense;
 		struct scsi_sense_block *block;
 
 		sense = (struct scsi_sense_data_desc *)sense_data;
 
 		block = (struct scsi_sense_block *)scsi_find_desc(sense,
 		    sense_len, SSD_DESC_BLOCK);
 		if (block == NULL)
 			goto bailout;
 
 		*block_bits = block->byte3;
 		break;
 	}
 	case SSD_TYPE_FIXED: {
 		struct scsi_sense_data_fixed *sense;
 
 		sense = (struct scsi_sense_data_fixed *)sense_data;
 
 		if (SSD_FIXED_IS_PRESENT(sense, sense_len, flags) == 0)
 			goto bailout;
 
 		/* Only the ILI bit of the flags byte is reported. */
 		*block_bits = sense->flags & SSD_ILI;
 		break;
 	}
 	default:
 		goto bailout;
 		break;
 	}
 	return (0);
 bailout:
 	return (1);
 }
 
 /*
  * Stream (sequential access) counterpart of scsi_get_block_info().
  *
  * If inq_data is non-NULL, only T_SEQUENTIAL devices are accepted.  In
  * descriptor format the stream descriptor's byte3 is returned; in fixed
  * format the ILI, EOM and FILEMARK bits of the flags byte are returned.
  * Returns 0 with *stream_bits set on success, and 1 when no stream
  * information is available; this includes sense_len == 0, in which case
  * sense_data is not examined.
  */
 int
 scsi_get_stream_info(struct scsi_sense_data *sense_data, u_int sense_len,
 		     struct scsi_inquiry_data *inq_data, uint8_t *stream_bits)
 {
 	scsi_sense_data_type sense_type;
 
 	/*
 	 * With no valid sense bytes there is nothing to decode.  Bail out
 	 * before looking at the error code, as scsi_get_sks() does.
 	 */
 	if (sense_len == 0)
 		goto bailout;
 
 	if (inq_data != NULL) {
 		switch (SID_TYPE(inq_data)) {
 		case T_SEQUENTIAL:
 			break;
 		default:
 			goto bailout;
 			break;
 		}
 	}
 
 	sense_type = scsi_sense_type(sense_data);
 
 	switch (sense_type) {
 	case SSD_TYPE_DESC: {
 		struct scsi_sense_data_desc *sense;
 		struct scsi_sense_stream *stream;
 
 		sense = (struct scsi_sense_data_desc *)sense_data;
 
 		stream = (struct scsi_sense_stream *)scsi_find_desc(sense,
 		    sense_len, SSD_DESC_STREAM);
 		if (stream == NULL)
 			goto bailout;
 
 		*stream_bits = stream->byte3;
 		break;
 	}
 	case SSD_TYPE_FIXED: {
 		struct scsi_sense_data_fixed *sense;
 
 		sense = (struct scsi_sense_data_fixed *)sense_data;
 
 		if (SSD_FIXED_IS_PRESENT(sense, sense_len, flags) == 0)
 			goto bailout;
 
 		*stream_bits = sense->flags & (SSD_ILI|SSD_EOM|SSD_FILEMARK);
 		break;
 	}
 	default:
 		goto bailout;
 		break;
 	}
 	return (0);
 bailout:
 	return (1);
 }
 
 /*
  * Append the sense "Info" value to sb in hexadecimal.  The cdb, cdb_len
  * and inq_data arguments are accepted but not used here.
  */
 void
 scsi_info_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len,
 	       struct scsi_inquiry_data *inq_data, uint64_t info)
 {
 	/* %j expects a uintmax_t argument, so convert explicitly. */
 	sbuf_printf(sb, "Info: %#jx", (uintmax_t)info);
 }
 
 /*
  * Append the command-specific information value to sb in hexadecimal.
  * The cdb, cdb_len and inq_data arguments are accepted but not used here.
  */
 void
 scsi_command_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len,
 		  struct scsi_inquiry_data *inq_data, uint64_t csi)
 {
 	/* %j expects a uintmax_t argument, so convert explicitly. */
 	sbuf_printf(sb, "Command Specific Info: %#jx", (uintmax_t)csi);
 }
 
 
 /*
  * Append a progress indication to sb: the percentage complete followed by
  * the raw progress value over SSD_SKS_PROGRESS_DENOM.
  */
 void
 scsi_progress_sbuf(struct sbuf *sb, uint16_t progress)
 {
 	int percent;
 
 	percent = (progress * 100) / SSD_SKS_PROGRESS_DENOM;
 
 	sbuf_printf(sb, "Progress: %d%% (%d/%d) complete", percent,
 	    progress, SSD_SKS_PROGRESS_DENOM);
 }
 
 /*
  * Returns 1 for failure (i.e. SKS isn't valid) and 0 for success.
  */
 int
 scsi_sks_sbuf(struct sbuf *sb, int sense_key, uint8_t *sks)
 {
 
 	switch (sense_key) {
 	case SSD_KEY_ILLEGAL_REQUEST: {
 		struct scsi_sense_sks_field *field;
 		int bad_command;
 		char tmpstr[40];
 
 		/*Field Pointer*/
 		field = (struct scsi_sense_sks_field *)sks;
 
 		if (field->byte0 & SSD_SKS_FIELD_CMD)
 			bad_command = 1;
 		else
 			bad_command = 0;
 
 		tmpstr[0] = '\0';
 
 		/* Bit pointer is valid */
 		if (field->byte0 & SSD_SKS_BPV)
 			snprintf(tmpstr, sizeof(tmpstr), "bit %d ",
 				 field->byte0 & SSD_SKS_BIT_VALUE);
 
 		sbuf_printf(sb, "%s byte %d %sis invalid",
 			    bad_command ? "Command" : "Data",
 			    scsi_2btoul(field->field), tmpstr);
 		break;
 	}
 	case SSD_KEY_UNIT_ATTENTION: {
 		struct scsi_sense_sks_overflow *overflow;
 
 		overflow = (struct scsi_sense_sks_overflow *)sks;
 
 		/*UA Condition Queue Overflow*/
 		sbuf_printf(sb, "Unit Attention Condition Queue %s",
 			    (overflow->byte0 & SSD_SKS_OVERFLOW_SET) ?
 			    "Overflowed" : "Did Not Overflow??");
 		break;
 	}
 	case SSD_KEY_RECOVERED_ERROR:
 	case SSD_KEY_HARDWARE_ERROR:
 	case SSD_KEY_MEDIUM_ERROR: {
 		struct scsi_sense_sks_retry *retry;
 
 		/*Actual Retry Count*/
 		retry = (struct scsi_sense_sks_retry *)sks;
 
 		sbuf_printf(sb, "Actual Retry Count: %d",
 			    scsi_2btoul(retry->actual_retry_count));
 		break;
 	}
 	case SSD_KEY_NO_SENSE:
 	case SSD_KEY_NOT_READY: {
 		struct scsi_sense_sks_progress *progress;
 		int progress_val;
 
 		/*Progress Indication*/
 		progress = (struct scsi_sense_sks_progress *)sks;
 		progress_val = scsi_2btoul(progress->progress);
 
 		scsi_progress_sbuf(sb, progress_val);
 		break;
 	}
 	case SSD_KEY_COPY_ABORTED: {
 		struct scsi_sense_sks_segment *segment;
 		char tmpstr[40];
 
 		/*Segment Pointer*/
 		segment = (struct scsi_sense_sks_segment *)sks;
 
 		tmpstr[0] = '\0';
 
 		if (segment->byte0 & SSD_SKS_SEGMENT_BPV)
 			snprintf(tmpstr, sizeof(tmpstr), "bit %d ",
 				 segment->byte0 & SSD_SKS_SEGMENT_BITPTR);
 
 		sbuf_printf(sb, "%s byte %d %sis invalid", (segment->byte0 &
 			    SSD_SKS_SEGMENT_SD) ? "Segment" : "Data",
 			    scsi_2btoul(segment->field), tmpstr);
 		break;
 	}
 	default:
 		sbuf_printf(sb, "Sense Key Specific: %#x,%#x", sks[0],
 			    scsi_2btoul(&sks[1]));
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Append the field replaceable unit code to sb as a decimal number.
  */
 void
 scsi_fru_sbuf(struct sbuf *sb, uint64_t fru)
 {
 	int fru_code;
 
 	fru_code = (int)fru;
 	sbuf_printf(sb, "Field Replaceable Unit: %d", fru_code);
 }
 
 /*
  * Append a comma-separated list of the stream sense flags that are set in
  * stream_bits (Filemark, EOM, ILI) to sb, after a fixed heading.
  */
 void
 scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits)
 {
 	/* Separator to emit before the next flag name; empty until one prints. */
 	const char *sep = "";
 
 	/*
 	 * XXX KDM this needs more descriptive decoding.
 	 */
 	sbuf_printf(sb, "Stream Command Sense Data: ");
 
 	if ((stream_bits & SSD_DESC_STREAM_FM) != 0) {
 		sbuf_printf(sb, "Filemark");
 		sep = ",";
 	}
 
 	if ((stream_bits & SSD_DESC_STREAM_EOM) != 0) {
 		sbuf_printf(sb, "%sEOM", sep);
 		sep = ",";
 	}
 
 	if ((stream_bits & SSD_DESC_STREAM_ILI) != 0)
 		sbuf_printf(sb, "%sILI", sep);
 }
 
 /*
  * Append the block command sense heading to sb, followed by "ILI" when
  * that bit is set in block_bits.
  */
 void
 scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits)
 {
 	int ili_set;
 
 	ili_set = (block_bits & SSD_DESC_BLOCK_ILI) != 0;
 
 	sbuf_printf(sb, "Block Command Sense Data: ");
 	if (ili_set)
 		sbuf_printf(sb, "ILI");
 }
 
 /*
  * Descriptor printer for the information descriptor: prints the 8-byte
  * info value via scsi_info_sbuf(), but only if the descriptor's valid bit
  * is set.
  */
 void
 scsi_sense_info_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 		     u_int sense_len, uint8_t *cdb, int cdb_len,
 		     struct scsi_inquiry_data *inq_data,
 		     struct scsi_sense_desc_header *header)
 {
 	struct scsi_sense_info *info_desc;
 
 	info_desc = (struct scsi_sense_info *)header;
 
 	if ((info_desc->byte2 & SSD_INFO_VALID) != 0) {
 		scsi_info_sbuf(sb, cdb, cdb_len, inq_data,
 		    scsi_8btou64(info_desc->info));
 	}
 }
 
 /*
  * Descriptor printer for the command-specific information descriptor:
  * decodes the 8-byte value and hands it to scsi_command_sbuf().
  */
 void
 scsi_sense_command_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			u_int sense_len, uint8_t *cdb, int cdb_len,
 			struct scsi_inquiry_data *inq_data,
 			struct scsi_sense_desc_header *header)
 {
 	struct scsi_sense_command *cmd_desc;
 	uint64_t csi;
 
 	cmd_desc = (struct scsi_sense_command *)header;
 	csi = scsi_8btou64(cmd_desc->command_info);
 
 	scsi_command_sbuf(sb, cdb, cdb_len, inq_data, csi);
 }
 
 /*
  * Descriptor printer for the sense-key-specific descriptor.  Nothing is
  * printed unless the descriptor's valid bit is set.  The sense key needed
  * to interpret the bytes is taken from the enclosing sense data.
  */
 void
 scsi_sense_sks_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 		    u_int sense_len, uint8_t *cdb, int cdb_len,
 		    struct scsi_inquiry_data *inq_data,
 		    struct scsi_sense_desc_header *header)
 {
 	struct scsi_sense_sks *sks_desc = (struct scsi_sense_sks *)header;
 	int err, key, code, qual;
 
 	if (!(sks_desc->sense_key_spec[0] & SSD_SKS_VALID))
 		return;
 
 	/* Only the sense key is used below; the other outputs are ignored. */
 	scsi_extract_sense_len(sense, sense_len, &err, &key, &code, &qual,
 	    /*show_errors*/ 1);
 
 	scsi_sks_sbuf(sb, key, sks_desc->sense_key_spec);
 }
 
 /*
  * Descriptor printer for the field replaceable unit descriptor.  A FRU
  * code of zero is not printed.
  */
 void
 scsi_sense_fru_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 		    u_int sense_len, uint8_t *cdb, int cdb_len,
 		    struct scsi_inquiry_data *inq_data,
 		    struct scsi_sense_desc_header *header)
 {
 	struct scsi_sense_fru *fru_desc;
 
 	fru_desc = (struct scsi_sense_fru *)header;
 
 	if (fru_desc->fru != 0)
 		scsi_fru_sbuf(sb, (uint64_t)fru_desc->fru);
 }
 
 /*
  * Descriptor printer for the stream commands descriptor: passes the
  * descriptor's byte3 flags to scsi_stream_sbuf().
  */
 void
 scsi_sense_stream_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 		       u_int sense_len, uint8_t *cdb, int cdb_len,
 		       struct scsi_inquiry_data *inq_data,
 		       struct scsi_sense_desc_header *header)
 {
 	struct scsi_sense_stream *stream_desc;
 
 	stream_desc = (struct scsi_sense_stream *)header;
 
 	scsi_stream_sbuf(sb, stream_desc->byte3);
 }
 
 /*
  * Descriptor printer for the block commands descriptor: passes the
  * descriptor's byte3 flags to scsi_block_sbuf().
  */
 void
 scsi_sense_block_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 		      u_int sense_len, uint8_t *cdb, int cdb_len,
 		      struct scsi_inquiry_data *inq_data,
 		      struct scsi_sense_desc_header *header)
 {
 	struct scsi_sense_block *block_desc;
 
 	block_desc = (struct scsi_sense_block *)header;
 
 	scsi_block_sbuf(sb, block_desc->byte3);
 }
 
 /*
  * Descriptor printer for the progress indication descriptor.  Prints the
  * sense key and ASC/ASCQ carried by the descriptor itself, followed by the
  * progress value.
  */
 void
 scsi_sense_progress_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			 u_int sense_len, uint8_t *cdb, int cdb_len,
 			 struct scsi_inquiry_data *inq_data,
 			 struct scsi_sense_desc_header *header)
 {
 	struct scsi_sense_progress *prog_desc;
 	const char *key_text;
 	const char *asc_text;
 	int value;
 
 	prog_desc = (struct scsi_sense_progress *)header;
 
 	/*
 	 * Look up descriptions using the sense key, ASC, and ASCQ stored
 	 * in the progress descriptor.  These could be different than the
 	 * values in the overall sense data.
 	 */
 	scsi_sense_desc(prog_desc->sense_key, prog_desc->add_sense_code,
 	    prog_desc->add_sense_code_qual, inq_data, &key_text, &asc_text);
 
 	value = scsi_2btoul(prog_desc->progress);
 
 	/*
 	 * The progress indicator applies to the operation described by
 	 * the descriptor's own sense key, ASC, and ASCQ, so print those
 	 * first.
 	 */
 	sbuf_cat(sb, key_text);
 	sbuf_printf(sb, " asc:%x,%x (%s): ", prog_desc->add_sense_code,
 	    prog_desc->add_sense_code_qual, asc_text);
 	scsi_progress_sbuf(sb, value);
 }
 
 /*
  * Descriptor printer for the ATA return descriptor.  Prints the status
  * byte with its bit names, the error byte (only when the low status bit
  * is set), then count, LBA and device.  Count and LBA are printed in
  * their wider form when the descriptor's EXTEND flag is set.
  */
 void
 scsi_sense_ata_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			 u_int sense_len, uint8_t *cdb, int cdb_len,
 			 struct scsi_inquiry_data *inq_data,
 			 struct scsi_sense_desc_header *header)
 {
 	struct scsi_sense_ata_ret_desc *ata;
 
 	ata = (struct scsi_sense_ata_ret_desc *)header;
 
 	sbuf_printf(sb, "ATA status: %02x (%s%s%s%s%s%s%s%s), ",
 	    ata->status,
 	    (ata->status & 0x80) ? "BSY " : "",
 	    (ata->status & 0x40) ? "DRDY " : "",
 	    (ata->status & 0x20) ? "DF " : "",
 	    (ata->status & 0x10) ? "SERV " : "",
 	    (ata->status & 0x08) ? "DRQ " : "",
 	    (ata->status & 0x04) ? "CORR " : "",
 	    (ata->status & 0x02) ? "IDX " : "",
 	    (ata->status & 0x01) ? "ERR" : "");
 
 	/* The error byte is only reported when the status low bit is set. */
 	if ((ata->status & 1) != 0) {
 		sbuf_printf(sb, "error: %02x (%s%s%s%s%s%s%s%s), ",
 		    ata->error,
 		    (ata->error & 0x80) ? "ICRC " : "",
 		    (ata->error & 0x40) ? "UNC " : "",
 		    (ata->error & 0x20) ? "MC " : "",
 		    (ata->error & 0x10) ? "IDNF " : "",
 		    (ata->error & 0x08) ? "MCR " : "",
 		    (ata->error & 0x04) ? "ABRT " : "",
 		    (ata->error & 0x02) ? "NM " : "",
 		    (ata->error & 0x01) ? "ILI" : "");
 	}
 
 	if ((ata->flags & SSD_DESC_ATA_FLAG_EXTEND) == 0) {
 		sbuf_printf(sb, "count: %02x, ", ata->count_7_0);
 		sbuf_printf(sb, "LBA: %02x%02x%02x, ",
 		    ata->lba_23_16, ata->lba_15_8, ata->lba_7_0);
 	} else {
 		sbuf_printf(sb, "count: %02x%02x, ",
 		    ata->count_15_8, ata->count_7_0);
 		sbuf_printf(sb, "LBA: %02x%02x%02x%02x%02x%02x, ",
 		    ata->lba_47_40, ata->lba_39_32, ata->lba_31_24,
 		    ata->lba_23_16, ata->lba_15_8, ata->lba_7_0);
 	}
 
 	sbuf_printf(sb, "device: %02x, ", ata->device);
 }
 
 /*
  * Descriptor printer for the forwarded sense descriptor.  Extracts the
  * sense key and ASC/ASCQ from the sense data embedded in the descriptor
  * and prints their descriptions.
  */
 void
 scsi_sense_forwarded_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			 u_int sense_len, uint8_t *cdb, int cdb_len,
 			 struct scsi_inquiry_data *inq_data,
 			 struct scsi_sense_desc_header *header)
 {
 	struct scsi_sense_forwarded *fwd;
 	struct scsi_sense_data *inner;
 	const char *key_text;
 	const char *asc_text;
 	int err, key, code, qual;
 
 	fwd = (struct scsi_sense_forwarded *)header;
 	inner = (struct scsi_sense_data *)fwd->sense_data;
 
 	/*
 	 * The embedded sense length is the descriptor length minus 2.
 	 * NOTE(review): a descriptor length below 2 is not checked here.
 	 */
 	scsi_extract_sense_len(inner, fwd->length - 2, &err, &key, &code,
 	    &qual, 1);
 	scsi_sense_desc(key, code, qual, NULL, &key_text, &asc_text);
 
 	sbuf_printf(sb, "Forwarded sense: %s asc:%x,%x (%s): ",
 	    key_text, code, qual, asc_text);
 }
 
 /*
  * Fallback descriptor printer, used for any descriptor type that has no
  * dedicated printing routine.  Prints the descriptor type followed by a
  * hex dump of the header->length bytes that follow the header.
  */
 void
 scsi_sense_generic_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			u_int sense_len, uint8_t *cdb, int cdb_len,
 			struct scsi_inquiry_data *inq_data,
 			struct scsi_sense_desc_header *header)
 {
 	uint8_t *payload;
 	int idx;
 
 	sbuf_printf(sb, "Descriptor %#x:", header->desc_type);
 
 	/* The descriptor payload starts right after the header. */
 	payload = (uint8_t *)&header[1];
 
 	for (idx = 0; idx < header->length; idx++)
 		sbuf_printf(sb, " %02x", payload[idx]);
 }
 
 /*
  * Table mapping sense descriptor types to their printing routines.
  *
  * Keep this list in numeric order.  This speeds the array traversal:
  * scsi_sense_desc_sbuf() stops searching as soon as it sees an entry whose
  * desc_type is greater than the one it is looking for, so an out-of-order
  * entry could be missed.
  */
 struct scsi_sense_desc_printer {
 	/* Descriptor type code this entry handles. */
 	uint8_t desc_type;
 	/*
 	 * The function arguments here are the superset of what is needed
 	 * to print out various different descriptors.  Command and
 	 * information descriptors need inquiry data and command type.
 	 * Sense key specific descriptors need the sense key.
 	 *
 	 * The sense, cdb, and inquiry data arguments may be NULL, but the
 	 * information printed may not be fully decoded as a result.
 	 */
 	void (*print_func)(struct sbuf *sb, struct scsi_sense_data *sense,
 			   u_int sense_len, uint8_t *cdb, int cdb_len,
 			   struct scsi_inquiry_data *inq_data,
 			   struct scsi_sense_desc_header *header);
 } scsi_sense_printers[] = {
 	{SSD_DESC_INFO, scsi_sense_info_sbuf},
 	{SSD_DESC_COMMAND, scsi_sense_command_sbuf},
 	{SSD_DESC_SKS, scsi_sense_sks_sbuf},
 	{SSD_DESC_FRU, scsi_sense_fru_sbuf},
 	{SSD_DESC_STREAM, scsi_sense_stream_sbuf},
 	{SSD_DESC_BLOCK, scsi_sense_block_sbuf},
 	{SSD_DESC_ATA, scsi_sense_ata_sbuf},
 	{SSD_DESC_PROGRESS, scsi_sense_progress_sbuf},
 	{SSD_DESC_FORWARDED, scsi_sense_forwarded_sbuf}
 };
 
 /*
  * Print one sense descriptor to sb.  The descriptor type is looked up in
  * scsi_sense_printers[]; if no entry matches, the generic hex-dump
  * printer is used instead.
  */
 void
 scsi_sense_desc_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 		     u_int sense_len, uint8_t *cdb, int cdb_len,
 		     struct scsi_inquiry_data *inq_data,
 		     struct scsi_sense_desc_header *header)
 {
 	struct scsi_sense_desc_printer *match;
 	u_int idx;
 
 	match = NULL;
 	for (idx = 0; idx < nitems(scsi_sense_printers); idx++) {
 		struct scsi_sense_desc_printer *entry;
 
 		entry = &scsi_sense_printers[idx];
 
 		/* Not there yet; keep scanning the sorted table. */
 		if (entry->desc_type < header->desc_type)
 			continue;
 
 		/*
 		 * Either this is our descriptor, or the sorted table has
 		 * already moved past it.  Stop in both cases.
 		 */
 		if (entry->desc_type == header->desc_type)
 			match = entry;
 		break;
 	}
 
 	if (match != NULL) {
 		match->print_func(sb, sense, sense_len, cdb, cdb_len,
 		    inq_data, header);
 		return;
 	}
 
 	/*
 	 * No dedicated printing routine exists, fall back to the generic one.
 	 */
 	scsi_sense_generic_sbuf(sb, sense, sense_len, cdb, cdb_len,
 	    inq_data, header);
 }
 
 /*
  * Classify sense data as descriptor format, fixed format, or neither,
  * based on the error code bits of its first byte.
  */
 scsi_sense_data_type
 scsi_sense_type(struct scsi_sense_data *sense_data)
 {
 	scsi_sense_data_type format;
 
 	switch (sense_data->error_code & SSD_ERRCODE) {
 	case SSD_DESC_CURRENT_ERROR:
 	case SSD_DESC_DEFERRED_ERROR:
 		format = SSD_TYPE_DESC;
 		break;
 	case SSD_CURRENT_ERROR:
 	case SSD_DEFERRED_ERROR:
 		format = SSD_TYPE_FIXED;
 		break;
 	default:
 		format = SSD_TYPE_NONE;
 		break;
 	}
 
 	return (format);
 }
 
 /*
  * Context handed to scsi_print_desc_func() through the descriptor
  * iterator's opaque argument.
  */
 struct scsi_print_sense_info {
 	struct sbuf *sb;		/* output buffer */
 	char *path_str;			/* prefix printed before each line */
 	uint8_t *cdb;			/* CDB passed on to the printers */
 	int cdb_len;			/* length passed along with cdb */
 	struct scsi_inquiry_data *inq_data; /* inquiry data for the printers */
 };
 
 /*
  * Descriptor iterator callback used by scsi_sense_only_sbuf().  Prints
  * every descriptor except the types that have already been printed, each
  * on its own line prefixed with the path string.  arg points to a
  * struct scsi_print_sense_info.  Always returns 0 so that iteration
  * continues.
  */
 static int
 scsi_print_desc_func(struct scsi_sense_data_desc *sense, u_int sense_len,
 		     struct scsi_sense_desc_header *header, void *arg)
 {
 	struct scsi_print_sense_info *ctx;
 
 	ctx = (struct scsi_print_sense_info *)arg;
 
 	switch (header->desc_type) {
 	case SSD_DESC_INFO:
 	case SSD_DESC_FRU:
 	case SSD_DESC_COMMAND:
 	case SSD_DESC_SKS:
 	case SSD_DESC_BLOCK:
 	case SSD_DESC_STREAM:
 		/*
 		 * These descriptors, when present, were already printed.
 		 * Return 0 to ask for the next descriptor.
 		 */
 		return (0);
 	default:
 		break;
 	}
 
 	sbuf_printf(ctx->sb, "%s", ctx->path_str);
 	scsi_sense_desc_sbuf(ctx->sb, (struct scsi_sense_data *)sense,
 	    sense_len, ctx->cdb, ctx->cdb_len, ctx->inq_data, header);
 	sbuf_printf(ctx->sb, "\n");
 
 	/*
 	 * Returning 0 tells the iterator to keep going if more
 	 * descriptors are present.
 	 */
 	return (0);
 }
 
 /*
  * Print a decoded form of the given sense data to sb.
  *
  * Every output line is prefixed with path_str.  For recognized fixed and
  * descriptor format sense this prints the sense key and ASC/ASCQ, then any
  * block or stream bits, the info field, the FRU, the command-specific
  * information and the sense-key-specific data, and finally (descriptor
  * format only) any remaining descriptors.  inq_data, cdb and cdb_len are
  * passed through to the individual printing routines.
  */
 void
 scsi_sense_only_sbuf(struct scsi_sense_data *sense, u_int sense_len,
 		     struct sbuf *sb, char *path_str,
 		     struct scsi_inquiry_data *inq_data, uint8_t *cdb,
 		     int cdb_len)
 {
 	int error_code, sense_key, asc, ascq;
 
 	sbuf_cat(sb, path_str);
 
 	scsi_extract_sense_len(sense, sense_len, &error_code, &sense_key,
 			       &asc, &ascq, /*show_errors*/ 1);
 
 	sbuf_printf(sb, "SCSI sense: ");
 	switch (error_code) {
 	case SSD_DEFERRED_ERROR:
 	case SSD_DESC_DEFERRED_ERROR:
 		sbuf_printf(sb, "Deferred error: ");
 
 		/* FALLTHROUGH */
 	case SSD_CURRENT_ERROR:
 	case SSD_DESC_CURRENT_ERROR:
 	{
 		struct scsi_sense_data_desc *desc_sense;
 		struct scsi_print_sense_info print_info;
 		const char *sense_key_desc;
 		const char *asc_desc;
 		uint8_t sks[3];
 		uint64_t val;
 		uint8_t bits;
 
 		/*
 		 * Get descriptions for the sense key, ASC, and ASCQ.  If
 		 * these aren't present in the sense data (i.e. the sense
 		 * data isn't long enough), the -1 values that
 		 * scsi_extract_sense_len() returns will yield default
 		 * or error descriptions.
 		 */
 		scsi_sense_desc(sense_key, asc, ascq, inq_data,
 				&sense_key_desc, &asc_desc);
 
 		/*
 		 * We first print the sense key and ASC/ASCQ.
 		 */
 		sbuf_cat(sb, sense_key_desc);
 		sbuf_printf(sb, " asc:%x,%x (%s)\n", asc, ascq, asc_desc);
 
 		/*
 		 * Print any block or stream device-specific information.
 		 */
 		if (scsi_get_block_info(sense, sense_len, inq_data,
 		    &bits) == 0 && bits != 0) {
 			sbuf_cat(sb, path_str);
 			scsi_block_sbuf(sb, bits);
 			sbuf_printf(sb, "\n");
 		} else if (scsi_get_stream_info(sense, sense_len, inq_data,
 		    &bits) == 0 && bits != 0) {
 			sbuf_cat(sb, path_str);
 			scsi_stream_sbuf(sb, bits);
 			sbuf_printf(sb, "\n");
 		}
 
 		/*
 		 * Print the info field.
 		 */
 		if (scsi_get_sense_info(sense, sense_len, SSD_DESC_INFO,
 					&val, NULL) == 0) {
 			sbuf_cat(sb, path_str);
 			scsi_info_sbuf(sb, cdb, cdb_len, inq_data, val);
 			sbuf_printf(sb, "\n");
 		}
 
 		/* 
 		 * Print the FRU.
 		 */
 		if (scsi_get_sense_info(sense, sense_len, SSD_DESC_FRU,
 					&val, NULL) == 0) {
 			sbuf_cat(sb, path_str);
 			scsi_fru_sbuf(sb, val);
 			sbuf_printf(sb, "\n");
 		}
 
 		/*
 		 * Print any command-specific information.
 		 */
 		if (scsi_get_sense_info(sense, sense_len, SSD_DESC_COMMAND,
 					&val, NULL) == 0) {
 			sbuf_cat(sb, path_str);
 			scsi_command_sbuf(sb, cdb, cdb_len, inq_data, val);
 			sbuf_printf(sb, "\n");
 		}
 
 		/*
 		 * Print out any sense-key-specific information.
 		 */
 		if (scsi_get_sks(sense, sense_len, sks) == 0) {
 			sbuf_cat(sb, path_str);
 			scsi_sks_sbuf(sb, sense_key, sks);
 			sbuf_printf(sb, "\n");
 		}
 
 		/*
 		 * If this is fixed sense, we're done.  If we have
 		 * descriptor sense, we might have more information
 		 * available.
 		 */
 		if (scsi_sense_type(sense) != SSD_TYPE_DESC)
 			break;
 
 		desc_sense = (struct scsi_sense_data_desc *)sense;
 
 		print_info.sb = sb;
 		print_info.path_str = path_str;
 		print_info.cdb = cdb;
 		print_info.cdb_len = cdb_len;
 		print_info.inq_data = inq_data;
 
 		/*
 		 * Print any sense descriptors that we have not already printed.
 		 */
 		scsi_desc_iterate(desc_sense, sense_len, scsi_print_desc_func,
 				  &print_info);
 		break;
 
 	}
 	case -1:
 		/*
 		 * scsi_extract_sense_len() sets values to -1 if the
 		 * show_errors flag is set and they aren't present in the
 		 * sense data.  This means that sense_len is 0.
 		 */
 		sbuf_printf(sb, "No sense data present\n");
 		break;
 	default: {
 		/*
 		 * Unrecognized error code: print it, plus the fixed-format
 		 * info field as a block number when the valid bit is set
 		 * and the field is present.
 		 */
 		sbuf_printf(sb, "Error code 0x%x", error_code);
 		if (sense->error_code & SSD_ERRCODE_VALID) {
 			struct scsi_sense_data_fixed *fixed_sense;
 
 			fixed_sense = (struct scsi_sense_data_fixed *)sense;
 
 			if (SSD_FIXED_IS_PRESENT(fixed_sense, sense_len, info)){
 				uint32_t info;
 
 				info = scsi_4btoul(fixed_sense->info);
 
 				/*
 				 * The block number is unsigned; %d would
 				 * print values >= 2^31 as negative.
 				 */
 				sbuf_printf(sb, " at block no. %u (decimal)",
 					    info);
 			}
 		}
 		sbuf_printf(sb, "\n");
 		break;
 	}
 	}
 }
 
 /*
  * scsi_sense_sbuf() returns 0 for success and -1 for failure.
  *
  * Prints the command (when SSS_FLAG_PRINT_COMMAND is set in flags and the
  * CDB is not a physical address) followed by the decoded sense data for
  * csio into sb.  Failure is returned for NULL arguments, when the kernel
  * cannot allocate a CCB, or when the sense data is flagged as a physical
  * address.  In the kernel the inquiry data is fetched with an
  * XPT_GDEV_TYPE request on the CCB's path; in userland it comes from
  * device.
  */
 #ifdef _KERNEL
 int
 scsi_sense_sbuf(struct ccb_scsiio *csio, struct sbuf *sb,
 		scsi_sense_string_flags flags)
 #else /* !_KERNEL */
 int
 scsi_sense_sbuf(struct cam_device *device, struct ccb_scsiio *csio, 
 		struct sbuf *sb, scsi_sense_string_flags flags)
 #endif /* _KERNEL/!_KERNEL */
 {
 	struct	  scsi_sense_data *sense;
 	struct	  scsi_inquiry_data *inq_data;
 #ifdef _KERNEL
 	struct	  ccb_getdev *cgd;
 #endif /* _KERNEL */
 	char	  path_str[64];
 
 #ifndef _KERNEL
 	if (device == NULL)
 		return(-1);
 #endif /* !_KERNEL */
 	if ((csio == NULL) || (sb == NULL))
 		return(-1);
 
 	/*
 	 * If the CDB is a physical address, we can't deal with it..
 	 */
 	if ((csio->ccb_h.flags & CAM_CDB_PHYS) != 0)
 		flags &= ~SSS_FLAG_PRINT_COMMAND;
 
 	/* Build the prefix that starts every output line. */
 #ifdef _KERNEL
 	xpt_path_string(csio->ccb_h.path, path_str, sizeof(path_str));
 #else /* !_KERNEL */
 	cam_path_string(device, path_str, sizeof(path_str));
 #endif /* _KERNEL/!_KERNEL */
 
 #ifdef _KERNEL
 	/* From here on, every return path must free cgd. */
 	if ((cgd = (struct ccb_getdev*)xpt_alloc_ccb_nowait()) == NULL)
 		return(-1);
 	/*
 	 * Get the device information.
 	 */
 	xpt_setup_ccb(&cgd->ccb_h,
 		      csio->ccb_h.path,
 		      CAM_PRIORITY_NORMAL);
 	cgd->ccb_h.func_code = XPT_GDEV_TYPE;
 	xpt_action((union ccb *)cgd);
 
 	/*
 	 * If the device is unconfigured, just pretend that it is a hard
 	 * drive.  scsi_op_desc() needs this.
 	 */
 	if (cgd->ccb_h.status == CAM_DEV_NOT_THERE)
 		cgd->inq_data.device = T_DIRECT;
 
 	inq_data = &cgd->inq_data;
 
 #else /* !_KERNEL */
 
 	inq_data = &device->inq_data;
 
 #endif /* _KERNEL/!_KERNEL */
 
 	sense = NULL;
 
 	if (flags & SSS_FLAG_PRINT_COMMAND) {
 
 		sbuf_cat(sb, path_str);
 
 #ifdef _KERNEL
 		scsi_command_string(csio, sb);
 #else /* !_KERNEL */
 		scsi_command_string(device, csio, sb);
 #endif /* _KERNEL/!_KERNEL */
 		sbuf_printf(sb, "\n");
 	}
 
 	/*
 	 * If the sense data is a physical pointer, forget it.
 	 */
 	if (csio->ccb_h.flags & CAM_SENSE_PTR) {
 		if (csio->ccb_h.flags & CAM_SENSE_PHYS) {
 #ifdef _KERNEL
 			xpt_free_ccb((union ccb*)cgd);
 #endif /* _KERNEL/!_KERNEL */
 			return(-1);
 		} else {
 			/* 
 			 * bcopy the pointer to avoid unaligned access
 			 * errors on finicky architectures.  We don't
 			 * ensure that the sense data is pointer aligned.
 			 */
 			bcopy((struct scsi_sense_data **)&csio->sense_data,
 			    &sense, sizeof(struct scsi_sense_data *));
 		}
 	} else {
 		/*
 		 * If the physical sense flag is set, but the sense pointer
 		 * is not also set, we assume that the user is an idiot and
 		 * return.  (Well, okay, it could be that somehow, the
 		 * entire csio is physical, but we would have probably core
 		 * dumped on one of the bogus pointer dereferences above
 		 * already.)
 		 */
 		if (csio->ccb_h.flags & CAM_SENSE_PHYS) {
 #ifdef _KERNEL
 			xpt_free_ccb((union ccb*)cgd);
 #endif /* _KERNEL/!_KERNEL */
 			return(-1);
 		} else
 			sense = &csio->sense_data;
 	}
 
 	/* Valid sense length is the requested length minus the residual. */
 	scsi_sense_only_sbuf(sense, csio->sense_len - csio->sense_resid, sb,
 	    path_str, inq_data, scsiio_cdb_ptr(csio), csio->cdb_len);
 
 #ifdef _KERNEL
 	xpt_free_ccb((union ccb*)cgd);
 #endif /* _KERNEL/!_KERNEL */
 	return(0);
 }
 
 
 
 #ifdef _KERNEL
 char *
 scsi_sense_string(struct ccb_scsiio *csio, char *str, int str_len)
 #else /* !_KERNEL */
 char *
 scsi_sense_string(struct cam_device *device, struct ccb_scsiio *csio,
 		  char *str, int str_len)
 #endif /* _KERNEL/!_KERNEL */
 {
 	struct sbuf sb;
 
 	sbuf_new(&sb, str, str_len, 0);
 
 #ifdef _KERNEL
 	scsi_sense_sbuf(csio, &sb, SSS_FLAG_PRINT_COMMAND);
 #else /* !_KERNEL */
 	scsi_sense_sbuf(device, csio, &sb, SSS_FLAG_PRINT_COMMAND);
 #endif /* _KERNEL/!_KERNEL */
 
 	sbuf_finish(&sb);
 
 	return(sbuf_data(&sb));
 }
 
 #ifdef _KERNEL
 void 
 scsi_sense_print(struct ccb_scsiio *csio)
 {
 	struct sbuf sb;
 	char str[512];
 
 	sbuf_new(&sb, str, sizeof(str), 0);
 
 	scsi_sense_sbuf(csio, &sb, SSS_FLAG_PRINT_COMMAND);
 
 	sbuf_finish(&sb);
 
 	sbuf_putbuf(&sb);
 }
 
 #else /* !_KERNEL */
 void
 scsi_sense_print(struct cam_device *device, struct ccb_scsiio *csio, 
 		 FILE *ofile)
 {
 	struct sbuf sb;
 	char str[512];
 
 	if ((device == NULL) || (csio == NULL) || (ofile == NULL))
 		return;
 
 	sbuf_new(&sb, str, sizeof(str), 0);
 
 	scsi_sense_sbuf(device, csio, &sb, SSS_FLAG_PRINT_COMMAND);
 
 	sbuf_finish(&sb);
 
 	fprintf(ofile, "%s", sbuf_data(&sb));
 }
 
 #endif /* _KERNEL/!_KERNEL */
 
 /*
  * Extract basic sense information.  This is backward-compatible with the
  * previous implementation.  For new implementations,
  * scsi_extract_sense_len() is recommended.
  */
 void
 scsi_extract_sense(struct scsi_sense_data *sense_data, int *error_code,
 		   int *sense_key, int *asc, int *ascq)
 {
 	scsi_extract_sense_len(sense_data, sizeof(*sense_data), error_code,
 			       sense_key, asc, ascq, /*show_errors*/ 0);
 }
 
 /*
  * Extract basic sense information from SCSI I/O CCB structure.
  */
 int
 scsi_extract_sense_ccb(union ccb *ccb,
     int *error_code, int *sense_key, int *asc, int *ascq)
 {
 	struct scsi_sense_data *sense_data;
 
 	/* Make sure there are some sense data we can access. */
 	if (ccb->ccb_h.func_code != XPT_SCSI_IO ||
 	    (ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_SCSI_STATUS_ERROR ||
 	    (ccb->csio.scsi_status != SCSI_STATUS_CHECK_COND) ||
 	    (ccb->ccb_h.status & CAM_AUTOSNS_VALID) == 0 ||
 	    (ccb->ccb_h.flags & CAM_SENSE_PHYS))
 		return (0);
 
 	if (ccb->ccb_h.flags & CAM_SENSE_PTR)
 		bcopy((struct scsi_sense_data **)&ccb->csio.sense_data,
 		    &sense_data, sizeof(struct scsi_sense_data *));
 	else
 		sense_data = &ccb->csio.sense_data;
 	scsi_extract_sense_len(sense_data,
 	    ccb->csio.sense_len - ccb->csio.sense_resid,
 	    error_code, sense_key, asc, ascq, 1);
 	if (*error_code == -1)
 		return (0);
 	return (1);
 }
 
 /*
  * Extract the error code, sense key, ASC and ASCQ from sense data of the
  * given length.  A value that is not present in the sense data is
  * reported as -1 when show_errors is set and as 0 otherwise.  Error codes
  * other than the descriptor format ones are decoded as fixed format.
  */
 void
 scsi_extract_sense_len(struct scsi_sense_data *sense_data, u_int sense_len,
 		       int *error_code, int *sense_key, int *asc, int *ascq,
 		       int show_errors)
 {
 	/* Value reported for anything that is missing. */
 	const int absent = (show_errors) ? -1 : 0;
 
 	/*
 	 * No length means no sense at all.
 	 */
 	if (sense_len == 0) {
 		*error_code = absent;
 		*sense_key = absent;
 		*asc = absent;
 		*ascq = absent;
 		return;
 	}
 
 	*error_code = sense_data->error_code & SSD_ERRCODE;
 
 	if (*error_code == SSD_DESC_CURRENT_ERROR ||
 	    *error_code == SSD_DESC_DEFERRED_ERROR) {
 		struct scsi_sense_data_desc *dsense;
 
 		dsense = (struct scsi_sense_data_desc *)sense_data;
 
 		if (SSD_DESC_IS_PRESENT(dsense, sense_len, sense_key))
 			*sense_key = dsense->sense_key & SSD_KEY;
 		else
 			*sense_key = absent;
 
 		if (SSD_DESC_IS_PRESENT(dsense, sense_len, add_sense_code))
 			*asc = dsense->add_sense_code;
 		else
 			*asc = absent;
 
 		if (SSD_DESC_IS_PRESENT(dsense, sense_len, add_sense_code_qual))
 			*ascq = dsense->add_sense_code_qual;
 		else
 			*ascq = absent;
 	} else {
 		struct scsi_sense_data_fixed *fsense;
 
 		/* Fixed format, and also the fallback for unknown codes. */
 		fsense = (struct scsi_sense_data_fixed *)sense_data;
 
 		if (SSD_FIXED_IS_PRESENT(fsense, sense_len, flags))
 			*sense_key = fsense->flags & SSD_KEY;
 		else
 			*sense_key = absent;
 
 		if ((SSD_FIXED_IS_PRESENT(fsense, sense_len, add_sense_code))
 		 && (SSD_FIXED_IS_FILLED(fsense, add_sense_code)))
 			*asc = fsense->add_sense_code;
 		else
 			*asc = absent;
 
 		if ((SSD_FIXED_IS_PRESENT(fsense, sense_len,add_sense_code_qual))
 		 && (SSD_FIXED_IS_FILLED(fsense, add_sense_code_qual)))
 			*ascq = fsense->add_sense_code_qual;
 		else
 			*ascq = absent;
 	}
 }
 
 /*
  * Return only the sense key from the sense data, as extracted by
  * scsi_extract_sense_len().
  */
 int
 scsi_get_sense_key(struct scsi_sense_data *sense_data, u_int sense_len,
 		   int show_errors)
 {
 	int err, key, code, qual;
 
 	scsi_extract_sense_len(sense_data, sense_len, &err, &key, &code,
 	    &qual, show_errors);
 
 	return (key);
 }
 
 /*
  * Return only the additional sense code from the sense data, as extracted
  * by scsi_extract_sense_len().
  */
 int
 scsi_get_asc(struct scsi_sense_data *sense_data, u_int sense_len,
 	     int show_errors)
 {
 	int err, key, code, qual;
 
 	scsi_extract_sense_len(sense_data, sense_len, &err, &key, &code,
 	    &qual, show_errors);
 
 	return (code);
 }
 
 /*
  * Return only the additional sense code qualifier from the sense data, as
  * extracted by scsi_extract_sense_len().
  */
 int
 scsi_get_ascq(struct scsi_sense_data *sense_data, u_int sense_len,
 	      int show_errors)
 {
 	int err, key, code, qual;
 
 	scsi_extract_sense_len(sense_data, sense_len, &err, &key, &code,
 	    &qual, show_errors);
 
 	return (qual);
 }
 
 /*
  * Print a one-line description of a device from its inquiry data: the
  * vendor/product/revision strings, removable or fixed, the device type,
  * the SCSI revision and the peripheral qualifier.
  *
  * This function currently requires at least 36 bytes, or
  * SHORT_INQUIRY_LENGTH, worth of data to function properly.  If this
  * function needs more or less data in the future, another length should be
  * defined in scsi_all.h to indicate the minimum amount of data necessary
  * for this routine to function properly.
  */
 void
 scsi_print_inquiry_sbuf(struct sbuf *sb, struct scsi_inquiry_data *inq_data)
 {
 	const char *type_name, *qual_text, *media;
 	u_int8_t dev_type;
 	int rev;
 
 	dev_type = SID_TYPE(inq_data);
 
 	/*
 	 * Work out the peripheral qualifier text.
 	 */
 	if (SID_QUAL_IS_VENDOR_UNIQUE(inq_data)) {
 		qual_text = " (vendor-unique qualifier)";
 	} else {
 		switch (SID_QUAL(inq_data)) {
 		case SID_QUAL_LU_CONNECTED:
 			qual_text = "";
 			break;
 		case SID_QUAL_LU_OFFLINE:
 			qual_text = " (offline)";
 			break;
 		case SID_QUAL_RSVD:
 			qual_text = " (reserved qualifier)";
 			break;
 		case SID_QUAL_BAD_LU:
 		default:
 			qual_text = " (LUN not supported)";
 			break;
 		}
 	}
 
 	/*
 	 * Work out the basic device type name.
 	 */
 	switch (dev_type) {
 	case T_DIRECT:		type_name = "Direct Access"; break;
 	case T_SEQUENTIAL:	type_name = "Sequential Access"; break;
 	case T_PRINTER:		type_name = "Printer"; break;
 	case T_PROCESSOR:	type_name = "Processor"; break;
 	case T_WORM:		type_name = "WORM"; break;
 	case T_CDROM:		type_name = "CD-ROM"; break;
 	case T_SCANNER:		type_name = "Scanner"; break;
 	case T_OPTICAL:		type_name = "Optical"; break;
 	case T_CHANGER:		type_name = "Changer"; break;
 	case T_COMM:		type_name = "Communication"; break;
 	case T_STORARRAY:	type_name = "Storage Array"; break;
 	case T_ENCLOSURE:	type_name = "Enclosure Services"; break;
 	case T_RBC:		type_name = "Simplified Direct Access"; break;
 	case T_OCRW:		type_name = "Optical Card Read/Write"; break;
 	case T_OSD:		type_name = "Object-Based Storage"; break;
 	case T_ADC:		type_name = "Automation/Drive Interface"; break;
 	case T_ZBC_HM:		type_name = "Host Managed Zoned Block"; break;
 	case T_NODEVICE:	type_name = "Uninstalled"; break;
 	default:		type_name = "unknown"; break;
 	}
 
 	scsi_print_inquiry_short_sbuf(sb, inq_data);
 
 	media = SID_IS_REMOVABLE(inq_data) ? "Removable" : "Fixed";
 	sbuf_printf(sb, "%s %s ", media, type_name);
 
 	/*
 	 * Revisions up to SCSI_REV_SPC are printed as "SCSI-n"; later ones
 	 * are printed as SPC generations (revision minus 2).
 	 */
 	rev = SID_ANSI_REV(inq_data);
 	if (rev == SCSI_REV_0)
 		sbuf_printf(sb, "SCSI ");
 	else if (rev <= SCSI_REV_SPC)
 		sbuf_printf(sb, "SCSI-%d ", rev);
 	else
 		sbuf_printf(sb, "SPC-%d SCSI ", rev - 2);
 
 	sbuf_printf(sb, "device%s\n", qual_text);
 }
 
 void
 scsi_print_inquiry(struct scsi_inquiry_data *inq_data)
 {
 	struct sbuf	sb;
 	char		buffer[120];
 
 	sbuf_new(&sb, buffer, 120, SBUF_FIXEDLEN);
 	scsi_print_inquiry_sbuf(&sb, inq_data);
 	sbuf_finish(&sb);
 	sbuf_putbuf(&sb);
 }
 
 /*
  * Append "<vendor product revision> " to sb.  Each of the three inquiry
  * fields is passed through cam_strvis_sbuf() with its full array size and
  * flags of 0.
  */
 void
 scsi_print_inquiry_short_sbuf(struct sbuf *sb, struct scsi_inquiry_data *inq_data)
 {
 
 	sbuf_printf(sb, "<");
 	cam_strvis_sbuf(sb, inq_data->vendor, sizeof(inq_data->vendor), 0);
 	sbuf_printf(sb, " ");
 	cam_strvis_sbuf(sb, inq_data->product, sizeof(inq_data->product), 0);
 	sbuf_printf(sb, " ");
 	cam_strvis_sbuf(sb, inq_data->revision, sizeof(inq_data->revision), 0);
 	sbuf_printf(sb, "> ");
 }
 
 void
 scsi_print_inquiry_short(struct scsi_inquiry_data *inq_data)
 {
 	struct sbuf	sb;
 	char		buffer[84];
 
 	sbuf_new(&sb, buffer, 84, SBUF_FIXEDLEN);
 	scsi_print_inquiry_short_sbuf(&sb, inq_data);
 	sbuf_finish(&sb);
 	sbuf_putbuf(&sb);
 }
 
 /*
  * Table of syncrates that don't follow the "divisible by 4"
  * rule. This table will be expanded in future SCSI specs.
  *
  * Entries are kept in ascending order of period; scsi_calc_syncparam()
  * returns the first entry whose period is large enough.
  */
 static struct {
 	u_int period_factor;
 	u_int period;	/* in 100ths of ns */
 } scsi_syncrates[] = {
 	{ .period_factor = 0x08, .period = 625 },	/* FAST-160 */
 	{ .period_factor = 0x09, .period = 1250 },	/* FAST-80 */
 	{ .period_factor = 0x0a, .period = 2500 },	/* FAST-40 40MHz */
 	{ .period_factor = 0x0b, .period = 3030 },	/* FAST-40 33MHz */
 	{ .period_factor = 0x0c, .period = 5000 }	/* FAST-20 */
 };
 
 /*
  * Return the frequency in kHz corresponding to the given
  * sync period factor.
  *
  * Factors listed in scsi_syncrates[] use the period from that table;
  * every other factor is treated as a period of (factor * 4) ns.  A factor
  * of 0 returns 3300.
  */
 u_int
 scsi_calc_syncsrate(u_int period_factor)
 {
 	u_int i;
 
 	/*
 	 * It's a bug if period is zero, but if it is anyway, don't
 	 * die with a divide fault- instead return something which
 	 * 'approximates' async
 	 */
 	if (period_factor == 0)
 		return (3300);
 
 	/* See if the period is in the "exception" table */
 	for (i = 0; i < nitems(scsi_syncrates); i++) {
 		if (period_factor == scsi_syncrates[i].period_factor) {
 			/*
 			 * Table periods are in 100ths of ns, so this
 			 * division yields a frequency in kHz.
 			 */
 			return (100000000 / scsi_syncrates[i].period);
 		}
 	}
 
 	/*
 	 * Wasn't in the table, so use the standard 4 times conversion:
 	 * 10000000 / (period_factor * 4 * 10).  That is the same value as
 	 * 250000 / period_factor, but written this way the divisor cannot
 	 * wrap around (to 0 in the worst case) for very large factors.
 	 */
 	return (250000 / period_factor);
 }
 
 /*
  * Return the SCSI sync parameter that corresponds to
  * the passed in period in 10ths of ns.
  *
  * A period of 0 means async and returns ~0.  Otherwise the first
  * scsi_syncrates[] entry whose period is at least the requested one
  * supplies the factor; periods longer than every table entry use the
  * standard "period in ns / 4" conversion.
  */
 u_int
 scsi_calc_syncparam(u_int period)
 {
 	u_int i;
 
 	if (period == 0)
 		return (~0);	/* Async */
 
 	/*
 	 * See if the period is in the "exception" table.  The table is in
 	 * 100ths of ns while period is in 10ths, so compare against the
 	 * table value divided by 10.  For integers this is the same test as
 	 * (period * 10 <= table period), without the multiplication that
 	 * could wrap for very large periods.
 	 */
 	for (i = 0; i < nitems(scsi_syncrates); i++) {
 		if (period <= scsi_syncrates[i].period / 10)
 			return (scsi_syncrates[i].period_factor);
 	}
 
 	/*
 	 * Wasn't in the table, so use the standard
 	 * 1/4 period in ns conversion: (period * 10) / 400 == period / 40.
 	 */
 	return (period / 40);
 }
 
 /*
  * Return 1 if the VPD identification descriptor at bufp has type
  * SVPD_ID_TYPE_NAA, is at least sizeof(struct scsi_vpd_id_naa_ieee_reg)
  * long, and carries an NAA value of SVPD_ID_NAA_LOCAL_REG or
  * SVPD_ID_NAA_IEEE_REG.  Return 0 otherwise.
  */
 int
 scsi_devid_is_naa_ieee_reg(uint8_t *bufp)
 {
 	struct scsi_vpd_id_descriptor *d;
 	struct scsi_vpd_id_naa_basic *basic;
 	int naa_type;
 
 	d = (struct scsi_vpd_id_descriptor *)bufp;
 	if ((d->id_type & SVPD_ID_TYPE_MASK) != SVPD_ID_TYPE_NAA ||
 	    d->length < sizeof(struct scsi_vpd_id_naa_ieee_reg))
 		return (0);
 
 	/* The identifier is only examined once its length has been checked. */
 	basic = (struct scsi_vpd_id_naa_basic *)d->identifier;
 	naa_type = basic->naa >> SVPD_ID_NAA_NAA_SHIFT;
 	return (naa_type == SVPD_ID_NAA_LOCAL_REG ||
 	    naa_type == SVPD_ID_NAA_IEEE_REG);
 }
 
 /*
  * Return 1 if the descriptor at bufp passes scsi_devid_is_naa_ieee_reg(),
  * has SVPD_ID_PIV set and names SCSI_PROTO_SAS in its protocol field.
  * Return 0 otherwise.
  */
 int
 scsi_devid_is_sas_target(uint8_t *bufp)
 {
 	struct scsi_vpd_id_descriptor *d;
 
 	d = (struct scsi_vpd_id_descriptor *)bufp;
 	/* Without PIV the protocol field is reserved, so it is not consulted. */
 	return (scsi_devid_is_naa_ieee_reg(bufp) &&
 	    (d->id_type & SVPD_ID_PIV) != 0 &&
 	    (d->proto_codeset >> SVPD_ID_PROTO_SHIFT) == SCSI_PROTO_SAS);
 }
 
 /*
  * Return 1 if the descriptor at bufp has association SVPD_ID_ASSOC_LUN
  * and type SVPD_ID_TYPE_EUI64, 0 otherwise.
  */
 int
 scsi_devid_is_lun_eui64(uint8_t *bufp)
 {
 	struct scsi_vpd_id_descriptor *d;
 
 	d = (struct scsi_vpd_id_descriptor *)bufp;
 	return ((d->id_type & SVPD_ID_ASSOC_MASK) == SVPD_ID_ASSOC_LUN &&
 	    (d->id_type & SVPD_ID_TYPE_MASK) == SVPD_ID_TYPE_EUI64);
 }
 
 /*
  * Return 1 if the descriptor at bufp has association SVPD_ID_ASSOC_LUN
  * and type SVPD_ID_TYPE_NAA, 0 otherwise.
  */
 int
 scsi_devid_is_lun_naa(uint8_t *bufp)
 {
 	struct scsi_vpd_id_descriptor *d;
 
 	d = (struct scsi_vpd_id_descriptor *)bufp;
 	return ((d->id_type & SVPD_ID_ASSOC_MASK) == SVPD_ID_ASSOC_LUN &&
 	    (d->id_type & SVPD_ID_TYPE_MASK) == SVPD_ID_TYPE_NAA);
 }
 
 /*
  * Return 1 if the descriptor at bufp has association SVPD_ID_ASSOC_LUN
  * and type SVPD_ID_TYPE_T10, 0 otherwise.
  */
 int
 scsi_devid_is_lun_t10(uint8_t *bufp)
 {
 	struct scsi_vpd_id_descriptor *d;
 
 	d = (struct scsi_vpd_id_descriptor *)bufp;
 	return ((d->id_type & SVPD_ID_ASSOC_MASK) == SVPD_ID_ASSOC_LUN &&
 	    (d->id_type & SVPD_ID_TYPE_MASK) == SVPD_ID_TYPE_T10);
 }
 
 /*
  * Return 1 if the descriptor at bufp has association SVPD_ID_ASSOC_LUN
  * and type SVPD_ID_TYPE_SCSI_NAME, 0 otherwise.
  */
 int
 scsi_devid_is_lun_name(uint8_t *bufp)
 {
 	struct scsi_vpd_id_descriptor *d;
 
 	d = (struct scsi_vpd_id_descriptor *)bufp;
 	return ((d->id_type & SVPD_ID_ASSOC_MASK) == SVPD_ID_ASSOC_LUN &&
 	    (d->id_type & SVPD_ID_TYPE_MASK) == SVPD_ID_TYPE_SCSI_NAME);
 }
 
 /*
  * Return 1 if the descriptor at bufp has association SVPD_ID_ASSOC_LUN
  * and type SVPD_ID_TYPE_MD5_LUN_ID, 0 otherwise.
  */
 int
 scsi_devid_is_lun_md5(uint8_t *bufp)
 {
 	struct scsi_vpd_id_descriptor *d;
 
 	d = (struct scsi_vpd_id_descriptor *)bufp;
 	return ((d->id_type & SVPD_ID_ASSOC_MASK) == SVPD_ID_ASSOC_LUN &&
 	    (d->id_type & SVPD_ID_TYPE_MASK) == SVPD_ID_TYPE_MD5_LUN_ID);
 }
 
 /*
  * Return 1 if the descriptor at bufp has association SVPD_ID_ASSOC_LUN
  * and type SVPD_ID_TYPE_UUID, 0 otherwise.
  */
 int
 scsi_devid_is_lun_uuid(uint8_t *bufp)
 {
 	struct scsi_vpd_id_descriptor *d;
 
 	d = (struct scsi_vpd_id_descriptor *)bufp;
 	return ((d->id_type & SVPD_ID_ASSOC_MASK) == SVPD_ID_ASSOC_LUN &&
 	    (d->id_type & SVPD_ID_TYPE_MASK) == SVPD_ID_TYPE_UUID);
 }
 
 /*
  * Return 1 if the descriptor at bufp has association SVPD_ID_ASSOC_PORT
  * and type SVPD_ID_TYPE_NAA, 0 otherwise.
  */
 int
 scsi_devid_is_port_naa(uint8_t *bufp)
 {
 	struct scsi_vpd_id_descriptor *d;
 
 	d = (struct scsi_vpd_id_descriptor *)bufp;
 	return ((d->id_type & SVPD_ID_ASSOC_MASK) == SVPD_ID_ASSOC_PORT &&
 	    (d->id_type & SVPD_ID_TYPE_MASK) == SVPD_ID_TYPE_NAA);
 }
 
 /*
  * Walk the identification descriptors packed into the len bytes starting
  * at desc and return the first one for which ck_fn returns non-zero.  A
  * NULL ck_fn accepts the first descriptor.  Returns NULL when no
  * descriptor that fits entirely inside the buffer is accepted.
  */
 struct scsi_vpd_id_descriptor *
 scsi_get_devid_desc(struct scsi_vpd_id_descriptor *desc, uint32_t len,
     scsi_devid_checkfn_t ck_fn)
 {
 	uint8_t *limit;
 
 	limit = (uint8_t *)desc + len;
 
 	/*
 	 * Both the descriptor header and the identifier bytes that its
 	 * length field announces have to end at or before the limit.
 	 */
 	while (desc->identifier <= limit &&
 	    desc->identifier + desc->length <= limit) {
 		if (ck_fn == NULL || ck_fn((uint8_t *)desc) != 0)
 			return (desc);
 
 		/* The next descriptor starts right after this identifier. */
 		desc = (struct scsi_vpd_id_descriptor *)(desc->identifier +
 		    desc->length);
 	}
 	return (NULL);
 }
 
 /*
  * Search the descriptor list of a device identification VPD page for the
  * first descriptor accepted by ck_fn (see scsi_get_devid_desc()).
  * page_len is the number of bytes available at id.  Returns NULL if the
  * page is shorter than its header or nothing matches.
  */
 struct scsi_vpd_id_descriptor *
 scsi_get_devid(struct scsi_vpd_device_id *id, uint32_t page_len,
     scsi_devid_checkfn_t ck_fn)
 {
 	uint32_t avail, list_len;
 
 	if (page_len < sizeof(*id))
 		return (NULL);
 
 	/* Never trust the page's own length beyond the bytes we were given. */
 	avail = page_len - sizeof(*id);
 	list_len = scsi_2btoul(id->length);
 	if (list_len > avail)
 		list_len = avail;
 
 	return (scsi_get_devid_desc(
 	    (struct scsi_vpd_id_descriptor *)id->desc_list, list_len, ck_fn));
 }
 
 /*
  * Append a human-readable rendering of the Transport ID at hdr to sb.
  *
  * sb:		Destination string buffer.
  * hdr:		Transport ID; the protocol is taken from the low bits
  *		of hdr->format_protocol.
  * valid_len:	Number of valid bytes at hdr.  It is used to bound the
  *		variable-length iSCSI name; the fixed-size formats are
  *		read without consulting it.
  *
  * Always returns 0.  Unknown or undefined formats are reported as text
  * in sb rather than through the return value.
  */
 int
 scsi_transportid_sbuf(struct sbuf *sb, struct scsi_transportid_header *hdr,
 		      uint32_t valid_len)
 {
 	switch (hdr->format_protocol & SCSI_TRN_PROTO_MASK) {
 	case SCSI_PROTO_FC: {
 		struct scsi_transportid_fcp *fcp;
 		uint64_t n_port_name;
 
 		fcp = (struct scsi_transportid_fcp *)hdr;
 
 		n_port_name = scsi_8btou64(fcp->n_port_name);
 
 		sbuf_printf(sb, "FCP address: 0x%.16jx",(uintmax_t)n_port_name);
 		break;
 	}
 	case SCSI_PROTO_SPI: {
 		struct scsi_transportid_spi *spi;
 
 		spi = (struct scsi_transportid_spi *)hdr;
 
 		sbuf_printf(sb, "SPI address: %u,%u",
 			    scsi_2btoul(spi->scsi_addr),
 			    scsi_2btoul(spi->rel_trgt_port_id));
 		break;
 	}
 	case SCSI_PROTO_SSA:
 		/*
 		 * XXX KDM there is no transport ID defined in SPC-4 for
 		 * SSA.
 		 */
 		break;
 	case SCSI_PROTO_1394: {
 		struct scsi_transportid_1394 *sbp;
 		uint64_t eui64;
 
 		sbp = (struct scsi_transportid_1394 *)hdr;
 
 		eui64 = scsi_8btou64(sbp->eui64);
 		sbuf_printf(sb, "SBP address: 0x%.16jx", (uintmax_t)eui64);
 		break;
 	}
 	case SCSI_PROTO_RDMA: {
 		struct scsi_transportid_rdma *rdma;
 		unsigned int i;
 
 		rdma = (struct scsi_transportid_rdma *)hdr;
 
 		/* The initiator port ID is printed as one long hex string. */
 		sbuf_printf(sb, "RDMA address: 0x");
 		for (i = 0; i < sizeof(rdma->initiator_port_id); i++)
 			sbuf_printf(sb, "%02x", rdma->initiator_port_id[i]);
 		break;
 	}
 	case SCSI_PROTO_ISCSI: {
 		uint32_t add_len, name_off, i;
 		uint8_t *iscsi_name = NULL;
 		uint8_t *len_bytes = NULL;
 		int nul_found = 0;
 
 		sbuf_printf(sb, "iSCSI address: ");
 		if ((hdr->format_protocol & SCSI_TRN_FORMAT_MASK) == 
 		    SCSI_TRN_ISCSI_FORMAT_DEVICE) {
 			struct scsi_transportid_iscsi_device *dev;
 
 			dev = (struct scsi_transportid_iscsi_device *)hdr;
 
 			name_off = __offsetof(
 			    struct scsi_transportid_iscsi_device, iscsi_name);
 			len_bytes = dev->additional_length;
 			iscsi_name = &dev->iscsi_name[0];
 		} else if ((hdr->format_protocol & SCSI_TRN_FORMAT_MASK) ==
 			    SCSI_TRN_ISCSI_FORMAT_PORT) {
 			struct scsi_transportid_iscsi_port *port;
 
 			port = (struct scsi_transportid_iscsi_port *)hdr;
 
 			name_off = __offsetof(
 			    struct scsi_transportid_iscsi_port, iscsi_name);
 			len_bytes = port->additional_length;
 			iscsi_name = &port->iscsi_name[0];
 		} else {
 			sbuf_printf(sb, "unknown format %x",
 				    (hdr->format_protocol &
 				     SCSI_TRN_FORMAT_MASK) >>
 				     SCSI_TRN_FORMAT_SHIFT);
 			break;
 		}
 
 		/*
 		 * Verify how much additional data we really have.  If the
 		 * valid data does not reach past the fixed header there is
 		 * no name at all; checking this first keeps the subtraction
 		 * below from wrapping around and defeating the clamp.
 		 */
 		if (valid_len > name_off) {
 			add_len = scsi_2btoul(len_bytes);
 			add_len = MIN(add_len, valid_len - name_off);
 		} else
 			add_len = 0;
 
 		if (add_len == 0) {
 			sbuf_printf(sb, "not enough data");
 			break;
 		}
 		/*
 		 * This is supposed to be a NUL-terminated ASCII 
 		 * string, but you never know.  So we're going to
 		 * check.  We need to do this because there is no
 		 * sbuf equivalent of strncat().
 		 */
 		for (i = 0; i < add_len; i++) {
 			if (iscsi_name[i] == '\0') {
 				nul_found = 1;
 				break;
 			}
 		}
 		/*
 		 * If there is a NUL in the name, we can just use
 		 * sbuf_cat().  Otherwise we need to use sbuf_bcat().
 		 */
 		if (nul_found != 0)
 			sbuf_cat(sb, iscsi_name);
 		else
 			sbuf_bcat(sb, iscsi_name, add_len);
 		break;
 	}
 	case SCSI_PROTO_SAS: {
 		struct scsi_transportid_sas *sas;
 		uint64_t sas_addr;
 
 		sas = (struct scsi_transportid_sas *)hdr;
 
 		sas_addr = scsi_8btou64(sas->sas_address);
 		sbuf_printf(sb, "SAS address: 0x%.16jx", (uintmax_t)sas_addr);
 		break;
 	}
 	case SCSI_PROTO_ADITP:
 	case SCSI_PROTO_ATA:
 	case SCSI_PROTO_UAS:
 		/*
 		 * No Transport ID format for ADI, ATA or USB is defined in
 		 * SPC-4.
 		 */
 		sbuf_printf(sb, "No known Transport ID format for protocol "
 			    "%#x", hdr->format_protocol & SCSI_TRN_PROTO_MASK);
 		break;
 	case SCSI_PROTO_SOP: {
 		struct scsi_transportid_sop *sop;
 		struct scsi_sop_routing_id_norm *rid;
 
 		sop = (struct scsi_transportid_sop *)hdr;
 		rid = (struct scsi_sop_routing_id_norm *)sop->routing_id;
 
 		/*
 		 * Note that there is no alternate format specified in SPC-4
 		 * for the PCIe routing ID, so we don't really have a way
 		 * to know whether the second byte of the routing ID is
 		 * a device and function or just a function.  So we just
 		 * assume bus,device,function.
 		 */
 		sbuf_printf(sb, "SOP Routing ID: %u,%u,%u",
 			    rid->bus, rid->devfunc >> SCSI_TRN_SOP_DEV_SHIFT,
 			    rid->devfunc & SCSI_TRN_SOP_FUNC_NORM_MAX);
 		break;
 	}
 	case SCSI_PROTO_NONE:
 	default:
 		sbuf_printf(sb, "Unknown protocol %#x",
 			    hdr->format_protocol & SCSI_TRN_PROTO_MASK);
 		break;
 	}
 
 	return (0);
 }
 
 /*
  * Protocol names accepted in a Transport ID string, with the protocol
  * identifier each one selects.  Several protocols have more than one
  * accepted spelling.
  */
 struct scsi_nv scsi_proto_map[] = {
 	{ .name = "fcp",	.value = SCSI_PROTO_FC },
 	{ .name = "spi",	.value = SCSI_PROTO_SPI },
 	{ .name = "ssa",	.value = SCSI_PROTO_SSA },
 	{ .name = "sbp",	.value = SCSI_PROTO_1394 },
 	{ .name = "1394",	.value = SCSI_PROTO_1394 },
 	{ .name = "srp",	.value = SCSI_PROTO_RDMA },
 	{ .name = "rdma",	.value = SCSI_PROTO_RDMA },
 	{ .name = "iscsi",	.value = SCSI_PROTO_ISCSI },
 	{ .name = "iqn",	.value = SCSI_PROTO_ISCSI },
 	{ .name = "sas",	.value = SCSI_PROTO_SAS },
 	{ .name = "aditp",	.value = SCSI_PROTO_ADITP },
 	{ .name = "ata",	.value = SCSI_PROTO_ATA },
 	{ .name = "uas",	.value = SCSI_PROTO_UAS },
 	{ .name = "usb",	.value = SCSI_PROTO_UAS },
 	{ .name = "sop",	.value = SCSI_PROTO_SOP }
 };
 
 /*
  * Return the name of the first of the num_table_entries entries in table
  * whose value equals the given value, or NULL if there is none.
  */
 const char *
 scsi_nv_to_str(struct scsi_nv *table, int num_table_entries, uint64_t value)
 {
 	int idx;
 
 	idx = 0;
 	while (idx < num_table_entries) {
 		if (table[idx].value == value)
 			return (table[idx].name);
 		idx++;
 	}
 
 	return (NULL);
 }
 
 /*
  * Given a name/value table, find a value matching the given name.
  *
  * name may be a prefix of a table name.  With SCSI_NV_FLAG_IG_CASE set
  * in flags the comparison ignores case.  On any match *table_entry is
  * set to the index of the matching entry; when the result is ambiguous
  * it holds the index of the last match.
  *
  * Return values:
  *	SCSI_NV_FOUND - match found
  *	SCSI_NV_AMBIGUOUS - more than one match, none of them exact
  *	SCSI_NV_NOT_FOUND - no match found
  */
 scsi_nv_status
 scsi_get_nv(struct scsi_nv *table, int num_table_entries,
 	    char *name, int *table_entry, scsi_nv_flags flags)
 {
 	size_t want_len;
 	int idx, ignore_case, prefix_hits;
 
 	prefix_hits = 0;
 	ignore_case = (flags & SCSI_NV_FLAG_IG_CASE) != 0;
 	/* The name is only looked at when there is a table to search. */
 	want_len = (num_table_entries > 0) ? strlen(name) : 0;
 
 	for (idx = 0; idx < num_table_entries; idx++) {
 		const char *candidate;
 		int differs;
 
 		candidate = table[idx].name;
 		if (ignore_case)
 			differs = strncasecmp(candidate, name, want_len);
 		else
 			differs = strncmp(candidate, name, want_len);
 		if (differs != 0)
 			continue;
 
 		*table_entry = idx;
 
 		/*
 		 * The first want_len characters agree, so equal lengths
 		 * mean the whole name matched: an exact match ends the
 		 * search immediately.
 		 */
 		if (strlen(candidate) == want_len)
 			return (SCSI_NV_FOUND);
 
 		/* Prefix match only; keep counting. */
 		prefix_hits++;
 	}
 
 	if (prefix_hits == 0)
 		return (SCSI_NV_NOT_FOUND);
 	if (prefix_hits == 1)
 		return (SCSI_NV_FOUND);
 	return (SCSI_NV_AMBIGUOUS);
 }
 
 /*
  * Parse transport IDs for Fibre Channel, 1394 and SAS.  Since these are
  * all 64-bit numbers, the code is similar.
  *
  * proto_id:	    SCSI_PROTO_FC, SCSI_PROTO_1394 or SCSI_PROTO_SAS.
  * id_str:	    The number to parse; any base strtouq() accepts with a
  *		    base argument of 0.
  * hdr:		    On success, set to the newly allocated transport ID.
  * alloc_len:	    On success, set to the number of bytes allocated.
  * type, flags:	    Malloc bucket and flags (kernel only).
  * error_str:	    If non-NULL, receives a message on failure.
  * error_str_len:   Allocated length of the error string.
  *
  * Returns 0 for success, 1 for failure.  *hdr and *alloc_len are left
  * untouched on failure.
  */
 int
 scsi_parse_transportid_64bit(int proto_id, char *id_str,
 			     struct scsi_transportid_header **hdr,
 			     unsigned int *alloc_len,
 #ifdef _KERNEL
 			     struct malloc_type *type, int flags,
 #endif
 			     char *error_str, int error_str_len)
 {
 	uint64_t value;
 	char *endptr;
 	int retval;
 	size_t alloc_size;
 
 	retval = 0;
 
 	/*
 	 * The caller hands us whatever followed the protocol name, which
 	 * is NULL when nothing did.
 	 */
 	if (id_str == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: no ID found",
 				 __func__);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * Require at least one digit and nothing after the number; an
 	 * empty string would otherwise be accepted as 0.
 	 */
 	value = strtouq(id_str, &endptr, 0); 
 	if (endptr == id_str || *endptr != '\0') {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: error "
 				 "parsing ID %s, 64-bit number required",
 				 __func__, id_str);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	switch (proto_id) {
 	case SCSI_PROTO_FC:
 		alloc_size = sizeof(struct scsi_transportid_fcp);
 		break;
 	case SCSI_PROTO_1394:
 		alloc_size = sizeof(struct scsi_transportid_1394);
 		break;
 	case SCSI_PROTO_SAS:
 		alloc_size = sizeof(struct scsi_transportid_sas);
 		break;
 	default:
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: unsupported "
 				 "protocol %d", __func__, proto_id);
 		}
 		retval = 1;
 		goto bailout;
 	}
 #ifdef _KERNEL
 	*hdr = malloc(alloc_size, type, flags);
 #else /* _KERNEL */
 	*hdr = malloc(alloc_size);
 #endif /*_KERNEL */
 	if (*hdr == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: unable to "
 				 "allocate %zu bytes", __func__, alloc_size);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	*alloc_len = alloc_size;
 
 	bzero(*hdr, alloc_size);
 
 	/* Fill in the protocol-specific header and the 8-byte identifier. */
 	switch (proto_id) {
 	case SCSI_PROTO_FC: {
 		struct scsi_transportid_fcp *fcp;
 
 		fcp = (struct scsi_transportid_fcp *)(*hdr);
 		fcp->format_protocol = SCSI_PROTO_FC |
 				       SCSI_TRN_FCP_FORMAT_DEFAULT;
 		scsi_u64to8b(value, fcp->n_port_name);
 		break;
 	}
 	case SCSI_PROTO_1394: {
 		struct scsi_transportid_1394 *sbp;
 
 		sbp = (struct scsi_transportid_1394 *)(*hdr);
 		sbp->format_protocol = SCSI_PROTO_1394 |
 				       SCSI_TRN_1394_FORMAT_DEFAULT;
 		scsi_u64to8b(value, sbp->eui64);
 		break;
 	}
 	case SCSI_PROTO_SAS: {
 		struct scsi_transportid_sas *sas;
 
 		sas = (struct scsi_transportid_sas *)(*hdr);
 		sas->format_protocol = SCSI_PROTO_SAS |
 				       SCSI_TRN_SAS_FORMAT_DEFAULT;
 		scsi_u64to8b(value, sas->sas_address);
 		break;
 	}
 	default:
 		break;
 	}
 bailout:
 	return (retval);
 }
 
 /*
  * Parse a SPI (Parallel SCSI) address of the form: id,rel_tgt_port
  *
  * Both values are stored in 2-byte fields, so each must be a number no
  * larger than 0xffff.  id_str is modified in place by strsep().
  *
  * hdr:		    On success, set to the newly allocated transport ID.
  * alloc_len:	    On success, set to the number of bytes allocated.
  * type, flags:	    Malloc bucket and flags (kernel only).
  * error_str:	    If non-NULL, receives a message on failure.
  * error_str_len:   Allocated length of the error string.
  *
  * Returns 0 for success, 1 for failure.
  */
 int
 scsi_parse_transportid_spi(char *id_str, struct scsi_transportid_header **hdr,
 			   unsigned int *alloc_len,
 #ifdef _KERNEL
 			   struct malloc_type *type, int flags,
 #endif
 			   char *error_str, int error_str_len)
 {
 	const unsigned long field_max = 0xffff;
 	unsigned long scsi_addr, target_port;
 	struct scsi_transportid_spi *spi;
 	char *tmpstr, *endptr;
 	int retval;
 
 	retval = 0;
 
 	/* strsep() returns NULL only when id_str itself was NULL. */
 	tmpstr = strsep(&id_str, ",");
 	if (tmpstr == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len,
 				 "%s: no ID found", __func__);
 		}
 		retval = 1;
 		goto bailout;
 	}
 	/* An empty field is not a number, even though strtoul() yields 0. */
 	scsi_addr = strtoul(tmpstr, &endptr, 0);
 	if (endptr == tmpstr || *endptr != '\0') {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: error "
 				 "parsing SCSI ID %s, number required",
 				 __func__, tmpstr);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	/* id_str is NULL here when the string contained no comma. */
 	if (id_str == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: no relative "
 				 "target port found", __func__);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	target_port = strtoul(id_str, &endptr, 0);
 	if (endptr == id_str || *endptr != '\0') {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: error "
 				 "parsing relative target port %s, number "
 				 "required", __func__, id_str);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * scsi_ulto2b() keeps only the low 16 bits, so reject anything that
 	 * would be silently truncated.
 	 */
 	if (scsi_addr > field_max) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: SCSI ID "
 				 "%lu greater than maximum %lu", __func__,
 				 scsi_addr, field_max);
 		}
 		retval = 1;
 		goto bailout;
 	}
 	if (target_port > field_max) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: relative "
 				 "target port %lu greater than maximum %lu",
 				 __func__, target_port, field_max);
 		}
 		retval = 1;
 		goto bailout;
 	}
 #ifdef _KERNEL
 	spi = malloc(sizeof(*spi), type, flags);
 #else
 	spi = malloc(sizeof(*spi));
 #endif
 	if (spi == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: unable to "
 				 "allocate %zu bytes", __func__,
 				 sizeof(*spi));
 		}
 		retval = 1;
 		goto bailout;
 	}
 	*alloc_len = sizeof(*spi);
 	bzero(spi, sizeof(*spi));
 
 	spi->format_protocol = SCSI_PROTO_SPI | SCSI_TRN_SPI_FORMAT_DEFAULT;
 	scsi_ulto2b(scsi_addr, spi->scsi_addr);
 	scsi_ulto2b(target_port, spi->rel_trgt_port_id);
 
 	*hdr = (struct scsi_transportid_header *)spi;
 bailout:
 	return (retval);
 }
 
 /*
  * Parse an RDMA/SRP Initiator Port ID string.  This is 32 hexadecimal digits,
  * optionally prefixed by "0x" or "0X".
  *
  * hdr:		    On success, set to the newly allocated transport ID.
  * alloc_len:	    On success, set to the number of bytes allocated.
  * type, flags:	    Malloc bucket and flags (kernel only).
  * error_str:	    If non-NULL, receives a message on failure.
  * error_str_len:   Allocated length of the error string.
  *
  * Returns 0 for success, 1 for failure.
  */
 int
 scsi_parse_transportid_rdma(char *id_str, struct scsi_transportid_header **hdr,
 			    unsigned int *alloc_len,
 #ifdef _KERNEL
 			    struct malloc_type *type, int flags,
 #endif
 			    char *error_str, int error_str_len)
 {
 	struct scsi_transportid_rdma *rdma;
 	int retval;
 	size_t id_len, rdma_id_size;
 	uint8_t rdma_id[SCSI_TRN_RDMA_PORT_LEN];
 	char *tmpstr;
 	unsigned int i, j;
 
 	retval = 0;
 
 	/*
 	 * The caller hands us whatever followed the protocol name, which
 	 * is NULL when nothing did.
 	 */
 	if (id_str == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: no ID found",
 				 __func__);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	id_len = strlen(id_str);
 	rdma_id_size = SCSI_TRN_RDMA_PORT_LEN;
 
 	/*
 	 * Check the size.  It needs to be either 32 or 34 characters long.
 	 */
 	if ((id_len != (rdma_id_size * 2))
 	 && (id_len != ((rdma_id_size * 2) + 2))) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: RDMA ID "
 				 "must be 32 hex digits (0x prefix "
 				 "optional), only %zu seen", __func__, id_len);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	tmpstr = id_str;
 	/*
 	 * If the user gave us 34 characters, the string needs to start
 	 * with '0x'.
 	 */
 	if (id_len == ((rdma_id_size * 2) + 2)) {
 	 	if ((tmpstr[0] == '0')
 		 && ((tmpstr[1] == 'x') || (tmpstr[1] == 'X'))) {
 			tmpstr += 2;
 		} else {
 			if (error_str != NULL) {
 				snprintf(error_str, error_str_len, "%s: RDMA "
 					 "ID prefix, if used, must be \"0x\", "
 					 "got %s", __func__, tmpstr);
 			}
 			retval = 1;
 			goto bailout;
 		}
 	}
 	bzero(rdma_id, sizeof(rdma_id));
 
 	/*
 	 * Convert ASCII hex into binary bytes.  There is no standard
 	 * 128-bit integer type, and so no strtou128t() routine to convert
 	 * from hex into a large integer.  In the end, we're not going to
 	 * an integer, but rather to a byte array, so that and the fact
 	 * that we require the user to give us 32 hex digits simplifies the
 	 * logic.
 	 */
 	for (i = 0; i < (rdma_id_size * 2); i++) {
 		int cur_shift;
 		unsigned char c;
 
 		/* Increment the byte array one for every 2 hex digits */
 		j = i >> 1;
 
 		/*
 		 * The first digit in every pair is the most significant
 		 * 4 bits.  The second is the least significant 4 bits.
 		 */
 		cur_shift = ((i % 2) == 0) ? 4 : 0;
 
 		/*
 		 * Convert the ASCII hex character into a number.  Anything
 		 * that is neither a digit nor a letter is mapped to a value
 		 * above 0xf so that a single range check below rejects it
 		 * together with letters past 'f' (e.g. "0x12jklmno").
 		 */
 		c = tmpstr[i];
 		if (isdigit(c))
 			c -= '0';
 		else if (isalpha(c))
 			c -= isupper(c) ? 'A' - 10 : 'a' - 10;
 		else
 			c = 0x10;
 
 		if (c > 0xf) {
 			if (error_str != NULL) {
 				snprintf(error_str, error_str_len, "%s: "
 					 "RDMA ID must be hex digits, got "
 					 "invalid character %c", __func__,
 					 tmpstr[i]);
 			}
 			retval = 1;
 			goto bailout;
 		}
 		
 		rdma_id[j] |= c << cur_shift;
 	}
 
 #ifdef _KERNEL
 	rdma = malloc(sizeof(*rdma), type, flags);
 #else
 	rdma = malloc(sizeof(*rdma));
 #endif
 	if (rdma == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: unable to "
 				 "allocate %zu bytes", __func__,
 				 sizeof(*rdma));
 		}
 		retval = 1;
 		goto bailout;
 	}
 	*alloc_len = sizeof(*rdma);
 	bzero(rdma, *alloc_len);
 
 	rdma->format_protocol = SCSI_PROTO_RDMA | SCSI_TRN_RDMA_FORMAT_DEFAULT;
 	bcopy(rdma_id, rdma->initiator_port_id, SCSI_TRN_RDMA_PORT_LEN);
 
 	*hdr = (struct scsi_transportid_header *)rdma;
 
 bailout:
 	return (retval);
 }
 
 /*
  * Parse an iSCSI name.  The format is either just the name:
  *
  *	iqn.2012-06.com.example:target0
  * or the name, separator and initiator session ID:
  *
  *	iqn.2012-06.com.example:target0,i,0x123
  *
  * The separator format is exact.
  *
  * id_str is expected with the leading "iqn." already removed; it is put
  * back in front of the stored name.  A string without a separator
  * produces the device format, one with a separator the port format.
  *
  * hdr:		    On success, set to the newly allocated transport ID.
  * alloc_len:	    On success, set to the number of bytes allocated.
  * type, flags:	    Malloc bucket and flags (kernel only).
  * error_str:	    If non-NULL, receives a message on failure.
  * error_str_len:   Allocated length of the error string.
  *
  * Returns 0 for success, 1 for failure.
  */
 int
 scsi_parse_transportid_iscsi(char *id_str, struct scsi_transportid_header **hdr,
 			     unsigned int *alloc_len,
 #ifdef _KERNEL
 			     struct malloc_type *type, int flags,
 #endif
 			     char *error_str, int error_str_len)
 {
 	size_t id_len, sep_len, id_size, name_len;
 	int retval;
 	unsigned int i, sep_pos, sep_found;
 	const char *sep_template = ",i,0x";
 	const char *iqn_prefix = "iqn.";
 	struct scsi_transportid_iscsi_device *iscsi;
 
 	retval = 0;
 	sep_found = 0;
 
 	/*
 	 * The caller hands us whatever followed the protocol name, which
 	 * is NULL when nothing did.
 	 */
 	if (id_str == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: no ID found",
 				 __func__);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	id_len = strlen(id_str);
 	sep_len = strlen(sep_template);
 
 	/*
 	 * The separator is defined as exactly ',i,0x'.  Any other commas,
 	 * or any other form, is an error.  So look for a comma, and once
 	 * we find that, the next few characters must match the separator
 	 * exactly.  Once we get through the separator, there should be at
 	 * least one character.
 	 */
 	for (i = 0, sep_pos = 0; i < id_len; i++) {
 		/* Still scanning the name for the first comma. */
 		if (sep_pos == 0) {
 		 	if (id_str[i] == sep_template[sep_pos])
 				sep_pos++;
 
 			continue;
 		}
 		/* Inside the separator: every character must match. */
 		if (sep_pos < sep_len) {
 			if (id_str[i] == sep_template[sep_pos]) {
 				sep_pos++;
 				continue;
 			} 
 			if (error_str != NULL) {
 				snprintf(error_str, error_str_len, "%s: "
 					 "invalid separator in iSCSI name "
 					 "\"%s\"",
 					 __func__, id_str);
 			}
 			retval = 1;
 			goto bailout;
 		} else {
 			/* At least one character follows the separator. */
 			sep_found = 1;
 			break;
 		}
 	}
 
 	/*
 	 * Check to see whether we have a separator but no digits after it.
 	 */
 	if ((sep_pos != 0)
 	 && (sep_found == 0)) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: no digits "
 				 "found after separator in iSCSI name \"%s\"",
 				 __func__, id_str);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * The incoming ID string has the "iqn." prefix stripped off.  We
 	 * need enough space for the base structure (the structures are the
 	 * same for the two iSCSI forms), the prefix, the ID string and a
 	 * terminating NUL.
 	 */
 	id_size = sizeof(*iscsi) + strlen(iqn_prefix) + id_len + 1;
 
 #ifdef _KERNEL
 	iscsi = malloc(id_size, type, flags);
 #else
 	iscsi = malloc(id_size);
 #endif
 	if (iscsi == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: unable to "
 				 "allocate %zu bytes", __func__, id_size);
 		}
 		retval = 1;
 		goto bailout;
 	}
 	*alloc_len = id_size;
 	bzero(iscsi, id_size);
 
 	iscsi->format_protocol = SCSI_PROTO_ISCSI;
 	if (sep_found == 0)
 		iscsi->format_protocol |= SCSI_TRN_ISCSI_FORMAT_DEVICE;
 	else
 		iscsi->format_protocol |= SCSI_TRN_ISCSI_FORMAT_PORT;
 	/* The additional length covers the name including its NUL. */
 	name_len = id_size - sizeof(*iscsi);
 	scsi_ulto2b(name_len, iscsi->additional_length);
 	snprintf(iscsi->iscsi_name, name_len, "%s%s", iqn_prefix, id_str);
 
 	*hdr = (struct scsi_transportid_header *)iscsi;
 
 bailout:
 	return (retval);
 }
 
 /*
  * Parse a SCSI over PCIe (SOP) identifier.  The Routing ID can either be
  * of the form 'bus,device,function' or 'bus,function'.
  *
  * id_str is modified in place by strsep().
  *
  * hdr:		    On success, set to the newly allocated transport ID.
  * alloc_len:	    On success, set to the number of bytes allocated.
  * type, flags:	    Malloc bucket and flags (kernel only).
  * error_str:	    If non-NULL, receives a message on failure.
  * error_str_len:   Allocated length of the error string.
  *
  * Returns 0 for success, 1 for failure.
  */
 int
 scsi_parse_transportid_sop(char *id_str, struct scsi_transportid_header **hdr,
 			   unsigned int *alloc_len,
 #ifdef _KERNEL
 			   struct malloc_type *type, int flags,
 #endif
 			   char *error_str, int error_str_len)
 {
 	struct scsi_transportid_sop *sop;
 	unsigned long bus, device, function;
 	char *tmpstr, *endptr;
 	int retval, device_spec;
 
 	retval = 0;
 	device_spec = 0;
 	device = 0;
 
 	tmpstr = strsep(&id_str, ",");
 	if ((tmpstr == NULL)
 	 || (*tmpstr == '\0')) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: no ID found",
 				 __func__);
 		}
 		retval = 1;
 		goto bailout;
 	}
 	bus = strtoul(tmpstr, &endptr, 0);
 	if (*endptr != '\0') {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: error "
 				 "parsing PCIe bus %s, number required",
 				 __func__, tmpstr);
 		}
 		retval = 1;
 		goto bailout;
 	}
 	if ((id_str == NULL) 
 	 || (*id_str == '\0')) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: no PCIe "
 				 "device or function found", __func__);
 		}
 		retval = 1;
 		goto bailout;
 	}
 	/*
 	 * Until we know whether a third value follows, treat the second
 	 * one as the function.
 	 */
 	tmpstr = strsep(&id_str, ",");
 	function = strtoul(tmpstr, &endptr, 0);
 	if (*endptr != '\0') {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: error "
 				 "parsing PCIe device/function %s, number "
 				 "required", __func__, tmpstr);
 		}
 		retval = 1;
 		goto bailout;
 	}
 	/*
 	 * Check to see whether the user specified a third value.  If so,
 	 * the second is the device.
 	 */
 	if (id_str != NULL) {
 		if (*id_str == '\0') {
 			if (error_str != NULL) {
 				snprintf(error_str, error_str_len, "%s: "
 					 "no PCIe function found", __func__);
 			}
 			retval = 1;
 			goto bailout;
 		}
 		device = function;
 		device_spec = 1;
 		function = strtoul(id_str, &endptr, 0);
 		if (*endptr != '\0') {
 			if (error_str != NULL) {
 				snprintf(error_str, error_str_len, "%s: "
 					 "error parsing PCIe function %s, "
 					 "number required", __func__, id_str);
 			}
 			retval = 1;
 			goto bailout;
 		}
 	}
 	if (bus > SCSI_TRN_SOP_BUS_MAX) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: bus value "
 				 "%lu greater than maximum %u", __func__,
 				 bus, SCSI_TRN_SOP_BUS_MAX);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * Compare against the largest device number, which is also what
 	 * the message reports, not against the bit mask of the device
 	 * field within the devfunc byte.
 	 */
 	if ((device_spec != 0)
 	 && (device > SCSI_TRN_SOP_DEV_MAX)) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: device value "
 				 "%lu greater than maximum %u", __func__,
 				 device, SCSI_TRN_SOP_DEV_MAX);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	/* The function limit depends on which of the two forms was used. */
 	if (((device_spec != 0)
 	  && (function > SCSI_TRN_SOP_FUNC_NORM_MAX))
 	 || ((device_spec == 0)
 	  && (function > SCSI_TRN_SOP_FUNC_ALT_MAX))) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: function value "
 				 "%lu greater than maximum %u", __func__,
 				 function, (device_spec == 0) ?
 				 SCSI_TRN_SOP_FUNC_ALT_MAX : 
 				 SCSI_TRN_SOP_FUNC_NORM_MAX);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 #ifdef _KERNEL
 	sop = malloc(sizeof(*sop), type, flags);
 #else
 	sop = malloc(sizeof(*sop));
 #endif
 	if (sop == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: unable to "
 				 "allocate %zu bytes", __func__, sizeof(*sop));
 		}
 		retval = 1;
 		goto bailout;
 	}
 	*alloc_len = sizeof(*sop);
 	bzero(sop, sizeof(*sop));
 	sop->format_protocol = SCSI_PROTO_SOP | SCSI_TRN_SOP_FORMAT_DEFAULT;
 	if (device_spec != 0) {
 		struct scsi_sop_routing_id_norm rid;
 
 		/* bus,device,function: device and function share a byte. */
 		rid.bus = bus;
 		rid.devfunc = (device << SCSI_TRN_SOP_DEV_SHIFT) | function;
 		bcopy(&rid, sop->routing_id, MIN(sizeof(rid),
 		      sizeof(sop->routing_id)));
 	} else {
 		struct scsi_sop_routing_id_alt rid;
 
 		/* bus,function: the function has the second byte to itself. */
 		rid.bus = bus;
 		rid.function = function;
 		bcopy(&rid, sop->routing_id, MIN(sizeof(rid),
 		      sizeof(sop->routing_id)));
 	}
 
 	*hdr = (struct scsi_transportid_header *)sop;
 bailout:
 	return (retval);
 }
 
 /*
  * Parse a textual Transport ID and allocate its binary form.
  *
  * transportid_str: NUL-terminated string with format: protocol,id
  *		    The ID is protocol specific.  The string is modified
  *		    in place by strsep().
  * hdr:		    Storage will be allocated for the transport ID.
  * alloc_len:	    The amount of memory allocated is returned here.
  * type:	    Malloc bucket (kernel only).
  * flags:	    Malloc flags (kernel only).
  * error_str:	    If non-NULL, it will contain error information (without
  * 		    a terminating newline) if an error is returned.
  * error_str_len:   Allocated length of the error string.
  *
  * Returns 0 for success, non-zero for failure.
  */
 int
 scsi_parse_transportid(char *transportid_str,
 		       struct scsi_transportid_header **hdr,
 		       unsigned int *alloc_len,
 #ifdef _KERNEL
 		       struct malloc_type *type, int flags,
 #endif
 		       char *error_str, int error_str_len)
 {
 	char *tmpstr;
 	scsi_nv_status status;
 	u_int num_proto_entries;
 	int retval, table_entry;
 
 	retval = 0;
 	table_entry = 0;
 
 	/*
 	 * We do allow a period as well as a comma to separate the protocol
 	 * from the ID string.  This is to accommodate iSCSI names, which
 	 * start with "iqn.".
 	 */
 	tmpstr = strsep(&transportid_str, ",.");
 	if (tmpstr == NULL) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len,
 				 "%s: transportid_str is NULL", __func__);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	num_proto_entries = nitems(scsi_proto_map);
 	status = scsi_get_nv(scsi_proto_map, num_proto_entries, tmpstr,
 			     &table_entry, SCSI_NV_FLAG_IG_CASE);
 	if (status != SCSI_NV_FOUND) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: %s protocol "
 				 "name %s", __func__,
 				 (status == SCSI_NV_AMBIGUOUS) ? "ambiguous" :
 				 "invalid", tmpstr);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * strsep() leaves transportid_str NULL when the input consisted of
 	 * a protocol name with no separator after it.  The SPI and SOP
 	 * parsers report a missing ID themselves; the parsers selected
 	 * below read their ID string unconditionally, so catch it here.
 	 */
 	switch (scsi_proto_map[table_entry].value) {
 	case SCSI_PROTO_FC:
 	case SCSI_PROTO_1394:
 	case SCSI_PROTO_SAS:
 	case SCSI_PROTO_RDMA:
 	case SCSI_PROTO_ISCSI:
 		if (transportid_str == NULL) {
 			if (error_str != NULL) {
 				snprintf(error_str, error_str_len, "%s: no "
 					 "ID found for protocol %s", __func__,
 					 tmpstr);
 			}
 			retval = 1;
 			goto bailout;
 		}
 		break;
 	default:
 		break;
 	}
 
 	/* Hand the remainder of the string to the protocol's parser. */
 	switch (scsi_proto_map[table_entry].value) {
 	case SCSI_PROTO_FC:
 	case SCSI_PROTO_1394:
 	case SCSI_PROTO_SAS:
 		retval = scsi_parse_transportid_64bit(
 		    scsi_proto_map[table_entry].value, transportid_str, hdr,
 		    alloc_len,
 #ifdef _KERNEL
 		    type, flags,
 #endif
 		    error_str, error_str_len);
 		break;
 	case SCSI_PROTO_SPI:
 		retval = scsi_parse_transportid_spi(transportid_str, hdr,
 		    alloc_len,
 #ifdef _KERNEL
 		    type, flags,
 #endif
 		    error_str, error_str_len);
 		break;
 	case SCSI_PROTO_RDMA:
 		retval = scsi_parse_transportid_rdma(transportid_str, hdr,
 		    alloc_len,
 #ifdef _KERNEL
 		    type, flags,
 #endif
 		    error_str, error_str_len);
 		break;
 	case SCSI_PROTO_ISCSI:
 		retval = scsi_parse_transportid_iscsi(transportid_str, hdr,
 		    alloc_len,
 #ifdef _KERNEL
 		    type, flags,
 #endif
 		    error_str, error_str_len);
 		break;
 	case SCSI_PROTO_SOP:
 		retval = scsi_parse_transportid_sop(transportid_str, hdr,
 		    alloc_len,
 #ifdef _KERNEL
 		    type, flags,
 #endif
 		    error_str, error_str_len);
 		break;
 	case SCSI_PROTO_SSA:
 	case SCSI_PROTO_ADITP:
 	case SCSI_PROTO_ATA:
 	case SCSI_PROTO_UAS:
 	case SCSI_PROTO_NONE:
 	default:
 		/*
 		 * There is no format defined for a Transport ID for these
 		 * protocols.  So even if the user gives us something, we
 		 * have no way to turn it into a standard SCSI Transport ID.
 		 */
 		retval = 1;
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "%s: no Transport "
 				 "ID format exists for protocol %s",
 				 __func__, tmpstr);
 		}
 		goto bailout;
 	}
 bailout:
 	return (retval);
 }
 
 /*
  * Built-in table describing known MAM attributes.
  *
  * Each entry holds, in order: the attribute ID, formatting flags
  * (SCSI_ATTR_FLAG_*), a human readable description, an optional unit
  * suffix, a to_str callback that formats the value into an sbuf, and a
  * parse_str callback (NULL for every entry in this table).
  *
  * scsi_find_attrib_entry() searches the table linearly by ID and returns
  * the first match.  When to_str is NULL, scsi_attrib_sbuf() falls back to
  * scsi_attrib_value_sbuf(), which formats the value according to the
  * format bits in the attribute header.
  *
  * Entries given as raw hexadecimal IDs (0x0ff1.., 0x17f1..) have no
  * symbolic constant here and are always printed as a hexdump.
  */
 struct scsi_attrib_table_entry scsi_mam_attr_table[] = {
 	{ SMA_ATTR_REM_CAP_PARTITION, SCSI_ATTR_FLAG_NONE,
 	  "Remaining Capacity in Partition",
 	  /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf,/*parse_str*/ NULL },
 	{ SMA_ATTR_MAX_CAP_PARTITION, SCSI_ATTR_FLAG_NONE,
 	  "Maximum Capacity in Partition",
 	  /*suffix*/"MB", /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL },
 	{ SMA_ATTR_TAPEALERT_FLAGS, SCSI_ATTR_FLAG_HEX,
 	  "TapeAlert Flags",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL },
 	{ SMA_ATTR_LOAD_COUNT, SCSI_ATTR_FLAG_NONE,
 	  "Load Count",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL },
 	{ SMA_ATTR_MAM_SPACE_REMAINING, SCSI_ATTR_FLAG_NONE,
 	  "MAM Space Remaining",
 	  /*suffix*/"bytes", /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_DEV_ASSIGNING_ORG, SCSI_ATTR_FLAG_NONE,
 	  "Assigning Organization",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_FORMAT_DENSITY_CODE, SCSI_ATTR_FLAG_HEX,
 	  "Format Density Code",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL },
 	{ SMA_ATTR_INITIALIZATION_COUNT, SCSI_ATTR_FLAG_NONE,
 	  "Initialization Count",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf, /*parse_str*/ NULL },
 	{ SMA_ATTR_VOLUME_ID, SCSI_ATTR_FLAG_NONE,
 	  "Volume Identifier",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_VOLUME_CHANGE_REF, SCSI_ATTR_FLAG_HEX,
 	  "Volume Change Reference",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_DEV_SERIAL_LAST_LOAD, SCSI_ATTR_FLAG_NONE,
 	  "Device Vendor/Serial at Last Load",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_DEV_SERIAL_LAST_LOAD_1, SCSI_ATTR_FLAG_NONE,
 	  "Device Vendor/Serial at Last Load - 1",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_DEV_SERIAL_LAST_LOAD_2, SCSI_ATTR_FLAG_NONE,
 	  "Device Vendor/Serial at Last Load - 2",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_DEV_SERIAL_LAST_LOAD_3, SCSI_ATTR_FLAG_NONE,
 	  "Device Vendor/Serial at Last Load - 3",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_vendser_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_TOTAL_MB_WRITTEN_LT, SCSI_ATTR_FLAG_NONE,
 	  "Total MB Written in Medium Life",
 	  /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_TOTAL_MB_READ_LT, SCSI_ATTR_FLAG_NONE,
 	  "Total MB Read in Medium Life",
 	  /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_TOTAL_MB_WRITTEN_CUR, SCSI_ATTR_FLAG_NONE,
 	  "Total MB Written in Current/Last Load",
 	  /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_TOTAL_MB_READ_CUR, SCSI_ATTR_FLAG_NONE,
 	  "Total MB Read in Current/Last Load",
 	  /*suffix*/ "MB", /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_FIRST_ENC_BLOCK, SCSI_ATTR_FLAG_NONE,
 	  "Logical Position of First Encrypted Block",
 	  /*suffix*/ NULL, /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_NEXT_UNENC_BLOCK, SCSI_ATTR_FLAG_NONE,
 	  "Logical Position of First Unencrypted Block after First "
 	  "Encrypted Block",
 	  /*suffix*/ NULL, /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	/* No to_str: the two usage history attributes use default output. */
 	{ SMA_ATTR_MEDIUM_USAGE_HIST, SCSI_ATTR_FLAG_NONE,
 	  "Medium Usage History",
 	  /*suffix*/ NULL, /*to_str*/ NULL,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_PART_USAGE_HIST, SCSI_ATTR_FLAG_NONE,
 	  "Partition Usage History",
 	  /*suffix*/ NULL, /*to_str*/ NULL,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_MED_MANUF, SCSI_ATTR_FLAG_NONE,
 	  "Medium Manufacturer",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_MED_SERIAL, SCSI_ATTR_FLAG_NONE,
 	  "Medium Serial Number",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_MED_LENGTH, SCSI_ATTR_FLAG_NONE,
 	  "Medium Length",
 	  /*suffix*/"m", /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	/*
 	 * The only entry using the FP flags: scsi_attrib_int_sbuf() divides
 	 * the raw value by 10 when SCSI_ATTR_FLAG_DIV_10 is set.
 	 */
 	{ SMA_ATTR_MED_WIDTH, SCSI_ATTR_FLAG_FP | SCSI_ATTR_FLAG_DIV_10 |
 	  SCSI_ATTR_FLAG_FP_1DIGIT,
 	  "Medium Width",
 	  /*suffix*/"mm", /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_MED_ASSIGNING_ORG, SCSI_ATTR_FLAG_NONE,
 	  "Assigning Organization",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_MED_DENSITY_CODE, SCSI_ATTR_FLAG_HEX,
 	  "Medium Density Code",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_MED_MANUF_DATE, SCSI_ATTR_FLAG_NONE,
 	  "Medium Manufacture Date",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_MAM_CAPACITY, SCSI_ATTR_FLAG_NONE,
 	  "MAM Capacity",
 	  /*suffix*/"bytes", /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_MED_TYPE, SCSI_ATTR_FLAG_HEX,
 	  "Medium Type",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_MED_TYPE_INFO, SCSI_ATTR_FLAG_HEX,
 	  "Medium Type Information",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	/*
 	 * NOTE(review): shares its description with SMA_ATTR_MED_SERIAL
 	 * above but is formatted as an integer -- confirm this is intended.
 	 */
 	{ SMA_ATTR_MED_SERIAL_NUM, SCSI_ATTR_FLAG_NONE,
 	  "Medium Serial Number",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_APP_VENDOR, SCSI_ATTR_FLAG_NONE,
 	  "Application Vendor",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_APP_NAME, SCSI_ATTR_FLAG_NONE,
 	  "Application Name",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_APP_VERSION, SCSI_ATTR_FLAG_NONE,
 	  "Application Version",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_USER_MED_TEXT_LABEL, SCSI_ATTR_FLAG_NONE,
 	  "User Medium Text Label",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_text_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_LAST_WRITTEN_TIME, SCSI_ATTR_FLAG_NONE,
 	  "Date and Time Last Written",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_TEXT_LOCAL_ID, SCSI_ATTR_FLAG_HEX,
 	  "Text Localization Identifier",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_BARCODE, SCSI_ATTR_FLAG_NONE,
 	  "Barcode",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_HOST_OWNER_NAME, SCSI_ATTR_FLAG_NONE,
 	  "Owning Host Textual Name",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_text_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_MEDIA_POOL, SCSI_ATTR_FLAG_NONE,
 	  "Media Pool",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_text_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_PART_USER_LABEL, SCSI_ATTR_FLAG_NONE,
 	  "Partition User Text Label",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_LOAD_UNLOAD_AT_PART, SCSI_ATTR_FLAG_NONE,
 	  "Load/Unload at Partition",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_int_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_APP_FORMAT_VERSION, SCSI_ATTR_FLAG_NONE,
 	  "Application Format Version",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_ascii_sbuf,
 	  /*parse_str*/ NULL },
 	{ SMA_ATTR_VOL_COHERENCY_INFO, SCSI_ATTR_FLAG_NONE,
 	  "Volume Coherency Information",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_volcoh_sbuf,
 	  /*parse_str*/ NULL },
 	/* "Spectra MLM" attributes, first ID range (0x0ffX). */
 	{ 0x0ff1, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM Creation",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	{ 0x0ff2, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM C3",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	{ 0x0ff3, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM RW",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	{ 0x0ff4, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM SDC List",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	{ 0x0ff7, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM Post Scan",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	{ 0x0ffe, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM Checksum",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	/* "Spectra MLM" attributes, second ID range (0x17fX). */
 	{ 0x17f1, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM Creation",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	{ 0x17f2, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM C3",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	{ 0x17f3, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM RW",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	{ 0x17f4, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM SDC List",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	{ 0x17f7, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM Post Scan",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 	/*
 	 * NOTE(review): the checksum ID ends in 0xff here but in 0xfe in
 	 * the 0x0ffX range above -- confirm against the vendor definition.
 	 */
 	{ 0x17ff, SCSI_ATTR_FLAG_NONE,
 	  "Spectra MLM Checksum",
 	  /*suffix*/NULL, /*to_str*/ scsi_attrib_hexdump_sbuf,
 	  /*parse_str*/ NULL },
 };
 
 /*
  * Print out Volume Coherency Information (Attribute 0x080c).
  * This field has two variable length members, including one at the
  * beginning, so it isn't practical to have a fixed structure definition.
  * This is current as of SSC4r03 (see section 4.2.21.3), dated March 25,
  * 2013.
  *
  * The attribute value is walked in this order:
  *	1 byte	 length of the Volume Change Reference (VCR) value
  *	n bytes	 VCR value
  *	8 bytes	 Volume Coherency Count
  *	8 bytes	 Volume Coherency Set Identifier
  *	2 bytes	 length of the Application Client Specific Information
  *	m bytes	 Application Client Specific Information
  *
  * sb:		    Output buffer.
  * hdr:		    Attribute header, followed by the attribute value.
  * valid_len:	    Number of valid bytes starting at hdr.
  * flags:	    Unused.
  * output_flags:    Unused.
  * error_str:	    If non-NULL, receives a description of any failure.
  * error_str_len:   Allocated length of the error string.
  *
  * Every embedded length is checked against the number of bytes left in
  * the attribute before it is used, so a malformed attribute yields an
  * error instead of a read past the end of the value.  The data pointed to
  * by hdr is never modified.
  *
  * Returns 0 for success, non-zero for failure.  Output may already have
  * been added to sb when a failure is reported.
  */
 int
 scsi_attrib_volcoh_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr,
 			 uint32_t valid_len, uint32_t flags,
 			 uint32_t output_flags, char *error_str,
 			 int error_str_len)
 {
 	size_t avail_len, remaining, fixed_len;
 	uint32_t field_size;
 	uint64_t tmp_val;
 	uint8_t *cur_ptr;
 	int retval;
 	int vcr_len, as_len;
 	int vcr_is_int, ltfs_off, uuid_len;
 
 	retval = 0;
 	tmp_val = 0;
 
 	/* Without a complete header the length field cannot be trusted. */
 	if (valid_len < sizeof(*hdr)) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Valid length %u "
 				 "is smaller than the attribute header",
 				 valid_len);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	field_size = scsi_2btoul(hdr->length);
 	avail_len = valid_len - sizeof(*hdr);
 	if (field_size > avail_len) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Available "
 				 "length of attribute ID 0x%.4x %zu < field "
 				 "length %u", scsi_2btoul(hdr->id), avail_len,
 				 field_size);
 		}
 		retval = 1;
 		goto bailout;
 	} else if (field_size == 0) {
 		/*
 		 * It isn't clear from the spec whether a field length of
 		 * 0 is invalid here.  It probably is, but be lenient here
 		 * to avoid inconveniencing the user.
 		 */
 		goto bailout;
 	}
 
 	/* "remaining" counts the attribute bytes not yet consumed. */
 	cur_ptr = hdr->attribute;
 	remaining = field_size;
 
 	vcr_len = *cur_ptr;
 	cur_ptr++;
 	remaining--;
 
 	sbuf_printf(sb, "\n\tVolume Change Reference Value:");
 
 	if (vcr_len == 0) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Volume Change "
 				 "Reference value has length of 0");
 		}
 		retval = 1;
 		goto bailout;
 	}
 	if ((size_t)vcr_len > remaining) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Volume Change "
 				 "Reference value length %d exceeds the %zu "
 				 "bytes left in the attribute", vcr_len,
 				 remaining);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * Lengths that match an integer size are printed as a number, any
 	 * other length is hexdumped.
 	 */
 	vcr_is_int = 1;
 	switch (vcr_len) {
 	case 1:
 		tmp_val = *cur_ptr;
 		break;
 	case 2:
 		tmp_val = scsi_2btoul(cur_ptr);
 		break;
 	case 3:
 		tmp_val = scsi_3btoul(cur_ptr);
 		break;
 	case 4:
 		tmp_val = scsi_4btoul(cur_ptr);
 		break;
 	case 8:
 		tmp_val = scsi_8btou64(cur_ptr);
 		break;
 	default:
 		vcr_is_int = 0;
 		sbuf_printf(sb, "\n");
 		sbuf_hexdump(sb, cur_ptr, vcr_len, NULL, 0);
 		break;
 	}
 	/* Only print a number when one was actually decoded above. */
 	if (vcr_is_int)
 		sbuf_printf(sb, " 0x%jx\n", (uintmax_t)tmp_val);
 
 	cur_ptr += vcr_len;
 	remaining -= vcr_len;
 
 	/* Two 64-bit fields and a 16-bit length must follow the VCR. */
 	fixed_len = 2 * sizeof(uint64_t) + sizeof(uint16_t);
 	if (remaining < fixed_len) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Volume Coherency "
 				 "Information is truncated: %zu bytes left, "
 				 "%zu needed", remaining, fixed_len);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	tmp_val = scsi_8btou64(cur_ptr);
 	sbuf_printf(sb, "\tVolume Coherency Count: %ju\n", (uintmax_t)tmp_val);
 
 	cur_ptr += sizeof(tmp_val);
 	tmp_val = scsi_8btou64(cur_ptr);
 	sbuf_printf(sb, "\tVolume Coherency Set Identifier: 0x%jx\n",
 		    (uintmax_t)tmp_val);
 
 	/*
 	 * Figure out how long the Application Client Specific Information
 	 * is and produce a hexdump.
 	 */
 	cur_ptr += sizeof(tmp_val);
 	as_len = scsi_2btoul(cur_ptr);
 	cur_ptr += sizeof(uint16_t);
 	remaining -= fixed_len;
 	if ((size_t)as_len > remaining) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Application "
 				 "Client Specific Information length %d "
 				 "exceeds the %zu bytes left in the attribute",
 				 as_len, remaining);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	sbuf_printf(sb, "\tApplication Client Specific Information: ");
 	if (((as_len == SCSI_LTFS_VER0_LEN)
 	  || (as_len == SCSI_LTFS_VER1_LEN))
 	 && (strncmp((const char *)cur_ptr, SCSI_LTFS_STR_NAME,
 		     SCSI_LTFS_STR_LEN) == 0)) {
 		sbuf_printf(sb, "LTFS\n");
 
 		/*
 		 * The UUID starts one byte after the signature (presumably
 		 * skipping its terminator).  Print at most
 		 * SCSI_LTFS_UUID_LEN characters rather than writing a NUL
 		 * into the caller's buffer.
 		 */
 		ltfs_off = SCSI_LTFS_STR_LEN + 1;
 		uuid_len = as_len - ltfs_off;
 		if (uuid_len > SCSI_LTFS_UUID_LEN)
 			uuid_len = SCSI_LTFS_UUID_LEN;
 		if (uuid_len < 0)
 			uuid_len = 0;
 		sbuf_printf(sb, "\tLTFS UUID: %.*s\n", uuid_len,
 			    (const char *)cur_ptr + ltfs_off);
 
 		/*
 		 * The version byte follows the UUID and one more byte.  It
 		 * is only present when the reported length reaches that far.
 		 */
 		ltfs_off += SCSI_LTFS_UUID_LEN + 1;
 		if (as_len > ltfs_off)
 			sbuf_printf(sb, "\tLTFS Version: %d\n",
 				    cur_ptr[ltfs_off]);
 	} else {
 		sbuf_printf(sb, "Unknown\n");
 		sbuf_hexdump(sb, cur_ptr, as_len, NULL, 0);
 	}
 
 bailout:
 	return (retval);
 }
 
 /*
  * Print a "vendor serial-number" attribute (struct scsi_attrib_vendser).
  *
  * sb:		    Output buffer.
  * hdr:		    Attribute header, followed by the attribute value.
  * valid_len:	    Number of valid bytes starting at hdr.
  * flags:	    Unused.
  * output_flags:    SCSI_ATTR_OUTPUT_NONASCII_* selects how non-ASCII
  *		    characters are rendered; escaping is the default.
  * error_str:	    If non-NULL, receives a description of any failure.
  * error_str_len:   Allocated length of the error string.
  *
  * Only the part of each member that lies inside the reported attribute
  * length is printed, so a short attribute cannot cause a read beyond it.
  *
  * Returns 0 for success, non-zero for failure.
  */
 int
 scsi_attrib_vendser_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr,
 			 uint32_t valid_len, uint32_t flags,
 			 uint32_t output_flags, char *error_str,
 			 int error_str_len)
 {
 	size_t avail_len, vendor_len, serial_len;
 	uint32_t field_size;
 	struct scsi_attrib_vendser *vendser;
 	const uint8_t *attr_end, *vendor, *serial;
 	cam_strvis_flags strvis_flags;
 	int retval = 0;
 
 	/* Without a complete header the length field cannot be trusted. */
 	if (valid_len < sizeof(*hdr)) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Valid length %u "
 				 "is smaller than the attribute header",
 				 valid_len);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	field_size = scsi_2btoul(hdr->length);
 	avail_len = valid_len - sizeof(*hdr);
 	if (field_size > avail_len) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Available "
 				 "length of attribute ID 0x%.4x %zu < field "
 				 "length %u", scsi_2btoul(hdr->id), avail_len,
 				 field_size);
 		}
 		retval = 1;
 		goto bailout;
 	} else if (field_size == 0) {
 		/*
 		 * A field size of 0 doesn't make sense here.  The device
 		 * can at least give you the vendor ID, even if it can't
 		 * give you the serial number.
 		 */
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "The length of "
 				 "attribute ID 0x%.4x is 0",
 				 scsi_2btoul(hdr->id));
 		}
 		retval = 1;
 		goto bailout;
 	}
 	vendser = (struct scsi_attrib_vendser *)hdr->attribute;
 
 	/*
 	 * Clamp each member to the bytes the device actually reported.
 	 * This is done with pointers so that it does not depend on the
 	 * layout of struct scsi_attrib_vendser.
 	 */
 	attr_end = (const uint8_t *)hdr->attribute + field_size;
 	vendor = (const uint8_t *)vendser->vendor;
 	serial = (const uint8_t *)vendser->serial_num;
 
 	vendor_len = 0;
 	if (vendor < attr_end)
 		vendor_len = MIN(sizeof(vendser->vendor),
 		    (size_t)(attr_end - vendor));
 	serial_len = 0;
 	if (serial < attr_end)
 		serial_len = MIN(sizeof(vendser->serial_num),
 		    (size_t)(attr_end - serial));
 
 	switch (output_flags & SCSI_ATTR_OUTPUT_NONASCII_MASK) {
 	case SCSI_ATTR_OUTPUT_NONASCII_TRIM:
 		strvis_flags = CAM_STRVIS_FLAG_NONASCII_TRIM;
 		break;
 	case SCSI_ATTR_OUTPUT_NONASCII_RAW:
 		strvis_flags = CAM_STRVIS_FLAG_NONASCII_RAW;
 		break;
 	case SCSI_ATTR_OUTPUT_NONASCII_ESC:
 	default:
 		strvis_flags = CAM_STRVIS_FLAG_NONASCII_ESC;
 		break;
 	}
 	cam_strvis_sbuf(sb, vendor, vendor_len, strvis_flags);
 	sbuf_putc(sb, ' ');
 	cam_strvis_sbuf(sb, serial, serial_len, strvis_flags);
 bailout:
 	return (retval);
 }
 
 /*
  * Print an attribute value as a hexdump.
  *
  * sb:		    Output buffer.
  * hdr:		    Attribute header, followed by the attribute value.
  * valid_len:	    Number of valid bytes starting at hdr.
  * flags:	    Unused.
  * output_flags:    Unused.
  * error_str:	    If non-NULL, receives a description of any failure.
  * error_str_len:   Allocated length of the error string.
  *
  * The dump covers the smaller of the reported attribute length and the
  * number of valid bytes after the header; a truncated value is dumped as
  * far as it goes without reporting an error.  Nothing is printed for an
  * empty value.
  *
  * Returns 0 for success, non-zero if valid_len does not cover the header.
  */
 int
 scsi_attrib_hexdump_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr,
 			 uint32_t valid_len, uint32_t flags,
 			 uint32_t output_flags, char *error_str,
 			 int error_str_len)
 {
 	uint32_t field_size;
 	size_t avail_len;
 	uint32_t print_len;
 	uint8_t *num_ptr;
 
 	/*
 	 * Reject a short header up front.  Otherwise the subtraction below
 	 * wraps around and the dump length becomes the unvalidated field
 	 * length (or larger).
 	 */
 	if (valid_len < sizeof(*hdr)) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Valid length %u "
 				 "is smaller than the attribute header",
 				 valid_len);
 		}
 		return (1);
 	}
 
 	field_size = scsi_2btoul(hdr->length);
 	avail_len = valid_len - sizeof(*hdr);
 	/* The result never exceeds field_size, so it fits in 32 bits. */
 	print_len = MIN(avail_len, field_size);
 	num_ptr = hdr->attribute;
 
 	if (print_len > 0) {
 		sbuf_printf(sb, "\n");
 		sbuf_hexdump(sb, num_ptr, print_len, NULL, 0);
 	}
 
 	return (0);
 }
 
 /*
  * Print an attribute value as a big-endian unsigned integer.
  *
  * sb:		    Output buffer.
  * hdr:		    Attribute header, followed by the attribute value.
  * valid_len:	    Number of valid bytes starting at hdr.
  * flags:	    SCSI_ATTR_FLAG_HEX prints in hexadecimal.
  *		    SCSI_ATTR_FLAG_FP prints a scaled value: divided by 10
  *		    with SCSI_ATTR_FLAG_DIV_10, and with one fractional
  *		    digit with SCSI_ATTR_FLAG_FP_1DIGIT.  The kernel has no
  *		    floating point here, so it prints the integer quotient.
  * output_flags:    Only passed through to the hexdump fallback.
  * error_str:	    If non-NULL, receives a description of any failure.
  * error_str_len:   Allocated length of the error string.
  *
  * Values of 1, 2, 3, 4 or 8 bytes are decoded.  A length of 0 prints
  * nothing and any other length falls back to a hexdump.
  *
  * Returns 0 for success, non-zero for failure.
  */
 int
 scsi_attrib_int_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr,
 		     uint32_t valid_len, uint32_t flags,
 		     uint32_t output_flags, char *error_str,
 		     int error_str_len)
 {
 	uint64_t print_number;
 	size_t avail_len;
 	uint32_t number_size;
 	int retval = 0;
 
 	/* Without a complete header the length field cannot be trusted. */
 	if (valid_len < sizeof(*hdr)) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Valid length %u "
 				 "is smaller than the attribute header",
 				 valid_len);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	number_size = scsi_2btoul(hdr->length);
 
 	avail_len = valid_len - sizeof(*hdr);
 	if (avail_len < number_size) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Available "
 				 "length of attribute ID 0x%.4x %zu < field "
 				 "length %u", scsi_2btoul(hdr->id), avail_len,
 				 number_size);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	switch (number_size) {
 	case 0:
 		/*
 		 * We don't treat this as an error, since there may be
 		 * scenarios where a device reports a field but then gives
 		 * a length of 0.  See the note in scsi_attrib_ascii_sbuf().
 		 */
 		goto bailout;
 	case 1:
 		print_number = hdr->attribute[0];
 		break;
 	case 2:
 		print_number = scsi_2btoul(hdr->attribute);
 		break;
 	case 3:
 		print_number = scsi_3btoul(hdr->attribute);
 		break;
 	case 4:
 		print_number = scsi_4btoul(hdr->attribute);
 		break;
 	case 8:
 		print_number = scsi_8btou64(hdr->attribute);
 		break;
 	default:
 		/*
 		 * If we wind up here, the number is too big to print
 		 * normally, so just do a hexdump.
 		 */
 		retval = scsi_attrib_hexdump_sbuf(sb, hdr, valid_len,
 						  flags, output_flags,
 						  error_str, error_str_len);
 		goto bailout;
 	}
 
 	if (flags & SCSI_ATTR_FLAG_FP) {
 #ifndef _KERNEL
 		long double num_float;
 
 		num_float = (long double)print_number;
 
 		if (flags & SCSI_ATTR_FLAG_DIV_10)
 			num_float /= 10;
 
 		sbuf_printf(sb, "%.*Lf", (flags & SCSI_ATTR_FLAG_FP_1DIGIT) ?
 			    1 : 0, num_float);
 #else /* _KERNEL */
 		/* %ju expects uintmax_t, so cast as the branches below do. */
 		sbuf_printf(sb, "%ju", (uintmax_t)((flags &
 			    SCSI_ATTR_FLAG_DIV_10) ? (print_number / 10) :
 			    print_number));
 #endif /* _KERNEL */
 	} else if (flags & SCSI_ATTR_FLAG_HEX) {
 		sbuf_printf(sb, "0x%jx", (uintmax_t)print_number);
 	} else
 		sbuf_printf(sb, "%ju", (uintmax_t)print_number);
 
 bailout:
 	return (retval);
 }
 
 /*
  * Print an ASCII attribute value.
  *
  * sb:		    Output buffer.
  * hdr:		    Attribute header, followed by the attribute value.
  * valid_len:	    Number of valid bytes starting at hdr.
  * flags:	    Unused.
  * output_flags:    SCSI_ATTR_OUTPUT_NONASCII_* selects how non-ASCII
  *		    characters are rendered; escaping is the default.
  * error_str:	    If non-NULL, receives a description of any failure.
  * error_str_len:   Allocated length of the error string.
  *
  * A value that is only partly available is printed as far as it goes
  * without reporting an error.  An error is reported only when the field
  * is non-empty and none of it is available.
  *
  * Returns 0 for success, non-zero for failure.
  */
 int
 scsi_attrib_ascii_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr,
 		       uint32_t valid_len, uint32_t flags,
 		       uint32_t output_flags, char *error_str,
 		       int error_str_len)
 {
 	size_t avail_len;
 	uint32_t field_size, print_size;
 	int retval = 0;
 
 	/*
 	 * Reject a short header up front.  Otherwise the subtraction below
 	 * wraps around and the unvalidated field length would be used as
 	 * the print length.
 	 */
 	if (valid_len < sizeof(*hdr)) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Valid length %u "
 				 "is smaller than the attribute header",
 				 valid_len);
 		}
 		return (1);
 	}
 
 	avail_len = valid_len - sizeof(*hdr);
 	field_size = scsi_2btoul(hdr->length);
 	print_size = MIN(avail_len, field_size);
 
 	if (print_size > 0) {
 		cam_strvis_flags strvis_flags;
 
 		switch (output_flags & SCSI_ATTR_OUTPUT_NONASCII_MASK) {
 		case SCSI_ATTR_OUTPUT_NONASCII_TRIM:
 			strvis_flags = CAM_STRVIS_FLAG_NONASCII_TRIM;
 			break;
 		case SCSI_ATTR_OUTPUT_NONASCII_RAW:
 			strvis_flags = CAM_STRVIS_FLAG_NONASCII_RAW;
 			break;
 		case SCSI_ATTR_OUTPUT_NONASCII_ESC:
 		default:
 			strvis_flags = CAM_STRVIS_FLAG_NONASCII_ESC;
 			break;
 		}
 		cam_strvis_sbuf(sb, hdr->attribute, print_size, strvis_flags);
 	} else if (avail_len < field_size) {
 		/*
 		 * We only report an error if the user didn't allocate
 		 * enough space to hold the full value of this field.  If
 		 * the field length is 0, that is allowed by the spec.
 		 * e.g. in SPC-4r37, section 7.4.2.2.5, VOLUME IDENTIFIER
 		 * "This attribute indicates the current volume identifier
 		 * (see SMC-3) of the medium. If the device server supports
 		 * this attribute but does not have access to the volume
 		 * identifier, the device server shall report this attribute
 		 * with an attribute length value of zero."
 		 */
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Available "
 				 "length of attribute ID 0x%.4x %zu < field "
 				 "length %u", scsi_2btoul(hdr->id), avail_len,
 				 field_size);
 		}
 		retval = 1;
 	}
 
 	return (retval);
 }
 
 /*
  * Print a text attribute value.
  *
  * sb:		    Output buffer.
  * hdr:		    Attribute header, followed by the attribute value.
  * valid_len:	    Number of valid bytes starting at hdr.
  * flags:	    Unused.
  * output_flags:    With SCSI_ATTR_OUTPUT_TEXT_RAW every byte is emitted
  *		    unchanged; otherwise bytes of 0x80 and above are
  *		    written as "%xx".
  * error_str:	    If non-NULL, receives a description of any failure.
  * error_str_len:   Allocated length of the error string.
  *
  * NUL bytes in the value are skipped.  A value that is only partly
  * available is printed as far as it goes without reporting an error.
  *
  * Returns 0 for success, non-zero for failure.
  */
 int
 scsi_attrib_text_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr,
 		      uint32_t valid_len, uint32_t flags,
 		      uint32_t output_flags, char *error_str,
 		      int error_str_len)
 {
 	size_t avail_len;
 	uint32_t field_size, print_size;
 	int retval = 0;
 	int esc_text = 1;
 
 	/*
 	 * Reject a short header up front.  Otherwise the subtraction below
 	 * wraps around and the unvalidated field length would be used as
 	 * the print length.
 	 */
 	if (valid_len < sizeof(*hdr)) {
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Valid length %u "
 				 "is smaller than the attribute header",
 				 valid_len);
 		}
 		return (1);
 	}
 
 	avail_len = valid_len - sizeof(*hdr);
 	field_size = scsi_2btoul(hdr->length);
 	print_size = MIN(avail_len, field_size);
 
 	if ((output_flags & SCSI_ATTR_OUTPUT_TEXT_MASK) ==
 	     SCSI_ATTR_OUTPUT_TEXT_RAW)
 		esc_text = 0;
 
 	if (print_size > 0) {
 		uint32_t i;
 
 		for (i = 0; i < print_size; i++) {
 			if (hdr->attribute[i] == '\0')
 				continue;
 			else if (((unsigned char)hdr->attribute[i] < 0x80)
 			      || (esc_text == 0))
 				sbuf_putc(sb, hdr->attribute[i]);
 			else
 				sbuf_printf(sb, "%%%02x",
 				    (unsigned char)hdr->attribute[i]);
 		}
 	} else if (avail_len < field_size) {
 		/*
 		 * We only report an error if the user didn't allocate
 		 * enough space to hold the full value of this field.
 		 */
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Available "
 				 "length of attribute ID 0x%.4x %zu < field "
 				 "length %u", scsi_2btoul(hdr->id), avail_len,
 				 field_size);
 		}
 		retval = 1;
 	}
 
 	return (retval);
 }
 
 /*
  * Linear search of an attribute table.
  *
  * Returns a pointer to the first entry whose id equals the one given, or
  * NULL when none of the num_table_entries entries matches.
  */
 struct scsi_attrib_table_entry *
 scsi_find_attrib_entry(struct scsi_attrib_table_entry *table,
 		       size_t num_table_entries, uint32_t id)
 {
 	struct scsi_attrib_table_entry *match;
 	uint32_t idx;
 
 	match = NULL;
 	idx = 0;
 	/* Stop at the first hit so that earlier entries take precedence. */
 	while ((match == NULL) && (idx < num_table_entries)) {
 		if (table[idx].id == id)
 			match = &table[idx];
 		idx++;
 	}
 
 	return (match);
 }
 
 /*
  * Look an attribute ID up in the built-in scsi_mam_attr_table.
  *
  * Returns the matching entry, or NULL if the ID is not in the table.
  */
 struct scsi_attrib_table_entry *
 scsi_get_attrib_entry(uint32_t id)
 {
 	struct scsi_attrib_table_entry *found;
 
 	found = scsi_find_attrib_entry(scsi_mam_attr_table,
 	    nitems(scsi_mam_attr_table), id);
 
 	return (found);
 }
 
 int
 scsi_attrib_value_sbuf(struct sbuf *sb, uint32_t valid_len,
    struct scsi_mam_attribute_header *hdr, uint32_t output_flags,
    char *error_str, size_t error_str_len)
 {
 	int retval;
 
 	switch (hdr->byte2 & SMA_FORMAT_MASK) {
 	case SMA_FORMAT_ASCII:
 		retval = scsi_attrib_ascii_sbuf(sb, hdr, valid_len,
 		    SCSI_ATTR_FLAG_NONE, output_flags, error_str,error_str_len);
 		break;
 	case SMA_FORMAT_BINARY:
 		if (scsi_2btoul(hdr->length) <= 8)
 			retval = scsi_attrib_int_sbuf(sb, hdr, valid_len,
 			    SCSI_ATTR_FLAG_NONE, output_flags, error_str,
 			    error_str_len);
 		else
 			retval = scsi_attrib_hexdump_sbuf(sb, hdr, valid_len,
 			    SCSI_ATTR_FLAG_NONE, output_flags, error_str,
 			    error_str_len);
 		break;
 	case SMA_FORMAT_TEXT:
 		retval = scsi_attrib_text_sbuf(sb, hdr, valid_len,
 		    SCSI_ATTR_FLAG_NONE, output_flags, error_str,
 		    error_str_len);
 		break;
 	default:
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Unknown attribute "
 			    "format 0x%x", hdr->byte2 & SMA_FORMAT_MASK);
 		}
 		retval = 1;
 		goto bailout;
 		break; /*NOTREACHED*/
 	}
 
 	sbuf_trim(sb);
 
 bailout:
 
 	return (retval);
 }
 
 /*
  * Print the label that precedes an attribute value.
  *
  * sb:		    Output buffer.
  * output_flags:    SCSI_ATTR_OUTPUT_FIELD_* bits select which of the
  *		    description, attribute number, size and read-only
  *		    status are shown.
  * hdr:		    Attribute header.
  * valid_len:	    Number of valid bytes starting at hdr.
  * desc:	    Description of the attribute, or NULL if none is known.
  *
  * Nothing is printed when valid_len does not cover the header or when
  * output_flags selects no fields.  Otherwise the selected items are
  * followed by ": ".
  */
 void
 scsi_attrib_prefix_sbuf(struct sbuf *sb, uint32_t output_flags,
 			struct scsi_mam_attribute_header *hdr,
 			uint32_t valid_len, const char *desc)
 {
 	int need_space = 0;
 	uint32_t len;
 	uint32_t id;
 
 	/*
 	 * We can't do anything if we don't have enough valid data for the
 	 * header.
 	 */
 	if (valid_len < sizeof(*hdr))
 		return;
 
 	id = scsi_2btoul(hdr->id);
 	/*
 	 * Note that we print out the value of the attribute listed in the
 	 * header, regardless of whether we actually got that many bytes
 	 * back from the device through the controller.  A truncated result
 	 * could be the result of a failure to ask for enough data; the
 	 * header indicates how many bytes are allocated for this attribute
 	 * in the MAM.
 	 */
 	len = scsi_2btoul(hdr->length);
 
 	if ((output_flags & SCSI_ATTR_OUTPUT_FIELD_MASK) ==
 	    SCSI_ATTR_OUTPUT_FIELD_NONE)
 		return;
 
 	if ((output_flags & SCSI_ATTR_OUTPUT_FIELD_DESC)
 	 && (desc != NULL)) {
 		sbuf_printf(sb, "%s", desc);
 		need_space = 1;
 	}
 
 	/* A space is only inserted directly after the description. */
 	if (output_flags & SCSI_ATTR_OUTPUT_FIELD_NUM) {
 		sbuf_printf(sb, "%s(0x%.4x)", (need_space) ? " " : "", id);
 		need_space = 0;
 	}
 
 	if (output_flags & SCSI_ATTR_OUTPUT_FIELD_SIZE) {
 		/* len is unsigned, so print it with %u. */
 		sbuf_printf(sb, "%s[%u]", (need_space) ? " " : "", len);
 		need_space = 0;
 	}
 	if (output_flags & SCSI_ATTR_OUTPUT_FIELD_RW) {
 		sbuf_printf(sb, "%s(%s)", (need_space) ? " " : "",
 			    (hdr->byte2 & SMA_READ_ONLY) ? "RO" : "RW");
 	}
 	sbuf_printf(sb, ": ");
 }
 
 /*
  * Print one MAM attribute: prefix, value, optional suffix and a newline.
  *
  * sb:		    Output buffer.
  * hdr:		    Attribute header, followed by the attribute value.
  * valid_len:	    Number of valid bytes starting at hdr.
  * user_table:	    Optional caller supplied attribute table, or NULL.
  * num_user_entries: Number of entries in user_table.
  * prefer_user_table: If non-zero user_table is searched before the
  *		    built-in table, otherwise after it.
  * output_flags:    Passed to the prefix and value formatting routines.
  * error_str:	    If non-NULL, receives a description of any failure.
  * error_str_len:   Allocated length of the error string.
  *
  * The first table entry matching the attribute ID supplies the
  * description, the suffix and the formatting routine.  If there is no
  * matching entry, or the entry has no formatting routine, the value is
  * printed according to the format bits in the header.  The prefix is
  * printed in every one of these cases.
  *
  * Returns 0 for success, non-zero for failure.  The suffix and newline
  * are only added on success.
  */
 int
 scsi_attrib_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr,
 		 uint32_t valid_len, struct scsi_attrib_table_entry *user_table,
 		 size_t num_user_entries, int prefer_user_table,
 		 uint32_t output_flags, char *error_str, int error_str_len)
 {
 	int retval;
 	struct scsi_attrib_table_entry *table1 = NULL, *table2 = NULL;
 	struct scsi_attrib_table_entry *entry = NULL;
 	size_t table1_size = 0, table2_size = 0;
 	uint32_t id;
 
 	retval = 0;
 
 	if (valid_len < sizeof(*hdr)) {
 		/* Leave the caller a message rather than stale contents. */
 		if (error_str != NULL) {
 			snprintf(error_str, error_str_len, "Valid length %u "
 				 "is smaller than the attribute header",
 				 valid_len);
 		}
 		retval = 1;
 		goto bailout;
 	}
 
 	id = scsi_2btoul(hdr->id);
 
 	/* Decide the search order of the built-in and the user table. */
 	if (user_table != NULL) {
 		if (prefer_user_table != 0) {
 			table1 = user_table;
 			table1_size = num_user_entries;
 			table2 = scsi_mam_attr_table;
 			table2_size = nitems(scsi_mam_attr_table);
 		} else {
 			table1 = scsi_mam_attr_table;
 			table1_size = nitems(scsi_mam_attr_table);
 			table2 = user_table;
 			table2_size = num_user_entries;
 		}
 	} else {
 		table1 = scsi_mam_attr_table;
 		table1_size = nitems(scsi_mam_attr_table);
 	}
 
 	entry = scsi_find_attrib_entry(table1, table1_size, id);
 	if ((entry == NULL) && (table2 != NULL))
 		entry = scsi_find_attrib_entry(table2, table2_size, id);
 
 	/*
 	 * Print the prefix before choosing how to format the value, so that
 	 * an entry without a to_str routine is labelled no matter which
 	 * table it came from.
 	 */
 	scsi_attrib_prefix_sbuf(sb, output_flags, hdr, valid_len,
 				(entry != NULL) ? entry->desc : NULL);
 
 	if ((entry != NULL) && (entry->to_str != NULL)) {
 		retval = entry->to_str(sb, hdr, valid_len, entry->flags,
 				       output_flags, error_str, error_str_len);
 	} else {
 		retval = scsi_attrib_value_sbuf(sb, valid_len, hdr,
 		    output_flags, error_str, error_str_len);
 	}
 
 bailout:
 	if (retval == 0) {
 		if ((entry != NULL)
 		 && (entry->suffix != NULL))
 			sbuf_printf(sb, " %s", entry->suffix);
 
 		sbuf_trim(sb);
 		sbuf_printf(sb, "\n");
 	}
 
 	return (retval);
 }
 
 /*
  * Set up a SCSI I/O CCB for a TEST UNIT READY command.  The command
  * transfers no data, so no buffer is attached.
  */
 void
 scsi_test_unit_ready(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     u_int8_t tag_action, u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_test_unit_ready *cdb;
 
 	/* Build the CDB in place; everything but the opcode stays zero. */
 	cdb = (struct scsi_test_unit_ready *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 	cdb->opcode = TEST_UNIT_READY;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_NONE,
 	    tag_action, /*data_ptr*/NULL, /*dxfer_len*/0, sense_len,
 	    /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * Set up a SCSI I/O CCB for a REQUEST SENSE command that reads up to
  * dxfer_len bytes of sense data into data_ptr.
  */
 void
 scsi_request_sense(struct ccb_scsiio *csio, u_int32_t retries,
 		   void (*cbfcnp)(struct cam_periph *, union ccb *),
 		   void *data_ptr, u_int8_t dxfer_len, u_int8_t tag_action,
 		   u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_request_sense *cdb;
 
 	/* Build the CDB in place; the allocation length is dxfer_len. */
 	cdb = (struct scsi_request_sense *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 	cdb->opcode = REQUEST_SENSE;
 	cdb->length = dxfer_len;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN,
 	    tag_action, data_ptr, dxfer_len, sense_len,
 	    /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * Set up a SCSI I/O CCB for an INQUIRY command that reads up to inq_len
  * bytes into inq_buf.  When evpd is non-zero the EVPD bit is set and
  * page_code selects the page; otherwise page_code is ignored.
  */
 void
 scsi_inquiry(struct ccb_scsiio *csio, u_int32_t retries,
 	     void (*cbfcnp)(struct cam_periph *, union ccb *),
 	     u_int8_t tag_action, u_int8_t *inq_buf, u_int32_t inq_len,
 	     int evpd, u_int8_t page_code, u_int8_t sense_len,
 	     u_int32_t timeout)
 {
 	struct scsi_inquiry *cdb;
 
 	/* Build the CDB in place. */
 	cdb = (struct scsi_inquiry *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 	cdb->opcode = INQUIRY;
 	if (evpd != 0) {
 		cdb->byte2 |= SI_EVPD;
 		cdb->page_code = page_code;
 	}
 	/* The allocation length field is two bytes wide. */
 	scsi_ulto2b(inq_len, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN,
 	    tag_action, /*data_ptr*/inq_buf, /*dxfer_len*/inq_len,
 	    sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * MODE SENSE without a subpage and without a minimum command size.
  * This is scsi_mode_sense_subpage() with both of those set to 0.
  */
 void
 scsi_mode_sense(struct ccb_scsiio *csio, uint32_t retries,
     void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action,
     int dbd, uint8_t pc, uint8_t page, uint8_t *param_buf, uint32_t param_len,
     uint8_t sense_len, uint32_t timeout)
 {
 
 	scsi_mode_sense_subpage(csio, retries, cbfcnp, tag_action,
 	    dbd, pc, page, /*subpage*/0,
 	    param_buf, param_len, /*minimum_cmd_size*/0,
 	    sense_len, timeout);
 }
 
 /*
  * MODE SENSE without a subpage, with a caller supplied minimum command
  * size.  This is scsi_mode_sense_subpage() with the subpage set to 0.
  */
 void
 scsi_mode_sense_len(struct ccb_scsiio *csio, uint32_t retries,
     void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action,
     int dbd, uint8_t pc, uint8_t page, uint8_t *param_buf, uint32_t param_len,
     int minimum_cmd_size, uint8_t sense_len, uint32_t timeout)
 {
 
 	scsi_mode_sense_subpage(csio, retries, cbfcnp, tag_action,
 	    dbd, pc, page, /*subpage*/0,
 	    param_buf, param_len, minimum_cmd_size,
 	    sense_len, timeout);
 }
 
 /*
  * Set up a SCSI I/O CCB for a MODE SENSE command that reads up to
  * param_len bytes into param_buf.
  *
  * The 6 byte form of the command is used when param_len is below 256 and
  * minimum_cmd_size is below 10; otherwise the 10 byte form is used.  dbd
  * sets the DBD bit, and pc and page are OR'ed together into the page
  * field of the CDB.
  */
 void
 scsi_mode_sense_subpage(struct ccb_scsiio *csio, uint32_t retries,
     void (*cbfcnp)(struct cam_periph *, union ccb *), uint8_t tag_action,
     int dbd, uint8_t pc, uint8_t page, uint8_t subpage, uint8_t *param_buf,
     uint32_t param_len, int minimum_cmd_size, uint8_t sense_len,
     uint32_t timeout)
 {
 	u_int8_t cdb_size;
 
 	if ((param_len >= 256) || (minimum_cmd_size >= 10)) {
 		/*
 		 * Either the length doesn't fit in one byte or the caller
 		 * asked for the larger command: build a 10 byte cdb.
 		 */
 		struct scsi_mode_sense_10 *cdb10;
 
 		cdb10 = (struct scsi_mode_sense_10 *)&csio->cdb_io.cdb_bytes;
 		bzero(cdb10, sizeof(*cdb10));
 		cdb10->opcode = MODE_SENSE_10;
 		if (dbd != 0)
 			cdb10->byte2 |= SMS_DBD;
 		cdb10->page = pc | page;
 		cdb10->subpage = subpage;
 		scsi_ulto2b(param_len, cdb10->length);
 		cdb_size = sizeof(*cdb10);
 	} else {
 		/*
 		 * Use the smallest possible command to perform the
 		 * operation: a 6 byte cdb is enough.
 		 */
 		struct scsi_mode_sense_6 *cdb6;
 
 		cdb6 = (struct scsi_mode_sense_6 *)&csio->cdb_io.cdb_bytes;
 		bzero(cdb6, sizeof(*cdb6));
 		cdb6->opcode = MODE_SENSE_6;
 		if (dbd != 0)
 			cdb6->byte2 |= SMS_DBD;
 		cdb6->page = pc | page;
 		cdb6->subpage = subpage;
 		cdb6->length = param_len;
 		cdb_size = sizeof(*cdb6);
 	}
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN,
 	    tag_action, /*data_ptr*/param_buf, /*dxfer_len*/param_len,
 	    sense_len, cdb_size, timeout);
 }
 
 /*
  * MODE SELECT without a minimum command size.  This is
  * scsi_mode_select_len() with minimum_cmd_size set to 0.
  */
 void
 scsi_mode_select(struct ccb_scsiio *csio, u_int32_t retries,
 		 void (*cbfcnp)(struct cam_periph *, union ccb *),
 		 u_int8_t tag_action, int scsi_page_fmt, int save_pages,
 		 u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len,
 		 u_int32_t timeout)
 {
 
 	scsi_mode_select_len(csio, retries, cbfcnp, tag_action,
 	    scsi_page_fmt, save_pages,
 	    param_buf, param_len, /*minimum_cmd_size*/0,
 	    sense_len, timeout);
 }
 
 /*
  * Set up a SCSI I/O CCB for a MODE SELECT command that sends param_len
  * bytes from param_buf.
  *
  * The 6 byte form of the command is used when param_len is below 256 and
  * minimum_cmd_size is below 10; otherwise the 10 byte form is used.
  * scsi_page_fmt sets the PF bit and save_pages sets the SP bit.
  */
 void
 scsi_mode_select_len(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     u_int8_t tag_action, int scsi_page_fmt, int save_pages,
 		     u_int8_t *param_buf, u_int32_t param_len,
 		     int minimum_cmd_size, u_int8_t sense_len,
 		     u_int32_t timeout)
 {
 	u_int8_t cdb_size;
 
 	if ((param_len >= 256) || (minimum_cmd_size >= 10)) {
 		/*
 		 * Either the length doesn't fit in one byte or the caller
 		 * asked for the larger command: build a 10 byte cdb.
 		 */
 		struct scsi_mode_select_10 *cdb10;
 
 		cdb10 = (struct scsi_mode_select_10 *)&csio->cdb_io.cdb_bytes;
 		bzero(cdb10, sizeof(*cdb10));
 		cdb10->opcode = MODE_SELECT_10;
 		if (scsi_page_fmt != 0)
 			cdb10->byte2 |= SMS_PF;
 		if (save_pages != 0)
 			cdb10->byte2 |= SMS_SP;
 		scsi_ulto2b(param_len, cdb10->length);
 		cdb_size = sizeof(*cdb10);
 	} else {
 		/*
 		 * Use the smallest possible command to perform the
 		 * operation: a 6 byte cdb is enough.
 		 */
 		struct scsi_mode_select_6 *cdb6;
 
 		cdb6 = (struct scsi_mode_select_6 *)&csio->cdb_io.cdb_bytes;
 		bzero(cdb6, sizeof(*cdb6));
 		cdb6->opcode = MODE_SELECT_6;
 		if (scsi_page_fmt != 0)
 			cdb6->byte2 |= SMS_PF;
 		if (save_pages != 0)
 			cdb6->byte2 |= SMS_SP;
 		cdb6->length = param_len;
 		cdb_size = sizeof(*cdb6);
 	}
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT,
 	    tag_action, /*data_ptr*/param_buf, /*dxfer_len*/param_len,
 	    sense_len, cdb_size, timeout);
 }
 
 /*
  * Build a LOG SENSE CDB in csio and fill in the rest of the CCB.
  *
  * The CDB page byte is page_code OR-ed with page.  Non-zero save_pages
  * and ppc set SLS_SP and SLS_PPC in byte2.  paramptr and param_len are
  * stored as two byte fields.  Up to param_len bytes are read into
  * param_buf (CAM_DIR_IN).
  */
 void
 scsi_log_sense(struct ccb_scsiio *csio, u_int32_t retries,
 	       void (*cbfcnp)(struct cam_periph *, union ccb *),
 	       u_int8_t tag_action, u_int8_t page_code, u_int8_t page,
 	       int save_pages, int ppc, u_int32_t paramptr,
 	       u_int8_t *param_buf, u_int32_t param_len, u_int8_t sense_len,
 	       u_int32_t timeout)
 {
 	struct scsi_log_sense *cdb;
 
 	cdb = (struct scsi_log_sense *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 
 	cdb->opcode = LOG_SENSE;
 	cdb->byte2 = (save_pages != 0 ? SLS_SP : 0) |
 		     (ppc != 0 ? SLS_PPC : 0);
 	cdb->page = page_code | page;
 	scsi_ulto2b(paramptr, cdb->paramptr);
 	scsi_ulto2b(param_len, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action,
 		      /*data_ptr*/param_buf, /*dxfer_len*/param_len,
 		      sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * Build a LOG SELECT CDB in csio and fill in the rest of the CCB.
  *
  * page_code is masked with SLS_PAGE_CODE before it is stored.  Non-zero
  * save_pages and pc_reset set SLS_SP and SLS_PCR in byte2.  param_len
  * bytes from param_buf are sent to the device (CAM_DIR_OUT).
  */
 void
 scsi_log_select(struct ccb_scsiio *csio, u_int32_t retries,
 		void (*cbfcnp)(struct cam_periph *, union ccb *),
 		u_int8_t tag_action, u_int8_t page_code, int save_pages,
 		int pc_reset, u_int8_t *param_buf, u_int32_t param_len,
 		u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_log_select *cdb;
 
 	cdb = (struct scsi_log_select *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 
 	cdb->opcode = LOG_SELECT;
 	cdb->byte2 = (save_pages != 0 ? SLS_SP : 0) |
 		     (pc_reset != 0 ? SLS_PCR : 0);
 	cdb->page = page_code & SLS_PAGE_CODE;
 	scsi_ulto2b(param_len, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action,
 		      /*data_ptr*/param_buf, /*dxfer_len*/param_len,
 		      sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * Prevent or allow the user to remove the media.
  *
  * Builds a PREVENT_ALLOW CDB whose "how" byte is the caller's action
  * value.  No data is transferred (CAM_DIR_NONE).
  */
 void
 scsi_prevent(struct ccb_scsiio *csio, u_int32_t retries,
 	     void (*cbfcnp)(struct cam_periph *, union ccb *),
 	     u_int8_t tag_action, u_int8_t action,
 	     u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_prevent *cdb;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_NONE,
 		      tag_action, /*data_ptr*/NULL, /*dxfer_len*/0,
 		      sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 
 	cdb = (struct scsi_prevent *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 	cdb->how = action;
 	cdb->opcode = PREVENT_ALLOW;
 }
 
 /*
  * Build a READ_CAPACITY CDB; only the opcode is set, every other CDB
  * byte is zero.  sizeof(*rcap_buf) bytes are read into rcap_buf
  * (CAM_DIR_IN).
  *
  * XXX allow specification of address and PMI bit and LBA
  */
 void
 scsi_read_capacity(struct ccb_scsiio *csio, u_int32_t retries,
 		   void (*cbfcnp)(struct cam_periph *, union ccb *),
 		   u_int8_t tag_action,
 		   struct scsi_read_capacity_data *rcap_buf,
 		   u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_read_capacity *cdb;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action,
 		      /*data_ptr*/(u_int8_t *)rcap_buf,
 		      /*dxfer_len*/sizeof(*rcap_buf),
 		      sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 
 	cdb = (struct scsi_read_capacity *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 	cdb->opcode = READ_CAPACITY;
 }
 
 /*
  * Build a SERVICE ACTION IN / READ CAPACITY(16) CDB in csio.
  *
  * lba is stored in the eight byte address field and rcap_buf_len in the
  * four byte allocation length field.  A non-zero pmi sets SRC16_PMI and
  * a non-zero reladr sets SRC16_RELADR in the CDB's reladr byte.  Up to
  * rcap_buf_len bytes are read into rcap_buf (CAM_DIR_IN).
  */
 void
 scsi_read_capacity_16(struct ccb_scsiio *csio, uint32_t retries,
 		      void (*cbfcnp)(struct cam_periph *, union ccb *),
 		      uint8_t tag_action, uint64_t lba, int reladr, int pmi,
 		      uint8_t *rcap_buf, int rcap_buf_len, uint8_t sense_len,
 		      uint32_t timeout)
 {
 	struct scsi_read_capacity_16 *scsi_cmd;
 
 	cam_fill_csio(csio,
 		      retries,
 		      cbfcnp,
 		      /*flags*/CAM_DIR_IN,
 		      tag_action,
 		      /*data_ptr*/(u_int8_t *)rcap_buf,
 		      /*dxfer_len*/rcap_buf_len,
 		      sense_len,
 		      sizeof(*scsi_cmd),
 		      timeout);
 	scsi_cmd = (struct scsi_read_capacity_16 *)&csio->cdb_io.cdb_bytes;
 	bzero(scsi_cmd, sizeof(*scsi_cmd));
 	scsi_cmd->opcode = SERVICE_ACTION_IN;
 	scsi_cmd->service_action = SRC16_SERVICE_ACTION;
 	scsi_u64to8b(lba, scsi_cmd->addr);
 	scsi_ulto4b(rcap_buf_len, scsi_cmd->alloc_len);
 	/*
 	 * The flag bits belong in the CDB.  The previous code OR-ed them
 	 * into the by-value "reladr" argument, so they never reached the
 	 * device.
 	 */
 	if (pmi)
 		scsi_cmd->reladr |= SRC16_PMI;
 	if (reladr)
 		scsi_cmd->reladr |= SRC16_RELADR;
 }
 
 /*
  * Build a REPORT_LUNS CDB in csio.  select_report is copied into the
  * CDB and alloc_len is stored as a four byte length.  Up to alloc_len
  * bytes are read into rpl_buf (CAM_DIR_IN).
  */
 void
 scsi_report_luns(struct ccb_scsiio *csio, u_int32_t retries,
 		 void (*cbfcnp)(struct cam_periph *, union ccb *),
 		 u_int8_t tag_action, u_int8_t select_report,
 		 struct scsi_report_luns_data *rpl_buf, u_int32_t alloc_len,
 		 u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_report_luns *cdb;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action,
 		      /*data_ptr*/(u_int8_t *)rpl_buf, /*dxfer_len*/alloc_len,
 		      sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 
 	cdb = (struct scsi_report_luns *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 	cdb->opcode = REPORT_LUNS;
 	scsi_ulto4b(alloc_len, cdb->length);
 	cdb->select_report = select_report;
 }
 
 /*
  * Build a MAINTENANCE_IN CDB whose service action byte is
  * REPORT_TARGET_PORT_GROUPS OR-ed with pdf.  alloc_len is stored as a
  * four byte length and up to alloc_len bytes are read into buf
  * (CAM_DIR_IN).
  */
 void
 scsi_report_target_group(struct ccb_scsiio *csio, u_int32_t retries,
 		 void (*cbfcnp)(struct cam_periph *, union ccb *),
 		 u_int8_t tag_action, u_int8_t pdf,
 		 void *buf, u_int32_t alloc_len,
 		 u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_target_group *cdb;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action,
 		      /*data_ptr*/(u_int8_t *)buf, /*dxfer_len*/alloc_len,
 		      sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 
 	cdb = (struct scsi_target_group *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 	cdb->opcode = MAINTENANCE_IN;
 	scsi_ulto4b(alloc_len, cdb->length);
 	cdb->service_action = REPORT_TARGET_PORT_GROUPS | pdf;
 }
 
 /*
  * Build a MAINTENANCE_IN CDB whose service action byte is
  * REPORT_TIMESTAMP OR-ed with pdf.  alloc_len is stored as a four byte
  * length and up to alloc_len bytes are read into buf (CAM_DIR_IN).
  */
 void
 scsi_report_timestamp(struct ccb_scsiio *csio, u_int32_t retries,
 		 void (*cbfcnp)(struct cam_periph *, union ccb *),
 		 u_int8_t tag_action, u_int8_t pdf,
 		 void *buf, u_int32_t alloc_len,
 		 u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_timestamp *cdb;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action,
 		      /*data_ptr*/(u_int8_t *)buf, /*dxfer_len*/alloc_len,
 		      sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 
 	cdb = (struct scsi_timestamp *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 	cdb->opcode = MAINTENANCE_IN;
 	scsi_ulto4b(alloc_len, cdb->length);
 	cdb->service_action = REPORT_TIMESTAMP | pdf;
 }
 
 /*
  * Build a MAINTENANCE_OUT CDB with the SET_TARGET_PORT_GROUPS service
  * action.  alloc_len is stored as a four byte length and alloc_len
  * bytes from buf are sent to the device (CAM_DIR_OUT).
  */
 void
 scsi_set_target_group(struct ccb_scsiio *csio, u_int32_t retries,
 		 void (*cbfcnp)(struct cam_periph *, union ccb *),
 		 u_int8_t tag_action, void *buf, u_int32_t alloc_len,
 		 u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_target_group *cdb;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action,
 		      /*data_ptr*/(u_int8_t *)buf, /*dxfer_len*/alloc_len,
 		      sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 
 	cdb = (struct scsi_target_group *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 	cdb->opcode = MAINTENANCE_OUT;
 	scsi_ulto4b(alloc_len, cdb->length);
 	cdb->service_action = SET_TARGET_PORT_GROUPS;
 }
 
 /*
  * Store a timestamp into the six byte buffer timestamp_6b_buf.
  *
  * The value is first expanded into eight bytes with scsi_u64to8b();
  * the first two of those bytes are then dropped and the remaining six
  * are copied out.  This detour exists because the set timestamp
  * parameters only have room for six bytes and there is no
  * scsi_u64to6b function.
  */
 void
 scsi_create_timestamp(uint8_t *timestamp_6b_buf,
 		      uint64_t timestamp)
 {
 	uint8_t wide[8];
 	const uint8_t *tail;
 
 	scsi_u64to8b(timestamp, wide);
 	/* Skip the two leading bytes; only six fit in the destination. */
 	tail = wide + 2;
 	memcpy(timestamp_6b_buf, tail, 6);
 }
 
 /*
  * Build a MAINTENANCE_OUT CDB with the SET_TIMESTAMP service action.
  * alloc_len is stored as a four byte length and alloc_len bytes from
  * buf are sent to the device (CAM_DIR_OUT).
  */
 void
 scsi_set_timestamp(struct ccb_scsiio *csio, u_int32_t retries,
 		   void (*cbfcnp)(struct cam_periph *, union ccb *),
 		   u_int8_t tag_action, void *buf, u_int32_t alloc_len,
 		   u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_timestamp *cdb;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action,
 		      /*data_ptr*/(u_int8_t *)buf, /*dxfer_len*/alloc_len,
 		      sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 
 	cdb = (struct scsi_timestamp *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 	cdb->opcode = MAINTENANCE_OUT;
 	scsi_ulto4b(alloc_len, cdb->length);
 	cdb->service_action = SET_TIMESTAMP;
 }
 
 /*
  * Synchronize the media to the contents of the cache for
  * the given lba/count pair.  Specifying 0/0 means sync
  * the whole cache.
  *
  * begin_lba is stored as a four byte field and lb_count as a two byte
  * field.  No data is transferred (CAM_DIR_NONE).
  */
 void
 scsi_synchronize_cache(struct ccb_scsiio *csio, u_int32_t retries,
 		       void (*cbfcnp)(struct cam_periph *, union ccb *),
 		       u_int8_t tag_action, u_int32_t begin_lba,
 		       u_int16_t lb_count, u_int8_t sense_len,
 		       u_int32_t timeout)
 {
 	struct scsi_sync_cache *cdb;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_NONE,
 		      tag_action, /*data_ptr*/NULL, /*dxfer_len*/0,
 		      sense_len, /*cdb_len*/sizeof(*cdb), timeout);
 
 	cdb = (struct scsi_sync_cache *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 	cdb->opcode = SYNCHRONIZE_CACHE;
 	scsi_ulto2b(lb_count, cdb->lb_count);
 	scsi_ulto4b(begin_lba, cdb->begin_lba);
 }
 
 /*
  * Build a READ or WRITE CDB in csio, picking the smallest CDB that can
  * hold the request and is at least minimum_cmd_size bytes long.
  *
  * readop selects the direction through SCSI_RW_DIRMASK; when
  * SCSI_RW_BIO is also set, CAM_DATA_BIO is added to the CCB flags.
  * byte2 is copied into the 10, 12 and 16 byte CDBs; the 6 byte CDB has
  * no room for it, so any non-zero byte2 forces a larger command.
  */
 void
 scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries,
 		void (*cbfcnp)(struct cam_periph *, union ccb *),
 		u_int8_t tag_action, int readop, u_int8_t byte2,
 		int minimum_cmd_size, u_int64_t lba, u_int32_t block_count,
 		u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
 		u_int32_t timeout)
 {
 	u_int32_t ccb_flags;
 	u_int8_t cdb_len;
 	int is_read;
 
 	is_read = ((readop & SCSI_RW_DIRMASK) == SCSI_RW_READ);
 
 	/*
 	 * Some legacy hardware does not support the 10 byte commands, so
 	 * the shortest CDB that fits is always preferred.
 	 */
 	if ((minimum_cmd_size < 10) && (byte2 == 0) &&
 	    (lba <= 0x1fffff) && (block_count <= 0xff)) {
 		/*
 		 * 6 byte CDB: 21 bit LBA, one byte length, no byte2.
 		 */
 		struct scsi_rw_6 *cdb6;
 
 		cdb6 = (struct scsi_rw_6 *)&csio->cdb_io.cdb_bytes;
 		if (is_read)
 			cdb6->opcode = READ_6;
 		else
 			cdb6->opcode = WRITE_6;
 		scsi_ulto3b(lba, cdb6->addr);
 		cdb6->length = block_count & 0xff;
 		cdb6->control = 0;
 		cdb_len = sizeof(*cdb6);
 
 		CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE,
 			  ("6byte: %x%x%x:%d:%d\n", cdb6->addr[0],
 			   cdb6->addr[1], cdb6->addr[2],
 			   cdb6->length, dxfer_len));
 	} else if ((minimum_cmd_size < 12) &&
 		   (block_count <= 0xffff) &&
 		   (lba <= 0xffffffff)) {
 		/*
 		 * 10 byte CDB: 32 bit LBA, two byte length.
 		 */
 		struct scsi_rw_10 *cdb10;
 
 		cdb10 = (struct scsi_rw_10 *)&csio->cdb_io.cdb_bytes;
 		if (is_read)
 			cdb10->opcode = READ_10;
 		else
 			cdb10->opcode = WRITE_10;
 		cdb10->byte2 = byte2;
 		scsi_ulto4b(lba, cdb10->addr);
 		cdb10->reserved = 0;
 		scsi_ulto2b(block_count, cdb10->length);
 		cdb10->control = 0;
 		cdb_len = sizeof(*cdb10);
 
 		CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE,
 			  ("10byte: %x%x%x%x:%x%x: %d\n", cdb10->addr[0],
 			   cdb10->addr[1], cdb10->addr[2],
 			   cdb10->addr[3], cdb10->length[0],
 			   cdb10->length[1], dxfer_len));
 	} else if ((minimum_cmd_size < 16) &&
 		   ((block_count & 0xffffffff) == block_count) &&
 		   (lba <= 0xffffffff)) {
 		/*
 		 * 12 byte CDB: the block count no longer fits in the two
 		 * byte length of the 10 byte form.
 		 */
 		struct scsi_rw_12 *cdb12;
 
 		cdb12 = (struct scsi_rw_12 *)&csio->cdb_io.cdb_bytes;
 		if (is_read)
 			cdb12->opcode = READ_12;
 		else
 			cdb12->opcode = WRITE_12;
 		cdb12->byte2 = byte2;
 		scsi_ulto4b(lba, cdb12->addr);
 		cdb12->reserved = 0;
 		scsi_ulto4b(block_count, cdb12->length);
 		cdb12->control = 0;
 		cdb_len = sizeof(*cdb12);
 
 		CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE,
 			  ("12byte: %x%x%x%x:%x%x%x%x: %d\n", cdb12->addr[0],
 			   cdb12->addr[1], cdb12->addr[2],
 			   cdb12->addr[3], cdb12->length[0],
 			   cdb12->length[1], cdb12->length[2],
 			   cdb12->length[3], dxfer_len));
 	} else {
 		/*
 		 * 16 byte CDB: reached when the LBA does not fit in 32
 		 * bits or when the caller asked for a 16 byte command.
 		 */
 		struct scsi_rw_16 *cdb16;
 
 		cdb16 = (struct scsi_rw_16 *)&csio->cdb_io.cdb_bytes;
 		if (is_read)
 			cdb16->opcode = READ_16;
 		else
 			cdb16->opcode = WRITE_16;
 		cdb16->byte2 = byte2;
 		scsi_u64to8b(lba, cdb16->addr);
 		cdb16->reserved = 0;
 		scsi_ulto4b(block_count, cdb16->length);
 		cdb16->control = 0;
 		cdb_len = sizeof(*cdb16);
 	}
 
 	/* Transfer direction, plus the bio marker when requested. */
 	ccb_flags = is_read ? CAM_DIR_IN : CAM_DIR_OUT;
 	if ((readop & SCSI_RW_BIO) != 0)
 		ccb_flags |= CAM_DATA_BIO;
 
 	cam_fill_csio(csio,
 		      retries,
 		      cbfcnp,
 		      /*flags*/ccb_flags,
 		      tag_action,
 		      data_ptr,
 		      dxfer_len,
 		      sense_len,
 		      cdb_len,
 		      timeout);
 }
 
 /*
  * Build a WRITE SAME CDB in csio.
  *
  * The 10 byte form is used when minimum_cmd_size is below 16, the block
  * count fits in 16 bits and the LBA fits in 32 bits; otherwise the 16
  * byte form is built.  byte2 is copied into the CDB unchanged and
  * dxfer_len bytes from data_ptr are sent to the device (CAM_DIR_OUT).
  */
 void
 scsi_write_same(struct ccb_scsiio *csio, u_int32_t retries,
 		void (*cbfcnp)(struct cam_periph *, union ccb *),
 		u_int8_t tag_action, u_int8_t byte2,
 		int minimum_cmd_size, u_int64_t lba, u_int32_t block_count,
 		u_int8_t *data_ptr, u_int32_t dxfer_len, u_int8_t sense_len,
 		u_int32_t timeout)
 {
 	u_int8_t cdb_len;
 
 	if ((minimum_cmd_size >= 16) ||
 	    (block_count > 0xffff) ||
 	    (lba > 0xffffffff)) {
 		/*
 		 * 16 byte CDB: reached when the LBA or block count is
 		 * too large for the short form, or when the caller asked
 		 * for a 16 byte command.
 		 */
 		struct scsi_write_same_16 *cdb16;
 
 		cdb16 = (struct scsi_write_same_16 *)&csio->cdb_io.cdb_bytes;
 		cdb16->opcode = WRITE_SAME_16;
 		cdb16->byte2 = byte2;
 		scsi_u64to8b(lba, cdb16->addr);
 		scsi_ulto4b(block_count, cdb16->length);
 		cdb16->group = 0;
 		cdb16->control = 0;
 		cdb_len = sizeof(*cdb16);
 
 		CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE,
 			  ("16byte: %x%x%x%x%x%x%x%x:%x%x%x%x: %d\n",
 			   cdb16->addr[0], cdb16->addr[1],
 			   cdb16->addr[2], cdb16->addr[3],
 			   cdb16->addr[4], cdb16->addr[5],
 			   cdb16->addr[6], cdb16->addr[7],
 			   cdb16->length[0], cdb16->length[1],
 			   cdb16->length[2], cdb16->length[3],
 			   dxfer_len));
 	} else {
 		/*
 		 * 10 byte CDB.
 		 */
 		struct scsi_write_same_10 *cdb10;
 
 		cdb10 = (struct scsi_write_same_10 *)&csio->cdb_io.cdb_bytes;
 		cdb10->opcode = WRITE_SAME_10;
 		cdb10->byte2 = byte2;
 		scsi_ulto4b(lba, cdb10->addr);
 		cdb10->group = 0;
 		scsi_ulto2b(block_count, cdb10->length);
 		cdb10->control = 0;
 		cdb_len = sizeof(*cdb10);
 
 		CAM_DEBUG(csio->ccb_h.path, CAM_DEBUG_SUBTRACE,
 			  ("10byte: %x%x%x%x:%x%x: %d\n", cdb10->addr[0],
 			   cdb10->addr[1], cdb10->addr[2],
 			   cdb10->addr[3], cdb10->length[0],
 			   cdb10->length[1], dxfer_len));
 	}
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action,
 		      data_ptr, dxfer_len, sense_len, cdb_len, timeout);
 }
 
 /*
  * Build an ATA pass-through request carrying ATA_ATA_IDENTIFY via
  * scsi_ata_pass().  The transfer is PIO data-in and dxfer_len is used
  * both as the data length and as the sector_count argument.  No extra
  * CDB storage is supplied, and the value returned by scsi_ata_pass()
  * is not examined.
  */
 void
 scsi_ata_identify(struct ccb_scsiio *csio, u_int32_t retries,
 		  void (*cbfcnp)(struct cam_periph *, union ccb *),
 		  u_int8_t tag_action, u_int8_t *data_ptr,
 		  u_int16_t dxfer_len, u_int8_t sense_len,
 		  u_int32_t timeout)
 {
 	scsi_ata_pass(csio, retries, cbfcnp,
 		      /*flags*/CAM_DIR_IN,
 		      tag_action,
 		      /*protocol*/AP_PROTO_PIO_IN,
 		      /*ata_flags*/AP_FLAG_TDIR_FROM_DEV |
 				   AP_FLAG_BYT_BLOK_BYTES |
 				   AP_FLAG_TLEN_SECT_CNT,
 		      /*features*/0,
 		      /*sector_count*/dxfer_len,
 		      /*lba*/0,
 		      /*command*/ATA_ATA_IDENTIFY,
 		      /*device*/0, /*icc*/0, /*auxiliary*/0, /*control*/0,
 		      data_ptr, dxfer_len,
 		      /*cdb_storage*/NULL, /*cdb_storage_len*/0,
 		      /*minimum_cmd_size*/0,
 		      sense_len, timeout);
 }
 
 /*
  * Build a 16 byte ATA pass-through request carrying
  * ATA_DATA_SET_MANAGEMENT with the ATA_DSM_TRIM feature via
  * scsi_ata_pass_16().  block_count is passed as the sector count, the
  * LBA is 0, and dxfer_len bytes from data_ptr are sent to the device
  * (CAM_DIR_OUT).
  */
 void
 scsi_ata_trim(struct ccb_scsiio *csio, u_int32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      u_int8_t tag_action, u_int16_t block_count,
 	      u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len,
 	      u_int32_t timeout)
 {
 	scsi_ata_pass_16(csio, retries, cbfcnp,
 			 /*flags*/CAM_DIR_OUT,
 			 tag_action,
 			 /*protocol*/AP_EXTEND | AP_PROTO_DMA,
 			 /*ata_flags*/AP_FLAG_TLEN_SECT_CNT |
 				      AP_FLAG_BYT_BLOK_BLOCKS,
 			 /*features*/ATA_DSM_TRIM,
 			 /*sector_count*/block_count,
 			 /*lba*/0,
 			 /*command*/ATA_DATA_SET_MANAGEMENT,
 			 /*control*/0,
 			 data_ptr, dxfer_len,
 			 sense_len, timeout);
 }
 
 /*
  * Build an ATA pass-through request that reads an ATA log, using
  * scsi_ata_pass().
  *
  * protocol selects the command: AP_PROTO_DMA uses ATA_READ_LOG_DMA_EXT,
  * anything else falls back to ATA_READ_LOG_EXT with AP_PROTO_PIO_IN.
  * AP_EXTEND is always added.  block_count is passed as the sector
  * count and up to dxfer_len bytes are read into data_ptr (CAM_DIR_IN).
  *
  * Only the low 8 bits of log_address and the low 16 bits of
  * page_number are used.
  *
  * Returns whatever scsi_ata_pass() returns.
  */
 int
 scsi_ata_read_log(struct ccb_scsiio *csio, uint32_t retries,
 		  void (*cbfcnp)(struct cam_periph *, union ccb *),
 		  uint8_t tag_action, uint32_t log_address,
 		  uint32_t page_number, uint16_t block_count,
 		  uint8_t protocol, uint8_t *data_ptr, uint32_t dxfer_len,
 		  uint8_t sense_len, uint32_t timeout)
 {
 	uint8_t command, protocol_out;
 	uint64_t lba;
 
 	switch (protocol) {
 	case AP_PROTO_DMA:
 		command = ATA_READ_LOG_DMA_EXT;
 		protocol_out = AP_PROTO_DMA;
 		break;
 	case AP_PROTO_PIO_IN:
 	default:
 		command = ATA_READ_LOG_EXT;
 		protocol_out = AP_PROTO_PIO_IN;
 		break;
 	}
 	protocol_out |= AP_EXTEND;
 
 	/*
 	 * READ LOG (DMA) EXT encodes its arguments in the 48-bit LBA:
 	 *   LBA  7:0   log address
 	 *   LBA 15:8   page number bits 7:0
 	 *   LBA 39:32  page number bits 15:8
 	 * The high page byte already sits at bits 15:8 of page_number, so
 	 * it has to move up by 24 bits to land at 39:32.  The previous
 	 * shift of 32 placed it in LBA 47:40.
 	 */
 	lba = (((uint64_t)page_number & 0xff00) << 24) |
 	      ((page_number & 0x00ff) << 8) |
 	      (log_address & 0xff);
 
 	return (scsi_ata_pass(csio,
 			      retries,
 			      cbfcnp,
 			      /*flags*/CAM_DIR_IN,
 			      tag_action,
 			      /*protocol*/ protocol_out,
 			      /*ata_flags*/AP_FLAG_TLEN_SECT_CNT |
 					   AP_FLAG_BYT_BLOK_BLOCKS |
 					   AP_FLAG_TDIR_FROM_DEV,
 			      /*feature*/ 0,
 			      /*sector_count*/ block_count,
 			      /*lba*/ lba,
 			      /*command*/ command,
 			      /*device*/ 0,
 			      /*icc*/ 0,
 			      /*auxiliary*/ 0,
 			      /*control*/0,
 			      data_ptr,
 			      dxfer_len,
 			      /*cdb_storage*/ NULL,
 			      /*cdb_storage_len*/ 0,
 			      /*minimum_cmd_size*/ 0,
 			      sense_len,
 			      timeout));
 }
 
 int scsi_ata_setfeatures(struct ccb_scsiio *csio, uint32_t retries,
 			 void (*cbfcnp)(struct cam_periph *, union ccb *),
 			 uint8_t tag_action, uint8_t feature,
 			 uint64_t lba, uint32_t count,
 			 uint8_t sense_len, uint32_t timeout)
 {
 	return (scsi_ata_pass(csio,
 		retries,
 		cbfcnp,
 		/*flags*/CAM_DIR_NONE,
 		tag_action,
 		/*protocol*/AP_PROTO_PIO_IN,
 		/*ata_flags*/AP_FLAG_TDIR_FROM_DEV |
 			     AP_FLAG_BYT_BLOK_BYTES |
 			     AP_FLAG_TLEN_SECT_CNT,
 		/*features*/feature,
 		/*sector_count*/count,
 		/*lba*/lba,
 		/*command*/ATA_SETFEATURES,
 		/*device*/ 0,
 		/*icc*/ 0,
 		/*auxiliary*/0,
 		/*control*/0,
 		/*data_ptr*/NULL,
 		/*dxfer_len*/0,
 		/*cdb_storage*/NULL,
 		/*cdb_storage_len*/0,
 		/*minimum_cmd_size*/0,
 		sense_len,
 		timeout));
 }
 
 /*
  * Note! This is an unusual CDB building function because it can return
  * an error in the event that the command in question requires a variable
  * length CDB, but the caller has not given storage space for one or has not
  * given enough storage space.  If there is enough space available in the
  * standard SCSI CCB CDB bytes, we'll prefer that over passed in storage.
  *
  * Builds a 12, 16 or 32 byte ATA PASS-THROUGH CDB.  The size starts from
  * minimum_cmd_size and is raised when the arguments need it: to 16 bytes
  * when lba, sector_count, features or AP_EXTEND call for a 48-bit command,
  * and to 32 bytes when icc or auxiliary is non-zero.
  *
  * Returns 0 on success.  Returns 1, without touching csio, when the chosen
  * CDB does not fit in csio->cdb_io.cdb_bytes and cdb_storage is NULL or
  * shorter than the CDB.  When cdb_storage is used, CAM_CDB_POINTER is added
  * to the CCB flags.
  */
 int
 scsi_ata_pass(struct ccb_scsiio *csio, uint32_t retries,
 	      void (*cbfcnp)(struct cam_periph *, union ccb *),
 	      uint32_t flags, uint8_t tag_action,
 	      uint8_t protocol, uint8_t ata_flags, uint16_t features,
 	      uint16_t sector_count, uint64_t lba, uint8_t command,
 	      uint8_t device, uint8_t icc, uint32_t auxiliary,
 	      uint8_t control, u_int8_t *data_ptr, uint32_t dxfer_len,
 	      uint8_t *cdb_storage, size_t cdb_storage_len,
 	      int minimum_cmd_size, u_int8_t sense_len, u_int32_t timeout)
 {
 	uint32_t cam_flags;
 	uint8_t *cdb_ptr;
 	int cmd_size;
 	int retval;
 	/*
 	 * NOTE(review): cdb_len is assigned in every branch below but is
 	 * never read; cam_fill_csio() is given cmd_size instead.
 	 */
 	uint8_t cdb_len;
 
 	retval = 0;
 	cam_flags = flags;
 
 	/*
 	 * Round the user's request to the nearest command size that is at
 	 * least as big as what he requested.
 	 */
 	if (minimum_cmd_size <= 12)
 		cmd_size = 12;
 	else if (minimum_cmd_size > 16)
 		cmd_size = 32;
 	else
 		cmd_size = 16;
 
 	/*
 	 * If we have parameters that require a 48-bit ATA command, we have to
 	 * use the 16 byte ATA PASS-THROUGH command at least.
 	 *
 	 * The second half of the condition skips the adjustment when it
 	 * would change nothing, i.e. the size is already 16 or more and
 	 * AP_EXTEND is already set.
 	 */
 	if (((lba > ATA_MAX_28BIT_LBA) 
 	  || (sector_count > 255)
 	  || (features > 255)
 	  || (protocol & AP_EXTEND))
 	 && ((cmd_size < 16)
 	  || ((protocol & AP_EXTEND) == 0))) {
 		if (cmd_size < 16)
 			cmd_size = 16;
 		protocol |= AP_EXTEND;
 	}
 
 	/*
 	 * The icc and auxiliary ATA registers are only supported in the
 	 * 32-byte version of the ATA PASS-THROUGH command.
 	 */
 	if ((icc != 0)
 	 || (auxiliary != 0)) {
 		cmd_size = 32;
 		protocol |= AP_EXTEND;
 	}
 
 
 	/*
 	 * Fail before modifying anything if the CDB fits neither in the CCB
 	 * nor in the caller's storage.
 	 */
 	if ((cmd_size > sizeof(csio->cdb_io.cdb_bytes))
 	 && ((cdb_storage == NULL)
 	  || (cdb_storage_len < cmd_size))) {
 		retval = 1;
 		goto bailout;
 	}
 
 	/*
 	 * At this point we know we have enough space to store the command
 	 * in one place or another.  We prefer the built-in array, but used
 	 * the passed in storage if necessary.
 	 */
 	if (cmd_size <= sizeof(csio->cdb_io.cdb_bytes))
 		cdb_ptr = csio->cdb_io.cdb_bytes;
 	else {
 		cdb_ptr = cdb_storage;
 		cam_flags |= CAM_CDB_POINTER;
 	}
 
 	if (cmd_size <= 12) {
 		/* 12 byte CDB: 28-bit command only. */
 		struct ata_pass_12 *cdb;
 
 		cdb = (struct ata_pass_12 *)cdb_ptr;
 		cdb_len = sizeof(*cdb);
 		bzero(cdb, cdb_len);
 
 		cdb->opcode = ATA_PASS_12;
 		cdb->protocol = protocol;
 		cdb->flags = ata_flags;
 		cdb->features = features;
 		cdb->sector_count = sector_count;
 		cdb->lba_low = lba & 0xff;
 		cdb->lba_mid = (lba >> 8) & 0xff;
 		cdb->lba_high = (lba >> 16) & 0xff;
 		/*
 		 * LBA bits 27:24 go in the low nibble of the device byte.
 		 * NOTE(review): the caller's "device" argument is not used
 		 * in this branch.
 		 */
 		cdb->device = ((lba >> 24) & 0xf) | ATA_DEV_LBA;
 		cdb->command = command;
 		cdb->control = control;
 	} else if (cmd_size <= 16) {
 		struct ata_pass_16 *cdb;
 
 		cdb = (struct ata_pass_16 *)cdb_ptr;
 		cdb_len = sizeof(*cdb);
 		bzero(cdb, cdb_len);
 
 		cdb->opcode = ATA_PASS_16;
 		cdb->protocol = protocol;
 		cdb->flags = ata_flags;
 		cdb->features = features & 0xff;
 		cdb->sector_count = sector_count & 0xff;
 		cdb->lba_low = lba & 0xff;
 		cdb->lba_mid = (lba >> 8) & 0xff;
 		cdb->lba_high = (lba >> 16) & 0xff;
 		/*
 		 * If AP_EXTEND is set, we're sending a 48-bit command.
 		 * Otherwise it's a 28-bit command.
 		 */
 		if (protocol & AP_EXTEND) {
 			cdb->lba_low_ext = (lba >> 24) & 0xff;
 			cdb->lba_mid_ext = (lba >> 32) & 0xff;
 			cdb->lba_high_ext = (lba >> 40) & 0xff;
 			cdb->features_ext = (features >> 8) & 0xff;
 			cdb->sector_count_ext = (sector_count >> 8) & 0xff;
 			cdb->device = device | ATA_DEV_LBA;
 		} else {
 			/*
 			 * NOTE(review): LBA bits 27:24 are written to both
 			 * lba_low_ext and the device byte, and the "device"
 			 * argument is not used here.
 			 */
 			cdb->lba_low_ext = (lba >> 24) & 0xf;
 			cdb->device = ((lba >> 24) & 0xf) | ATA_DEV_LBA;
 		}
 		cdb->command = command;
 		cdb->control = control;
 	} else {
 		struct ata_pass_32 *cdb;
 		uint8_t tmp_lba[8];
 
 		cdb = (struct ata_pass_32 *)cdb_ptr;
 		cdb_len = sizeof(*cdb);
 		bzero(cdb, cdb_len);
 		cdb->opcode = VARIABLE_LEN_CDB;
 		cdb->control = control;
 		/* Number of CDB bytes from service_action to the end. */
 		cdb->length = sizeof(*cdb) - __offsetof(struct ata_pass_32,
 							service_action);
 		scsi_ulto2b(ATA_PASS_32_SA, cdb->service_action);
 		cdb->protocol = protocol;
 		cdb->flags = ata_flags;
 
 		/*
 		 * For a 28-bit command, trim lba, features and sector_count
 		 * to their 28-bit/8-bit ranges.  This must happen before the
 		 * values are stored below.
 		 */
 		if ((protocol & AP_EXTEND) == 0) {
 			lba &= 0x0fffffff;
 			cdb->device = ((lba >> 24) & 0xf) | ATA_DEV_LBA;
 			features &= 0xff;
 			sector_count &= 0xff;
 		} else {
 			cdb->device = device | ATA_DEV_LBA;
 		}
 		/*
 		 * Expand lba to eight bytes, then skip the first two so that
 		 * only sizeof(cdb->lba) bytes are copied into the CDB.
 		 */
 		scsi_u64to8b(lba, tmp_lba);
 		bcopy(&tmp_lba[2], cdb->lba, sizeof(cdb->lba));
 		scsi_ulto2b(features, cdb->features);
 		scsi_ulto2b(sector_count, cdb->count);
 		cdb->command = command;
 		cdb->icc = icc;
 		scsi_ulto4b(auxiliary, cdb->auxiliary);
 	}
 
 	cam_fill_csio(csio,
 		      retries,
 		      cbfcnp,
 		      cam_flags,
 		      tag_action,
 		      data_ptr,
 		      dxfer_len,
 		      sense_len,
 		      cmd_size,
 		      timeout);
 bailout:
 	return (retval);
 }
 
 /*
  * Build a 16 byte ATA PASS-THROUGH CDB directly in csio's built-in CDB
  * bytes and fill in the rest of the CCB.
  *
  * features and sector_count are split into their low byte and "ext"
  * high byte.  When protocol has AP_EXTEND set, LBA bits 47:24 go into
  * the lba_*_ext bytes; otherwise LBA bits 27:24 go into the low nibble
  * of the device byte and the lba_*_ext bytes stay zero.  ATA_DEV_LBA
  * is always set in the device byte.
  */
 void
 scsi_ata_pass_16(struct ccb_scsiio *csio, u_int32_t retries,
 		 void (*cbfcnp)(struct cam_periph *, union ccb *),
 		 u_int32_t flags, u_int8_t tag_action,
 		 u_int8_t protocol, u_int8_t ata_flags, u_int16_t features,
 		 u_int16_t sector_count, uint64_t lba, u_int8_t command,
 		 u_int8_t control, u_int8_t *data_ptr, u_int16_t dxfer_len,
 		 u_int8_t sense_len, u_int32_t timeout)
 {
 	struct ata_pass_16 *ata_cmd;
 
 	ata_cmd = (struct ata_pass_16 *)&csio->cdb_io.cdb_bytes;
 	/*
 	 * Start from a clean CDB.  Without this the lba_*_ext bytes keep
 	 * whatever the CCB held before when AP_EXTEND is not set, since
 	 * that path never writes them.
 	 */
 	bzero(ata_cmd, sizeof(*ata_cmd));
 	ata_cmd->opcode = ATA_PASS_16;
 	ata_cmd->protocol = protocol;
 	ata_cmd->flags = ata_flags;
 	ata_cmd->features_ext = features >> 8;
 	ata_cmd->features = features;
 	ata_cmd->sector_count_ext = sector_count >> 8;
 	ata_cmd->sector_count = sector_count;
 	ata_cmd->lba_low = lba;
 	ata_cmd->lba_mid = lba >> 8;
 	ata_cmd->lba_high = lba >> 16;
 	ata_cmd->device = ATA_DEV_LBA;
 	if (protocol & AP_EXTEND) {
 		ata_cmd->lba_low_ext = lba >> 24;
 		ata_cmd->lba_mid_ext = lba >> 32;
 		ata_cmd->lba_high_ext = lba >> 40;
 	} else
 		ata_cmd->device |= (lba >> 24) & 0x0f;
 	ata_cmd->command = command;
 	ata_cmd->control = control;
 
 	cam_fill_csio(csio,
 		      retries,
 		      cbfcnp,
 		      flags,
 		      tag_action,
 		      data_ptr,
 		      dxfer_len,
 		      sense_len,
 		      sizeof(*ata_cmd),
 		      timeout);
 }
 
 /*
  * Build an UNMAP CDB in csio.  byte2 is copied into the CDB, the
  * reserved, group and control fields are written as zero, and
  * dxfer_len is stored as the two byte length.  dxfer_len bytes from
  * data_ptr are sent to the device (CAM_DIR_OUT).
  */
 void
 scsi_unmap(struct ccb_scsiio *csio, u_int32_t retries,
 	   void (*cbfcnp)(struct cam_periph *, union ccb *),
 	   u_int8_t tag_action, u_int8_t byte2,
 	   u_int8_t *data_ptr, u_int16_t dxfer_len, u_int8_t sense_len,
 	   u_int32_t timeout)
 {
 	struct scsi_unmap *cdb;
 
 	cdb = (struct scsi_unmap *)&csio->cdb_io.cdb_bytes;
 
 	/* Fields that carry caller data. */
 	cdb->opcode = UNMAP;
 	cdb->byte2 = byte2;
 	scsi_ulto2b(dxfer_len, cdb->length);
 
 	/* Everything else is explicitly zeroed. */
 	scsi_ulto4b(0, cdb->reserved);
 	cdb->group = 0;
 	cdb->control = 0;
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT, tag_action,
 		      data_ptr, dxfer_len, sense_len,
 		      /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * Build a RECEIVE_DIAGNOSTIC CDB in csio.  When pcv is non-zero,
  * SRD_PCV is set in byte2 and page_code is stored; otherwise both stay
  * zero.  allocation_length is stored as the two byte length and up to
  * that many bytes are read into data_ptr (CAM_DIR_IN).
  */
 void
 scsi_receive_diagnostic_results(struct ccb_scsiio *csio, u_int32_t retries,
 				void (*cbfcnp)(struct cam_periph *, union ccb*),
 				uint8_t tag_action, int pcv, uint8_t page_code,
 				uint8_t *data_ptr, uint16_t allocation_length,
 				uint8_t sense_len, uint32_t timeout)
 {
 	struct scsi_receive_diag *cdb;
 
 	cdb = (struct scsi_receive_diag *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 
 	cdb->opcode = RECEIVE_DIAGNOSTIC;
 	scsi_ulto2b(allocation_length, cdb->length);
 	if (pcv) {
 		/* The page code is only stored together with PCV. */
 		cdb->page_code = page_code;
 		cdb->byte2 |= SRD_PCV;
 	}
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action,
 		      data_ptr, /*dxfer_len*/allocation_length, sense_len,
 		      /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * Build a SEND_DIAGNOSTIC CDB in csio.
  *
  * byte2 combines the shifted and masked self-test code with the
  * SSD_UNITOFFL, SSD_DEVOFFL, SSD_SELFTEST and SSD_PF bits, each set
  * when its argument is non-zero.  When self_test is non-zero the
  * self-test code is forced to SSD_SELF_TEST_CODE_NONE.  The transfer
  * direction is CAM_DIR_OUT when param_list_length is non-zero and
  * CAM_DIR_NONE otherwise.
  */
 void
 scsi_send_diagnostic(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     uint8_t tag_action, int unit_offline, int device_offline,
 		     int self_test, int page_format, int self_test_code,
 		     uint8_t *data_ptr, uint16_t param_list_length,
 		     uint8_t sense_len, uint32_t timeout)
 {
 	struct scsi_send_diag *cdb;
 	int test_code;
 	int flag_bits;
 
 	cdb = (struct scsi_send_diag *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 	cdb->opcode = SEND_DIAGNOSTIC;
 
 	/*
 	 * The default self-test mode control and specific test
 	 * control are mutually exclusive.
 	 */
 	test_code = self_test ? SSD_SELF_TEST_CODE_NONE : self_test_code;
 
 	flag_bits = (test_code << SSD_SELF_TEST_CODE_SHIFT) &
 		    SSD_SELF_TEST_CODE_MASK;
 	if (unit_offline)
 		flag_bits |= SSD_UNITOFFL;
 	if (device_offline)
 		flag_bits |= SSD_DEVOFFL;
 	if (self_test)
 		flag_bits |= SSD_SELFTEST;
 	if (page_format)
 		flag_bits |= SSD_PF;
 	cdb->byte2 = flag_bits;
 
 	scsi_ulto2b(param_list_length, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp,
 		      /*flags*/param_list_length ? CAM_DIR_OUT : CAM_DIR_NONE,
 		      tag_action, data_ptr,
 		      /*dxfer_len*/param_list_length, sense_len,
 		      /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * Build a READ_BUFFER CDB in csio.  mode is stored in byte2, buffer_id
  * is copied as is, and offset and allocation_length are each stored as
  * three byte fields.  Up to allocation_length bytes are read into
  * data_ptr (CAM_DIR_IN).
  */
 void
 scsi_read_buffer(struct ccb_scsiio *csio, u_int32_t retries,
 			void (*cbfcnp)(struct cam_periph *, union ccb*),
 			uint8_t tag_action, int mode,
 			uint8_t buffer_id, u_int32_t offset,
 			uint8_t *data_ptr, uint32_t allocation_length,
 			uint8_t sense_len, uint32_t timeout)
 {
 	struct scsi_read_buffer *cdb;
 
 	cdb = (struct scsi_read_buffer *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 
 	cdb->opcode = READ_BUFFER;
 	cdb->buffer_id = buffer_id;
 	cdb->byte2 = mode;
 	scsi_ulto3b(allocation_length, cdb->length);
 	scsi_ulto3b(offset, cdb->offset);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN, tag_action,
 		      data_ptr, /*dxfer_len*/allocation_length, sense_len,
 		      /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * Build a WRITE_BUFFER CDB in csio.  mode is stored in byte2, buffer_id
  * is copied as is, and offset and param_list_length are each stored as
  * three byte fields.  The transfer direction is CAM_DIR_OUT when
  * param_list_length is non-zero and CAM_DIR_NONE otherwise.
  */
 void
 scsi_write_buffer(struct ccb_scsiio *csio, u_int32_t retries,
 			void (*cbfcnp)(struct cam_periph *, union ccb *),
 			uint8_t tag_action, int mode,
 			uint8_t buffer_id, u_int32_t offset,
 			uint8_t *data_ptr, uint32_t param_list_length,
 			uint8_t sense_len, uint32_t timeout)
 {
 	struct scsi_write_buffer *cdb;
 
 	cdb = (struct scsi_write_buffer *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 
 	cdb->opcode = WRITE_BUFFER;
 	cdb->buffer_id = buffer_id;
 	cdb->byte2 = mode;
 	scsi_ulto3b(param_list_length, cdb->length);
 	scsi_ulto3b(offset, cdb->offset);
 
 	cam_fill_csio(csio, retries, cbfcnp,
 		      /*flags*/param_list_length ? CAM_DIR_OUT : CAM_DIR_NONE,
 		      tag_action, data_ptr,
 		      /*dxfer_len*/param_list_length, sense_len,
 		      /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * Build a START_STOP_UNIT CDB in csio.  Non-zero start and load_eject
  * set SSS_START and SSS_LOEJ in the "how" byte; a non-zero immediate
  * sets SSS_IMMED in byte2.  A start request also adds CAM_HIGH_POWER
  * to the CCB flags.  No data is transferred (CAM_DIR_NONE).
  */
 void 
 scsi_start_stop(struct ccb_scsiio *csio, u_int32_t retries,
 		void (*cbfcnp)(struct cam_periph *, union ccb *),
 		u_int8_t tag_action, int start, int load_eject,
 		int immediate, u_int8_t sense_len, u_int32_t timeout)
 {
 	struct scsi_start_stop_unit *cdb;
 	int ccb_flags;
 
 	cdb = (struct scsi_start_stop_unit *)&csio->cdb_io.cdb_bytes;
 	memset(cdb, 0, sizeof(*cdb));
 	cdb->opcode = START_STOP_UNIT;
 
 	if (immediate != 0)
 		cdb->byte2 |= SSS_IMMED;
 	if (load_eject != 0)
 		cdb->how |= SSS_LOEJ;
 
 	ccb_flags = CAM_DIR_NONE;
 	if (start != 0) {
 		cdb->how |= SSS_START;
 		/* it takes a lot of power to start a drive */
 		ccb_flags |= CAM_HIGH_POWER;
 	}
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/ccb_flags, tag_action,
 		      /*data_ptr*/NULL, /*dxfer_len*/0, sense_len,
 		      /*cdb_len*/sizeof(*cdb), timeout);
 }
 
 /*
  * Build a READ ATTRIBUTE command in the CDB area of csio and finish
  * initializing the CCB with cam_fill_csio().
  *
  * service_action, elem_type, logical_volume and partition are copied into
  * the CDB fields of the same name.  element and first_attribute are
  * encoded with scsi_ulto2b(), length with scsi_ulto4b().  A non-zero cache
  * sets SRA_CACHE in the CDB's cache field.  data_ptr/length describe the
  * buffer for the inbound (CAM_DIR_IN) transfer.
  */
 void
 scsi_read_attribute(struct ccb_scsiio *csio, u_int32_t retries,
 		    void (*cbfcnp)(struct cam_periph *, union ccb *),
 		    u_int8_t tag_action, u_int8_t service_action,
 		    uint32_t element, u_int8_t elem_type, int logical_volume,
 		    int partition, u_int32_t first_attribute, int cache,
 		    u_int8_t *data_ptr, u_int32_t length, int sense_len,
 		    u_int32_t timeout)
 {
 	struct scsi_read_attribute *cdb;
 
 	cdb = (struct scsi_read_attribute *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 
 	/* Single-byte fields. */
 	cdb->opcode = READ_ATTRIBUTE;
 	cdb->service_action = service_action;
 	cdb->elem_type = elem_type;
 	cdb->logical_volume = logical_volume;
 	cdb->partition = partition;
 	if (cache != 0)
 		cdb->cache |= SRA_CACHE;
 
 	/* Multi-byte fields. */
 	scsi_ulto2b(element, cdb->element);
 	scsi_ulto2b(first_attribute, cdb->first_attribute);
 	scsi_ulto4b(length, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN,
 	    tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/length,
 	    sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build a WRITE ATTRIBUTE command in the CDB area of csio and finish
  * initializing the CCB with cam_fill_csio().
  *
  * A non-zero wtc stores SWA_WTC in byte2.  element is encoded with
  * scsi_ulto3b() and length with scsi_ulto4b(); logical_volume and
  * partition are copied into the fields of the same name.
  * data_ptr/length describe the buffer for the outbound (CAM_DIR_OUT)
  * transfer.
  */
 void
 scsi_write_attribute(struct ccb_scsiio *csio, u_int32_t retries,
 		    void (*cbfcnp)(struct cam_periph *, union ccb *),
 		    u_int8_t tag_action, uint32_t element, int logical_volume,
 		    int partition, int wtc, u_int8_t *data_ptr,
 		    u_int32_t length, int sense_len, u_int32_t timeout)
 {
 	struct scsi_write_attribute *cdb;
 
 	cdb = (struct scsi_write_attribute *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 
 	cdb->opcode = WRITE_ATTRIBUTE;
 	if (wtc != 0)
 		cdb->byte2 = SWA_WTC;
 	cdb->logical_volume = logical_volume;
 	cdb->partition = partition;
 	scsi_ulto3b(element, cdb->element);
 	scsi_ulto4b(length, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT,
 	    tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/length,
 	    sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build a PERSISTENT RESERVE IN command in the CDB area of csio and finish
  * initializing the CCB with cam_fill_csio().
  *
  * service_action is stored in the CDB's action field and dxfer_len is
  * encoded into the CDB length field with scsi_ulto2b().
  * data_ptr/dxfer_len describe the buffer for the inbound (CAM_DIR_IN)
  * transfer.
  */
 void
 scsi_persistent_reserve_in(struct ccb_scsiio *csio, uint32_t retries,
 			   void (*cbfcnp)(struct cam_periph *, union ccb *),
 			   uint8_t tag_action, int service_action,
 			   uint8_t *data_ptr, uint32_t dxfer_len, int sense_len,
 			   int timeout)
 {
 	struct scsi_per_res_in *cdb;
 
 	cdb = (struct scsi_per_res_in *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 
 	cdb->opcode = PERSISTENT_RES_IN;
 	cdb->action = service_action;
 	scsi_ulto2b(dxfer_len, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN,
 	    tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/dxfer_len,
 	    sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build a PERSISTENT RESERVE OUT command in the CDB area of csio and
  * finish initializing the CCB with cam_fill_csio().
  *
  * service_action is stored in the CDB's action field; scope and res_type
  * are OR'ed together into the single scope_type byte; dxfer_len is encoded
  * with scsi_ulto4b().  data_ptr/dxfer_len describe the buffer for the
  * outbound (CAM_DIR_OUT) transfer.
  */
 void
 scsi_persistent_reserve_out(struct ccb_scsiio *csio, uint32_t retries,
 			    void (*cbfcnp)(struct cam_periph *, union ccb *),
 			    uint8_t tag_action, int service_action,
 			    int scope, int res_type, uint8_t *data_ptr,
 			    uint32_t dxfer_len, int sense_len, int timeout)
 {
 	struct scsi_per_res_out *cdb;
 
 	cdb = (struct scsi_per_res_out *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 
 	cdb->opcode = PERSISTENT_RES_OUT;
 	cdb->action = service_action;
 	cdb->scope_type = scope | res_type;
 	scsi_ulto4b(dxfer_len, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT,
 	    tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/dxfer_len,
 	    sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build a SECURITY PROTOCOL IN command in the CDB area of csio and finish
  * initializing the CCB with cam_fill_csio().
  *
  * security_protocol and byte4 are copied into the CDB fields of the same
  * name; security_protocol_specific is encoded with scsi_ulto2b() and
  * dxfer_len with scsi_ulto4b().  data_ptr/dxfer_len describe the buffer
  * for the inbound (CAM_DIR_IN) transfer.
  */
 void
 scsi_security_protocol_in(struct ccb_scsiio *csio, uint32_t retries,
 			  void (*cbfcnp)(struct cam_periph *, union ccb *),
 			  uint8_t tag_action, uint32_t security_protocol,
 			  uint32_t security_protocol_specific, int byte4,
 			  uint8_t *data_ptr, uint32_t dxfer_len, int sense_len,
 			  int timeout)
 {
 	struct scsi_security_protocol_in *cdb;
 
 	cdb = (struct scsi_security_protocol_in *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 
 	cdb->opcode = SECURITY_PROTOCOL_IN;
 	cdb->security_protocol = security_protocol;
 	cdb->byte4 = byte4;
 	scsi_ulto2b(security_protocol_specific,
 	    cdb->security_protocol_specific);
 	scsi_ulto4b(dxfer_len, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN,
 	    tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/dxfer_len,
 	    sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build a SECURITY PROTOCOL OUT command in the CDB area of csio and finish
  * initializing the CCB with cam_fill_csio().
  *
  * security_protocol and byte4 are copied into the CDB fields of the same
  * name; security_protocol_specific is encoded with scsi_ulto2b() and
  * dxfer_len with scsi_ulto4b().  data_ptr/dxfer_len describe the buffer
  * for the outbound (CAM_DIR_OUT) transfer.
  */
 void
 scsi_security_protocol_out(struct ccb_scsiio *csio, uint32_t retries,
 			   void (*cbfcnp)(struct cam_periph *, union ccb *),
 			   uint8_t tag_action, uint32_t security_protocol,
 			   uint32_t security_protocol_specific, int byte4,
 			   uint8_t *data_ptr, uint32_t dxfer_len, int sense_len,
 			   int timeout)
 {
 	struct scsi_security_protocol_out *cdb;
 
 	cdb = (struct scsi_security_protocol_out *)&csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 
 	cdb->opcode = SECURITY_PROTOCOL_OUT;
 	cdb->security_protocol = security_protocol;
 	cdb->byte4 = byte4;
 	scsi_ulto2b(security_protocol_specific,
 	    cdb->security_protocol_specific);
 	scsi_ulto4b(dxfer_len, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_OUT,
 	    tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/dxfer_len,
 	    sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Build a MAINTENANCE IN command with the REPORT SUPPORTED OPERATION CODES
  * service action in the CDB area of csio, then finish initializing the CCB
  * with cam_fill_csio().
  *
  * options and req_opcode are copied into the CDB's options and
  * requested_opcode fields; req_service_action is encoded with
  * scsi_ulto2b() and dxfer_len with scsi_ulto4b().  data_ptr/dxfer_len
  * describe the buffer for the inbound (CAM_DIR_IN) transfer.
  */
 void
 scsi_report_supported_opcodes(struct ccb_scsiio *csio, uint32_t retries,
 			      void (*cbfcnp)(struct cam_periph *, union ccb *),
 			      uint8_t tag_action, int options, int req_opcode,
 			      int req_service_action, uint8_t *data_ptr,
 			      uint32_t dxfer_len, int sense_len, int timeout)
 {
 	struct scsi_report_supported_opcodes *cdb;
 
 	cdb = (struct scsi_report_supported_opcodes *)
 	    &csio->cdb_io.cdb_bytes;
 	bzero(cdb, sizeof(*cdb));
 
 	cdb->opcode = MAINTENANCE_IN;
 	cdb->service_action = REPORT_SUPPORTED_OPERATION_CODES;
 	cdb->options = options;
 	cdb->requested_opcode = req_opcode;
 	scsi_ulto2b(req_service_action, cdb->requested_service_action);
 	scsi_ulto4b(dxfer_len, cdb->length);
 
 	cam_fill_csio(csio, retries, cbfcnp, /*flags*/CAM_DIR_IN,
 	    tag_action, /*data_ptr*/data_ptr, /*dxfer_len*/dxfer_len,
 	    sense_len, sizeof(*cdb), timeout);
 }
 
 /*
  * Try to make as good a match as possible with the available sub drivers.
  *
  * inqbuffer is treated as a struct scsi_inquiry_data and table_entry as a
  * struct scsi_inquiry_pattern.  The tests are applied in this order and
  * the first failure ends the comparison: device type (T_ANY in the entry
  * accepts every type), removable/fixed media bit, then vendor, product and
  * revision strings via cam_strmatch().
  *
  * Returns 0 on a match, -1 otherwise.
  */
 int
 scsi_inquiry_match(caddr_t inqbuffer, caddr_t table_entry)
 {
 	struct scsi_inquiry_pattern *pattern;
 	struct scsi_inquiry_data *inq;
 	int media_bit;
 
 	pattern = (struct scsi_inquiry_pattern *)table_entry;
 	inq = (struct scsi_inquiry_data *)inqbuffer;
 
 	if (SID_TYPE(inq) != pattern->type && pattern->type != T_ANY)
 		return (-1);
 
 	/* The entry must allow the kind of media this device reports. */
 	media_bit = SID_IS_REMOVABLE(inq) ? SIP_MEDIA_REMOVABLE
 					  : SIP_MEDIA_FIXED;
 	if ((pattern->media_type & media_bit) == 0)
 		return (-1);
 
 	if (cam_strmatch(inq->vendor, pattern->vendor,
 	    sizeof(inq->vendor)) != 0)
 		return (-1);
 	if (cam_strmatch(inq->product, pattern->product,
 	    sizeof(inq->product)) != 0)
 		return (-1);
 	if (cam_strmatch(inq->revision, pattern->revision,
 	    sizeof(inq->revision)) != 0)
 		return (-1);
 
 	return (0);
 }
 
 /*
  * Try to make as good a match as possible with the available sub drivers.
  *
  * Same comparison as scsi_inquiry_match(), except that table_entry is
  * treated as a struct scsi_static_inquiry_pattern.  The tests are applied
  * in this order and the first failure ends the comparison: device type
  * (T_ANY in the entry accepts every type), removable/fixed media bit, then
  * vendor, product and revision strings via cam_strmatch().
  *
  * Returns 0 on a match, -1 otherwise.
  */
 int
 scsi_static_inquiry_match(caddr_t inqbuffer, caddr_t table_entry)
 {
 	struct scsi_static_inquiry_pattern *pattern;
 	struct scsi_inquiry_data *inq;
 	int media_bit;
 
 	pattern = (struct scsi_static_inquiry_pattern *)table_entry;
 	inq = (struct scsi_inquiry_data *)inqbuffer;
 
 	if (SID_TYPE(inq) != pattern->type && pattern->type != T_ANY)
 		return (-1);
 
 	/* The entry must allow the kind of media this device reports. */
 	media_bit = SID_IS_REMOVABLE(inq) ? SIP_MEDIA_REMOVABLE
 					  : SIP_MEDIA_FIXED;
 	if ((pattern->media_type & media_bit) == 0)
 		return (-1);
 
 	if (cam_strmatch(inq->vendor, pattern->vendor,
 	    sizeof(inq->vendor)) != 0)
 		return (-1);
 	if (cam_strmatch(inq->product, pattern->product,
 	    sizeof(inq->product)) != 0)
 		return (-1);
 	if (cam_strmatch(inq->revision, pattern->revision,
 	    sizeof(inq->revision)) != 0)
 		return (-1);
 
 	return (0);
 }
 
 /**
  * Compare two buffers of vpd device descriptors for a match.
  *
  * \param lhs      Pointer to first buffer of descriptors to compare.
  * \param lhs_len  The length of the first buffer.
  * \param rhs	   Pointer to second buffer of descriptors to compare.
  * \param rhs_len  The length of the second buffer.
  *
  * \return  0 on a match, -1 otherwise.
  *
  * Treat rhs and lhs as arrays of vpd device id descriptors.  Walk lhs matching
  * against each element in rhs until all data are exhausted or we have found
  * a match.  Two descriptors match when their association and type bits
  * (SVPD_ID_ASSOC_MASK | SVPD_ID_TYPE_MASK) agree, their identifier lengths
  * are equal and the identifier bytes compare equal.
  *
  * A NULL buffer, or one too short to hold even a descriptor header, never
  * matches.
  */
 int
 scsi_devid_match(uint8_t *lhs, size_t lhs_len, uint8_t *rhs, size_t rhs_len)
 {
 	struct scsi_vpd_id_descriptor *lhs_id;
 	struct scsi_vpd_id_descriptor *lhs_last;
 	struct scsi_vpd_id_descriptor *rhs_last;
 	uint8_t *lhs_end;
 	uint8_t *rhs_end;
 	const size_t hdr_len =
 	    __offsetof(struct scsi_vpd_id_descriptor, identifier);
 
 	/*
 	 * Reject missing or undersized buffers up front.  Without this
 	 * check the "last" pointers below would be computed before the
 	 * start of the buffer, and for a NULL buffer the wrapped pointer
 	 * would let the loop dereference NULL.
 	 */
 	if (lhs == NULL || rhs == NULL ||
 	    lhs_len < hdr_len || rhs_len < hdr_len)
 		return (-1);
 
 	lhs_end = lhs + lhs_len;
 	rhs_end = rhs + rhs_len;
 
 	/*
 	 * rhs_last and lhs_last are the last possible position of a valid
 	 * descriptor assuming it had a zero length identifier.  We use
 	 * these variables to ensure we can safely dereference the length
 	 * field in our loop termination tests.
 	 */
 	lhs_last = (struct scsi_vpd_id_descriptor *)(lhs_end - hdr_len);
 	rhs_last = (struct scsi_vpd_id_descriptor *)(rhs_end - hdr_len);
 
 	lhs_id = (struct scsi_vpd_id_descriptor *)lhs;
 	while (lhs_id <= lhs_last
 	    && (lhs_id->identifier + lhs_id->length) <= lhs_end) {
 		struct scsi_vpd_id_descriptor *rhs_id;
 
 		/* Restart the scan of rhs for every lhs descriptor. */
 		rhs_id = (struct scsi_vpd_id_descriptor *)rhs;
 		while (rhs_id <= rhs_last
 		    && (rhs_id->identifier + rhs_id->length) <= rhs_end) {
 
 			if ((rhs_id->id_type &
 			     (SVPD_ID_ASSOC_MASK | SVPD_ID_TYPE_MASK)) ==
 			    (lhs_id->id_type &
 			     (SVPD_ID_ASSOC_MASK | SVPD_ID_TYPE_MASK))
 			 && rhs_id->length == lhs_id->length
 			 && memcmp(rhs_id->identifier, lhs_id->identifier,
 				   rhs_id->length) == 0)
 				return (0);
 
 			/* Next descriptor starts after this identifier. */
 			rhs_id = (struct scsi_vpd_id_descriptor *)
 			   (rhs_id->identifier + rhs_id->length);
 		}
 		lhs_id = (struct scsi_vpd_id_descriptor *)
 		   (lhs_id->identifier + lhs_id->length);
 	}
 	return (-1);
 }
 
 #ifdef _KERNEL
 /*
  * Report whether page_id appears in the supported-VPD-pages data stored
  * on the device that periph is attached to.
  *
  * The list is taken from device->supported_vpds; the number of entries
  * examined is supported_vpds_len minus SVPD_SUPPORTED_PAGES_HDR_LEN.
  * Returns 1 if page_id is listed, 0 if it is not or if the device has no
  * stored list.
  */
 int
 scsi_vpd_supported_page(struct cam_periph *periph, uint8_t page_id)
 {
 	struct cam_ed *device;
 	struct scsi_vpd_supported_pages *vpds;
 	int i, num_pages;
 
 	device = periph->path->device;
 	vpds = (struct scsi_vpd_supported_pages *)device->supported_vpds;
 	if (vpds == NULL)
 		return (0);
 
 	num_pages = device->supported_vpds_len -
 	    SVPD_SUPPORTED_PAGES_HDR_LEN;
 	i = 0;
 	while (i < num_pages) {
 		if (vpds->page_list[i] == page_id)
 			return (1);
 		i++;
 	}
 
 	return (0);
 }
 
 /*
  * Set the initial value of scsi_delay; run via the SYSINIT below at
  * SI_SUB_TUNABLES.
  *
  * Start from the compiled-in SCSI_DELAY, let the kern.cam.scsi_delay
  * tunable override it, and hand the result to set_scsi_delay().  If that
  * value is rejected, print a message and fall back to SCSI_DELAY.
  */
 static void
 init_scsi_delay(void)
 {
 	int tuned = SCSI_DELAY;
 
 	TUNABLE_INT_FETCH("kern.cam.scsi_delay", &tuned);
 	if (set_scsi_delay(tuned) == 0)
 		return;
 
 	printf("cam: invalid value for tunable kern.cam.scsi_delay\n");
 	set_scsi_delay(SCSI_DELAY);
 }
 SYSINIT(scsi_delay, SI_SUB_TUNABLES, SI_ORDER_ANY, init_scsi_delay, NULL);
 
 static int
 sysctl_scsi_delay(SYSCTL_HANDLER_ARGS)
 {
 	int error, delay;
 
 	delay = scsi_delay;
 	error = sysctl_handle_int(oidp, &delay, 0, req);
 	if (error != 0 || req->newptr == NULL)
 		return (error);
 	return (set_scsi_delay(delay));
 }
 SYSCTL_PROC(_kern_cam, OID_AUTO, scsi_delay, CTLTYPE_INT|CTLFLAG_RW,
     0, 0, sysctl_scsi_delay, "I",
     "Delay to allow devices to settle after a SCSI bus reset (ms)");
 
 /*
  * Validate a requested bus settle delay and store it in scsi_delay.
  *
  * A request of 0 is taken to mean "the minimum allowable bus settle
  * delay": it is replaced by SCSI_MIN_DELAY and a message is printed.  A
  * value that is still below SCSI_MIN_DELAY is rejected with EINVAL and
  * scsi_delay is left untouched; otherwise scsi_delay is updated and 0 is
  * returned.
  */
 static int
 set_scsi_delay(int delay)
 {
 	int requested;
 
 	requested = delay;
 	if (requested == 0) {
 		printf("cam: using minimum scsi_delay (%dms)\n",
 		    SCSI_MIN_DELAY);
 		requested = SCSI_MIN_DELAY;
 	}
 	if (requested >= SCSI_MIN_DELAY) {
 		scsi_delay = requested;
 		return (0);
 	}
 	return (EINVAL);
 }
 #endif /* _KERNEL */
Index: projects/nfsv42/sys/cam/scsi/scsi_all.h
===================================================================
--- projects/nfsv42/sys/cam/scsi/scsi_all.h	(revision 350367)
+++ projects/nfsv42/sys/cam/scsi/scsi_all.h	(revision 350368)
@@ -1,4423 +1,4465 @@
 /*-
  * Largely written by Julian Elischer (julian@tfs.com)
  * for TRW Financial Systems.
  *
  * TRW Financial Systems, in accordance with their agreement with Carnegie
  * Mellon University, makes this software available to CMU to distribute
  * or use in any manner that they see fit as long as this message is kept with
  * the software. For this reason TFS also grants any other persons or
  * organisations permission to use or modify this software.
  *
  * TFS supplies this software to be publicly redistributed
  * on the understanding that TFS is not responsible for the correct
  * functioning of this software in any circumstances.
  *
  * Ported to run under 386BSD by Julian Elischer (julian@tfs.com) Sept 1992
  *
  * $FreeBSD$
  */
 
 /*
  * SCSI general  interface description
  */
 
 #ifndef	_SCSI_SCSI_ALL_H
 #define	_SCSI_SCSI_ALL_H 1
 
 #include <sys/cdefs.h>
 #ifdef _KERNEL
 #include <machine/stdarg.h>
 #else
 #include <stdarg.h>
 #endif
 
 #ifdef _KERNEL
 /*
  * This is the number of milliseconds we wait for devices to settle after a
  * SCSI bus reset.
  */
 extern int scsi_delay;
 #endif /* _KERNEL */
 
 /*
  * SCSI command format
  */
 
 /*
  * Define some bits that are in ALL (or a lot of) SCSI commands
  */
 #define	SCSI_CTL_LINK		0x01
 #define	SCSI_CTL_FLAG		0x02
 #define	SCSI_CTL_VENDOR		0xC0
 #define	SCSI_CMD_LUN		0xA0	/* these two should not be needed */
 #define	SCSI_CMD_LUN_SHIFT	5	/* LUN in the cmd is no longer SCSI */
 
 #define	SCSI_MAX_CDBLEN		16	/* 
 					 * 16 byte commands are in the 
 					 * SCSI-3 spec 
 					 */
 #if defined(CAM_MAX_CDBLEN) && (CAM_MAX_CDBLEN < SCSI_MAX_CDBLEN)
 #error "CAM_MAX_CDBLEN cannot be less than SCSI_MAX_CDBLEN"
 #endif
 
 /*
  * 6-byte CDBs special-case a length of 0 to mean 256.  The argument is
  * parenthesized in both places it is used; note that it is evaluated
  * twice.
  */
 #define	SCSI_CDB6_LEN(len)	((len) == 0 ? 256 : (len))
 
 /*
  * This type defines actions to be taken when a particular sense code is
  * received.  Right now, these flags are only defined to take up 16 bits,
  * but can be expanded in the future if necessary.
  *
  * The action values below are encoded in bits 16-23, which SS_MASK
  * selects.  The SS_RDEF/SS_RET/SS_WAIT/SS_FATAL macros in this file OR an
  * action together with scsi_sense_action_qualifier flags (and, for
  * SS_RDEF, an errno value).
  */
 typedef enum {
 	SS_NOP      = 0x000000,	/* Do nothing */
 	SS_RETRY    = 0x010000,	/* Retry the command */
 	SS_FAIL     = 0x020000,	/* Bail out */
 	SS_START    = 0x030000,	/* Send a Start Unit command to the device,
 				 * then retry the original command.
 				 */
 	SS_TUR      = 0x040000,	/* Send a Test Unit Ready command to the
 				 * device, then retry the original command.
 				 */
 	SS_MASK     = 0xff0000	/* Selects the action bits */
 } scsi_sense_action;
 
 /*
  * Qualifier flags that are OR'ed with a scsi_sense_action value (see the
  * SS_RDEF/SS_RET/SS_WAIT/SS_FATAL macros in this file).  Each flag is a
  * single bit in bits 8-15, which SSQ_MASK selects.
  */
 typedef enum {
 	SSQ_NONE		= 0x0000,
 	SSQ_DECREMENT_COUNT	= 0x0100,  /* Decrement the retry count */
 	SSQ_MANY		= 0x0200,  /* send lots of recovery commands */
 	SSQ_RANGE		= 0x0400,  /*
 					    * This table entry represents the
 					    * end of a range of ASCQs that
 					    * have identical error actions
 					    * and text.
 					    */
 	SSQ_PRINT_SENSE		= 0x0800,
 	SSQ_UA			= 0x1000,  /* Broadcast UA. */
 	SSQ_RESCAN		= 0x2000,  /* Rescan target for LUNs. */
 	SSQ_LOST		= 0x4000,  /* Destroy the LUNs. */
 	SSQ_MASK		= 0xff00   /* Selects the qualifier bits */
 } scsi_sense_action_qualifier;
 
 /* Mask for error status values */
 #define	SS_ERRMASK	0xff
 
 /*
  * The composite actions below are fully parenthesized so that each one
  * expands to a single operand; without the parentheses an expression such
  * as (SS_RDEF & SS_ERRMASK) would bind the '&' to the last term only.
  */
 
 /* The default, retryable, error action */
 #define	SS_RDEF		(SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE|EIO)
 
 /* The retryable, error action, with table specified error code */
 #define	SS_RET		(SS_RETRY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE)
 
 /* Wait for transient error status to change */
 #define	SS_WAIT		(SS_TUR|SSQ_MANY|SSQ_DECREMENT_COUNT|SSQ_PRINT_SENSE)
 
 /* Fatal error action, with table specified error code */
 #define	SS_FATAL	(SS_FAIL|SSQ_PRINT_SENSE)
 
 /*
  * Generic view of a command block: a one-byte opcode followed by 11
  * uninterpreted bytes (12 bytes declared in total).
  */
 struct scsi_generic
 {
 	u_int8_t opcode;
 	u_int8_t bytes[11];
 };
 
 /*
  * Six one-byte fields; presumably the CDB layout of the REQUEST SENSE
  * command (going by the name -- confirm against the opcode table).
  * SRS_DESC is a bit value defined next to byte2.
  */
 struct scsi_request_sense
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SRS_DESC	0x01
 	u_int8_t unused[2];
 	u_int8_t length;	/* single-byte length field */
 	u_int8_t control;
 };
 
 /*
  * Six bytes: opcode, byte2, three unused bytes and a control byte.
  * Presumably the CDB layout of the TEST UNIT READY command (going by the
  * name); it has no length field.
  */
 struct scsi_test_unit_ready
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 	u_int8_t unused[3];
 	u_int8_t control;
 };
 
 struct scsi_receive_diag {
 	uint8_t opcode;
 	uint8_t byte2;
 #define SRD_PCV		0x01
 	uint8_t page_code;
 	uint8_t length[2]; 
 	uint8_t control;
 };
 
 struct scsi_send_diag {
 	uint8_t opcode;
 	uint8_t byte2;
 #define SSD_UNITOFFL				0x01
 #define SSD_DEVOFFL				0x02
 #define SSD_SELFTEST				0x04
 #define SSD_PF					0x10
 #define SSD_SELF_TEST_CODE_MASK			0xE0
 #define SSD_SELF_TEST_CODE_SHIFT		5
 #define		SSD_SELF_TEST_CODE_NONE		0x00
 #define		SSD_SELF_TEST_CODE_BG_SHORT	0x01
 #define		SSD_SELF_TEST_CODE_BG_EXTENDED	0x02
 #define		SSD_SELF_TEST_CODE_BG_ABORT	0x04
 #define		SSD_SELF_TEST_CODE_FG_SHORT	0x05
 #define		SSD_SELF_TEST_CODE_FG_EXTENDED	0x06
 	uint8_t	reserved;
 	uint8_t	length[2];
 	uint8_t control;
 };
 
 /*
  * Six one-byte fields with the same layout as struct scsi_request_sense
  * (opcode, byte2, two unused bytes, length, control), but without any
  * flag definitions attached to byte2.
  */
 struct scsi_sense
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 	u_int8_t unused[2];
 	u_int8_t length;
 	u_int8_t control;
 };
 
 struct scsi_inquiry
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SI_EVPD 	0x01
 #define	SI_CMDDT	0x02
 	u_int8_t page_code;
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 struct scsi_mode_sense_6
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SMS_DBD				0x08
 	u_int8_t page;
 #define	SMS_PAGE_CODE 			0x3F
 #define	SMS_VENDOR_SPECIFIC_PAGE	0x00
 #define	SMS_DISCONNECT_RECONNECT_PAGE	0x02
 #define	SMS_FORMAT_DEVICE_PAGE		0x03
 #define	SMS_GEOMETRY_PAGE		0x04
 #define	SMS_CACHE_PAGE			0x08
 #define	SMS_PERIPHERAL_DEVICE_PAGE	0x09
 #define	SMS_CONTROL_MODE_PAGE		0x0A
 #define	SMS_PROTO_SPECIFIC_PAGE		0x19
 #define	SMS_INFO_EXCEPTIONS_PAGE	0x1C
 #define	SMS_ALL_PAGES_PAGE		0x3F
 #define	SMS_PAGE_CTRL_MASK		0xC0
 #define	SMS_PAGE_CTRL_CURRENT 		0x00
 #define	SMS_PAGE_CTRL_CHANGEABLE 	0x40
 #define	SMS_PAGE_CTRL_DEFAULT 		0x80
 #define	SMS_PAGE_CTRL_SAVED 		0xC0
 	u_int8_t subpage;
 #define	SMS_SUBPAGE_PAGE_0		0x00
 #define	SMS_SUBPAGE_ALL			0xff
 	u_int8_t length;
 	u_int8_t control;
 };
 
 struct scsi_mode_sense_10
 {
 	u_int8_t opcode;
 	u_int8_t byte2;		/* same bits as small version */
 #define	SMS10_LLBAA			0x10
 	u_int8_t page; 		/* same bits as small version */
 	u_int8_t subpage;
 	u_int8_t unused[3];
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 struct scsi_mode_select_6
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SMS_SP	0x01
 #define	SMS_RTD	0x02
 #define	SMS_PF	0x10
 	u_int8_t unused[2];
 	u_int8_t length;
 	u_int8_t control;
 };
 
 struct scsi_mode_select_10
 {
 	u_int8_t opcode;
 	u_int8_t byte2;		/* same bits as small version */
 	u_int8_t unused[5];
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 /*
  * Four-byte mode parameter header; each field is a single byte
  * (struct scsi_mode_hdr_10 is the variant with two-byte datalen and
  * block_descr_len fields).
  *
  * When sending a mode select to a tape drive, the medium type must be 0.
  */
 struct scsi_mode_hdr_6
 {
 	u_int8_t datalen;
 	u_int8_t medium_type;
 	u_int8_t dev_specific;
 	u_int8_t block_descr_len;
 };
 
 struct scsi_mode_hdr_10
 {
 	u_int8_t datalen[2];
 	u_int8_t medium_type;
 	u_int8_t dev_specific;
-	u_int8_t reserved[2];
+	u_int8_t flags;
+#define	SMH_LONGLBA	0x01
+	u_int8_t reserved;
 	u_int8_t block_descr_len[2];
 };
 
 struct scsi_mode_block_descr
 {
 	u_int8_t density_code;
 	u_int8_t num_blocks[3];
 	u_int8_t reserved;
 	u_int8_t block_len[3];
 };
 
+struct scsi_mode_block_descr_dshort
+{
+	u_int8_t num_blocks[4];
+	u_int8_t reserved;
+	u_int8_t block_len[3];
+};
+
+struct scsi_mode_block_descr_dlong
+{
+	u_int8_t num_blocks[8];
+	u_int8_t reserved[4];
+	u_int8_t block_len[4];
+};
+
 /*
  * CDB layout filled in by scsi_persistent_reserve_in(): that function sets
  * opcode to PERSISTENT_RES_IN, stores the service action in "action" and
  * encodes the transfer length into "length" with scsi_ulto2b().
  */
 struct scsi_per_res_in
 {
 	u_int8_t opcode;
 	u_int8_t action;
 	/* SPRI_R* values are defined next to "action"; presumably its codes. */
 #define	SPRI_RK	0x00
 #define	SPRI_RR	0x01
 #define	SPRI_RC	0x02
 #define	SPRI_RS	0x03
 	u_int8_t reserved[5];
 	u_int8_t length[2];
 	/* Largest value that fits in the two-byte length field. */
 #define	SPRI_MAX_LEN		0xffff
 	u_int8_t control;
 };
 
 /*
  * Eight-byte header (4-byte generation, 4-byte length) that is the first
  * member of struct scsi_per_res_in_keys, struct scsi_per_res_in_rsrv and
  * struct scsi_per_res_in_full.
  */
 struct scsi_per_res_in_header
 {
 	u_int8_t generation[4];
 	u_int8_t length[4];
 };
 
 /*
  * An 8-byte key, stored as raw bytes.  Embedded in the persistent
  * reservation structures in this file (e.g. struct scsi_per_res_in_keys,
  * struct scsi_per_res_out_parms, struct scsi_per_res_reg_move).
  */
 struct scsi_per_res_key
 {
 	u_int8_t key[8];
 };
 
 /*
  * A struct scsi_per_res_in_header followed by a variable number of 8-byte
  * keys.  "keys" is a C99 flexible array member, like the trailing arrays
  * of the neighbouring persistent reservation structures, so it adds
  * nothing to sizeof(struct scsi_per_res_in_keys).  The number of keys
  * present is presumably derived from header.length -- confirm against the
  * consumer.
  */
 struct scsi_per_res_in_keys
 {
 	struct scsi_per_res_in_header header;
 	struct scsi_per_res_key keys[];
 };
 
 struct scsi_per_res_cap
 {
 	uint8_t length[2];
 	uint8_t flags1;
 #define	SPRI_RLR_C		0x80
 #define	SPRI_CRH		0x10
 #define	SPRI_SIP_C		0x08
 #define	SPRI_ATP_C		0x04
 #define	SPRI_PTPL_C		0x01
 	uint8_t flags2;
 #define	SPRI_TMV		0x80
 #define	SPRI_ALLOW_CMD_MASK	0x70
 #define	SPRI_ALLOW_CMD_SHIFT	4
 #define	SPRI_ALLOW_NA		0x00
 #define	SPRI_ALLOW_1		0x10
 #define	SPRI_ALLOW_2		0x20
 #define	SPRI_ALLOW_3		0x30
 #define	SPRI_ALLOW_4		0x40
 #define	SPRI_ALLOW_5		0x50
 #define	SPRI_PTPL_A		0x01
 	uint8_t type_mask[2];
 #define	SPRI_TM_WR_EX_AR	0x8000
 #define	SPRI_TM_EX_AC_RO	0x4000
 #define	SPRI_TM_WR_EX_RO	0x2000
 #define	SPRI_TM_EX_AC		0x0800
 #define	SPRI_TM_WR_EX		0x0200
 #define	SPRI_TM_EX_AC_AR	0x0001
 	uint8_t reserved[2];
 };
 
 struct scsi_per_res_in_rsrv_data
 {
 	uint8_t reservation[8];
 	uint8_t scope_addr[4];
 	uint8_t reserved;
 	uint8_t scopetype;
 #define	SPRT_WE    0x01
 #define	SPRT_EA    0x03
 #define	SPRT_WERO  0x05
 #define	SPRT_EARO  0x06
 #define	SPRT_WEAR  0x07
 #define	SPRT_EAAR  0x08
 	uint8_t extent_length[2];
 };
 
 /*
  * A struct scsi_per_res_in_header immediately followed by exactly one
  * struct scsi_per_res_in_rsrv_data.
  */
 struct scsi_per_res_in_rsrv
 {
 	struct scsi_per_res_in_header header;
 	struct scsi_per_res_in_rsrv_data data;
 };
 
 struct scsi_per_res_in_full_desc
 {
 	struct scsi_per_res_key res_key;
 	uint8_t reserved1[4];
 	uint8_t flags;
 #define	SPRI_FULL_ALL_TG_PT	0x02
 #define	SPRI_FULL_R_HOLDER	0x01
 	uint8_t scopetype;
 	uint8_t reserved2[4];
 	uint8_t rel_trgt_port_id[2];
 	uint8_t additional_length[4];
 	uint8_t transport_id[];
 };
 
 /*
  * A struct scsi_per_res_in_header followed by full status descriptors.
  *
  * NOTE(review): struct scsi_per_res_in_full_desc itself ends in a
  * flexible transport_id[] array sized by its additional_length field, so
  * the descriptors are presumably variable-sized and desc[i] for i > 0
  * should not be assumed to land on a descriptor boundary -- confirm
  * against the code that walks this data.
  */
 struct scsi_per_res_in_full
 {
 	struct scsi_per_res_in_header header;
 	struct scsi_per_res_in_full_desc desc[];
 };
 
 /*
  * CDB layout filled in by scsi_persistent_reserve_out(): that function
  * sets opcode to PERSISTENT_RES_OUT, stores the service action in
  * "action", stores (scope | res_type) in "scope_type" and encodes the
  * transfer length into "length" with scsi_ulto4b().
  */
 struct scsi_per_res_out
 {
 	u_int8_t opcode;
 	u_int8_t action;
 	/* SPRO_* values are defined next to "action"; SPRO_ACTION_MASK is 0x1f. */
 #define	SPRO_REGISTER		0x00
 #define	SPRO_RESERVE		0x01
 #define	SPRO_RELEASE		0x02
 #define	SPRO_CLEAR		0x03
 #define	SPRO_PREEMPT		0x04
 #define	SPRO_PRE_ABO		0x05
 #define	SPRO_REG_IGNO		0x06
 #define	SPRO_REG_MOVE		0x07
 #define	SPRO_REPL_LOST_RES	0x08
 #define	SPRO_ACTION_MASK	0x1f
 	u_int8_t scope_type;
 	/* Scope lives in the high nibble (0xf0), type in the low nibble (0x0f). */
 #define	SPR_SCOPE_MASK		0xf0
 #define	SPR_SCOPE_SHIFT		4
 #define	SPR_LU_SCOPE		0x00
 #define	SPR_EXTENT_SCOPE	0x10
 #define	SPR_ELEMENT_SCOPE	0x20
 #define	SPR_TYPE_MASK		0x0f
 #define	SPR_TYPE_RD_SHARED	0x00
 #define	SPR_TYPE_WR_EX		0x01
 #define	SPR_TYPE_RD_EX		0x02
 #define	SPR_TYPE_EX_AC		0x03
 #define	SPR_TYPE_SHARED		0x04
 #define	SPR_TYPE_WR_EX_RO	0x05
 #define	SPR_TYPE_EX_AC_RO	0x06
 #define	SPR_TYPE_WR_EX_AR	0x07
 #define	SPR_TYPE_EX_AC_AR	0x08
 	u_int8_t reserved[2];
 	u_int8_t length[4];
 	u_int8_t control;
 };
 
 struct scsi_per_res_out_parms
 {
 	struct scsi_per_res_key res_key;
 	u_int8_t serv_act_res_key[8];
 	u_int8_t scope_spec_address[4];
 	u_int8_t flags;
 #define	SPR_SPEC_I_PT		0x08
 #define	SPR_ALL_TG_PT		0x04
 #define	SPR_APTPL		0x01
 	u_int8_t reserved1;
 	u_int8_t extent_length[2];
 	u_int8_t transport_id_list[];
 };
 
 struct scsi_per_res_out_trans_ids {
 	u_int8_t additional_length[4];
 	u_int8_t transport_ids[];
 };
 
 /*
  * Used with REGISTER AND MOVE service action of the PERSISTENT RESERVE OUT
  * command.
  *
  * Fixed part: an 8-byte reservation key, an 8-byte service action
  * reservation key, one reserved byte, a flags byte, a 2-byte relative
  * target port id and a 4-byte transport id length.  The transport id
  * bytes follow in the flexible array member.
  */
 struct scsi_per_res_reg_move
 {
 	struct scsi_per_res_key res_key;
 	u_int8_t serv_act_res_key[8];
 	u_int8_t reserved;
 	u_int8_t flags;
 #define	SPR_REG_MOVE_UNREG	0x02
 #define	SPR_REG_MOVE_APTPL	0x01
 	u_int8_t rel_trgt_port_id[2];
 	u_int8_t transport_id_length[4];
 	u_int8_t transport_id[];
 };
 
 struct scsi_transportid_header
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_FORMAT_MASK		0xc0
 #define	SCSI_TRN_FORMAT_SHIFT		6
 #define	SCSI_TRN_PROTO_MASK		0x0f
 };
 
 struct scsi_transportid_fcp
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_FCP_FORMAT_DEFAULT	0x00
 	uint8_t reserved1[7];
 	uint8_t n_port_name[8];
 	uint8_t reserved2[8];
 };
 
 struct scsi_transportid_spi
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_SPI_FORMAT_DEFAULT	0x00
 	uint8_t reserved1;
 	uint8_t scsi_addr[2];
 	uint8_t obsolete[2];
 	uint8_t rel_trgt_port_id[2];
 	uint8_t reserved2[16];
 };
 
 struct scsi_transportid_1394
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_1394_FORMAT_DEFAULT	0x00
 	uint8_t reserved1[7];
 	uint8_t eui64[8];
 	uint8_t reserved2[8];
 };
 
 struct scsi_transportid_rdma
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_RDMA_FORMAT_DEFAULT	0x00
 	uint8_t reserved[7];
 #define	SCSI_TRN_RDMA_PORT_LEN		16
 	uint8_t initiator_port_id[SCSI_TRN_RDMA_PORT_LEN];
 };
 
 struct scsi_transportid_iscsi_device
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_ISCSI_FORMAT_DEVICE	0x00
 	uint8_t reserved;
 	uint8_t additional_length[2];
 	uint8_t iscsi_name[];
 };
 
 struct scsi_transportid_iscsi_port
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_ISCSI_FORMAT_PORT	0x40
 	uint8_t reserved;
 	uint8_t additional_length[2];
 	uint8_t iscsi_name[];
 	/*
 	 * Followed by a separator and iSCSI initiator session ID
 	 */
 };
 
 struct scsi_transportid_sas
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_SAS_FORMAT_DEFAULT	0x00
 	uint8_t reserved1[3];
 	uint8_t sas_address[8];
 	uint8_t reserved2[12];
 };
 
 struct scsi_sop_routing_id_norm {
 	uint8_t bus;
 	uint8_t devfunc;
 #define	SCSI_TRN_SOP_BUS_MAX		0xff
 #define	SCSI_TRN_SOP_DEV_MAX		0x1f
 #define	SCSI_TRN_SOP_DEV_MASK		0xf8
 #define	SCSI_TRN_SOP_DEV_SHIFT		3
 #define	SCSI_TRN_SOP_FUNC_NORM_MASK	0x07
 #define	SCSI_TRN_SOP_FUNC_NORM_MAX	0x07
 };
 
 struct scsi_sop_routing_id_alt {
 	uint8_t bus;
 	uint8_t function;
 #define	SCSI_TRN_SOP_FUNC_ALT_MAX	0xff
 };
 
 struct scsi_transportid_sop
 {
 	uint8_t format_protocol;
 #define	SCSI_TRN_SOP_FORMAT_DEFAULT	0x00
 	uint8_t reserved1;
 	uint8_t routing_id[2];
 	uint8_t reserved2[20];
 };
 
 struct scsi_log_sense
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SLS_SP				0x01
 #define	SLS_PPC				0x02
 	u_int8_t page;
 #define	SLS_PAGE_CODE 			0x3F
 #define	SLS_SUPPORTED_PAGES_PAGE	0x00
 #define	SLS_OVERRUN_PAGE		0x01
 #define	SLS_ERROR_WRITE_PAGE		0x02
 #define	SLS_ERROR_READ_PAGE		0x03
 #define	SLS_ERROR_READREVERSE_PAGE	0x04
 #define	SLS_ERROR_VERIFY_PAGE		0x05
 #define	SLS_ERROR_NONMEDIUM_PAGE	0x06
 #define	SLS_ERROR_LASTN_PAGE		0x07
 #define	SLS_LOGICAL_BLOCK_PROVISIONING	0x0c
+#define	SLS_TEMPERATURE			0x0d
 #define	SLS_SELF_TEST_PAGE		0x10
 #define	SLS_SOLID_STATE_MEDIA		0x11
 #define	SLS_STAT_AND_PERF		0x19
 #define	SLS_IE_PAGE			0x2f
 #define	SLS_PAGE_CTRL_MASK		0xC0
 #define	SLS_PAGE_CTRL_THRESHOLD		0x00
 #define	SLS_PAGE_CTRL_CUMULATIVE	0x40
 #define	SLS_PAGE_CTRL_THRESH_DEFAULT	0x80
 #define	SLS_PAGE_CTRL_CUMUL_DEFAULT	0xC0
 	u_int8_t subpage;
 #define	SLS_SUPPORTED_SUBPAGES_SUBPAGE	0xff
 	u_int8_t reserved;
 	u_int8_t paramptr[2];
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 struct scsi_log_select
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 /*	SLS_SP				0x01 */
 #define	SLS_PCR				0x02
 	u_int8_t page;
 /*	SLS_PAGE_CTRL_MASK		0xC0 */
 /*	SLS_PAGE_CTRL_THRESHOLD		0x00 */
 /*	SLS_PAGE_CTRL_CUMULATIVE	0x40 */
 /*	SLS_PAGE_CTRL_THRESH_DEFAULT	0x80 */
 /*	SLS_PAGE_CTRL_CUMUL_DEFAULT	0xC0 */
 	u_int8_t reserved[4];
 	u_int8_t length[2];
 	u_int8_t control;
 };
 
 /*
  * Four-byte log page header: page byte, subpage byte and a two-byte data
  * length.  The SL_* masks defined next to "page" split that byte into a
  * 6-bit page code (0x3F) and two flag bits (SL_SPF 0x40, SL_DS 0x80).
  */
 struct scsi_log_header
 {
 	u_int8_t page;
 #define	SL_PAGE_CODE			0x3F
 #define	SL_SPF				0x40
 #define	SL_DS				0x80
 	u_int8_t subpage;
 	u_int8_t datalen[2];
 };
 
 struct scsi_log_param_header {
 	u_int8_t param_code[2];
 	u_int8_t param_control;
 #define	SLP_LP				0x01
 #define	SLP_LBIN			0x02
 #define	SLP_TMC_MASK			0x0C
 #define	SLP_TMC_ALWAYS			0x00
 #define	SLP_TMC_EQUAL			0x04
 #define	SLP_TMC_NOTEQUAL		0x08
 #define	SLP_TMC_GREATER			0x0C
 #define	SLP_ETC				0x10
 #define	SLP_TSD				0x20
 #define	SLP_DS				0x40
 #define	SLP_DU				0x80
 	u_int8_t param_len;
 };
 
 struct scsi_log_media_pct_used {
 	struct scsi_log_param_header hdr;
 #define	SLP_SS_MEDIA_PCT_USED		0x0001
 	uint8_t reserved[3];
 	uint8_t pct_used;
 };
 
 struct scsi_log_stat_and_perf {
 	struct scsi_log_param_header hdr;
 #define	SLP_SAP				0x0001
 	uint8_t	read_num[8];
 	uint8_t	write_num[8];
 	uint8_t	recvieved_lba[8];
 	uint8_t	transmitted_lba[8];
 	uint8_t	read_int[8];
 	uint8_t	write_int[8];
 	uint8_t	weighted_num[8];
 	uint8_t	weighted_int[8];
 };
 
 struct scsi_log_idle_time {
 	struct scsi_log_param_header hdr;
 #define	SLP_IT				0x0002
 	uint8_t	idle_int[8];
 };
 
 struct scsi_log_time_interval {
 	struct scsi_log_param_header hdr;
 #define	SLP_TI				0x0003
 	uint8_t	exponent[4];
 	uint8_t	integer[4];
 };
 
 struct scsi_log_fua_stat_and_perf {
 	struct scsi_log_param_header hdr;
 #define	SLP_FUA_SAP			0x0004
 	uint8_t	fua_read_num[8];
 	uint8_t	fua_write_num[8];
 	uint8_t	fuanv_read_num[8];
 	uint8_t	fuanv_write_num[8];
 	uint8_t	fua_read_int[8];
 	uint8_t	fua_write_int[8];
 	uint8_t	fuanv_read_int[8];
 	uint8_t	fuanv_write_int[8];
 };
 
 struct scsi_log_informational_exceptions {
 	struct scsi_log_param_header hdr;
 #define	SLP_IE_GEN			0x0000
 	uint8_t	ie_asc;
 	uint8_t	ie_ascq;
 	uint8_t	temperature;
 };
 
+struct scsi_log_temperature {
+	struct scsi_log_param_header hdr;
+#define	SLP_TEMPERATURE			0x0000
+#define	SLP_REFTEMPERATURE		0x0001
+	uint8_t	reserved;
+	uint8_t	temperature;
+};
+
 struct scsi_control_page {
 	u_int8_t page_code;
 	u_int8_t page_length;
 	u_int8_t rlec;
 #define	SCP_RLEC			0x01	/*Report Log Exception Cond*/
 #define	SCP_GLTSD			0x02	/*Global Logging target
 						  save disable */
 #define	SCP_DSENSE			0x04	/*Descriptor Sense */
 #define	SCP_DPICZ			0x08	/*Disable Prot. Info Check
 						  if Prot. Field is Zero */
 #define	SCP_TMF_ONLY			0x10	/*TM Functions Only*/
 #define	SCP_TST_MASK			0xE0	/*Task Set Type Mask*/
 #define	SCP_TST_ONE			0x00	/*One Task Set*/
 #define	SCP_TST_SEPARATE		0x20	/*Separate Task Sets*/
 	u_int8_t queue_flags;
 #define	SCP_QUEUE_ALG_MASK		0xF0
 #define	SCP_QUEUE_ALG_RESTRICTED	0x00
 #define	SCP_QUEUE_ALG_UNRESTRICTED	0x10
 #define	SCP_NUAR			0x08	/*No UA on release*/
 #define	SCP_QUEUE_ERR			0x02	/*Queued I/O aborted for CACs*/
 #define	SCP_QUEUE_DQUE			0x01	/*Queued I/O disabled*/
 	u_int8_t eca_and_aen;
 #define	SCP_EECA			0x80	/*Enable Extended CA*/
 #define	SCP_RAC				0x40	/*Report a check*/
 #define	SCP_SWP				0x08	/*Software Write Protect*/
 #define	SCP_RAENP			0x04	/*Ready AEN Permission*/
 #define	SCP_UAAENP			0x02	/*UA AEN Permission*/
 #define	SCP_EAENP			0x01	/*Error AEN Permission*/
 	u_int8_t flags4;
 #define	SCP_ATO				0x80	/*Application tag owner*/
 #define	SCP_TAS				0x40	/*Task aborted status*/
 #define	SCP_ATMPE			0x20	/*Application tag mode page*/
 #define	SCP_RWWP			0x10	/*Reject write without prot*/
 	u_int8_t aen_holdoff_period[2];
 	u_int8_t busy_timeout_period[2];
 	u_int8_t extended_selftest_completion_time[2];
 };
 
 struct scsi_control_ext_page {
 	uint8_t page_code;
 #define SCEP_PAGE_CODE			0x0a
 	uint8_t subpage_code;
 #define SCEP_SUBPAGE_CODE		0x01
 	uint8_t page_length[2];
 	uint8_t flags;
 #define	SCEP_TCMOS			0x04	/* Timestamp Changeable by */
 #define	SCEP_SCSIP			0x02	/* SCSI Precedence (clock) */
 #define	SCEP_IALUAE			0x01	/* Implicit ALUA Enabled */
 	uint8_t prio;
 	uint8_t max_sense;
 	uint8_t reserve[25];
 };
 
 /* Older cache page layout; 12 bytes of single-byte fields. */
 struct scsi_cache_page {
 	uint8_t page_code;
 #define	SCHP_PAGE_SAVABLE		0x80	/* Page is savable */
 	uint8_t page_length;
 	uint8_t cache_flags;
 	/* Bits of 'cache_flags', lowest bit first. */
 #define	SCHP_FLAGS_RCD			0x01	/* Read Cache Disable */
 #define	SCHP_FLAGS_MF			0x02	/* Multiplication factor */
 #define	SCHP_FLAGS_WCE			0x04	/* Write Cache Enable */
 	uint8_t rw_cache_policy;
 	uint8_t dis_prefetch[2];
 	uint8_t min_prefetch[2];
 	uint8_t max_prefetch[2];
 	uint8_t max_prefetch_ceil[2];
 };
 
 /*
  * XXX KDM
  * Newer (SBC-era) version of the cache page.  It still needs to be
  * brought up to SBC-3 and reconciled with struct scsi_cache_page.
  * The fields below add up to 20 bytes.
  */
 struct scsi_caching_page {
 	uint8_t page_code;
 #define	SMS_CACHING_PAGE		0x08
 	uint8_t page_length;
 	uint8_t flags1;
 	/* Bits of 'flags1', lowest bit first. */
 #define	SCP_RCD		0x01
 #define	SCP_MF		0x02
 #define	SCP_WCE		0x04
 #define	SCP_SIZE	0x08
 #define	SCP_DISC	0x10
 #define	SCP_CAP		0x20
 #define	SCP_ABPF	0x40
 #define	SCP_IC		0x80
 	uint8_t ret_priority;
 	uint8_t disable_pf_transfer_len[2];
 	uint8_t min_prefetch[2];
 	uint8_t max_prefetch[2];
 	uint8_t max_pf_ceiling[2];
 	uint8_t flags2;
 	/* Bits of 'flags2', lowest bit first. */
 #define	SCP_VS2		0x08
 #define	SCP_VS1		0x10
 #define	SCP_DRA		0x20
 #define	SCP_LBCSS	0x40
 #define	SCP_FSW		0x80
 	uint8_t cache_segments;
 	uint8_t cache_seg_size[2];
 	uint8_t reserved;
 	uint8_t non_cache_seg_size[3];
 };
 
 /* Informational exceptions page; the fields add up to 12 bytes. */
 struct scsi_info_exceptions_page {
 	uint8_t page_code;
 #define	SIEP_PAGE_SAVABLE		0x80	/* Page is savable */
 	uint8_t page_length;
 	uint8_t info_flags;
 	/* Bits of 'info_flags', lowest bit first. */
 #define	SIEP_FLAGS_LOGERR		0x01
 #define	SIEP_FLAGS_EBACKERR		0x02
 #define	SIEP_FLAGS_TEST			0x04
 #define	SIEP_FLAGS_DEXCPT		0x08
 #define	SIEP_FLAGS_EWASC		0x10
 #define	SIEP_FLAGS_EBF			0x20
 #define	SIEP_FLAGS_PERF			0x80
 	uint8_t mrie;
 	/* Values defined for 'mrie'. */
 #define	SIEP_MRIE_NO		0x00
 #define	SIEP_MRIE_UA		0x02
 #define	SIEP_MRIE_REC_COND	0x03
 #define	SIEP_MRIE_REC_UNCOND	0x04
 #define	SIEP_MRIE_NO_SENSE	0x05
 #define	SIEP_MRIE_ON_REQ	0x06
 	uint8_t interval_timer[4];
 	uint8_t report_count[4];
 };
 
 /*
  * One 8-byte descriptor carried at the end of
  * struct scsi_logical_block_provisioning_page.
  */
 struct scsi_logical_block_provisioning_page_descr {
 	uint8_t flags;
 	/* Bits and sub-fields of 'flags'. */
 #define	SLBPPD_ENABLED		0x80
 #define	SLBPPD_TYPE_MASK	0x38
 #define	SLBPPD_ARMING_MASK	0x07
 #define	SLBPPD_ARMING_DEC	0x02
 #define	SLBPPD_ARMING_INC	0x01
 	uint8_t resource;
 	uint8_t reserved[2];
 	uint8_t count[4];	/* 4-byte array; byte order not shown here */
 };
 
 /*
  * Logical block provisioning page: a 16-byte fixed header followed by
  * a variable number of descriptors.
  */
 struct scsi_logical_block_provisioning_page {
 	uint8_t page_code;
 	uint8_t subpage_code;
 	uint8_t page_length[2];
 	uint8_t flags;
 #define	SLBPP_SITUA		0x01
 	uint8_t reserved[11];
 	/* Zero-length trailing array: descriptors start right after the header. */
 	struct scsi_logical_block_provisioning_page_descr descr[0];
 };
 
 /*
  * SCSI protocol identifier values, current as of SPC4r36l.
  * Two pairs are aliases for the same value: ISCSI/iSCSI (0x05) and
  * ADT/ADITP (0x07).
  */
 #define	SCSI_PROTO_FC		0x00	/* Fibre Channel */
 #define	SCSI_PROTO_SPI		0x01	/* Parallel SCSI */
 #define	SCSI_PROTO_SSA		0x02	/* Serial Storage Arch. */
 #define	SCSI_PROTO_1394		0x03	/* IEEE 1394 (Firewire) */
 #define	SCSI_PROTO_RDMA		0x04	/* SCSI RDMA Protocol */
 #define	SCSI_PROTO_ISCSI	0x05	/* Internet SCSI */
 #define	SCSI_PROTO_iSCSI	0x05	/* Internet SCSI */
 #define	SCSI_PROTO_SAS		0x06	/* SAS Serial SCSI Protocol */
 #define	SCSI_PROTO_ADT		0x07	/* Automation/Drive Int. Trans. Prot.*/
 #define	SCSI_PROTO_ADITP	0x07	/* Automation/Drive Int. Trans. Prot.*/
 #define	SCSI_PROTO_ATA		0x08	/* AT Attachment Interface */
 #define	SCSI_PROTO_UAS		0x09	/* USB Attached SCSI */
 #define	SCSI_PROTO_SOP		0x0a	/* SCSI over PCI Express */
 #define	SCSI_PROTO_NONE		0x0f	/* No specific protocol */
 
 /*
  * Fixed 3-byte head of a protocol specific page.  The SPSP_PROTO_*
  * names simply forward to the SCSI_PROTO_* identifiers.
  */
 struct scsi_proto_specific_page {
 	uint8_t page_code;
 #define	SPSP_PAGE_SAVABLE		0x80	/* Page is savable */
 	uint8_t page_length;
 	uint8_t protocol;
 	/* Values for 'protocol'. */
 #define	SPSP_PROTO_FC			SCSI_PROTO_FC
 #define	SPSP_PROTO_SPI			SCSI_PROTO_SPI
 #define	SPSP_PROTO_SSA			SCSI_PROTO_SSA
 #define	SPSP_PROTO_1394			SCSI_PROTO_1394
 #define	SPSP_PROTO_RDMA			SCSI_PROTO_RDMA
 #define	SPSP_PROTO_ISCSI		SCSI_PROTO_ISCSI
 #define	SPSP_PROTO_SAS			SCSI_PROTO_SAS
 #define	SPSP_PROTO_ADT			SCSI_PROTO_ADITP
 #define	SPSP_PROTO_ATA			SCSI_PROTO_ATA
 #define	SPSP_PROTO_UAS			SCSI_PROTO_UAS
 #define	SPSP_PROTO_SOP			SCSI_PROTO_SOP
 #define	SPSP_PROTO_NONE			SCSI_PROTO_NONE
 };
 
 /* 6-byte reserve command block. */
 struct scsi_reserve {
 	uint8_t opcode;
 	uint8_t byte2;
 	/* Bits and sub-fields of 'byte2'. */
 #define	SR_EXTENT	0x01
 #define	SR_ID_MASK	0x0e
 #define	SR_3RDPTY	0x10
 #define	SR_LUN_MASK	0xe0
 	uint8_t resv_id;
 	uint8_t length[2];
 	uint8_t control;
 };
 
 /* 10-byte reserve command block. */
 struct scsi_reserve_10 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 	/* Bits of 'byte2'. */
 #define	SR10_3RDPTY	0x10
 #define	SR10_LONGID	0x02
 #define	SR10_EXTENT	0x01
 	uint8_t resv_id;
 	uint8_t thirdparty_id;
 	uint8_t reserved[3];
 	uint8_t length[2];
 	uint8_t control;
 };
 
 
 /* 6-byte release command block. */
 struct scsi_release {
 	uint8_t opcode;
 	uint8_t byte2;
 	uint8_t resv_id;
 	uint8_t unused[1];
 	uint8_t length;
 	uint8_t control;
 };
 
 /* 10-byte release command block; same field layout as struct scsi_reserve_10. */
 struct scsi_release_10 {
 	uint8_t opcode;
 	uint8_t byte2;
 	uint8_t resv_id;
 	uint8_t thirdparty_id;
 	uint8_t reserved[3];
 	uint8_t length[2];
 	uint8_t control;
 };
 
 /* 6-byte prevent/allow command block. */
 struct scsi_prevent {
 	uint8_t opcode;
 	uint8_t byte2;
 	uint8_t unused[2];
 	uint8_t how;		/* presumably PR_PREVENT or PR_ALLOW -- confirm */
 	uint8_t control;
 };
 #define	PR_ALLOW   0x00
 #define	PR_PREVENT 0x01
 
 /* 10-byte synchronize cache command block. */
 struct scsi_sync_cache {
 	uint8_t opcode;
 	uint8_t byte2;
 	/* Bits of 'byte2'. */
 #define	SSC_RELADR	0x01
 #define	SSC_IMMED	0x02
 	uint8_t begin_lba[4];
 	uint8_t reserved;
 	uint8_t lb_count[2];
 	uint8_t control;
 };
 
 /* 16-byte synchronize cache command block (8-byte LBA, 4-byte count). */
 struct scsi_sync_cache_16 {
 	uint8_t opcode;
 	uint8_t byte2;
 	uint8_t begin_lba[8];
 	uint8_t lb_count[4];
 	uint8_t reserved;
 	uint8_t control;
 };
 
 /* 6-byte format command block. */
 struct scsi_format {
 	uint8_t opcode;
 	uint8_t byte2;
 	/* Flag bits of 'byte2'. */
 #define	SF_LONGLIST		0x20
 #define	SF_FMTDATA		0x10
 #define	SF_CMPLIST		0x08
 	/* Low three bits of 'byte2' select one of the SF_FORMAT_* values. */
 #define	SF_FORMAT_MASK		0x07
 #define	SF_FORMAT_BLOCK		0x00
 #define	SF_FORMAT_LONG_BLOCK	0x03
 #define	SF_FORMAT_BFI		0x04
 #define	SF_FORMAT_PHYS		0x05
 	uint8_t vendor;
 	uint8_t interleave[2];
 	uint8_t control;
 };
 
 /* Short (4-byte) format header with a 2-byte defect list length. */
 struct scsi_format_header_short {
 	uint8_t reserved;
 	uint8_t byte2;
 	/*
 	 * NOTE(review): the SF_DATA_* bits were listed under 'reserved';
 	 * they look like flag bits of 'byte2' -- confirm against SBC.
 	 */
 #define	SF_DATA_VS	0x01
 #define	SF_DATA_IMMED	0x02
 #define	SF_DATA_DSP	0x04
 #define	SF_DATA_IP	0x08
 #define	SF_DATA_STPF	0x10
 #define	SF_DATA_DCRT	0x20
 #define	SF_DATA_DPRY	0x40
 #define	SF_DATA_FOV	0x80
 	uint8_t defect_list_len[2];
 };
 
 /* Long (8-byte) format header with a 4-byte defect list length. */
 struct scsi_format_header_long {
 	uint8_t reserved;
 	uint8_t byte2;
 	uint8_t reserved2[2];
 	uint8_t defect_list_len[4];
 };
 
 /* 10-byte change definition command block. */
 struct scsi_changedef {
 	uint8_t opcode;
 	uint8_t byte2;
 	uint8_t unused1;
 	uint8_t how;
 	uint8_t unused[4];
 	uint8_t datalen;
 	uint8_t control;
 };
 
 /* 10-byte read buffer command block; the RWB_MODE_* values live in byte2. */
 struct scsi_read_buffer {
 	uint8_t opcode;
 	uint8_t byte2;
 	/* Low five bits of 'byte2' carry the mode. */
 #define	RWB_MODE		0x1F
 #define	RWB_MODE_HDR_DATA	0x00
 #define	RWB_MODE_VENDOR		0x01
 #define	RWB_MODE_DATA		0x02
 #define	RWB_MODE_DESCR		0x03
 #define	RWB_MODE_DOWNLOAD	0x04
 #define	RWB_MODE_DOWNLOAD_SAVE	0x05
 #define	RWB_MODE_ECHO		0x0A
 #define	RWB_MODE_ECHO_DESCR	0x0B
 #define	RWB_MODE_ERROR_HISTORY	0x1C
 	uint8_t buffer_id;
 	uint8_t offset[3];
 	uint8_t length[3];
 	uint8_t control;
 };
 
 /* 16-byte read buffer command block (8-byte offset, 4-byte length). */
 struct scsi_read_buffer_16 {
 	uint8_t opcode;
 	uint8_t byte2;
 	uint8_t offset[8];
 	uint8_t length[4];
 	uint8_t buffer_id;
 	uint8_t control;
 };
 
 /* 10-byte write buffer command block; same field layout as scsi_read_buffer. */
 struct scsi_write_buffer {
 	uint8_t opcode;
 	uint8_t byte2;
 	uint8_t buffer_id;
 	uint8_t offset[3];
 	uint8_t length[3];
 	uint8_t control;
 };
 
 /* 16-byte read attribute command block. */
 struct scsi_read_attribute {
 	uint8_t opcode;
 	uint8_t service_action;
 	/* Service action values; SRA_SA_MASK covers the low five bits. */
 #define	SRA_SA_ATTR_VALUES		0x00
 #define	SRA_SA_ATTR_LIST		0x01
 #define	SRA_SA_LOG_VOL_LIST		0x02
 #define	SRA_SA_PART_LIST		0x03
 #define	SRA_SA_RESTRICTED		0x04
 #define	SRA_SA_SUPPORTED_ATTRS		0x05
 #define	SRA_SA_MASK			0x1f
 	uint8_t element[2];
 	uint8_t elem_type;
 	uint8_t logical_volume;
 	uint8_t reserved1;
 	uint8_t partition;
 	uint8_t first_attribute[2];
 	uint8_t length[4];
 	uint8_t cache;
 #define	SRA_CACHE			0x01
 	uint8_t control;
 };
 
 /* 16-byte write attribute command block. */
 struct scsi_write_attribute {
 	uint8_t opcode;
 	uint8_t byte2;
 #define	SWA_WTC				0x01
 	uint8_t element[3];
 	uint8_t logical_volume;
 	uint8_t reserved1;
 	uint8_t partition;
 	uint8_t reserved2[2];
 	uint8_t length[4];
 	uint8_t reserved3;
 	uint8_t control;
 };
 
 
 /* 4-byte length header; attribute data begins at 'attribute_0'. */
 struct scsi_read_attribute_values {
 	uint8_t length[4];
 	uint8_t attribute_0[0];
 };
 
 /*
  * 5-byte header placed in front of each attribute: a 2-byte id, a
  * format/flags byte and a 2-byte length, followed by the attribute data.
  */
 struct scsi_mam_attribute_header {
 	uint8_t id[2];
 	/*
 	 * Attribute identifiers, taken from SPC-4r36g (section 7.4.2.2)
 	 * and SSC-4r03 (section 4.2.21).
 	 */
 	/* Device attributes: 0x0000 - 0x03ff. */
 #define	SMA_ATTR_ID_DEVICE_MIN		0x0000
 #define	SMA_ATTR_REM_CAP_PARTITION	0x0000
 #define	SMA_ATTR_MAX_CAP_PARTITION	0x0001
 #define	SMA_ATTR_TAPEALERT_FLAGS	0x0002
 #define	SMA_ATTR_LOAD_COUNT		0x0003
 #define	SMA_ATTR_MAM_SPACE_REMAINING	0x0004
 #define	SMA_ATTR_DEV_ASSIGNING_ORG	0x0005
 #define	SMA_ATTR_FORMAT_DENSITY_CODE	0x0006
 #define	SMA_ATTR_INITIALIZATION_COUNT	0x0007
 #define	SMA_ATTR_VOLUME_ID		0x0008
 #define	SMA_ATTR_VOLUME_CHANGE_REF	0x0009
 #define	SMA_ATTR_DEV_SERIAL_LAST_LOAD	0x020a
 #define	SMA_ATTR_DEV_SERIAL_LAST_LOAD_1	0x020b
 #define	SMA_ATTR_DEV_SERIAL_LAST_LOAD_2	0x020c
 #define	SMA_ATTR_DEV_SERIAL_LAST_LOAD_3	0x020d
 #define	SMA_ATTR_TOTAL_MB_WRITTEN_LT	0x0220
 #define	SMA_ATTR_TOTAL_MB_READ_LT	0x0221
 #define	SMA_ATTR_TOTAL_MB_WRITTEN_CUR	0x0222
 #define	SMA_ATTR_TOTAL_MB_READ_CUR	0x0223
 #define	SMA_ATTR_FIRST_ENC_BLOCK	0x0224
 #define	SMA_ATTR_NEXT_UNENC_BLOCK	0x0225
 #define	SMA_ATTR_MEDIUM_USAGE_HIST	0x0340
 #define	SMA_ATTR_PART_USAGE_HIST	0x0341
 #define	SMA_ATTR_ID_DEVICE_MAX		0x03ff
 	/* Medium attributes: 0x0400 - 0x07ff. */
 #define	SMA_ATTR_ID_MEDIUM_MIN		0x0400
 #define	SMA_ATTR_MED_MANUF		0x0400
 #define	SMA_ATTR_MED_SERIAL		0x0401
 #define	SMA_ATTR_MED_LENGTH		0x0402
 #define	SMA_ATTR_MED_WIDTH		0x0403
 #define	SMA_ATTR_MED_ASSIGNING_ORG	0x0404
 #define	SMA_ATTR_MED_DENSITY_CODE	0x0405
 #define	SMA_ATTR_MED_MANUF_DATE		0x0406
 #define	SMA_ATTR_MAM_CAPACITY		0x0407
 #define	SMA_ATTR_MED_TYPE		0x0408
 #define	SMA_ATTR_MED_TYPE_INFO		0x0409
 #define	SMA_ATTR_MED_SERIAL_NUM		0x040a
 #define	SMA_ATTR_ID_MEDIUM_MAX		0x07ff
 	/* Host attributes: 0x0800 - 0x0bff. */
 #define	SMA_ATTR_ID_HOST_MIN		0x0800
 #define	SMA_ATTR_APP_VENDOR		0x0800
 #define	SMA_ATTR_APP_NAME		0x0801
 #define	SMA_ATTR_APP_VERSION		0x0802
 #define	SMA_ATTR_USER_MED_TEXT_LABEL	0x0803
 #define	SMA_ATTR_LAST_WRITTEN_TIME	0x0804
 #define	SMA_ATTR_TEXT_LOCAL_ID		0x0805
 #define	SMA_ATTR_BARCODE		0x0806
 #define	SMA_ATTR_HOST_OWNER_NAME	0x0807
 #define	SMA_ATTR_MEDIA_POOL		0x0808
 #define	SMA_ATTR_PART_USER_LABEL	0x0809
 #define	SMA_ATTR_LOAD_UNLOAD_AT_PART	0x080a
 #define	SMA_ATTR_APP_FORMAT_VERSION	0x080b
 #define	SMA_ATTR_VOL_COHERENCY_INFO	0x080c
 #define	SMA_ATTR_ID_HOST_MAX		0x0bff
 	/* Vendor-specific ranges for device, medium and host attributes. */
 #define	SMA_ATTR_VENDOR_DEVICE_MIN	0x0c00
 #define	SMA_ATTR_VENDOR_DEVICE_MAX	0x0fff
 #define	SMA_ATTR_VENDOR_MEDIUM_MIN	0x1000
 #define	SMA_ATTR_VENDOR_MEDIUM_MAX	0x13ff
 #define	SMA_ATTR_VENDOR_HOST_MIN	0x1400
 #define	SMA_ATTR_VENDOR_HOST_MAX	0x17ff
 	uint8_t byte2;
 	/* Low two bits of 'byte2' give the format; the top bit is read-only. */
 #define	SMA_FORMAT_BINARY	0x00
 #define	SMA_FORMAT_ASCII	0x01
 #define	SMA_FORMAT_TEXT		0x02
 #define	SMA_FORMAT_MASK		0x03
 #define	SMA_READ_ONLY		0x80
 	uint8_t length[2];
 	uint8_t attribute[0];
 };
 
 /* 4-byte length header; the attribute list begins at 'first_attr_0'. */
 struct scsi_attrib_list_header {
 	uint8_t length[4];
 	uint8_t first_attr_0[0];
 };
 
 /* 4-byte logical volume list: length, first volume number and volume count. */
 struct scsi_attrib_lv_list {
 	uint8_t length[2];
 	uint8_t first_lv_number;
 	uint8_t num_logical_volumes;
 };
 
 /* 40-byte attribute value: an 8-byte vendor field then a 32-byte serial number. */
 struct scsi_attrib_vendser {
 	uint8_t vendor[8];
 	uint8_t serial_num[32];
 };
 
 /*
  * These values are used to decode the Volume Coherency Information
  * Attribute (0x080c) for LTFS-format coherency information.
  * Although the Application Client Specific lengths are different for
  * Version 0 and Version 1, the data is in fact the same.  The length
  * difference was due to a code bug.
  *
  * SCSI_LTFS_STR_LEN is the length of SCSI_LTFS_STR_NAME without its
  * terminating NUL.
  */
 #define	SCSI_LTFS_VER0_LEN	42
 #define	SCSI_LTFS_VER1_LEN	43
 #define	SCSI_LTFS_UUID_LEN	36
 #define	SCSI_LTFS_STR_NAME	"LTFS"
 #define	SCSI_LTFS_STR_LEN	4
 
 /* Single-bit flags; they can be OR'ed together. */
 typedef enum {
 	SCSI_ATTR_FLAG_NONE		= 0,
 	SCSI_ATTR_FLAG_HEX		= 1 << 0,
 	SCSI_ATTR_FLAG_FP		= 1 << 1,
 	SCSI_ATTR_FLAG_DIV_10		= 1 << 2,
 	SCSI_ATTR_FLAG_FP_1DIGIT	= 1 << 3
 } scsi_attrib_flags;
 
 /*
  * Output flags, packed as three sub-fields of one value: text handling
  * (bits 0-1), non-ASCII handling (bits 2-3) and field selection (bits 4-7).
  */
 typedef enum {
 	SCSI_ATTR_OUTPUT_NONE		= 0x00,
 	/* Text sub-field. */
 	SCSI_ATTR_OUTPUT_TEXT_MASK	= 0x03,
 	SCSI_ATTR_OUTPUT_TEXT_RAW	= 0x00,
 	SCSI_ATTR_OUTPUT_TEXT_ESC	= 0x01,
 	SCSI_ATTR_OUTPUT_TEXT_RSV1	= 0x02,
 	SCSI_ATTR_OUTPUT_TEXT_RSV2	= 0x03,
 	/* Non-ASCII sub-field. */
 	SCSI_ATTR_OUTPUT_NONASCII_MASK	= 0x0c,
 	SCSI_ATTR_OUTPUT_NONASCII_TRIM	= 0x00,
 	SCSI_ATTR_OUTPUT_NONASCII_ESC	= 0x04,
 	SCSI_ATTR_OUTPUT_NONASCII_RAW	= 0x08,
 	SCSI_ATTR_OUTPUT_NONASCII_RSV1	= 0x0c,
 	/* Field selection sub-field; one bit per field. */
 	SCSI_ATTR_OUTPUT_FIELD_MASK	= 0xf0,
 	SCSI_ATTR_OUTPUT_FIELD_ALL	= 0xf0,
 	SCSI_ATTR_OUTPUT_FIELD_NONE	= 0x00,
 	SCSI_ATTR_OUTPUT_FIELD_DESC	= 0x10,
 	SCSI_ATTR_OUTPUT_FIELD_NUM	= 0x20,
 	SCSI_ATTR_OUTPUT_FIELD_SIZE	= 0x40,
 	SCSI_ATTR_OUTPUT_FIELD_RW	= 0x80
 } scsi_attrib_output_flags;
 
 struct sbuf;
 
 /*
  * One row of an attribute table: an attribute id, flags, a description
  * and suffix string, and two callbacks that convert the attribute to a
  * string (to_str) and parse it back from one (parse_str).
  */
 struct scsi_attrib_table_entry {
 	uint32_t id;
 	uint32_t flags;
 	const char *desc;
 	const char *suffix;
 	int (*to_str)(struct sbuf *sb,
 		      struct scsi_mam_attribute_header *hdr,
 		      uint32_t valid_len,
 		      uint32_t flags,
 		      uint32_t output_flags,
 		      char *error_str,
 		      int error_str_len);
 	int (*parse_str)(char *str,
 			 struct scsi_mam_attribute_header *hdr,
 			 uint32_t alloc_len,
 			 uint32_t flags,
 			 char *error_str,
 			 int error_str_len);
 };
 
 /* 6-byte read/write command block with a 3-byte address. */
 struct scsi_rw_6 {
 	uint8_t opcode;
 	uint8_t addr[3];
 	/* Only 5 bits of the most significant address byte are valid. */
 #define	SRW_TOPADDR	0x1F
 	uint8_t length;
 	uint8_t control;
 };
 
 /* 10-byte read/write command block with a 4-byte address. */
 struct scsi_rw_10 {
 	uint8_t opcode;
 	uint8_t byte2;
 	/* NOTE(review): SRW10_* look like bits of 'byte2' -- confirm. */
 #define	SRW10_RELADDR	0x01
 #define	SRW10_EBP	0x04	/* EBP defined for WRITE(10) only */
 #define	SRW10_FUA	0x08
 #define	SRW10_DPO	0x10
 	uint8_t addr[4];
 	uint8_t reserved;
 	uint8_t length[2];
 	uint8_t control;
 };
 
 /* 12-byte read/write command block with 4-byte address and length. */
 struct scsi_rw_12 {
 	uint8_t opcode;
 	uint8_t byte2;
 	/* NOTE(review): SRW12_* look like bits of 'byte2' -- confirm. */
 #define	SRW12_RELADDR	0x01
 #define	SRW12_FUA	0x08
 #define	SRW12_DPO	0x10
 	uint8_t addr[4];
 	uint8_t length[4];
 	uint8_t reserved;
 	uint8_t control;
 };
 
 /* 16-byte read/write command block with an 8-byte address. */
 struct scsi_rw_16 {
 	uint8_t opcode;
 	uint8_t byte2;
 	/* NOTE(review): SRW16_* look like bits of 'byte2' -- confirm. */
 #define	SRW16_RELADDR	0x01
 #define	SRW16_FUA	0x08
 #define	SRW16_DPO	0x10
 	uint8_t addr[8];
 	uint8_t length[4];
 	uint8_t reserved;
 	uint8_t control;
 };
 
 /* 16-byte atomic write command block: 8-byte address, 2-byte boundary and length. */
 struct scsi_write_atomic_16 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 	uint8_t	addr[8];
 	uint8_t	boundary[2];
 	uint8_t	length[2];
 	uint8_t	group;
 	uint8_t	control;
 };
 
 /* 10-byte write same command block. */
 struct scsi_write_same_10 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 	/* Bits of 'byte2'. */
 #define	SWS_LBDATA	0x02
 #define	SWS_PBDATA	0x04
 #define	SWS_UNMAP	0x08
 #define	SWS_ANCHOR	0x10
 	uint8_t	addr[4];
 	uint8_t	group;
 	uint8_t	length[2];
 	uint8_t	control;
 };
 
 /* 16-byte write same command block; adds the SWS_NDOB bit in byte2. */
 struct scsi_write_same_16 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 #define	SWS_NDOB	0x01
 	uint8_t	addr[8];
 	uint8_t	length[4];
 	uint8_t	group;
 	uint8_t	control;
 };
 
 /* 10-byte unmap command block. */
 struct scsi_unmap {
 	uint8_t	opcode;
 	uint8_t	byte2;
 #define	SU_ANCHOR	0x01
 	uint8_t	reserved[4];
 	uint8_t	group;
 	uint8_t	length[2];
 	uint8_t	control;
 };
 
 /* 8-byte unmap data header: overall length and descriptor length. */
 struct scsi_unmap_header {
 	uint8_t	length[2];
 	uint8_t	desc_length[2];
 	uint8_t	reserved[4];
 };
 
 /* 16-byte unmap descriptor: 8-byte LBA and 4-byte length. */
 struct scsi_unmap_desc {
 	uint8_t	lba[8];
 	uint8_t	length[4];
 	uint8_t	reserved[4];
 };
 
 /* 10-byte write-and-verify command block. */
 struct scsi_write_verify_10
 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 	/* Bits and sub-fields of 'byte2'. */
 #define	SWV_BYTCHK		0x02
 #define	SWV_DPO			0x10
 #define	SWV_WRPROTECT_MASK	0xe0
 	/* Historical misspelling, kept so existing users keep compiling. */
 #define	SWV_WRPROECT_MASK	SWV_WRPROTECT_MASK
 	uint8_t	addr[4];
 	uint8_t	group;
 	uint8_t length[2];
 	uint8_t	control;
 };
 
 /* 12-byte write-and-verify command block with 4-byte address and length. */
 struct scsi_write_verify_12 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 	uint8_t	addr[4];
 	uint8_t	length[4];
 	uint8_t	group;
 	uint8_t	control;
 };
 
 /* 16-byte write-and-verify command block with an 8-byte address. */
 struct scsi_write_verify_16 {
 	uint8_t	opcode;
 	uint8_t	byte2;
 	uint8_t	addr[8];
 	uint8_t	length[4];
 	uint8_t	group;
 	uint8_t	control;
 };
 
 
 /* 6-byte start/stop unit command block. */
 struct scsi_start_stop_unit {
 	uint8_t opcode;
 	uint8_t byte2;
 #define	SSS_IMMED		0x01
 	uint8_t reserved[2];
 	uint8_t how;
 	/* Low bits of 'how'. */
 #define	SSS_START		0x01
 #define	SSS_LOEJ		0x02
 	/* High nibble of 'how' selects one of the SSS_PC_* values. */
 #define	SSS_PC_MASK		0xf0
 #define	SSS_PC_START_VALID	0x00
 #define	SSS_PC_ACTIVE		0x10
 #define	SSS_PC_IDLE		0x20
 #define	SSS_PC_STANDBY		0x30
 #define	SSS_PC_LU_CONTROL	0x70
 #define	SSS_PC_FORCE_IDLE_0	0xa0
 #define	SSS_PC_FORCE_STANDBY_0	0xb0
 	uint8_t control;
 };
 
 /* 12-byte ATA pass-through command block. */
 struct ata_pass_12 {
 	u_int8_t opcode;
 	u_int8_t protocol;
 	/* Protocol values sit in bits 1-4 of 'protocol' (AP_PROTO_MASK). */
 #define	AP_PROTO_HARD_RESET	(0x00 << 1)
 #define	AP_PROTO_SRST		(0x01 << 1)
 #define	AP_PROTO_NON_DATA	(0x03 << 1)
 #define	AP_PROTO_PIO_IN		(0x04 << 1)
 #define	AP_PROTO_PIO_OUT	(0x05 << 1)
 #define	AP_PROTO_DMA		(0x06 << 1)
 #define	AP_PROTO_DMA_QUEUED	(0x07 << 1)
 #define	AP_PROTO_DEVICE_DIAG	(0x08 << 1)
 #define	AP_PROTO_DEVICE_RESET	(0x09 << 1)
 #define	AP_PROTO_UDMA_IN	(0x0a << 1)
 #define	AP_PROTO_UDMA_OUT	(0x0b << 1)
 #define	AP_PROTO_FPDMA		(0x0c << 1)
 #define	AP_PROTO_RESP_INFO	(0x0f << 1)
 #define AP_PROTO_MASK		0x1e
 #define	AP_MULTI	0xe0
 	u_int8_t flags;
 #define	AP_T_LEN	0x03
 #define	AP_BB		0x04
 #define	AP_T_DIR	0x08
 #define	AP_CK_COND	0x20
 	/*
 	 * The off-line field is the top two bits of 'flags'.  The previous
 	 * value, 0x60, overlapped AP_CK_COND and missed bit 7.
 	 */
 #define	AP_OFFLINE	0xc0
 	u_int8_t features;
 	u_int8_t sector_count;
 	u_int8_t lba_low;
 	u_int8_t lba_mid;
 	u_int8_t lba_high;
 	u_int8_t device;
 	u_int8_t command;
 	u_int8_t reserved;
 	u_int8_t control;
 };
 
 /* 12-byte maintenance-in command block; byte2 carries the service action. */
 struct scsi_maintenance_in {
 	uint8_t  opcode;
 	uint8_t  byte2;
 #define SERVICE_ACTION_MASK  0x1f
 #define SA_RPRT_TRGT_GRP     0x0a
 	uint8_t  reserved[4];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /* 12-byte report supported operation codes command block. */
 struct scsi_report_supported_opcodes {
 	uint8_t  opcode;
 	uint8_t  service_action;
 	uint8_t  options;
 	/* Top bit of 'options', then the 3-bit reporting option field. */
 #define RSO_RCTD		0x80
 #define RSO_OPTIONS_MASK	0x07
 #define RSO_OPTIONS_ALL		0x00
 #define RSO_OPTIONS_OC		0x01
 #define RSO_OPTIONS_OC_SA	0x02
 #define RSO_OPTIONS_OC_ASA	0x03
 	uint8_t  requested_opcode;
 	uint8_t  requested_service_action[2];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /* 12-byte timeout descriptor: nominal and recommended times, 4 bytes each. */
 struct scsi_report_supported_opcodes_timeout {
 	uint8_t  length[2];
 	uint8_t  reserved;
 	uint8_t  cmd_specific;
 	uint8_t  nominal_time[4];
 	uint8_t  recommended_time[4];
 };
 
 /*
  * 8-byte command descriptor, optionally followed by a timeout
  * descriptor (see RSO_CTDP in 'flags').
  */
 struct scsi_report_supported_opcodes_descr {
 	uint8_t  opcode;
 	uint8_t  reserved;
 	uint8_t  service_action[2];
 	uint8_t  reserved2;
 	uint8_t  flags;
 	/* Bits and sub-fields of 'flags'. */
 #define RSO_SERVACTV		0x01
 #define RSO_CTDP		0x02
 #define RSO_CDLP_MASK		0x0c
 #define RSO_CDLP_NO		0x00
 #define RSO_CDLP_A		0x04
 #define RSO_CDLP_B		0x08
 	uint8_t  cdb_length[2];
 	struct scsi_report_supported_opcodes_timeout timeout[0];
 };
 
 /* 4-byte length header followed by the command descriptors. */
 struct scsi_report_supported_opcodes_all {
 	uint8_t  length[4];
 	struct scsi_report_supported_opcodes_descr descr[0];
 };
 
 /* 4-byte header for a single-command report, followed by 'cdb_usage' bytes. */
 struct scsi_report_supported_opcodes_one {
 	uint8_t  reserved;
 	uint8_t  support;
 	/* Top bit of 'support', then the CDLP and support sub-fields. */
 #define RSO_ONE_CTDP		0x80
 #define RSO_ONE_CDLP_MASK	0x18
 #define RSO_ONE_CDLP_NO		0x00
 #define RSO_ONE_CDLP_A		0x08
 #define RSO_ONE_CDLP_B		0x10
 #define RSO_ONE_SUP_MASK	0x07
 #define RSO_ONE_SUP_UNAVAIL	0x00
 #define RSO_ONE_SUP_NOT_SUP	0x01
 #define RSO_ONE_SUP_AVAIL	0x03
 #define RSO_ONE_SUP_VENDOR	0x05
 	uint8_t  cdb_length[2];
 	uint8_t  cdb_usage[];
 };
 
 /* 12-byte report supported task management functions command block. */
 struct scsi_report_supported_tmf {
 	uint8_t  opcode;
 	uint8_t  service_action;
 	uint8_t  options;
 #define RST_REPD		0x80
 	uint8_t  reserved[3];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /* 4-byte data block: two bytes of support bits, a reserved byte and a length. */
 struct scsi_report_supported_tmf_data {
 	uint8_t  byte1;
 	/* Bits of 'byte1'. */
 #define RST_WAKES		0x01
 #define RST_TRS			0x02
 #define RST_QTS			0x04
 #define RST_LURS		0x08
 #define RST_CTSS		0x10
 #define RST_CACAS		0x20
 #define RST_ATSS		0x40
 #define RST_ATS			0x80
 	uint8_t  byte2;
 	/* Bits of 'byte2'. */
 #define RST_ITNRS		0x01
 #define RST_QTSS		0x02
 #define RST_QAES		0x04
 	uint8_t  reserved;
 	uint8_t  length;
 };
 
 /*
  * 16-byte extended data block: the 4-byte basic block, then timeout
  * selector bits and two 4-byte timeouts.
  */
 struct scsi_report_supported_tmf_ext_data {
 	uint8_t  byte1;
 	uint8_t  byte2;
 	uint8_t  reserved;
 	uint8_t  length;
 	uint8_t  byte5;
 #define RST_TMFTMOV		0x01
 	uint8_t  reserved2;
 	uint8_t  byte7;
 	/* Bits of 'byte7'. */
 #define RST_WAKETS		0x01
 #define RST_TRTS		0x02
 #define RST_QTTS		0x04
 #define RST_LURTS		0x08
 #define RST_CTSTS		0x10
 #define RST_CACATS		0x20
 #define RST_ATSTS		0x40
 #define RST_ATTS		0x80
 	uint8_t  byte8;
 	/* Bits of 'byte8'. */
 #define RST_ITNRTS		0x01
 #define RST_QTSTS		0x02
 #define RST_QAETS		0x04
 	uint8_t  long_timeout[4];
 	uint8_t  short_timeout[4];
 };
 
 /* 12-byte report timestamp command block. */
 struct scsi_report_timestamp {
 	uint8_t  opcode;
 	uint8_t  service_action;
 	uint8_t  reserved[4];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /* 12-byte timestamp data block: length, origin, and a 6-byte timestamp. */
 struct scsi_report_timestamp_data
 {
 	uint8_t  length[2];
 	uint8_t  origin;
 	/*
 	 * The timestamp origin is the low three bits of 'origin'.  The mask
 	 * used to be 0x00, which made every masked test evaluate to zero.
 	 */
 #define RTS_ORIG_MASK		0x07
 #define RTS_ORIG_ZERO		0x00
 #define RTS_ORIG_SET		0x02
 #define RTS_ORIG_OUTSIDE	0x03
 	uint8_t  reserved;
 	uint8_t  timestamp[6];
 	uint8_t  reserve2[2];
 };
 
 /* 16-byte receive copy status command block with a 1-byte list identifier. */
 struct scsi_receive_copy_status_lid1 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RCS_LID1		0x00
 	uint8_t  list_identifier;
 	uint8_t  reserved[7];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /* 12-byte copy status data block. */
 struct scsi_receive_copy_status_lid1_data {
 	uint8_t  available_data[4];
 	uint8_t  copy_command_status;
 	/* Values for 'copy_command_status'. */
 #define RCS_CCS_INPROG		0x00
 #define RCS_CCS_COMPLETED	0x01
 #define RCS_CCS_ERROR		0x02
 	uint8_t  segments_processed[2];
 	uint8_t  transfer_count_units;
 	/* Values for 'transfer_count_units'. */
 #define RCS_TC_BYTES		0x00
 #define RCS_TC_KBYTES		0x01
 #define RCS_TC_MBYTES		0x02
 #define RCS_TC_GBYTES		0x03
 #define RCS_TC_TBYTES		0x04
 #define RCS_TC_PBYTES		0x05
 #define RCS_TC_EBYTES		0x06
 #define RCS_TC_LBAS		0xf1
 	uint8_t  transfer_count[4];
 };
 
 /* 16-byte receive copy failure details command block. */
 struct scsi_receive_copy_failure_details {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RCFD		0x04
 	uint8_t  list_identifier;
 	uint8_t  reserved[7];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /* 60-byte fixed header followed by variable-length sense data. */
 struct scsi_receive_copy_failure_details_data {
 	uint8_t  available_data[4];
 	uint8_t  reserved[52];
 	uint8_t  copy_command_status;
 	uint8_t  reserved2;
 	uint8_t  sense_data_length[2];
 	uint8_t  sense_data[];
 };
 
 /* 16-byte receive copy status command block with a 4-byte list identifier. */
 struct scsi_receive_copy_status_lid4 {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RCS_LID4		0x05
 	uint8_t  list_identifier[4];
 	uint8_t  reserved[4];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /* 32-byte fixed header followed by variable-length sense data. */
 struct scsi_receive_copy_status_lid4_data {
 	uint8_t  available_data[4];
 	uint8_t  response_to_service_action;
 	uint8_t  copy_command_status;
 	/* Further 'copy_command_status' values, beyond the basic RCS_CCS_* set. */
 #define RCS_CCS_COMPLETED_PROD	0x03
 #define RCS_CCS_COMPLETED_RESID	0x04
 #define RCS_CCS_INPROG_FGBG	0x10
 #define RCS_CCS_INPROG_FG	0x11
 #define RCS_CCS_INPROG_BG	0x12
 #define RCS_CCS_ABORTED		0x60
 	uint8_t  operation_counter[2];
 	uint8_t  estimated_status_update_delay[4];
 	uint8_t  extended_copy_completion_status;
 	uint8_t  length_of_the_sense_data_field;
 	uint8_t  sense_data_length;
 	uint8_t  transfer_count_units;
 	uint8_t  transfer_count[8];
 	uint8_t  segments_processed[2];
 	uint8_t  reserved[6];
 	uint8_t  sense_data[];
 };
 
 /* 16-byte receive copy operating parameters command block. */
 struct scsi_receive_copy_operating_parameters {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RCOP		0x03
 	uint8_t  reserved[8];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /*
  * 44-byte fixed block of copy limits and granularities, followed by a
  * list of implemented descriptor type codes.
  */
 struct scsi_receive_copy_operating_parameters_data {
 	uint8_t  length[4];
 	uint8_t  snlid;
 #define RCOP_SNLID		0x01
 	uint8_t  reserved[3];
 	uint8_t  maximum_cscd_descriptor_count[2];
 	uint8_t  maximum_segment_descriptor_count[2];
 	uint8_t  maximum_descriptor_list_length[4];
 	uint8_t  maximum_segment_length[4];
 	uint8_t  maximum_inline_data_length[4];
 	uint8_t  held_data_limit[4];
 	uint8_t  maximum_stream_device_transfer_size[4];
 	uint8_t  reserved2[2];
 	uint8_t  total_concurrent_copies[2];
 	uint8_t  maximum_concurrent_copies;
 	uint8_t  data_segment_granularity;
 	uint8_t  inline_data_granularity;
 	uint8_t  held_data_granularity;
 	uint8_t  reserved3[3];
 	uint8_t  implemented_descriptor_list_length;
 	uint8_t  list_of_implemented_descriptor_type_codes[0];
 };
 
 /* 16-byte extended copy command block. */
 struct scsi_extended_copy {
 	uint8_t  opcode;
 	uint8_t  service_action;
 	/* Service actions for the LID1 and LID4 variants. */
 #define EC_EC_LID1		0x00
 #define EC_EC_LID4		0x01
 	uint8_t  reserved[8];
 	uint8_t  length[4];
 	uint8_t  reserved1;
 	uint8_t  control;
 };
 
 /* 4-byte trailer of a CSCD descriptor: flags and a 3-byte block length. */
 struct scsi_ec_cscd_dtsp {
 	uint8_t  flags;
 #define EC_CSCD_FIXED		0x01
 #define EC_CSCD_PAD		0x04
 	uint8_t  block_length[3];
 };
 
 /*
  * Generic CSCD descriptor: a 4-byte header, 24 bytes of type-specific
  * parameters, and a struct scsi_ec_cscd_dtsp trailer.
  */
 struct scsi_ec_cscd {
 	uint8_t  type_code;
 #define EC_CSCD_EXT		0xff
 	uint8_t  luidt_pdt;
 	/* Bits and sub-fields of 'luidt_pdt'. */
 #define EC_NUL			0x20
 #define EC_LUIDT_MASK		0xc0
 #define EC_LUIDT_LUN		0x00
 #define EC_LUIDT_PROXY_TOKEN	0x40
 	uint8_t  relative_initiator_port[2];
 	uint8_t  cscd_params[24];
 	struct scsi_ec_cscd_dtsp dtsp;
 };
 
 /*
  * Identification CSCD descriptor (type code 0xe4): the generic 4-byte
  * header, a designator of up to 20 bytes with its own 4-byte header,
  * and a struct scsi_ec_cscd_dtsp trailer.
  */
 struct scsi_ec_cscd_id {
 	uint8_t  type_code;
 #define EC_CSCD_ID		0xe4
 	uint8_t  luidt_pdt;
 	uint8_t  relative_initiator_port[2];
 	uint8_t  codeset;
 	uint8_t  id_type;
 	uint8_t  reserved;
 	uint8_t  length;
 	uint8_t  designator[20];
 	struct scsi_ec_cscd_dtsp dtsp;
 };
 
 /* Generic 4-byte segment descriptor header followed by type-specific parameters. */
 struct scsi_ec_segment {
 	uint8_t  type_code;
 	uint8_t  flags;
 	/* Bits of 'flags'. */
 #define EC_SEG_CAT		0x01
 #define EC_SEG_DC		0x02
 	uint8_t  descr_length[2];
 	uint8_t  params[];
 };
 
 /* 28-byte segment descriptor, type code 0x02: source/destination CSCD and LBAs. */
 struct scsi_ec_segment_b2b {
 	uint8_t  type_code;
 #define EC_SEG_B2B		0x02
 	uint8_t  flags;
 	uint8_t  descr_length[2];
 	uint8_t  src_cscd[2];
 	uint8_t  dst_cscd[2];
 	uint8_t  reserved[2];
 	uint8_t  number_of_blocks[2];
 	uint8_t  src_lba[8];
 	uint8_t  dst_lba[8];
 };
 
 /* 12-byte segment descriptor, type code 0x07. */
 struct scsi_ec_segment_verify {
 	uint8_t  type_code;
 #define EC_SEG_VERIFY		0x07
 	uint8_t  reserved;
 	uint8_t  descr_length[2];
 	uint8_t  src_cscd[2];
 	uint8_t  reserved2[2];
 	uint8_t  tur;
 	uint8_t  reserved3[3];
 };
 
 /* 28-byte segment descriptor, type code 0x14: two 8-byte reservation keys. */
 struct scsi_ec_segment_register_key {
 	uint8_t  type_code;
 #define EC_SEG_REGISTER_KEY	0x14
 	uint8_t  reserved;
 	uint8_t  descr_length[2];
 	uint8_t  reserved2[2];
 	uint8_t  dst_cscd[2];
 	uint8_t  res_key[8];
 	uint8_t  sa_res_key[8];
 	uint8_t  reserved3[4];
 };
 
 /* 16-byte parameter list header (LID1 form), followed by the list data. */
 struct scsi_extended_copy_lid1_data {
 	uint8_t  list_identifier;
 	uint8_t  flags;
 	/* Sub-fields of 'flags': priority, list id usage, and the STR bit. */
 #define EC_PRIORITY		0x07
 #define EC_LIST_ID_USAGE_MASK	0x18
 #define EC_LIST_ID_USAGE_FULL	0x08
 #define EC_LIST_ID_USAGE_NOHOLD	0x10
 #define EC_LIST_ID_USAGE_NONE	0x18
 #define EC_STR			0x20
 	uint8_t  cscd_list_length[2];
 	uint8_t  reserved[4];
 	uint8_t  segment_list_length[4];
 	uint8_t  inline_data_length[4];
 	uint8_t  data[];
 };
 
 /* 48-byte parameter list header (LID4 form), followed by the list data. */
 struct scsi_extended_copy_lid4_data {
 	uint8_t  list_format;
 #define EC_LIST_FORMAT		0x01
 	uint8_t  flags;
 	uint8_t  header_cscd_list_length[2];
 	uint8_t  reserved[11];
 	uint8_t  flags2;
 	/* Bits of 'flags2'. */
 #define EC_IMMED		0x01
 #define EC_G_SENSE		0x02
 	uint8_t  header_cscd_type_code;
 	uint8_t  reserved2[3];
 	uint8_t  list_identifier[4];
 	uint8_t  reserved3[18];
 	uint8_t  cscd_list_length[2];
 	uint8_t  segment_list_length[2];
 	uint8_t  inline_data_length[2];
 	uint8_t  data[];
 };
 
 /* 16-byte copy operation abort command block. */
 struct scsi_copy_operation_abort {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define EC_COA			0x1c
 	uint8_t  list_identifier[4];
 	uint8_t  reserved[9];
 	uint8_t  control;
 };
 
 /* 16-byte populate token command block. */
 struct scsi_populate_token {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define EC_PT			0x10
 	uint8_t  reserved[4];
 	uint8_t  list_identifier[4];
 	uint8_t  length[4];
 	uint8_t  group_number;
 	uint8_t  control;
 };
 
 /* 16-byte block range descriptor: 8-byte LBA and 4-byte length. */
 struct scsi_range_desc {
 	uint8_t	lba[8];
 	uint8_t	length[4];
 	uint8_t	reserved[4];
 };
 
 /* 16-byte parameter list header followed by block range descriptors. */
 struct scsi_populate_token_data {
 	uint8_t  length[2];
 	uint8_t  flags;
 	/* Bits of 'flags'. */
 #define EC_PT_IMMED			0x01
 #define EC_PT_RTV			0x02
 	uint8_t  reserved;
 	uint8_t  inactivity_timeout[4];
 	uint8_t  rod_type[4];
 	uint8_t  reserved2[2];
 	uint8_t  range_descriptor_length[2];
 	struct scsi_range_desc desc[];
 };
 
 /* 16-byte write using token command block. */
 struct scsi_write_using_token {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define EC_WUT			0x11
 	uint8_t  reserved[4];
 	uint8_t  list_identifier[4];
 	uint8_t  length[4];
 	uint8_t  group_number;
 	uint8_t  control;
 };
 
 /*
  * 536-byte parameter list header, including a 512-byte ROD token,
  * followed by block range descriptors.
  */
 struct scsi_write_using_token_data {
 	uint8_t  length[2];
 	uint8_t  flags;
 	/* Bits of 'flags'. */
 #define EC_WUT_IMMED			0x01
 #define EC_WUT_DEL_TKN			0x02
 	uint8_t  reserved[5];
 	uint8_t  offset_into_rod[8];
 	uint8_t  rod_token[512];
 	uint8_t  reserved2[6];
 	uint8_t  range_descriptor_length[2];
 	struct scsi_range_desc desc[];
 };
 
 /* 16-byte receive ROD token information command block. */
 struct scsi_receive_rod_token_information {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RRTI		0x07
 	uint8_t  list_identifier[4];
 	uint8_t  reserved[4];
 	uint8_t  length[4];
 	uint8_t  reserved2;
 	uint8_t  control;
 };
 
 /* 8-byte ROD token header (4-byte type, 2-byte length) followed by the body. */
 struct scsi_token {
 	uint8_t  type[4];
 	/* 32-bit ROD type values. */
 #define ROD_TYPE_INTERNAL	0x00000000
 #define ROD_TYPE_AUR		0x00010000
 #define ROD_TYPE_PIT_DEF	0x00800000
 #define ROD_TYPE_PIT_VULN	0x00800001
 #define ROD_TYPE_PIT_PERS	0x00800002
 #define ROD_TYPE_PIT_ANY	0x0080FFFF
 #define ROD_TYPE_BLOCK_ZERO	0xFFFF0001
 	uint8_t  reserved[2];
 	uint8_t  length[2];
 	uint8_t  body[0];
 };
 
 /* 16-byte report all ROD tokens command block. */
 struct scsi_report_all_rod_tokens {
 	uint8_t  opcode;
 	uint8_t  service_action;
 #define RCS_RART		0x08
 	uint8_t  reserved[8];
 	uint8_t  length[4];
 	uint8_t  reserved2;
 	uint8_t  control;
 };
 
 /* 8-byte header followed by the ROD management token list. */
 struct scsi_report_all_rod_tokens_data {
 	uint8_t  available_data[4];
 	uint8_t  reserved[4];
 	uint8_t  rod_management_token_list[];
 };
 
 /*
  * 16-byte ATA pass-through command block.  Each register has an _ext
  * byte ahead of its base byte.
  */
 struct ata_pass_16 {
 	uint8_t opcode;
 	uint8_t protocol;
 #define	AP_EXTEND	0x01
 	uint8_t flags;
 	/* Transfer length selector, bits 0-1 of 'flags'. */
 #define	AP_FLAG_TLEN_NO_DATA	(0 << 0)
 #define	AP_FLAG_TLEN_FEAT	(1 << 0)
 #define	AP_FLAG_TLEN_SECT_CNT	(2 << 0)
 #define	AP_FLAG_TLEN_STPSIU	(3 << 0)
 	/* Byte/block selector, bit 2. */
 #define	AP_FLAG_BYT_BLOK_BYTES	(0 << 2)
 #define	AP_FLAG_BYT_BLOK_BLOCKS	(1 << 2)
 	/* Transfer direction, bit 3. */
 #define	AP_FLAG_TDIR_TO_DEV	(0 << 3)
 #define	AP_FLAG_TDIR_FROM_DEV	(1 << 3)
 	/* Check condition, bit 5. */
 #define	AP_FLAG_CHK_COND	(1 << 5)
 	uint8_t features_ext;
 	uint8_t features;
 	uint8_t sector_count_ext;
 	uint8_t sector_count;
 	uint8_t lba_low_ext;
 	uint8_t lba_low;
 	uint8_t lba_mid_ext;
 	uint8_t lba_mid;
 	uint8_t lba_high_ext;
 	uint8_t lba_high;
 	uint8_t device;
 	uint8_t command;
 	uint8_t control;
 };
 
 /*
  * 32-byte ATA pass-through command block.  Unlike the 12- and 16-byte
  * forms, 'control' is the second byte and the LBA, features and count
  * are multi-byte arrays.
  */
 struct ata_pass_32 {
 	uint8_t opcode;
 	uint8_t control;
 	uint8_t reserved1[5];
 	uint8_t length;
 	uint8_t service_action[2];
 	/* 16-bit service action value for this command. */
 #define	ATA_PASS_32_SA		0x1ff0
 	uint8_t protocol;
 	uint8_t flags;
 	uint8_t reserved2[2];
 	uint8_t lba[6];
 	uint8_t features[2];
 	uint8_t count[2];
 	uint8_t device;
 	uint8_t command;
 	uint8_t reserved3;
 	uint8_t icc;
 	uint8_t auxiliary[4];
 };
 
 
 /*
  * NOTE(review): presumably values for the 'how' byte of
  * struct scsi_changedef -- no user is visible here; confirm.
  */
 #define	SC_SCSI_1 0x01
 #define	SC_SCSI_2 0x03
 
 /*
  * Opcodes
  *
  * First-byte command codes, listed mostly in ascending numeric order.
  * START_STOP_UNIT and START_STOP are two names for the same value (0x1B).
  */
 
 #define	TEST_UNIT_READY		0x00
 #define	REQUEST_SENSE		0x03
 #define	READ_6			0x08
 #define	WRITE_6			0x0A
 #define	INQUIRY			0x12
 #define	MODE_SELECT_6		0x15
 #define	MODE_SENSE_6		0x1A
 #define	START_STOP_UNIT		0x1B
 #define	START_STOP		0x1B
 #define	RESERVE      		0x16
 #define	RELEASE      		0x17
 #define	RECEIVE_DIAGNOSTIC	0x1C
 #define	SEND_DIAGNOSTIC		0x1D
 #define	PREVENT_ALLOW		0x1E
 #define	READ_CAPACITY		0x25
 #define	READ_10			0x28
 #define	WRITE_10		0x2A
 #define	POSITION_TO_ELEMENT	0x2B
 #define	WRITE_VERIFY_10		0x2E
 #define	VERIFY_10		0x2F
 #define	SYNCHRONIZE_CACHE	0x35
 #define	READ_DEFECT_DATA_10	0x37
 #define	WRITE_BUFFER            0x3B
 #define	READ_BUFFER             0x3C
 #define	CHANGE_DEFINITION	0x40
 #define	WRITE_SAME_10		0x41
 #define	UNMAP			0x42
 #define	LOG_SELECT		0x4C
 #define	LOG_SENSE		0x4D
 #define	MODE_SELECT_10		0x55
 #define	RESERVE_10		0x56
 #define	RELEASE_10		0x57
 #define	MODE_SENSE_10		0x5A
 #define	PERSISTENT_RES_IN	0x5E
 #define	PERSISTENT_RES_OUT	0x5F
 #define	EXTENDED_CDB		0x7E
 #define	VARIABLE_LEN_CDB	0x7F
 #define	EXTENDED_COPY		0x83
 #define	RECEIVE_COPY_STATUS	0x84
 #define	ATA_PASS_16		0x85
 #define	READ_16			0x88
 #define	COMPARE_AND_WRITE	0x89
 #define	WRITE_16		0x8A
 #define	READ_ATTRIBUTE		0x8C
 #define	WRITE_ATTRIBUTE		0x8D
 #define	WRITE_VERIFY_16		0x8E
 #define	VERIFY_16		0x8F
 #define	SYNCHRONIZE_CACHE_16	0x91
 #define	WRITE_SAME_16		0x93
 #define	READ_BUFFER_16		0x9B
 #define	WRITE_ATOMIC_16		0x9C
 #define	SERVICE_ACTION_IN	0x9E
 #define	REPORT_LUNS		0xA0
 #define	ATA_PASS_12		0xA1
 #define	SECURITY_PROTOCOL_IN	0xA2
 #define	MAINTENANCE_IN		0xA3
 #define	MAINTENANCE_OUT		0xA4
 #define	MOVE_MEDIUM     	0xA5
 #define	READ_12			0xA8
 #define	WRITE_12		0xAA
 #define	WRITE_VERIFY_12		0xAE
 #define	VERIFY_12		0xAF
 #define	SECURITY_PROTOCOL_OUT	0xB5
 #define	READ_ELEMENT_STATUS	0xB8
 #define	READ_CD			0xBE
 
 /*
  * Maintenance In Service Action Codes
  *
  * REPORT_IDENTIFYING_INFRMATION is a historical misspelling; it is kept
  * for existing users alongside the correctly spelled name.
  */
 #define	REPORT_IDENTIFYING_INFRMATION		0x05
 #define	REPORT_IDENTIFYING_INFORMATION		REPORT_IDENTIFYING_INFRMATION
 #define	REPORT_TARGET_PORT_GROUPS		0x0A
 #define	REPORT_ALIASES				0x0B
 #define	REPORT_SUPPORTED_OPERATION_CODES	0x0C
 #define	REPORT_SUPPORTED_TASK_MANAGEMENT_FUNCTIONS	0x0D
 #define	REPORT_PRIORITY				0x0E
 #define	REPORT_TIMESTAMP			0x0F
 #define	MANAGEMENT_PROTOCOL_IN			0x10
 /*
  * Maintenance Out Service Action Codes
  *
  * MANGAEMENT_PROTOCOL_OUT is a historical misspelling; it is kept for
  * existing users alongside the correctly spelled name.
  */
 #define	SET_IDENTIFY_INFORMATION		0x06
 #define	SET_TARGET_PORT_GROUPS			0x0A
 #define	CHANGE_ALIASES				0x0B
 #define	SET_PRIORITY				0x0E
 #define	SET_TIMESTAMP				0x0F
 #define	MANGAEMENT_PROTOCOL_OUT			0x10
 #define	MANAGEMENT_PROTOCOL_OUT			MANGAEMENT_PROTOCOL_OUT
 
 /*
  * Device Types
  */
 #define	T_DIRECT	0x00
 #define	T_SEQUENTIAL	0x01
 #define	T_PRINTER	0x02
 #define	T_PROCESSOR	0x03
 #define	T_WORM		0x04
 #define	T_CDROM		0x05
 #define	T_SCANNER	0x06
 #define	T_OPTICAL 	0x07
 #define	T_CHANGER	0x08
 #define	T_COMM		0x09
 #define	T_ASC0		0x0a
 #define	T_ASC1		0x0b
 #define	T_STORARRAY	0x0c
 #define	T_ENCLOSURE	0x0d
 #define	T_RBC		0x0e
 #define	T_OCRW		0x0f
 #define	T_OSD		0x11
 #define	T_ADC		0x12
 #define	T_ZBC_HM	0x14
 #define	T_NODEVICE	0x1f
 #define	T_ANY		0xff	/* Used in Quirk table matches */
 
 #define	T_REMOV		1
 #define	T_FIXED		0
 
 /*
  * This length is the initial inquiry length used by the probe code, as    
  * well as the length necessary for scsi_print_inquiry() to function 
  * correctly.  If either use requires a different length in the future, 
  * the two values should be de-coupled.
  */
 #define	SHORT_INQUIRY_LENGTH	36
 
 struct scsi_inquiry_data
 {
 	u_int8_t device;
 #define	SID_TYPE(inq_data) ((inq_data)->device & 0x1f)
 #define	SID_QUAL(inq_data) (((inq_data)->device & 0xE0) >> 5)
 #define	SID_QUAL_LU_CONNECTED	0x00	/*
 					 * The specified peripheral device
 					 * type is currently connected to
 					 * logical unit.  If the target cannot
 					 * determine whether or not a physical
 					 * device is currently connected, it
 					 * shall also use this peripheral
 					 * qualifier when returning the INQUIRY
 					 * data.  This peripheral qualifier
 					 * does not mean that the device is
 					 * ready for access by the initiator.
 					 */
 #define	SID_QUAL_LU_OFFLINE	0x01	/*
 					 * The target is capable of supporting
 					 * the specified peripheral device type
 					 * on this logical unit; however, the
 					 * physical device is not currently
 					 * connected to this logical unit.
 					 */
 #define	SID_QUAL_RSVD		0x02
 #define	SID_QUAL_BAD_LU		0x03	/*
 					 * The target is not capable of
 					 * supporting a physical device on
 					 * this logical unit. For this
 					 * peripheral qualifier the peripheral
 					 * device type shall be set to 1Fh to
 					 * provide compatibility with previous
 					 * versions of SCSI. All other
 					 * peripheral device type values are
 					 * reserved for this peripheral
 					 * qualifier.
 					 */
 #define	SID_QUAL_IS_VENDOR_UNIQUE(inq_data) ((SID_QUAL(inq_data) & 0x04) != 0)
 	u_int8_t dev_qual2;
 #define	SID_QUAL2	0x7F
 #define	SID_LU_CONG	0x40
 #define	SID_RMB		0x80
 #define	SID_IS_REMOVABLE(inq_data) (((inq_data)->dev_qual2 & SID_RMB) != 0)
 	u_int8_t version;
 #define	SID_ANSI_REV(inq_data) ((inq_data)->version & 0x07)
 #define		SCSI_REV_0		0
 #define		SCSI_REV_CCS		1
 #define		SCSI_REV_2		2
 #define		SCSI_REV_SPC		3
 #define		SCSI_REV_SPC2		4
 #define		SCSI_REV_SPC3		5
 #define		SCSI_REV_SPC4		6
 #define		SCSI_REV_SPC5		7
 
 #define	SID_ECMA	0x38
 #define	SID_ISO		0xC0
 	u_int8_t response_format;
 #define	SID_AENC	0x80
 #define	SID_TrmIOP	0x40
 #define	SID_NormACA	0x20
 #define	SID_HiSup	0x10
 	u_int8_t additional_length;
 #define	SID_ADDITIONAL_LENGTH(iqd)					\
 	((iqd)->additional_length +					\
 	__offsetof(struct scsi_inquiry_data, additional_length) + 1)
 	u_int8_t spc3_flags;
 #define	SPC3_SID_PROTECT	0x01
 #define	SPC3_SID_3PC		0x08
 #define	SPC3_SID_TPGS_MASK	0x30
 #define	SPC3_SID_TPGS_IMPLICIT	0x10
 #define	SPC3_SID_TPGS_EXPLICIT	0x20
 #define	SPC3_SID_ACC		0x40
 #define	SPC3_SID_SCCS		0x80
 	u_int8_t spc2_flags;
 #define	SPC2_SID_ADDR16		0x01
 #define	SPC2_SID_MChngr 	0x08
 #define	SPC2_SID_MultiP 	0x10
 #define	SPC2_SID_EncServ	0x40
 #define	SPC2_SID_BQueue		0x80
 
 #define	INQ_DATA_TQ_ENABLED(iqd)				\
     ((SID_ANSI_REV(iqd) < SCSI_REV_SPC2)? ((iqd)->flags & SID_CmdQue) :	\
     (((iqd)->flags & SID_CmdQue) && !((iqd)->spc2_flags & SPC2_SID_BQueue)) || \
     (!((iqd)->flags & SID_CmdQue) && ((iqd)->spc2_flags & SPC2_SID_BQueue)))
 
 	u_int8_t flags;
 #define	SID_SftRe	0x01
 #define	SID_CmdQue	0x02
 #define	SID_Linked	0x08
 #define	SID_Sync	0x10
 #define	SID_WBus16	0x20
 #define	SID_WBus32	0x40
 #define	SID_RelAdr	0x80
 #define	SID_VENDOR_SIZE   8
 	char	 vendor[SID_VENDOR_SIZE];
 #define	SID_PRODUCT_SIZE  16
 	char	 product[SID_PRODUCT_SIZE];
 #define	SID_REVISION_SIZE 4
 	char	 revision[SID_REVISION_SIZE];
 	/*
 	 * The following fields were taken from SCSI Primary Commands - 2
 	 * (SPC-2) Revision 14, Dated 11 November 1999
 	 */
 #define	SID_VENDOR_SPECIFIC_0_SIZE	20
 	u_int8_t vendor_specific0[SID_VENDOR_SPECIFIC_0_SIZE];
 	/*
 	 * An extension of SCSI Parallel Specific Values
 	 */
 #define	SID_SPI_IUS		0x01
 #define	SID_SPI_QAS		0x02
 #define	SID_SPI_CLOCK_ST	0x00
 #define	SID_SPI_CLOCK_DT	0x04
 #define	SID_SPI_CLOCK_DT_ST	0x0C
 #define	SID_SPI_MASK		0x0F
 	u_int8_t spi3data;
 	u_int8_t reserved2;
 	/*
 	 * Version Descriptors, stored 2 byte values.
 	 */
 	u_int8_t version1[2];
 	u_int8_t version2[2];
 	u_int8_t version3[2];
 	u_int8_t version4[2];
 	u_int8_t version5[2];
 	u_int8_t version6[2];
 	u_int8_t version7[2];
 	u_int8_t version8[2];
 
 	u_int8_t reserved3[22];
 
 #define	SID_VENDOR_SPECIFIC_1_SIZE	160
 	u_int8_t vendor_specific1[SID_VENDOR_SPECIFIC_1_SIZE];
 };
 
 /*
  * This structure is more suited to initiator operation, because the
  * maximum number of supported pages is already allocated.
  */
 struct scsi_vpd_supported_page_list
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_SUPPORTED_PAGE_LIST	0x00
 #define	SVPD_SUPPORTED_PAGES_HDR_LEN	4
 	u_int8_t reserved;
 	u_int8_t length;	/* number of VPD entries */
 #define	SVPD_SUPPORTED_PAGES_SIZE	251
 	u_int8_t list[SVPD_SUPPORTED_PAGES_SIZE];
 };
 
 /*
  * This structure is more suited to target operation, because the
  * number of supported pages is left to the user to allocate.
  *
  * Only the 4-byte header (device, page_code, reserved, length) has a fixed
  * size; page_list is a zero-length trailing array, so the storage for the
  * page entries must be provided by whoever allocates the structure.
  */
 struct scsi_vpd_supported_pages
 {
 	u_int8_t device;
 	u_int8_t page_code;
 	u_int8_t reserved;
 #define	SVPD_SUPPORTED_PAGES	0x00
 	u_int8_t length;
 	u_int8_t page_list[0];	/* variable-length tail; adds no size */
 };
 
 
 struct scsi_vpd_unit_serial_number
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_UNIT_SERIAL_NUMBER	0x80
 	u_int8_t reserved;
 	u_int8_t length; /* serial number length */
 #define	SVPD_SERIAL_NUM_SIZE 251
 	u_int8_t serial_num[SVPD_SERIAL_NUM_SIZE];
 };
 
 /*
  * Device identification VPD page (page code SVPD_DEVICE_ID).  The fixed
  * part is the header made of device, page_code and the two-byte length;
  * desc_list is a flexible array member holding whatever follows it.
  */
 struct scsi_vpd_device_id
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_DEVICE_ID			0x83
 #define	SVPD_DEVICE_ID_MAX_SIZE		252
 	/* Header length: the offset of desc_list within the structure. */
 #define	SVPD_DEVICE_ID_HDR_LEN \
     __offsetof(struct scsi_vpd_device_id, desc_list)
 	u_int8_t length[2];
 	u_int8_t desc_list[];	/* variable-length tail */
 };
 
 struct scsi_vpd_id_descriptor
 {
 	u_int8_t	proto_codeset;
 	/*
 	 * See the SCSI_PROTO definitions above for the protocols.
 	 */
 #define	SVPD_ID_PROTO_SHIFT	4
 #define	SVPD_ID_CODESET_BINARY	0x01
 #define	SVPD_ID_CODESET_ASCII	0x02
 #define	SVPD_ID_CODESET_UTF8	0x03
 #define	SVPD_ID_CODESET_MASK	0x0f
 	u_int8_t	id_type;
 #define	SVPD_ID_PIV		0x80
 #define	SVPD_ID_ASSOC_LUN	0x00
 #define	SVPD_ID_ASSOC_PORT	0x10
 #define	SVPD_ID_ASSOC_TARGET	0x20
 #define	SVPD_ID_ASSOC_MASK	0x30
 #define	SVPD_ID_TYPE_VENDOR	0x00
 #define	SVPD_ID_TYPE_T10	0x01
 #define	SVPD_ID_TYPE_EUI64	0x02
 #define	SVPD_ID_TYPE_NAA	0x03
 #define	SVPD_ID_TYPE_RELTARG	0x04
 #define	SVPD_ID_TYPE_TPORTGRP	0x05
 #define	SVPD_ID_TYPE_LUNGRP	0x06
 #define	SVPD_ID_TYPE_MD5_LUN_ID	0x07
 #define	SVPD_ID_TYPE_SCSI_NAME	0x08
 #define	SVPD_ID_TYPE_PROTO	0x09
 #define	SVPD_ID_TYPE_UUID	0x0a
 #define	SVPD_ID_TYPE_MASK	0x0f
 	u_int8_t	reserved;
 	u_int8_t	length;
 #define	SVPD_DEVICE_ID_DESC_HDR_LEN \
     __offsetof(struct scsi_vpd_id_descriptor, identifier) 
 	u_int8_t	identifier[];
 };
 
 struct scsi_vpd_id_t10
 {
 	u_int8_t	vendor[8];
 	u_int8_t	vendor_spec_id[0];
 };
 
 struct scsi_vpd_id_eui64
 {
 	u_int8_t	ieee_company_id[3];
 	u_int8_t	extension_id[5];
 };
 
 struct scsi_vpd_id_naa_basic
 {
 	uint8_t naa;
 	/* big endian, packed:
 	uint8_t	naa : 4;
 	uint8_t naa_desig : 4;
 	*/
 #define	SVPD_ID_NAA_NAA_SHIFT		4
 #define	SVPD_ID_NAA_IEEE_EXT		0x02
 #define	SVPD_ID_NAA_LOCAL_REG		0x03
 #define	SVPD_ID_NAA_IEEE_REG		0x05
 #define	SVPD_ID_NAA_IEEE_REG_EXT	0x06
 	uint8_t	naa_data[];
 };
 
 struct scsi_vpd_id_naa_ieee_extended_id
 {
 	uint8_t naa;
 	uint8_t vendor_specific_id_a;
 	uint8_t ieee_company_id[3];
 	uint8_t vendor_specific_id_b[4];
 };
 
 struct scsi_vpd_id_naa_local_reg
 {
 	uint8_t naa;
 	uint8_t local_value[7];
 };
 
 struct scsi_vpd_id_naa_ieee_reg
 {
 	uint8_t naa;
 	uint8_t reg_value[7];
 	/* big endian, packed:
 	uint8_t naa_basic : 4;
 	uint8_t ieee_company_id_0 : 4;
 	uint8_t ieee_company_id_1[2];
 	uint8_t ieee_company_id_2 : 4;
 	uint8_t vendor_specific_id_0 : 4;
 	uint8_t vendor_specific_id_1[4];
 	*/
 };
 
 struct scsi_vpd_id_naa_ieee_reg_extended
 {
 	uint8_t naa;
 	uint8_t reg_value[15];
 	/* big endian, packed:
 	uint8_t naa_basic : 4;
 	uint8_t ieee_company_id_0 : 4;
 	uint8_t ieee_company_id_1[2];
 	uint8_t ieee_company_id_2 : 4;
 	uint8_t vendor_specific_id_0 : 4;
 	uint8_t vendor_specific_id_1[4];
 	uint8_t vendor_specific_id_ext[8];
 	*/
 };
 
 struct scsi_vpd_id_rel_trgt_port_id
 {
 	uint8_t obsolete[2];
 	uint8_t rel_trgt_port_id[2];
 };
 
 struct scsi_vpd_id_trgt_port_grp_id
 {
 	uint8_t reserved[2];
 	uint8_t trgt_port_grp[2];
 };
 
 struct scsi_vpd_id_lun_grp_id
 {
 	uint8_t reserved[2];
 	uint8_t log_unit_grp[2];
 };
 
 struct scsi_vpd_id_md5_lun_id
 {
 	uint8_t lun_id[16];
 };
 
 struct scsi_vpd_id_scsi_name
 {
 	uint8_t name_string[256];
 };
 
 struct scsi_service_action_in
 {
 	uint8_t opcode;
 	uint8_t service_action;
 	uint8_t action_dependent[13];
 	uint8_t control;
 };
 
 struct scsi_vpd_extended_inquiry_data
 {
 	uint8_t device;
 	uint8_t page_code;
 #define	SVPD_EXTENDED_INQUIRY_DATA	0x86
 	uint8_t page_length[2];
 	uint8_t flags1;
 
 	/* These values are for direct access devices */
 #define	SVPD_EID_AM_MASK	0xC0
 #define	SVPD_EID_AM_DEFER	0x80
 #define	SVPD_EID_AM_IMMED	0x40
 #define	SVPD_EID_AM_UNDEFINED	0x00
 #define	SVPD_EID_AM_RESERVED	0xc0
 #define	SVPD_EID_SPT		0x38
 #define	SVPD_EID_SPT_1		0x00
 #define	SVPD_EID_SPT_12		0x08
 #define	SVPD_EID_SPT_2		0x10
 #define	SVPD_EID_SPT_13		0x18
 #define	SVPD_EID_SPT_3		0x20
 #define	SVPD_EID_SPT_23		0x28
 #define	SVPD_EID_SPT_123	0x38
 
 	/* These values are for sequential access devices */
 #define	SVPD_EID_SA_SPT_LBP	0x08
 
 #define	SVPD_EID_GRD_CHK	0x04
 #define	SVPD_EID_APP_CHK	0x02
 #define	SVPD_EID_REF_CHK	0x01
 
 	uint8_t flags2;
 #define	SVPD_EID_UASK_SUP	0x20
 #define	SVPD_EID_GROUP_SUP	0x10
 #define	SVPD_EID_PRIOR_SUP	0x08
 #define	SVPD_EID_HEADSUP	0x04
 #define	SVPD_EID_ORDSUP		0x02
 #define	SVPD_EID_SIMPSUP	0x01
 	uint8_t flags3;
 #define	SVPD_EID_WU_SUP		0x08
 #define	SVPD_EID_CRD_SUP	0x04
 #define	SVPD_EID_NV_SUP		0x02
 #define	SVPD_EID_V_SUP		0x01
 	uint8_t flags4;
 #define	SVPD_EID_NO_PI_CHK	0x20
 #define	SVPD_EID_P_I_I_SUP	0x10
 #define	SVPD_EID_LUICLR		0x01
 	uint8_t flags5;
 #define	SVPD_EID_LUCT_MASK	0xe0
 #define	SVPD_EID_LUCT_NOT_REP	0x00
 #define	SVPD_EID_LUCT_CONGL	0x20
 #define	SVPD_EID_LUCT_GROUP	0x40
 #define	SVPD_EID_R_SUP		0x10
 #define	SVPD_EID_RTD_SUP	0x08
 #define	SVPD_EID_HSSRELEF	0x02
 #define	SVPD_EID_CBCS		0x01
 	uint8_t flags6;
 #define	SVPD_EID_MULTI_I_T_FW	0x0F
 #define	SVPD_EID_MC_VENDOR_SPEC	0x00
 #define	SVPD_EID_MC_MODE_1	0x01
 #define	SVPD_EID_MC_MODE_2	0x02
 #define	SVPD_EID_MC_MODE_3	0x03
 	uint8_t est[2];
 	uint8_t flags7;
 #define	SVPD_EID_POA_SUP	0x80
 #define	SVPD_EID_HRA_SUP	0x40
 #define	SVPD_EID_VSA_SUP	0x20
 	uint8_t max_sense_length;
 	uint8_t bind_flags;
 #define	SVPD_EID_IBS		0x80
 #define	SVPD_EID_IAS		0x40
 #define	SVPD_EID_SAC		0x04
 #define	SVPD_EID_NRD1		0x02
 #define	SVPD_EID_NRD0		0x01
 	uint8_t reserved2[49];
 };
 
 struct scsi_vpd_mode_page_policy_descr
 {
 	uint8_t page_code;
 	uint8_t subpage_code;
 	uint8_t policy;
 #define	SVPD_MPP_SHARED		0x00
 #define	SVPD_MPP_PORT		0x01
 #define	SVPD_MPP_I_T		0x03
 #define	SVPD_MPP_MLUS		0x80
 	uint8_t reserved;
 };
 
 struct scsi_vpd_mode_page_policy
 {
 	uint8_t device;
 	uint8_t page_code;
 #define	SVPD_MODE_PAGE_POLICY	0x87
 	uint8_t page_length[2];
 	struct scsi_vpd_mode_page_policy_descr descr[0];
 };
 
 struct scsi_diag_page {
 	uint8_t page_code;
 	uint8_t page_specific_flags;
 	uint8_t length[2];
 	uint8_t params[0];
 };
 
 struct scsi_vpd_port_designation
 {
 	uint8_t reserved[2];
 	uint8_t relative_port_id[2];
 	uint8_t reserved2[2];
 	uint8_t initiator_transportid_length[2];
 	uint8_t initiator_transportid[0];
 };
 
 struct scsi_vpd_port_designation_cont
 {
 	uint8_t reserved[2];
 	uint8_t target_port_descriptors_length[2];
 	struct scsi_vpd_id_descriptor target_port_descriptors[0];
 };
 
 struct scsi_vpd_scsi_ports
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_SCSI_PORTS		0x88
 	u_int8_t page_length[2];
 	struct scsi_vpd_port_designation design[];
 };
 
 /*
  * ATA Information VPD Page based on
  * T10/2126-D Revision 04
  */
 #define SVPD_ATA_INFORMATION		0x89
 
 
 struct scsi_vpd_tpc_descriptor
 {
 	uint8_t desc_type[2];
 	uint8_t desc_length[2];
 	uint8_t parameters[];
 };
 
 struct scsi_vpd_tpc_descriptor_bdrl
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_BDRL			0x0000
 	uint8_t desc_length[2];
 	uint8_t vendor_specific[6];
 	uint8_t maximum_ranges[2];
 	uint8_t maximum_inactivity_timeout[4];
 	uint8_t default_inactivity_timeout[4];
 	uint8_t maximum_token_transfer_size[8];
 	uint8_t optimal_transfer_count[8];
 };
 
 struct scsi_vpd_tpc_descriptor_sc_descr
 {
 	uint8_t opcode;
 	uint8_t sa_length;
 	uint8_t supported_service_actions[0];
 };
 
 struct scsi_vpd_tpc_descriptor_sc
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_SC			0x0001
 	uint8_t desc_length[2];
 	uint8_t list_length;
 	struct scsi_vpd_tpc_descriptor_sc_descr descr[];
 };
 
 struct scsi_vpd_tpc_descriptor_pd
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_PD			0x0004
 	uint8_t desc_length[2];
 	uint8_t reserved[4];
 	uint8_t maximum_cscd_descriptor_count[2];
 	uint8_t maximum_segment_descriptor_count[2];
 	uint8_t maximum_descriptor_list_length[4];
 	uint8_t maximum_inline_data_length[4];
 	uint8_t reserved2[12];
 };
 
 struct scsi_vpd_tpc_descriptor_sd
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_SD			0x0008
 	uint8_t desc_length[2];
 	uint8_t list_length;
 	uint8_t supported_descriptor_codes[];
 };
 
 struct scsi_vpd_tpc_descriptor_sdid
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_SDID			0x000C
 	uint8_t desc_length[2];
 	uint8_t list_length[2];
 	uint8_t supported_descriptor_ids[];
 };
 
 struct scsi_vpd_tpc_descriptor_rtf_block
 {
 	uint8_t type_format;
 #define	SVPD_TPC_RTF_BLOCK			0x00
 	uint8_t reserved;
 	uint8_t desc_length[2];
 	uint8_t reserved2[2];
 	uint8_t optimal_length_granularity[2];
 	uint8_t maximum_bytes[8];
 	uint8_t optimal_bytes[8];
 	uint8_t optimal_bytes_to_token_per_segment[8];
 	uint8_t optimal_bytes_from_token_per_segment[8];
 	uint8_t reserved3[8];
 };
 
 struct scsi_vpd_tpc_descriptor_rtf
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_RTF			0x0106
 	uint8_t desc_length[2];
 	uint8_t remote_tokens;
 	uint8_t reserved[11];
 	uint8_t minimum_token_lifetime[4];
 	uint8_t maximum_token_lifetime[4];
 	uint8_t maximum_token_inactivity_timeout[4];
 	uint8_t reserved2[18];
 	uint8_t type_specific_features_length[2];
 	uint8_t type_specific_features[0];
 };
 
 struct scsi_vpd_tpc_descriptor_srtd
 {
 	uint8_t rod_type[4];
 	uint8_t flags;
 #define	SVPD_TPC_SRTD_TOUT		0x01
 #define	SVPD_TPC_SRTD_TIN		0x02
 #define	SVPD_TPC_SRTD_ECPY		0x80
 	uint8_t reserved;
 	uint8_t preference_indicator[2];
 	uint8_t reserved2[56];
 };
 
 struct scsi_vpd_tpc_descriptor_srt
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_SRT			0x0108
 	uint8_t desc_length[2];
 	uint8_t reserved[2];
 	uint8_t rod_type_descriptors_length[2];
 	uint8_t rod_type_descriptors[0];
 };
 
 struct scsi_vpd_tpc_descriptor_gco
 {
 	uint8_t desc_type[2];
 #define	SVPD_TPC_GCO			0x8001
 	uint8_t desc_length[2];
 	uint8_t total_concurrent_copies[4];
 	uint8_t maximum_identified_concurrent_copies[4];
 	uint8_t maximum_segment_length[4];
 	uint8_t data_segment_granularity;
 	uint8_t inline_data_granularity;
 	uint8_t reserved[18];
 };
 
 struct scsi_vpd_tpc
 {
 	uint8_t device;
 	uint8_t page_code;
 #define	SVPD_SCSI_TPC			0x8F
 	uint8_t page_length[2];
 	struct scsi_vpd_tpc_descriptor descr[];
 };
 
 /*
+ * SCSI Feature Sets VPD Page
+ */
+struct scsi_vpd_sfs
+{
+	uint8_t device;
+	uint8_t page_code;
+#define	SVPD_SCSI_SFS			0x92
+	uint8_t page_length[2];
+	uint8_t reserved[4];
+	uint8_t codes[];
+};
+
+/*
  * Block Device Characteristics VPD Page based on
  * T10/1799-D Revision 31
  */
 struct scsi_vpd_block_characteristics
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define SVPD_BDC			0xB1
 	u_int8_t page_length[2];
 	u_int8_t medium_rotation_rate[2];
 #define SVPD_BDC_RATE_NOT_REPORTED	0x00
 #define SVPD_BDC_RATE_NON_ROTATING	0x01
 	u_int8_t reserved1;
 	u_int8_t nominal_form_factor;
 #define SVPD_BDC_FORM_NOT_REPORTED	0x00
 #define SVPD_BDC_FORM_5_25INCH		0x01
 #define SVPD_BDC_FORM_3_5INCH		0x02
 #define SVPD_BDC_FORM_2_5INCH		0x03
 #define SVPD_BDC_FORM_1_5INCH		0x04
 #define SVPD_BDC_FORM_LESSTHAN_1_5INCH	0x05
 	u_int8_t reserved2[56];
 };
 
 /*
  * Block Device Characteristics VPD Page
  */
 struct scsi_vpd_block_device_characteristics
 {
 	uint8_t device;
 	uint8_t page_code;
 #define	SVPD_BDC		0xB1
 	uint8_t page_length[2];
 	uint8_t medium_rotation_rate[2];
 #define	SVPD_NOT_REPORTED	0x0000
 #define	SVPD_NON_ROTATING	0x0001
 	uint8_t product_type;
 	uint8_t wab_wac_ff;
 	uint8_t flags;
 #define	SVPD_VBULS		0x01
 #define	SVPD_FUAB		0x02
+#define	SVPD_BOCS		0x04
+#define	SVPD_RBWZ		0x08
 #define	SVPD_ZBC_NR		0x00	/* Not Reported */
 #define	SVPD_HAW_ZBC		0x10	/* Host Aware */
 #define	SVPD_DM_ZBC		0x20	/* Drive Managed */
 #define	SVPD_ZBC_MASK		0x30	/* Zoned mask */
-	uint8_t reserved[55];
+	uint8_t reserved[3];
+	uint8_t depopulation_time[4];
+	uint8_t reserved2[48];
 };
 
 /*
  * Evaluates to 1 when 'length' bytes of a Block Device Characteristics
  * page, counted from the start of
  * struct scsi_vpd_block_device_characteristics, are enough to contain all
  * of 'field'; otherwise evaluates to 0.
  *
  * 'bdc' is only used inside sizeof, so it is never evaluated.  Both 'bdc'
  * and 'length' are parenthesized so that arbitrary expressions (such as
  * "&page" or "len & mask") may be passed safely.
  */
 #define SBDC_IS_PRESENT(bdc, length, field)				   \
 	(((length) >= offsetof(struct scsi_vpd_block_device_characteristics, \
 	  field) + sizeof((bdc)->field)) ? 1 : 0)
 
 /*
  * Logical Block Provisioning VPD Page based on
  * T10/1799-D Revision 31
  */
 struct scsi_vpd_logical_block_prov
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_LBP		0xB2
 	u_int8_t page_length[2];
 #define SVPD_LBP_PL_BASIC	0x04
 	u_int8_t threshold_exponent;
 	u_int8_t flags;
 #define SVPD_LBP_UNMAP		0x80
 #define SVPD_LBP_WS16		0x40
 #define SVPD_LBP_WS10		0x20
 #define SVPD_LBP_RZ		0x04
 #define SVPD_LBP_ANC_SUP	0x02
 #define SVPD_LBP_DP		0x01
 	u_int8_t prov_type;
 #define SVPD_LBP_RESOURCE	0x01
 #define SVPD_LBP_THIN		0x02
 	u_int8_t reserved;
 	/*
 	 * Provisioning Group Descriptor can be here if SVPD_LBP_DP is set
 	 * Its size can be determined from page_length - 4
 	 */
 };
 
 /*
  * Block Limits VPD Page based on SBC-4 Revision 2
  */
 struct scsi_vpd_block_limits
 {
 	u_int8_t device;
 	u_int8_t page_code;
 #define	SVPD_BLOCK_LIMITS	0xB0
 	u_int8_t page_length[2];
 #define SVPD_BL_PL_BASIC	0x10
 #define SVPD_BL_PL_TP		0x3C
 	u_int8_t reserved1;
 	u_int8_t max_cmp_write_len;
 	u_int8_t opt_txfer_len_grain[2];
 	u_int8_t max_txfer_len[4];
 	u_int8_t opt_txfer_len[4];
 	u_int8_t max_prefetch[4];
 	u_int8_t max_unmap_lba_cnt[4];
 	u_int8_t max_unmap_blk_cnt[4];
 	u_int8_t opt_unmap_grain[4];
 	u_int8_t unmap_grain_align[4];
 	u_int8_t max_write_same_length[8];
 	u_int8_t max_atomic_transfer_length[4];
 	u_int8_t atomic_alignment[4];
 	u_int8_t atomic_transfer_length_granularity[4];
 	u_int8_t max_atomic_transfer_length_with_atomic_boundary[4];
 	u_int8_t max_atomic_boundary_size[4];
 };
 
 /*
  * Zoned Block Device Characteristics VPD page.
  * From ZBC-r04, dated August 12, 2015.
  */
 struct scsi_vpd_zoned_bdc {
 	/* 4-byte page header: device, page_code and a two-byte length. */
 	uint8_t device;
 	uint8_t page_code;
 #define	SVPD_ZONED_BDC		0xB6
 	uint8_t page_length[2];
 	/* 0x3C equals the number of bytes that follow the header below. */
 #define	SVPD_ZBDC_PL	0x3C
 	uint8_t flags;
 #define	SVPD_ZBDC_URSWRZ	0x01
 	uint8_t reserved1[3];
 	/*
 	 * Three four-byte fields, each with an all-ones special value.
 	 * NOTE(review): "_NR" presumably means "not reported" -- confirm
 	 * against the ZBC specification.
 	 */
 	uint8_t optimal_seq_zones[4];
 #define	SVPD_ZBDC_OPT_SEQ_NR		0xffffffff
 	uint8_t optimal_nonseq_zones[4];
 #define SVPD_ZBDC_OPT_NONSEQ_NR		0xffffffff
 	uint8_t max_seq_req_zones[4];
 #define	SVPD_ZBDC_MAX_SEQ_UNLIMITED	0xffffffff
 	/* Pads the structure out to 64 bytes in total. */
 	uint8_t reserved2[44];
 };
 
 struct scsi_read_capacity
 {
 	u_int8_t opcode;
 	u_int8_t byte2;
 #define	SRC_RELADR	0x01
 	u_int8_t addr[4];
 	u_int8_t unused[2];
 	u_int8_t pmi;
 #define	SRC_PMI		0x01
 	u_int8_t control;
 };
 
 struct scsi_read_capacity_16
 {
 	uint8_t opcode;
 #define	SRC16_SERVICE_ACTION	0x10
 	uint8_t service_action;
 	uint8_t addr[8];
 	uint8_t alloc_len[4];
 #define	SRC16_PMI		0x01
 #define	SRC16_RELADR		0x02
 	uint8_t reladr;
 	uint8_t control;
 };
 
 struct scsi_read_capacity_data
 {
 	u_int8_t addr[4];
 	u_int8_t length[4];
 };
 
 struct scsi_read_capacity_data_long
 {
 	uint8_t addr[8];
 	uint8_t length[4];
 #define	SRC16_PROT_EN		0x01
 #define	SRC16_P_TYPE		0x0e
 #define	SRC16_P_TYPE_SHIFT	1
 #define	SRC16_PTYPE_1		0x00
 #define	SRC16_PTYPE_2		0x02
 #define	SRC16_PTYPE_3		0x04
 	uint8_t prot;
 #define	SRC16_LBPPBE		0x0f
 #define	SRC16_PI_EXPONENT	0xf0
 #define	SRC16_PI_EXPONENT_SHIFT	4
 	uint8_t prot_lbppbe;
 #define	SRC16_LALBA		0x3f
 #define	SRC16_LBPRZ		0x40
 #define	SRC16_LBPME		0x80
 /*
  * Alternate versions of these macros that are intended for use on a 16-bit
  * version of the lalba_lbp field instead of the array of 2 8 bit numbers.
  */
 #define	SRC16_LALBA_A		0x3fff
 #define	SRC16_LBPRZ_A		0x4000
 #define	SRC16_LBPME_A		0x8000
 	uint8_t lalba_lbp[2];
 	uint8_t	reserved[16];
 };
 
 struct scsi_get_lba_status
 {
 	uint8_t opcode;
 #define	SGLS_SERVICE_ACTION	0x12
 	uint8_t service_action;
 	uint8_t addr[8];
 	uint8_t alloc_len[4];
 	uint8_t reserved;
 	uint8_t control;
 };
 
 struct scsi_get_lba_status_data_descr
 {
 	uint8_t addr[8];
 	uint8_t length[4];
 	uint8_t status;
 	uint8_t reserved[3];
 };
 
 struct scsi_get_lba_status_data
 {
 	uint8_t length[4];
 	uint8_t reserved[4];
 	struct scsi_get_lba_status_data_descr descr[];
 };
 
 struct scsi_report_luns
 {
 	uint8_t opcode;
 	uint8_t reserved1;
 #define	RPL_REPORT_DEFAULT	0x00
 #define	RPL_REPORT_WELLKNOWN	0x01
 #define	RPL_REPORT_ALL		0x02
 #define	RPL_REPORT_ADMIN	0x10
 #define	RPL_REPORT_NONSUBSID	0x11
 #define	RPL_REPORT_CONGLOM	0x12
 	uint8_t select_report;
 	uint8_t reserved2[3];
 	uint8_t length[4];
 	uint8_t reserved3;
 	uint8_t control;
 };
 
 struct scsi_report_luns_lundata {
 	uint8_t lundata[8];
 #define	RPL_LUNDATA_PERIPH_BUS_MASK	0x3f
 #define	RPL_LUNDATA_FLAT_LUN_MASK	0x3f
 #define	RPL_LUNDATA_FLAT_LUN_BITS	0x06
 #define	RPL_LUNDATA_LUN_TARG_MASK	0x3f
 #define	RPL_LUNDATA_LUN_BUS_MASK	0xe0
 #define	RPL_LUNDATA_LUN_LUN_MASK	0x1f
 #define	RPL_LUNDATA_EXT_LEN_MASK	0x30
 #define	RPL_LUNDATA_EXT_EAM_MASK	0x0f
 #define	RPL_LUNDATA_EXT_EAM_WK		0x01
 #define	RPL_LUNDATA_EXT_EAM_NOT_SPEC	0x0f
 #define	RPL_LUNDATA_ATYP_MASK	0xc0	/* MBZ for type 0 lun */
 #define	RPL_LUNDATA_ATYP_PERIPH	0x00
 #define	RPL_LUNDATA_ATYP_FLAT	0x40
 #define	RPL_LUNDATA_ATYP_LUN	0x80
 #define	RPL_LUNDATA_ATYP_EXTLUN	0xc0
 };
 
 struct scsi_report_luns_data {
 	u_int8_t length[4];	/* length of LUN inventory, in bytes */
 	u_int8_t reserved[4];	/* unused */
 	/*
 	 * LUN inventory- we only support the type zero form for now.
 	 */
 	struct scsi_report_luns_lundata luns[0];
 };
 
 struct scsi_target_group
 {
 	uint8_t opcode;
 	uint8_t service_action;
 #define	STG_PDF_MASK		0xe0
 #define	STG_PDF_LENGTH		0x00
 #define	STG_PDF_EXTENDED	0x20
 	uint8_t reserved1[4];
 	uint8_t length[4];
 	uint8_t reserved2;
 	uint8_t control;
 };
 
 struct scsi_timestamp
 {
 	uint8_t opcode;
 	uint8_t service_action;
 	uint8_t reserved1[4];
 	uint8_t length[4];
 	uint8_t reserved2;
 	uint8_t control;
 };
 
 struct scsi_set_timestamp_parameters
 {
 	uint8_t reserved1[4];
 	uint8_t timestamp[6];
 	uint8_t reserved2[2];
 };
 
 struct scsi_report_timestamp_parameter_data
 {
 	uint8_t length[2];
 	uint8_t reserved1[2];
 	uint8_t timestamp[6];
 	uint8_t reserved2[2];
 };
 
 struct scsi_target_port_descriptor {
 	uint8_t	reserved[2];
 	uint8_t	relative_target_port_identifier[2];
 	uint8_t desc_list[];
 };
 
 struct scsi_target_port_group_descriptor {
 	uint8_t	pref_state;
 #define	TPG_PRIMARY				0x80
 #define	TPG_ASYMMETRIC_ACCESS_STATE_MASK	0xf
 #define	TPG_ASYMMETRIC_ACCESS_OPTIMIZED		0x0
 #define	TPG_ASYMMETRIC_ACCESS_NONOPTIMIZED	0x1
 #define	TPG_ASYMMETRIC_ACCESS_STANDBY		0x2
 #define	TPG_ASYMMETRIC_ACCESS_UNAVAILABLE	0x3
 #define	TPG_ASYMMETRIC_ACCESS_LBA_DEPENDENT	0x4
 #define	TPG_ASYMMETRIC_ACCESS_OFFLINE		0xE
 #define	TPG_ASYMMETRIC_ACCESS_TRANSITIONING	0xF
 	uint8_t support;
 #define	TPG_AO_SUP	0x01
 #define	TPG_AN_SUP	0x02
 #define	TPG_S_SUP	0x04
 #define	TPG_U_SUP	0x08
 #define	TPG_LBD_SUP	0x10
 #define	TPG_O_SUP	0x40
 #define	TPG_T_SUP	0x80
 	uint8_t target_port_group[2];
 	uint8_t reserved;
 	uint8_t status;
 #define TPG_UNAVLBL      0
 #define TPG_SET_BY_STPG  0x01
 #define TPG_IMPLICIT     0x02
 	uint8_t vendor_specific;
 	uint8_t	target_port_count;
 	struct scsi_target_port_descriptor descriptors[];
 };
 
 struct scsi_target_group_data {
 	uint8_t length[4];	/* length of returned data, in bytes */
 	struct scsi_target_port_group_descriptor groups[];
 };
 
 struct scsi_target_group_data_extended {
 	uint8_t length[4];	/* length of returned data, in bytes */
 	uint8_t format_type;	/* STG_PDF_LENGTH or STG_PDF_EXTENDED */
 	uint8_t	implicit_transition_time;
 	uint8_t reserved[2];
 	struct scsi_target_port_group_descriptor groups[];
 };
 
 struct scsi_security_protocol_in
 {
 	uint8_t opcode;
 	uint8_t security_protocol;
 #define	SPI_PROT_INFORMATION		0x00
 #define	SPI_PROT_CBCS			0x07
 #define	SPI_PROT_TAPE_DATA_ENC		0x20
 #define	SPI_PROT_DATA_ENC_CONFIG	0x21
 #define	SPI_PROT_SA_CREATE_CAP		0x40
 #define	SPI_PROT_IKEV2_SCSI		0x41
 #define	SPI_PROT_JEDEC_UFS		0xEC
 #define	SPI_PROT_SDCARD_TFSSS		0xED
 #define	SPI_PROT_AUTH_HOST_TRANSIENT	0xEE
 #define	SPI_PROT_ATA_DEVICE_PASSWORD	0xEF
 	uint8_t security_protocol_specific[2];
 	uint8_t byte4;
 #define	SPI_INC_512	0x80
 	uint8_t reserved1;
 	uint8_t length[4];
 	uint8_t reserved2;
 	uint8_t control;
 };
 
 struct scsi_security_protocol_out
 {
 	uint8_t opcode;
 	uint8_t security_protocol;
 	uint8_t security_protocol_specific[2];
 	uint8_t byte4;
 #define	SPO_INC_512	0x80
 	uint8_t reserved1;
 	uint8_t length[4];
 	uint8_t reserved2;
 	uint8_t control;
 };
 
 /* Sense data type selector; the numeric values are spelled out. */
 typedef enum {
 	SSD_TYPE_NONE	= 0,
 	SSD_TYPE_FIXED	= 1,
 	SSD_TYPE_DESC	= 2
 } scsi_sense_data_type;
 
 /*
  * Sense element type selector; the numeric values are spelled out.
  * SSD_ELEM_MAX is one past the last real element type.
  */
 typedef enum {
 	SSD_ELEM_NONE		= 0,
 	SSD_ELEM_SKIP		= 1,
 	SSD_ELEM_DESC		= 2,
 	SSD_ELEM_SKS		= 3,
 	SSD_ELEM_COMMAND	= 4,
 	SSD_ELEM_INFO		= 5,
 	SSD_ELEM_FRU		= 6,
 	SSD_ELEM_STREAM		= 7,
 	SSD_ELEM_MAX		= 8
 } scsi_sense_elem_type;
 
 
 /*
  * Format-independent sense data buffer: the leading error_code byte
  * followed by the remaining SSD_FULL_SIZE - 1 bytes of sense data.
  */
 struct scsi_sense_data
 {
 	uint8_t error_code;
 	/*
 	 * SPC-4 says that the maximum length of sense data is 252 bytes.
 	 * So this structure is exactly 252 bytes long.
 	 */
 #define	SSD_FULL_SIZE 252
 	uint8_t sense_buf[SSD_FULL_SIZE - 1];
 	/*
 	 * XXX KDM is this still a reasonable minimum size?
 	 */
 #define	SSD_MIN_SIZE 18
 	/*
 	 * Maximum value for the extra_len field in the sense data.
 	 */
 #define	SSD_EXTRA_MAX 244
 };
 
 /*
  * Fixed format sense data.
  *
  * The SSD_FIXED_IS_PRESENT() and SSD_FIXED_IS_FILLED() helpers defined at
  * the end of the structure fully parenthesize their arguments, so callers
  * may pass arbitrary expressions (for example "&sense_buf" or
  * "len & mask") without precedence surprises.
  */
 struct scsi_sense_data_fixed
 {
 	u_int8_t error_code;
 #define	SSD_ERRCODE			0x7F
 #define		SSD_CURRENT_ERROR	0x70
 #define		SSD_DEFERRED_ERROR	0x71
 #define	SSD_ERRCODE_VALID	0x80
 	u_int8_t segment;
 	u_int8_t flags;
 	/* Low nibble of flags: the sense key. */
 #define	SSD_KEY				0x0F
 #define		SSD_KEY_NO_SENSE	0x00
 #define		SSD_KEY_RECOVERED_ERROR	0x01
 #define		SSD_KEY_NOT_READY	0x02
 #define		SSD_KEY_MEDIUM_ERROR	0x03
 #define		SSD_KEY_HARDWARE_ERROR	0x04
 #define		SSD_KEY_ILLEGAL_REQUEST	0x05
 #define		SSD_KEY_UNIT_ATTENTION	0x06
 #define		SSD_KEY_DATA_PROTECT	0x07
 #define		SSD_KEY_BLANK_CHECK	0x08
 #define		SSD_KEY_Vendor_Specific	0x09
 #define		SSD_KEY_COPY_ABORTED	0x0a
 #define		SSD_KEY_ABORTED_COMMAND	0x0b
 #define		SSD_KEY_EQUAL		0x0c
 #define		SSD_KEY_VOLUME_OVERFLOW	0x0d
 #define		SSD_KEY_MISCOMPARE	0x0e
 #define		SSD_KEY_COMPLETED	0x0f
 	/* High nibble of flags: individual flag bits. */
 #define	SSD_SDAT_OVFL	0x10
 #define	SSD_ILI		0x20
 #define	SSD_EOM		0x40
 #define	SSD_FILEMARK	0x80
 	u_int8_t info[4];
 	u_int8_t extra_len;
 	u_int8_t cmd_spec_info[4];
 	u_int8_t add_sense_code;
 	u_int8_t add_sense_code_qual;
 	u_int8_t fru;
 	u_int8_t sense_key_spec[3];
 #define	SSD_SCS_VALID		0x80
 #define	SSD_FIELDPTR_CMD	0x40
 #define	SSD_BITPTR_VALID	0x08
 #define	SSD_BITPTR_VALUE	0x07
 	u_int8_t extra_bytes[14];
 	/*
 	 * Evaluates to 1 if 'length' bytes of sense data, counted from the
 	 * start of the structure, are enough to contain all of 'field';
 	 * otherwise 0.  'sense' is only used inside sizeof.
 	 */
 #define	SSD_FIXED_IS_PRESENT(sense, length, field)			\
 	(((length) >= (offsetof(struct scsi_sense_data_fixed, field) +	\
 	sizeof((sense)->field))) ? 1 :0)
 	/*
 	 * Evaluates to 1 if the end of 'field', measured from the first
 	 * byte after extra_len, lies within (sense)->extra_len bytes;
 	 * otherwise 0.
 	 */
 #define	SSD_FIXED_IS_FILLED(sense, field)				\
 	((((offsetof(struct scsi_sense_data_fixed, field) +		\
 	sizeof((sense)->field)) -					\
 	(offsetof(struct scsi_sense_data_fixed, extra_len) +		\
 	sizeof((sense)->extra_len))) <= (sense)->extra_len) ? 1 : 0)
 };
 
 /*
  * Descriptor format sense data definitions.
  * Introduced in SPC-3.
  *
  * The structure is an 8-byte header; sense_desc is a zero-length trailing
  * array marking where the descriptors start.
  */
 struct scsi_sense_data_desc 
 {
 	uint8_t	error_code;
 #define	SSD_DESC_CURRENT_ERROR	0x72
 #define	SSD_DESC_DEFERRED_ERROR	0x73
 	uint8_t sense_key;
 	uint8_t	add_sense_code;
 	uint8_t	add_sense_code_qual;
 	uint8_t	flags;
 #define	SSDD_SDAT_OVFL		0x80
 	uint8_t	reserved[2];
 	/*
 	 * Note that SPC-4, section 4.5.2.1 says that the extra_len field
 	 * must be less than or equal to 244.
 	 */
 	uint8_t	extra_len;
 	uint8_t	sense_desc[0];
 	/*
 	 * Evaluates to 1 if 'length' bytes of sense data, counted from the
 	 * start of the structure, are enough to contain all of 'field';
 	 * otherwise 0.  'sense' is only used inside sizeof.  Both 'sense'
 	 * and 'length' are parenthesized so arbitrary expressions may be
 	 * passed.
 	 */
 #define	SSD_DESC_IS_PRESENT(sense, length, field)			\
 	(((length) >= (offsetof(struct scsi_sense_data_desc, field) +	\
 	sizeof((sense)->field))) ? 1 :0)
 };
 
 struct scsi_sense_desc_header
 {
 	uint8_t desc_type;
 	uint8_t length;
 };
 /*
  * The information provided in the Information descriptor is device type or
  * command specific information, and defined in a command standard.
  *
  * Note that any changes to the field names or positions in this structure,
  * even reserved fields, should be accompanied by an examination of the
  * code in ctl_set_sense() that uses them.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_info
 {
 	uint8_t	desc_type;
 #define	SSD_DESC_INFO	0x00
 	uint8_t	length;
 	uint8_t	byte2;
 #define	SSD_INFO_VALID	0x80
 	uint8_t	reserved;
 	uint8_t	info[8];
 };
 
 /*
  * Command-specific information depends on the command for which the
  * reported condition occurred.
  *
  * Note that any changes to the field names or positions in this structure,
  * even reserved fields, should be accompanied by an examination of the
  * code in ctl_set_sense() that uses them.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_command
 {
 	uint8_t	desc_type;
 #define	SSD_DESC_COMMAND	0x01
 	uint8_t	length;
 	uint8_t	reserved[2];
 	uint8_t	command_info[8];
 };
 
 /*
  * Sense key specific descriptor.  The sense key specific data format
  * depends on the sense key in question.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_sks
 {
 	uint8_t	desc_type;
 #define	SSD_DESC_SKS		0x02
 	uint8_t	length;
 	uint8_t reserved1[2];
 	uint8_t	sense_key_spec[3];
 #define	SSD_SKS_VALID		0x80
 	uint8_t reserved2;
 };
 
 /*
  * This is used for the Illegal Request sense key (0x05) only.
  */
 struct scsi_sense_sks_field
 {
 	uint8_t	byte0;
 #define	SSD_SKS_FIELD_VALID	0x80
 #define	SSD_SKS_FIELD_CMD	0x40
 #define	SSD_SKS_BPV		0x08
 #define	SSD_SKS_BIT_VALUE	0x07
 	uint8_t	field[2];
 };
 
 
 /* 
  * This is used for the Hardware Error (0x04), Medium Error (0x03) and
  * Recovered Error (0x01) sense keys.
  */
 struct scsi_sense_sks_retry
 {
 	uint8_t byte0;
 #define	SSD_SKS_RETRY_VALID	0x80
 	uint8_t actual_retry_count[2];
 };
 
 /*
  * Used with the NO Sense (0x00) or Not Ready (0x02) sense keys.
  */
 struct scsi_sense_sks_progress
 {
 	uint8_t byte0;
 #define	SSD_SKS_PROGRESS_VALID	0x80
 	uint8_t progress[2];
 #define	SSD_SKS_PROGRESS_DENOM	0x10000
 };
 
 /*
  * Used with the Copy Aborted (0x0a) sense key.
  */
 struct scsi_sense_sks_segment
 {
 	uint8_t byte0;		/* flag bits plus a 3-bit pointer; masks below */
 #define	SSD_SKS_SEGMENT_VALID	0x80
 #define	SSD_SKS_SEGMENT_SD	0x20
 #define	SSD_SKS_SEGMENT_BPV	0x08
 #define	SSD_SKS_SEGMENT_BITPTR	0x07	/* mask for the low three bits */
 	uint8_t field[2];	/* two-byte field value */
 };
 
 /*
  * Used with the Unit Attention (0x06) sense key.
  *
  * This is currently used to indicate that the unit attention condition
  * queue has overflowed (when the overflow bit is set).
  */
 struct scsi_sense_sks_overflow
 {
 	uint8_t byte0;		/* valid flag (bit 7) and overflow flag (bit 0) */
 #define	SSD_SKS_OVERFLOW_VALID	0x80
 #define	SSD_SKS_OVERFLOW_SET	0x01
 	uint8_t	reserved[2];
 };
 
 /*
  * This specifies which component is associated with the sense data.  There
  * is no standard meaning for the fru value.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_fru
 {
 	uint8_t	desc_type;	/* SSD_DESC_FRU for this descriptor */
 #define	SSD_DESC_FRU		0x03
 	uint8_t	length;
 	uint8_t reserved;
 	uint8_t fru;		/* one-byte component code */
 };
 
 /*
  * Used for Stream commands, defined in SSC-4.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
  
 struct scsi_sense_stream
 {
 	uint8_t	desc_type;	/* SSD_DESC_STREAM for this descriptor */
 #define	SSD_DESC_STREAM		0x04
 	uint8_t	length;
 	uint8_t	reserved;
 	uint8_t	byte3;		/* flag byte; the three bits below are defined */
 #define	SSD_DESC_STREAM_FM	0x80
 #define	SSD_DESC_STREAM_EOM	0x40
 #define	SSD_DESC_STREAM_ILI	0x20
 };
 
 /*
  * Used for Block commands, defined in SBC-3.
  *
  * This is currently (as of SBC-3) only used for the Incorrect Length
  * Indication (ILI) bit, which says that the data length requested in the
  * READ LONG or WRITE LONG command did not match the length of the logical
  * block.
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_block
 {
 	uint8_t	desc_type;	/* SSD_DESC_BLOCK for this descriptor */
 #define	SSD_DESC_BLOCK		0x05
 	uint8_t	length;
 	uint8_t	reserved;
 	uint8_t	byte3;		/* flag byte; only the ILI bit is defined here */
 #define	SSD_DESC_BLOCK_ILI	0x20
 };
 
 /*
  * Used for Object-Based Storage Devices (OSD-3).
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_osd_objid
 {
 	uint8_t	desc_type;	/* SSD_DESC_OSD_OBJID for this descriptor */
 #define	SSD_DESC_OSD_OBJID	0x06
 	uint8_t	length;
 	uint8_t	reserved[6];
 	/*
 	 * XXX KDM provide the bit definitions here?  There are a lot of
 	 * them, and we don't have an OSD driver yet.
 	 */
 	uint8_t	not_init_cmds[4];	/* 4 bytes; bit meanings not defined here */
 	uint8_t	completed_cmds[4];	/* 4 bytes; bit meanings not defined here */
 	uint8_t	partition_id[8];	/* 8-byte partition identifier */
 	uint8_t	object_id[8];		/* 8-byte object identifier */
 };
 
 /*
  * Used for Object-Based Storage Devices (OSD-3).
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_osd_integrity
 {
 	uint8_t	desc_type;	/* SSD_DESC_OSD_INTEGRITY for this descriptor */
 #define	SSD_DESC_OSD_INTEGRITY	0x07
 	uint8_t	length;
 	uint8_t	integ_check_val[32];	/* 32-byte integrity check value */
 };
 
 /*
  * Used for Object-Based Storage Devices (OSD-3).
  *
  * Maximum descriptors allowed: 1 (as of SPC-4)
  */
 struct scsi_sense_osd_attr_id
 {
 	uint8_t	desc_type;	/* SSD_DESC_OSD_ATTR_ID for this descriptor */
 #define	SSD_DESC_OSD_ATTR_ID	0x08
 	uint8_t	length;
 	uint8_t	reserved[2];
 	/*
 	 * Variable-length trailing data that starts right after the 4-byte
 	 * fixed header.  Declared as a C99 flexible array member, the same
 	 * way struct scsi_sense_forwarded declares sense_data[], instead of
 	 * the zero-length-array GNU extension.  sizeof() and the member
 	 * offset are unchanged.
 	 */
 	uint8_t	attr_desc[];
 };
 
 /*
  * ATA Return descriptor, used for the SCSI ATA PASS-THROUGH(12), (16) and
  * (32) commands.  Described in SAT-4r05.
  */
 struct scsi_sense_ata_ret_desc
 {
 	uint8_t desc_type;	/* SSD_DESC_ATA for this descriptor */
 #define	SSD_DESC_ATA		0x09
 	uint8_t length;
 	uint8_t flags;		/* SSD_DESC_ATA_FLAG_EXTEND is bit 0 */
 #define	SSD_DESC_ATA_FLAG_EXTEND	0x01
 	uint8_t error;
 	uint8_t count_15_8;	/* count, bits 15..8 */
 	uint8_t count_7_0;	/* count, bits 7..0 */
 	/*
 	 * Note the LBA bytes are NOT laid out in numeric order: each
 	 * high-order byte (31..24, 39..32, 47..40) is immediately followed
 	 * by a low-order byte (7..0, 15..8, 23..16).  Assemble the 48-bit
 	 * value by field name, not by position.
 	 */
 	uint8_t lba_31_24;
 	uint8_t lba_7_0;
 	uint8_t lba_39_32;
 	uint8_t lba_15_8;
 	uint8_t lba_47_40;
 	uint8_t lba_23_16;
 	uint8_t device;
 	uint8_t status;
 };
 /*
  * Used with Sense keys No Sense (0x00) and Not Ready (0x02).
  *
  * Maximum descriptors allowed: 32 (as of SPC-4)
  */
 struct scsi_sense_progress
 {
 	uint8_t	desc_type;	/* SSD_DESC_PROGRESS for this descriptor */
 #define	SSD_DESC_PROGRESS	0x0a
 	uint8_t	length;
 	uint8_t	sense_key;		/* one-byte sense key */
 	uint8_t	add_sense_code;		/* one-byte additional sense code */
 	uint8_t	add_sense_code_qual;	/* one-byte qualifier for the above */
 	uint8_t reserved;
 	uint8_t	progress[2];		/* two-byte progress value */
 };
 
 /*
  * This is typically forwarded as the result of an EXTENDED COPY command.
  *
  * Maximum descriptors allowed: 2 (as of SPC-4)
  */
 struct scsi_sense_forwarded
 {
 	uint8_t	desc_type;	/* SSD_DESC_FORWARDED for this descriptor */
 #define	SSD_DESC_FORWARDED	0x0c
 	uint8_t	length;
 	/*
 	 * Bit 7 is the FSDT flag; the low nibble (SSD_FORWARDED_SDS_MASK)
 	 * holds one of the SSD_FORWARDED_SDS_* codes below.
 	 */
 	uint8_t	byte2;
 #define	SSD_FORWARDED_FSDT	0x80
 #define	SSD_FORWARDED_SDS_MASK	0x0f
 #define	SSD_FORWARDED_SDS_UNK	0x00
 #define	SSD_FORWARDED_SDS_EXSRC	0x01
 #define	SSD_FORWARDED_SDS_EXDST	0x02
 	uint8_t	status;
 	uint8_t	sense_data[];	/* flexible array: bytes following the header */
 };
 
 /*
  * Vendor-specific sense descriptor.  The desc_type field will be in the
  * range between MIN and MAX inclusive.
  */
 struct scsi_sense_vendor
 {
 	/* Any value from SSD_DESC_VENDOR_MIN through SSD_DESC_VENDOR_MAX. */
 	uint8_t	desc_type;
 #define	SSD_DESC_VENDOR_MIN	0x80
 #define	SSD_DESC_VENDOR_MAX	0xff
 	uint8_t length;
 	/*
 	 * Vendor-defined payload that starts right after the 2-byte header.
 	 * Declared as a C99 flexible array member, the same way struct
 	 * scsi_sense_forwarded declares sense_data[], instead of the
 	 * zero-length-array GNU extension.  sizeof() and the member offset
 	 * are unchanged.
 	 */
 	uint8_t	data[];
 };
 
 /*
  * Four-byte mode parameter header with single-byte length fields.
  * find_mode_page_6() uses it to locate the first mode page.
  */
 struct scsi_mode_header_6
 {
 	u_int8_t data_length;	/* Sense data length */
 	u_int8_t medium_type;
 	u_int8_t dev_spec;
 	/* Bytes of block descriptors between this header and the first page. */
 	u_int8_t blk_desc_len;
 };
 
 /*
  * Eight-byte mode parameter header with two-byte length fields.
  * find_mode_page_10() uses it to locate the first mode page.
  */
 struct scsi_mode_header_10
 {
 	u_int8_t data_length[2];/* Sense data length */
 	u_int8_t medium_type;
 	u_int8_t dev_spec;
 	u_int8_t unused[2];
 	/*
 	 * Bytes of block descriptors between this header and the first
 	 * page; find_mode_page_10() decodes it with scsi_2btoul(), i.e.
 	 * most significant byte first.
 	 */
 	u_int8_t blk_desc_len[2];
 };
 
 /*
  * Two-byte header at the start of a mode page.
  */
 struct scsi_mode_page_header
 {
 	/*
 	 * Bit 7 is SMPH_PS, bit 6 is SMPH_SPF and the low six bits
 	 * (SMPH_PC_MASK) carry the page code itself.
 	 */
 	u_int8_t page_code;
 #define	SMPH_PS		0x80
 #define	SMPH_SPF	0x40
 #define	SMPH_PC_MASK	0x3f
 	u_int8_t page_length;
 };
 
 /*
  * Four-byte mode page header variant that carries a subpage code and a
  * two-byte page length (compare struct scsi_mode_page_header).
  */
 struct scsi_mode_page_header_sp
 {
 	uint8_t page_code;
 	uint8_t subpage;
 	uint8_t page_length[2];
 };
 
 
 /*
  * Eight-byte mode parameter block descriptor.
  */
 struct scsi_mode_blk_desc
 {
 	u_int8_t density;	/* see SCSI_DEFAULT_DENSITY / SCSI_SAME_DENSITY */
 	u_int8_t nblocks[3];	/* three-byte block count */
 	u_int8_t reserved;
 	u_int8_t blklen[3];	/* three-byte block length */
 };
 
 #define	SCSI_DEFAULT_DENSITY	0x00	/* use 'default' density */
 #define	SCSI_SAME_DENSITY	0x7f	/* use 'same' density- >= SCSI-2 only */
 
 
 /*
  * Status Byte
  */
 #define	SCSI_STATUS_OK			0x00
 #define	SCSI_STATUS_CHECK_COND		0x02
 #define	SCSI_STATUS_COND_MET		0x04
 #define	SCSI_STATUS_BUSY		0x08
 #define	SCSI_STATUS_INTERMED		0x10
 #define	SCSI_STATUS_INTERMED_COND_MET	0x14
 #define	SCSI_STATUS_RESERV_CONFLICT	0x18
 #define	SCSI_STATUS_CMD_TERMINATED	0x22	/* Obsolete in SAM-2 */
 #define	SCSI_STATUS_QUEUE_FULL		0x28
 #define	SCSI_STATUS_ACA_ACTIVE		0x30
 #define	SCSI_STATUS_TASK_ABORTED	0x40
 
 /*
  * Pattern describing a device by type, media type and identification
  * strings.  Embedded as inq_pat in the quirk entry structures below.
  */
 struct scsi_inquiry_pattern {
 	u_int8_t   type;
 	u_int8_t   media_type;	/* SIP_MEDIA_* bits */
 #define	SIP_MEDIA_REMOVABLE	0x01
 #define	SIP_MEDIA_FIXED		0x02
 	const char *vendor;
 	const char *product;
 	const char *revision;
 }; 
 
 /*
  * Same leading fields as struct scsi_inquiry_pattern, but the three
  * strings are stored inline in fixed-size arrays, each one byte larger
  * than the corresponding SID_*_SIZE constant, instead of as pointers.
  */
 struct scsi_static_inquiry_pattern {
 	u_int8_t   type;
 	u_int8_t   media_type;
 	char       vendor[SID_VENDOR_SIZE+1];
 	char       product[SID_PRODUCT_SIZE+1];
 	char       revision[SID_REVISION_SIZE+1];
 };
 
 /*
  * Associates an inquiry pattern with tables of sense key and ASC entries.
  */
 struct scsi_sense_quirk_entry {
 	struct scsi_inquiry_pattern	inq_pat;
 	/* presumably the entry counts of the two tables below -- confirm */
 	int				num_sense_keys;
 	int				num_ascs;
 	struct sense_key_table_entry	*sense_key_info;
 	struct asc_table_entry		*asc_info;
 };
 
 /*
  * One sense key table row: the key, an action value and a description.
  */
 struct sense_key_table_entry {
 	u_int8_t    sense_key;
 	u_int32_t   action;
 	const char *desc;
 };
 
 /*
  * One ASC table row: the asc/ascq pair, an action value and a
  * description.
  */
 struct asc_table_entry {
 	u_int8_t    asc;
 	u_int8_t    ascq;
 	u_int32_t   action;
 	const char *desc;
 };
 
 /*
  * One opcode table row: the opcode, a 32-bit mask and a description.
  */
 struct op_table_entry {
 	u_int8_t    opcode;
 	u_int32_t   opmask;	/* NOTE(review): meaning of the mask bits is not shown here */
 	const char  *desc;
 };
 
 /*
  * Associates an inquiry pattern with a table of opcode entries.
  */
 struct scsi_op_quirk_entry {
 	struct scsi_inquiry_pattern	inq_pat;
 	int				num_ops;	/* presumably entries in op_table */
 	struct op_table_entry		*op_table;
 };
 
 /*
  * Flag values for the "flags" argument of scsi_sense_sbuf().
  */
 typedef enum {
 	SSS_FLAG_NONE		= 0x00,
 	SSS_FLAG_PRINT_COMMAND	= 0x01
 } scsi_sense_string_flags;
 
 /*
  * Name/value pair.  Arrays of these are the "table" argument of
  * scsi_nv_to_str() and scsi_get_nv().
  */
 struct scsi_nv {
 	const char *name;
 	uint64_t value;
 };
 
 /*
  * Return type of scsi_get_nv().
  */
 typedef enum {
 	SCSI_NV_FOUND,
 	SCSI_NV_AMBIGUOUS,
 	SCSI_NV_NOT_FOUND
 } scsi_nv_status;
 
 /*
  * Flag values for the "flags" argument of scsi_get_nv().
  */
 typedef enum {
 	SCSI_NV_FLAG_NONE	= 0x00,
 	SCSI_NV_FLAG_IG_CASE	= 0x01	/* Case insensitive comparison */
 } scsi_nv_flags;
 
 struct ccb_scsiio;
 struct cam_periph;
 union  ccb;
 #ifndef _KERNEL
 struct cam_device;
 #endif
 
 extern const char *scsi_sense_key_text[];
 
 __BEGIN_DECLS
 void scsi_sense_desc(int sense_key, int asc, int ascq,
 		     struct scsi_inquiry_data *inq_data,
 		     const char **sense_key_desc, const char **asc_desc);
 scsi_sense_action scsi_error_action(struct ccb_scsiio* csio,
 				    struct scsi_inquiry_data *inq_data,
 				    u_int32_t sense_flags);
 const char *	scsi_status_string(struct ccb_scsiio *csio);
 
 void scsi_desc_iterate(struct scsi_sense_data_desc *sense, u_int sense_len,
 		       int (*iter_func)(struct scsi_sense_data_desc *sense,
 					u_int, struct scsi_sense_desc_header *,
 					void *), void *arg);
 uint8_t *scsi_find_desc(struct scsi_sense_data_desc *sense, u_int sense_len,
 			uint8_t desc_type);
 void scsi_set_sense_data(struct scsi_sense_data *sense_data,
 			 scsi_sense_data_type sense_format, int current_error,
 			 int sense_key, int asc, int ascq, ...) ;
 void scsi_set_sense_data_len(struct scsi_sense_data *sense_data,
     u_int *sense_len, scsi_sense_data_type sense_format, int current_error,
     int sense_key, int asc, int ascq, ...) ;
 void scsi_set_sense_data_va(struct scsi_sense_data *sense_data,
     u_int *sense_len, scsi_sense_data_type sense_format,
     int current_error, int sense_key, int asc, int ascq, va_list ap);
 int scsi_get_sense_info(struct scsi_sense_data *sense_data, u_int sense_len,
 			uint8_t info_type, uint64_t *info,
 			int64_t *signed_info);
 int scsi_get_sks(struct scsi_sense_data *sense_data, u_int sense_len,
 		 uint8_t *sks);
 int scsi_get_block_info(struct scsi_sense_data *sense_data, u_int sense_len,
 			struct scsi_inquiry_data *inq_data,
 			uint8_t *block_bits);
 int scsi_get_stream_info(struct scsi_sense_data *sense_data, u_int sense_len,
 			 struct scsi_inquiry_data *inq_data,
 			 uint8_t *stream_bits);
 void scsi_info_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len,
 		    struct scsi_inquiry_data *inq_data, uint64_t info);
 void scsi_command_sbuf(struct sbuf *sb, uint8_t *cdb, int cdb_len,
 		       struct scsi_inquiry_data *inq_data, uint64_t csi);
 void scsi_progress_sbuf(struct sbuf *sb, uint16_t progress);
 int scsi_sks_sbuf(struct sbuf *sb, int sense_key, uint8_t *sks);
 void scsi_fru_sbuf(struct sbuf *sb, uint64_t fru);
 void scsi_stream_sbuf(struct sbuf *sb, uint8_t stream_bits);
 void scsi_block_sbuf(struct sbuf *sb, uint8_t block_bits);
 void scsi_sense_info_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			  u_int sense_len, uint8_t *cdb, int cdb_len,
 			  struct scsi_inquiry_data *inq_data,
 			  struct scsi_sense_desc_header *header);
 
 void scsi_sense_command_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			     u_int sense_len, uint8_t *cdb, int cdb_len,
 			     struct scsi_inquiry_data *inq_data,
 			     struct scsi_sense_desc_header *header);
 void scsi_sense_sks_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			 u_int sense_len, uint8_t *cdb, int cdb_len,
 			 struct scsi_inquiry_data *inq_data,
 			 struct scsi_sense_desc_header *header);
 void scsi_sense_fru_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			 u_int sense_len, uint8_t *cdb, int cdb_len,
 			 struct scsi_inquiry_data *inq_data,
 			 struct scsi_sense_desc_header *header);
 void scsi_sense_stream_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			    u_int sense_len, uint8_t *cdb, int cdb_len,
 			    struct scsi_inquiry_data *inq_data,
 			    struct scsi_sense_desc_header *header);
 void scsi_sense_block_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			   u_int sense_len, uint8_t *cdb, int cdb_len,
 			   struct scsi_inquiry_data *inq_data,
 			   struct scsi_sense_desc_header *header);
 void scsi_sense_progress_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			      u_int sense_len, uint8_t *cdb, int cdb_len,
 			      struct scsi_inquiry_data *inq_data,
 			      struct scsi_sense_desc_header *header);
 void scsi_sense_ata_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			 u_int sense_len, uint8_t *cdb, int cdb_len,
 			 struct scsi_inquiry_data *inq_data,
 			 struct scsi_sense_desc_header *header);
 void scsi_sense_forwarded_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			      u_int sense_len, uint8_t *cdb, int cdb_len,
 			      struct scsi_inquiry_data *inq_data,
 			      struct scsi_sense_desc_header *header);
 void scsi_sense_generic_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			     u_int sense_len, uint8_t *cdb, int cdb_len,
 			     struct scsi_inquiry_data *inq_data,
 			     struct scsi_sense_desc_header *header);
 void scsi_sense_desc_sbuf(struct sbuf *sb, struct scsi_sense_data *sense,
 			  u_int sense_len, uint8_t *cdb, int cdb_len,
 			  struct scsi_inquiry_data *inq_data,
 			  struct scsi_sense_desc_header *header);
 scsi_sense_data_type scsi_sense_type(struct scsi_sense_data *sense_data);
 
 void scsi_sense_only_sbuf(struct scsi_sense_data *sense, u_int sense_len,
 			  struct sbuf *sb, char *path_str,
 			  struct scsi_inquiry_data *inq_data, uint8_t *cdb,
 			  int cdb_len);
 
 /*
  * The command/sense string helpers are declared twice under the same
  * names.  The userland (!_KERNEL) versions take an additional leading
  * struct cam_device * argument, and the userland scsi_sense_print() also
  * takes a FILE *ofile argument.  scsi_vpd_supported_page() is declared
  * for the kernel only.
  */
 #ifdef _KERNEL
 int		scsi_command_string(struct ccb_scsiio *csio, struct sbuf *sb);
 int		scsi_sense_sbuf(struct ccb_scsiio *csio, struct sbuf *sb,
 				scsi_sense_string_flags flags);
 char *		scsi_sense_string(struct ccb_scsiio *csio,
 				  char *str, int str_len);
 void		scsi_sense_print(struct ccb_scsiio *csio);
 int 		scsi_vpd_supported_page(struct cam_periph *periph,
 					uint8_t page_id);
 #else /* _KERNEL */
 int		scsi_command_string(struct cam_device *device,
 				    struct ccb_scsiio *csio, struct sbuf *sb);
 int		scsi_sense_sbuf(struct cam_device *device, 
 				struct ccb_scsiio *csio, struct sbuf *sb,
 				scsi_sense_string_flags flags);
 char *		scsi_sense_string(struct cam_device *device, 
 				  struct ccb_scsiio *csio,
 				  char *str, int str_len);
 void		scsi_sense_print(struct cam_device *device, 
 				 struct ccb_scsiio *csio, FILE *ofile);
 #endif /* _KERNEL */
 
 const char *	scsi_op_desc(u_int16_t opcode, 
 			     struct scsi_inquiry_data *inq_data);
 char *		scsi_cdb_string(u_int8_t *cdb_ptr, char *cdb_string,
 				size_t len);
 void		scsi_cdb_sbuf(u_int8_t *cdb_ptr, struct sbuf *sb);
 
 void		scsi_print_inquiry(struct scsi_inquiry_data *inq_data);
 void		scsi_print_inquiry_sbuf(struct sbuf *sb,
 				        struct scsi_inquiry_data *inq_data);
 void		scsi_print_inquiry_short(struct scsi_inquiry_data *inq_data);
 void		scsi_print_inquiry_short_sbuf(struct sbuf *sb,
 					      struct scsi_inquiry_data *inq_data);
 
 u_int		scsi_calc_syncsrate(u_int period_factor);
 u_int		scsi_calc_syncparam(u_int period);
 
 typedef int	(*scsi_devid_checkfn_t)(uint8_t *);
 int		scsi_devid_is_naa_ieee_reg(uint8_t *bufp);
 int		scsi_devid_is_sas_target(uint8_t *bufp);
 int		scsi_devid_is_lun_eui64(uint8_t *bufp);
 int		scsi_devid_is_lun_naa(uint8_t *bufp);
 int		scsi_devid_is_lun_name(uint8_t *bufp);
 int		scsi_devid_is_lun_t10(uint8_t *bufp);
 int		scsi_devid_is_lun_md5(uint8_t *bufp);
 int		scsi_devid_is_lun_uuid(uint8_t *bufp);
 int		scsi_devid_is_port_naa(uint8_t *bufp);
 struct scsi_vpd_id_descriptor *
 		scsi_get_devid(struct scsi_vpd_device_id *id, uint32_t len,
 			       scsi_devid_checkfn_t ck_fn);
 struct scsi_vpd_id_descriptor *
 		scsi_get_devid_desc(struct scsi_vpd_id_descriptor *desc, uint32_t len,
 			       scsi_devid_checkfn_t ck_fn);
 
 int		scsi_transportid_sbuf(struct sbuf *sb,
 				      struct scsi_transportid_header *hdr,
 				      uint32_t valid_len);
 
 const char *	scsi_nv_to_str(struct scsi_nv *table, int num_table_entries,
 			       uint64_t value);
 
 scsi_nv_status	scsi_get_nv(struct scsi_nv *table, int num_table_entries,
 			    char *name, int *table_entry, scsi_nv_flags flags);
 
 /*
  * Transport ID string parsers.  Every variant takes the input string, a
  * struct scsi_transportid_header ** and an unsigned int * for the
  * allocated length, plus an error string buffer and its size.  When
  * built with _KERNEL defined, each one additionally takes a
  * struct malloc_type * and an int flags argument right before the error
  * string buffer, so kernel and userland callers use different argument
  * lists.
  * NOTE(review): *hdr / *alloc_len look like output parameters that
  * receive a newly allocated header -- confirm against the definitions.
  */
 int	scsi_parse_transportid_64bit(int proto_id, char *id_str,
 				     struct scsi_transportid_header **hdr,
 				     unsigned int *alloc_len,
 #ifdef _KERNEL
 				     struct malloc_type *type, int flags,
 #endif
 				     char *error_str, int error_str_len);
 
 int	scsi_parse_transportid_spi(char *id_str,
 				   struct scsi_transportid_header **hdr,
 				   unsigned int *alloc_len,
 #ifdef _KERNEL
 				   struct malloc_type *type, int flags,
 #endif
 				   char *error_str, int error_str_len);
 
 int	scsi_parse_transportid_rdma(char *id_str,
 				    struct scsi_transportid_header **hdr,
 				    unsigned int *alloc_len,
 #ifdef _KERNEL
 				    struct malloc_type *type, int flags,
 #endif
 				    char *error_str, int error_str_len);
 
 int	scsi_parse_transportid_iscsi(char *id_str,
 				     struct scsi_transportid_header **hdr,
 				     unsigned int *alloc_len,
 #ifdef _KERNEL
 				     struct malloc_type *type, int flags,
 #endif
 				     char *error_str,int error_str_len);
 
 int	scsi_parse_transportid_sop(char *id_str,
 				   struct scsi_transportid_header **hdr,
 				   unsigned int *alloc_len,
 #ifdef _KERNEL
 				   struct malloc_type *type, int flags,
 #endif
 				   char *error_str,int error_str_len);
 
 int	scsi_parse_transportid(char *transportid_str,
 			       struct scsi_transportid_header **hdr,
 			       unsigned int *alloc_len,
 #ifdef _KERNEL
 			       struct malloc_type *type, int flags,
 #endif
 			       char *error_str, int error_str_len);
 
 
 int scsi_attrib_volcoh_sbuf(struct sbuf *sb,
 			    struct scsi_mam_attribute_header *hdr,
 			    uint32_t valid_len, uint32_t flags,
 			    uint32_t output_flags, char *error_str,
 			    int error_str_len);
 
 int scsi_attrib_vendser_sbuf(struct sbuf *sb,
 			     struct scsi_mam_attribute_header *hdr,
 			     uint32_t valid_len, uint32_t flags,
 			     uint32_t output_flags, char *error_str,
 			     int error_str_len);
 
 int scsi_attrib_hexdump_sbuf(struct sbuf *sb,
 			     struct scsi_mam_attribute_header *hdr,
 			     uint32_t valid_len, uint32_t flags,
 			     uint32_t output_flags, char *error_str,
 			     int error_str_len);
 
 int scsi_attrib_int_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr,
 			 uint32_t valid_len, uint32_t flags,
 			 uint32_t output_flags, char *error_str,
 			 int error_str_len);
 
 int scsi_attrib_ascii_sbuf(struct sbuf *sb,
 			   struct scsi_mam_attribute_header *hdr,
 			   uint32_t valid_len, uint32_t flags,
 			   uint32_t output_flags, char *error_str,
 			   int error_str_len);
 
 int scsi_attrib_text_sbuf(struct sbuf *sb,
 			  struct scsi_mam_attribute_header *hdr,
 			  uint32_t valid_len, uint32_t flags,
 			  uint32_t output_flags, char *error_str,
 			  int error_str_len);
 
 struct scsi_attrib_table_entry *scsi_find_attrib_entry(
 			struct scsi_attrib_table_entry *table,
 			size_t num_table_entries, uint32_t id);
 
 struct scsi_attrib_table_entry *scsi_get_attrib_entry(uint32_t id);
 
 int scsi_attrib_value_sbuf(struct sbuf *sb, uint32_t valid_len,
 			   struct scsi_mam_attribute_header *hdr,
 			   uint32_t output_flags, char *error_str,
 			   size_t error_str_len);
 
 void scsi_attrib_prefix_sbuf(struct sbuf *sb, uint32_t output_flags,
 			     struct scsi_mam_attribute_header *hdr,
 			     uint32_t valid_len, const char *desc);
 
 int scsi_attrib_sbuf(struct sbuf *sb, struct scsi_mam_attribute_header *hdr,
 		     uint32_t valid_len,
 		     struct scsi_attrib_table_entry *user_table,
 		     size_t num_user_entries, int prefer_user_table,
 		     uint32_t output_flags, char *error_str, int error_str_len);
 
 void		scsi_test_unit_ready(struct ccb_scsiio *csio, u_int32_t retries,
 				     void (*cbfcnp)(struct cam_periph *, 
 						    union ccb *),
 				     u_int8_t tag_action, 
 				     u_int8_t sense_len, u_int32_t timeout);
 
 void		scsi_request_sense(struct ccb_scsiio *csio, u_int32_t retries,
 				   void (*cbfcnp)(struct cam_periph *, 
 						  union ccb *),
 				   void *data_ptr, u_int8_t dxfer_len,
 				   u_int8_t tag_action, u_int8_t sense_len,
 				   u_int32_t timeout);
 
 void		scsi_inquiry(struct ccb_scsiio *csio, u_int32_t retries,
 			     void (*cbfcnp)(struct cam_periph *, union ccb *),
 			     u_int8_t tag_action, u_int8_t *inq_buf, 
 			     u_int32_t inq_len, int evpd, u_int8_t page_code,
 			     u_int8_t sense_len, u_int32_t timeout);
 
 void		scsi_mode_sense(struct ccb_scsiio *csio, u_int32_t retries,
 		    void (*cbfcnp)(struct cam_periph *, union ccb *),
 		    uint8_t tag_action, int dbd, uint8_t pc, uint8_t page,
 		    uint8_t *param_buf, uint32_t param_len,
 		    uint8_t sense_len, uint32_t timeout);
 
 void		scsi_mode_sense_len(struct ccb_scsiio *csio, u_int32_t retries,
 		    void (*cbfcnp)(struct cam_periph *, union ccb *),
 		    uint8_t tag_action, int dbd, uint8_t pc, uint8_t page,
 		    uint8_t *param_buf, uint32_t param_len,
 		    int minimum_cmd_size, uint8_t sense_len, uint32_t timeout);
 
 void		scsi_mode_sense_subpage(struct ccb_scsiio *csio,
 		    uint32_t retries,
 		    void (*cbfcnp)(struct cam_periph *, union ccb *),
 		    uint8_t tag_action, int dbd, uint8_t pc,
 		    uint8_t page, uint8_t subpage,
 		    uint8_t *param_buf, uint32_t param_len,
 		    int minimum_cmd_size, uint8_t sense_len, uint32_t timeout);
 
 void		scsi_mode_select(struct ccb_scsiio *csio, u_int32_t retries,
 				 void (*cbfcnp)(struct cam_periph *,
 						union ccb *),
 				 u_int8_t tag_action, int scsi_page_fmt,
 				 int save_pages, u_int8_t *param_buf,
 				 u_int32_t param_len, u_int8_t sense_len,
 				 u_int32_t timeout);
 
 void		scsi_mode_select_len(struct ccb_scsiio *csio, u_int32_t retries,
 				     void (*cbfcnp)(struct cam_periph *,
 						    union ccb *),
 				     u_int8_t tag_action, int scsi_page_fmt,
 				     int save_pages, u_int8_t *param_buf,
 				     u_int32_t param_len, int minimum_cmd_size,
 				     u_int8_t sense_len, u_int32_t timeout);
 
 void		scsi_log_sense(struct ccb_scsiio *csio, u_int32_t retries,
 			       void (*cbfcnp)(struct cam_periph *, union ccb *),
 			       u_int8_t tag_action, u_int8_t page_code,
 			       u_int8_t page, int save_pages, int ppc,
 			       u_int32_t paramptr, u_int8_t *param_buf,
 			       u_int32_t param_len, u_int8_t sense_len,
 			       u_int32_t timeout);
 
 void		scsi_log_select(struct ccb_scsiio *csio, u_int32_t retries,
 				void (*cbfcnp)(struct cam_periph *,
 				union ccb *), u_int8_t tag_action,
 				u_int8_t page_code, int save_pages,
 				int pc_reset, u_int8_t *param_buf,
 				u_int32_t param_len, u_int8_t sense_len,
 				u_int32_t timeout);
 
 void		scsi_prevent(struct ccb_scsiio *csio, u_int32_t retries,
 			     void (*cbfcnp)(struct cam_periph *, union ccb *),
 			     u_int8_t tag_action, u_int8_t action,
 			     u_int8_t sense_len, u_int32_t timeout);
 
 void		scsi_read_capacity(struct ccb_scsiio *csio, u_int32_t retries,
 				   void (*cbfcnp)(struct cam_periph *, 
 				   union ccb *), u_int8_t tag_action, 
 				   struct scsi_read_capacity_data *,
 				   u_int8_t sense_len, u_int32_t timeout);
 void		scsi_read_capacity_16(struct ccb_scsiio *csio, uint32_t retries,
 				      void (*cbfcnp)(struct cam_periph *,
 				      union ccb *), uint8_t tag_action,
 				      uint64_t lba, int reladr, int pmi,
 				      uint8_t *rcap_buf, int rcap_buf_len,
 				      uint8_t sense_len, uint32_t timeout);
 
 void		scsi_report_luns(struct ccb_scsiio *csio, u_int32_t retries,
 				 void (*cbfcnp)(struct cam_periph *, 
 				 union ccb *), u_int8_t tag_action, 
 				 u_int8_t select_report,
 				 struct scsi_report_luns_data *rpl_buf,
 				 u_int32_t alloc_len, u_int8_t sense_len,
 				 u_int32_t timeout);
 
 void		scsi_report_target_group(struct ccb_scsiio *csio, u_int32_t retries,
 				 void (*cbfcnp)(struct cam_periph *, 
 				 union ccb *), u_int8_t tag_action, 
 				 u_int8_t pdf,
 				 void *buf,
 				 u_int32_t alloc_len, u_int8_t sense_len,
 				 u_int32_t timeout);
 
 void		scsi_report_timestamp(struct ccb_scsiio *csio, u_int32_t retries,
 				 void (*cbfcnp)(struct cam_periph *, 
 				 union ccb *), u_int8_t tag_action, 
 				 u_int8_t pdf,
 				 void *buf,
 				 u_int32_t alloc_len, u_int8_t sense_len,
 				 u_int32_t timeout);
 
 void		scsi_set_target_group(struct ccb_scsiio *csio, u_int32_t retries,
 				 void (*cbfcnp)(struct cam_periph *, 
 				 union ccb *), u_int8_t tag_action, void *buf,
 				 u_int32_t alloc_len, u_int8_t sense_len,
 				 u_int32_t timeout);
 
 void		scsi_create_timestamp(uint8_t *timestamp_6b_buf,
 				      uint64_t timestamp);
 
 void		scsi_set_timestamp(struct ccb_scsiio *csio, u_int32_t retries,
 				   void (*cbfcnp)(struct cam_periph *, 
 				   union ccb *), u_int8_t tag_action,
 				   void *buf, u_int32_t alloc_len,
 				   u_int8_t sense_len, u_int32_t timeout);
 
 void		scsi_synchronize_cache(struct ccb_scsiio *csio, 
 				       u_int32_t retries,
 				       void (*cbfcnp)(struct cam_periph *, 
 				       union ccb *), u_int8_t tag_action, 
 				       u_int32_t begin_lba, u_int16_t lb_count,
 				       u_int8_t sense_len, u_int32_t timeout);
 
 void scsi_receive_diagnostic_results(struct ccb_scsiio *csio, u_int32_t retries,
 				     void (*cbfcnp)(struct cam_periph *,
 						    union ccb*),
 				     uint8_t tag_action, int pcv,
 				     uint8_t page_code, uint8_t *data_ptr,
 				     uint16_t allocation_length,
 				     uint8_t sense_len, uint32_t timeout);
 
 void scsi_send_diagnostic(struct ccb_scsiio *csio, u_int32_t retries,
 			  void (*cbfcnp)(struct cam_periph *, union ccb *),
 			  uint8_t tag_action, int unit_offline,
 			  int device_offline, int self_test, int page_format,
 			  int self_test_code, uint8_t *data_ptr,
 			  uint16_t param_list_length, uint8_t sense_len,
 			  uint32_t timeout);
 
 void scsi_read_buffer(struct ccb_scsiio *csio, u_int32_t retries,
 			void (*cbfcnp)(struct cam_periph *, union ccb*),
 			uint8_t tag_action, int mode,
 			uint8_t buffer_id, u_int32_t offset,
 			uint8_t *data_ptr, uint32_t allocation_length,
 			uint8_t sense_len, uint32_t timeout);
 
 void scsi_write_buffer(struct ccb_scsiio *csio, u_int32_t retries,
 			void (*cbfcnp)(struct cam_periph *, union ccb *),
 			uint8_t tag_action, int mode,
 			uint8_t buffer_id, u_int32_t offset,
 			uint8_t *data_ptr, uint32_t param_list_length,
 			uint8_t sense_len, uint32_t timeout);
 
 #define	SCSI_RW_READ	0x0001
 #define	SCSI_RW_WRITE	0x0002
 #define	SCSI_RW_DIRMASK	0x0003
 #define	SCSI_RW_BIO	0x1000
 void scsi_read_write(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     u_int8_t tag_action, int readop, u_int8_t byte2, 
 		     int minimum_cmd_size, u_int64_t lba,
 		     u_int32_t block_count, u_int8_t *data_ptr,
 		     u_int32_t dxfer_len, u_int8_t sense_len,
 		     u_int32_t timeout);
 
 void scsi_write_same(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     u_int8_t tag_action, u_int8_t byte2, 
 		     int minimum_cmd_size, u_int64_t lba,
 		     u_int32_t block_count, u_int8_t *data_ptr,
 		     u_int32_t dxfer_len, u_int8_t sense_len,
 		     u_int32_t timeout);
 
 void scsi_ata_identify(struct ccb_scsiio *csio, u_int32_t retries,
 		       void (*cbfcnp)(struct cam_periph *, union ccb *),
 		       u_int8_t tag_action, u_int8_t *data_ptr,
 		       u_int16_t dxfer_len, u_int8_t sense_len,
 		       u_int32_t timeout);
 
 void scsi_ata_trim(struct ccb_scsiio *csio, u_int32_t retries,
 	           void (*cbfcnp)(struct cam_periph *, union ccb *),
 	           u_int8_t tag_action, u_int16_t block_count,
 	           u_int8_t *data_ptr, u_int16_t dxfer_len,
 	           u_int8_t sense_len, u_int32_t timeout);
 
 int scsi_ata_read_log(struct ccb_scsiio *csio, uint32_t retries,
 		      void (*cbfcnp)(struct cam_periph *, union ccb *),
 		      uint8_t tag_action, uint32_t log_address,
 		      uint32_t page_number, uint16_t block_count,
 		      uint8_t protocol, uint8_t *data_ptr, uint32_t dxfer_len,
 		      uint8_t sense_len, uint32_t timeout);
 
 int scsi_ata_setfeatures(struct ccb_scsiio *csio, uint32_t retries,
 			 void (*cbfcnp)(struct cam_periph *, union ccb *),
 			 uint8_t tag_action, uint8_t feature,
 			 uint64_t lba, uint32_t count,
 			 uint8_t sense_len, uint32_t timeout);
 
 int scsi_ata_pass(struct ccb_scsiio *csio, uint32_t retries,
 		  void (*cbfcnp)(struct cam_periph *, union ccb *),
 		  uint32_t flags, uint8_t tag_action,
 		  uint8_t protocol, uint8_t ata_flags, uint16_t features,
 		  uint16_t sector_count, uint64_t lba, uint8_t command,
 		  uint8_t device, uint8_t icc, uint32_t auxiliary,
 		  uint8_t control, u_int8_t *data_ptr, uint32_t dxfer_len,
 		  uint8_t *cdb_storage, size_t cdb_storage_len,
 		  int minimum_cmd_size, u_int8_t sense_len, u_int32_t timeout);
 
 void scsi_ata_pass_16(struct ccb_scsiio *csio, u_int32_t retries,
 		      void (*cbfcnp)(struct cam_periph *, union ccb *),
 		      u_int32_t flags, u_int8_t tag_action,
 		      u_int8_t protocol, u_int8_t ata_flags, u_int16_t features,
 		      u_int16_t sector_count, uint64_t lba, u_int8_t command,
 		      u_int8_t control, u_int8_t *data_ptr, u_int16_t dxfer_len,
 		      u_int8_t sense_len, u_int32_t timeout);
 
 void scsi_unmap(struct ccb_scsiio *csio, u_int32_t retries,
 		void (*cbfcnp)(struct cam_periph *, union ccb *),
 		u_int8_t tag_action, u_int8_t byte2,
 		u_int8_t *data_ptr, u_int16_t dxfer_len,
 		u_int8_t sense_len, u_int32_t timeout);
 
 void scsi_start_stop(struct ccb_scsiio *csio, u_int32_t retries,
 		     void (*cbfcnp)(struct cam_periph *, union ccb *),
 		     u_int8_t tag_action, int start, int load_eject,
 		     int immediate, u_int8_t sense_len, u_int32_t timeout);
 void scsi_read_attribute(struct ccb_scsiio *csio, u_int32_t retries, 
 			 void (*cbfcnp)(struct cam_periph *, union ccb *),
 			 u_int8_t tag_action, u_int8_t service_action,
 			 uint32_t element, u_int8_t elem_type,
 			 int logical_volume, int partition,
 			 u_int32_t first_attribute, int cache, u_int8_t *data_ptr,
 			 u_int32_t length, int sense_len, u_int32_t timeout);
 void scsi_write_attribute(struct ccb_scsiio *csio, u_int32_t retries, 
 			  void (*cbfcnp)(struct cam_periph *, union ccb *),
 			  u_int8_t tag_action, uint32_t element,
 			  int logical_volume, int partition, int wtc, u_int8_t *data_ptr,
 			  u_int32_t length, int sense_len, u_int32_t timeout);
 
 void scsi_security_protocol_in(struct ccb_scsiio *csio, uint32_t retries, 
 			       void (*cbfcnp)(struct cam_periph *, union ccb *),
 			       uint8_t tag_action, uint32_t security_protocol,
 			       uint32_t security_protocol_specific, int byte4,
 			       uint8_t *data_ptr, uint32_t dxfer_len,
 			       int sense_len, int timeout);
 
 void scsi_security_protocol_out(struct ccb_scsiio *csio, uint32_t retries, 
 				void (*cbfcnp)(struct cam_periph *,union ccb *),
 				uint8_t tag_action, uint32_t security_protocol,
 				uint32_t security_protocol_specific, int byte4,
 				uint8_t *data_ptr, uint32_t dxfer_len,
 				int sense_len, int timeout);
 
 void scsi_persistent_reserve_in(struct ccb_scsiio *csio, uint32_t retries, 
 				void (*cbfcnp)(struct cam_periph *,union ccb *),
 				uint8_t tag_action, int service_action,
 				uint8_t *data_ptr, uint32_t dxfer_len,
 				int sense_len, int timeout);
 
 void scsi_persistent_reserve_out(struct ccb_scsiio *csio, uint32_t retries, 
 				 void (*cbfcnp)(struct cam_periph *,
 				       union ccb *),
 				 uint8_t tag_action, int service_action,
 				 int scope, int res_type, uint8_t *data_ptr,
 				 uint32_t dxfer_len, int sense_len,
 				 int timeout);
 
 void scsi_report_supported_opcodes(struct ccb_scsiio *csio, uint32_t retries, 
 				   void (*cbfcnp)(struct cam_periph *,
 						  union ccb *),
 				   uint8_t tag_action, int options,
 				   int req_opcode, int req_service_action,
 				   uint8_t *data_ptr, uint32_t dxfer_len,
 				   int sense_len, int timeout);
 
 int		scsi_inquiry_match(caddr_t inqbuffer, caddr_t table_entry);
 int		scsi_static_inquiry_match(caddr_t inqbuffer,
 					  caddr_t table_entry);
 int		scsi_devid_match(uint8_t *rhs, size_t rhs_len,
 				 uint8_t *lhs, size_t lhs_len);
 
 void scsi_extract_sense(struct scsi_sense_data *sense, int *error_code,
 			int *sense_key, int *asc, int *ascq);
 int scsi_extract_sense_ccb(union ccb *ccb, int *error_code, int *sense_key,
 			   int *asc, int *ascq);
 void scsi_extract_sense_len(struct scsi_sense_data *sense,
 			    u_int sense_len, int *error_code, int *sense_key,
 			    int *asc, int *ascq, int show_errors);
 int scsi_get_sense_key(struct scsi_sense_data *sense, u_int sense_len,
 		       int show_errors);
 int scsi_get_asc(struct scsi_sense_data *sense, u_int sense_len,
 		 int show_errors);
 int scsi_get_ascq(struct scsi_sense_data *sense, u_int sense_len,
 		  int show_errors);
 static __inline void scsi_ulto2b(u_int32_t val, u_int8_t *bytes);
 static __inline void scsi_ulto3b(u_int32_t val, u_int8_t *bytes);
 static __inline void scsi_ulto4b(u_int32_t val, u_int8_t *bytes);
 static __inline void scsi_u64to8b(u_int64_t val, u_int8_t *bytes);
 static __inline uint32_t scsi_2btoul(const uint8_t *bytes);
 static __inline uint32_t scsi_3btoul(const uint8_t *bytes);
 static __inline int32_t scsi_3btol(const uint8_t *bytes);
 static __inline uint32_t scsi_4btoul(const uint8_t *bytes);
 static __inline uint64_t scsi_8btou64(const uint8_t *bytes);
 static __inline void *find_mode_page_6(struct scsi_mode_header_6 *mode_header);
 static __inline void *find_mode_page_10(struct scsi_mode_header_10 *mode_header);
 
 /*
  * Store the low 16 bits of val into bytes[0..1], most significant byte
  * first.  Higher bits of val are ignored.
  */
 static __inline void
 scsi_ulto2b(u_int32_t val, u_int8_t *bytes)
 {
 	int idx;
 
 	/* Peel off the low byte each pass, filling the buffer back to front. */
 	for (idx = 1; idx >= 0; idx--) {
 		bytes[idx] = val & 0xff;
 		val >>= 8;
 	}
 }
 
 /*
  * Store the low 24 bits of val into bytes[0..2], most significant byte
  * first.  The top byte of val is ignored.
  */
 static __inline void
 scsi_ulto3b(u_int32_t val, u_int8_t *bytes)
 {
 	int idx;
 
 	/* Peel off the low byte each pass, filling the buffer back to front. */
 	for (idx = 2; idx >= 0; idx--) {
 		bytes[idx] = val & 0xff;
 		val >>= 8;
 	}
 }
 
 /*
  * Store all 32 bits of val into bytes[0..3], most significant byte first.
  */
 static __inline void
 scsi_ulto4b(u_int32_t val, u_int8_t *bytes)
 {
 	int idx;
 
 	/* Peel off the low byte each pass, filling the buffer back to front. */
 	for (idx = 3; idx >= 0; idx--) {
 		bytes[idx] = val & 0xff;
 		val >>= 8;
 	}
 }
 
 /*
  * Store all 64 bits of val into bytes[0..7], most significant byte first.
  */
 static __inline void
 scsi_u64to8b(u_int64_t val, u_int8_t *bytes)
 {
 	int idx;
 
 	/* Peel off the low byte each pass, filling the buffer back to front. */
 	for (idx = 7; idx >= 0; idx--) {
 		bytes[idx] = val & 0xff;
 		val >>= 8;
 	}
 }
 
 /*
  * Decode bytes[0..1] as an unsigned 16-bit value, most significant byte
  * first, and return it widened to 32 bits.
  */
 static __inline uint32_t
 scsi_2btoul(const uint8_t *bytes)
 {
 	const uint32_t hi = bytes[0];
 	const uint32_t lo = bytes[1];
 
 	return ((hi << 8) | lo);
 }
 
 /*
  * Decode bytes[0..2] as an unsigned 24-bit value, most significant byte
  * first, and return it widened to 32 bits.
  */
 static __inline uint32_t
 scsi_3btoul(const uint8_t *bytes)
 {
 	uint32_t acc = 0;
 	int idx;
 
 	/* Shift in one byte per pass, starting with the most significant. */
 	for (idx = 0; idx < 3; idx++)
 		acc = (acc << 8) | bytes[idx];
 	return (acc);
 }
 
 /*
  * Decode bytes[0..2] as a signed 24-bit value: the bytes are read with
  * scsi_3btoul() and, when bit 23 is set, the top byte is filled with ones
  * before the result is converted to int32_t.
  */
 static __inline int32_t 
 scsi_3btol(const uint8_t *bytes)
 {
 	const uint32_t raw = scsi_3btoul(bytes);
 	const uint32_t sign_fill = (raw & 0x00800000) != 0 ? 0xff000000 : 0;
 
 	return ((int32_t)(raw | sign_fill));
 }
 
 /*
  * Decode bytes[0..3] as an unsigned 32-bit value, most significant byte
  * first.
  */
 static __inline uint32_t
 scsi_4btoul(const uint8_t *bytes)
 {
 	uint32_t rv;
 
 	/*
 	 * Widen each byte to uint32_t before shifting.  A uint8_t operand
 	 * is promoted to (signed) int, so shifting a byte >= 0x80 left by
 	 * 24 would overflow int, which is undefined behavior.
 	 */
 	rv = ((uint32_t)bytes[0] << 24) |
 	     ((uint32_t)bytes[1] << 16) |
 	     ((uint32_t)bytes[2] << 8) |
 	     (uint32_t)bytes[3];
 	return (rv);
 }
 
 /*
  * Decode bytes[0..7] as an unsigned 64-bit value, most significant byte
  * first.
  */
 static __inline uint64_t
 scsi_8btou64(const uint8_t *bytes)
 {
 	uint64_t acc = 0;
 	int idx;
 
 	/* Shift in one byte per pass, starting with the most significant. */
 	for (idx = 0; idx < 8; idx++)
 		acc = (acc << 8) | bytes[idx];
 	return (acc);
 }
 
 /*
  * Given the pointer to a returned mode sense buffer, return a pointer to
  * the start of the first mode page.
  */
 /*
  * Return a pointer to the byte that lies blk_desc_len bytes past the end
  * of the 6-byte-command mode header, i.e. just after any block
  * descriptors.  No bounds checking is done against the buffer size.
  */
 static __inline void *
 find_mode_page_6(struct scsi_mode_header_6 *mode_header)
 {
 	u_int8_t *after_header;
 
 	/* First byte following the fixed-size header. */
 	after_header = (u_int8_t *)(mode_header + 1);
 
 	return (after_header + mode_header->blk_desc_len);
 }
 
 /*
  * Return a pointer to the byte that lies blk_desc_len bytes past the end
  * of the 10-byte-command mode header, i.e. just after any block
  * descriptors.  The two-byte length is decoded with scsi_2btoul().  No
  * bounds checking is done against the buffer size.
  */
 static __inline void *
 find_mode_page_10(struct scsi_mode_header_10 *mode_header)
 {
 	u_int8_t *after_header;
 
 	/* First byte following the fixed-size header. */
 	after_header = (u_int8_t *)(mode_header + 1);
 
 	return (after_header + scsi_2btoul(mode_header->blk_desc_len));
 }
 
 __END_DECLS
 
 #endif /*_SCSI_SCSI_ALL_H*/
Index: projects/nfsv42/sys/dev/nvme/nvme_ctrlr.c
===================================================================
--- projects/nfsv42/sys/dev/nvme/nvme_ctrlr.c	(revision 350367)
+++ projects/nfsv42/sys/dev/nvme/nvme_ctrlr.c	(revision 350368)
@@ -1,1418 +1,1418 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (C) 2012-2016 Intel Corporation
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_cam.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/buf.h>
 #include <sys/bus.h>
 #include <sys/conf.h>
 #include <sys/ioccom.h>
 #include <sys/proc.h>
 #include <sys/smp.h>
 #include <sys/uio.h>
 #include <sys/endian.h>
 
 #include <dev/pci/pcireg.h>
 #include <dev/pci/pcivar.h>
 
 #include "nvme_private.h"
 
 #define B4_CHK_RDY_DELAY_MS	2300		/* work around controller bug */
 
 static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
 						struct nvme_async_event_request *aer);
 static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr);
 
 static int
 nvme_ctrlr_allocate_bar(struct nvme_controller *ctrlr)
 {
 
 	ctrlr->resource_id = PCIR_BAR(0);
 
 	ctrlr->resource = bus_alloc_resource_any(ctrlr->dev, SYS_RES_MEMORY,
 	    &ctrlr->resource_id, RF_ACTIVE);
 
 	if(ctrlr->resource == NULL) {
 		nvme_printf(ctrlr, "unable to allocate pci resource\n");
 		return (ENOMEM);
 	}
 
 	ctrlr->bus_tag = rman_get_bustag(ctrlr->resource);
 	ctrlr->bus_handle = rman_get_bushandle(ctrlr->resource);
 	ctrlr->regs = (struct nvme_registers *)ctrlr->bus_handle;
 
 	/*
 	 * The NVMe spec allows for the MSI-X table to be placed behind
 	 *  BAR 4/5, separate from the control/doorbell registers.  Always
 	 *  try to map this bar, because it must be mapped prior to calling
 	 *  pci_alloc_msix().  If the table isn't behind BAR 4/5,
 	 *  bus_alloc_resource() will just return NULL which is OK.
 	 */
 	ctrlr->bar4_resource_id = PCIR_BAR(4);
 	ctrlr->bar4_resource = bus_alloc_resource_any(ctrlr->dev, SYS_RES_MEMORY,
 	    &ctrlr->bar4_resource_id, RF_ACTIVE);
 
 	return (0);
 }
 
 /*
  * Construct the admin queue pair (qpair ID 0, vector 0).
  *
  * The queue depth defaults to NVME_ADMIN_ENTRIES and may be overridden
  * with the hw.nvme.admin_entries tunable.  Returns whatever
  * nvme_qpair_construct() returns.
  */
 static int
 nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr)
 {
 	uint32_t	depth;
 
 	depth = NVME_ADMIN_ENTRIES;
 	TUNABLE_INT_FETCH("hw.nvme.admin_entries", &depth);
 
 	/* An out-of-range override is reported and replaced by the default. */
 	if (depth < NVME_MIN_ADMIN_ENTRIES ||
 	    depth > NVME_MAX_ADMIN_ENTRIES) {
 		nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d "
 		    "specified\n", depth);
 		depth = NVME_ADMIN_ENTRIES;
 	}
 
 	return (nvme_qpair_construct(&ctrlr->adminq,
 	    0, /* qpair ID */
 	    0, /* vector */
 	    depth,
 	    NVME_ADMIN_TRACKERS,
 	    ctrlr));
 }
 
 /*
  * Allocate ctrlr->ioq and construct one I/O qpair for each of
  * ctrlr->num_io_queues.
  *
  * Queue depth and tracker count start from the NVME_IO_ENTRIES and
  * NVME_IO_TRACKERS defaults, may be overridden through the
  * hw.nvme.io_entries and hw.nvme.io_trackers tunables, and are then
  * clamped as described in the comments below.
  *
  * Returns 0 on success, or the first non-zero value returned by
  * nvme_qpair_construct().  On that error path the function returns
  * immediately; the ioq array and any qpairs constructed so far are left
  * in place.
  */
 static int
 nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr)
 {
 	struct nvme_qpair	*qpair;
 	uint32_t		cap_lo;
 	uint16_t		mqes;
 	int			i, error, num_entries, num_trackers;
 
 	num_entries = NVME_IO_ENTRIES;
 	TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries);
 
 	/*
 	 * NVMe spec sets a hard limit of 64K max entries, but
 	 *  devices may specify a smaller limit, so we need to check
 	 *  the MQES field in the capabilities register.
 	 */
 	cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
 	mqes = NVME_CAP_LO_MQES(cap_lo);
 	/* The "+ 1" presumably reflects MQES being zero-based - TODO confirm. */
 	num_entries = min(num_entries, mqes + 1);
 
 	num_trackers = NVME_IO_TRACKERS;
 	TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers);
 
 	/* Clamp the tracker count to [NVME_MIN_IO_TRACKERS, NVME_MAX_IO_TRACKERS]. */
 	num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS);
 	num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS);
 	/*
 	 * No need to have more trackers than entries in the submit queue.
 	 *  Note also that for a queue size of N, we can only have (N-1)
 	 *  commands outstanding, hence the "-1" here.
 	 */
 	num_trackers = min(num_trackers, (num_entries-1));
 
 	/*
 	 * Our best estimate for the maximum number of I/Os that we should
 	 * normally have in flight at one time. This should be viewed as a hint,
 	 * not a hard limit and will need to be revisited when the upper layers
 	 * of the storage system grows multi-queue support.
 	 */
 	ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4;
 
 	/*
 	 * This was calculated previously when setting up interrupts, but
 	 *  a controller could theoretically support fewer I/O queues than
 	 *  MSI-X vectors.  So calculate again here just to be safe.
 	 */
 	ctrlr->num_cpus_per_ioq = howmany(mp_ncpus, ctrlr->num_io_queues);
 
 	/* Zeroed array of qpairs; M_WAITOK is passed, so no NULL check follows. */
 	ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair),
 	    M_NVME, M_ZERO | M_WAITOK);
 
 	for (i = 0; i < ctrlr->num_io_queues; i++) {
 		qpair = &ctrlr->ioq[i];
 
 		/*
 		 * Admin queue has ID=0. IO queues start at ID=1 -
 		 *  hence the 'i+1' here.
 		 *
 		 * For I/O queues, use the controller-wide max_xfer_size
 		 *  calculated in nvme_attach().
 		 */
 		error = nvme_qpair_construct(qpair,
 				     i+1, /* qpair ID */
 				     ctrlr->msix_enabled ? i+1 : 0, /* vector */
 				     num_entries,
 				     num_trackers,
 				     ctrlr);
 		if (error)
 			return (error);
 
 		/*
 		 * Do not bother binding interrupts if we only have one I/O
 		 *  interrupt thread for this controller.
 		 */
 		/* The return value of bus_bind_intr() is not checked. */
 		if (ctrlr->num_io_queues > 1)
 			bus_bind_intr(ctrlr->dev, qpair->res,
 			    i * ctrlr->num_cpus_per_ioq);
 	}
 
 	return (0);
 }
 
 /*
  * Mark the controller failed, fail the admin queue and every I/O queue,
  * then notify the registered consumers.
  */
 static void
 nvme_ctrlr_fail(struct nvme_controller *ctrlr)
 {
 	int qid;
 
 	ctrlr->is_failed = TRUE;
 	nvme_qpair_fail(&ctrlr->adminq);
 
 	/* The I/O queue array is only walked when it exists. */
 	if (ctrlr->ioq != NULL) {
 		qid = 0;
 		while (qid < ctrlr->num_io_queues) {
 			nvme_qpair_fail(&ctrlr->ioq[qid]);
 			qid++;
 		}
 	}
 
 	nvme_notify_fail_consumers(ctrlr);
 }
 
 /*
  * Queue a request for deferred failure handling: append it to
  * ctrlr->fail_req under ctrlr->lock, then schedule fail_req_task on the
  * controller's taskqueue.
  */
 void
 nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr,
     struct nvme_request *req)
 {
 
 	mtx_lock(&ctrlr->lock);
 	STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
 	mtx_unlock(&ctrlr->lock);
 	/* Enqueued after dropping the lock. */
 	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
 }
 
 /*
  * Taskqueue handler: drain ctrlr->fail_req, completing every queued
  * request with NVME_SCT_GENERIC / NVME_SC_ABORTED_BY_REQUEST.
  */
 static void
 nvme_ctrlr_fail_req_task(void *arg, int pending)
 {
 	struct nvme_controller	*ctrlr = arg;
 	struct nvme_request	*failed;
 
 	mtx_lock(&ctrlr->lock);
 	for (;;) {
 		failed = STAILQ_FIRST(&ctrlr->fail_req);
 		if (failed == NULL)
 			break;
 		STAILQ_REMOVE_HEAD(&ctrlr->fail_req, stailq);
 
 		/* The completion runs with the controller lock dropped. */
 		mtx_unlock(&ctrlr->lock);
 		nvme_qpair_manual_complete_request(failed->qpair, failed,
 		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST);
 		mtx_lock(&ctrlr->lock);
 	}
 	mtx_unlock(&ctrlr->lock);
 }
 
 static int
 nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val)
 {
 	int ms_waited;
 	uint32_t csts;
 
 	csts = nvme_mmio_read_4(ctrlr, csts);
 
 	ms_waited = 0;
 	while (((csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK) != desired_val) {
 		if (ms_waited++ > ctrlr->ready_timeout_in_ms) {
 			nvme_printf(ctrlr, "controller ready did not become %d "
 			    "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms);
 			return (ENXIO);
 		}
 		DELAY(1000);
 		csts = nvme_mmio_read_4(ctrlr, csts);
 	}
 
 	return (0);
 }
 
 /*
  * Disable the controller: clear CC.EN and wait for CSTS.RDY to read 0.
  *
  * Returns 0 once the controller reports not-ready, or the error returned
  * by nvme_ctrlr_wait_for_ready() if one of the waits times out.
  */
 static int
 nvme_ctrlr_disable(struct nvme_controller *ctrlr)
 {
 	uint32_t cc;
 	uint32_t csts;
 	uint8_t  en, rdy;
 	int err;
 
 	/* Snapshot the current enable and ready bits. */
 	cc = nvme_mmio_read_4(ctrlr, cc);
 	csts = nvme_mmio_read_4(ctrlr, csts);
 
 	en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
 	rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
 
 	/*
 	 * Per 3.1.5 in NVME 1.3 spec, transitioning CC.EN from 0 to 1
 	 * when CSTS.RDY is 1 or transitioning CC.EN from 1 to 0 when
 	 * CSTS.RDY is 0 "has undefined results" So make sure that CSTS.RDY
 	 * isn't the desired value. Short circuit if we're already disabled.
 	 */
 	if (en == 1) {
 		if (rdy == 0) {
 			/* EN == 1, wait for  RDY == 1 or fail */
 			err = nvme_ctrlr_wait_for_ready(ctrlr, 1);
 			if (err != 0)
 				return (err);
 		}
 	} else {
 		/* EN == 0 already wait for RDY == 0 */
 		if (rdy == 0)
 			return (0);
 		else
 			return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
 	}
 
 	/*
 	 * NOTE(review): the mask is applied without NVME_CC_REG_EN_SHIFT,
 	 * unlike the extraction above; equivalent only if the shift is 0 -
 	 * confirm against the register definitions.
 	 */
 	cc &= ~NVME_CC_REG_EN_MASK;
 	nvme_mmio_write_4(ctrlr, cc, cc);
 	/*
 	 * Some drives have issues with accessing the mmio after we
 	 * disable, so delay for a bit after we write the bit to
 	 * cope with these issues.
 	 */
 	if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY)
 		pause("nvmeR", B4_CHK_RDY_DELAY_MS * hz / 1000);
 	return (nvme_ctrlr_wait_for_ready(ctrlr, 0));
 }
 
 /*
  * Enable the controller: program the admin queue base addresses (ASQ/ACQ)
  * and sizes (AQA), write CC with EN set, and wait for CSTS.RDY to read 1.
  *
  * Returns 0 once the controller reports ready, or the error returned by
  * nvme_ctrlr_wait_for_ready() if one of the waits times out.
  */
 static int
 nvme_ctrlr_enable(struct nvme_controller *ctrlr)
 {
 	uint32_t	cc;
 	uint32_t	csts;
 	uint32_t	aqa;
 	uint32_t	qsize;
 	uint8_t		en, rdy;
 	int		err;
 
 	/* Snapshot the current enable and ready bits. */
 	cc = nvme_mmio_read_4(ctrlr, cc);
 	csts = nvme_mmio_read_4(ctrlr, csts);
 
 	en = (cc >> NVME_CC_REG_EN_SHIFT) & NVME_CC_REG_EN_MASK;
 	rdy = (csts >> NVME_CSTS_REG_RDY_SHIFT) & NVME_CSTS_REG_RDY_MASK;
 
 	/*
 	 * See note in nvme_ctrlr_disable. Short circuit if we're already enabled.
 	 */
 	if (en == 1) {
 		if (rdy == 1)
 			return (0);
 		else
 			return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
 	} else {
 		/* EN == 0 already wait for RDY == 0 or fail */
 		err = nvme_ctrlr_wait_for_ready(ctrlr, 0);
 		if (err != 0)
 			return (err);
 	}
 
 	/* Each admin queue register write is followed by a 5 ms delay. */
 	nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr);
 	DELAY(5000);
 	nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr);
 	DELAY(5000);
 
 	/* acqs and asqs are 0-based. */
 	qsize = ctrlr->adminq.num_entries - 1;
 
 	/* The zero store below is immediately overwritten by the next line. */
 	aqa = 0;
 	aqa = (qsize & NVME_AQA_REG_ACQS_MASK) << NVME_AQA_REG_ACQS_SHIFT;
 	aqa |= (qsize & NVME_AQA_REG_ASQS_MASK) << NVME_AQA_REG_ASQS_SHIFT;
 	nvme_mmio_write_4(ctrlr, aqa, aqa);
 	DELAY(5000);
 
 	/* Initialization values for CC */
 	cc = 0;
 	cc |= 1 << NVME_CC_REG_EN_SHIFT;
 	cc |= 0 << NVME_CC_REG_CSS_SHIFT;
 	cc |= 0 << NVME_CC_REG_AMS_SHIFT;
 	cc |= 0 << NVME_CC_REG_SHN_SHIFT;
 	cc |= 6 << NVME_CC_REG_IOSQES_SHIFT; /* SQ entry size == 64 == 2^6 */
 	cc |= 4 << NVME_CC_REG_IOCQES_SHIFT; /* CQ entry size == 16 == 2^4 */
 
 	/* This evaluates to 0, which is according to spec. */
 	/* (That holds only while PAGE_SIZE is below 8192.) */
 	cc |= (PAGE_SIZE >> 13) << NVME_CC_REG_MPS_SHIFT;
 
 	nvme_mmio_write_4(ctrlr, cc, cc);
 
 	return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
 }
 
 int
 nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
 {
 	int i, err;
 
 	nvme_admin_qpair_disable(&ctrlr->adminq);
 	/*
 	 * I/O queues are not allocated before the initial HW
 	 *  reset, so do not try to disable them.  Use is_initialized
 	 *  to determine if this is the initial HW reset.
 	 */
 	if (ctrlr->is_initialized) {
 		for (i = 0; i < ctrlr->num_io_queues; i++)
 			nvme_io_qpair_disable(&ctrlr->ioq[i]);
 	}
 
 	DELAY(100*1000);
 
 	err = nvme_ctrlr_disable(ctrlr);
 	if (err != 0)
 		return err;
 	return (nvme_ctrlr_enable(ctrlr));
 }
 
 /*
  * Request an asynchronous controller reset by scheduling reset_task on
  * the controller's taskqueue.
  */
 void
 nvme_ctrlr_reset(struct nvme_controller *ctrlr)
 {
 
 	/*
 	 * Only the caller that flips is_resetting from 0 to 1 goes on; if
 	 * the flag was already set, a reset is in progress and another one
 	 * is not needed.
 	 */
 	if (atomic_cmpset_32(&ctrlr->is_resetting, 0, 1) == 0)
 		return;
 
 	/*
 	 * A failed controller is not reset either.  Note that
 	 * is_resetting is left set on this path.
 	 */
 	if (ctrlr->is_failed)
 		return;
 
 	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
 }
 
 static int
 nvme_ctrlr_identify(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
 
 	status.done = 0;
 	nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata,
 	    nvme_completion_poll_cb, &status);
 	while (!atomic_load_acq_int(&status.done))
 		pause("nvme", 1);
 	if (nvme_completion_is_error(&status.cpl)) {
 		nvme_printf(ctrlr, "nvme_identify_controller failed!\n");
 		return (ENXIO);
 	}
 
 	/* Convert data to host endian */
 	nvme_controller_data_swapbytes(&ctrlr->cdata);
 
 	/*
 	 * Use MDTS to ensure our default max_xfer_size doesn't exceed what the
 	 *  controller supports.
 	 */
 	if (ctrlr->cdata.mdts > 0)
 		ctrlr->max_xfer_size = min(ctrlr->max_xfer_size,
 		    ctrlr->min_page_size * (1 << (ctrlr->cdata.mdts)));
 
 	return (0);
 }
 
 static int
 nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
 	int					cq_allocated, sq_allocated;
 
 	status.done = 0;
 	nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues,
 	    nvme_completion_poll_cb, &status);
 	while (!atomic_load_acq_int(&status.done))
 		pause("nvme", 1);
 	if (nvme_completion_is_error(&status.cpl)) {
 		nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n");
 		return (ENXIO);
 	}
 
 	/*
 	 * Data in cdw0 is 0-based.
 	 * Lower 16-bits indicate number of submission queues allocated.
 	 * Upper 16-bits indicate number of completion queues allocated.
 	 */
 	sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1;
 	cq_allocated = (status.cpl.cdw0 >> 16) + 1;
 
 	/*
 	 * Controller may allocate more queues than we requested,
 	 *  so use the minimum of the number requested and what was
 	 *  actually allocated.
 	 */
 	ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated);
 	ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated);
 
 	return (0);
 }
 
 static int
 nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
 	struct nvme_qpair			*qpair;
 	int					i;
 
 	for (i = 0; i < ctrlr->num_io_queues; i++) {
 		qpair = &ctrlr->ioq[i];
 
 		status.done = 0;
 		nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, qpair->vector,
 		    nvme_completion_poll_cb, &status);
 		while (!atomic_load_acq_int(&status.done))
 			pause("nvme", 1);
 		if (nvme_completion_is_error(&status.cpl)) {
 			nvme_printf(ctrlr, "nvme_create_io_cq failed!\n");
 			return (ENXIO);
 		}
 
 		status.done = 0;
 		nvme_ctrlr_cmd_create_io_sq(qpair->ctrlr, qpair,
 		    nvme_completion_poll_cb, &status);
 		while (!atomic_load_acq_int(&status.done))
 			pause("nvme", 1);
 		if (nvme_completion_is_error(&status.cpl)) {
 			nvme_printf(ctrlr, "nvme_create_io_sq failed!\n");
 			return (ENXIO);
 		}
 	}
 
 	return (0);
 }
 
 static int
 nvme_ctrlr_destroy_qpairs(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
 	struct nvme_qpair			*qpair;
 
 	for (int i = 0; i < ctrlr->num_io_queues; i++) {
 		qpair = &ctrlr->ioq[i];
 
 		status.done = 0;
 		nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair,
 		    nvme_completion_poll_cb, &status);
 		while (!atomic_load_acq_int(&status.done))
 			pause("nvme", 1);
 		if (nvme_completion_is_error(&status.cpl)) {
 			nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n");
 			return (ENXIO);
 		}
 
 		status.done = 0;
 		nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair,
 		    nvme_completion_poll_cb, &status);
 		while (!atomic_load_acq_int(&status.done))
 			pause("nvme", 1);
 		if (nvme_completion_is_error(&status.cpl)) {
 			nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n");
 			return (ENXIO);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Construct a namespace object for each namespace reported in cdata.nn,
  * up to NVME_MAX_NAMESPACES.  Always returns 0.
  */
 static int
 nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr)
 {
 	uint32_t	nsid;
 
 	/* Namespace IDs are 1-based; ctrlr->ns[] is indexed from 0. */
 	for (nsid = 1; nsid <= min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES);
 	    nsid++)
 		nvme_ns_construct(&ctrlr->ns[nsid - 1], nsid, ctrlr);
 
 	return (0);
 }
 
 /*
  * Tell whether page_id is one of the log pages this file fetches after an
  * async event (error, health, firmware slot, changed namespace).
  */
 static boolean_t
 is_log_page_id_valid(uint8_t page_id)
 {
 	boolean_t known;
 
 	switch (page_id) {
 	case NVME_LOG_ERROR:
 	case NVME_LOG_HEALTH_INFORMATION:
 	case NVME_LOG_FIRMWARE_SLOT:
 	case NVME_LOG_CHANGED_NAMESPACE:
 		known = TRUE;
 		break;
 	default:
 		known = FALSE;
 		break;
 	}
 
 	return (known);
 }
 
 /*
  * Return the number of bytes to fetch for log page page_id, or 0 for a
  * page this function does not know.
  */
 static uint32_t
 nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id)
 {
 
 	switch (page_id) {
 	case NVME_LOG_ERROR:
 		/* elpe + 1 entries, capped at NVME_MAX_AER_LOG_SIZE bytes. */
 		return (min(
 		    sizeof(struct nvme_error_information_entry) *
 		    (ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE));
 	case NVME_LOG_HEALTH_INFORMATION:
 		return (sizeof(struct nvme_health_information_page));
 	case NVME_LOG_FIRMWARE_SLOT:
 		return (sizeof(struct nvme_firmware_page));
 	case NVME_LOG_CHANGED_NAMESPACE:
 		return (sizeof(struct nvme_ns_list));
 	default:
 		return (0);
 	}
 }
 
 /*
  * Print one message for every critical-warning bit set in state.  Bits
  * covered by NVME_CRIT_WARN_ST_RESERVED_MASK are reported together with
  * the raw state value.
  */
 static void
 nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr,
     uint8_t state)
 {
 
 	if ((state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE) != 0) {
 		nvme_printf(ctrlr, "available spare space below threshold\n");
 	}
 	if ((state & NVME_CRIT_WARN_ST_TEMPERATURE) != 0) {
 		nvme_printf(ctrlr, "temperature above threshold\n");
 	}
 	if ((state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY) != 0) {
 		nvme_printf(ctrlr, "device reliability degraded\n");
 	}
 	if ((state & NVME_CRIT_WARN_ST_READ_ONLY) != 0) {
 		nvme_printf(ctrlr, "media placed in read only mode\n");
 	}
 	if ((state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP) != 0) {
 		nvme_printf(ctrlr, "volatile memory backup device failed\n");
 	}
 	if ((state & NVME_CRIT_WARN_ST_RESERVED_MASK) != 0) {
 		nvme_printf(ctrlr,
 		    "unknown critical warning(s): state = 0x%02x\n", state);
 	}
 }
 
 static void
 nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl)
 {
 	struct nvme_async_event_request		*aer = arg;
 	struct nvme_health_information_page	*health_info;
 	struct nvme_ns_list			*nsl;
 	struct nvme_error_information_entry	*err;
 	int i;
 
 	/*
 	 * If the log page fetch for some reason completed with an error,
 	 *  don't pass log page data to the consumers.  In practice, this case
 	 *  should never happen.
 	 */
 	if (nvme_completion_is_error(cpl))
 		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
 		    aer->log_page_id, NULL, 0);
 	else {
 		/* Convert data to host endian */
 		switch (aer->log_page_id) {
 		case NVME_LOG_ERROR:
 			err = (struct nvme_error_information_entry *)aer->log_page_buffer;
 			for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++)
 				nvme_error_information_entry_swapbytes(err++);
 			break;
 		case NVME_LOG_HEALTH_INFORMATION:
 			nvme_health_information_page_swapbytes(
 			    (struct nvme_health_information_page *)aer->log_page_buffer);
 			break;
 		case NVME_LOG_FIRMWARE_SLOT:
 			nvme_firmware_page_swapbytes(
 			    (struct nvme_firmware_page *)aer->log_page_buffer);
 			break;
 		case NVME_LOG_CHANGED_NAMESPACE:
 			nvme_ns_list_swapbytes(
 			    (struct nvme_ns_list *)aer->log_page_buffer);
 			break;
 		case INTEL_LOG_TEMP_STATS:
 			intel_log_temp_stats_swapbytes(
 			    (struct intel_log_temp_stats *)aer->log_page_buffer);
 			break;
 		default:
 			break;
 		}
 
 		if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) {
 			health_info = (struct nvme_health_information_page *)
 			    aer->log_page_buffer;
 			nvme_ctrlr_log_critical_warnings(aer->ctrlr,
 			    health_info->critical_warning);
 			/*
 			 * Critical warnings reported through the
 			 *  SMART/health log page are persistent, so
 			 *  clear the associated bits in the async event
 			 *  config so that we do not receive repeated
 			 *  notifications for the same event.
 			 */
 			aer->ctrlr->async_event_config &=
 			    ~health_info->critical_warning;
 			nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr,
 			    aer->ctrlr->async_event_config, NULL, NULL);
 		} else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE &&
 		    !nvme_use_nvd) {
 			nsl = (struct nvme_ns_list *)aer->log_page_buffer;
 			for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) {
 				if (nsl->ns[i] > NVME_MAX_NAMESPACES)
 					break;
 				nvme_notify_ns(aer->ctrlr, nsl->ns[i]);
 			}
 		}
 
 
 		/*
 		 * Pass the cpl data from the original async event completion,
 		 *  not the log page fetch.
 		 */
 		nvme_notify_async_consumers(aer->ctrlr, &aer->cpl,
 		    aer->log_page_id, aer->log_page_buffer, aer->log_page_size);
 	}
 
 	/*
 	 * Repost another asynchronous event request to replace the one
 	 *  that just completed.
 	 */
 	nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
 }
 
 /*
  * Completion callback for an asynchronous event request.
  *
  * arg is the nvme_async_event_request that completed.  For a log page
  * accepted by is_log_page_id_valid() the page is fetched first and the
  * consumers are notified from nvme_ctrlr_async_event_log_page_cb();
  * otherwise the consumers are notified here without page data and a new
  * async event request is submitted.
  */
 static void
 nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl)
 {
 	struct nvme_async_event_request	*aer = arg;
 
 	if (nvme_completion_is_error(cpl)) {
 		/*
 		 *  Do not retry failed async event requests.  This avoids
 		 *  infinite loops where a new async event request is submitted
 		 *  to replace the one just failed, only to fail again and
 		 *  perpetuate the loop.
 		 */
 		return;
 	}
 
 	/* Associated log page is in bits 23:16 of completion entry dw0. */
 	aer->log_page_id = (cpl->cdw0 & 0xFF0000) >> 16;
 
 	/* Printed type is cdw0 bits 1:0, info is cdw0 bits 15:8. */
 	nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x,"
 	    " page 0x%02x)\n", (cpl->cdw0 & 0x03), (cpl->cdw0 & 0xFF00) >> 8,
 	    aer->log_page_id);
 
 	if (is_log_page_id_valid(aer->log_page_id)) {
 		aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr,
 		    aer->log_page_id);
 		/* Keep the event completion for the log page callback. */
 		memcpy(&aer->cpl, cpl, sizeof(*cpl));
 		nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id,
 		    NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer,
 		    aer->log_page_size, nvme_ctrlr_async_event_log_page_cb,
 		    aer);
 		/* Wait to notify consumers until after log page is fetched. */
 	} else {
 		nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id,
 		    NULL, 0);
 
 		/*
 		 * Repost another asynchronous event request to replace the one
 		 *  that just completed.
 		 */
 		nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer);
 	}
 }
 
 /*
  * Bind aer to ctrlr, build an ASYNC EVENT REQUEST command for it and
  * submit the command on the admin queue.  nvme_ctrlr_async_event_cb() is
  * the completion callback.
  */
 static void
 nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr,
     struct nvme_async_event_request *aer)
 {
 	struct nvme_request *request;
 
 	aer->ctrlr = ctrlr;
 	request = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer);
 
 	/*
 	 * An async event request stays outstanding until an event occurs,
 	 * so the request timeout is turned off for it.
 	 */
 	request->timeout = FALSE;
 	request->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST;
 
 	aer->req = request;
 	nvme_ctrlr_submit_admin_request(ctrlr, request);
 }
 
 /*
  * Select which asynchronous events the controller should report, send
  * that configuration to the controller, and submit the initial set of
  * async event requests.
  */
 static void
 nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status	status;
 	struct nvme_async_event_request		*aer;
 	uint32_t				i;
 
 	ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE |
 	    NVME_CRIT_WARN_ST_DEVICE_RELIABILITY |
 	    NVME_CRIT_WARN_ST_READ_ONLY |
 	    NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP;
 	/*
 	 * NOTE(review): 0x300 sets bits 8 and 9, presumably the namespace
 	 * attribute and firmware activation notices added in NVMe 1.2 -
 	 * confirm against the spec.
 	 */
 	if (ctrlr->cdata.ver >= NVME_REV(1, 2))
 		ctrlr->async_event_config |= 0x300;
 
 	/* Query the temperature threshold feature and wait for the result. */
 	status.done = 0;
 	nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD,
 	    0, NULL, 0, nvme_completion_poll_cb, &status);
 	while (!atomic_load_acq_int(&status.done))
 		pause("nvme", 1);
 	/*
 	 * An error, or a low 16 bits of cdw0 equal to 0xFFFF or 0x0000, is
 	 * treated as "not supported"; otherwise temperature events are
 	 * enabled too.
 	 */
 	if (nvme_completion_is_error(&status.cpl) ||
 	    (status.cpl.cdw0 & 0xFFFF) == 0xFFFF ||
 	    (status.cpl.cdw0 & 0xFFFF) == 0x0000) {
 		nvme_printf(ctrlr, "temperature threshold not supported\n");
 	} else
 		ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE;
 
 	/* Sent without a completion callback. */
 	nvme_ctrlr_cmd_set_async_event_config(ctrlr,
 	    ctrlr->async_event_config, NULL, NULL);
 
 	/* aerl is a zero-based value, so we need to add 1 here. */
 	ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1));
 
 	for (i = 0; i < ctrlr->num_aers; i++) {
 		aer = &ctrlr->aer[i];
 		nvme_ctrlr_construct_and_submit_aer(ctrlr, aer);
 	}
 }
 
 /*
  * Load the interrupt coalescing time and threshold (both default to 0,
  * overridable with the hw.nvme.int_coal_time and
  * hw.nvme.int_coal_threshold tunables) and send them to the controller.
  */
 static void
 nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr)
 {
 
 	/* Defaults first, then let the tunables override them. */
 	ctrlr->int_coal_time = 0;
 	ctrlr->int_coal_threshold = 0;
 
 	TUNABLE_INT_FETCH("hw.nvme.int_coal_time",
 	    &ctrlr->int_coal_time);
 	TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold",
 	    &ctrlr->int_coal_threshold);
 
 	/* Sent without a completion callback. */
 	nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time,
 	    ctrlr->int_coal_threshold, NULL, NULL);
 }
 
 /*
  * Bring the controller into service after it has been enabled: reset and
  * enable the queue pairs, identify the controller, create the I/O queues
  * on the device, construct the namespaces and configure async events and
  * interrupt coalescing.
  *
  * Used both for initial start-up and after a reset (is_resetting set).
  * Any failing step marks the controller failed via nvme_ctrlr_fail() and
  * returns early.
  */
 static void
 nvme_ctrlr_start(void *ctrlr_arg)
 {
 	struct nvme_controller *ctrlr = ctrlr_arg;
 	uint32_t old_num_io_queues;
 	int i;
 
 	/*
 	 * Only reset adminq here when we are restarting the
 	 *  controller after a reset.  During initialization,
 	 *  we have already submitted admin commands to get
 	 *  the number of I/O queues supported, so cannot reset
 	 *  the adminq again here.
 	 */
 	if (ctrlr->is_resetting) {
 		nvme_qpair_reset(&ctrlr->adminq);
 	}
 
 	for (i = 0; i < ctrlr->num_io_queues; i++)
 		nvme_qpair_reset(&ctrlr->ioq[i]);
 
 	nvme_admin_qpair_enable(&ctrlr->adminq);
 
 	if (nvme_ctrlr_identify(ctrlr) != 0) {
 		nvme_ctrlr_fail(ctrlr);
 		return;
 	}
 
 	/*
 	 * The number of qpairs is determined during controller initialization,
 	 *  including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the
 	 *  HW limit.  We call SET_FEATURES again here so that it gets called
 	 *  after any reset for controllers that depend on the driver to
 	 *  explicitly specify how many queues it will use.  This value should
 	 *  never change between resets, so panic if somehow that does happen.
 	 */
 	if (ctrlr->is_resetting) {
 		old_num_io_queues = ctrlr->num_io_queues;
 		if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
 			nvme_ctrlr_fail(ctrlr);
 			return;
 		}
 
 		if (old_num_io_queues != ctrlr->num_io_queues) {
 			panic("num_io_queues changed from %u to %u",
 			      old_num_io_queues, ctrlr->num_io_queues);
 		}
 	}
 
 	if (nvme_ctrlr_create_qpairs(ctrlr) != 0) {
 		nvme_ctrlr_fail(ctrlr);
 		return;
 	}
 
 	if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) {
 		nvme_ctrlr_fail(ctrlr);
 		return;
 	}
 
 	nvme_ctrlr_configure_aer(ctrlr);
 	nvme_ctrlr_configure_int_coalescing(ctrlr);
 
 	/* I/O queues are enabled last, after all admin-side setup. */
 	for (i = 0; i < ctrlr->num_io_queues; i++)
 		nvme_io_qpair_enable(&ctrlr->ioq[i]);
 }
 
 /*
  * Config intrhook callback that performs the initial controller start.
  *
  * arg is the nvme_controller.  The admin queue is reset and enabled, the
  * number of I/O queues is negotiated, the I/O qpairs are constructed and
  * nvme_ctrlr_start() is run; if either preparatory step fails the
  * controller is marked failed instead.
  */
 void
 nvme_ctrlr_start_config_hook(void *arg)
 {
 	struct nvme_controller *ctrlr = arg;
 
 	nvme_qpair_reset(&ctrlr->adminq);
 	nvme_admin_qpair_enable(&ctrlr->adminq);
 
 	if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
 	    nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
 		nvme_ctrlr_start(ctrlr);
 	else
 		nvme_ctrlr_fail(ctrlr);
 
 	/*
 	 * The steps below run on both the success and the failure path:
 	 * sysctls are set up, the hook is removed, and the controller is
 	 * flagged initialized and announced.
 	 */
 	nvme_sysctl_initialize_ctrlr(ctrlr);
 	config_intrhook_disestablish(&ctrlr->config_hook);
 
 	ctrlr->is_initialized = 1;
 	nvme_notify_new_controller(ctrlr);
 }
 
 /*
  * Taskqueue handler scheduled by nvme_ctrlr_reset(): reset the hardware,
  * then either restart the controller or mark it failed, and finally
  * clear is_resetting.
  */
 static void
 nvme_ctrlr_reset_task(void *arg, int pending)
 {
 	struct nvme_controller	*ctrlr = arg;
 	int			hw_error;
 
 	nvme_printf(ctrlr, "resetting controller\n");
 	hw_error = nvme_ctrlr_hw_reset(ctrlr);
 
 	/*
 	 * pause() rather than DELAY(): sleeping yields this CPU to any nvme
 	 * interrupt handler that was blocked on a qpair lock, and all nvme
 	 * interrupts should have completed before the controller is
 	 * restarted.
 	 *
 	 * XXX - any way to guarantee the interrupt handlers have quiesced?
 	 */
 	pause("nvmereset", hz / 10);
 
 	if (hw_error != 0)
 		nvme_ctrlr_fail(ctrlr);
 	else
 		nvme_ctrlr_start(ctrlr);
 
 	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
 }
 
 /*
  * Process completions on the admin queue, then on every I/O queue whose
  * cpl member is set.
  */
 void
 nvme_ctrlr_poll(struct nvme_controller *ctrlr)
 {
 	int qid;
 
 	nvme_qpair_process_completions(&ctrlr->adminq);
 
 	for (qid = 0; qid < ctrlr->num_io_queues; qid++) {
 		/* Skip queues that do not exist or have no cpl yet. */
 		if (!ctrlr->ioq || !ctrlr->ioq[qid].cpl)
 			continue;
 		nvme_qpair_process_completions(&ctrlr->ioq[qid]);
 	}
 }
 
 /*
  * Poll the single-vector interrupt case: num_io_queues will be 1 and
  * there's only a single vector. While we're polling, we mask further
  * interrupts in the controller.
  *
  * arg is the nvme_controller registered with bus_setup_intr().
  */
 void
 nvme_ctrlr_intx_handler(void *arg)
 {
 	struct nvme_controller *ctrlr = arg;
 
 	/* Write 1 to INTMS (mask set), poll, then 1 to INTMC (mask clear). */
 	nvme_mmio_write_4(ctrlr, intms, 1);
 	nvme_ctrlr_poll(ctrlr);
 	nvme_mmio_write_4(ctrlr, intmc, 1);
 }
 
 /*
  * Configure the controller for a single shared (INTx-style) interrupt:
  * one I/O queue serving all CPUs, with nvme_ctrlr_intx_handler() as the
  * handler.
  *
  * Returns 0 on success, or ENOMEM if the IRQ resource cannot be allocated
  * or the handler cannot be installed.
  */
 static int
 nvme_ctrlr_configure_intx(struct nvme_controller *ctrlr)
 {
 
 	ctrlr->msix_enabled = 0;
 	ctrlr->num_io_queues = 1;
 	ctrlr->num_cpus_per_ioq = mp_ncpus;
 	ctrlr->rid = 0;
 	ctrlr->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ,
 	    &ctrlr->rid, RF_SHAREABLE | RF_ACTIVE);
 
 	if (ctrlr->res == NULL) {
 		nvme_printf(ctrlr, "unable to allocate shared IRQ\n");
 		return (ENOMEM);
 	}
 
 	/*
 	 * Check the return value of bus_setup_intr() itself rather than
 	 * relying only on the cookie: a failed call is not required to
 	 * leave ctrlr->tag NULL.  The NULL test is kept as well so every
 	 * case the previous code caught is still caught.
 	 */
 	if (bus_setup_intr(ctrlr->dev, ctrlr->res,
 	    INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_ctrlr_intx_handler,
 	    ctrlr, &ctrlr->tag) != 0 || ctrlr->tag == NULL) {
 		nvme_printf(ctrlr, "unable to setup intx handler\n");
 		return (ENOMEM);
 	}
 
 	return (0);
 }
 
 /*
  * Completion callback for a passthrough command.
  *
  * arg is the nvme_pt_command.  Copies cdw0 and the status (with the bits
  * in NVME_STATUS_P_MASK cleared) into pt->cpl, then wakes the thread
  * sleeping in nvme_ctrlr_passthrough_cmd().
  */
 static void
 nvme_pt_done(void *arg, const struct nvme_completion *cpl)
 {
 	struct nvme_pt_command *pt = arg;
 	/* Take a copy: pt->driver_lock is cleared below while it is held. */
 	struct mtx *mtx = pt->driver_lock;
 	uint16_t status;
 
 	bzero(&pt->cpl, sizeof(pt->cpl));
 	pt->cpl.cdw0 = cpl->cdw0;
 
 	status = cpl->status;
 	status &= ~NVME_STATUS_P_MASK;
 	pt->cpl.status = status;
 
 	/*
 	 * A NULL driver_lock is the "done" signal the submitter loops on;
 	 * it is cleared and the wakeup issued under the mutex the submitter
 	 * sleeps with.
 	 */
 	mtx_lock(mtx);
 	pt->driver_lock = NULL;
 	wakeup(pt);
 	mtx_unlock(mtx);
 }
 
 /*
  * Submit a caller-built NVMe command and sleep until it completes.
  *
  * ctrlr          - controller to submit to
  * pt             - command and data buffer description; pt->cpl is filled
  *                  in by nvme_pt_done() on completion
  * nsid           - namespace ID (host byte order) stored in the command
  * is_user_buffer - non-zero if pt->buf has to be mapped with vmapbuf()
  *                  for the duration of the command
  * is_admin_cmd   - non-zero to submit as an admin request, zero to submit
  *                  as an I/O request
  *
  * Returns 0 once the command has completed (the NVMe status is reported
  * through pt->cpl, not the return value), EIO if the buffer is too large,
  * or EFAULT if the user buffer could not be mapped.
  */
 int
 nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr,
     struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer,
     int is_admin_cmd)
 {
 	struct nvme_request	*req;
 	struct mtx		*mtx;
 	struct buf		*buf = NULL;
 	int			ret = 0;
 	vm_offset_t		addr, end;
 
 	if (pt->len > 0) {
 		/*
 		 * vmapbuf calls vm_fault_quick_hold_pages which only maps full
 		 * pages. Ensure this request has fewer than MAXPHYS bytes when
 		 * extended to full pages.
 		 */
 		addr = (vm_offset_t)pt->buf;
 		end = round_page(addr + pt->len);
 		addr = trunc_page(addr);
 		if (end - addr > MAXPHYS)
 			return (EIO);
 
 		if (pt->len > ctrlr->max_xfer_size) {
 			nvme_printf(ctrlr, "pt->len (%d) "
 			    "exceeds max_xfer_size (%d)\n", pt->len,
 			    ctrlr->max_xfer_size);
 			return (EIO);
 		}
 		if (is_user_buffer) {
 			/*
 			 * Ensure the user buffer is wired for the duration of
 			 *  this passthrough command.
 			 */
 			PHOLD(curproc);
 			buf = uma_zalloc(pbuf_zone, M_WAITOK);
 			buf->b_data = pt->buf;
 			buf->b_bufsize = pt->len;
 			buf->b_iocmd = pt->is_read ? BIO_READ : BIO_WRITE;
 			if (vmapbuf(buf, 1) < 0) {
 				ret = EFAULT;
 				goto err;
 			}
 			req = nvme_allocate_request_vaddr(buf->b_data, pt->len,
 			    nvme_pt_done, pt);
 		} else
 			req = nvme_allocate_request_vaddr(pt->buf, pt->len,
 			    nvme_pt_done, pt);
 	} else
 		req = nvme_allocate_request_null(nvme_pt_done, pt);
 
 	/* Assume userspace already converted to little-endian */
 	req->cmd.opc = pt->cmd.opc;
 	req->cmd.fuse = pt->cmd.fuse;
 	req->cmd.rsvd2 = pt->cmd.rsvd2;
 	req->cmd.rsvd3 = pt->cmd.rsvd3;
 	req->cmd.cdw10 = pt->cmd.cdw10;
 	req->cmd.cdw11 = pt->cmd.cdw11;
 	req->cmd.cdw12 = pt->cmd.cdw12;
 	req->cmd.cdw13 = pt->cmd.cdw13;
 	req->cmd.cdw14 = pt->cmd.cdw14;
 	req->cmd.cdw15 = pt->cmd.cdw15;
 
 	req->cmd.nsid = htole32(nsid);
 
 	/*
 	 * pt->driver_lock doubles as the completion flag: nvme_pt_done()
 	 * clears it under the same pool mutex before issuing the wakeup.
 	 */
 	mtx = mtx_pool_find(mtxpool_sleep, pt);
 	pt->driver_lock = mtx;
 
 	if (is_admin_cmd)
 		nvme_ctrlr_submit_admin_request(ctrlr, req);
 	else
 		nvme_ctrlr_submit_io_request(ctrlr, req);
 
 	mtx_lock(mtx);
 	while (pt->driver_lock != NULL)
 		mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0);
 	mtx_unlock(mtx);
 
 	/*
 	 * Undo the successful vmapbuf().  Without this the held pages and
 	 * the kernel mapping of the user buffer are leaked on every
 	 * passthrough command that carries user data.  The vmapbuf()
 	 * failure path jumps straight to "err" and skips this, as nothing
 	 * was mapped there.
 	 */
 	if (buf != NULL)
 		vunmapbuf(buf);
 
 err:
 	if (buf != NULL) {
 		uma_zfree(pbuf_zone, buf);
 		PRELE(curproc);
 	}
 
 	return (ret);
 }
 
 /*
  * ioctl entry point of the controller character device.
  *
  * NVME_RESET_CONTROLLER requests a controller reset and returns 0;
  * NVME_PASSTHROUGH_CMD runs the nvme_pt_command in arg as an admin command
  * on a user buffer and returns its result; anything else yields ENOTTY.
  */
 static int
 nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag,
     struct thread *td)
 {
 	struct nvme_controller	*ctrlr = cdev->si_drv1;
 	struct nvme_pt_command	*pt;
 	int			error = 0;
 
 	switch (cmd) {
 	case NVME_RESET_CONTROLLER:
 		nvme_ctrlr_reset(ctrlr);
 		break;
 	case NVME_PASSTHROUGH_CMD:
 		pt = (struct nvme_pt_command *)arg;
 		error = nvme_ctrlr_passthrough_cmd(ctrlr, pt,
 		    le32toh(pt->cmd.nsid), 1 /* is_user_buffer */,
 		    1 /* is_admin_cmd */);
 		break;
 	default:
 		error = ENOTTY;
 		break;
 	}
 
 	return (error);
 }
 
 /*
  * Character device switch for the controller node; only an ioctl handler
  * is provided.
  */
 static struct cdevsw nvme_ctrlr_cdevsw = {
 	.d_version =	D_VERSION,
 	.d_flags =	0,
 	.d_ioctl =	nvme_ctrlr_ioctl
 };
 
 /*
  * Choose between MSI-X and INTx for the controller and size the I/O queue
  * set accordingly.
  *
  * Tunables consulted: hw.nvme.min_cpus_per_ioq, hw.nvme.per_cpu_io_queues
  * and hw.nvme.force_intx.  On the MSI-X path this fills in
  * num_cpus_per_ioq, num_io_queues and msix_enabled; every other outcome
  * ends in nvme_ctrlr_configure_intx().
  */
 static void
 nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr)
 {
 	device_t	pcidev = ctrlr->dev;
 	int		want_per_cpu = 1;
 	int		cpus_per_queue = 1;
 	int		probed, wanted, granted;
 
 	/* Clamp the requested CPUs-per-queue to [1, mp_ncpus]. */
 	TUNABLE_INT_FETCH("hw.nvme.min_cpus_per_ioq", &cpus_per_queue);
 	if (cpus_per_queue < 1)
 		cpus_per_queue = 1;
 	else if (cpus_per_queue > mp_ncpus)
 		cpus_per_queue = mp_ncpus;
 
 	/* Disabling per-CPU queues means one queue shared by every CPU. */
 	TUNABLE_INT_FETCH("hw.nvme.per_cpu_io_queues", &want_per_cpu);
 	if (want_per_cpu == 0)
 		cpus_per_queue = mp_ncpus;
 
 	ctrlr->force_intx = 0;
 	TUNABLE_INT_FETCH("hw.nvme.force_intx", &ctrlr->force_intx);
 
 	/*
 	 * The system may not be able to hand out a vector per CPU when many
 	 *  devices ask for them.  Probe by allocating as many vectors as we
 	 *  could possibly use (one per CPU for I/O plus one for the admin
 	 *  queue), note how many we were given, and release them again
 	 *  right away.
 	 */
 	probed = min(pci_msix_count(pcidev), mp_ncpus + 1);
 	if (pci_alloc_msix(pcidev, &probed) != 0)
 		probed = 0;
 	pci_release_msi(pcidev);
 
 	if (ctrlr->force_intx || probed < 2)
 		goto intx;
 
 	/*
 	 * One vector stays reserved for the admin queue; spread the CPUs
 	 *  evenly over the remaining ones.
 	 */
 	ctrlr->num_cpus_per_ioq = max(cpus_per_queue,
 	    howmany(mp_ncpus, probed - 1));
 	ctrlr->num_io_queues = howmany(mp_ncpus, ctrlr->num_cpus_per_ioq);
 
 	wanted = ctrlr->num_io_queues + 1;
 	granted = wanted;
 
 	/*
 	 * Allocate exactly what will be used.  The probe above succeeded
 	 *  with at least this many, but fall back to INTx if the allocation
 	 *  fails or comes back short.
 	 */
 	if (pci_alloc_msix(pcidev, &granted) != 0)
 		goto intx;
 	if (granted < wanted) {
 		pci_release_msi(pcidev);
 		goto intx;
 	}
 
 	ctrlr->msix_enabled = 1;
 	return;
 
 intx:
 	nvme_ctrlr_configure_intx(ctrlr);
 }
 
 /*
  * Software-side construction of a controller instance for `dev'.
  *
  * Initializes the controller lock, allocates the register BAR, reads the
  * capability registers (doorbell stride, minimum page size, ready
  * timeout), applies the hw.nvme.* tunables, sets up interrupts, builds the
  * admin queue pair, starts the controller taskqueue and creates the
  * "nvme%d" device node.
  *
  * Returns 0 on success, the status of nvme_ctrlr_allocate_bar() if that
  * fails, or ENXIO when the doorbell stride is non-zero, the admin queue
  * pair cannot be constructed, or the device node cannot be created.
  *
  * NOTE(review): this hunk widens `to' from uint8_t to uint32_t; presumably
  * so that adding 1 to the extracted timeout field cannot wrap at 8 bits.
  * Confirm against the width of NVME_CAP_LO_TO().
  */
 int
 nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev)
 {
 	struct make_dev_args	md_args;
 	uint32_t	cap_lo;
 	uint32_t	cap_hi;
-	uint8_t		to;
+	uint32_t	to;
 	uint8_t		dstrd;
 	uint8_t		mpsmin;
 	int		status, timeout_period;
 
 	ctrlr->dev = dev;
 
 	mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF);
 
 	status = nvme_ctrlr_allocate_bar(ctrlr);
 
 	if (status != 0)
 		return (status);
 
 	/*
 	 * Software emulators may set the doorbell stride to something
 	 *  other than zero, but this driver is not set up to handle that.
 	 */
 	cap_hi = nvme_mmio_read_4(ctrlr, cap_hi);
 	dstrd = NVME_CAP_HI_DSTRD(cap_hi);
 	if (dstrd != 0)
 		return (ENXIO);
 
 	/* Minimum page size in bytes is 2^(12 + MPSMIN). */
 	mpsmin = NVME_CAP_HI_MPSMIN(cap_hi);
 	ctrlr->min_page_size = 1 << (12 + mpsmin);
 
 	/* Get ready timeout value from controller, in units of 500ms. */
 	cap_lo = nvme_mmio_read_4(ctrlr, cap_lo);
 	to = NVME_CAP_LO_TO(cap_lo) + 1;
 	ctrlr->ready_timeout_in_ms = to * 500;
 
 	/* Command timeout: tunable, clamped to [MIN, MAX]_TIMEOUT_PERIOD. */
 	timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD;
 	TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period);
 	timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD);
 	timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD);
 	ctrlr->timeout_period = timeout_period;
 
 	nvme_retry_count = NVME_DEFAULT_RETRY_COUNT;
 	TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count);
 
 	ctrlr->enable_aborts = 0;
 	TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts);
 
 	nvme_ctrlr_setup_interrupts(ctrlr);
 
 	ctrlr->max_xfer_size = NVME_MAX_XFER_SIZE;
 	if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0)
 		return (ENXIO);
 
 	/* Single-threaded taskqueue that runs the tasks initialized below. */
 	ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK,
 	    taskqueue_thread_enqueue, &ctrlr->taskqueue);
 	taskqueue_start_threads(&ctrlr->taskqueue, 1, PI_DISK, "nvme taskq");
 
 	ctrlr->is_resetting = 0;
 	ctrlr->is_initialized = 0;
 	ctrlr->notification_sent = 0;
 	TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr);
 	TASK_INIT(&ctrlr->fail_req_task, 0, nvme_ctrlr_fail_req_task, ctrlr);
 	STAILQ_INIT(&ctrlr->fail_req);
 	ctrlr->is_failed = FALSE;
 
 	/* Root-only (mode 0600) control node named after the device unit. */
 	make_dev_args_init(&md_args);
 	md_args.mda_devsw = &nvme_ctrlr_cdevsw;
 	md_args.mda_uid = UID_ROOT;
 	md_args.mda_gid = GID_WHEEL;
 	md_args.mda_mode = 0600;
 	md_args.mda_unit = device_get_unit(dev);
 	md_args.mda_si_drv1 = (void *)ctrlr;
 	status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d",
 	    device_get_unit(dev));
 	if (status != 0)
 		return (ENXIO);
 
 	return (0);
 }
 
 /*
  * Tear down everything built for the controller.
  *
  * If the register BAR was never allocated (ctrlr->resource == NULL) only
  * the controller lock is destroyed.  Otherwise consumers are notified of
  * the failure, namespaces and the device node are destroyed, the queue
  * pairs are torn down, the controller is told to shut down and is
  * disabled, and finally the taskqueue, interrupt and memory resources are
  * released.
  */
 void
 nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev)
 {
 	int				i;
 
 	/* Nothing beyond the lock exists if the BAR was never mapped. */
 	if (ctrlr->resource == NULL)
 		goto nores;
 
 	nvme_notify_fail_consumers(ctrlr);
 
 	for (i = 0; i < NVME_MAX_NAMESPACES; i++)
 		nvme_ns_destruct(&ctrlr->ns[i]);
 
 	if (ctrlr->cdev)
 		destroy_dev(ctrlr->cdev);
 
 	nvme_ctrlr_destroy_qpairs(ctrlr);
 	for (i = 0; i < ctrlr->num_io_queues; i++) {
 		nvme_io_qpair_destroy(&ctrlr->ioq[i]);
 	}
 	free(ctrlr->ioq, M_NVME);
 
 	nvme_admin_qpair_destroy(&ctrlr->adminq);
 
 	/*
 	 *  Notify the controller of a shutdown, even though this is due to
 	 *   a driver unload, not a system shutdown (this path is not invoked
 	 *   during shutdown).  This ensures the controller receives a
 	 *   shutdown notification in case the system is shutdown before
 	 *   reloading the driver.
 	 */
 	nvme_ctrlr_shutdown(ctrlr);
 
 	nvme_ctrlr_disable(ctrlr);
 
 	if (ctrlr->taskqueue)
 		taskqueue_free(ctrlr->taskqueue);
 
 	/* Interrupt handler first, then its IRQ resource, then MSI-X. */
 	if (ctrlr->tag)
 		bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
 
 	if (ctrlr->res)
 		bus_release_resource(ctrlr->dev, SYS_RES_IRQ,
 		    rman_get_rid(ctrlr->res), ctrlr->res);
 
 	if (ctrlr->msix_enabled)
 		pci_release_msi(dev);
 
 	if (ctrlr->bar4_resource != NULL) {
 		bus_release_resource(dev, SYS_RES_MEMORY,
 		    ctrlr->bar4_resource_id, ctrlr->bar4_resource);
 	}
 
 	bus_release_resource(dev, SYS_RES_MEMORY,
 	    ctrlr->resource_id, ctrlr->resource);
 
 nores:
 	mtx_destroy(&ctrlr->lock);
 }
 
 /*
  * Request a normal shutdown through the CC register's SHN field, then
  * poll CSTS once per tick until shutdown processing is reported complete
  * or 5*hz polls have elapsed.  A message is printed if the controller has
  * still not completed the shutdown by then.
  */
 void
 nvme_ctrlr_shutdown(struct nvme_controller *ctrlr)
 {
 	uint32_t	cc;
 	uint32_t	csts;
 	int		polls;
 
 	/* Replace whatever SHN value is set with "normal shutdown". */
 	cc = nvme_mmio_read_4(ctrlr, cc);
 	cc &= ~(NVME_CC_REG_SHN_MASK << NVME_CC_REG_SHN_SHIFT);
 	cc |= NVME_SHN_NORMAL << NVME_CC_REG_SHN_SHIFT;
 	nvme_mmio_write_4(ctrlr, cc, cc);
 
 	csts = nvme_mmio_read_4(ctrlr, csts);
 	for (polls = 0;
 	    NVME_CSTS_GET_SHST(csts) != NVME_SHST_COMPLETE && polls < 5*hz;
 	    polls++) {
 		pause("nvme shn", 1);
 		csts = nvme_mmio_read_4(ctrlr, csts);
 	}
 
 	if (NVME_CSTS_GET_SHST(csts) != NVME_SHST_COMPLETE)
 		nvme_printf(ctrlr, "did not complete shutdown within 5 seconds "
 		    "of notification\n");
 }
 
 /* Queue `req' on the controller's admin queue pair. */
 void
 nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr,
     struct nvme_request *req)
 {
 	struct nvme_qpair	*adminq = &ctrlr->adminq;
 
 	nvme_qpair_submit_request(adminq, req);
 }
 
 /*
  * Queue `req' on the I/O queue pair selected by the current CPU: the
  * index is curcpu divided by the number of CPUs sharing each queue.
  */
 void
 nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr,
     struct nvme_request *req)
 {
 
 	nvme_qpair_submit_request(
 	    &ctrlr->ioq[curcpu / ctrlr->num_cpus_per_ioq], req);
 }
 
 /* Accessor: the device_t recorded in the controller. */
 device_t
 nvme_ctrlr_get_device(struct nvme_controller *ctrlr)
 {
 	device_t	dev = ctrlr->dev;
 
 	return (dev);
 }
 
 /* Accessor: read-only pointer to the controller data embedded in `ctrlr'. */
 const struct nvme_controller_data *
 nvme_ctrlr_get_data(struct nvme_controller *ctrlr)
 {
 	const struct nvme_controller_data	*cdata = &ctrlr->cdata;
 
 	return (cdata);
 }
Index: projects/nfsv42/sys/dev/virtio/mmio/virtio_mmio.c
===================================================================
--- projects/nfsv42/sys/dev/virtio/mmio/virtio_mmio.c	(revision 350367)
+++ projects/nfsv42/sys/dev/virtio/mmio/virtio_mmio.c	(revision 350368)
@@ -1,788 +1,788 @@
 /*-
  * Copyright (c) 2014 Ruslan Bukin <br@bsdpad.com>
  * Copyright (c) 2014 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed by SRI International and the University of
  * Cambridge Computer Laboratory under DARPA/AFRL contract (FA8750-10-C-0237)
  * ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Portions of this software were developed by Andrew Turner
  * under sponsorship from the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 /*
  * VirtIO MMIO interface.
  * This driver is heavily based on VirtIO PCI interface driver.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 #include <sys/rman.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 
 #include <dev/virtio/virtio.h>
 #include <dev/virtio/virtqueue.h>
 #include <dev/virtio/mmio/virtio_mmio.h>
 
 #include "virtio_mmio_if.h"
 #include "virtio_bus_if.h"
 #include "virtio_if.h"
 
 /*
  * Page shift used by this file: (1 << PAGE_SHIFT) is written as the guest
  * page size and virtqueue physical addresses are shifted right by it to
  * form the queue PFN.
  */
 #define	PAGE_SHIFT	12
 
 /* Per-virtqueue bookkeeping kept in the softc's vtmmio_vqs array. */
 struct vtmmio_virtqueue {
 	/* Queue handle; set when the queue is allocated, NULL once freed. */
 	struct virtqueue	*vtv_vq;
 	/* Non-zero when the queue has no interrupt handler; skipped on IRQ. */
 	int			 vtv_no_intr;
 };
 
 /* Forward declarations for the file-local functions defined below. */
 static int	vtmmio_detach(device_t);
 static int	vtmmio_suspend(device_t);
 static int	vtmmio_resume(device_t);
 static int	vtmmio_shutdown(device_t);
 static void	vtmmio_driver_added(device_t, driver_t *);
 static void	vtmmio_child_detached(device_t, device_t);
 static int	vtmmio_read_ivar(device_t, device_t, int, uintptr_t *);
 static int	vtmmio_write_ivar(device_t, device_t, int, uintptr_t);
 static uint64_t	vtmmio_negotiate_features(device_t, uint64_t);
 static int	vtmmio_with_feature(device_t, uint64_t);
 static int	vtmmio_alloc_virtqueues(device_t, int, int,
 		    struct vq_alloc_info *);
 static int	vtmmio_setup_intr(device_t, enum intr_type);
 static void	vtmmio_stop(device_t);
 static void	vtmmio_poll(device_t);
 static int	vtmmio_reinit(device_t, uint64_t);
 static void	vtmmio_reinit_complete(device_t);
 static void	vtmmio_notify_virtqueue(device_t, uint16_t);
 static uint8_t	vtmmio_get_status(device_t);
 static void	vtmmio_set_status(device_t, uint8_t);
 static void	vtmmio_read_dev_config(device_t, bus_size_t, void *, int);
 static void	vtmmio_write_dev_config(device_t, bus_size_t, void *, int);
 static void	vtmmio_describe_features(struct vtmmio_softc *, const char *,
 		    uint64_t);
 static void	vtmmio_probe_and_attach_child(struct vtmmio_softc *);
 static int	vtmmio_reinit_virtqueue(struct vtmmio_softc *, int);
 static void	vtmmio_free_interrupts(struct vtmmio_softc *);
 static void	vtmmio_free_virtqueues(struct vtmmio_softc *);
 static void	vtmmio_release_child_resources(struct vtmmio_softc *);
 static void	vtmmio_reset(struct vtmmio_softc *);
 static void	vtmmio_select_virtqueue(struct vtmmio_softc *, int);
 static void	vtmmio_vq_intr(void *);
 
 /*
  * Register read/write wrappers for the device's memory window (res[0]).
  *
  * Each write wrapper brackets the bus write with the optional platform
  * hooks: VIRTIO_MMIO_PREWRITE before and VIRTIO_MMIO_NOTE after, both
  * skipped when the softc has no platform device.
  *
  * `sc' is parenthesized at every use so that any pointer expression may
  * be passed.  The arguments are still expanded more than once, so they
  * must not have side effects.
  */
 #define vtmmio_write_config_1(sc, o, v)					\
 do {									\
 	if ((sc)->platform != NULL)					\
 		VIRTIO_MMIO_PREWRITE((sc)->platform, (o), (v));		\
 	bus_write_1((sc)->res[0], (o), (v));				\
 	if ((sc)->platform != NULL)					\
 		VIRTIO_MMIO_NOTE((sc)->platform, (o), (v));		\
 } while (0)
 #define vtmmio_write_config_2(sc, o, v)					\
 do {									\
 	if ((sc)->platform != NULL)					\
 		VIRTIO_MMIO_PREWRITE((sc)->platform, (o), (v));		\
 	bus_write_2((sc)->res[0], (o), (v));				\
 	if ((sc)->platform != NULL)					\
 		VIRTIO_MMIO_NOTE((sc)->platform, (o), (v));		\
 } while (0)
 #define vtmmio_write_config_4(sc, o, v)					\
 do {									\
 	if ((sc)->platform != NULL)					\
 		VIRTIO_MMIO_PREWRITE((sc)->platform, (o), (v));		\
 	bus_write_4((sc)->res[0], (o), (v));				\
 	if ((sc)->platform != NULL)					\
 		VIRTIO_MMIO_NOTE((sc)->platform, (o), (v));		\
 } while (0)
 
 #define vtmmio_read_config_1(sc, o) \
 	bus_read_1((sc)->res[0], (o))
 #define vtmmio_read_config_2(sc, o) \
 	bus_read_2((sc)->res[0], (o))
 #define vtmmio_read_config_4(sc, o) \
 	bus_read_4((sc)->res[0], (o))
 
 /*
  * Method table for the virtio_mmio driver class: device lifecycle, bus
  * callbacks, and the VirtIO bus interface implemented in this file.
  */
 static device_method_t vtmmio_methods[] = {
 	/* Device interface. */
 	DEVMETHOD(device_attach,		  vtmmio_attach),
 	DEVMETHOD(device_detach,		  vtmmio_detach),
 	DEVMETHOD(device_suspend,		  vtmmio_suspend),
 	DEVMETHOD(device_resume,		  vtmmio_resume),
 	DEVMETHOD(device_shutdown,		  vtmmio_shutdown),
 
 	/* Bus interface. */
 	DEVMETHOD(bus_driver_added,		  vtmmio_driver_added),
 	DEVMETHOD(bus_child_detached,		  vtmmio_child_detached),
 	DEVMETHOD(bus_child_pnpinfo_str,	  virtio_child_pnpinfo_str),
 	DEVMETHOD(bus_read_ivar,		  vtmmio_read_ivar),
 	DEVMETHOD(bus_write_ivar,		  vtmmio_write_ivar),
 
 	/* VirtIO bus interface. */
 	DEVMETHOD(virtio_bus_negotiate_features,  vtmmio_negotiate_features),
 	DEVMETHOD(virtio_bus_with_feature,	  vtmmio_with_feature),
 	DEVMETHOD(virtio_bus_alloc_virtqueues,	  vtmmio_alloc_virtqueues),
 	DEVMETHOD(virtio_bus_setup_intr,	  vtmmio_setup_intr),
 	DEVMETHOD(virtio_bus_stop,		  vtmmio_stop),
 	DEVMETHOD(virtio_bus_poll,		  vtmmio_poll),
 	DEVMETHOD(virtio_bus_reinit,		  vtmmio_reinit),
 	DEVMETHOD(virtio_bus_reinit_complete,	  vtmmio_reinit_complete),
 	DEVMETHOD(virtio_bus_notify_vq,		  vtmmio_notify_virtqueue),
 	DEVMETHOD(virtio_bus_read_device_config,  vtmmio_read_dev_config),
 	DEVMETHOD(virtio_bus_write_device_config, vtmmio_write_dev_config),
 
 	DEVMETHOD_END
 };
 
 /* Driver class "virtio_mmio" with a struct vtmmio_softc per instance. */
 DEFINE_CLASS_0(virtio_mmio, vtmmio_driver, vtmmio_methods,
     sizeof(struct vtmmio_softc));
 
 MODULE_VERSION(virtio_mmio, 1);
 
 /*
  * Hook up the virtqueue interrupt handler.
  *
  * A platform backend, when present, gets the first chance to install
  * vtmmio_vq_intr; if it succeeds nothing else is done.  Otherwise IRQ rid 0
  * is allocated into res[1] and the handler is attached through
  * bus_setup_intr().  Returns 0 on success and ENXIO if the IRQ cannot be
  * allocated or the handler cannot be set up.  The `type' argument is not
  * used here.
  */
 static int
 vtmmio_setup_intr(device_t dev, enum intr_type type)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 	int irq_rid = 0;
 
 	/* Backend-specific interrupts take precedence when they work. */
 	if (sc->platform != NULL &&
 	    VIRTIO_MMIO_SETUP_INTR(sc->platform, sc->dev,
 	    vtmmio_vq_intr, sc) == 0)
 		return (0);
 
 	sc->res[1] = bus_alloc_resource_any(dev, SYS_RES_IRQ, &irq_rid,
 	    RF_ACTIVE);
 	if (sc->res[1] == NULL) {
 		device_printf(dev, "Can't allocate interrupt\n");
 		return (ENXIO);
 	}
 
 	if (bus_setup_intr(dev, sc->res[1], INTR_TYPE_MISC | INTR_MPSAFE,
 	    NULL, vtmmio_vq_intr, sc, &sc->ih) != 0) {
 		device_printf(dev, "Can't setup the interrupt\n");
 		return (ENXIO);
 	}
 
 	return (0);
 }
 
 /*
  * Attach the MMIO transport: map the register window, reset the device,
  * acknowledge it, and create and probe the single child device.
  *
  * Returns 0 on success, ENXIO if the memory window cannot be allocated,
  * or ENOMEM if the child device cannot be created (in which case the
  * FAILED status is set and the transport is detached again).
  */
 int
 vtmmio_attach(device_t dev)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 	device_t child;
 	int mem_rid = 0;
 
 	sc->dev = dev;
 
 	sc->res[0] = bus_alloc_resource_any(dev, SYS_RES_MEMORY, &mem_rid,
 	    RF_ACTIVE);
 	if (sc->res[0] == NULL) {
 		device_printf(dev, "Cannot allocate memory window.\n");
 		return (ENXIO);
 	}
 
 	vtmmio_reset(sc);
 
 	/* Let the host know the device has been noticed. */
 	vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
 
 	child = device_add_child(dev, NULL, -1);
 	if (child != NULL) {
 		sc->vtmmio_child_dev = child;
 		vtmmio_probe_and_attach_child(sc);
 		return (0);
 	}
 
 	device_printf(dev, "Cannot create child device.\n");
 	vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
 	vtmmio_detach(dev);
 	return (ENOMEM);
 }
 
 /*
  * Detach the transport: delete the child device (if any), reset the
  * host device and release the memory window.  Returns the error from
  * device_delete_child() if the child cannot be removed, else 0.
  */
 static int
 vtmmio_detach(device_t dev)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 	device_t kid = sc->vtmmio_child_dev;
 	int rc;
 
 	if (kid != NULL) {
 		rc = device_delete_child(dev, kid);
 		if (rc != 0)
 			return (rc);
 		sc->vtmmio_child_dev = NULL;
 	}
 
 	vtmmio_reset(sc);
 
 	if (sc->res[0] == NULL)
 		return (0);
 
 	bus_release_resource(dev, SYS_RES_MEMORY, 0, sc->res[0]);
 	sc->res[0] = NULL;
 
 	return (0);
 }
 
 /* Suspend is delegated entirely to the generic bus implementation. */
 static int
 vtmmio_suspend(device_t dev)
 {
 	int rc = bus_generic_suspend(dev);
 
 	return (rc);
 }
 
 /* Resume is delegated entirely to the generic bus implementation. */
 static int
 vtmmio_resume(device_t dev)
 {
 	int rc = bus_generic_resume(dev);
 
 	return (rc);
 }
 
 /*
  * Shutdown handler: run the generic bus shutdown (its result is
  * deliberately ignored), then stop the host device.  Always returns 0.
  */
 static int
 vtmmio_shutdown(device_t dev)
 {
 
 	(void) bus_generic_shutdown(dev);
 
 	/* Forcibly stop the host device. */
 	vtmmio_stop(dev);
 
 	return (0);
 }
 
 /*
  * A new driver was registered on this bus: give the child another chance
  * to probe and attach.  The `driver' argument itself is not examined.
  */
 static void
 vtmmio_driver_added(device_t dev, driver_t *driver)
 {
 
 	vtmmio_probe_and_attach_child(device_get_softc(dev));
 }
 
 /*
  * The child driver detached: reset the host device and release the
  * interrupt and virtqueue resources held for the child.
  */
 static void
 vtmmio_child_detached(device_t dev, device_t child)
 {
 	struct vtmmio_softc *softc = device_get_softc(dev);
 
 	vtmmio_reset(softc);
 	vtmmio_release_child_resources(softc);
 }
 
 /*
  * Instance-variable read for the child device.
  *
  * DEVTYPE and SUBDEVICE report the MMIO device id register, VENDOR the
  * vendor id register, and SUBVENDOR/DEVICE report 0 because this bus has
  * no such fields.  Returns ENOENT for a foreign child or an unknown
  * index, and 0 with *result filled in otherwise.
  */
 static int
 vtmmio_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 	uintptr_t value;
 
 	if (child != sc->vtmmio_child_dev)
 		return (ENOENT);
 
 	switch (index) {
 	case VIRTIO_IVAR_DEVTYPE:
 	case VIRTIO_IVAR_SUBDEVICE:
 		value = vtmmio_read_config_4(sc, VIRTIO_MMIO_DEVICE_ID);
 		break;
 	case VIRTIO_IVAR_VENDOR:
 		value = vtmmio_read_config_4(sc, VIRTIO_MMIO_VENDOR_ID);
 		break;
 	case VIRTIO_IVAR_SUBVENDOR:
 	case VIRTIO_IVAR_DEVICE:
 		/*
 		 * Not present on this bus; a placeholder keeps the
 		 * bus-agnostic virtio_child_pnpinfo_str working.
 		 */
 		value = 0;
 		break;
 	default:
 		return (ENOENT);
 	}
 
 	*result = value;
 	return (0);
 }
 
 /*
  * Instance-variable write for the child device.  Only
  * VIRTIO_IVAR_FEATURE_DESC is accepted; its value is stored as the
  * child's feature description pointer.  Returns ENOENT for a foreign
  * child or any other index.
  */
 static int
 vtmmio_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 
 	if (child != sc->vtmmio_child_dev)
 		return (ENOENT);
 
 	if (index != VIRTIO_IVAR_FEATURE_DESC)
 		return (ENOENT);
 
 	sc->vtmmio_child_feat_desc = (void *) value;
 	return (0);
 }
 
 /*
  * Negotiate features with the host: intersect the host's feature
  * register with what the child asks for, pass the result through the
  * virtqueue layer's filter, remember it in the softc, write it to the
  * guest features register and return it.
  */
 static uint64_t
 vtmmio_negotiate_features(device_t dev, uint64_t child_features)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 	uint64_t offered, agreed;
 
 	offered = vtmmio_read_config_4(sc, VIRTIO_MMIO_HOST_FEATURES);
 	vtmmio_describe_features(sc, "host", offered);
 
 	/* Keep only what driver, virtqueue code and host all support. */
 	agreed = virtqueue_filter_features(offered & child_features);
 	sc->vtmmio_features = agreed;
 
 	vtmmio_describe_features(sc, "negotiated", agreed);
 	vtmmio_write_config_4(sc, VIRTIO_MMIO_GUEST_FEATURES, agreed);
 
 	return (agreed);
 }
 
 /* Returns non-zero if any bit of `feature' is among the negotiated ones. */
 static int
 vtmmio_with_feature(device_t dev, uint64_t feature)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 	uint64_t matched = sc->vtmmio_features & feature;
 
 	return (matched != 0);
 }
 
 /*
  * Allocate `nvqs' virtqueues described by vq_info[] and program each
  * into the device (queue size, alignment and page frame number).
  *
  * Returns EALREADY if queues are already allocated, EINVAL if nvqs is not
  * positive, ENOMEM if the bookkeeping array cannot be allocated, or the
  * error from virtqueue_alloc(); on such a failure the queues set up so
  * far are freed again.  Returns 0 on success.  The `flags' argument is
  * not used here.
  *
  * NOTE(review): this hunk changes the fifth virtqueue_alloc() argument
  * from 0xFFFFFFFFUL to ~(vm_paddr_t)0; presumably that argument is the
  * highest usable physical address - confirm against virtqueue_alloc().
  */
 static int
 vtmmio_alloc_virtqueues(device_t dev, int flags, int nvqs,
     struct vq_alloc_info *vq_info)
 {
 	struct vtmmio_virtqueue *vqx;
 	struct vq_alloc_info *info;
 	struct vtmmio_softc *sc;
 	struct virtqueue *vq;
 	uint32_t size;
 	int idx, error;
 
 	sc = device_get_softc(dev);
 
 	if (sc->vtmmio_nvqs != 0)
 		return (EALREADY);
 	if (nvqs <= 0)
 		return (EINVAL);
 
 	sc->vtmmio_vqs = malloc(nvqs * sizeof(struct vtmmio_virtqueue),
 	    M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (sc->vtmmio_vqs == NULL)
 		return (ENOMEM);
 
 	vtmmio_write_config_4(sc, VIRTIO_MMIO_GUEST_PAGE_SIZE,
 	    (1 << PAGE_SHIFT));
 
 	for (idx = 0; idx < nvqs; idx++) {
 		vqx = &sc->vtmmio_vqs[idx];
 		info = &vq_info[idx];
 
 		/* Note: the queue is selected twice (here and just below). */
 		vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_SEL, idx);
 
 		vtmmio_select_virtqueue(sc, idx);
 		size = vtmmio_read_config_4(sc, VIRTIO_MMIO_QUEUE_NUM_MAX);
 
 		error = virtqueue_alloc(dev, idx, size,
-		    VIRTIO_MMIO_VRING_ALIGN, 0xFFFFFFFFUL, info, &vq);
+		    VIRTIO_MMIO_VRING_ALIGN, ~(vm_paddr_t)0, info, &vq);
 		if (error) {
 			device_printf(dev,
 			    "cannot allocate virtqueue %d: %d\n",
 			    idx, error);
 			break;
 		}
 
 		vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_NUM, size);
 		vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_ALIGN,
 		    VIRTIO_MMIO_VRING_ALIGN);
 #if 0
 		device_printf(dev, "virtqueue paddr 0x%08lx\n",
 		    (uint64_t)virtqueue_paddr(vq));
 #endif
 		vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_PFN,
 		    virtqueue_paddr(vq) >> PAGE_SHIFT);
 
 		/* Hand the queue to the caller and record it locally. */
 		vqx->vtv_vq = *info->vqai_vq = vq;
 		vqx->vtv_no_intr = info->vqai_intr == NULL;
 
 		sc->vtmmio_nvqs++;
 	}
 
 	/* nvqs > 0 here, so `error' was assigned by the loop above. */
 	if (error)
 		vtmmio_free_virtqueues(sc);
 
 	return (error);
 }
 
 /* Stop the device by resetting it. */
 static void
 vtmmio_stop(device_t dev)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 
 	vtmmio_reset(sc);
 }
 
 /* Forward a poll request to the platform backend, if there is one. */
 static void
 vtmmio_poll(device_t dev)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 
 	if (sc->platform == NULL)
 		return;
 
 	VIRTIO_MMIO_POLL(sc->platform);
 }
 
 /*
  * Reinitialize the device: stop it unless it is already in the reset
  * state, step the status through ACK and DRIVER, renegotiate features,
  * rewrite the guest page size and reinitialize every allocated
  * virtqueue.  Returns the first virtqueue reinit error, or 0.
  */
 static int
 vtmmio_reinit(device_t dev, uint64_t features)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 	int qidx, rc;
 
 	if (vtmmio_get_status(dev) != VIRTIO_CONFIG_STATUS_RESET)
 		vtmmio_stop(dev);
 
 	/*
 	 * Move the status through ACK and DRIVER straight away; the device
 	 * only becomes usable again in vtmmio_reinit_complete().
 	 */
 	vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
 	vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER);
 
 	vtmmio_negotiate_features(dev, features);
 
 	vtmmio_write_config_4(sc, VIRTIO_MMIO_GUEST_PAGE_SIZE,
 	    (1 << PAGE_SHIFT));
 
 	rc = 0;
 	for (qidx = 0; rc == 0 && qidx < sc->vtmmio_nvqs; qidx++)
 		rc = vtmmio_reinit_virtqueue(sc, qidx);
 
 	return (rc);
 }
 
 /* Finish a reinit by adding DRIVER_OK to the device status. */
 static void
 vtmmio_reinit_complete(device_t dev)
 {
 
 	vtmmio_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK);
 }
 
 /* Notify the host about `queue' by writing its index to QUEUE_NOTIFY. */
 static void
 vtmmio_notify_virtqueue(device_t dev, uint16_t queue)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 
 	vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_NOTIFY, queue);
 }
 
 /*
  * Read the device status register; the 32-bit register value is
  * truncated to the 8 bits returned.
  */
 static uint8_t
 vtmmio_get_status(device_t dev)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 
 	return ((uint8_t)vtmmio_read_config_4(sc, VIRTIO_MMIO_STATUS));
 }
 
 /*
  * Update the device status register.  Writing RESET replaces the
  * register contents; any other value is OR-ed into the current status.
  */
 static void
 vtmmio_set_status(device_t dev, uint8_t status)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 	uint8_t merged = status;
 
 	if (merged != VIRTIO_CONFIG_STATUS_RESET)
 		merged |= vtmmio_get_status(dev);
 
 	vtmmio_write_config_4(sc, VIRTIO_MMIO_STATUS, merged);
 }
 
 /*
  * Copy `length' bytes of device-specific configuration, starting at
  * `offset' within the config area, into `dst'.  Reads are done one byte
  * at a time unless ALLOW_WORD_ALIGNED_ACCESS is defined, in which case
  * 4- and 2-byte reads are used while enough bytes remain.
  */
 static void
 vtmmio_read_dev_config(device_t dev, bus_size_t offset,
     void *dst, int length)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 	bus_size_t reg = VIRTIO_MMIO_CONFIG + offset;
 	uint8_t *out = dst;
 	int step;
 
 	while (length > 0) {
 #ifdef ALLOW_WORD_ALIGNED_ACCESS
 		if (length >= 4) {
 			step = 4;
 			*(uint32_t *)out = vtmmio_read_config_4(sc, reg);
 		} else if (length >= 2) {
 			step = 2;
 			*(uint16_t *)out = vtmmio_read_config_2(sc, reg);
 		} else
 #endif
 		{
 			step = 1;
 			*out = vtmmio_read_config_1(sc, reg);
 		}
 
 		out += step;
 		reg += step;
 		length -= step;
 	}
 }
 
 /*
  * Copy `length' bytes from `src' into the device-specific configuration
  * area starting at `offset'.  Writes are done one byte at a time unless
  * ALLOW_WORD_ALIGNED_ACCESS is defined, in which case 4- and 2-byte
  * writes are used while enough bytes remain.
  */
 static void
 vtmmio_write_dev_config(device_t dev, bus_size_t offset,
     void *src, int length)
 {
 	struct vtmmio_softc *sc = device_get_softc(dev);
 	bus_size_t reg = VIRTIO_MMIO_CONFIG + offset;
 	uint8_t *in = src;
 	int step;
 
 	while (length > 0) {
 #ifdef ALLOW_WORD_ALIGNED_ACCESS
 		if (length >= 4) {
 			step = 4;
 			vtmmio_write_config_4(sc, reg, *(uint32_t *)in);
 		} else if (length >= 2) {
 			step = 2;
 			vtmmio_write_config_2(sc, reg, *(uint16_t *)in);
 		} else
 #endif
 		{
 			step = 1;
 			vtmmio_write_config_1(sc, reg, *in);
 		}
 
 		in += step;
 		reg += step;
 		length -= step;
 	}
 }
 
 /*
  * Print a description of `features' tagged with `msg', using the child's
  * feature description table.  Nothing is printed once the child is
  * attached or when bootverbose is off.
  */
 static void
 vtmmio_describe_features(struct vtmmio_softc *sc, const char *msg,
     uint64_t features)
 {
 	device_t self = sc->dev;
 	device_t kid = sc->vtmmio_child_dev;
 
 	if (!device_is_attached(kid) && bootverbose != 0)
 		virtio_describe(self, msg, features,
 		    sc->vtmmio_child_feat_desc);
 }
 
 /*
  * Probe and attach the child device if it exists, is still in the
  * DS_NOTPRESENT state and a driver claims it.
  *
  * The DRIVER status is set before attaching.  On success DRIVER_OK is set
  * and the child is told that attach completed.  On failure FAILED is set,
  * the device is reset, the child's resources are released and the status
  * is put back to ACK so that a later attempt can start over.
  */
 static void
 vtmmio_probe_and_attach_child(struct vtmmio_softc *sc)
 {
 	device_t self = sc->dev;
 	device_t kid = sc->vtmmio_child_dev;
 
 	if (kid == NULL || device_get_state(kid) != DS_NOTPRESENT ||
 	    device_probe(kid) != 0)
 		return;
 
 	vtmmio_set_status(self, VIRTIO_CONFIG_STATUS_DRIVER);
 
 	if (device_attach(kid) == 0) {
 		vtmmio_set_status(self, VIRTIO_CONFIG_STATUS_DRIVER_OK);
 		VIRTIO_ATTACH_COMPLETED(kid);
 		return;
 	}
 
 	vtmmio_set_status(self, VIRTIO_CONFIG_STATUS_FAILED);
 	vtmmio_reset(sc);
 	vtmmio_release_child_resources(sc);
 	/* Leave the status at ACK so a future attempt starts clean. */
 	vtmmio_set_status(self, VIRTIO_CONFIG_STATUS_ACK);
 }
 
 /*
  * Reinitialize the already-allocated virtqueue `idx' and reprogram its
  * size, alignment and page frame number into the device.  Returns the
  * error from virtqueue_reinit(), or 0.
  */
 static int
 vtmmio_reinit_virtqueue(struct vtmmio_softc *sc, int idx)
 {
 	struct virtqueue *vq = sc->vtmmio_vqs[idx].vtv_vq;
 	uint16_t qsize;
 	int rc;
 
 	KASSERT(vq != NULL, ("%s: vq %d not allocated", __func__, idx));
 
 	vtmmio_select_virtqueue(sc, idx);
 	qsize = vtmmio_read_config_4(sc, VIRTIO_MMIO_QUEUE_NUM_MAX);
 
 	rc = virtqueue_reinit(vq, qsize);
 	if (rc != 0)
 		return (rc);
 
 	vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_NUM, qsize);
 	vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_ALIGN,
 	    VIRTIO_MMIO_VRING_ALIGN);
 #if 0
 	device_printf(sc->dev, "virtqueue paddr 0x%08lx\n",
 	    (uint64_t)virtqueue_paddr(vq));
 #endif
 	vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_PFN,
 	    virtqueue_paddr(vq) >> PAGE_SHIFT);
 
 	return (0);
 }
 
 /*
  * Tear down the interrupt handler and release the IRQ resource, if they
  * were set up by vtmmio_setup_intr().
  *
  * Both handles are cleared afterwards.  This function is reached through
  * vtmmio_release_child_resources(), which runs both when the child
  * detaches and when a later attach attempt fails; without clearing, a
  * second call would tear down and release the same handler and resource
  * again.
  */
 static void
 vtmmio_free_interrupts(struct vtmmio_softc *sc)
 {
 
 	if (sc->ih != NULL) {
 		bus_teardown_intr(sc->dev, sc->res[1], sc->ih);
 		sc->ih = NULL;
 	}
 
 	if (sc->res[1] != NULL) {
 		bus_release_resource(sc->dev, SYS_RES_IRQ, 0, sc->res[1]);
 		sc->res[1] = NULL;
 	}
 }
 
 /*
  * Release every allocated virtqueue: clear its PFN register in the
  * device, free the queue, then free the bookkeeping array and reset the
  * queue count to zero.
  */
 static void
 vtmmio_free_virtqueues(struct vtmmio_softc *sc)
 {
 	int q;
 
 	for (q = 0; q < sc->vtmmio_nvqs; q++) {
 		vtmmio_select_virtqueue(sc, q);
 		vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_PFN, 0);
 
 		virtqueue_free(sc->vtmmio_vqs[q].vtv_vq);
 		sc->vtmmio_vqs[q].vtv_vq = NULL;
 	}
 
 	free(sc->vtmmio_vqs, M_DEVBUF);
 	sc->vtmmio_vqs = NULL;
 	sc->vtmmio_nvqs = 0;
 }
 
 /* Release what was set up for the child: interrupts, then virtqueues. */
 static void
 vtmmio_release_child_resources(struct vtmmio_softc *sc)
 {
 
 	vtmmio_free_interrupts(sc);
 	vtmmio_free_virtqueues(sc);
 }
 
 static void
 vtmmio_reset(struct vtmmio_softc *sc)
 {
 
 	/*
 	 * Setting the status to RESET sets the host device to
 	 * the original, uninitialized state.
 	 */
 	vtmmio_set_status(sc->dev, VIRTIO_CONFIG_STATUS_RESET);
 }
 
 /*
  * Select virtqueue `idx' by writing it to the QUEUE_SEL register; callers
  * in this file do this before accessing the per-queue registers.
  */
 static void
 vtmmio_select_virtqueue(struct vtmmio_softc *sc, int idx)
 {
 
 	vtmmio_write_config_4(sc, VIRTIO_MMIO_QUEUE_SEL, idx);
 }
 
 /*
  * Interrupt handler; `arg' is the softc.
  *
  * Reads the interrupt status and acknowledges exactly those bits.  A
  * config-change bit is forwarded to the child device if there is one; a
  * vring bit causes virtqueue_intr() to be called for every queue that has
  * an interrupt handler.
  */
 static void
 vtmmio_vq_intr(void *arg)
 {
 	struct vtmmio_softc *sc = arg;
 	struct vtmmio_virtqueue *entry;
 	uint32_t pending;
 	int q;
 
 	pending = vtmmio_read_config_4(sc, VIRTIO_MMIO_INTERRUPT_STATUS);
 	vtmmio_write_config_4(sc, VIRTIO_MMIO_INTERRUPT_ACK, pending);
 
 	/* Configuration change. */
 	if ((pending & VIRTIO_MMIO_INT_CONFIG) &&
 	    sc->vtmmio_child_dev != NULL)
 		VIRTIO_CONFIG_CHANGE(sc->vtmmio_child_dev);
 
 	if (!(pending & VIRTIO_MMIO_INT_VRING))
 		return;
 
 	/* Kick every virtqueue that wants interrupts. */
 	for (q = 0; q < sc->vtmmio_nvqs; q++) {
 		entry = &sc->vtmmio_vqs[q];
 		if (entry->vtv_no_intr != 0)
 			continue;
 		virtqueue_intr(entry->vtv_vq);
 	}
 }
Index: projects/nfsv42/sys/dev/virtio/pci/virtio_pci.c
===================================================================
--- projects/nfsv42/sys/dev/virtio/pci/virtio_pci.c	(revision 350367)
+++ projects/nfsv42/sys/dev/virtio/pci/virtio_pci.c	(revision 350368)
@@ -1,1333 +1,1333 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2011, Bryan Venteicher <bryanv@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice unmodified, this list of conditions, and the following
  *    disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  */
 
 /* Driver for the VirtIO PCI interface. */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bus.h>
 #include <sys/kernel.h>
 #include <sys/module.h>
 #include <sys/malloc.h>
 
 #include <machine/bus.h>
 #include <machine/resource.h>
 #include <sys/bus.h>
 #include <sys/rman.h>
 
 #include <dev/pci/pcivar.h>
 #include <dev/pci/pcireg.h>
 
 #include <dev/virtio/virtio.h>
 #include <dev/virtio/virtqueue.h>
 #include <dev/virtio/pci/virtio_pci.h>
 
 #include "virtio_bus_if.h"
 #include "virtio_if.h"
 
 struct vtpci_interrupt {
 	struct resource		*vti_irq;
 	int			 vti_rid;
 	void			*vti_handler;
 };
 
 struct vtpci_virtqueue {
 	struct virtqueue	*vtv_vq;
 	int			 vtv_no_intr;
 };
 
 struct vtpci_softc {
 	device_t			 vtpci_dev;
 	struct resource			*vtpci_res;
 	struct resource			*vtpci_msix_res;
 	uint64_t			 vtpci_features;
 	uint32_t			 vtpci_flags;
 #define VTPCI_FLAG_NO_MSI		0x0001
 #define VTPCI_FLAG_NO_MSIX		0x0002
 #define VTPCI_FLAG_LEGACY		0x1000
 #define VTPCI_FLAG_MSI			0x2000
 #define VTPCI_FLAG_MSIX			0x4000
 #define VTPCI_FLAG_SHARED_MSIX		0x8000
 #define VTPCI_FLAG_ITYPE_MASK		0xF000
 
 	/* This "bus" will only ever have one child. */
 	device_t			 vtpci_child_dev;
 	struct virtio_feature_desc	*vtpci_child_feat_desc;
 
 	int				 vtpci_nvqs;
 	struct vtpci_virtqueue		*vtpci_vqs;
 
 	/*
 	 * Ideally, each virtqueue that the driver provides a callback for will
 	 * receive its own MSIX vector. If there are not sufficient vectors
 	 * available, then attempt to have all the VQs share one vector. For
 	 * MSIX, the configuration changed notifications must be on their own
 	 * vector.
 	 *
 	 * If MSIX is not available, we will attempt to have the whole device
 	 * share one MSI vector, and then, finally, one legacy interrupt.
 	 */
 	struct vtpci_interrupt		 vtpci_device_interrupt;
 	struct vtpci_interrupt		*vtpci_msix_vq_interrupts;
 	int				 vtpci_nmsix_resources;
 };
 
 static int	vtpci_probe(device_t);
 static int	vtpci_attach(device_t);
 static int	vtpci_detach(device_t);
 static int	vtpci_suspend(device_t);
 static int	vtpci_resume(device_t);
 static int	vtpci_shutdown(device_t);
 static void	vtpci_driver_added(device_t, driver_t *);
 static void	vtpci_child_detached(device_t, device_t);
 static int	vtpci_read_ivar(device_t, device_t, int, uintptr_t *);
 static int	vtpci_write_ivar(device_t, device_t, int, uintptr_t);
 
 static uint64_t	vtpci_negotiate_features(device_t, uint64_t);
 static int	vtpci_with_feature(device_t, uint64_t);
 static int	vtpci_alloc_virtqueues(device_t, int, int,
 		    struct vq_alloc_info *);
 static int	vtpci_setup_intr(device_t, enum intr_type);
 static void	vtpci_stop(device_t);
 static int	vtpci_reinit(device_t, uint64_t);
 static void	vtpci_reinit_complete(device_t);
 static void	vtpci_notify_virtqueue(device_t, uint16_t);
 static uint8_t	vtpci_get_status(device_t);
 static void	vtpci_set_status(device_t, uint8_t);
 static void	vtpci_read_dev_config(device_t, bus_size_t, void *, int);
 static void	vtpci_write_dev_config(device_t, bus_size_t, void *, int);
 
 static void	vtpci_describe_features(struct vtpci_softc *, const char *,
 		    uint64_t);
 static void	vtpci_probe_and_attach_child(struct vtpci_softc *);
 
 static int	vtpci_alloc_msix(struct vtpci_softc *, int);
 static int	vtpci_alloc_msi(struct vtpci_softc *);
 static int	vtpci_alloc_intr_msix_pervq(struct vtpci_softc *);
 static int	vtpci_alloc_intr_msix_shared(struct vtpci_softc *);
 static int	vtpci_alloc_intr_msi(struct vtpci_softc *);
 static int	vtpci_alloc_intr_legacy(struct vtpci_softc *);
 static int	vtpci_alloc_interrupt(struct vtpci_softc *, int, int,
 		    struct vtpci_interrupt *);
 static int	vtpci_alloc_intr_resources(struct vtpci_softc *);
 
 static int	vtpci_setup_legacy_interrupt(struct vtpci_softc *,
 		    enum intr_type);
 static int	vtpci_setup_pervq_msix_interrupts(struct vtpci_softc *,
 		    enum intr_type);
 static int	vtpci_setup_msix_interrupts(struct vtpci_softc *,
 		    enum intr_type);
 static int	vtpci_setup_interrupts(struct vtpci_softc *, enum intr_type);
 
 static int	vtpci_register_msix_vector(struct vtpci_softc *, int,
 		    struct vtpci_interrupt *);
 static int	vtpci_set_host_msix_vectors(struct vtpci_softc *);
 static int	vtpci_reinit_virtqueue(struct vtpci_softc *, int);
 
 static void	vtpci_free_interrupt(struct vtpci_softc *,
 		    struct vtpci_interrupt *);
 static void	vtpci_free_interrupts(struct vtpci_softc *);
 static void	vtpci_free_virtqueues(struct vtpci_softc *);
 static void	vtpci_release_child_resources(struct vtpci_softc *);
 static void	vtpci_cleanup_setup_intr_attempt(struct vtpci_softc *);
 static void	vtpci_reset(struct vtpci_softc *);
 
 static void	vtpci_select_virtqueue(struct vtpci_softc *, int);
 
 static void	vtpci_legacy_intr(void *);
 static int	vtpci_vq_shared_intr_filter(void *);
 static void	vtpci_vq_shared_intr(void *);
 static int	vtpci_vq_intr_filter(void *);
 static void	vtpci_vq_intr(void *);
 static void	vtpci_config_intr(void *);
 
 #define vtpci_setup_msi_interrupt vtpci_setup_legacy_interrupt
 
 #define VIRTIO_PCI_CONFIG(_sc) \
     VIRTIO_PCI_CONFIG_OFF((((_sc)->vtpci_flags & VTPCI_FLAG_MSIX)) != 0)
 
 /*
  * I/O port read/write wrappers.
  */
 #define vtpci_read_config_1(sc, o)	bus_read_1((sc)->vtpci_res, (o))
 #define vtpci_read_config_2(sc, o)	bus_read_2((sc)->vtpci_res, (o))
 #define vtpci_read_config_4(sc, o)	bus_read_4((sc)->vtpci_res, (o))
 #define vtpci_write_config_1(sc, o, v)	bus_write_1((sc)->vtpci_res, (o), (v))
 #define vtpci_write_config_2(sc, o, v)	bus_write_2((sc)->vtpci_res, (o), (v))
 #define vtpci_write_config_4(sc, o, v)	bus_write_4((sc)->vtpci_res, (o), (v))
 
 /* Tunables. */
 static int vtpci_disable_msix = 0;
 TUNABLE_INT("hw.virtio.pci.disable_msix", &vtpci_disable_msix);
 
 static device_method_t vtpci_methods[] = {
 	/* Device interface. */
 	DEVMETHOD(device_probe,			  vtpci_probe),
 	DEVMETHOD(device_attach,		  vtpci_attach),
 	DEVMETHOD(device_detach,		  vtpci_detach),
 	DEVMETHOD(device_suspend,		  vtpci_suspend),
 	DEVMETHOD(device_resume,		  vtpci_resume),
 	DEVMETHOD(device_shutdown,		  vtpci_shutdown),
 
 	/* Bus interface. */
 	DEVMETHOD(bus_driver_added,		  vtpci_driver_added),
 	DEVMETHOD(bus_child_detached,		  vtpci_child_detached),
 	DEVMETHOD(bus_child_pnpinfo_str,	  virtio_child_pnpinfo_str),
 	DEVMETHOD(bus_read_ivar,		  vtpci_read_ivar),
 	DEVMETHOD(bus_write_ivar,		  vtpci_write_ivar),
 
 	/* VirtIO bus interface. */
 	DEVMETHOD(virtio_bus_negotiate_features,  vtpci_negotiate_features),
 	DEVMETHOD(virtio_bus_with_feature,	  vtpci_with_feature),
 	DEVMETHOD(virtio_bus_alloc_virtqueues,	  vtpci_alloc_virtqueues),
 	DEVMETHOD(virtio_bus_setup_intr,	  vtpci_setup_intr),
 	DEVMETHOD(virtio_bus_stop,		  vtpci_stop),
 	DEVMETHOD(virtio_bus_reinit,		  vtpci_reinit),
 	DEVMETHOD(virtio_bus_reinit_complete,	  vtpci_reinit_complete),
 	DEVMETHOD(virtio_bus_notify_vq,		  vtpci_notify_virtqueue),
 	DEVMETHOD(virtio_bus_read_device_config,  vtpci_read_dev_config),
 	DEVMETHOD(virtio_bus_write_device_config, vtpci_write_dev_config),
 
 	DEVMETHOD_END
 };
 
 static driver_t vtpci_driver = {
 	"virtio_pci",
 	vtpci_methods,
 	sizeof(struct vtpci_softc)
 };
 
 devclass_t vtpci_devclass;
 
 DRIVER_MODULE(virtio_pci, pci, vtpci_driver, vtpci_devclass, 0, 0);
 MODULE_VERSION(virtio_pci, 1);
 MODULE_DEPEND(virtio_pci, pci, 1, 1, 1);
 MODULE_DEPEND(virtio_pci, virtio, 1, 1, 1);
 
 static int
 vtpci_probe(device_t dev)
 {
 	char desc[36];
 	const char *name;
 
 	if (pci_get_vendor(dev) != VIRTIO_PCI_VENDORID)
 		return (ENXIO);
 
 	if (pci_get_device(dev) < VIRTIO_PCI_DEVICEID_MIN ||
 	    pci_get_device(dev) > VIRTIO_PCI_DEVICEID_MAX)
 		return (ENXIO);
 
 	if (pci_get_revid(dev) != VIRTIO_PCI_ABI_VERSION)
 		return (ENXIO);
 
 	name = virtio_device_name(pci_get_subdevice(dev));
 	if (name == NULL)
 		name = "Unknown";
 
 	snprintf(desc, sizeof(desc), "VirtIO PCI %s adapter", name);
 	device_set_desc_copy(dev, desc);
 
 	return (BUS_PROBE_DEFAULT);
 }
 
 static int
 vtpci_attach(device_t dev)
 {
 	struct vtpci_softc *sc;
 	device_t child;
 	int rid;
 
 	sc = device_get_softc(dev);
 	sc->vtpci_dev = dev;
 
 	pci_enable_busmaster(dev);
 
 	rid = PCIR_BAR(0);
 	sc->vtpci_res = bus_alloc_resource_any(dev, SYS_RES_IOPORT, &rid,
 	    RF_ACTIVE);
 	if (sc->vtpci_res == NULL) {
 		device_printf(dev, "cannot map I/O space\n");
 		return (ENXIO);
 	}
 
 	if (pci_find_cap(dev, PCIY_MSI, NULL) != 0)
 		sc->vtpci_flags |= VTPCI_FLAG_NO_MSI;
 
 	if (pci_find_cap(dev, PCIY_MSIX, NULL) == 0) {
 		rid = PCIR_BAR(1);
 		sc->vtpci_msix_res = bus_alloc_resource_any(dev,
 		    SYS_RES_MEMORY, &rid, RF_ACTIVE);
 	}
 
 	if (sc->vtpci_msix_res == NULL)
 		sc->vtpci_flags |= VTPCI_FLAG_NO_MSIX;
 
 	vtpci_reset(sc);
 
 	/* Tell the host we've noticed this device. */
 	vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
 
 	if ((child = device_add_child(dev, NULL, -1)) == NULL) {
 		device_printf(dev, "cannot create child device\n");
 		vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
 		vtpci_detach(dev);
 		return (ENOMEM);
 	}
 
 	sc->vtpci_child_dev = child;
 	vtpci_probe_and_attach_child(sc);
 
 	return (0);
 }
 
 static int
 vtpci_detach(device_t dev)
 {
 	struct vtpci_softc *sc;
 	device_t child;
 	int error;
 
 	sc = device_get_softc(dev);
 
 	if ((child = sc->vtpci_child_dev) != NULL) {
 		error = device_delete_child(dev, child);
 		if (error)
 			return (error);
 		sc->vtpci_child_dev = NULL;
 	}
 
 	vtpci_reset(sc);
 
 	if (sc->vtpci_msix_res != NULL) {
 		bus_release_resource(dev, SYS_RES_MEMORY, PCIR_BAR(1),
 		    sc->vtpci_msix_res);
 		sc->vtpci_msix_res = NULL;
 	}
 
 	if (sc->vtpci_res != NULL) {
 		bus_release_resource(dev, SYS_RES_IOPORT, PCIR_BAR(0),
 		    sc->vtpci_res);
 		sc->vtpci_res = NULL;
 	}
 
 	return (0);
 }
 
 static int
 vtpci_suspend(device_t dev)
 {
 
 	return (bus_generic_suspend(dev));
 }
 
 static int
 vtpci_resume(device_t dev)
 {
 
 	return (bus_generic_resume(dev));
 }
 
 static int
 vtpci_shutdown(device_t dev)
 {
 
 	(void) bus_generic_shutdown(dev);
 	/* Forcibly stop the host device. */
 	vtpci_stop(dev);
 
 	return (0);
 }
 
 static void
 vtpci_driver_added(device_t dev, driver_t *driver)
 {
 	struct vtpci_softc *sc;
 
 	sc = device_get_softc(dev);
 
 	vtpci_probe_and_attach_child(sc);
 }
 
 static void
 vtpci_child_detached(device_t dev, device_t child)
 {
 	struct vtpci_softc *sc;
 
 	sc = device_get_softc(dev);
 
 	vtpci_reset(sc);
 	vtpci_release_child_resources(sc);
 }
 
 static int
 vtpci_read_ivar(device_t dev, device_t child, int index, uintptr_t *result)
 {
 	struct vtpci_softc *sc;
 
 	sc = device_get_softc(dev);
 
 	if (sc->vtpci_child_dev != child)
 		return (ENOENT);
 
 	switch (index) {
 	case VIRTIO_IVAR_DEVTYPE:
 	case VIRTIO_IVAR_SUBDEVICE:
 		*result = pci_get_subdevice(dev);
 		break;
 	case VIRTIO_IVAR_VENDOR:
 		*result = pci_get_vendor(dev);
 		break;
 	case VIRTIO_IVAR_DEVICE:
 		*result = pci_get_device(dev);
 		break;
 	case VIRTIO_IVAR_SUBVENDOR:
 		*result = pci_get_subvendor(dev);
 		break;
 	default:
 		return (ENOENT);
 	}
 
 	return (0);
 }
 
 static int
 vtpci_write_ivar(device_t dev, device_t child, int index, uintptr_t value)
 {
 	struct vtpci_softc *sc;
 
 	sc = device_get_softc(dev);
 
 	if (sc->vtpci_child_dev != child)
 		return (ENOENT);
 
 	switch (index) {
 	case VIRTIO_IVAR_FEATURE_DESC:
 		sc->vtpci_child_feat_desc = (void *) value;
 		break;
 	default:
 		return (ENOENT);
 	}
 
 	return (0);
 }
 
 static uint64_t
 vtpci_negotiate_features(device_t dev, uint64_t child_features)
 {
 	struct vtpci_softc *sc;
 	uint64_t host_features, features;
 
 	sc = device_get_softc(dev);
 
 	host_features = vtpci_read_config_4(sc, VIRTIO_PCI_HOST_FEATURES);
 	vtpci_describe_features(sc, "host", host_features);
 
 	/*
 	 * Limit negotiated features to what the driver, virtqueue, and
 	 * host all support.
 	 */
 	features = host_features & child_features;
 	features = virtqueue_filter_features(features);
 	sc->vtpci_features = features;
 
 	vtpci_describe_features(sc, "negotiated", features);
 	vtpci_write_config_4(sc, VIRTIO_PCI_GUEST_FEATURES, features);
 
 	return (features);
 }
 
 static int
 vtpci_with_feature(device_t dev, uint64_t feature)
 {
 	struct vtpci_softc *sc;
 
 	sc = device_get_softc(dev);
 
 	return ((sc->vtpci_features & feature) != 0);
 }
 
 static int
 vtpci_alloc_virtqueues(device_t dev, int flags, int nvqs,
     struct vq_alloc_info *vq_info)
 {
 	struct vtpci_softc *sc;
 	struct virtqueue *vq;
 	struct vtpci_virtqueue *vqx;
 	struct vq_alloc_info *info;
 	int idx, error;
 	uint16_t size;
 
 	sc = device_get_softc(dev);
 
 	if (sc->vtpci_nvqs != 0)
 		return (EALREADY);
 	if (nvqs <= 0)
 		return (EINVAL);
 
 	sc->vtpci_vqs = malloc(nvqs * sizeof(struct vtpci_virtqueue),
 	    M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (sc->vtpci_vqs == NULL)
 		return (ENOMEM);
 
 	for (idx = 0; idx < nvqs; idx++) {
 		vqx = &sc->vtpci_vqs[idx];
 		info = &vq_info[idx];
 
 		vtpci_select_virtqueue(sc, idx);
 		size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM);
 
 		error = virtqueue_alloc(dev, idx, size, VIRTIO_PCI_VRING_ALIGN,
-		    0xFFFFFFFFUL, info, &vq);
+		    ~(vm_paddr_t)0, info, &vq);
 		if (error) {
 			device_printf(dev,
 			    "cannot allocate virtqueue %d: %d\n", idx, error);
 			break;
 		}
 
 		vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN,
 		    virtqueue_paddr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT);
 
 		vqx->vtv_vq = *info->vqai_vq = vq;
 		vqx->vtv_no_intr = info->vqai_intr == NULL;
 
 		sc->vtpci_nvqs++;
 	}
 
 	if (error)
 		vtpci_free_virtqueues(sc);
 
 	return (error);
 }
 
 static int
 vtpci_setup_intr(device_t dev, enum intr_type type)
 {
 	struct vtpci_softc *sc;
 	int attempt, error;
 
 	sc = device_get_softc(dev);
 
 	for (attempt = 0; attempt < 5; attempt++) {
 		/*
 		 * Start with the most desirable interrupt configuration and
 		 * fallback towards less desirable ones.
 		 */
 		switch (attempt) {
 		case 0:
 			error = vtpci_alloc_intr_msix_pervq(sc);
 			break;
 		case 1:
 			error = vtpci_alloc_intr_msix_shared(sc);
 			break;
 		case 2:
 			error = vtpci_alloc_intr_msi(sc);
 			break;
 		case 3:
 			error = vtpci_alloc_intr_legacy(sc);
 			break;
 		default:
 			device_printf(dev,
 			    "exhausted all interrupt allocation attempts\n");
 			return (ENXIO);
 		}
 
 		if (error == 0 && vtpci_setup_interrupts(sc, type) == 0)
 			break;
 
 		vtpci_cleanup_setup_intr_attempt(sc);
 	}
 
 	if (bootverbose) {
 		if (sc->vtpci_flags & VTPCI_FLAG_LEGACY)
 			device_printf(dev, "using legacy interrupt\n");
 		else if (sc->vtpci_flags & VTPCI_FLAG_MSI)
 			device_printf(dev, "using MSI interrupt\n");
 		else if (sc->vtpci_flags & VTPCI_FLAG_SHARED_MSIX)
 			device_printf(dev, "using shared MSIX interrupts\n");
 		else
 			device_printf(dev, "using per VQ MSIX interrupts\n");
 	}
 
 	return (0);
 }
 
 static void
 vtpci_stop(device_t dev)
 {
 
 	vtpci_reset(device_get_softc(dev));
 }
 
 static int
 vtpci_reinit(device_t dev, uint64_t features)
 {
 	struct vtpci_softc *sc;
 	int idx, error;
 
 	sc = device_get_softc(dev);
 
 	/*
 	 * Redrive the device initialization. This is a bit of an abuse of
 	 * the specification, but VirtualBox, QEMU/KVM, and BHyVe seem to
 	 * play nice.
 	 *
 	 * We do not allow the host device to change from what was originally
 	 * negotiated beyond what the guest driver changed. MSIX state should
 	 * not change, number of virtqueues and their size remain the same, etc.
 	 * This will need to be rethought when we want to support migration.
 	 */
 
 	if (vtpci_get_status(dev) != VIRTIO_CONFIG_STATUS_RESET)
 		vtpci_stop(dev);
 
 	/*
 	 * Quickly drive the status through ACK and DRIVER. The device
 	 * does not become usable again until vtpci_reinit_complete().
 	 */
 	vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
 	vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER);
 
 	vtpci_negotiate_features(dev, features);
 
 	for (idx = 0; idx < sc->vtpci_nvqs; idx++) {
 		error = vtpci_reinit_virtqueue(sc, idx);
 		if (error)
 			return (error);
 	}
 
 	if (sc->vtpci_flags & VTPCI_FLAG_MSIX) {
 		error = vtpci_set_host_msix_vectors(sc);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 static void
 vtpci_reinit_complete(device_t dev)
 {
 
 	vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK);
 }
 
 static void
 vtpci_notify_virtqueue(device_t dev, uint16_t queue)
 {
 	struct vtpci_softc *sc;
 
 	sc = device_get_softc(dev);
 
 	vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_NOTIFY, queue);
 }
 
 static uint8_t
 vtpci_get_status(device_t dev)
 {
 	struct vtpci_softc *sc;
 
 	sc = device_get_softc(dev);
 
 	return (vtpci_read_config_1(sc, VIRTIO_PCI_STATUS));
 }
 
 static void
 vtpci_set_status(device_t dev, uint8_t status)
 {
 	struct vtpci_softc *sc;
 
 	sc = device_get_softc(dev);
 
 	if (status != VIRTIO_CONFIG_STATUS_RESET)
 		status |= vtpci_get_status(dev);
 
 	vtpci_write_config_1(sc, VIRTIO_PCI_STATUS, status);
 }
 
 static void
 vtpci_read_dev_config(device_t dev, bus_size_t offset,
     void *dst, int length)
 {
 	struct vtpci_softc *sc;
 	bus_size_t off;
 	uint8_t *d;
 	int size;
 
 	sc = device_get_softc(dev);
 	off = VIRTIO_PCI_CONFIG(sc) + offset;
 
 	for (d = dst; length > 0; d += size, off += size, length -= size) {
 		if (length >= 4) {
 			size = 4;
 			*(uint32_t *)d = vtpci_read_config_4(sc, off);
 		} else if (length >= 2) {
 			size = 2;
 			*(uint16_t *)d = vtpci_read_config_2(sc, off);
 		} else {
 			size = 1;
 			*d = vtpci_read_config_1(sc, off);
 		}
 	}
 }
 
 static void
 vtpci_write_dev_config(device_t dev, bus_size_t offset,
     void *src, int length)
 {
 	struct vtpci_softc *sc;
 	bus_size_t off;
 	uint8_t *s;
 	int size;
 
 	sc = device_get_softc(dev);
 	off = VIRTIO_PCI_CONFIG(sc) + offset;
 
 	for (s = src; length > 0; s += size, off += size, length -= size) {
 		if (length >= 4) {
 			size = 4;
 			vtpci_write_config_4(sc, off, *(uint32_t *)s);
 		} else if (length >= 2) {
 			size = 2;
 			vtpci_write_config_2(sc, off, *(uint16_t *)s);
 		} else {
 			size = 1;
 			vtpci_write_config_1(sc, off, *s);
 		}
 	}
 }
 
 static void
 vtpci_describe_features(struct vtpci_softc *sc, const char *msg,
     uint64_t features)
 {
 	device_t dev, child;
 
 	dev = sc->vtpci_dev;
 	child = sc->vtpci_child_dev;
 
 	if (device_is_attached(child) || bootverbose == 0)
 		return;
 
 	virtio_describe(dev, msg, features, sc->vtpci_child_feat_desc);
 }
 
 static void
 vtpci_probe_and_attach_child(struct vtpci_softc *sc)
 {
 	device_t dev, child;
 
 	dev = sc->vtpci_dev;
 	child = sc->vtpci_child_dev;
 
 	if (child == NULL)
 		return;
 
 	if (device_get_state(child) != DS_NOTPRESENT)
 		return;
 
 	if (device_probe(child) != 0)
 		return;
 
 	vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER);
 	if (device_attach(child) != 0) {
 		vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_FAILED);
 		vtpci_reset(sc);
 		vtpci_release_child_resources(sc);
 		/* Reset status for future attempt. */
 		vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_ACK);
 	} else {
 		vtpci_set_status(dev, VIRTIO_CONFIG_STATUS_DRIVER_OK);
 		VIRTIO_ATTACH_COMPLETED(child);
 	}
 }
 
 static int
 vtpci_alloc_msix(struct vtpci_softc *sc, int nvectors)
 {
 	device_t dev;
 	int nmsix, cnt, required;
 
 	dev = sc->vtpci_dev;
 
 	/* Allocate an additional vector for the config changes. */
 	required = nvectors + 1;
 
 	nmsix = pci_msix_count(dev);
 	if (nmsix < required)
 		return (1);
 
 	cnt = required;
 	if (pci_alloc_msix(dev, &cnt) == 0 && cnt >= required) {
 		sc->vtpci_nmsix_resources = required;
 		return (0);
 	}
 
 	pci_release_msi(dev);
 
 	return (1);
 }
 
 static int
 vtpci_alloc_msi(struct vtpci_softc *sc)
 {
 	device_t dev;
 	int nmsi, cnt, required;
 
 	dev = sc->vtpci_dev;
 	required = 1;
 
 	nmsi = pci_msi_count(dev);
 	if (nmsi < required)
 		return (1);
 
 	cnt = required;
 	if (pci_alloc_msi(dev, &cnt) == 0 && cnt >= required)
 		return (0);
 
 	pci_release_msi(dev);
 
 	return (1);
 }
 
 static int
 vtpci_alloc_intr_msix_pervq(struct vtpci_softc *sc)
 {
 	int i, nvectors, error;
 
 	if (vtpci_disable_msix != 0 ||
 	    sc->vtpci_flags & VTPCI_FLAG_NO_MSIX)
 		return (ENOTSUP);
 
 	for (nvectors = 0, i = 0; i < sc->vtpci_nvqs; i++) {
 		if (sc->vtpci_vqs[i].vtv_no_intr == 0)
 			nvectors++;
 	}
 
 	error = vtpci_alloc_msix(sc, nvectors);
 	if (error)
 		return (error);
 
 	sc->vtpci_flags |= VTPCI_FLAG_MSIX;
 
 	return (0);
 }
 
 static int
 vtpci_alloc_intr_msix_shared(struct vtpci_softc *sc)
 {
 	int error;
 
 	if (vtpci_disable_msix != 0 ||
 	    sc->vtpci_flags & VTPCI_FLAG_NO_MSIX)
 		return (ENOTSUP);
 
 	error = vtpci_alloc_msix(sc, 1);
 	if (error)
 		return (error);
 
 	sc->vtpci_flags |= VTPCI_FLAG_MSIX | VTPCI_FLAG_SHARED_MSIX;
 
 	return (0);
 }
 
 static int
 vtpci_alloc_intr_msi(struct vtpci_softc *sc)
 {
 	int error;
 
 	/* Only BHyVe supports MSI. */
 	if (sc->vtpci_flags & VTPCI_FLAG_NO_MSI)
 		return (ENOTSUP);
 
 	error = vtpci_alloc_msi(sc);
 	if (error)
 		return (error);
 
 	sc->vtpci_flags |= VTPCI_FLAG_MSI;
 
 	return (0);
 }
 
 static int
 vtpci_alloc_intr_legacy(struct vtpci_softc *sc)
 {
 
 	sc->vtpci_flags |= VTPCI_FLAG_LEGACY;
 
 	return (0);
 }
 
 static int
 vtpci_alloc_interrupt(struct vtpci_softc *sc, int rid, int flags,
     struct vtpci_interrupt *intr)
 {
 	struct resource *irq;
 
 	irq = bus_alloc_resource_any(sc->vtpci_dev, SYS_RES_IRQ, &rid, flags);
 	if (irq == NULL)
 		return (ENXIO);
 
 	intr->vti_irq = irq;
 	intr->vti_rid = rid;
 
 	return (0);
 }
 
 static int
 vtpci_alloc_intr_resources(struct vtpci_softc *sc)
 {
 	struct vtpci_interrupt *intr;
 	int i, rid, flags, nvq_intrs, error;
 
 	rid = 0;
 	flags = RF_ACTIVE;
 
 	if (sc->vtpci_flags & VTPCI_FLAG_LEGACY)
 		flags |= RF_SHAREABLE;
 	else
 		rid = 1;
 
 	/*
 	 * For legacy and MSI interrupts, this single resource handles all
 	 * interrupts. For MSIX, this resource is used for the configuration
 	 * changed interrupt.
 	 */
 	intr = &sc->vtpci_device_interrupt;
 	error = vtpci_alloc_interrupt(sc, rid, flags, intr);
 	if (error || sc->vtpci_flags & (VTPCI_FLAG_LEGACY | VTPCI_FLAG_MSI))
 		return (error);
 
 	/* Subtract one for the configuration changed interrupt. */
 	nvq_intrs = sc->vtpci_nmsix_resources - 1;
 
 	intr = sc->vtpci_msix_vq_interrupts = malloc(nvq_intrs *
 	    sizeof(struct vtpci_interrupt), M_DEVBUF, M_NOWAIT | M_ZERO);
 	if (sc->vtpci_msix_vq_interrupts == NULL)
 		return (ENOMEM);
 
 	for (i = 0, rid++; i < nvq_intrs; i++, rid++, intr++) {
 		error = vtpci_alloc_interrupt(sc, rid, flags, intr);
 		if (error)
 			return (error);
 	}
 
 	return (0);
 }
 
 static int
 vtpci_setup_legacy_interrupt(struct vtpci_softc *sc, enum intr_type type)
 {
 	struct vtpci_interrupt *intr;
 	int error;
 
 	intr = &sc->vtpci_device_interrupt;
 	error = bus_setup_intr(sc->vtpci_dev, intr->vti_irq, type, NULL,
 	    vtpci_legacy_intr, sc, &intr->vti_handler);
 
 	return (error);
 }
 
 static int
 vtpci_setup_pervq_msix_interrupts(struct vtpci_softc *sc, enum intr_type type)
 {
 	struct vtpci_virtqueue *vqx;
 	struct vtpci_interrupt *intr;
 	int i, error;
 
 	intr = sc->vtpci_msix_vq_interrupts;
 
 	for (i = 0; i < sc->vtpci_nvqs; i++) {
 		vqx = &sc->vtpci_vqs[i];
 
 		if (vqx->vtv_no_intr)
 			continue;
 
 		error = bus_setup_intr(sc->vtpci_dev, intr->vti_irq, type,
 		    vtpci_vq_intr_filter, vtpci_vq_intr, vqx->vtv_vq,
 		    &intr->vti_handler);
 		if (error)
 			return (error);
 
 		intr++;
 	}
 
 	return (0);
 }
 
 static int
 vtpci_setup_msix_interrupts(struct vtpci_softc *sc, enum intr_type type)
 {
 	device_t dev;
 	struct vtpci_interrupt *intr;
 	int error;
 
 	dev = sc->vtpci_dev;
 	intr = &sc->vtpci_device_interrupt;
 
 	error = bus_setup_intr(dev, intr->vti_irq, type, NULL,
 	    vtpci_config_intr, sc, &intr->vti_handler);
 	if (error)
 		return (error);
 
 	if (sc->vtpci_flags & VTPCI_FLAG_SHARED_MSIX) {
 		intr = sc->vtpci_msix_vq_interrupts;
 		error = bus_setup_intr(dev, intr->vti_irq, type,
 		    vtpci_vq_shared_intr_filter, vtpci_vq_shared_intr, sc,
 		    &intr->vti_handler);
 	} else
 		error = vtpci_setup_pervq_msix_interrupts(sc, type);
 
 	return (error ? error : vtpci_set_host_msix_vectors(sc));
 }
 
 static int
 vtpci_setup_interrupts(struct vtpci_softc *sc, enum intr_type type)
 {
 	int error;
 
 	type |= INTR_MPSAFE;
 	KASSERT(sc->vtpci_flags & VTPCI_FLAG_ITYPE_MASK,
 	    ("%s: no interrupt type selected %#x", __func__, sc->vtpci_flags));
 
 	error = vtpci_alloc_intr_resources(sc);
 	if (error)
 		return (error);
 
 	if (sc->vtpci_flags & VTPCI_FLAG_LEGACY)
 		error = vtpci_setup_legacy_interrupt(sc, type);
 	else if (sc->vtpci_flags & VTPCI_FLAG_MSI)
 		error = vtpci_setup_msi_interrupt(sc, type);
 	else
 		error = vtpci_setup_msix_interrupts(sc, type);
 
 	return (error);
 }
 
 static int
 vtpci_register_msix_vector(struct vtpci_softc *sc, int offset,
     struct vtpci_interrupt *intr)
 {
 	device_t dev;
 	uint16_t vector;
 
 	dev = sc->vtpci_dev;
 
 	if (intr != NULL) {
 		/* Map from guest rid to host vector. */
 		vector = intr->vti_rid - 1;
 	} else
 		vector = VIRTIO_MSI_NO_VECTOR;
 
 	vtpci_write_config_2(sc, offset, vector);
 
 	/* Read vector to determine if the host had sufficient resources. */
 	if (vtpci_read_config_2(sc, offset) != vector) {
 		device_printf(dev,
 		    "insufficient host resources for MSIX interrupts\n");
 		return (ENODEV);
 	}
 
 	return (0);
 }
 
 static int
 vtpci_set_host_msix_vectors(struct vtpci_softc *sc)
 {
 	struct vtpci_interrupt *intr, *tintr;
 	int idx, offset, error;
 
 	intr = &sc->vtpci_device_interrupt;
 	offset = VIRTIO_MSI_CONFIG_VECTOR;
 
 	error = vtpci_register_msix_vector(sc, offset, intr);
 	if (error)
 		return (error);
 
 	intr = sc->vtpci_msix_vq_interrupts;
 	offset = VIRTIO_MSI_QUEUE_VECTOR;
 
 	for (idx = 0; idx < sc->vtpci_nvqs; idx++) {
 		vtpci_select_virtqueue(sc, idx);
 
 		if (sc->vtpci_vqs[idx].vtv_no_intr)
 			tintr = NULL;
 		else
 			tintr = intr;
 
 		error = vtpci_register_msix_vector(sc, offset, tintr);
 		if (error)
 			break;
 
 		/*
 		 * For shared MSIX, all the virtqueues share the first
 		 * interrupt.
 		 */
 		if (!sc->vtpci_vqs[idx].vtv_no_intr &&
 		    (sc->vtpci_flags & VTPCI_FLAG_SHARED_MSIX) == 0)
 			intr++;
 	}
 
 	return (error);
 }
 
 static int
 vtpci_reinit_virtqueue(struct vtpci_softc *sc, int idx)
 {
 	struct vtpci_virtqueue *vqx;
 	struct virtqueue *vq;
 	int error;
 	uint16_t size;
 
 	vqx = &sc->vtpci_vqs[idx];
 	vq = vqx->vtv_vq;
 
 	KASSERT(vq != NULL, ("%s: vq %d not allocated", __func__, idx));
 
 	vtpci_select_virtqueue(sc, idx);
 	size = vtpci_read_config_2(sc, VIRTIO_PCI_QUEUE_NUM);
 
 	error = virtqueue_reinit(vq, size);
 	if (error)
 		return (error);
 
 	vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN,
 	    virtqueue_paddr(vq) >> VIRTIO_PCI_QUEUE_ADDR_SHIFT);
 
 	return (0);
 }
 
 static void
 vtpci_free_interrupt(struct vtpci_softc *sc, struct vtpci_interrupt *intr)
 {
 	device_t dev;
 
 	dev = sc->vtpci_dev;
 
 	if (intr->vti_handler != NULL) {
 		bus_teardown_intr(dev, intr->vti_irq, intr->vti_handler);
 		intr->vti_handler = NULL;
 	}
 
 	if (intr->vti_irq != NULL) {
 		bus_release_resource(dev, SYS_RES_IRQ, intr->vti_rid,
 		    intr->vti_irq);
 		intr->vti_irq = NULL;
 		intr->vti_rid = -1;
 	}
 }
 
 static void
 vtpci_free_interrupts(struct vtpci_softc *sc)
 {
 	struct vtpci_interrupt *intr;
 	int i, nvq_intrs;
 
 	vtpci_free_interrupt(sc, &sc->vtpci_device_interrupt);
 
 	if (sc->vtpci_nmsix_resources != 0) {
 		nvq_intrs = sc->vtpci_nmsix_resources - 1;
 		sc->vtpci_nmsix_resources = 0;
 
 		intr = sc->vtpci_msix_vq_interrupts;
 		if (intr != NULL) {
 			for (i = 0; i < nvq_intrs; i++, intr++)
 				vtpci_free_interrupt(sc, intr);
 
 			free(sc->vtpci_msix_vq_interrupts, M_DEVBUF);
 			sc->vtpci_msix_vq_interrupts = NULL;
 		}
 	}
 
 	if (sc->vtpci_flags & (VTPCI_FLAG_MSI | VTPCI_FLAG_MSIX))
 		pci_release_msi(sc->vtpci_dev);
 
 	sc->vtpci_flags &= ~VTPCI_FLAG_ITYPE_MASK;
 }
 
 static void
 vtpci_free_virtqueues(struct vtpci_softc *sc)
 {
 	struct vtpci_virtqueue *vqx;
 	int idx;
 
 	for (idx = 0; idx < sc->vtpci_nvqs; idx++) {
 		vqx = &sc->vtpci_vqs[idx];
 
 		vtpci_select_virtqueue(sc, idx);
 		vtpci_write_config_4(sc, VIRTIO_PCI_QUEUE_PFN, 0);
 
 		virtqueue_free(vqx->vtv_vq);
 		vqx->vtv_vq = NULL;
 	}
 
 	free(sc->vtpci_vqs, M_DEVBUF);
 	sc->vtpci_vqs = NULL;
 	sc->vtpci_nvqs = 0;
 }
 
 static void
 vtpci_release_child_resources(struct vtpci_softc *sc)
 {
 
 	vtpci_free_interrupts(sc);
 	vtpci_free_virtqueues(sc);
 }
 
 static void
 vtpci_cleanup_setup_intr_attempt(struct vtpci_softc *sc)
 {
 	int idx;
 
 	if (sc->vtpci_flags & VTPCI_FLAG_MSIX) {
 		vtpci_write_config_2(sc, VIRTIO_MSI_CONFIG_VECTOR,
 		    VIRTIO_MSI_NO_VECTOR);
 
 		for (idx = 0; idx < sc->vtpci_nvqs; idx++) {
 			vtpci_select_virtqueue(sc, idx);
 			vtpci_write_config_2(sc, VIRTIO_MSI_QUEUE_VECTOR,
 			    VIRTIO_MSI_NO_VECTOR);
 		}
 	}
 
 	vtpci_free_interrupts(sc);
 }
 
 static void
 vtpci_reset(struct vtpci_softc *sc)
 {
 
 	/*
 	 * Setting the status to RESET sets the host device to
 	 * the original, uninitialized state.
 	 */
 	vtpci_set_status(sc->vtpci_dev, VIRTIO_CONFIG_STATUS_RESET);
 }
 
 static void
 vtpci_select_virtqueue(struct vtpci_softc *sc, int idx)
 {
 
 	vtpci_write_config_2(sc, VIRTIO_PCI_QUEUE_SEL, idx);
 }
 
 static void
 vtpci_legacy_intr(void *xsc)
 {
 	struct vtpci_softc *sc;
 	struct vtpci_virtqueue *vqx;
 	int i;
 	uint8_t isr;
 
 	sc = xsc;
 	vqx = &sc->vtpci_vqs[0];
 
 	/* Reading the ISR also clears it. */
 	isr = vtpci_read_config_1(sc, VIRTIO_PCI_ISR);
 
 	if (isr & VIRTIO_PCI_ISR_CONFIG)
 		vtpci_config_intr(sc);
 
 	if (isr & VIRTIO_PCI_ISR_INTR) {
 		for (i = 0; i < sc->vtpci_nvqs; i++, vqx++) {
 			if (vqx->vtv_no_intr == 0)
 				virtqueue_intr(vqx->vtv_vq);
 		}
 	}
 }
 
 static int
 vtpci_vq_shared_intr_filter(void *xsc)
 {
 	struct vtpci_softc *sc;
 	struct vtpci_virtqueue *vqx;
 	int i, rc;
 
 	rc = 0;
 	sc = xsc;
 	vqx = &sc->vtpci_vqs[0];
 
 	for (i = 0; i < sc->vtpci_nvqs; i++, vqx++) {
 		if (vqx->vtv_no_intr == 0)
 			rc |= virtqueue_intr_filter(vqx->vtv_vq);
 	}
 
 	return (rc ? FILTER_SCHEDULE_THREAD : FILTER_STRAY);
 }
 
 static void
 vtpci_vq_shared_intr(void *xsc)
 {
 	struct vtpci_softc *sc;
 	struct vtpci_virtqueue *vqx;
 	int i;
 
 	sc = xsc;
 	vqx = &sc->vtpci_vqs[0];
 
 	for (i = 0; i < sc->vtpci_nvqs; i++, vqx++) {
 		if (vqx->vtv_no_intr == 0)
 			virtqueue_intr(vqx->vtv_vq);
 	}
 }
 
 static int
 vtpci_vq_intr_filter(void *xvq)
 {
 	struct virtqueue *vq;
 	int rc;
 
 	vq = xvq;
 	rc = virtqueue_intr_filter(vq);
 
 	return (rc ? FILTER_SCHEDULE_THREAD : FILTER_STRAY);
 }
 
 static void
 vtpci_vq_intr(void *xvq)
 {
 	struct virtqueue *vq;
 
 	vq = xvq;
 	virtqueue_intr(vq);
 }
 
 static void
 vtpci_config_intr(void *xsc)
 {
 	struct vtpci_softc *sc;
 	device_t child;
 
 	sc = xsc;
 	child = sc->vtpci_child_dev;
 
 	if (child != NULL)
 		VIRTIO_CONFIG_CHANGE(child);
 }
Index: projects/nfsv42/sys/i386/i386/pmap.c
===================================================================
--- projects/nfsv42/sys/i386/i386/pmap.c	(revision 350367)
+++ projects/nfsv42/sys/i386/i386/pmap.c	(revision 350368)
@@ -1,6164 +1,6154 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * All rights reserved.
  * Copyright (c) 2018 The FreeBSD Foundation
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Portions of this software were developed by
  * Konstantin Belousov <kib@FreeBSD.org> under sponsorship from
  * the FreeBSD Foundation.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include "opt_apic.h"
 #include "opt_cpu.h"
 #include "opt_pmap.h"
 #include "opt_smp.h"
 #include "opt_vm.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sf_buf.h>
 #include <sys/sx.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 #include <sys/vmem.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 #ifdef DEV_APIC
 #include <sys/bus.h>
 #include <machine/intr_machdep.h>
 #include <x86/apicvar.h>
 #endif
 #include <x86/ifunc.h>
 #include <machine/bootinfo.h>
 #include <machine/cpu.h>
 #include <machine/cputypes.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/specialreg.h>
 #ifdef SMP
 #include <machine/smp.h>
 #endif
 #include <machine/pmap_base.h>
 
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
 #else
 #define PMAP_INLINE	extern inline
 #endif
 #else
 #define PMAP_INLINE
 #endif
 
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 #define	pa_index(pa)	((pa) >> PDRSHIFT)
 #define	pa_to_pvh(pa)	(&pv_table[pa_index(pa)])
 
 /*
  * PTmap is the recursive page map at the top of the virtual address space.
  * Within PTmap, the page directory can be found (third indirection).
  */
 #define	PTmap	((pt_entry_t *)(PTDPTDI << PDRSHIFT))
 #define	PTD	((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE)))
 #define	PTDpde	((pd_entry_t *)((PTDPTDI << PDRSHIFT) + (PTDPTDI * PAGE_SIZE) + \
     (PTDPTDI * PDESIZE)))
 
 /*
  * Translate a virtual address to the kernel virtual address of its page table
  * entry (PTE).  This can be used recursively.  If the address of a PTE as
  * previously returned by this macro is itself given as the argument, then the
  * address of the page directory entry (PDE) that maps the PTE will be
  * returned.
  *
  * This macro may be used before pmap_bootstrap() is called.
  */
 #define	vtopte(va)	(PTmap + i386_btop(va))
 
 /*
  * Get PDEs and PTEs for user/kernel address space
  */
 #define	pmap_pde(m, v)	(&((m)->pm_pdir[(vm_offset_t)(v) >> PDRSHIFT]))
 #define pdir_pde(m, v) (m[(vm_offset_t)(v) >> PDRSHIFT])
 
 #define pmap_pde_v(pte)		((*(int *)pte & PG_V) != 0)
 #define pmap_pte_w(pte)		((*(int *)pte & PG_W) != 0)
 #define pmap_pte_m(pte)		((*(int *)pte & PG_M) != 0)
 #define pmap_pte_u(pte)		((*(int *)pte & PG_A) != 0)
 #define pmap_pte_v(pte)		((*(int *)pte & PG_V) != 0)
 
 #define pmap_pte_set_w(pte, v)	((v) ? atomic_set_int((u_int *)(pte), PG_W) : \
     atomic_clear_int((u_int *)(pte), PG_W))
 #define pmap_pte_set_prot(pte, v) ((*(int *)pte &= ~PG_PROT), (*(int *)pte |= (v)))
 
 _Static_assert(sizeof(struct pmap) <= sizeof(struct pmap_KBI),
     "pmap_KBI");
 
 static int pgeflag = 0;		/* PG_G or-in */
 static int pseflag = 0;		/* PG_PS or-in */
 
 static int nkpt = NKPT;
 
 #ifdef PMAP_PAE_COMP
 pt_entry_t pg_nx;
 static uma_zone_t pdptzone;
 #endif
 
 _Static_assert(VM_MAXUSER_ADDRESS == VADDR(TRPTDI, 0), "VM_MAXUSER_ADDRESS");
 _Static_assert(VM_MAX_KERNEL_ADDRESS <= VADDR(PTDPTDI, 0),
     "VM_MAX_KERNEL_ADDRESS");
 _Static_assert(PMAP_MAP_LOW == VADDR(LOWPTDI, 0), "PMAP_MAP_LOW");
 _Static_assert(KERNLOAD == (KERNPTDI << PDRSHIFT), "KERNLOAD");
 
 extern int pat_works;
 extern int pg_ps_enabled;
 
 extern int elf32_nxstack;
 
 #define	PAT_INDEX_SIZE	8
 static int pat_index[PAT_INDEX_SIZE];	/* cache mode to PAT index conversion */
 
 /*
  * pmap_mapdev support pre initialization (i.e. console)
  */
 #define	PMAP_PREINIT_MAPPING_COUNT	8
 static struct pmap_preinit_mapping {
 	vm_paddr_t	pa;
 	vm_offset_t	va;
 	vm_size_t	sz;
 	int		mode;
 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
 static int pmap_initialized;
 
 static struct rwlock_padalign pvh_global_lock;
 
 /*
  * Data for the pv entry allocation mechanism
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 extern int pv_entry_max, pv_entry_count;
 static int pv_entry_high_water = 0;
 static struct md_page *pv_table;
 extern int shpgperproc;
 
 static struct pv_chunk *pv_chunkbase;	/* KVA block for pv_chunks */
 static int pv_maxchunks;		/* How many chunks we have KVA for */
 static vm_offset_t pv_vafree;		/* freelist stored in the PTE */
 
 /*
  * All those kernel PT submaps that BSD is so fond of
  */
 static pt_entry_t *CMAP3;
 static pd_entry_t *KPTD;
 static caddr_t CADDR3;
 
 /*
  * Crashdump maps.
  */
 static caddr_t crashdumpmap;
 
 static pt_entry_t *PMAP1 = NULL, *PMAP2, *PMAP3;
 static pt_entry_t *PADDR1 = NULL, *PADDR2, *PADDR3;
 #ifdef SMP
 static int PMAP1cpu, PMAP3cpu;
 extern int PMAP1changedcpu;
 #endif
 extern int PMAP1changed;
 extern int PMAP1unchanged;
 static struct mtx PMAP2mutex;
 
 /*
  * Internal flags for pmap_enter()'s helper functions.
  */
 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
 
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, boolean_t try);
 static void	pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 static bool	pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde,
 		    u_int flags);
 #if VM_NRESERVLEVEL > 0
 static void	pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa);
 #endif
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
 static int	pmap_pvh_wired_mappings(struct md_page *pvh, int count);
 
 static boolean_t pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
 static bool	pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m,
 		    vm_prot_t prot);
 static int	pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde,
 		    u_int flags, vm_page_t m);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte);
 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted);
 static void pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va,
 		    pd_entry_t pde);
 static void pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte);
 static boolean_t pmap_is_modified_pvh(struct md_page *pvh);
 static boolean_t pmap_is_referenced_pvh(struct md_page *pvh);
 static void pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode);
 static void pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde);
 static void pmap_pde_attr(pd_entry_t *pde, int cache_bits);
 #if VM_NRESERVLEVEL > 0
 static void pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va);
 #endif
 static boolean_t pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva,
     vm_prot_t prot);
 static void pmap_pte_attr(pt_entry_t *pte, int cache_bits);
 static void pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free);
 static int pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t sva,
     struct spglist *free);
 static vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
 static void pmap_remove_page(struct pmap *pmap, vm_offset_t va,
     struct spglist *free);
 static bool	pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
 		    struct spglist *free);
 static void pmap_remove_entry(struct pmap *pmap, vm_page_t m,
 					vm_offset_t va);
 static void pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m);
 static void pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde,
     pd_entry_t newpde);
 static void pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde);
 
 static vm_page_t pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags);
 
 static vm_page_t _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags);
 static void _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free);
 static pt_entry_t *pmap_pte_quick(pmap_t pmap, vm_offset_t va);
 static void pmap_pte_release(pt_entry_t *pte);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, struct spglist *);
 #ifdef PMAP_PAE_COMP
 static void *pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain,
     uint8_t *flags, int wait);
 #endif
 static void pmap_init_trm(void);
 static void pmap_invalidate_all_int(pmap_t pmap);
 
 static __inline void pagezero(void *page);
 
 CTASSERT(1 << PDESHIFT == sizeof(pd_entry_t));
 CTASSERT(1 << PTESHIFT == sizeof(pt_entry_t));
 
 extern char _end[];
 extern u_long physfree;	/* phys addr of next free page */
 extern u_long vm86phystk;/* PA of vm86/bios stack */
 extern u_long vm86paddr;/* address of vm86 region */
 extern int vm86pa;	/* phys addr of vm86 region */
 extern u_long KERNend;	/* phys addr end of kernel (just after bss) */
 #ifdef PMAP_PAE_COMP
 pd_entry_t *IdlePTD_pae;	/* phys addr of kernel PTD */
 pdpt_entry_t *IdlePDPT;	/* phys addr of kernel PDPT */
 pt_entry_t *KPTmap_pae;	/* address of kernel page tables */
 #define	IdlePTD	IdlePTD_pae
 #define	KPTmap	KPTmap_pae
 #else
 pd_entry_t *IdlePTD_nopae;
 pt_entry_t *KPTmap_nopae;
 #define	IdlePTD	IdlePTD_nopae
 #define	KPTmap	KPTmap_nopae
 #endif
 extern u_long KPTphys;	/* phys addr of kernel page tables */
 extern u_long tramp_idleptd;
 
 /*
  * Take 'cnt' pages from the region tracked by *physfree.
  *
  * *physfree is advanced past the pages, the pages are zeroed, and the
  * address of the first page is returned.
  */
 static u_long
 allocpages(u_int cnt, u_long *physfree)
 {
 	u_long start;
 
 	start = *physfree;
 	*physfree = start + PAGE_SIZE * cnt;
 	bzero((void *)start, PAGE_SIZE * cnt);
 	return (start);
 }
 
 /*
  * Fill 'cnt' consecutive page table entries, starting at the entry for
  * 'va' in the table located at KPTphys, so that they map consecutive
  * pages beginning at 'pa'.  Each entry is written with
  * PG_V | PG_RW | PG_A | PG_M.
  */
 static void
 pmap_cold_map(u_long pa, u_long va, u_long cnt)
 {
 	pt_entry_t *pt;
 
 	pt = (pt_entry_t *)KPTphys + atop(va);
 	while (cnt > 0) {
 		*pt = pa | PG_V | PG_RW | PG_A | PG_M;
 		pt++;
 		pa += PAGE_SIZE;
 		cnt--;
 	}
 }
 
 /*
  * Identity-map 'cnt' pages starting at 'pa': a convenience wrapper that
  * calls pmap_cold_map() with the virtual address equal to the physical
  * address.
  */
 static void
 pmap_cold_mapident(u_long pa, u_long cnt)
 {
 
 	pmap_cold_map(pa, pa, cnt);
 }
 
 _Static_assert(LOWPTDI * 2 * NBPDR == KERNBASE,
     "Broken double-map of zero PTD");
 
 /*
  * Install or remove the low double mapping in IdlePTD.
  *
  * With 'enable' true, each of the first LOWPTDI directory entries is
  * copied from its counterpart LOWPTDI slots higher; with 'enable' false
  * those entries are cleared.  CR3 is then reloaded to invalidate the TLB.
  */
 static void
 __CONCAT(PMTYPE, remap_lower)(bool enable)
 {
 	int idx;
 
 	for (idx = 0; idx < LOWPTDI; idx++) {
 		if (enable)
 			IdlePTD[idx] = IdlePTD[LOWPTDI + idx];
 		else
 			IdlePTD[idx] = 0;
 	}
 
 	/* Reloading CR3 with its current value invalidates the TLB. */
 	load_cr3(rcr3());
 }
 
 /*
  * Called from locore.s before paging is enabled.  Sets up the first
  * kernel page table.  Since kernel is mapped with PA == VA, this code
  * does not require relocations.
  */
 void
 __CONCAT(PMTYPE, cold)(void)
 {
 	pt_entry_t *pt;
 	u_long a;
 	u_int cr3, ncr4;
 
 	/*
 	 * Choose the first free physical address: the address of _end by
 	 * default, overridden by bootinfo.bi_esymtab and then by
 	 * bootinfo.bi_kernend when those fields are non-zero.  The result
 	 * is rounded up to an NBPDR boundary and recorded in KERNend.
 	 */
 	physfree = (u_long)&_end;
 	if (bootinfo.bi_esymtab != 0)
 		physfree = bootinfo.bi_esymtab;
 	if (bootinfo.bi_kernend != 0)
 		physfree = bootinfo.bi_kernend;
 	physfree = roundup2(physfree, NBPDR);
 	KERNend = physfree;
 
 	/* Allocate Kernel Page Tables */
 	KPTphys = allocpages(NKPT, &physfree);
 	KPTmap = (pt_entry_t *)KPTphys;
 
 	/* Allocate Page Table Directory */
 #ifdef PMAP_PAE_COMP
 	/* XXX only need 32 bytes (easier for now) */
 	IdlePDPT = (pdpt_entry_t *)allocpages(1, &physfree);
 #endif
 	IdlePTD = (pd_entry_t *)allocpages(NPGPTD, &physfree);
 
 	/*
 	 * Allocate KSTACK.  Leave a guard page between IdlePTD and
 	 * proc0kstack, to control stack overflow for thread0 and
 	 * prevent corruption of the page table.  We leak the guard
 	 * physical memory due to 1:1 mappings.
 	 */
 	allocpages(1, &physfree);
 	proc0kstack = allocpages(TD0_KSTACK_PAGES, &physfree);
 
 	/* vm86/bios stack */
 	vm86phystk = allocpages(1, &physfree);
 
 	/* pgtable + ext + IOPAGES */
 	vm86paddr = vm86pa = allocpages(3, &physfree);
 
 	/* Install page tables into PTD.  Page table page 1 is wasted. */
 	for (a = 0; a < NKPT; a++)
 		IdlePTD[a] = (KPTphys + ptoa(a)) | PG_V | PG_RW | PG_A | PG_M;
 
 #ifdef PMAP_PAE_COMP
 	/* PAE install PTD pointers into PDPT */
 	for (a = 0; a < NPGPTD; a++)
 		IdlePDPT[a] = ((u_int)IdlePTD + ptoa(a)) | PG_V;
 #endif
 
 	/*
 	 * Install recursive mapping for kernel page tables into
 	 * itself.
 	 */
 	for (a = 0; a < NPGPTD; a++)
 		IdlePTD[PTDPTDI + a] = ((u_int)IdlePTD + ptoa(a)) | PG_V |
 		    PG_RW;
 
 	/*
 	 * Initialize page table pages mapping physical address zero
 	 * through the (physical) end of the kernel.  Many of these
 	 * pages must be reserved, and we reserve them all and map
 	 * them linearly for convenience.  We do this even if PSE is
 	 * enabled below; in that case the corresponding kernel PDEs
 	 * are simply switched to superpage mappings before paging is
 	 * turned on.
 	 *
 	 * This and all other page table entries allow read and write
 	 * access for various reasons.  Kernel mappings never have any
 	 * access restrictions.
 	 */
 	pmap_cold_mapident(0, atop(NBPDR) * LOWPTDI);
 	pmap_cold_map(0, NBPDR * LOWPTDI, atop(NBPDR) * LOWPTDI);
 	pmap_cold_mapident(KERNBASE, atop(KERNend - KERNBASE));
 
 	/* Map page table directory */
 #ifdef PMAP_PAE_COMP
 	pmap_cold_mapident((u_long)IdlePDPT, 1);
 #endif
 	pmap_cold_mapident((u_long)IdlePTD, NPGPTD);
 
 	/* Map early KPTmap.  It is really pmap_cold_mapident. */
 	pmap_cold_map(KPTphys, (u_long)KPTmap, NKPT);
 
 	/* Map proc0kstack */
 	pmap_cold_mapident(proc0kstack, TD0_KSTACK_PAGES);
 	/* ISA hole already mapped */
 
 	pmap_cold_mapident(vm86phystk, 1);
 	pmap_cold_mapident(vm86pa, 3);
 
 	/* Map page 0 into the vm86 page table */
 	*(pt_entry_t *)vm86pa = 0 | PG_RW | PG_U | PG_A | PG_M | PG_V;
 
 	/* ...likewise for the ISA hole for vm86 */
 	for (pt = (pt_entry_t *)vm86pa + atop(ISA_HOLE_START), a = 0;
 	    a < atop(ISA_HOLE_LENGTH); a++, pt++)
 		*pt = (ISA_HOLE_START + ptoa(a)) | PG_RW | PG_U | PG_A |
 		    PG_M | PG_V;
 
 	/* Enable PSE, PGE, VME, and PAE if configured. */
 	ncr4 = 0;
 	if ((cpu_feature & CPUID_PSE) != 0) {
 		ncr4 |= CR4_PSE;
 		pseflag = PG_PS;
 		/*
 		 * Superpage mapping of the kernel text.  Existing 4k
 		 * page table pages are wasted.
 		 */
 		for (a = KERNBASE; a < KERNend; a += NBPDR)
 			IdlePTD[a >> PDRSHIFT] = a | PG_PS | PG_A | PG_M |
 			    PG_RW | PG_V;
 	}
 	if ((cpu_feature & CPUID_PGE) != 0) {
 		ncr4 |= CR4_PGE;
 		pgeflag = PG_G;
 	}
 	ncr4 |= (cpu_feature & CPUID_VME) != 0 ? CR4_VME : 0;
 #ifdef PMAP_PAE_COMP
 	ncr4 |= CR4_PAE;
 #endif
 	/* Skip the CR4 write when no feature bit was selected. */
 	if (ncr4 != 0)
 		load_cr4(rcr4() | ncr4);
 
 	/* Now enable paging */
 #ifdef PMAP_PAE_COMP
 	cr3 = (u_int)IdlePDPT;
 	if ((cpu_feature & CPUID_PAT) == 0)
 		wbinvd();
 #else
 	cr3 = (u_int)IdlePTD;
 #endif
 	/* Keep a copy of the page table root in tramp_idleptd. */
 	tramp_idleptd = cr3;
 	load_cr3(cr3);
 	load_cr0(rcr0() | CR0_PG);
 
 	/*
 	 * Now running relocated at KERNBASE where the system is
 	 * linked to run.
 	 */
 
 	/*
 	 * Remove the lowest part of the double mapping of low memory
 	 * to get some null pointer checks.
 	 */
 	__CONCAT(PMTYPE, remap_lower)(false);
 
 	kernel_vm_end = /* 0 + */ NKPT * NBPDR;
 	/*
 	 * Publish the constants that differ between the PAE and non-PAE
 	 * builds of this file through the i386_pmap_* variables.
 	 */
 #ifdef PMAP_PAE_COMP
 	i386_pmap_VM_NFREEORDER = VM_NFREEORDER_PAE;
 	i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_PAE;
 	i386_pmap_PDRSHIFT = PDRSHIFT_PAE;
 #else
 	i386_pmap_VM_NFREEORDER = VM_NFREEORDER_NOPAE;
 	i386_pmap_VM_LEVEL_0_ORDER = VM_LEVEL_0_ORDER_NOPAE;
 	i386_pmap_PDRSHIFT = PDRSHIFT_NOPAE;
 #endif
 }
 
 /*
  * Turn on use of the no-execute page bit.
  *
  * Only has an effect when built with PMAP_PAE_COMP and the CPU reports
  * AMDID_NX: pg_nx becomes PG_NX and elf32_nxstack is set to 1.  In every
  * other case this is a no-op.
  */
 static void
 __CONCAT(PMTYPE, set_nx)(void)
 {
 #ifdef PMAP_PAE_COMP
 	if ((amd_feature & AMDID_NX) != 0) {
 		pg_nx = PG_NX;
 		elf32_nxstack = 1;
 		/* EFER.EFER_NXE is set in initializecpu(). */
 	}
 #endif
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  *
  *	On the i386 this is called after pmap_cold() created initial
  *	kernel page table and enabled paging, and just syncs the pmap
  *	module with what has already been done.
  */
 static void
 __CONCAT(PMTYPE, bootstrap)(vm_paddr_t firstaddr)
 {
 	vm_offset_t va;
 	pt_entry_t *pte, *unused;
 	struct pcpu *pc;
 	u_long res;
 	int i;
 
 	/*
 	 * Number of pages between KERNLOAD and firstaddr; used below as
 	 * the kernel pmap's initial resident count.
 	 */
 	res = atop(firstaddr - (vm_paddr_t)KERNLOAD);
 
 	/*
 	 * Add a physical memory segment (vm_phys_seg) corresponding to the
 	 * preallocated kernel page table pages so that vm_page structures
 	 * representing these pages will be created.  The vm_page structures
 	 * are required for promotion of the corresponding kernel virtual
 	 * addresses to superpage mappings.
 	 */
 	vm_phys_add_seg(KPTphys, KPTphys + ptoa(nkpt));
 
 	/*
 	 * Initialize the first available kernel virtual address.
 	 * However, using "firstaddr" may waste a few pages of the
 	 * kernel virtual address space, because pmap_cold() may not
 	 * have mapped every physical page that it allocated.
 	 * Preferably, pmap_cold() would provide a first unused
 	 * virtual address in addition to "firstaddr".
 	 */
 	virtual_avail = (vm_offset_t)firstaddr;
 	virtual_end = VM_MAX_KERNEL_ADDRESS;
 
 	/*
 	 * Initialize the kernel pmap (which is statically allocated).
 	 * Count bootstrap data as being resident in case any of this data is
 	 * later unmapped (using pmap_remove()) and freed.
 	 */
 	PMAP_LOCK_INIT(kernel_pmap);
 	kernel_pmap->pm_pdir = IdlePTD;
 #ifdef PMAP_PAE_COMP
 	kernel_pmap->pm_pdpt = IdlePDPT;
 #endif
 	CPU_FILL(&kernel_pmap->pm_active);	/* don't allow deactivation */
 	kernel_pmap->pm_stats.resident_count = res;
 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
 
  	/*
 	 * Initialize the global pv list lock.
 	 */
 	rw_init(&pvh_global_lock, "pmap pv global");
 
 	/*
 	 * Reserve some special page table entries/VA space for temporary
 	 * mapping of pages.
 	 *
 	 * SYSMAP(c, p, v, n) stores the current VA (cast to type c) in v
 	 * and the current PTE pointer in p, then advances both the VA and
 	 * the PTE pointer by n pages.  Invocations are therefore
 	 * order-dependent: each one carves the next n pages out of the
 	 * range that starts at virtual_avail.
 	 */
 #define	SYSMAP(c, p, v, n)	\
 	v = (c)va; va += ((n)*PAGE_SIZE); p = pte; pte += (n);
 
 	va = virtual_avail;
 	pte = vtopte(va);
 
 
 	/*
 	 * Initialize temporary map objects on the current CPU for use
 	 * during early boot.
 	 * CMAP1/CMAP2 are used for zeroing and copying pages.
 	 * CMAP3 is used for the boot-time memory test.
 	 */
 	pc = get_pcpu();
 	mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
 	SYSMAP(caddr_t, pc->pc_cmap_pte1, pc->pc_cmap_addr1, 1)
 	SYSMAP(caddr_t, pc->pc_cmap_pte2, pc->pc_cmap_addr2, 1)
 	/*
 	 * Passing "pte" itself as the PTE destination makes that
 	 * assignment a self-assignment: only the VA is recorded here.
 	 */
 	SYSMAP(vm_offset_t, pte, pc->pc_qmap_addr, 1)
 
 	SYSMAP(caddr_t, CMAP3, CADDR3, 1);
 
 	/*
 	 * Crashdump maps.
 	 */
 	/* The PTE pointer is not kept for the next three; it lands in "unused". */
 	SYSMAP(caddr_t, unused, crashdumpmap, MAXDUMPPGS)
 
 	/*
 	 * ptvmmap is used for reading arbitrary physical pages via /dev/mem.
 	 */
 	SYSMAP(caddr_t, unused, ptvmmap, 1)
 
 	/*
 	 * msgbufp is used to map the system message buffer.
 	 */
 	SYSMAP(struct msgbuf *, unused, msgbufp, atop(round_page(msgbufsize)))
 
 	/*
 	 * KPTmap is used by pmap_kextract().
 	 *
 	 * KPTmap is first initialized by pmap_cold().  However, that initial
 	 * KPTmap can only support NKPT page table pages.  Here, a larger
 	 * KPTmap is created that can support KVA_PAGES page table pages.
 	 */
 	SYSMAP(pt_entry_t *, KPTD, KPTmap, KVA_PAGES)
 
 	/*
 	 * Point the first NKPT entries reserved above (KPTD) at the page
 	 * table pages allocated at KPTphys.
 	 */
 	for (i = 0; i < NKPT; i++)
 		KPTD[i] = (KPTphys + ptoa(i)) | PG_RW | PG_V;
 
 	/*
 	 * PADDR1 and PADDR2 are used by pmap_pte_quick() and pmap_pte(),
 	 * respectively.
 	 */
 	SYSMAP(pt_entry_t *, PMAP1, PADDR1, 1)
 	SYSMAP(pt_entry_t *, PMAP2, PADDR2, 1)
 	SYSMAP(pt_entry_t *, PMAP3, PADDR3, 1)
 
 	mtx_init(&PMAP2mutex, "PMAP2", NULL, MTX_DEF);
 
 	/* Everything below "va" is now reserved. */
 	virtual_avail = va;
 
 	/*
 	 * Initialize the PAT MSR if present.
 	 * pmap_init_pat() clears and sets CR4_PGE, which, as a
 	 * side-effect, invalidates stale PG_G TLB entries that might
 	 * have been created in our pre-boot environment.  We assume
 	 * that PAT support implies PGE and in reverse, PGE presence
 	 * comes with PAT.  Both features were added for Pentium Pro.
 	 */
 	pmap_init_pat();
 }
 
 /*
  * Reserve the per-CPU kernel virtual address ranges and locks used by
  * this pmap implementation.
  *
  * Returns immediately unless pae_mode matches the flavor this file was
  * built as (PMAP_PAE_COMP or not).  For every CPU it sets up the
  * non-sleepable and sleepable copyout windows, the
  * pmap_extract_and_hold window and, where not already present, the
  * CMAP/qmap pages.  Any KVA allocation failure panics.
  */
 static void
 pmap_init_reserved_pages(void)
 {
 	struct pcpu *pc;
 	vm_offset_t kva;
 	int cpu;
 
 #ifdef PMAP_PAE_COMP
 	if (!pae_mode)
 		return;
 #else
 	if (pae_mode)
 		return;
 #endif
 	CPU_FOREACH(cpu) {
 		pc = pcpu_find(cpu);
 
 		/* Non-sleepable copyout window: a mutex and two pages. */
 		mtx_init(&pc->pc_copyout_mlock, "cpmlk", NULL,
 		    MTX_DEF | MTX_NEW);
 		pc->pc_copyout_maddr = kva_alloc(ptoa(2));
 		if (pc->pc_copyout_maddr == 0)
 			panic("unable to allocate non-sleepable copyout KVA");
 
 		/* Sleepable copyout window: an sx lock and two pages. */
 		sx_init(&pc->pc_copyout_slock, "cpslk");
 		pc->pc_copyout_saddr = kva_alloc(ptoa(2));
 		if (pc->pc_copyout_saddr == 0)
 			panic("unable to allocate sleepable copyout KVA");
 
 		/* One page, plus a pointer to its PTE. */
 		pc->pc_pmap_eh_va = kva_alloc(ptoa(1));
 		if (pc->pc_pmap_eh_va == 0)
 			panic("unable to allocate pmap_extract_and_hold KVA");
 		pc->pc_pmap_eh_ptep = (char *)vtopte(pc->pc_pmap_eh_va);
 
 		/*
 		 * The CMAP/qmap pages are set up only for CPUs that do
 		 * not have them yet; a CPU whose mappings have already
 		 * been initialized (i.e. the BSP) is left alone.
 		 */
 		if (pc->pc_cmap_addr1 == 0) {
 			mtx_init(&pc->pc_cmap_lock, "SYSMAPS", NULL, MTX_DEF);
 			kva = kva_alloc(PAGE_SIZE * 3);
 			if (kva == 0)
 				panic("unable to allocate CMAP KVA");
 			pc->pc_cmap_pte1 = vtopte(kva);
 			pc->pc_cmap_pte2 = vtopte(kva + PAGE_SIZE);
 			pc->pc_cmap_addr1 = (caddr_t)kva;
 			pc->pc_cmap_addr2 = (caddr_t)(kva + PAGE_SIZE);
 			pc->pc_qmap_addr = kva + ptoa(2);
 		}
 	}
 }
  
 SYSINIT(rpages_init, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_reserved_pages, NULL);
 
 /*
  * Setup the PAT MSR.
  */
 static void
 __CONCAT(PMTYPE, init_pat)(void)
 {
 	int pat_table[PAT_INDEX_SIZE];
 	uint64_t pat_msr;
 	u_long cr0, cr4;
 	int i;
 
 	/*
 	 * Set default PAT index table.  Every cache mode starts at -1;
 	 * the modes listed below then get an index in the range 0-3.
 	 */
 	for (i = 0; i < PAT_INDEX_SIZE; i++)
 		pat_table[i] = -1;
 	pat_table[PAT_WRITE_BACK] = 0;
 	pat_table[PAT_WRITE_THROUGH] = 1;
 	pat_table[PAT_UNCACHEABLE] = 3;
 	pat_table[PAT_WRITE_COMBINING] = 3;
 	pat_table[PAT_WRITE_PROTECTED] = 3;
 	pat_table[PAT_UNCACHED] = 3;
 
 	/*
 	 * Bail if this CPU doesn't implement PAT.
 	 * We assume that PAT support implies PGE.
 	 * The default table is still published in pat_index, and
 	 * pat_works is cleared, before returning.
 	 */
 	if ((cpu_feature & CPUID_PAT) == 0) {
 		for (i = 0; i < PAT_INDEX_SIZE; i++)
 			pat_index[i] = pat_table[i];
 		pat_works = 0;
 		return;
 	}
 
 	/*
 	 * Due to some Intel errata, we can only safely use the lower 4
 	 * PAT entries.
 	 *
 	 *   Intel Pentium III Processor Specification Update
 	 * Errata E.27 (Upper Four PAT Entries Not Usable With Mode B
 	 * or Mode C Paging)
 	 *
 	 *   Intel Pentium IV  Processor Specification Update
 	 * Errata N46 (PAT Index MSB May Be Calculated Incorrectly)
 	 *
 	 * The check below clears pat_works on every Intel CPU except
 	 * family 6 with model 0xe or later.
 	 */
 	if (cpu_vendor_id == CPU_VENDOR_INTEL &&
 	    !(CPUID_TO_FAMILY(cpu_id) == 6 && CPUID_TO_MODEL(cpu_id) >= 0xe))
 		pat_works = 0;
 
 	/* Initialize default PAT entries. */
 	pat_msr = PAT_VALUE(0, PAT_WRITE_BACK) |
 	    PAT_VALUE(1, PAT_WRITE_THROUGH) |
 	    PAT_VALUE(2, PAT_UNCACHED) |
 	    PAT_VALUE(3, PAT_UNCACHEABLE) |
 	    PAT_VALUE(4, PAT_WRITE_BACK) |
 	    PAT_VALUE(5, PAT_WRITE_THROUGH) |
 	    PAT_VALUE(6, PAT_UNCACHED) |
 	    PAT_VALUE(7, PAT_UNCACHEABLE);
 
 	if (pat_works) {
 		/*
 		 * Leave the indices 0-3 at the default of WB, WT, UC-, and UC.
 		 * Program 5 and 6 as WP and WC.
 		 * Leave 4 and 7 as WB and UC.
 		 */
 		pat_msr &= ~(PAT_MASK(5) | PAT_MASK(6));
 		pat_msr |= PAT_VALUE(5, PAT_WRITE_PROTECTED) |
 		    PAT_VALUE(6, PAT_WRITE_COMBINING);
 		pat_table[PAT_UNCACHED] = 2;
 		pat_table[PAT_WRITE_PROTECTED] = 5;
 		pat_table[PAT_WRITE_COMBINING] = 6;
 	} else {
 		/*
 		 * Just replace PAT Index 2 with WC instead of UC-.
 		 */
 		pat_msr &= ~PAT_MASK(2);
 		pat_msr |= PAT_VALUE(2, PAT_WRITE_COMBINING);
 		pat_table[PAT_WRITE_COMBINING] = 2;
 	}
 
 	/* Disable PGE.  The original CR4 value is kept for the restore. */
 	cr4 = rcr4();
 	load_cr4(cr4 & ~CR4_PGE);
 
 	/* Disable caches (CD = 1, NW = 0). */
 	cr0 = rcr0();
 	load_cr0((cr0 & ~CR0_NW) | CR0_CD);
 
 	/* Flush caches and TLBs. */
 	wbinvd();
 	invltlb();
 
 	/* Update PAT and index table. */
 	wrmsr(MSR_PAT, pat_msr);
 	for (i = 0; i < PAT_INDEX_SIZE; i++)
 		pat_index[i] = pat_table[i];
 
 	/* Flush caches and TLBs again. */
 	wbinvd();
 	invltlb();
 
 	/* Restore caches and PGE. */
 	load_cr0(cr0);
 	load_cr4(cr4);
 }
 
 #ifdef PMAP_PAE_COMP
 /*
  * UMA backend allocator for the PDPT zone.
  *
  * Obtains 'bytes' of contiguous memory from the given domain with
  * kmem_alloc_contig_domainset(), restricted to the address range
  * 0 - 0xffffffff, and reports UMA_SLAB_KERNEL through '*flags'.
  * 'zone' is not used.
  */
 static void *
 pmap_pdpt_allocf(uma_zone_t zone, vm_size_t bytes, int domain, uint8_t *flags,
     int wait)
 {
 	void *mem;
 
 	/* Inform UMA that this allocator uses kernel_map/object. */
 	*flags = UMA_SLAB_KERNEL;
 
 	mem = (void *)kmem_alloc_contig_domainset(DOMAINSET_FIXED(domain),
 	    bytes, wait, 0x0ULL, 0xffffffffULL, 1, 0, VM_MEMATTR_DEFAULT);
 	return (mem);
 }
 #endif
 
 /*
  * Abuse the pte nodes for unmapped kva to thread a kva freelist through.
  * Requirements:
  *  - Must deal with pages in order to ensure that none of the PG_* bits
  *    are ever set, PG_V in particular.
  *  - Assumes we can write to ptes without pte_store() atomic ops, even
  *    on PAE systems.  This should be ok.
  *  - Assumes nothing will ever test these addresses for 0 to indicate
  *    no mapping instead of correctly checking PG_V.
  *  - Assumes a vm_offset_t will fit in a pte (true for i386).
  * Because PG_V is never set, there can be no mappings to invalidate.
  */
 /*
  * Pop one virtual address off the PTE-threaded freelist rooted at *head.
  *
  * The PTE of the popped address holds the next list element; it becomes
  * the new head and the PTE is then cleared.  Panics if the list is empty
  * or if the new head has PG_V set.
  */
 static vm_offset_t
 pmap_ptelist_alloc(vm_offset_t *head)
 {
 	pt_entry_t *link;
 	vm_offset_t first;
 
 	first = *head;
 	if (first == 0)
 		panic("pmap_ptelist_alloc: exhausted ptelist KVA");
 
 	/* Unlink: the next element is stored in the first one's PTE. */
 	link = vtopte(first);
 	*head = *link;
 	if ((*head & PG_V) != 0)
 		panic("pmap_ptelist_alloc: va with PG_V set!");
 	*link = 0;
 
 	return (first);
 }
 
 /*
  * Push virtual address 'va' onto the PTE-threaded freelist rooted at
  * *head: the old head is stored in va's PTE and va becomes the new head.
  * Panics if 'va' has the PG_V bit set.
  */
 static void
 pmap_ptelist_free(vm_offset_t *head, vm_offset_t va)
 {
 
 	if ((va & PG_V) != 0)
 		panic("pmap_ptelist_free: freeing va with PG_V set!");
 
 	/* The stored value is a virtual address; PG_V is 0 though. */
 	*vtopte(va) = *head;
 	*head = va;
 }
 
 /*
  * Build a freelist of 'npages' pages starting at 'base'.
  *
  * The list is emptied first; pages are then pushed from the last to the
  * first, so the list ends up headed by 'base' itself.
  */
 static void
 pmap_ptelist_init(vm_offset_t *head, void *base, int npages)
 {
 	vm_offset_t page;
 	int left;
 
 	*head = 0;
 	for (left = npages; left > 0; left--) {
 		page = (vm_offset_t)base + (left - 1) * PAGE_SIZE;
 		pmap_ptelist_free(head, page);
 	}
 }
 
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
  *	system needs to map virtual memory.
  *
  *	In order, this routine: fills in the vm_page entries that back the
  *	NKPT kernel page table pages, sizes the pv entry pool, decides
  *	whether the Erratum 383 workaround and superpage mappings are
  *	enabled, allocates the superpage pv head table and the KVA for pv
  *	chunks, sets pmap_initialized, and (when bootverbose) prints the
  *	pre-init mappings.
  */
 static void
 __CONCAT(PMTYPE, init)(void)
 {
 	struct pmap_preinit_mapping *ppim;
 	vm_page_t mpte;
 	vm_size_t s;
 	int i, pv_npg;
 
 	/*
 	 * Initialize the vm page array entries for the kernel pmap's
 	 * page table pages.
 	 */
 	PMAP_LOCK(kernel_pmap);
 	for (i = 0; i < NKPT; i++) {
 		mpte = PHYS_TO_VM_PAGE(KPTphys + ptoa(i));
 		KASSERT(mpte >= vm_page_array &&
 		    mpte < &vm_page_array[vm_page_array_size],
 		    ("pmap_init: page table page is out of range"));
 		mpte->pindex = i + KPTDI;
 		mpte->phys_addr = KPTphys + ptoa(i);
 		mpte->wire_count = 1;
 
 		/*
 		 * Collect the page table pages that were replaced by a 2/4MB
 		 * page.  They are filled with equivalent 4KB page mappings.
 		 */
 		if (pseflag != 0 &&
 		    KERNBASE <= i << PDRSHIFT && i << PDRSHIFT < KERNend &&
 		    pmap_insert_pt_page(kernel_pmap, mpte, true))
 			panic("pmap_init: pmap_insert_pt_page failed");
 	}
 	PMAP_UNLOCK(kernel_pmap);
 	/* One wiring was recorded per page table page in the loop above. */
 	vm_wire_add(NKPT);
 
 	/*
 	 * Initialize the address space (zone) for the pv entries.  Set a
 	 * high water mark so that the system can recover from excessive
 	 * numbers of pv entries.
 	 */
 	TUNABLE_INT_FETCH("vm.pmap.shpgperproc", &shpgperproc);
 	pv_entry_max = shpgperproc * maxproc + vm_cnt.v_page_count;
 	TUNABLE_INT_FETCH("vm.pmap.pv_entries", &pv_entry_max);
 	/* Round up to a whole number of pv chunks; high water is ~90%. */
 	pv_entry_max = roundup(pv_entry_max, _NPCPV);
 	pv_entry_high_water = 9 * (pv_entry_max / 10);
 
 	/*
 	 * If the kernel is running on a virtual machine, then it must assume
 	 * that MCA is enabled by the hypervisor.  Moreover, the kernel must
 	 * be prepared for the hypervisor changing the vendor and family that
 	 * are reported by CPUID.  Consequently, the workaround for AMD Family
 	 * 10h Erratum 383 is enabled if the processor's feature set does not
 	 * include at least one feature that is only supported by older Intel
 	 * or newer AMD processors.
 	 */
 	if (vm_guest != VM_GUEST_NO && (cpu_feature & CPUID_SS) == 0 &&
 	    (cpu_feature2 & (CPUID2_SSSE3 | CPUID2_SSE41 | CPUID2_AESNI |
 	    CPUID2_AVX | CPUID2_XSAVE)) == 0 && (amd_feature2 & (AMDID2_XOP |
 	    AMDID2_FMA4)) == 0)
 		workaround_erratum383 = 1;
 
 	/*
 	 * Are large page mappings supported and enabled?
 	 */
 	TUNABLE_INT_FETCH("vm.pmap.pg_ps_enabled", &pg_ps_enabled);
 	if (pseflag == 0)
 		pg_ps_enabled = 0;
 	else if (pg_ps_enabled) {
 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
 		    ("pmap_init: can't assign to pagesizes[1]"));
 		pagesizes[1] = NBPDR;
 	}
 
 	/*
 	 * Calculate the size of the pv head table for superpages.
 	 * Handle the possibility that "vm_phys_segs[...].end" is zero.
 	 */
 	pv_npg = trunc_4mpage(vm_phys_segs[vm_phys_nsegs - 1].end -
 	    PAGE_SIZE) / NBPDR + 1;
 
 	/*
 	 * Allocate memory for the pv head table for superpages.
 	 */
 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
 	s = round_page(s);
 	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
 	for (i = 0; i < pv_npg; i++)
 		TAILQ_INIT(&pv_table[i].pv_list);
 
 	/*
 	 * Reserve one page of KVA per possible pv chunk and thread those
 	 * pages onto the pv_vafree list.
 	 */
 	pv_maxchunks = MAX(pv_entry_max / _NPCPV, maxproc);
 	pv_chunkbase = (struct pv_chunk *)kva_alloc(PAGE_SIZE * pv_maxchunks);
 	if (pv_chunkbase == NULL)
 		panic("pmap_init: not enough kvm for pv chunks");
 	pmap_ptelist_init(&pv_vafree, pv_chunkbase, pv_maxchunks);
 #ifdef PMAP_PAE_COMP
 	/*
 	 * The alignment argument equals the object size minus one, which
 	 * matches the alignment asserted on pm_pdpt in pmap_pinit().
 	 */
 	pdptzone = uma_zcreate("PDPT", NPGPTD * sizeof(pdpt_entry_t), NULL,
 	    NULL, NULL, NULL, (NPGPTD * sizeof(pdpt_entry_t)) - 1,
 	    UMA_ZONE_VM | UMA_ZONE_NOFREE);
 	uma_zone_set_allocf(pdptzone, pmap_pdpt_allocf);
 #endif
 
 	pmap_initialized = 1;
 	pmap_init_trm();
 
 	/* The remainder only reports the pre-init mappings when verbose. */
 	if (!bootverbose)
 		return;
 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
 		ppim = pmap_preinit_mapping + i;
 		if (ppim->va == 0)
 			continue;
 		printf("PPIM %u: PA=%#jx, VA=%#x, size=%#x, mode=%#x\n", i,
 		    (uintmax_t)ppim->pa, ppim->va, ppim->sz, ppim->mode);
 	}
 
 }
 
 extern u_long pmap_pde_demotions;
 extern u_long pmap_pde_mappings;
 extern u_long pmap_pde_p_failures;
 extern u_long pmap_pde_promotions;
 
 /***************************************************
  * Low level helper routines.....
  ***************************************************/
 
 /*
  * Report whether "mode" is a usable memory attribute: it must index into
  * pat_index[] and the entry found there must be non-negative.
  */
 static boolean_t
 __CONCAT(PMTYPE, is_valid_memattr)(pmap_t pmap __unused, vm_memattr_t mode)
 {
 
 	if (mode < 0 || mode >= PAT_INDEX_SIZE)
 		return (FALSE);
 	return (pat_index[(int)mode] >= 0);
 }
 
 /*
  * Translate a caching mode into the PAT, PCD, and PWT bits that must be
  * set in a PTE ("is_pde" false) or a PDE ("is_pde" true).  Panics when
  * pmap_is_valid_memattr() rejects the mode.
  */
 static int
 __CONCAT(PMTYPE, cache_bits)(pmap_t pmap, int mode, boolean_t is_pde)
 {
 	int idx, pat_bit, result;
 
 	if (!pmap_is_valid_memattr(pmap, mode))
 		panic("Unknown caching mode %d\n", mode);
 
 	/* Look up the PAT index assigned to this caching mode. */
 	idx = pat_index[mode];
 
 	/* PTEs and PDEs use different bits for PAT. */
 	if (is_pde)
 		pat_bit = PG_PDE_PAT;
 	else
 		pat_bit = PG_PTE_PAT;
 
 	/* Spread the 3-bit index over the PAT, PCD, and PWT bits. */
 	result = ((idx & 0x4) != 0 ? pat_bit : 0) |
 	    ((idx & 0x2) != 0 ? PG_NC_PCD : 0) |
 	    ((idx & 0x1) != 0 ? PG_NC_PWT : 0);
 	return (result);
 }
 
 /*
  * Report whether superpage mappings are enabled (pg_ps_enabled non-zero).
  */
 static bool
 __CONCAT(PMTYPE, ps_enabled)(pmap_t pmap __unused)
 {
 
 	return (pg_ps_enabled != 0);
 }
 
 /*
  * Store "newpde" into the kernel pmap's page directory entry for "va".
  * No TLB invalidation is performed here; that is the caller's job.
  */
 static void
 pmap_kenter_pde(vm_offset_t va, pd_entry_t newpde)
 {
 
 	pde_store(pmap_pde(kernel_pmap, va), newpde);
 }
 
 /*
  * Flush the calling processor's TLB after the page size of the mapping
  * at "va" has been changed in the page table.  Other processors are not
  * touched.
  *
  * The calling thread must be pinned to a processor.
  */
 static void
 pmap_update_pde_invalidate(vm_offset_t va, pd_entry_t newpde)
 {
 
 	if ((newpde & PG_PS) != 0) {
 		/*
 		 * Promotion: the old 4KB page mappings are too numerous
 		 * to flush one by one, so flush the whole TLB.
 		 */
 		invltlb();
 	} else {
 		/* Demotion: only the single large page mapping is stale. */
 		invlpg(va);
 	}
 }
 
 #ifdef SMP
 /*
  * For SMP, these functions have to use the IPI mechanism for coherence.
  *
  * N.B.: Before calling any of the following TLB invalidation functions,
  * the calling processor must ensure that all stores updating a non-
  * kernel page table are globally performed.  Otherwise, another
  * processor could cache an old, pre-update entry without being
  * invalidated.  This can happen one of two ways: (1) The pmap becomes
  * active on another processor after its pm_active field is checked by
  * one of the following functions but before a store updating the page
  * table is globally performed. (2) The pmap becomes active on another
  * processor before its pm_active field is checked but due to
  * speculative loads one of the following functions still reads the
  * pmap as inactive on the other processor.
  * 
  * The kernel page table is exempt because its pm_active field is
  * immutable.  The kernel page table is always active on every
  * processor.
  */
 /*
  * SMP version: invalidate the TLB entry for "va" belonging to "pmap".
  * For the kernel pmap the local entry is flushed directly with invlpg().
  * In every case smp_masked_invlpg() is then called with a CPU mask:
  * all_cpus for the kernel pmap or when CPU_CMP() reports that pm_active
  * matches all_cpus, otherwise the CPUs in pm_active other than this one.
  */
 static void
 pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va)
 {
 	cpuset_t *mask, other_cpus;
 	u_int cpuid;
 
 	/* Stay on one CPU while the per-CPU id and mask are in use. */
 	sched_pin();
 	if (pmap == kernel_pmap) {
 		invlpg(va);
 		mask = &all_cpus;
 	} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
 		mask = &all_cpus;
 	} else {
 		/* mask = (all_cpus - self) & pm_active */
 		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		mask = &other_cpus;
 	}
 	smp_masked_invlpg(*mask, va, pmap);
 	sched_unpin();
 }
 
 /* 4k PTEs -- Chosen to exceed the total size of Broadwell L2 TLB */
 #define	PMAP_INVLPG_THRESHOLD	(4 * 1024 * PAGE_SIZE)
 
 /*
  * SMP version: invalidate the TLB entries for [sva, eva) belonging to
  * "pmap".  Ranges of PMAP_INVLPG_THRESHOLD bytes or more are handled by
  * pmap_invalidate_all_int() instead of page by page.  Otherwise the
  * kernel pmap's local entries are flushed with invlpg() and
  * smp_masked_invlpg_range() is called with the same CPU mask selection
  * used by pmap_invalidate_page_int().
  */
 static void
 pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	cpuset_t *mask, other_cpus;
 	vm_offset_t addr;
 	u_int cpuid;
 
 	if (eva - sva >= PMAP_INVLPG_THRESHOLD) {
 		pmap_invalidate_all_int(pmap);
 		return;
 	}
 
 	sched_pin();
 	if (pmap == kernel_pmap) {
 		for (addr = sva; addr < eva; addr += PAGE_SIZE)
 			invlpg(addr);
 		mask = &all_cpus;
 	} else  if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
 		mask = &all_cpus;
 	} else {
 		/* mask = (all_cpus - self) & pm_active */
 		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		mask = &other_cpus;
 	}
 	smp_masked_invlpg_range(*mask, sva, eva, pmap);
 	sched_unpin();
 }
 
 /*
  * SMP version: invalidate all TLB entries belonging to "pmap".  For the
  * kernel pmap the local TLB is flushed with invltlb().  In every case
  * smp_masked_invltlb() is then called with the same CPU mask selection
  * used by pmap_invalidate_page_int().
  */
 static void
 pmap_invalidate_all_int(pmap_t pmap)
 {
 	cpuset_t *mask, other_cpus;
 	u_int cpuid;
 
 	sched_pin();
 	if (pmap == kernel_pmap) {
 		invltlb();
 		mask = &all_cpus;
 	} else if (!CPU_CMP(&pmap->pm_active, &all_cpus)) {
 		mask = &all_cpus;
 	} else {
 		/* mask = (all_cpus - self) & pm_active */
 		cpuid = PCPU_GET(cpuid);
 		other_cpus = all_cpus;
 		CPU_CLR(cpuid, &other_cpus);
 		CPU_AND(&other_cpus, &pmap->pm_active);
 		mask = &other_cpus;
 	}
 	smp_masked_invltlb(*mask, pmap);
 	sched_unpin();
 }
 
 /*
  * SMP version: write back and invalidate the local cache with wbinvd(),
  * then call smp_cache_flush() (presumably to do the same on the other
  * processors -- confirm against its definition).  The thread is pinned
  * for the duration.
  */
 static void
 __CONCAT(PMTYPE, invalidate_cache)(void)
 {
 
 	sched_pin();
 	wbinvd();
 	smp_cache_flush();
 	sched_unpin();
 }
 
 /*
  * Argument block handed by pmap_update_pde() to its smp_rendezvous_cpus()
  * callbacks (pmap_update_pde_kernel/_user and pmap_update_pde_teardown).
  */
 struct pde_action {
 	cpuset_t invalidate;	/* processors that invalidate their TLB */
 	vm_offset_t va;		/* virtual address whose PDE is changing */
 	pd_entry_t *pde;	/* PDE to update (used for non-kernel pmaps) */
 	pd_entry_t newpde;	/* value to store into the PDE */
 	u_int store;		/* processor that updates the PDE */
 };
 
 /*
  * Rendezvous action for the kernel pmap: only the processor named in
  * "store" writes the new PDE, looking the entry up through kernel_pmap.
  */
 static void
 pmap_update_pde_kernel(void *arg)
 {
 	struct pde_action *action;
 
 	action = arg;
 	if (PCPU_GET(cpuid) != action->store)
 		return;
 	pde_store(pmap_pde(kernel_pmap, action->va), action->newpde);
 }
 
 /*
  * Rendezvous action for a non-kernel pmap: only the processor named in
  * "store" writes the new PDE, through the pointer supplied by the caller.
  */
 static void
 pmap_update_pde_user(void *arg)
 {
 	struct pde_action *action;
 
 	action = arg;
 	if (PCPU_GET(cpuid) != action->store)
 		return;
 	pde_store(action->pde, action->newpde);
 }
 
 /*
  * Rendezvous teardown: each processor listed in the "invalidate" set
  * flushes its own TLB for the changed mapping.
  */
 static void
 pmap_update_pde_teardown(void *arg)
 {
 	struct pde_action *action;
 
 	action = arg;
 	if (!CPU_ISSET(PCPU_GET(cpuid), &action->invalidate))
 		return;
 	pmap_update_pde_invalidate(action->va, action->newpde);
 }
 
 /*
  * Change the page size for the specified virtual address in a way that
  * prevents any possibility of the TLB ever having two entries that map the
  * same virtual address using different page sizes.  This is the recommended
  * workaround for Erratum 383 on AMD Family 10h processors.  It prevents a
  * machine check exception for a TLB state that is improperly diagnosed as a
  * hardware error.
  *
  * SMP version.  If the pmap is active on any other processor, the store
  * and the TLB flushes are carried out inside smp_rendezvous_cpus();
  * otherwise the PDE is stored directly and only the local TLB is flushed
  * (and only when this processor is in the active set).
  */
 static void
 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 {
 	struct pde_action act;
 	cpuset_t active, other_cpus;
 	u_int cpuid;
 
 	sched_pin();
 	cpuid = PCPU_GET(cpuid);
 	other_cpus = all_cpus;
 	CPU_CLR(cpuid, &other_cpus);
 	/* The kernel pmap is treated as active on every processor. */
 	if (pmap == kernel_pmap)
 		active = all_cpus;
 	else
 		active = pmap->pm_active;
 	if (CPU_OVERLAP(&active, &other_cpus)) {
 		act.store = cpuid;
 		act.invalidate = active;
 		act.va = va;
 		act.pde = pde;
 		act.newpde = newpde;
 		/*
 		 * This processor must take part in the rendezvous because it
 		 * performs the store, even if it is not in "invalidate".
 		 */
 		CPU_SET(cpuid, &active);
 		smp_rendezvous_cpus(active,
 		    smp_no_rendezvous_barrier, pmap == kernel_pmap ?
 		    pmap_update_pde_kernel : pmap_update_pde_user,
 		    pmap_update_pde_teardown, &act);
 	} else {
 		if (pmap == kernel_pmap)
 			pmap_kenter_pde(va, newpde);
 		else
 			pde_store(pde, newpde);
 		if (CPU_ISSET(cpuid, &active))
 			pmap_update_pde_invalidate(va, newpde);
 	}
 	sched_unpin();
 }
 #else /* !SMP */
 /*
  * Normal, non-SMP, 486+ invalidation functions.
  * We inline these within pmap.c for speed.
  */
 /*
  * Uniprocessor version: flush the TLB entry for "va", but only when
  * "pmap" is the kernel pmap.
  */
 static void
 pmap_invalidate_page_int(pmap_t pmap, vm_offset_t va)
 {
 
 	if (pmap != kernel_pmap)
 		return;
 	invlpg(va);
 }
 
 /*
  * Uniprocessor version: flush the TLB entries for every page in
  * [sva, eva), but only when "pmap" is the kernel pmap.
  */
 static void
 pmap_invalidate_range_int(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t page;
 
 	if (pmap != kernel_pmap)
 		return;
 	page = sva;
 	while (page < eva) {
 		invlpg(page);
 		page += PAGE_SIZE;
 	}
 }
 
 /*
  * Uniprocessor version: flush the whole TLB, but only when "pmap" is the
  * kernel pmap.
  */
 static void
 pmap_invalidate_all_int(pmap_t pmap)
 {
 
 	if (pmap != kernel_pmap)
 		return;
 	invltlb();
 }
 
 /*
  * Uniprocessor version: write back and invalidate the processor cache.
  */
 static void
 __CONCAT(PMTYPE, invalidate_cache)(void)
 {
 
 	wbinvd();
 }
 
 /*
  * Uniprocessor version: store the new PDE (through pmap_kenter_pde() for
  * the kernel pmap, directly through "pde" otherwise), then flush the
  * local TLB if the pmap is the kernel pmap or its pm_active set is not
  * empty.
  */
 static void
 pmap_update_pde(pmap_t pmap, vm_offset_t va, pd_entry_t *pde, pd_entry_t newpde)
 {
 
 	if (pmap == kernel_pmap)
 		pmap_kenter_pde(va, newpde);
 	else
 		pde_store(pde, newpde);
 	if (pmap == kernel_pmap || !CPU_EMPTY(&pmap->pm_active))
 		pmap_update_pde_invalidate(va, newpde);
 }
 #endif /* !SMP */
 
 /*
  * PMTYPE-prefixed entry point for single-page TLB invalidation; it simply
  * forwards to pmap_invalidate_page_int().
  */
 static void
 __CONCAT(PMTYPE, invalidate_page)(pmap_t pmap, vm_offset_t va)
 {
 
 	pmap_invalidate_page_int(pmap, va);
 }
 
 /*
  * PMTYPE-prefixed entry point for ranged TLB invalidation; it simply
  * forwards to pmap_invalidate_range_int().
  */
 static void
 __CONCAT(PMTYPE, invalidate_range)(pmap_t pmap, vm_offset_t sva,
     vm_offset_t eva)
 {
 
 	pmap_invalidate_range_int(pmap, sva, eva);
 }
 
 /*
  * PMTYPE-prefixed entry point for full TLB invalidation; it simply
  * forwards to pmap_invalidate_all_int().
  */
 static void
 __CONCAT(PMTYPE, invalidate_all)(pmap_t pmap)
 {
 
 	pmap_invalidate_all_int(pmap);
 }
 
 /*
  * Invalidate the TLB entries covered by a 2- or 4MB page mapping whose
  * (old) PDE value is "pde".
  */
 static void
 pmap_invalidate_pde_page(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
 {
 
 	if ((pde & PG_PROMOTED) == 0) {
 		/*
 		 * Without PG_PROMOTED the TLB holds no 4KB page mappings
 		 * for [va, va + NBPDR), so one INVLPG removes the 2- or 4MB
 		 * page mapping.
 		 */
 		pmap_invalidate_page_int(pmap, va);
 	} else {
 		/*
 		 * PG_PROMOTED means the mapping came from a promotion that
 		 * left the 512 or 1024 4KB page mappings in the TLB.  Both
 		 * sizes may therefore be cached for [va, va + NBPDR), and
 		 * the whole range has to be invalidated.
 		 */
 		pmap_invalidate_range_int(pmap, va, va + NBPDR - 1);
 	}
 }
 
 /*
  * Returns non-zero only when "pmap" is the kernel pmap.  Callers use this
  * to decide whether vtopte() can be used to reach the pmap's page table
  * entries directly.
  */
 static __inline int
 pmap_is_current(pmap_t pmap)
 {
 
 	return (pmap == kernel_pmap);
 }
 
 /*
  * Return a pointer to the page table entry that maps "va" in "pmap", or
  * NULL if the page directory entry is zero.  For a superpage mapping
  * (PG_PS set) the PDE itself is returned.
  *
  * If the given pmap is not the current or kernel pmap, the returned pte must
  * be released by passing it to pmap_pte_release().  In that case the page
  * table page is reached through the PMAP2/PADDR2 window and PMAP2mutex is
  * still held on return.
  */
 static pt_entry_t *
 __CONCAT(PMTYPE, pte)(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t newpf;
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 	if (*pde & PG_PS)
 		return (pde);
 	if (*pde != 0) {
 		/* are we current address space or kernel? */
 		if (pmap_is_current(pmap))
 			return (vtopte(va));
 		mtx_lock(&PMAP2mutex);
 		newpf = *pde & PG_FRAME;
 		/* Remap the window only if it points at a different frame. */
 		if ((*PMAP2 & PG_FRAME) != newpf) {
 			*PMAP2 = newpf | PG_RW | PG_V | PG_A | PG_M;
 			pmap_invalidate_page_int(kernel_pmap,
 			    (vm_offset_t)PADDR2);
 		}
 		return (PADDR2 + (i386_btop(va) & (NPTEPG - 1)));
 	}
 	return (NULL);
 }
 
 /*
  * Release a pte obtained from pmap_pte().  If the pte lies within the
  * PADDR2 window page, PMAP2mutex is dropped; any other value, including
  * NULL, is ignored.
  */
 static __inline void
 pmap_pte_release(pt_entry_t *pte)
 {
 	pt_entry_t *page_base;
 
 	page_base = (pt_entry_t *)((vm_offset_t)pte & ~PAGE_MASK);
 	if (page_base == PADDR2)
 		mtx_unlock(&PMAP2mutex);
 }
 
 /*
  * Invalidate the local TLB entry for the kernel virtual address "caddr".
  *
  * NB:  The sequence of updating a page table followed by accesses to the
  * corresponding pages is subject to the situation described in the "AMD64
  * Architecture Programmer's Manual Volume 2: System Programming" rev. 3.23,
  * "7.3.1 Special Coherency Considerations".  Therefore, issuing the INVLPG
  * right after modifying the PTE bits is crucial.
  */
 static __inline void
 invlcaddr(void *caddr)
 {
 
 	invlpg((u_int)caddr);
 }
 
 /*
  * Super fast pmap_pte routine best used when scanning
  * the pv lists.  This eliminates many coarse-grained
  * invltlb calls.  Note that many of the pv list
  * scans are across different pmaps.  It is very wasteful
  * to do an entire invltlb for checking a single mapping.
  *
  * If the given pmap is not the current pmap, pvh_global_lock
  * must be held and curthread pinned to a CPU.
  *
  * Returns the PDE itself for a superpage mapping, a pointer into the
  * PMAP1/PADDR1 window (or vtopte(va) for the kernel pmap) for a 4KB
  * mapping, and 0 when the PDE is zero.
  */
 static pt_entry_t *
 pmap_pte_quick(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t newpf;
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 	if (*pde & PG_PS)
 		return (pde);
 	if (*pde != 0) {
 		/* are we current address space or kernel? */
 		if (pmap_is_current(pmap))
 			return (vtopte(va));
 		rw_assert(&pvh_global_lock, RA_WLOCKED);
 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 		newpf = *pde & PG_FRAME;
 		if ((*PMAP1 & PG_FRAME) != newpf) {
 			/* Window points elsewhere: remap and flush locally. */
 			*PMAP1 = newpf | PG_RW | PG_V | PG_A | PG_M;
 #ifdef SMP
 			PMAP1cpu = PCPU_GET(cpuid);
 #endif
 			invlcaddr(PADDR1);
 			PMAP1changed++;
 		} else
 #ifdef SMP
 		if (PMAP1cpu != PCPU_GET(cpuid)) {
 			/*
 			 * Same frame, but the window was last set up on a
 			 * different CPU, so flush this CPU's entry for it.
 			 */
 			PMAP1cpu = PCPU_GET(cpuid);
 			invlcaddr(PADDR1);
 			PMAP1changedcpu++;
 		} else
 #endif
 			PMAP1unchanged++;
 		return (PADDR1 + (i386_btop(va) & (NPTEPG - 1)));
 	}
 	return (0);
 }
 
 /*
  * Variant of pmap_pte_quick() that uses the separate PMAP3/PADDR3 window
  * and never takes the vtopte() shortcut.  pvh_global_lock must be write
  * locked and curthread pinned whenever the PDE is a non-zero 4KB-page
  * entry.  Note that it updates the same PMAP1* statistics counters as
  * pmap_pte_quick().
  */
 static pt_entry_t *
 pmap_pte_quick3(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t newpf;
 	pd_entry_t *pde;
 
 	pde = pmap_pde(pmap, va);
 	if (*pde & PG_PS)
 		return (pde);
 	if (*pde != 0) {
 		rw_assert(&pvh_global_lock, RA_WLOCKED);
 		KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 		newpf = *pde & PG_FRAME;
 		if ((*PMAP3 & PG_FRAME) != newpf) {
 			/* Window points elsewhere: remap and flush locally. */
 			*PMAP3 = newpf | PG_RW | PG_V | PG_A | PG_M;
 #ifdef SMP
 			PMAP3cpu = PCPU_GET(cpuid);
 #endif
 			invlcaddr(PADDR3);
 			PMAP1changed++;
 		} else
 #ifdef SMP
 		if (PMAP3cpu != PCPU_GET(cpuid)) {
 			/* Window last set up on another CPU: flush here. */
 			PMAP3cpu = PCPU_GET(cpuid);
 			invlcaddr(PADDR3);
 			PMAP1changedcpu++;
 		} else
 #endif
 			PMAP1unchanged++;
 		return (PADDR3 + (i386_btop(va) & (NPTEPG - 1)));
 	}
 	return (0);
 }
 
 /*
  * Read and return the value (not a pointer) of the PTE that maps "va",
  * given the PDE "pde" that references its page table page.  The page
  * table page is reached through the per-CPU pmap_eh_ptep/pmap_eh_va
  * window, inside a critical section.  The pmap lock must be held.
  */
 static pt_entry_t
 pmap_pte_ufast(pmap_t pmap, vm_offset_t va, pd_entry_t pde)
 {
 	pt_entry_t *eh_ptep, pte, *ptep;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pde &= PG_FRAME;
 	critical_enter();
 	eh_ptep = (pt_entry_t *)PCPU_GET(pmap_eh_ptep);
 	/* Remap the per-CPU window only if it points at another frame. */
 	if ((*eh_ptep & PG_FRAME) != pde) {
 		*eh_ptep = pde | PG_RW | PG_V | PG_A | PG_M;
 		invlcaddr((void *)PCPU_GET(pmap_eh_va));
 	}
 	ptep = (pt_entry_t *)PCPU_GET(pmap_eh_va) + (i386_btop(va) &
 	    (NPTEPG - 1));
 	pte = *ptep;
 	critical_exit();
 	return (pte);
 }
 
 /*
  * Return the physical address that the kernel page table maps at the
  * virtual address "va".
  *
  * This function may be used before pmap_bootstrap() is called.
  */
 static vm_paddr_t
 __CONCAT(PMTYPE, kextract)(vm_offset_t va)
 {
 	vm_paddr_t pa;
 
 	pa = pte_load(&PTD[va >> PDRSHIFT]);
 	if ((pa & PG_PS) != 0)
 		return ((pa & PG_PS_FRAME) | (va & PDRMASK));
 
 	/*
 	 * A promotion may be changing the PDE concurrently.  vtopte()
 	 * would go through the new PDE, so it must not be used to reach
 	 * the PTE here.  Going through the old page table page is safe,
 	 * because a promotion preserves that page.
 	 */
 	pa = KPTmap[i386_btop(va)];
 	return ((pa & PG_FRAME) | (va & PAGE_MASK));
 }
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Return the physical address mapped at "va" in "pmap",
  *		or 0 when the page directory entry is zero.  The pmap
  *		lock is taken for the duration of the lookup.
  */
 static vm_paddr_t
 __CONCAT(PMTYPE, extract)(pmap_t pmap, vm_offset_t va)
 {
 	vm_paddr_t pa;
 	pd_entry_t pde;
 	pt_entry_t pte;
 
 	PMAP_LOCK(pmap);
 	pde = pmap->pm_pdir[va >> PDRSHIFT];
 	if (pde == 0) {
 		pa = 0;
 	} else if ((pde & PG_PS) != 0) {
 		/* Superpage: the frame comes straight from the PDE. */
 		pa = (pde & PG_PS_FRAME) | (va & PDRMASK);
 	} else {
 		pte = pmap_pte_ufast(pmap, va, pde);
 		pa = (pte & PG_FRAME) | (va & PAGE_MASK);
 	}
 	PMAP_UNLOCK(pmap);
 	return (pa);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Atomically extract and hold the physical page
  *		with the given pmap and virtual address pair
  *		if that mapping permits the given protection.
  *
  *	Returns the wired vm_page, or NULL if there is no mapping or the
  *	mapping is read-only while VM_PROT_WRITE was requested.
  */
 static vm_page_t
 __CONCAT(PMTYPE, extract_and_hold)(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pd_entry_t pde;
 	pt_entry_t pte;
 	vm_page_t m;
 	vm_paddr_t pa;
 
 	pa = 0;
 	m = NULL;
 	PMAP_LOCK(pmap);
 retry:
 	/*
 	 * A non-zero return from vm_page_pa_tryrelock() sends control back
 	 * here, so the PDE/PTE are re-read before the page is used.
 	 */
 	pde = *pmap_pde(pmap, va);
 	if (pde != 0) {
 		if (pde & PG_PS) {
 			/* Superpage mapping. */
 			if ((pde & PG_RW) || (prot & VM_PROT_WRITE) == 0) {
 				if (vm_page_pa_tryrelock(pmap, (pde &
 				    PG_PS_FRAME) | (va & PDRMASK), &pa))
 					goto retry;
 				m = PHYS_TO_VM_PAGE(pa);
 			}
 		} else {
 			/* 4KB page mapping. */
 			pte = pmap_pte_ufast(pmap, va, pde);
 			if (pte != 0 &&
 			    ((pte & PG_RW) || (prot & VM_PROT_WRITE) == 0)) {
 				if (vm_page_pa_tryrelock(pmap, pte & PG_FRAME,
 				    &pa))
 					goto retry;
 				m = PHYS_TO_VM_PAGE(pa);
 			}
 		}
 		if (m != NULL)
 			vm_page_wire(m);
 	}
 	PA_UNLOCK_COND(pa);
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Enter a read/write, valid kernel mapping of physical address "pa" at
  * virtual address "va".  No TLB invalidation is done.
  * Note: not SMP coherent.
  *
  * This function may be used before pmap_bootstrap() is called.
  */
 static void
 __CONCAT(PMTYPE, kenter)(vm_offset_t va, vm_paddr_t pa)
 {
 
 	pte_store(vtopte(va), pa | PG_RW | PG_V);
 }
 
 /*
  * Like pmap_kenter(), but also applies the PTE cache bits that correspond
  * to the caching mode "mode".
  */
 static __inline void
 pmap_kenter_attr(vm_offset_t va, vm_paddr_t pa, int mode)
 {
 	int cache;
 
 	cache = pmap_cache_bits(kernel_pmap, mode, 0);
 	pte_store(vtopte(va), pa | PG_RW | PG_V | cache);
 }
 
 /*
  * Clear the kernel page table entry that maps "va".  No TLB invalidation
  * is done.
  * Note: not SMP coherent.
  *
  * This function may be used before pmap_bootstrap() is called.
  */
 static void
 __CONCAT(PMTYPE, kremove)(vm_offset_t va)
 {
 
 	pte_clear(vtopte(va));
 }
 
 /*
  *	Used to map a range of physical addresses into kernel
  *	virtual address space.
  *
  *	The value passed in '*virt' is a suggested virtual address for
  *	the mapping. Architectures which can support a direct-mapped
  *	physical to virtual region can return the appropriate address
  *	within that region, leaving '*virt' unchanged. Other
  *	architectures should map the pages starting at '*virt' and
  *	update '*virt' with the first usable address after the mapped
  *	region.
  *
  *	This implementation maps [start, end) read/write starting at or
  *	after '*virt', using superpage mappings where alignment, size and
  *	pseflag allow, and returns the virtual address of the first page
  *	mapped.  The "prot" argument is not consulted.
  */
 static vm_offset_t
 __CONCAT(PMTYPE, map)(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end,
     int prot)
 {
 	vm_offset_t va, sva;
 	vm_paddr_t superpage_offset;
 	pd_entry_t newpde;
 
 	va = *virt;
 	/*
 	 * Does the physical address range's size and alignment permit at
 	 * least one superpage mapping to be created?
 	 */
 	superpage_offset = start & PDRMASK;
 	if ((end - start) - ((NBPDR - superpage_offset) & PDRMASK) >= NBPDR) {
 		/*
 		 * Increase the starting virtual address so that its alignment
 		 * does not preclude the use of superpage mappings.
 		 */
 		if ((va & PDRMASK) < superpage_offset)
 			va = (va & ~PDRMASK) + superpage_offset;
 		else if ((va & PDRMASK) > superpage_offset)
 			va = ((va + PDRMASK) & ~PDRMASK) + superpage_offset;
 	}
 	sva = va;
 	while (start < end) {
 		if ((start & PDRMASK) == 0 && end - start >= NBPDR &&
 		    pseflag != 0) {
 			/* Aligned with a full superpage left: map it whole. */
 			KASSERT((va & PDRMASK) == 0,
 			    ("pmap_map: misaligned va %#x", va));
 			newpde = start | PG_PS | PG_RW | PG_V;
 			pmap_kenter_pde(va, newpde);
 			va += NBPDR;
 			start += NBPDR;
 		} else {
 			pmap_kenter(va, start);
 			va += PAGE_SIZE;
 			start += PAGE_SIZE;
 		}
 	}
 	pmap_invalidate_range_int(kernel_pmap, sva, va);
 	*virt = va;
 	return (sva);
 }
 
 
 /*
  * Add a list of wired pages to the kva
  * this routine is only used for temporary
  * kernel mappings that do not need to have
  * page modification or references recorded.
  * Note that old mappings are simply written
  * over.  The page *must* be wired.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  *
  * Maps the "count" pages in "ma" at consecutive kernel virtual addresses
  * starting at "sva".  A PTE is rewritten only when its frame or cache
  * bits differ from the desired value, and the TLB is invalidated only if
  * at least one overwritten PTE had been valid.
  */
 static void
 __CONCAT(PMTYPE, qenter)(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pt_entry_t *endpte, oldpte, pa, *pte;
 	vm_page_t m;
 
 	/* "oldpte" accumulates the bits of every PTE that gets replaced. */
 	oldpte = 0;
 	pte = vtopte(sva);
 	endpte = pte + count;
 	while (pte < endpte) {
 		m = *ma++;
 		pa = VM_PAGE_TO_PHYS(m) | pmap_cache_bits(kernel_pmap,
 		    m->md.pat_mode, 0);
 		if ((*pte & (PG_FRAME | PG_PTE_CACHE)) != pa) {
 			oldpte |= *pte;
 #ifdef PMAP_PAE_COMP
 			pte_store(pte, pa | pg_nx | PG_RW | PG_V);
 #else
 			pte_store(pte, pa | PG_RW | PG_V);
 #endif
 		}
 		pte++;
 	}
 	if (__predict_false((oldpte & PG_V) != 0))
 		pmap_invalidate_range_int(kernel_pmap, sva, sva + count *
 		    PAGE_SIZE);
 }
 
 /*
  * Remove "count" consecutive page mappings from the kernel, starting at
  * "sva", and then invalidate the range that was covered.  Intended only
  * for temporary mappings.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 static void
 __CONCAT(PMTYPE, qremove)(vm_offset_t sva, int count)
 {
 	vm_offset_t va;
 
 	for (va = sva; count > 0; count--, va += PAGE_SIZE)
 		pmap_kremove(va);
 	pmap_invalidate_range_int(kernel_pmap, sva, va);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 /*
  * Queue an unused page table page on "free", the list of pages to be
  * handed back to the physical memory manager once the TLB has been
  * updated.  The page's PG_ZERO flag is set or cleared to match
  * "set_PG_ZERO".
  */
 static __inline void
 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
     boolean_t set_PG_ZERO)
 {
 
 	if (!set_PG_ZERO)
 		m->flags &= ~PG_ZERO;
 	else
 		m->flags |= PG_ZERO;
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
 
 /*
  * Add the page table page "mpte" to the pmap's collection of idle page
  * table pages.  Every page table page of a pmap maps its own distinct
  * range of virtual addresses, and the collection is ordered by that range.
  *
  * The page's "valid" field records how it got here: all bits set when
  * "promoted" is true, zero otherwise.  When "promoted" is false the page
  * must be zero filled.  Returns the result of vm_radix_insert().
  */
 static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	if (promoted)
 		mpte->valid = VM_PAGE_BITS_ALL;
 	else
 		mpte->valid = 0;
 	return (vm_radix_insert(&pmap->pm_root, mpte));
 }
 
 /*
  * Removes the page table page mapping the specified virtual address from the
  * specified pmap's collection of idle page table pages, and returns it.
  * Otherwise, returns NULL if there is no page table page corresponding to the
  * specified virtual address.
  *
  * The pmap lock must be held.  The collection is keyed by page directory
  * index (va >> PDRSHIFT).
  */
 static __inline vm_page_t
 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	return (vm_radix_remove(&pmap->pm_root, va >> PDRSHIFT));
 }
 
 /*
  * Drop one reference from a page table page's wire count, which tracks
  * how many valid page table entries the page holds.  When the count
  * reaches zero the page is unmapped via _pmap_unwire_ptp() and TRUE is
  * returned; otherwise FALSE is returned.
  */
 static inline boolean_t
 pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
 {
 
 	if (--m->wire_count != 0)
 		return (FALSE);
 	_pmap_unwire_ptp(pmap, m, free);
 	return (TRUE);
 }
 
 /*
  * Unmap the page table page "m" from "pmap": clear its page directory
  * entry, decrement the pmap's resident count, and queue the page on
  * "free" for release by the caller.  Must not be used on the kernel pmap.
  */
 static void
 _pmap_unwire_ptp(pmap_t pmap, vm_page_t m, struct spglist *free)
 {
 
 	/*
 	 * unmap the page table page
 	 */
 	pmap->pm_pdir[m->pindex] = 0;
 	--pmap->pm_stats.resident_count;
 
 	/*
 	 * There is no need to invalidate the recursive mapping since
 	 * we never instantiate such a mapping for the usermode pmaps,
 	 * and never remove page table pages from the kernel pmap.
 	 * Put the page on a list so that it is released after all TLB
 	 * shootdown is done.
 	 */
 	MPASS(pmap != kernel_pmap);
 	pmap_add_delayed_free_list(m, free, TRUE);
 }
 
 /*
  * Called after a page table entry has been removed: drop one wiring from
  * the page table page that maps "va" and free that page if the count
  * reaches zero.  Nothing is done for the kernel pmap.  Returns the result
  * of pmap_unwire_ptp(), or 0 for the kernel pmap.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, struct spglist *free)
 {
 	vm_page_t ptpage;
 
 	if (pmap == kernel_pmap)
 		return (0);
 	ptpage = PHYS_TO_VM_PAGE(*pmap_pde(pmap, va) & PG_FRAME);
 	return (pmap_unwire_ptp(pmap, ptpage, free));
 }
 
 /*
  * Initialize the pmap for the swapper process.
  *
  * Unlike pmap_pinit(), no page directory is allocated: the pmap is
  * pointed at IdlePTD (and IdlePDPT under PMAP_PAE_COMP).
  */
 static void
 __CONCAT(PMTYPE, pinit0)(pmap_t pmap)
 {
 
 	PMAP_LOCK_INIT(pmap);
 	pmap->pm_pdir = IdlePTD;
 #ifdef PMAP_PAE_COMP
 	pmap->pm_pdpt = IdlePDPT;
 #endif
 	pmap->pm_root.rt_root = 0;
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 	pmap_activate_boot(pmap);
 }
 
 /*
  * Initialize a preallocated and zeroed pmap structure,
  * such as one in a vmspace structure.
  *
  * Returns 1 on success and 0 if KVA for the page directory could not be
  * allocated.
  */
 static int
 __CONCAT(PMTYPE, pinit)(pmap_t pmap)
 {
 	vm_page_t m;
 	int i;
 
 	/*
 	 * No need to allocate page table space yet but we do need a valid
 	 * page directory table.
 	 */
 	if (pmap->pm_pdir == NULL) {
 		pmap->pm_pdir = (pd_entry_t *)kva_alloc(NBPTD);
 		if (pmap->pm_pdir == NULL)
 			return (0);
 #ifdef PMAP_PAE_COMP
 		pmap->pm_pdpt = uma_zalloc(pdptzone, M_WAITOK | M_ZERO);
 		KASSERT(((vm_offset_t)pmap->pm_pdpt &
 		    ((NPGPTD * sizeof(pdpt_entry_t)) - 1)) == 0,
 		    ("pmap_pinit: pdpt misaligned"));
 		KASSERT(pmap_kextract((vm_offset_t)pmap->pm_pdpt) < (4ULL<<30),
 		    ("pmap_pinit: pdpt above 4g"));
 #endif
 		pmap->pm_root.rt_root = 0;
 	}
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_pinit: pmap has reserved page table page(s)"));
 
 	/*
 	 * allocate the page directory page(s)
 	 */
 	for (i = 0; i < NPGPTD; i++) {
 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 		    VM_ALLOC_WIRED | VM_ALLOC_ZERO | VM_ALLOC_WAITOK);
 		pmap->pm_ptdpg[i] = m;
 #ifdef PMAP_PAE_COMP
 		pmap->pm_pdpt[i] = VM_PAGE_TO_PHYS(m) | PG_V;
 #endif
 	}
 
 	/* Map the new page directory page(s) at the pm_pdir KVA. */
 	pmap_qenter((vm_offset_t)pmap->pm_pdir, pmap->pm_ptdpg, NPGPTD);
 #ifdef PMAP_PAE_COMP
 	if ((cpu_feature & CPUID_PAT) == 0) {
 		pmap_invalidate_cache_range(
 		    trunc_page((vm_offset_t)pmap->pm_pdpt),
 		    round_page((vm_offset_t)pmap->pm_pdpt +
 		    NPGPTD * sizeof(pdpt_entry_t)));
 	}
 #endif
 
 	/* Zero any directory page that was not handed back pre-zeroed. */
 	for (i = 0; i < NPGPTD; i++)
 		if ((pmap->pm_ptdpg[i]->flags & PG_ZERO) == 0)
 			pagezero(pmap->pm_pdir + (i * NPDEPG));
 
 	/* Install the trampoline mapping. */
 	pmap->pm_pdir[TRPTDI] = PTD[TRPTDI];
 
 	CPU_ZERO(&pmap->pm_active);
 	TAILQ_INIT(&pmap->pm_pvchunk);
 	bzero(&pmap->pm_stats, sizeof pmap->pm_stats);
 
 	return (1);
 }
 
 /*
  * this routine is called if the page table page is not
  * mapped correctly.
  *
  * Allocates a new, zeroed page table page and installs it in the page
  * directory slot "ptepindex".  Returns the page, or NULL if the
  * allocation failed.  On failure, unless PMAP_ENTER_NOSLEEP is set in
  * "flags", the pmap lock and pvh_global_lock are dropped around
  * vm_wait() and reacquired, so the caller must revalidate its state.
  */
 static vm_page_t
 _pmap_allocpte(pmap_t pmap, u_int ptepindex, u_int flags)
 {
 	vm_paddr_t ptepa;
 	vm_page_t m;
 
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if ((flags & PMAP_ENTER_NOSLEEP) == 0) {
 			PMAP_UNLOCK(pmap);
 			rw_wunlock(&pvh_global_lock);
 			vm_wait(NULL);
 			rw_wlock(&pvh_global_lock);
 			PMAP_LOCK(pmap);
 		}
 
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	pmap->pm_stats.resident_count++;
 
 	ptepa = VM_PAGE_TO_PHYS(m);
 	pmap->pm_pdir[ptepindex] =
 		(pd_entry_t) (ptepa | PG_U | PG_RW | PG_V | PG_A | PG_M);
 
 	return (m);
 }
 
 /*
  * Return the page table page that maps "va" in "pmap", with one more
  * wiring added, allocating the page if the page directory entry is empty.
  * A superpage mapping found at "va" is demoted first.  Returns NULL only
  * when allocation fails and PMAP_ENTER_NOSLEEP is set in "flags";
  * otherwise the lookup is retried until it succeeds.
  */
 static vm_page_t
 pmap_allocpte(pmap_t pmap, vm_offset_t va, u_int flags)
 {
 	u_int ptepindex;
 	pd_entry_t ptepa;
 	vm_page_t m;
 
 	/*
 	 * Calculate pagetable page index
 	 */
 	ptepindex = va >> PDRSHIFT;
 retry:
 	/*
 	 * Get the page directory entry
 	 */
 	ptepa = pmap->pm_pdir[ptepindex];
 
 	/*
 	 * This supports switching from a 4MB page to a
 	 * normal 4K page.
 	 */
 	if (ptepa & PG_PS) {
 		(void)pmap_demote_pde(pmap, &pmap->pm_pdir[ptepindex], va);
 		ptepa = pmap->pm_pdir[ptepindex];
 	}
 
 	/*
 	 * If the page table page is mapped, we just increment the
 	 * hold count, and activate it.
 	 */
 	if (ptepa) {
 		m = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
 		m->wire_count++;
 	} else {
 		/*
 		 * Here if the pte page isn't mapped, or if it has
 		 * been deallocated.
 		 */
 		m = _pmap_allocpte(pmap, ptepindex, flags);
 		if (m == NULL && (flags & PMAP_ENTER_NOSLEEP) == 0)
 			goto retry;
 	}
 	return (m);
 }
 
 
 /***************************************************
 * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Release any resources held by the given physical map.
  * Called when a pmap initialized by pmap_pinit is being released.
  * Should only be called if the map contains no valid mappings.
  *
  * Unmaps the page directory page(s) from the pm_pdir KVA, then unwires
  * and frees them.  The pm_pdir KVA itself is not freed here.
  */
 static void
 __CONCAT(PMTYPE, release)(pmap_t pmap)
 {
 	vm_page_t m;
 	int i;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
 	    ("pmap_release: pmap has reserved page table page(s)"));
 	KASSERT(CPU_EMPTY(&pmap->pm_active),
 	    ("releasing active pmap %p", pmap));
 
 	pmap_qremove((vm_offset_t)pmap->pm_pdir, NPGPTD);
 
 	for (i = 0; i < NPGPTD; i++) {
 		m = pmap->pm_ptdpg[i];
 #ifdef PMAP_PAE_COMP
 		KASSERT(VM_PAGE_TO_PHYS(m) == (pmap->pm_pdpt[i] & PG_FRAME),
 		    ("pmap_release: got wrong ptd page"));
 #endif
 		vm_page_unwire_noq(m);
 		vm_page_free(m);
 	}
 }
 
 /*
  * grow the number of kernel page table entries, if needed
  *
  * Advances kernel_vm_end, one NBPDR-sized step at a time, until it reaches
  * "addr" (rounded up to NBPDR and clamped to the end of kernel_map),
  * allocating and installing a page table page for every step whose page
  * directory entry is still empty.  Panics if a page cannot be allocated.
  * The kernel_map system mutex must be held.
  */
 static void
 __CONCAT(PMTYPE, growkernel)(vm_offset_t addr)
 {
 	vm_paddr_t ptppaddr;
 	vm_page_t nkpg;
 	pd_entry_t newpdir;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 	addr = roundup2(addr, NBPDR);
 	/* "addr - 1" also catches roundup2() wrapping around to zero. */
 	if (addr - 1 >= vm_map_max(kernel_map))
 		addr = vm_map_max(kernel_map);
 	while (kernel_vm_end < addr) {
 		if (pdir_pde(PTD, kernel_vm_end)) {
 			/* Already backed by a page table page; skip ahead. */
 			kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 				kernel_vm_end = vm_map_max(kernel_map);
 				break;
 			}
 			continue;
 		}
 
 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> PDRSHIFT,
 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 		    VM_ALLOC_ZERO);
 		if (nkpg == NULL)
 			panic("pmap_growkernel: no memory to grow kernel");
 
 		nkpt++;
 
 		if ((nkpg->flags & PG_ZERO) == 0)
 			pmap_zero_page(nkpg);
 		ptppaddr = VM_PAGE_TO_PHYS(nkpg);
 		newpdir = (pd_entry_t) (ptppaddr | PG_V | PG_RW | PG_A | PG_M);
 		pdir_pde(KPTD, kernel_vm_end) = newpdir;
 
 		pmap_kenter_pde(kernel_vm_end, newpdir);
 		kernel_vm_end = (kernel_vm_end + NBPDR) & ~PDRMASK;
 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 			kernel_vm_end = vm_map_max(kernel_map);
 			break;
 		}
 	}
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 /*
  * Compile-time layout checks for pv chunks.  pv_to_chunk() locates a chunk
  * by masking a pv entry's address with PAGE_MASK, so a chunk must be
  * exactly one page.  The pc_freemask[] table is written out for 11 bitmap
  * words covering 336 entries (10 * 32 + 16).
  */
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 11);
 CTASSERT(_NPCPV == 336);
 
 /*
  * Map a pv entry to the chunk that contains it by clearing the in-page
  * offset bits (PAGE_MASK) of the entry's address.
  */
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 	uintptr_t chunk_base;
 
 	chunk_base = (uintptr_t)pv;
 	chunk_base &= ~(uintptr_t)PAGE_MASK;
 	return ((struct pv_chunk *)chunk_base);
 }
 
 /* The pmap that owns a pv entry is recorded in the entry's chunk. */
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 /*
  * pc_map[] values of a completely free chunk.  A set bit marks a free pv
  * entry: free_pv_entry() sets the bit and get_pv_entry() clears it.
  */
 #define	PC_FREE0_9	0xfffffffful	/* Free values for index 0 through 9 */
 #define	PC_FREE10	0x0000fffful	/* Free values for index 10 */
 
 /* Per-word "all free" masks, indexed like pc_map[]. */
 static const uint32_t pc_freemask[_NPCM] = {
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE0_9, PC_FREE0_9,
 	PC_FREE0_9, PC_FREE10
 };
 
 #ifdef PV_STATS
 /* Statistics counters updated through PV_STAT(); defined elsewhere. */
 extern int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 extern long pv_entry_frees, pv_entry_allocs;
 extern int pv_entry_spare;
 #endif
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.
  *
  * Chunks are taken from the head of the global "pv_chunks" LRU list and
  * every non-wired 4 KB mapping that they track is destroyed.  The scan
  * ends as soon as one of the following holds:
  *  - a chunk became entirely free: its page is unmapped and returned;
  *  - a pv entry was freed in a chunk of "locked_pmap": the caller can
  *    allocate from that chunk;
  *  - a chunk VA is available (pv_vafree != 0) and a page table page was
  *    released onto the local "free" list: that page is recycled;
  *  - the list of chunks is exhausted.
  *
  * "locked_pmap" must be locked by the caller.  Other pmaps are locked
  * here; a pmap whose address is not above locked_pmap's is only
  * try-locked, and its chunk is skipped on failure.
  *
  * Returns a page that can back a new pv chunk, or NULL if none was
  * obtained.
  */
 static vm_page_t
 pmap_pv_reclaim(pmap_t locked_pmap)
 {
 	struct pch newtail;
 	struct pv_chunk *pc;
 	struct md_page *pvh;
 	pd_entry_t *pde;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
 	pv_entry_t pv;
 	vm_offset_t va;
 	vm_page_t m, m_pc;
 	struct spglist free;
 	uint32_t inuse;
 	int bit, field, freed;
 
 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
 	pmap = NULL;
 	m_pc = NULL;
 	SLIST_INIT(&free);
 	/* Chunks that were examined are parked here and re-appended at "out". */
 	TAILQ_INIT(&newtail);
 	while ((pc = TAILQ_FIRST(&pv_chunks)) != NULL && (pv_vafree == 0 ||
 	    SLIST_EMPTY(&free))) {
 		TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 		if (pmap != pc->pc_pmap) {
 			/* Done with the previous pmap: flush and unlock it. */
 			if (pmap != NULL) {
 				pmap_invalidate_all_int(pmap);
 				if (pmap != locked_pmap)
 					PMAP_UNLOCK(pmap);
 			}
 			pmap = pc->pc_pmap;
 			/* Avoid deadlock and lock recursion. */
 			if (pmap > locked_pmap)
 				PMAP_LOCK(pmap);
 			else if (pmap != locked_pmap && !PMAP_TRYLOCK(pmap)) {
 				pmap = NULL;
 				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 				continue;
 			}
 		}
 
 		/*
 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
 		 */
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/* Walk the allocated (clear) bits of this bitmap word. */
 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
 			    inuse != 0; inuse &= ~(1UL << bit)) {
 				bit = bsfl(inuse);
 				pv = &pc->pc_pventry[field * 32 + bit];
 				va = pv->pv_va;
 				pde = pmap_pde(pmap, va);
 				/* Superpage mappings are left alone. */
 				if ((*pde & PG_PS) != 0)
 					continue;
 				pte = __CONCAT(PMTYPE, pte)(pmap, va);
 				tpte = *pte;
 				/* Only clear the PTE if the mapping is not wired. */
 				if ((tpte & PG_W) == 0)
 					tpte = pte_load_clear(pte);
 				pmap_pte_release(pte);
 				if ((tpte & PG_W) != 0)
 					continue;
 				KASSERT(tpte != 0,
 				    ("pmap_pv_reclaim: pmap %p va %x zero pte",
 				    pmap, va));
 				if ((tpte & PG_G) != 0)
 					pmap_invalidate_page_int(pmap, va);
 				/* Transfer modified/referenced state to the page. */
 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 					vm_page_dirty(m);
 				if ((tpte & PG_A) != 0)
 					vm_page_aflag_set(m, PGA_REFERENCED);
 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 				if (TAILQ_EMPTY(&m->md.pv_list) &&
 				    (m->flags & PG_FICTITIOUS) == 0) {
 					pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						vm_page_aflag_clear(m,
 						    PGA_WRITEABLE);
 					}
 				}
 				/* Mark the pv entry free in the chunk. */
 				pc->pc_map[field] |= 1UL << bit;
 				pmap_unuse_pt(pmap, va, &free);
 				freed++;
 			}
 		}
 		if (freed == 0) {
 			TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 			continue;
 		}
 		/* Every freed mapping is for a 4 KB page. */
 		pmap->pm_stats.resident_count -= freed;
 		PV_STAT(pv_entry_frees += freed);
 		PV_STAT(pv_entry_spare += freed);
 		pv_entry_count -= freed;
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		for (field = 0; field < _NPCM; field++)
 			if (pc->pc_map[field] != pc_freemask[field]) {
 				/*
 				 * The chunk still has entries in use.  Put it
 				 * at the head of the pmap's list, where
 				 * get_pv_entry() looks for free entries.
 				 */
 				TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc,
 				    pc_list);
 				TAILQ_INSERT_TAIL(&newtail, pc, pc_lru);
 
 				/*
 				 * One freed pv entry in locked_pmap is
 				 * sufficient.
 				 */
 				if (pmap == locked_pmap)
 					goto out;
 				break;
 			}
 		if (field == _NPCM) {
 			PV_STAT(pv_entry_spare -= _NPCPV);
 			PV_STAT(pc_chunk_count--);
 			PV_STAT(pc_chunk_frees++);
 			/* Entire chunk is free; return it. */
 			m_pc = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
 			pmap_qremove((vm_offset_t)pc, 1);
 			pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
 			break;
 		}
 	}
 out:
 	TAILQ_CONCAT(&pv_chunks, &newtail, pc_lru);
 	if (pmap != NULL) {
 		pmap_invalidate_all_int(pmap);
 		if (pmap != locked_pmap)
 			PMAP_UNLOCK(pmap);
 	}
 	/*
 	 * If no whole chunk was freed but a chunk VA is available and at
 	 * least one page table page was released, take that page for the
 	 * new chunk.  The list must be non-empty here: SLIST_FIRST() of an
 	 * empty list is NULL and must not be dereferenced.
 	 */
 	if (m_pc == NULL && pv_vafree != 0 && !SLIST_EMPTY(&free)) {
 		m_pc = SLIST_FIRST(&free);
 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
 		/* Recycle a freed page table page. */
 		m_pc->wire_count = 1;
 	}
 	vm_page_free_pages_toq(&free, true);
 	return (m_pc);
 }
 
 /*
  * Return a pv entry to its chunk.  The entry's bit in the chunk's pc_map
  * is set again.  A chunk that still has entries in use is kept at the head
  * of the pmap's chunk list; a chunk that has become entirely free is
  * unlinked and handed to free_pv_chunk().
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	struct pv_chunk *chunk;
 	int slot, word;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(pv_entry_frees++);
 	PV_STAT(pv_entry_spare++);
 	pv_entry_count--;
 
 	/* Mark the entry's slot as free in the chunk's bitmap. */
 	chunk = pv_to_chunk(pv);
 	slot = pv - &chunk->pc_pventry[0];
 	chunk->pc_map[slot / 32] |= 1ul << (slot % 32);
 
 	for (word = 0; word < _NPCM; word++) {
 		if (chunk->pc_map[word] == pc_freemask[word])
 			continue;
 
 		/*
 		 * Some entry is still allocated.  98% of the time, the chunk
 		 * is already at the head of the list.  If it isn't already,
 		 * move it to the head.
 		 */
 		if (__predict_false(TAILQ_FIRST(&pmap->pm_pvchunk) !=
 		    chunk)) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, chunk, pc_list);
 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, chunk, pc_list);
 		}
 		return;
 	}
 
 	/* Every entry is free: give the whole chunk back. */
 	TAILQ_REMOVE(&pmap->pm_pvchunk, chunk, pc_list);
 	free_pv_chunk(chunk);
 }
 
 static void
 free_pv_chunk(struct pv_chunk *pc)
 {
 	vm_page_t m;
 
  	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	PV_STAT(pv_entry_spare -= _NPCPV);
 	PV_STAT(pc_chunk_count--);
 	PV_STAT(pc_chunk_frees++);
 	/* entire chunk is free, return it */
 	m = PHYS_TO_VM_PAGE(pmap_kextract((vm_offset_t)pc));
 	pmap_qremove((vm_offset_t)pc, 1);
 	vm_page_unwire_noq(m);
 	vm_page_free(m);
 	pmap_ptelist_free(&pv_vafree, (vm_offset_t)pc);
 }
 
 /*
  * get a new pv_entry, allocating a block from the system
  * when needed.
  *
  * The entry is taken from the first chunk on the pmap's chunk list when
  * that chunk has a free slot.  Otherwise a new chunk is created, which
  * needs both a chunk virtual address from "pv_vafree" and a page.
  *
  * If "try" is true, the function never reclaims: when a new chunk cannot
  * be set up it undoes the pv_entry_count increment and returns NULL.
  * If "try" is false, pmap_pv_reclaim() is called and the allocation is
  * retried until it succeeds.
  *
  * The pvh global lock must be write-locked and the pmap locked.
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, boolean_t try)
 {
 	static const struct timeval printinterval = { 60, 0 };
 	static struct timeval lastprint;
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(pv_entry_allocs++);
 	pv_entry_count++;
 	/* Warn, rate-limited by "printinterval", once above the high water. */
 	if (pv_entry_count > pv_entry_high_water)
 		if (ratecheck(&lastprint, &printinterval))
 			printf("Approaching the limit on PV entries, consider "
 			    "increasing either the vm.pmap.shpgperproc or the "
 			    "vm.pmap.pv_entries tunable.\n");
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		/* Find the first bitmap word with a free (set) bit. */
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = bsfl(pc->pc_map[field]);
 				break;
 			}
 		}
 		/* "bit" is only valid when the search above succeeded. */
 		if (field < _NPCM) {
 			pv = &pc->pc_pventry[field * 32 + bit];
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			for (field = 0; field < _NPCM; field++)
 				if (pc->pc_map[field] != 0) {
 					PV_STAT(pv_entry_spare--);
 					return (pv);	/* not full, return */
 				}
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 			PV_STAT(pv_entry_spare--);
 			return (pv);
 		}
 	}
 	/*
 	 * Access to the ptelist "pv_vafree" is synchronized by the pvh
 	 * global lock.  If "pv_vafree" is currently non-empty, it will
 	 * remain non-empty until pmap_ptelist_alloc() completes.
 	 */
 	if (pv_vafree == 0 || (m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) == NULL) {
 		if (try) {
 			/* Undo the accounting done on entry. */
 			pv_entry_count--;
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
 		/*
 		 * A NULL result means that no page was obtained; the
 		 * reclaim may still have freed a pv entry in one of this
 		 * pmap's chunks, so look at the chunk list again.
 		 */
 		m = pmap_pv_reclaim(pmap);
 		if (m == NULL)
 			goto retry;
 	}
 	PV_STAT(pc_chunk_count++);
 	PV_STAT(pc_chunk_allocs++);
 	/* Map the page at a free chunk address and initialize the chunk. */
 	pc = (struct pv_chunk *)pmap_ptelist_alloc(&pv_vafree);
 	pmap_qenter((vm_offset_t)pc, &m, 1);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = pc_freemask[0] & ~1ul;	/* preallocated bit 0 */
 	for (field = 1; field < _NPCM; field++)
 		pc->pc_map[field] = pc_freemask[field];
 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	/* Entry 0 was marked allocated above; it goes to the caller. */
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(pv_entry_spare += _NPCPV - 1);
 	return (pv);
 }
 
 /*
  * Unlink and return the pv entry for (pmap, va) from the given pv list
  * head.  Returns NULL, leaving the list untouched, when there is no such
  * entry.  The entry itself is not freed.
  */
 static __inline pv_entry_t
 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t cur;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	for (cur = TAILQ_FIRST(&pvh->pv_list); cur != NULL;
 	    cur = TAILQ_NEXT(cur, pv_next)) {
 		if (pmap != PV_PMAP(cur) || va != cur->pv_va)
 			continue;
 		TAILQ_REMOVE(&pvh->pv_list, cur, pv_next);
 		return (cur);
 	}
 	return (NULL);
 }
 
 /*
  * Convert the pv bookkeeping of a superpage mapping into per-page pv
  * entries.  The superpage's own pv entry is reused for the first 4 KB
  * page; a new entry is created for each of the remaining pages.
  * "pa" must be superpage aligned.
  */
 static void
 pmap_pv_demote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 {
 	pv_entry_t moved_pv;
 	vm_offset_t last_va;
 	vm_page_t pg;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_demote_pde: pa is not 4mpage aligned"));
 
 	/*
 	 * Move the 4mpage's pv entry for this mapping onto the pv list of
 	 * the first 4 KB page.
 	 */
 	va = trunc_4mpage(va);
 	moved_pv = pmap_pvh_remove(pa_to_pvh(pa), pmap, va);
 	KASSERT(moved_pv != NULL, ("pmap_pv_demote_pde: pv not found"));
 	pg = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&pg->md.pv_list, moved_pv, pv_next);
 
 	/* Create pv entries for the pages that follow the first one. */
 	last_va = va + NBPDR - PAGE_SIZE;
 	do {
 		pg++;
 		KASSERT((pg->oflags & VPO_UNMANAGED) == 0,
 		    ("pmap_pv_demote_pde: page %p is not managed", pg));
 		va += PAGE_SIZE;
 		pmap_insert_entry(pmap, va, pg);
 	} while (va < last_va);
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * Convert the per-page pv entries of a range that is being promoted into
  * a single pv entry on the superpage's pv list.  "pa" must be superpage
  * aligned.
  */
 static void
 pmap_pv_promote_pde(pmap_t pmap, vm_offset_t va, vm_paddr_t pa)
 {
 	pv_entry_t kept_pv;
 	vm_offset_t last_va;
 	vm_page_t pg;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT((pa & PDRMASK) == 0,
 	    ("pmap_pv_promote_pde: pa is not 4mpage aligned"));
 
 	/*
 	 * Reuse the first page's pv entry as the 4mpage's pv entry.  Besides
 	 * saving a call to get_pv_entry(), this means get_pv_entry() cannot
 	 * end up in pmap_pv_reclaim() and have one of the mappings that is
 	 * being promoted removed.
 	 */
 	pg = PHYS_TO_VM_PAGE(pa);
 	va = trunc_4mpage(va);
 	kept_pv = pmap_pvh_remove(&pg->md, pmap, va);
 	KASSERT(kept_pv != NULL, ("pmap_pv_promote_pde: pv not found"));
 	TAILQ_INSERT_TAIL(&pa_to_pvh(pa)->pv_list, kept_pv, pv_next);
 
 	/* Drop the pv entries of the pages that follow the first one. */
 	last_va = va + NBPDR - PAGE_SIZE;
 	do {
 		pg++;
 		va += PAGE_SIZE;
 		pmap_pvh_free(&pg->md, pmap, va);
 	} while (va < last_va);
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  * Remove the pv entry for (pmap, va) from the given pv list head and free
  * it.  The entry is expected to exist (asserted with KASSERT).
  */
 static void
 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t victim;
 
 	victim = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(victim != NULL, ("pmap_pvh_free: pv not found"));
 	free_pv_entry(pmap, victim);
 }
 
 /*
  * Free the pv entry for the 4 KB mapping (pmap, va) of page "m".  If the
  * page is not fictitious and is left with no pv entries on either its own
  * list or its superpage's list, PGA_WRITEABLE is cleared.
  */
 static void
 pmap_remove_entry(pmap_t pmap, vm_page_t m, vm_offset_t va)
 {
 	struct md_page *super_pvh;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	pmap_pvh_free(&m->md, pmap, va);
 
 	/* Other 4 KB mappings remain, or the page has no superpage list. */
 	if (!TAILQ_EMPTY(&m->md.pv_list) || (m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	super_pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	if (TAILQ_EMPTY(&super_pvh->pv_list))
 		vm_page_aflag_clear(m, PGA_WRITEABLE);
 }
 
 /*
  * Record the mapping (pmap, va) on page "m"'s pv list.  The pv entry is
  * obtained with get_pv_entry(pmap, FALSE), i.e. reclaiming is allowed.
  */
 static void
 pmap_insert_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t new_pv;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	new_pv = get_pv_entry(pmap, FALSE);
 	new_pv->pv_va = va;
 	TAILQ_INSERT_TAIL(&m->md.pv_list, new_pv, pv_next);
 }
 
 /*
  * Record the mapping (pmap, va) on page "m"'s pv list, but only if that
  * is possible without reclaiming: the pv entry count must be below the
  * high water mark and get_pv_entry() must succeed in "try" mode.
  * Returns TRUE if the pv entry was created and FALSE otherwise.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m)
 {
 	pv_entry_t new_pv;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	if (pv_entry_count >= pv_entry_high_water)
 		return (FALSE);
 	new_pv = get_pv_entry(pmap, TRUE);
 	if (new_pv == NULL)
 		return (FALSE);
 
 	new_pv->pv_va = va;
 	TAILQ_INSERT_TAIL(&m->md.pv_list, new_pv, pv_next);
 	return (TRUE);
 }
 
 /*
  * Create the pv entry for a superpage mapping and put it on the pv list
  * of the superpage addressed by "pde".  With PMAP_ENTER_NORECLAIM set in
  * "flags", the function fails instead of reclaiming pv entries.
  * Returns true on success and false if no pv entry could be obtained.
  */
 static bool
 pmap_pv_insert_pde(pmap_t pmap, vm_offset_t va, pd_entry_t pde, u_int flags)
 {
 	struct md_page *super_pvh;
 	pv_entry_t new_pv;
 	bool noreclaim;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 
 	noreclaim = (flags & PMAP_ENTER_NORECLAIM) != 0;
 	if (noreclaim && pv_entry_count >= pv_entry_high_water)
 		return (false);
 	new_pv = get_pv_entry(pmap, noreclaim);
 	if (new_pv == NULL)
 		return (false);
 
 	new_pv->pv_va = va;
 	super_pvh = pa_to_pvh(pde & PG_PS_FRAME);
 	TAILQ_INSERT_TAIL(&super_pvh->pv_list, new_pv, pv_next);
 	return (true);
 }
 
 /*
  * Initialize all NPTEPG entries of a page table page.  Entry 0 receives
  * "newpte" and each following entry receives the previous value plus
  * PAGE_SIZE, so the entries map consecutive physical pages.
  */
 static void
 pmap_fill_ptp(pt_entry_t *firstpte, pt_entry_t newpte)
 {
 	pt_entry_t *const endpte = firstpte + NPTEPG;
 	pt_entry_t *cur;
 
 	cur = firstpte;
 	while (cur < endpte) {
 		*cur++ = newpte;
 		newpte += PAGE_SIZE;
 	}
 }
 
 /*
  * Tries to demote a 2- or 4MB page mapping.  If demotion fails, the
  * 2- or 4MB page mapping is invalidated.
  *
  * "pde" points to the page directory entry that currently holds the
  * superpage mapping and "va" is an address within it.  The pmap must be
  * locked.  Returns TRUE if the PDE now references a page table page whose
  * entries map the same range, and FALSE if the mapping was removed
  * instead.
  */
 static boolean_t
 pmap_demote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	pd_entry_t newpde, oldpde;
 	pt_entry_t *firstpte, newpte;
 	vm_paddr_t mptepa;
 	vm_page_t mpte;
 	struct spglist free;
 	vm_offset_t sva;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	oldpde = *pde;
 	KASSERT((oldpde & (PG_PS | PG_V)) == (PG_PS | PG_V),
 	    ("pmap_demote_pde: oldpde is missing PG_PS and/or PG_V"));
 	/*
 	 * Prefer a page table page that was saved for this address (looked
 	 * up with pmap_remove_pt_page()).  The lookup is skipped when the
 	 * mapping was never accessed.
 	 */
 	if ((oldpde & PG_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
 	    NULL) {
 		KASSERT((oldpde & PG_W) == 0,
 		    ("pmap_demote_pde: page table page for a wired mapping"
 		    " is missing"));
 
 		/*
 		 * Invalidate the 2- or 4MB page mapping and return
 		 * "failure" if the mapping was never accessed or the
 		 * allocation of the new page table page fails.
 		 */
 		if ((oldpde & PG_A) == 0 || (mpte = vm_page_alloc(NULL,
 		    va >> PDRSHIFT, VM_ALLOC_NOOBJ | VM_ALLOC_NORMAL |
 		    VM_ALLOC_WIRED)) == NULL) {
 			SLIST_INIT(&free);
 			sva = trunc_4mpage(va);
 			pmap_remove_pde(pmap, pde, sva, &free);
 			/* pmap_remove_pde() already invalidated PG_G mappings. */
 			if ((oldpde & PG_G) == 0)
 				pmap_invalidate_pde_page(pmap, sva, oldpde);
 			vm_page_free_pages_toq(&free, true);
 			CTR2(KTR_PMAP, "pmap_demote_pde: failure for va %#x"
 			    " in pmap %p", va, pmap);
 			return (FALSE);
 		}
 		/*
 		 * A newly allocated page table page of a user pmap will hold
 		 * NPTEPG mappings and counts as one resident page.
 		 */
 		if (pmap != kernel_pmap) {
 			mpte->wire_count = NPTEPG;
 			pmap->pm_stats.resident_count++;
 		}
 	}
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 
 	/*
 	 * If the page mapping is in the kernel's address space, then the
 	 * KPTmap can provide access to the page table page.  Otherwise,
 	 * temporarily map the page table page (mpte) into the kernel's
 	 * address space at either PADDR1 or PADDR2.
 	 */
 	if (pmap == kernel_pmap)
 		firstpte = &KPTmap[i386_btop(trunc_4mpage(va))];
 	else if (curthread->td_pinned > 0 && rw_wowned(&pvh_global_lock)) {
 		/*
 		 * PMAP1/PADDR1 is only used while the thread is pinned and
 		 * holds the pvh global lock; no extra mutex is taken.
 		 */
 		if ((*PMAP1 & PG_FRAME) != mptepa) {
 			*PMAP1 = mptepa | PG_RW | PG_V | PG_A | PG_M;
 #ifdef SMP
 			PMAP1cpu = PCPU_GET(cpuid);
 #endif
 			invlcaddr(PADDR1);
 			PMAP1changed++;
 		} else
 #ifdef SMP
 		if (PMAP1cpu != PCPU_GET(cpuid)) {
 			/* Same frame, but the window was last set up elsewhere. */
 			PMAP1cpu = PCPU_GET(cpuid);
 			invlcaddr(PADDR1);
 			PMAP1changedcpu++;
 		} else
 #endif
 			PMAP1unchanged++;
 		firstpte = PADDR1;
 	} else {
 		/* PMAP2/PADDR2 is protected by PMAP2mutex, released below. */
 		mtx_lock(&PMAP2mutex);
 		if ((*PMAP2 & PG_FRAME) != mptepa) {
 			*PMAP2 = mptepa | PG_RW | PG_V | PG_A | PG_M;
 			pmap_invalidate_page_int(kernel_pmap,
 			    (vm_offset_t)PADDR2);
 		}
 		firstpte = PADDR2;
 	}
 	/* The new PDE references the page table page; only PG_U is inherited. */
 	newpde = mptepa | PG_M | PG_A | (oldpde & PG_U) | PG_RW | PG_V;
 	KASSERT((oldpde & PG_A) != 0,
 	    ("pmap_demote_pde: oldpde is missing PG_A"));
 	KASSERT((oldpde & (PG_M | PG_RW)) != PG_RW,
 	    ("pmap_demote_pde: oldpde is missing PG_M"));
 	/*
 	 * Build the template PTE from the old PDE: drop PG_PS and, if the
 	 * PDE-format PAT bit is set, move it to the PTE-format PAT bit.
 	 */
 	newpte = oldpde & ~PG_PS;
 	if ((newpte & PG_PDE_PAT) != 0)
 		newpte ^= PG_PDE_PAT | PG_PTE_PAT;
 
 	/*
 	 * If the page table page is not leftover from an earlier promotion,
 	 * initialize it.
 	 */
 	if (mpte->valid == 0)
 		pmap_fill_ptp(firstpte, newpte);
 
 	KASSERT((*firstpte & PG_FRAME) == (newpte & PG_FRAME),
 	    ("pmap_demote_pde: firstpte and newpte map different physical"
 	    " addresses"));
 
 	/*
 	 * If the mapping has changed attributes, update the page table
 	 * entries.
 	 */
 	if ((*firstpte & PG_PTE_PROMOTE) != (newpte & PG_PTE_PROMOTE))
 		pmap_fill_ptp(firstpte, newpte);
 	
 	/*
 	 * Demote the mapping.  This pmap is locked.  The old PDE has
 	 * PG_A set.  If the old PDE has PG_RW set, it also has PG_M
 	 * set.  Thus, there is no danger of a race with another
 	 * processor changing the setting of PG_A and/or PG_M between
 	 * the read above and the store below.
 	 */
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, newpde);
 	else if (pmap == kernel_pmap)
 		pmap_kenter_pde(va, newpde);
 	else
 		pde_store(pde, newpde);	
 	if (firstpte == PADDR2)
 		mtx_unlock(&PMAP2mutex);
 
 	/*
 	 * Invalidate the recursive mapping of the page table page.
 	 */
 	pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va));
 
 	/*
 	 * Demote the pv entry.  This depends on the earlier demotion
 	 * of the mapping.  Specifically, the (re)creation of a per-
 	 * page pv entry might trigger the execution of pmap_pv_reclaim(),
 	 * which might reclaim a newly (re)created per-page pv entry
 	 * and destroy the associated mapping.  In order to destroy
 	 * the mapping, the PDE must have already changed from mapping
 	 * the 2- or 4MB page to referencing the page table page.
 	 */
 	if ((oldpde & PG_MANAGED) != 0)
 		pmap_pv_demote_pde(pmap, va, oldpde & PG_PS_FRAME);
 
 	pmap_pde_demotions++;
 	CTR2(KTR_PMAP, "pmap_demote_pde: success for va %#x"
 	    " in pmap %p", va, pmap);
 	return (TRUE);
 }
 
 /*
  * Removes a 2- or 4MB page mapping from the kernel pmap by replacing it
  * with a PDE that references the page table page saved for that address.
  * Panics if no such page table page exists.
  */
 static void
 pmap_remove_kernel_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	pd_entry_t ptp_pde;
 	vm_page_t ptp;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	ptp = pmap_remove_pt_page(pmap, va);
 	if (ptp == NULL)
 		panic("pmap_remove_kernel_pde: Missing pt page.");
 
 	/*
 	 * A page table page that was unmapped by a promotion still holds
 	 * valid mappings; zero it so that they become invalid.
 	 */
 	if (ptp->valid != 0)
 		pagezero((void *)&KPTmap[i386_btop(trunc_4mpage(va))]);
 
 	/* Replace the superpage PDE with one for the page table page. */
 	ptp_pde = VM_PAGE_TO_PHYS(ptp) | PG_M | PG_A | PG_RW | PG_V;
 	if (!workaround_erratum383)
 		pmap_kenter_pde(va, ptp_pde);
 	else
 		pmap_update_pde(pmap, va, pde, ptp_pde);
 
 	/* Invalidate the recursive mapping of the page table page. */
 	pmap_invalidate_page_int(pmap, (vm_offset_t)vtopte(va));
 }
 
 /*
  * pmap_remove_pde: do the things to unmap a superpage in a process
  *
  * Clears the PDE at "pdq", which maps the superpage starting at "sva"
  * (must be superpage aligned), and updates the pmap statistics and, for
  * managed mappings, the pv lists and page flags.  A page table page that
  * was saved for this range in a user pmap is queued on "free" for the
  * caller to release.  Only PG_G mappings are invalidated in the TLB here;
  * other invalidation is left to the caller.  The pmap must be locked.
  */
 static void
 pmap_remove_pde(pmap_t pmap, pd_entry_t *pdq, vm_offset_t sva,
     struct spglist *free)
 {
 	struct md_page *pvh;
 	pd_entry_t oldpde;
 	vm_offset_t eva, va;
 	vm_page_t m, mpte;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & PDRMASK) == 0,
 	    ("pmap_remove_pde: sva is not 4mpage aligned"));
 	oldpde = pte_load_clear(pdq);
 	if (oldpde & PG_W)
 		pmap->pm_stats.wired_count -= NBPDR / PAGE_SIZE;
 
 	/*
 	 * Machines that don't support invlpg, also don't support
 	 * PG_G.
 	 */
 	if ((oldpde & PG_G) != 0)
 		pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 
 	pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 	if (oldpde & PG_MANAGED) {
 		/* Free the superpage's pv entry for this mapping. */
 		pvh = pa_to_pvh(oldpde & PG_PS_FRAME);
 		pmap_pvh_free(pvh, pmap, sva);
 		eva = sva + NBPDR;
 		/*
 		 * Propagate the modified and referenced state to every 4 KB
 		 * page of the superpage, and clear PGA_WRITEABLE on pages
 		 * that no longer have any pv entry.
 		 */
 		for (va = sva, m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 		    va < eva; va += PAGE_SIZE, m++) {
 			if ((oldpde & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(m);
 			if (oldpde & PG_A)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
 	}
 	if (pmap == kernel_pmap) {
 		/* The kernel PDE is re-pointed at its page table page. */
 		pmap_remove_kernel_pde(pmap, pdq, sva);
 	} else {
 		/* Release the page table page saved by a promotion, if any. */
 		mpte = pmap_remove_pt_page(pmap, sva);
 		if (mpte != NULL) {
 			KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
 			    ("pmap_remove_pde: pte page not promoted"));
 			pmap->pm_stats.resident_count--;
 			KASSERT(mpte->wire_count == NPTEPG,
 			    ("pmap_remove_pde: pte page wire count error"));
 			mpte->wire_count = 0;
 			pmap_add_delayed_free_list(mpte, free, FALSE);
 		}
 	}
 }
 
 /*
  * pmap_remove_pte: unmap one 4 KB page in a process.
  *
  * Clears the PTE at "ptq" for address "va", adjusts the pmap statistics,
  * passes the modified/referenced state on to a managed page and frees its
  * pv entry.  Only a PG_G mapping is invalidated in the TLB here.  The
  * return value is that of pmap_unuse_pt() for the page table page that
  * covers "va".
  */
 static int
 pmap_remove_pte(pmap_t pmap, pt_entry_t *ptq, vm_offset_t va,
     struct spglist *free)
 {
 	pt_entry_t cleared;
 	vm_page_t pg;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	cleared = pte_load_clear(ptq);
 	KASSERT(cleared != 0,
 	    ("pmap_remove_pte: pmap %p va %x zero pte", pmap, va));
 	if ((cleared & PG_W) != 0)
 		pmap->pm_stats.wired_count--;
 
 	/* Machines that don't support invlpg, also don't support PG_G. */
 	if ((cleared & PG_G) != 0)
 		pmap_invalidate_page_int(kernel_pmap, va);
 
 	pmap->pm_stats.resident_count--;
 	if ((cleared & PG_MANAGED) != 0) {
 		pg = PHYS_TO_VM_PAGE(cleared & PG_FRAME);
 		if ((cleared & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(pg);
 		if ((cleared & PG_A) != 0)
 			vm_page_aflag_set(pg, PGA_REFERENCED);
 		pmap_remove_entry(pmap, pg, va);
 	}
 	return (pmap_unuse_pt(pmap, va, free));
 }
 
 /*
  * Remove the mapping of a single page at "va" from a process address
  * space and invalidate it in the TLB.  Does nothing if there is no PTE
  * for "va" or the PTE is zero.  The calling thread must be pinned.
  */
 static void
 pmap_remove_page(pmap_t pmap, vm_offset_t va, struct spglist *free)
 {
 	pt_entry_t *ptep;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	ptep = pmap_pte_quick(pmap, va);
 	if (ptep == NULL)
 		return;
 	if (*ptep == 0)
 		return;
 
 	pmap_remove_pte(pmap, ptep, va, free);
 	pmap_invalidate_page_int(pmap, va);
 }
 
 /*
  * Removes the mappings in [sva, eva) from a single page table page.  The
  * scan ends early when pmap_remove_pte() returns non-zero.  Returns true
  * if at least one non-PG_G mapping was encountered, in which case the
  * caller has to invalidate the TLB.  The calling thread must be pinned.
  */
 static bool
 pmap_remove_ptes(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     struct spglist *free)
 {
 	pt_entry_t *ptep;
 	bool need_inval;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT(curthread->td_pinned > 0, ("curthread not pinned"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	need_inval = false;
 	ptep = pmap_pte_quick(pmap, sva);
 	while (sva != eva) {
 		if (*ptep != 0) {
 			/*
 			 * The TLB entry for a PG_G mapping is invalidated by
 			 * pmap_remove_pte().
 			 */
 			if ((*ptep & PG_G) == 0)
 				need_inval = true;
 
 			if (pmap_remove_pte(pmap, ptep, sva, free) != 0)
 				break;
 		}
 		ptep++;
 		sva += PAGE_SIZE;
 	}
 	return (need_inval);
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	The range [sva, eva) is processed one page directory entry at a
  *	time.  A superpage that lies entirely inside the range is removed
  *	as a unit; a partially covered one is demoted first.  Page table
  *	pages released along the way are collected on a local list and
  *	freed after the locks are dropped.
  */
 static void
 __CONCAT(PMTYPE, remove)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	struct spglist free;
 	int anyvalid;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	/* Set when a mapping was removed that was not invalidated already. */
 	anyvalid = 0;
 	SLIST_INIT(&free);
 
 	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	PMAP_LOCK(pmap);
 
 	/*
 	 * special handling of removing one page.  a very
 	 * common operation and easy to short circuit some
 	 * code.  pmap_remove_page() invalidates the page itself,
 	 * so "anyvalid" is left at 0.
 	 */
 	if ((sva + PAGE_SIZE == eva) && 
 	    ((pmap->pm_pdir[(sva >> PDRSHIFT)] & PG_PS) == 0)) {
 		pmap_remove_page(pmap, sva, &free);
 		goto out;
 	}
 
 	for (; sva < eva; sva = pdnxt) {
 		u_int pdirindex;
 
 		/*
 		 * Calculate index for next page table.  If the addition
 		 * wrapped around, stop at the end of the range instead.
 		 */
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 		if (pdnxt < sva)
 			pdnxt = eva;
 		/* Nothing is mapped any more; the rest can be skipped. */
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		pdirindex = sva >> PDRSHIFT;
 		ptpaddr = pmap->pm_pdir[pdirindex];
 
 		/*
 		 * Weed out invalid mappings. Note: we assume that the page
 		 * directory table is always allocated, and in kernel virtual.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			/*
 			 * Are we removing the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
 				/*
 				 * The TLB entry for a PG_G mapping is
 				 * invalidated by pmap_remove_pde().
 				 */
 				if ((ptpaddr & PG_G) == 0)
 					anyvalid = 1;
 				pmap_remove_pde(pmap,
 				    &pmap->pm_pdir[pdirindex], sva, &free);
 				continue;
 			} else if (!pmap_demote_pde(pmap,
 			    &pmap->pm_pdir[pdirindex], sva)) {
 				/* The large page mapping was destroyed. */
 				continue;
 			}
 		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (pdnxt > eva)
 			pdnxt = eva;
 
 		if (pmap_remove_ptes(pmap, sva, pdnxt, &free))
 			anyvalid = 1;
 	}
 out:
 	sched_unpin();
 	if (anyvalid)
 		pmap_invalidate_all_int(pmap);
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	/* Free the collected page table pages only after unlocking. */
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The page must be managed.  Superpage mappings that
  *		include the page are demoted first; the resulting
  *		4 KB mappings are then removed one by one through the
  *		page's pv list.
  */
 
 static void
 __CONCAT(PMTYPE, remove_all)(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	pmap_t pmap;
 	pt_entry_t *pte, tpte;
 	pd_entry_t *pde;
 	vm_offset_t va;
 	struct spglist free;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
 	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	/* Fictitious pages skip the superpage pv list lookup. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	/*
 	 * Demote every superpage mapping of the page.  The result is
 	 * ignored: on success the pv entry moves to the 4 KB lists, and on
 	 * failure pmap_demote_pde() removes the mapping.
 	 */
 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 		va = pv->pv_va;
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, va);
 		(void)pmap_demote_pde(pmap, pde, va);
 		PMAP_UNLOCK(pmap);
 	}
 small_mappings:
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pmap->pm_stats.resident_count--;
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_all: found"
 		    " a 4mpage in page %p's pv list", m));
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 		tpte = pte_load_clear(pte);
 		KASSERT(tpte != 0, ("pmap_remove_all: pmap %p va %x zero pte",
 		    pmap, pv->pv_va));
 		if (tpte & PG_W)
 			pmap->pm_stats.wired_count--;
 		if (tpte & PG_A)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		pmap_unuse_pt(pmap, pv->pv_va, &free);
 		pmap_invalidate_page_int(pmap, pv->pv_va);
 		/* Unlink the pv entry before handing it to free_pv_entry(). */
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	/* No mappings remain, so none can be writeable. */
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  * pmap_protect_pde: do the things to protect a 4mpage in a process
  *
  * Applies "prot" to the superpage mapping at "pde" whose start address is
  * "sva" (must be superpage aligned).  Write permission is removed when
  * "prot" lacks VM_PROT_WRITE; under PMAP_PAE_COMP, pg_nx is set when
  * "prot" lacks VM_PROT_EXECUTE and i386_read_exec is not set.
  *
  * Returns TRUE if the PDE changed and the caller still has to invalidate
  * the TLB; a changed PG_G mapping is invalidated here and does not make
  * the result TRUE.  The pmap must be locked.
  */
 static boolean_t
 pmap_protect_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t sva, vm_prot_t prot)
 {
 	pd_entry_t newpde, oldpde;
 	vm_page_t m, mt;
 	boolean_t anychanged;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & PDRMASK) == 0,
 	    ("pmap_protect_pde: sva is not 4mpage aligned"));
 	anychanged = FALSE;
 retry:
 	/* Re-read the PDE on every attempt; the cmpset below may fail. */
 	oldpde = newpde = *pde;
 	if ((prot & VM_PROT_WRITE) == 0) {
 		/*
 		 * A managed, writeable, modified superpage: mark each of
 		 * its 4 KB pages dirty before PG_M is cleared.
 		 */
 		if ((oldpde & (PG_MANAGED | PG_M | PG_RW)) ==
 		    (PG_MANAGED | PG_M | PG_RW)) {
 			m = PHYS_TO_VM_PAGE(oldpde & PG_PS_FRAME);
 			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 				vm_page_dirty(mt);
 		}
 		newpde &= ~(PG_RW | PG_M);
 	}
 #ifdef PMAP_PAE_COMP
 	if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec)
 		newpde |= pg_nx;
 #endif
 	if (newpde != oldpde) {
 		/*
 		 * As an optimization to future operations on this PDE, clear
 		 * PG_PROMOTED.  The impending invalidation will remove any
 		 * lingering 4KB page mappings from the TLB.
 		 */
 		if (!pde_cmpset(pde, oldpde, newpde & ~PG_PROMOTED))
 			goto retry;
 		if ((oldpde & PG_G) != 0)
 			pmap_invalidate_pde_page(kernel_pmap, sva, oldpde);
 		else
 			anychanged = TRUE;
 	}
 	return (anychanged);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	VM_PROT_NONE is handled by removing the range.  A request that
  *	would not take any permission away returns immediately: one that
  *	includes write access without PMAP_PAE_COMP, or one that includes
  *	both write and execute access with PMAP_PAE_COMP.
  *
  *	The pvh global lock is taken up front only when the pmap is not
  *	the current one.  For the current pmap it is acquired lazily, the
  *	first time a superpage has to be demoted.
  */
 static void
 __CONCAT(PMTYPE, protect)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     vm_prot_t prot)
 {
 	vm_offset_t pdnxt;
 	pd_entry_t ptpaddr;
 	pt_entry_t *pte;
 	boolean_t anychanged, pv_lists_locked;
 
 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
 	if (prot == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 #ifdef PMAP_PAE_COMP
 	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
 		return;
 #else
 	if (prot & VM_PROT_WRITE)
 		return;
 #endif
 
 	if (pmap_is_current(pmap))
 		pv_lists_locked = FALSE;
 	else {
 		pv_lists_locked = TRUE;
 		/*
 		 * "resume" is also entered from the loop below after a
 		 * failed try-lock.  The pmap lock has been dropped at that
 		 * point, and the scan continues from the current "sva".
 		 */
 resume:
 		rw_wlock(&pvh_global_lock);
 		sched_pin();
 	}
 	anychanged = FALSE;
 
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = pdnxt) {
 		pt_entry_t obits, pbits;
 		u_int pdirindex;
 
 		/* Start of the next PDE's range; stop at eva on wraparound. */
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 		if (pdnxt < sva)
 			pdnxt = eva;
 
 		pdirindex = sva >> PDRSHIFT;
 		ptpaddr = pmap->pm_pdir[pdirindex];
 
 		/*
 		 * Weed out invalid mappings. Note: we assume that the page
 		 * directory table is always allocated, and in kernel virtual.
 		 */
 		if (ptpaddr == 0)
 			continue;
 
 		/*
 		 * Check for large page.
 		 */
 		if ((ptpaddr & PG_PS) != 0) {
 			/*
 			 * Are we protecting the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
 				/*
 				 * The TLB entry for a PG_G mapping is
 				 * invalidated by pmap_protect_pde().
 				 */
 				if (pmap_protect_pde(pmap,
 				    &pmap->pm_pdir[pdirindex], sva, prot))
 					anychanged = TRUE;
 				continue;
 			} else {
 				/*
 				 * Demotion needs the pvh global lock.  The
 				 * pmap lock is already held, so only a
 				 * try-lock is attempted; on failure, flush
 				 * what has changed so far, drop the pmap
 				 * lock, and take both locks again via
 				 * "resume".
 				 */
 				if (!pv_lists_locked) {
 					pv_lists_locked = TRUE;
 					if (!rw_try_wlock(&pvh_global_lock)) {
 						if (anychanged)
 							pmap_invalidate_all_int(
 							    pmap);
 						PMAP_UNLOCK(pmap);
 						goto resume;
 					}
 					sched_pin();
 				}
 				if (!pmap_demote_pde(pmap,
 				    &pmap->pm_pdir[pdirindex], sva)) {
 					/*
 					 * The large page mapping was
 					 * destroyed.
 					 */
 					continue;
 				}
 			}
 		}
 
 		/* Do not scan past the end of the requested range. */
 		if (pdnxt > eva)
 			pdnxt = eva;
 
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			vm_page_t m;
 
 retry:
 			/*
 			 * Regardless of whether a pte is 32 or 64 bits in
 			 * size, PG_RW, PG_A, and PG_M are among the least
 			 * significant 32 bits.
 			 */
 			obits = pbits = *pte;
 			if ((pbits & PG_V) == 0)
 				continue;
 
 			if ((prot & VM_PROT_WRITE) == 0) {
 				/* Record the dirty state before PG_M is cleared. */
 				if ((pbits & (PG_MANAGED | PG_M | PG_RW)) ==
 				    (PG_MANAGED | PG_M | PG_RW)) {
 					m = PHYS_TO_VM_PAGE(pbits & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				pbits &= ~(PG_RW | PG_M);
 			}
 #ifdef PMAP_PAE_COMP
 			if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec)
 				pbits |= pg_nx;
 #endif
 
 			if (pbits != obits) {
 				/*
 				 * Install the new PTE atomically; if the PTE
 				 * changed in the meantime, start over for
 				 * this entry.
 				 */
 #ifdef PMAP_PAE_COMP
 				if (!atomic_cmpset_64(pte, obits, pbits))
 					goto retry;
 #else
 				if (!atomic_cmpset_int((u_int *)pte, obits,
 				    pbits))
 					goto retry;
 #endif
 				if (obits & PG_G)
 					pmap_invalidate_page_int(pmap, sva);
 				else
 					anychanged = TRUE;
 			}
 		}
 	}
 	if (anychanged)
 		pmap_invalidate_all_int(pmap);
 	if (pv_lists_locked) {
 		sched_unpin();
 		rw_wunlock(&pvh_global_lock);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * Tries to promote the 512 or 1024, contiguous 4KB page mappings that are
  * within a single page table page (PTP) to a single 2- or 4MB page mapping.
  * For promotion to occur, two conditions must be met: (1) the 4KB page
  * mappings must map aligned, contiguous physical memory and (2) the 4KB page
  * mappings must have identical characteristics.
  *
  * Managed (PG_MANAGED) mappings within the kernel address space are not
  * promoted.  The reason is that kernel PDEs are replicated in each pmap but
  * pmap_clear_ptes() and pmap_ts_referenced() only read the PDE from the kernel
  * pmap.
  *
  * "pmap" must be locked by the caller.  "pde" is the page directory entry
  * that currently references the PTP and that is overwritten on success.
  * "va" is any virtual address within the candidate superpage; it is
  * truncated to the superpage boundary in order to locate the first PTE.
  *
  * Nothing is returned.  Every abort increments pmap_pde_p_failures and a
  * successful promotion increments pmap_pde_promotions.
  */
 static void
 pmap_promote_pde(pmap_t pmap, pd_entry_t *pde, vm_offset_t va)
 {
 	pd_entry_t newpde;
 	pt_entry_t *firstpte, oldpte, pa, *pte;
 	vm_offset_t oldpteva;
 	vm_page_t mpte;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Examine the first PTE in the specified PTP.  Abort if this PTE is
 	 * either invalid, unused, or does not map the first 4KB physical page
 	 * within a 2- or 4MB page.
 	 */
 	firstpte = pmap_pte_quick(pmap, trunc_4mpage(va));
 setpde:
 	/* Re-read the first PTE each time the compare-and-set below fails. */
 	newpde = *firstpte;
 	if ((newpde & ((PG_FRAME & PDRMASK) | PG_A | PG_V)) != (PG_A | PG_V)) {
 		pmap_pde_p_failures++;
 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 		    " in pmap %p", va, pmap);
 		return;
 	}
 	/* Managed mappings in the kernel pmap are never promoted. */
 	if ((*firstpte & PG_MANAGED) != 0 && pmap == kernel_pmap) {
 		pmap_pde_p_failures++;
 		CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 		    " in pmap %p", va, pmap);
 		return;
 	}
 	if ((newpde & (PG_M | PG_RW)) == PG_RW) {
 		/*
 		 * When PG_M is already clear, PG_RW can be cleared without
 		 * a TLB invalidation.
 		 */
 		if (!atomic_cmpset_int((u_int *)firstpte, newpde, newpde &
 		    ~PG_RW))  
 			goto setpde;
 		newpde &= ~PG_RW;
 	}
 
 	/*
 	 * Examine each of the other PTEs in the specified PTP.  Abort if this
 	 * PTE maps an unexpected 4KB physical page or does not have identical
 	 * characteristics to the first PTE.
 	 *
 	 * "pa" holds the frame address plus PG_A and PG_V that the PTE under
 	 * examination is expected to contain.  The scan starts at the last
 	 * PTE of the PTP and moves towards the first, so "pa" starts at the
 	 * last 4KB page of the superpage and decreases by PAGE_SIZE.
 	 */
 	pa = (newpde & (PG_PS_FRAME | PG_A | PG_V)) + NBPDR - PAGE_SIZE;
 	for (pte = firstpte + NPTEPG - 1; pte > firstpte; pte--) {
 setpte:
 		/* Re-read this PTE each time the compare-and-set fails. */
 		oldpte = *pte;
 		if ((oldpte & (PG_FRAME | PG_A | PG_V)) != pa) {
 			pmap_pde_p_failures++;
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 			    " in pmap %p", va, pmap);
 			return;
 		}
 		if ((oldpte & (PG_M | PG_RW)) == PG_RW) {
 			/*
 			 * When PG_M is already clear, PG_RW can be cleared
 			 * without a TLB invalidation.
 			 */
 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
 			    oldpte & ~PG_RW))
 				goto setpte;
 			oldpte &= ~PG_RW;
 			/* oldpteva is computed only for the trace record. */
 			oldpteva = (oldpte & PG_FRAME & PDRMASK) |
 			    (va & ~PDRMASK);
 			CTR2(KTR_PMAP, "pmap_promote_pde: protect for va %#x"
 			    " in pmap %p", oldpteva, pmap);
 		}
 		/* The attribute bits must match those of the first PTE. */
 		if ((oldpte & PG_PTE_PROMOTE) != (newpde & PG_PTE_PROMOTE)) {
 			pmap_pde_p_failures++;
 			CTR2(KTR_PMAP, "pmap_promote_pde: failure for va %#x"
 			    " in pmap %p", va, pmap);
 			return;
 		}
 		pa -= PAGE_SIZE;
 	}
 
 	/*
 	 * Save the page table page in its current state until the PDE
 	 * mapping the superpage is demoted by pmap_demote_pde() or
 	 * destroyed by pmap_remove_pde().
 	 */
 	mpte = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 	KASSERT(mpte >= vm_page_array &&
 	    mpte < &vm_page_array[vm_page_array_size],
 	    ("pmap_promote_pde: page table page is out of range"));
 	KASSERT(mpte->pindex == va >> PDRSHIFT,
 	    ("pmap_promote_pde: page table page's pindex is wrong"));
 	/* A nonzero return from pmap_insert_pt_page() aborts the promotion. */
 	if (pmap_insert_pt_page(pmap, mpte, true)) {
 		pmap_pde_p_failures++;
 		CTR2(KTR_PMAP,
 		    "pmap_promote_pde: failure for va %#x in pmap %p", va,
 		    pmap);
 		return;
 	}
 
 	/*
 	 * Promote the pv entries.
 	 */
 	if ((newpde & PG_MANAGED) != 0)
 		pmap_pv_promote_pde(pmap, va, newpde & PG_PS_FRAME);
 
 	/*
 	 * Propagate the PAT index to its proper position.
 	 */
 	if ((newpde & PG_PTE_PAT) != 0)
 		newpde ^= PG_PDE_PAT | PG_PTE_PAT;
 
 	/*
 	 * Map the superpage.  Note that PG_PROMOTED is not included in the
 	 * new PDE on the workaround_erratum383 path.
 	 */
 	if (workaround_erratum383)
 		pmap_update_pde(pmap, va, pde, PG_PS | newpde);
 	else if (pmap == kernel_pmap)
 		pmap_kenter_pde(va, PG_PROMOTED | PG_PS | newpde);
 	else
 		pde_store(pde, PG_PROMOTED | PG_PS | newpde);
 
 	pmap_pde_promotions++;
 	CTR2(KTR_PMAP, "pmap_promote_pde: success for va %#x"
 	    " in pmap %p", va, pmap);
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	"flags" carries the access type in its VM_PROT_* bits (a write
  *	access pre-sets PG_M) together with PMAP_ENTER_* options such as
  *	PMAP_ENTER_WIRED and PMAP_ENTER_NOSLEEP.  When "psind" is 1 the
  *	request is handed to pmap_enter_pde() and its result is returned.
  *
  *	Returns KERN_SUCCESS once the 4KB mapping has been installed, or
  *	KERN_RESOURCE_SHORTAGE when pmap_allocpte() could not provide a
  *	page table page.
  */
 static int
 __CONCAT(PMTYPE, enter)(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, u_int flags, int8_t psind)
 {
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	pt_entry_t newpte, origpte;
 	pv_entry_t pv;
 	vm_paddr_t opa, pa;
 	vm_page_t mpte, om;
 	int rv;
 
 	va = trunc_page(va);
 	KASSERT((pmap == kernel_pmap && va < VM_MAX_KERNEL_ADDRESS) ||
 	    (pmap != kernel_pmap && va < VM_MAXUSER_ADDRESS),
 	    ("pmap_enter: toobig k%d %#x", pmap == kernel_pmap, va));
 	KASSERT(va < PMAP_TRM_MIN_ADDRESS,
 	    ("pmap_enter: invalid to pmap_enter into trampoline (va: 0x%x)",
 	    va));
 	KASSERT(pmap != kernel_pmap || (m->oflags & VPO_UNMANAGED) != 0 ||
 	    va < kmi.clean_sva || va >= kmi.clean_eva,
 	    ("pmap_enter: managed mapping within the clean submap"));
 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 		VM_OBJECT_ASSERT_LOCKED(m->object);
 	KASSERT((flags & PMAP_ENTER_RESERVED) == 0,
 	    ("pmap_enter: flags %u has reserved bits set", flags));
 
 	/*
 	 * Assemble the new PTE before taking any locks.
 	 */
 	pa = VM_PAGE_TO_PHYS(m);
 	newpte = (pt_entry_t)(pa | PG_A | PG_V);
 	if ((flags & VM_PROT_WRITE) != 0)
 		newpte |= PG_M;
 	if ((prot & VM_PROT_WRITE) != 0)
 		newpte |= PG_RW;
 	KASSERT((newpte & (PG_M | PG_RW)) != PG_M,
 	    ("pmap_enter: flags includes VM_PROT_WRITE but prot doesn't"));
 #ifdef PMAP_PAE_COMP
 	if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec)
 		newpte |= pg_nx;
 #endif
 	if ((flags & PMAP_ENTER_WIRED) != 0)
 		newpte |= PG_W;
 	if (pmap != kernel_pmap)
 		newpte |= PG_U;
 	newpte |= pmap_cache_bits(pmap, m->md.pat_mode, psind > 0);
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		newpte |= PG_MANAGED;
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	sched_pin();
 	if (psind == 1) {
 		/* Assert the required virtual and physical alignment. */
 		KASSERT((va & PDRMASK) == 0, ("pmap_enter: va unaligned"));
 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 		rv = pmap_enter_pde(pmap, va, newpte | PG_PS, flags, m);
 		goto out;
 	}
 
 	pde = pmap_pde(pmap, va);
 	if (pmap != kernel_pmap) {
 		/*
 		 * va is for UVA.
 		 * In the case that a page table page is not resident,
 		 * we are creating it here.  pmap_allocpte() handles
 		 * demotion.
 		 */
 		mpte = pmap_allocpte(pmap, va, flags);
 		if (mpte == NULL) {
 			KASSERT((flags & PMAP_ENTER_NOSLEEP) != 0,
 			    ("pmap_allocpte failed with sleep allowed"));
 			rv = KERN_RESOURCE_SHORTAGE;
 			goto out;
 		}
 	} else {
 		/*
 		 * va is for KVA, so pmap_demote_pde() will never fail
 		 * to install a page table page.  PG_V is also
 		 * asserted by pmap_demote_pde().
 		 */
 		mpte = NULL;
 		KASSERT(pde != NULL && (*pde & PG_V) != 0,
 		    ("KVA %#x invalid pde pdir %#jx", va,
 		    (uintmax_t)pmap->pm_pdir[PTDPTDI]));
 		if ((*pde & PG_PS) != 0)
 			pmap_demote_pde(pmap, pde, va);
 	}
 	pte = pmap_pte_quick(pmap, va);
 
 	/*
 	 * Page Directory table entry is not valid, which should not
 	 * happen.  We should have either allocated the page table
 	 * page or demoted the existing mapping above.
 	 */
 	if (pte == NULL) {
 		panic("pmap_enter: invalid page directory pdir=%#jx, va=%#x",
 		    (uintmax_t)pmap->pm_pdir[PTDPTDI], va);
 	}
 
 	origpte = *pte;
 	pv = NULL;
 
 	/*
 	 * Is the specified virtual address already mapped?
 	 */
 	if ((origpte & PG_V) != 0) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if ((newpte & PG_W) != 0 && (origpte & PG_W) == 0)
 			pmap->pm_stats.wired_count++;
 		else if ((newpte & PG_W) == 0 && (origpte & PG_W) != 0)
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove the extra PT page reference.
 		 */
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			KASSERT(mpte->wire_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%x", va));
 		}
 
 		/*
 		 * Has the physical page changed?
 		 */
 		opa = origpte & PG_FRAME;
 		if (opa == pa) {
 			/*
 			 * No, might be a protection or wiring change.
 			 */
 			if ((origpte & PG_MANAGED) != 0 &&
 			    (newpte & PG_RW) != 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 			/*
 			 * If the old and new PTEs differ in at most PG_M
 			 * and PG_A, then the PTE is left untouched.
 			 */
 			if (((origpte ^ newpte) & ~(PG_M | PG_A)) == 0)
 				goto unchanged;
 			goto validate;
 		}
 
 		/*
 		 * The physical page has changed.  Temporarily invalidate
 		 * the mapping.  This ensures that all threads sharing the
 		 * pmap keep a consistent view of the mapping, which is
 		 * necessary for the correct handling of COW faults.  It
 		 * also permits reuse of the old mapping's PV entry,
 		 * avoiding an allocation.
 		 *
 		 * For consistency, handle unmanaged mappings the same way.
 		 */
 		origpte = pte_load_clear(pte);
 		KASSERT((origpte & PG_FRAME) == opa,
 		    ("pmap_enter: unexpected pa update for %#x", va));
 		if ((origpte & PG_MANAGED) != 0) {
 			om = PHYS_TO_VM_PAGE(opa);
 
 			/*
 			 * The pmap lock is sufficient to synchronize with
 			 * concurrent calls to pmap_page_test_mappings() and
 			 * pmap_ts_referenced().
 			 */
 			if ((origpte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 				vm_page_dirty(om);
 			if ((origpte & PG_A) != 0)
 				vm_page_aflag_set(om, PGA_REFERENCED);
 			pv = pmap_pvh_remove(&om->md, pmap, va);
 			KASSERT(pv != NULL,
 			    ("pmap_enter: no PV entry for %#x", va));
 			/*
 			 * The old PV entry is reused below only if the new
 			 * mapping is managed; otherwise it is freed here.
 			 */
 			if ((newpte & PG_MANAGED) == 0)
 				free_pv_entry(pmap, pv);
 			if ((om->aflags & PGA_WRITEABLE) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list) &&
 			    ((om->flags & PG_FICTITIOUS) != 0 ||
 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
 		}
 		if ((origpte & PG_A) != 0)
 			pmap_invalidate_page_int(pmap, va);
 		/*
 		 * The old mapping is gone, so the PTE update below takes
 		 * the "no previous mapping" path.
 		 */
 		origpte = 0;
 	} else {
 		/*
 		 * Increment the counters.
 		 */
 		if ((newpte & PG_W) != 0)
 			pmap->pm_stats.wired_count++;
 		pmap->pm_stats.resident_count++;
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((newpte & PG_MANAGED) != 0) {
 		if (pv == NULL) {
 			pv = get_pv_entry(pmap, FALSE);
 			pv->pv_va = va;
 		}
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		if ((newpte & PG_RW) != 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
 	/*
 	 * Update the PTE.  The "validate" label is also reached directly
 	 * from the same-physical-page case above.
 	 */
 	if ((origpte & PG_V) != 0) {
 validate:
 		origpte = pte_load_store(pte, newpte);
 		KASSERT((origpte & PG_FRAME) == pa,
 		    ("pmap_enter: unexpected pa update for %#x", va));
 		if ((newpte & PG_M) == 0 && (origpte & (PG_M | PG_RW)) ==
 		    (PG_M | PG_RW)) {
 			if ((origpte & PG_MANAGED) != 0)
 				vm_page_dirty(m);
 
 			/*
 			 * Although the PTE may still have PG_RW set, TLB
 			 * invalidation may nonetheless be required because
 			 * the PTE no longer has PG_M set.
 			 */
 		}
 #ifdef PMAP_PAE_COMP
 		else if ((origpte & PG_NX) != 0 || (newpte & PG_NX) == 0) {
 			/*
 			 * This PTE change does not require TLB invalidation.
 			 */
 			goto unchanged;
 		}
 #endif
 		if ((origpte & PG_A) != 0)
 			pmap_invalidate_page_int(pmap, va);
 	} else
 		pte_store_zero(pte, newpte);
 
 unchanged:
 
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * If both the page table page and the reservation are fully
 	 * populated, then attempt promotion.
 	 */
 	if ((mpte == NULL || mpte->wire_count == NPTEPG) &&
 	    pg_ps_enabled && (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0)
 		pmap_promote_pde(pmap, pde, va);
 #endif
 
 	rv = KERN_SUCCESS;
 out:
 	/* Common exit: undo the pin and drop both locks. */
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  * Attempts to install a 2 or 4 MB page mapping of "m" at "va" that grants at
  * most read and/or execute access.  The request is forwarded to
  * pmap_enter_pde() with PMAP_ENTER_NOSLEEP, PMAP_ENTER_NOREPLACE, and
  * PMAP_ENTER_NORECLAIM, and true is returned only if that call reports
  * KERN_SUCCESS.  Consequently, false is returned if (1) a mapping already
  * exists at the specified virtual address or (2) a PV entry cannot be
  * allocated without reclaiming another PV entry.
  *
  * The caller must hold the pmap lock.
  */
 static bool
 pmap_enter_4mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	pd_entry_t pde_bits;
 	int error;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Frame address, superpage cache attributes, and the fixed bits. */
 	pde_bits = VM_PAGE_TO_PHYS(m);
 	pde_bits |= pmap_cache_bits(pmap, m->md.pat_mode, 1);
 	pde_bits |= PG_PS | PG_V;
 
 	/* Optional attribute bits. */
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		pde_bits |= PG_MANAGED;
 #ifdef PMAP_PAE_COMP
 	if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec)
 		pde_bits |= pg_nx;
 #endif
 	if (pmap != kernel_pmap)
 		pde_bits |= PG_U;
 
 	error = pmap_enter_pde(pmap, va, pde_bits, PMAP_ENTER_NOSLEEP |
 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL);
 	return (error == KERN_SUCCESS);
 }
 
 /*
  * Tries to create the specified 2 or 4 MB page mapping.  Returns KERN_SUCCESS
  * if the mapping was created, and either KERN_FAILURE or
  * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
  * PMAP_ENTER_NOREPLACE was specified and a mapping already exists at the
  * specified virtual address.  Returns KERN_RESOURCE_SHORTAGE if
  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
  *
  * The parameter "m" is only used when creating a managed, writeable mapping.
  *
  * The caller must hold both the pvh global lock (write locked) and the pmap
  * lock.  "newpde" is the complete PDE value to store; a writeable "newpde"
  * must already have PG_M set, and PG_W is only permitted for the kernel pmap.
  */
 static int
 pmap_enter_pde(pmap_t pmap, vm_offset_t va, pd_entry_t newpde, u_int flags,
     vm_page_t m)
 {
 	struct spglist free;
 	pd_entry_t oldpde, *pde;
 	vm_page_t mt;
 
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	KASSERT((newpde & (PG_M | PG_RW)) != PG_RW,
 	    ("pmap_enter_pde: newpde is missing PG_M"));
 	KASSERT(pmap == kernel_pmap || (newpde & PG_W) == 0,
 	    ("pmap_enter_pde: cannot create wired user mapping"));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pde = pmap_pde(pmap, va);
 	oldpde = *pde;
 	if ((oldpde & PG_V) != 0) {
 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return (KERN_FAILURE);
 		}
 		/* Break the existing mapping(s). */
 		SLIST_INIT(&free);
 		if ((oldpde & PG_PS) != 0) {
 			/*
 			 * If the PDE resulted from a promotion, then a
 			 * reserved PT page could be freed.
 			 */
 			(void)pmap_remove_pde(pmap, pde, va, &free);
 			if ((oldpde & PG_G) == 0)
 				pmap_invalidate_pde_page(pmap, va, oldpde);
 		} else {
 			/*
 			 * The old PDE referenced a page table page; remove
 			 * every 4KB mapping in the superpage's range.
 			 */
 			if (pmap_remove_ptes(pmap, va, va + NBPDR, &free))
 		               pmap_invalidate_all_int(pmap);
 		}
 		/* Release any pages collected on the "free" list above. */
 		vm_page_free_pages_toq(&free, true);
 		if (pmap == kernel_pmap) {
 			/*
 			 * Both pmap_remove_pde() and pmap_remove_ptes() will
 			 * leave the kernel page table page zero filled.
 			 */
 			mt = PHYS_TO_VM_PAGE(*pde & PG_FRAME);
 			if (pmap_insert_pt_page(pmap, mt, false))
 				panic("pmap_enter_pde: trie insert failed");
 		} else
 			KASSERT(*pde == 0, ("pmap_enter_pde: non-zero pde %p",
 			    pde));
 	}
 	if ((newpde & PG_MANAGED) != 0) {
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
 		if (!pmap_pv_insert_pde(pmap, va, newpde, flags)) {
 			CTR2(KTR_PMAP, "pmap_enter_pde: failure for va %#lx"
 			    " in pmap %p", va, pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 		if ((newpde & PG_RW) != 0) {
 			/* Flag every 4KB page of the superpage as writeable. */
 			for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 				vm_page_aflag_set(mt, PGA_WRITEABLE);
 		}
 	}
 
 	/*
 	 * Increment counters.
 	 */
 	if ((newpde & PG_W) != 0)
 		pmap->pm_stats.wired_count += NBPDR / PAGE_SIZE;
 	pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
 
 	/*
 	 * Map the superpage.  (This is not a promoted mapping; there will not
 	 * be any lingering 4KB page mappings in the TLB.)
 	 */
 	pde_store(pde, newpde);
 
 	pmap_pde_mappings++;
 	CTR2(KTR_PMAP, "pmap_enter_pde: success for va %#lx"
 	    " in pmap %p", va, pmap);
 	return (KERN_SUCCESS);
 }
 
 /*
  * Maps a run of resident pages that belong to one object, starting with
  * m_start, which is mapped at the virtual address start.  Every later page
  * on the object's page list is mapped at start plus the distance, in pages,
  * between its pindex and that of m_start.  The walk stops at the end of the
  * list or at the first page whose distance would place it at or beyond the
  * virtual address end.  Virtual pages for which no resident page exists are
  * simply left unmapped.
  *
  * A superpage mapping is attempted whenever the virtual address is superpage
  * aligned, the whole superpage fits below end, the page has psind 1, and
  * superpages are enabled; otherwise, or if that attempt fails, the page is
  * mapped through pmap_enter_quick_locked().
  */
 static void
 __CONCAT(PMTYPE, enter_object)(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	vm_offset_t va;
 	vm_page_t m, mpte;
 	vm_pindex_t npages, off;
 
 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
 
 	npages = atop(end - start);
 	mpte = NULL;
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	for (m = m_start; m != NULL; m = TAILQ_NEXT(m, listq)) {
 		off = m->pindex - m_start->pindex;
 		if (off >= npages)
 			break;
 		va = start + ptoa(off);
 		if ((va & PDRMASK) == 0 && va + NBPDR <= end &&
 		    m->psind == 1 && pg_ps_enabled &&
 		    pmap_enter_4mpage(pmap, va, m, prot)) {
 			/*
 			 * Step to the last 4KB page of the superpage so that
 			 * the loop advance lands on the page following it.
 			 */
 			m = &m[NBPDR / PAGE_SIZE - 1];
 			continue;
 		}
 		mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte);
 	}
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Creates a single mapping by calling pmap_enter_quick_locked() with the
  * pvh global lock write locked and the pmap locked; the page table page
  * returned by that routine is discarded.
  *
  * This code makes some *MAJOR* assumptions:
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
  * 3. Read access.
  * 4. No page table pages.
  * but is *MUCH* faster than pmap_enter...
  */
 
 static void
 __CONCAT(PMTYPE, enter_quick)(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot)
 {
 
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	/* No page table page hint is available for a one-off mapping. */
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL);
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Creates a 4KB mapping of "m" at "va" without PG_RW, PG_A, PG_M, or PG_W,
  * provided that no mapping already exists there.  The caller must hold the
  * pvh global lock (write locked) and the pmap lock.
  *
  * "mpte" is a hint: if it is the page table page covering "va", then it is
  * reused without consulting the page directory.
  *
  * Returns the page table page that was used, which the caller may pass back
  * as the hint for a subsequent call.  NULL is returned for the kernel pmap,
  * and also when nothing was mapped: the page directory entry is a PG_PS
  * mapping, a page table page could not be allocated without sleeping, the
  * PTE is already in use, or a PV entry could not be inserted.
  */
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte)
 {
 	pt_entry_t newpte, *pte;
 	struct spglist free;
 
 	KASSERT(pmap != kernel_pmap || va < kmi.clean_sva ||
 	    va >= kmi.clean_eva || (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (pmap != kernel_pmap) {
 		u_int ptepindex;
 		pd_entry_t ptepa;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		ptepindex = va >> PDRSHIFT;
 		if (mpte && (mpte->pindex == ptepindex)) {
 			/* The hint covers "va"; take another reference. */
 			mpte->wire_count++;
 		} else {
 			/*
 			 * Get the page directory entry
 			 */
 			ptepa = pmap->pm_pdir[ptepindex];
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.
 			 */
 			if (ptepa) {
 				/* Do nothing if "va" lies in a superpage. */
 				if (ptepa & PG_PS)
 					return (NULL);
 				mpte = PHYS_TO_VM_PAGE(ptepa & PG_FRAME);
 				mpte->wire_count++;
 			} else {
 				mpte = _pmap_allocpte(pmap, ptepindex,
 				    PMAP_ENTER_NOSLEEP);
 				if (mpte == NULL)
 					return (mpte);
 			}
 		}
 	} else {
 		mpte = NULL;
 	}
 
 	sched_pin();
 	pte = pmap_pte_quick(pmap, va);
 	if (*pte) {
 		/*
 		 * The PTE is already in use.  Drop the reference acquired
 		 * above and report that nothing was mapped.
 		 */
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			mpte = NULL;
 		}
 		sched_unpin();
 		return (mpte);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m)) {
 		if (mpte != NULL) {
 			/*
 			 * Release the page table page reference.  If
 			 * pmap_unwire_ptp() reports that the page was
 			 * released, invalidate "va" and free the page(s)
 			 * collected on the "free" list.
 			 */
 			SLIST_INIT(&free);
 			if (pmap_unwire_ptp(pmap, mpte, &free)) {
 				pmap_invalidate_page_int(pmap, va);
 				vm_page_free_pages_toq(&free, true);
 			}
 			
 			mpte = NULL;
 		}
 		sched_unpin();
 		return (mpte);
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap->pm_stats.resident_count++;
 
 	/*
 	 * Build the new PTE; note that neither PG_RW nor PG_A is included.
 	 */
 	newpte = VM_PAGE_TO_PHYS(m) | PG_V |
 	    pmap_cache_bits(pmap, m->md.pat_mode, 0);
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		newpte |= PG_MANAGED;
 #ifdef PMAP_PAE_COMP
 	if ((prot & VM_PROT_EXECUTE) == 0 && !i386_read_exec)
 		newpte |= pg_nx;
 #endif
 	if (pmap != kernel_pmap)
 		newpte |= PG_U;
 	pte_store_zero(pte, newpte);
 	sched_unpin();
 	return (mpte);
 }
 
 /*
  * Installs a temporary mapping of the physical address "pa" in the i-th page
  * of the crashdumpmap window.  This is only intended to be used for panic
  * dumps.
  *
  * The value returned is always the base of the crashdumpmap window, not the
  * address of the i-th page within it.
  */
 static void *
 __CONCAT(PMTYPE, kenter_temporary)(vm_paddr_t pa, int i)
 {
 	vm_offset_t slot_va, window;
 
 	window = (vm_offset_t)crashdumpmap;
 	slot_va = window + (i * PAGE_SIZE);
 	pmap_kenter(slot_va, pa);
 
 	/* Discard any stale translation for the slot on this CPU. */
 	invlpg(slot_va);
 	return ((void *)crashdumpmap);
 }
 
 /*
  * Pre-maps a device or scatter/gather object into the given pmap using
  * 2/4MB page mappings.  Nothing is done unless superpages are enabled,
  * "addr" and "size" are both superpage aligned, the object can be
  * populated, and the object's pages are physically contiguous, start on a
  * superpage boundary, and share one memory attribute.  Page directory
  * entries that are already in use are left untouched.
  */
 static void
 __CONCAT(PMTYPE, object_init_pt)(pmap_t pmap, vm_offset_t addr,
     vm_object_t object, vm_pindex_t pindex, vm_size_t size)
 {
 	pd_entry_t *pdep;
 	vm_paddr_t base_pa, pa;
 	vm_page_t pg;
 	int mode;
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 
 	/* Only superpage-sized and -aligned requests are handled. */
 	if (!pg_ps_enabled)
 		return;
 	if ((addr & (NBPDR - 1)) != 0 || (size & (NBPDR - 1)) != 0)
 		return;
 
 	if (!vm_object_populate(object, pindex, pindex + atop(size)))
 		return;
 	pg = vm_page_lookup(object, pindex);
 	KASSERT(pg->valid == VM_PAGE_BITS_ALL,
 	    ("pmap_object_init_pt: invalid page %p", pg));
 	mode = pg->md.pat_mode;
 
 	/*
 	 * Give up unless the first page is physically aligned to a 2/4MB
 	 * page boundary.
 	 */
 	base_pa = VM_PAGE_TO_PHYS(pg);
 	if ((base_pa & (NBPDR - 1)) != 0)
 		return;
 
 	/*
 	 * Starting with the second page, verify that every page is
 	 * physically contiguous with its predecessor and has the same
 	 * memory attribute as the first page.
 	 */
 	pg = TAILQ_NEXT(pg, listq);
 	pa = base_pa + PAGE_SIZE;
 	while (pa < base_pa + size) {
 		KASSERT(pg->valid == VM_PAGE_BITS_ALL,
 		    ("pmap_object_init_pt: invalid page %p", pg));
 		if (pa != VM_PAGE_TO_PHYS(pg) || mode != pg->md.pat_mode)
 			return;
 		pg = TAILQ_NEXT(pg, listq);
 		pa += PAGE_SIZE;
 	}
 
 	/*
 	 * Install the 2/4MB mappings.  Because "base_pa" is 2/4M aligned
 	 * and "size" is a multiple of 2/4M, folding the PAT setting into
 	 * "pa" does not change when the loop terminates.
 	 */
 	PMAP_LOCK(pmap);
 	pa = base_pa | pmap_cache_bits(pmap, mode, 1);
 	while (pa < base_pa + size) {
 		pdep = pmap_pde(pmap, addr);
 		if (*pdep == 0) {
 			pde_store(pdep, pa | PG_PS | PG_M | PG_A |
 			    PG_U | PG_RW | PG_V);
 			pmap->pm_stats.resident_count += NBPDR / PAGE_SIZE;
 			pmap_pde_mappings++;
 		}
 		/* A PDE that is already in use is skipped. */
 		addr += NBPDR;
 		pa += NBPDR;
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Clear the wired attribute from the mappings for the specified range of
  *	addresses in the given pmap.  Every valid mapping within that range
  *	must have the wired attribute set.  In contrast, invalid mappings
  *	cannot have the wired attribute set, so they are ignored.
  *
  *	The wired attribute of the page table entry is not a hardware feature,
  *	so there is no need to invalidate any TLB entries.
  *
  *	A valid mapping in the range that lacks PG_W causes a panic, as does
  *	a failure to demote a partially covered superpage mapping.
  */
 static void
 __CONCAT(PMTYPE, unwire)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t pdnxt;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	boolean_t pv_lists_locked;
 
 	/*
 	 * For the current pmap, the pvh global lock is not acquired up
 	 * front; it is only taken later if a superpage must be demoted.
 	 */
 	if (pmap_is_current(pmap))
 		pv_lists_locked = FALSE;
 	else {
 		pv_lists_locked = TRUE;
 resume:
 		/*
 		 * Also reached from the demotion path below after the pmap
 		 * lock has been dropped because the try-lock failed.
 		 */
 		rw_wlock(&pvh_global_lock);
 		sched_pin();
 	}
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = pdnxt) {
 		/* pdnxt is the next superpage boundary; on wrap use eva. */
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 		if (pdnxt < sva)
 			pdnxt = eva;
 		pde = pmap_pde(pmap, sva);
 		if ((*pde & PG_V) == 0)
 			continue;
 		if ((*pde & PG_PS) != 0) {
 			if ((*pde & PG_W) == 0)
 				panic("pmap_unwire: pde %#jx is missing PG_W",
 				    (uintmax_t)*pde);
 
 			/*
 			 * Are we unwiring the entire large page?  If not,
 			 * demote the mapping and fall through.
 			 */
 			if (sva + NBPDR == pdnxt && eva >= pdnxt) {
 				/*
 				 * Regardless of whether a pde (or pte) is 32
 				 * or 64 bits in size, PG_W is among the least
 				 * significant 32 bits.
 				 */
 				atomic_clear_int((u_int *)pde, PG_W);
 				pmap->pm_stats.wired_count -= NBPDR /
 				    PAGE_SIZE;
 				continue;
 			} else {
 				if (!pv_lists_locked) {
 					pv_lists_locked = TRUE;
 					if (!rw_try_wlock(&pvh_global_lock)) {
 						PMAP_UNLOCK(pmap);
 						/* Repeat sva. */
 						goto resume;
 					}
 					sched_pin();
 				}
 				if (!pmap_demote_pde(pmap, pde, sva))
 					panic("pmap_unwire: demotion failed");
 			}
 		}
 		/* Do not walk PTEs beyond the end of the requested range. */
 		if (pdnxt > eva)
 			pdnxt = eva;
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			if ((*pte & PG_V) == 0)
 				continue;
 			if ((*pte & PG_W) == 0)
 				panic("pmap_unwire: pte %#jx is missing PG_W",
 				    (uintmax_t)*pte);
 
 			/*
 			 * PG_W must be cleared atomically.  Although the pmap
 			 * lock synchronizes access to PG_W, another processor
 			 * could be setting PG_M and/or PG_A concurrently.
 			 *
 			 * PG_W is among the least significant 32 bits.
 			 */
 			atomic_clear_int((u_int *)pte, PG_W);
 			pmap->pm_stats.wired_count--;
 		}
 	}
 	/* Release the pvh global lock only if it was acquired. */
 	if (pv_lists_locked) {
 		sched_unpin();
 		rw_wunlock(&pvh_global_lock);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.  Since
  *	current pmap is always the kernel pmap when executing in
  *	kernel, and we do not copy from the kernel pmap to a user
  *	pmap, this optimization is not usable in 4/4G full split i386
  *	world.
  *
  *	Nothing is copied unless dst_addr equals src_addr.  Only managed
  *	4KB mappings are copied, and the wired, modified, and accessed
  *	bits are cleared in each copy.  A superpage mapping is copied
  *	(without PG_W) only when it lies entirely within the range and
  *	the destination page directory entry is empty.  The copy stops
  *	early as soon as a destination page table page or PV entry
  *	cannot be obtained.
  */
 
 static void
 __CONCAT(PMTYPE, copy)(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr,
     vm_size_t len, vm_offset_t src_addr)
 {
 	struct spglist free;
 	pt_entry_t *src_pte, *dst_pte, ptetemp;
 	pd_entry_t srcptepaddr;
 	vm_page_t dstmpte, srcmpte;
 	vm_offset_t addr, end_addr, pdnxt;
 	u_int ptepindex;
 
 	if (dst_addr != src_addr)
 		return;
 
 	end_addr = src_addr + len;
 
 	rw_wlock(&pvh_global_lock);
 	/* Lock the two pmaps in address order. */
 	if (dst_pmap < src_pmap) {
 		PMAP_LOCK(dst_pmap);
 		PMAP_LOCK(src_pmap);
 	} else {
 		PMAP_LOCK(src_pmap);
 		PMAP_LOCK(dst_pmap);
 	}
 	sched_pin();
 	for (addr = src_addr; addr < end_addr; addr = pdnxt) {
 		KASSERT(addr < PMAP_TRM_MIN_ADDRESS,
 		    ("pmap_copy: invalid to pmap_copy the trampoline"));
 
 		/* pdnxt is the next superpage boundary; on wrap, stop. */
 		pdnxt = (addr + NBPDR) & ~PDRMASK;
 		if (pdnxt < addr)
 			pdnxt = end_addr;
 		ptepindex = addr >> PDRSHIFT;
 
 		srcptepaddr = src_pmap->pm_pdir[ptepindex];
 		if (srcptepaddr == 0)
 			continue;
 
 		if (srcptepaddr & PG_PS) {
 			/* Skip a superpage that is only partially covered. */
 			if ((addr & PDRMASK) != 0 || addr + NBPDR > end_addr)
 				continue;
 			if (dst_pmap->pm_pdir[ptepindex] == 0 &&
 			    ((srcptepaddr & PG_MANAGED) == 0 ||
 			    pmap_pv_insert_pde(dst_pmap, addr, srcptepaddr,
 			    PMAP_ENTER_NORECLAIM))) {
 				dst_pmap->pm_pdir[ptepindex] = srcptepaddr &
 				    ~PG_W;
 				dst_pmap->pm_stats.resident_count +=
 				    NBPDR / PAGE_SIZE;
 				pmap_pde_mappings++;
 			}
 			continue;
 		}
 
 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr & PG_FRAME);
 		KASSERT(srcmpte->wire_count > 0,
 		    ("pmap_copy: source page table page is unused"));
 
 		if (pdnxt > end_addr)
 			pdnxt = end_addr;
 
 		src_pte = pmap_pte_quick3(src_pmap, addr);
 		while (addr < pdnxt) {
 			ptetemp = *src_pte;
 			/*
 			 * we only virtual copy managed pages
 			 */
 			if ((ptetemp & PG_MANAGED) != 0) {
 				dstmpte = pmap_allocpte(dst_pmap, addr,
 				    PMAP_ENTER_NOSLEEP);
 				if (dstmpte == NULL)
 					goto out;
 				dst_pte = pmap_pte_quick(dst_pmap, addr);
 				if (*dst_pte == 0 &&
 				    pmap_try_insert_pv_entry(dst_pmap, addr,
 				    PHYS_TO_VM_PAGE(ptetemp & PG_FRAME))) {
 					/*
 					 * Clear the wired, modified, and
 					 * accessed (referenced) bits
 					 * during the copy.
 					 */
 					*dst_pte = ptetemp & ~(PG_W | PG_M |
 					    PG_A);
 					dst_pmap->pm_stats.resident_count++;
 				} else {
 					/*
 					 * The destination PTE is in use or
 					 * no PV entry was available.  Drop
 					 * the reference obtained from
 					 * pmap_allocpte() and give up.
 					 */
 					SLIST_INIT(&free);
 					if (pmap_unwire_ptp(dst_pmap, dstmpte,
 					    &free)) {
 						pmap_invalidate_page_int(
 						    dst_pmap, addr);
 						vm_page_free_pages_toq(&free,
 						    true);
 					}
 					goto out;
 				}
 				/*
 				 * Leave this page table page once the
 				 * destination's wire count has caught up
 				 * with the source's.
 				 */
 				if (dstmpte->wire_count >= srcmpte->wire_count)
 					break;
 			}
 			addr += PAGE_SIZE;
 			src_pte++;
 		}
 	}
 out:
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(src_pmap);
 	PMAP_UNLOCK(dst_pmap);
 }
 
 /*
  * Fills with zeroes the one page of virtual memory at "page", which the
  * caller has already mapped from a hardware page.  When I686_CPU is
  * configured and the CPU class is 686, a CPU-specific routine is used, with
  * the SSE2 variant chosen if the CPU advertises SSE2; in every other case
  * bzero() does the work.
  */
 static __inline void
 pagezero(void *page)
 {
 #if defined(I686_CPU)
 	if (cpu_class == CPUCLASS_686) {
 		if ((cpu_feature & CPUID_SSE2) != 0)
 			sse2_pagezero(page);
 		else
 			i686_pagezero(page);
 		return;
 	}
 #endif
 	bzero(page, PAGE_SIZE);
 }
 
 /*
  * Zeroes the hardware page "m" through the current CPU's CMAP2 window.  The
  * thread is pinned and the per-CPU cmap lock is held while the window is in
  * use.  Panics if the window's PTE is already in use.
  */
 static void
 __CONCAT(PMTYPE, zero_page)(vm_page_t m)
 {
 	struct pcpu *cpu;
 	pt_entry_t *window_pte;
 
 	sched_pin();
 	cpu = get_pcpu();
 	window_pte = cpu->pc_cmap_pte2;
 	mtx_lock(&cpu->pc_cmap_lock);
 	if (*window_pte != 0)
 		panic("pmap_zero_page: CMAP2 busy");
 
 	/* Point the window at the page, writable and pre-dirtied. */
 	*window_pte = VM_PAGE_TO_PHYS(m) | PG_V | PG_RW | PG_A | PG_M |
 	    pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
 	invlcaddr(cpu->pc_cmap_addr2);
 	pagezero(cpu->pc_cmap_addr2);
 	*window_pte = 0;
 
 	/*
 	 * The thread is unpinned before the lock is dropped.  Doing it the
 	 * other way round could let the thread be rescheduled while it is
 	 * still bound to this CPU, only to unpin itself as soon as it runs
 	 * again.
 	 */
 	sched_unpin();
 	mtx_unlock(&cpu->pc_cmap_lock);
 }
 
 /*
  * Zeroes an area within a single hardware page.  "off" and "size" must not
  * describe an area that extends beyond that page.  The page is accessed
  * through the current CPU's CMAP2 window; a request covering the whole page
  * is passed to pagezero(), anything smaller to bzero().
  */
 static void
 __CONCAT(PMTYPE, zero_page_area)(vm_page_t m, int off, int size)
 {
 	struct pcpu *cpu;
 	pt_entry_t *window_pte;
 
 	sched_pin();
 	cpu = get_pcpu();
 	window_pte = cpu->pc_cmap_pte2;
 	mtx_lock(&cpu->pc_cmap_lock);
 	if (*window_pte != 0)
 		panic("pmap_zero_page_area: CMAP2 busy");
 
 	/* Point the window at the page, writable and pre-dirtied. */
 	*window_pte = VM_PAGE_TO_PHYS(m) | PG_V | PG_RW | PG_A | PG_M |
 	    pmap_cache_bits(kernel_pmap, m->md.pat_mode, 0);
 	invlcaddr(cpu->pc_cmap_addr2);
 	if (off != 0 || size != PAGE_SIZE)
 		bzero(cpu->pc_cmap_addr2 + off, size);
 	else
 		pagezero(cpu->pc_cmap_addr2);
 	*window_pte = 0;
 	sched_unpin();
 	mtx_unlock(&cpu->pc_cmap_lock);
 }
 
 /*
  * Copies the contents of the hardware page "src" to the hardware page
  * "dst".  The source is mapped read-only through the current CPU's CMAP1
  * window and the destination writable through its CMAP2 window.  Panics if
  * either window's PTE is already in use.
  */
 static void
 __CONCAT(PMTYPE, copy_page)(vm_page_t src, vm_page_t dst)
 {
 	struct pcpu *cpu;
 	pt_entry_t *from_pte, *to_pte;
 
 	sched_pin();
 	cpu = get_pcpu();
 	from_pte = cpu->pc_cmap_pte1;
 	to_pte = cpu->pc_cmap_pte2;
 	mtx_lock(&cpu->pc_cmap_lock);
 	if (*from_pte != 0)
 		panic("pmap_copy_page: CMAP1 busy");
 	if (*to_pte != 0)
 		panic("pmap_copy_page: CMAP2 busy");
 
 	/* Source window: valid and accessed, but not writable. */
 	*from_pte = VM_PAGE_TO_PHYS(src) | PG_V | PG_A |
 	    pmap_cache_bits(kernel_pmap, src->md.pat_mode, 0);
 	invlcaddr(cpu->pc_cmap_addr1);
 
 	/* Destination window: writable and pre-dirtied. */
 	*to_pte = VM_PAGE_TO_PHYS(dst) | PG_V | PG_RW | PG_A | PG_M |
 	    pmap_cache_bits(kernel_pmap, dst->md.pat_mode, 0);
 	invlcaddr(cpu->pc_cmap_addr2);
 
 	bcopy(cpu->pc_cmap_addr1, cpu->pc_cmap_addr2, PAGE_SIZE);
 	*from_pte = 0;
 	*to_pte = 0;
 	sched_unpin();
 	mtx_unlock(&cpu->pc_cmap_lock);
 }
 
 /*
  * Copies "xfersize" bytes from byte offset "a_offset" within the page array
  * "ma" to byte offset "b_offset" within the page array "mb".  The transfer
  * is carried out in pieces, none of which crosses a page boundary on either
  * side.  Each piece is moved through the current CPU's CMAP1 (source,
  * read-only) and CMAP2 (destination, writable) windows.  Panics if either
  * window's PTE is already in use.
  */
 static void
 __CONCAT(PMTYPE, copy_pages)(vm_page_t ma[], vm_offset_t a_offset,
     vm_page_t mb[], vm_offset_t b_offset, int xfersize)
 {
 	struct pcpu *cpu;
 	pt_entry_t *from_pte, *to_pte;
 	vm_page_t from_pg, to_pg;
 	vm_offset_t from_off, to_off;
 	char *from_cp, *to_cp;
 	int chunk;
 
 	sched_pin();
 	cpu = get_pcpu();
 	from_pte = cpu->pc_cmap_pte1;
 	to_pte = cpu->pc_cmap_pte2;
 	mtx_lock(&cpu->pc_cmap_lock);
 	if (*from_pte != 0)
 		panic("pmap_copy_pages: CMAP1 busy");
 	if (*to_pte != 0)
 		panic("pmap_copy_pages: CMAP2 busy");
 	while (xfersize > 0) {
 		/*
 		 * Pick the pages for this piece and limit its length to
 		 * what remains within both of them.
 		 */
 		from_pg = ma[a_offset >> PAGE_SHIFT];
 		from_off = a_offset & PAGE_MASK;
 		chunk = min(xfersize, PAGE_SIZE - from_off);
 		to_pg = mb[b_offset >> PAGE_SHIFT];
 		to_off = b_offset & PAGE_MASK;
 		chunk = min(chunk, PAGE_SIZE - to_off);
 
 		/* Retarget both windows at the selected pages. */
 		*from_pte = VM_PAGE_TO_PHYS(from_pg) | PG_V | PG_A |
 		    pmap_cache_bits(kernel_pmap, from_pg->md.pat_mode, 0);
 		invlcaddr(cpu->pc_cmap_addr1);
 		*to_pte = VM_PAGE_TO_PHYS(to_pg) | PG_V | PG_RW | PG_A |
 		    PG_M | pmap_cache_bits(kernel_pmap, to_pg->md.pat_mode, 0);
 		invlcaddr(cpu->pc_cmap_addr2);
 
 		from_cp = cpu->pc_cmap_addr1 + from_off;
 		to_cp = cpu->pc_cmap_addr2 + to_off;
 		bcopy(from_cp, to_cp, chunk);
 
 		a_offset += chunk;
 		b_offset += chunk;
 		xfersize -= chunk;
 	}
 	*from_pte = 0;
 	*to_pte = 0;
 	sched_unpin();
 	mtx_unlock(&cpu->pc_cmap_lock);
 }
 
 /*
  * Returns true if the pmap's pv is one of the first
  * 16 pvs linked to from this page.  This count may
  * be changed upwards or downwards in the future; it
  * is only necessary that true be returned for a small
  * subset of pmaps for proper page aging.
  */
 static boolean_t
 __CONCAT(PMTYPE, page_exists_quick)(pmap_t pmap, vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	boolean_t found;
 	int seen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	found = FALSE;
 	seen = 0;
 	rw_wlock(&pvh_global_lock);
 
 	/* Scan the pv entries hanging off the page itself. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			break;
 		}
 		if (++seen >= 16)
 			break;
 	}
 
 	/*
 	 * If nothing matched and fewer than 16 entries were examined, spend
 	 * the rest of the budget on the list returned by pa_to_pvh().
 	 * Fictitious pages are not looked up there.
 	 */
 	if (!found && seen < 16 && (m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			if (PV_PMAP(pv) == pmap) {
 				found = TRUE;
 				break;
 			}
 			if (++seen >= 16)
 				break;
 		}
 	}
 	rw_wunlock(&pvh_global_lock);
 	return (found);
 }
 
 /*
  *	pmap_page_wired_mappings:
  *
  *	Return the number of managed mappings to the given physical page
  *	that are wired.
  */
 static int
 __CONCAT(PMTYPE, page_wired_mappings)(vm_page_t m)
 {
 	int wired;
 
 	/* An unmanaged page is reported as having no wired managed mappings. */
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (0);
 	rw_wlock(&pvh_global_lock);
 	/* Count on the page's own pv list, then on its pa_to_pvh() list. */
 	wired = pmap_pvh_wired_mappings(&m->md, 0);
 	if ((m->flags & PG_FICTITIOUS) == 0)
 		wired = pmap_pvh_wired_mappings(
 		    pa_to_pvh(VM_PAGE_TO_PHYS(m)), wired);
 	rw_wunlock(&pvh_global_lock);
 	return (wired);
 }
 
 /*
  *	pmap_pvh_wired_mappings:
  *
  *	Return the updated number "count" of managed mappings that are wired.
  */
 static int
 pmap_pvh_wired_mappings(struct md_page *pvh, int count)
 {
 	pv_entry_t pv;
 	pt_entry_t *ptep;
 	pmap_t owner;
 
 	/* The caller must hold pvh_global_lock for writing. */
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		owner = PV_PMAP(pv);
 		PMAP_LOCK(owner);
 		ptep = pmap_pte_quick(owner, pv->pv_va);
 		/* Each entry with PG_W set adds one to the running total. */
 		if ((*ptep & PG_W) != 0)
 			count++;
 		PMAP_UNLOCK(owner);
 	}
 	sched_unpin();
 	return (count);
 }
 
 /*
  * Returns TRUE if the given page is mapped individually or as part of
  * a 4mpage.  Otherwise, returns FALSE.
  */
 static boolean_t
 __CONCAT(PMTYPE, page_is_mapped)(vm_page_t m)
 {
 	boolean_t mapped;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (FALSE);
 	rw_wlock(&pvh_global_lock);
 	/*
 	 * Mapped if the page's own pv list is non-empty or, for a
 	 * non-fictitious page, if its pa_to_pvh() list is non-empty.
 	 */
 	mapped = TRUE;
 	if (TAILQ_EMPTY(&m->md.pv_list) &&
 	    ((m->flags & PG_FICTITIOUS) != 0 ||
 	    TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list)))
 		mapped = FALSE;
 	rw_wunlock(&pvh_global_lock);
 	return (mapped);
 }
 
 /*
  * Remove all pages from the specified address space;
  * this aids process exit speeds.  Also, this code
  * is special cased for current process only, but
  * can have the more generic (and slightly slower)
  * mode enabled.  This is much faster than pmap_remove
  * in the case of running down an entire address space.
  */
 static void
 __CONCAT(PMTYPE, remove_pages)(pmap_t pmap)
 {
 	pt_entry_t *pte, tpte;
 	vm_page_t m, mpte, mt;
 	pv_entry_t pv;
 	struct md_page *pvh;
 	struct pv_chunk *pc, *npc;
 	struct spglist free;
 	int field, idx;
 	int32_t bit;
 	uint32_t inuse, bitmask;
 	int allfree;
 
 	/* Only the pmap that is current on this CPU is handled; warn otherwise. */
 	if (pmap != PCPU_GET(curpmap)) {
 		printf("warning: pmap_remove_pages called with non-current pmap\n");
 		return;
 	}
 	/*
 	 * Pages queued on "free" while the locks are held are released by
 	 * vm_page_free_pages_toq() at the very end, after unlocking.
 	 */
 	SLIST_INIT(&free);
 	rw_wlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	sched_pin();
 	/* Walk every pv chunk on the pmap; _SAFE because chunks may be freed. */
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		KASSERT(pc->pc_pmap == pmap, ("Wrong pmap %p %p", pmap,
 		    pc->pc_pmap));
 		/* Cleared below if any entry in this chunk has to be kept. */
 		allfree = 1;
 		for (field = 0; field < _NPCM; field++) {
 			/* Bits clear in pc_map but set in pc_freemask are in use. */
 			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = bsfl(inuse);
 				bitmask = 1UL << bit;
 				idx = field * 32 + bit;
 				pv = &pc->pc_pventry[idx];
 				inuse &= ~bitmask;
 
 				/*
 				 * Start with the PDE.  If PG_PS is clear, use
 				 * the PTE instead; PG_PTE_PAT is masked off the
 				 * local copy, presumably because it shares a
 				 * bit position with PG_PS -- confirm.
 				 */
 				pte = pmap_pde(pmap, pv->pv_va);
 				tpte = *pte;
 				if ((tpte & PG_PS) == 0) {
 					pte = pmap_pte_quick(pmap, pv->pv_va);
 					tpte = *pte & ~PG_PTE_PAT;
 				}
 
 				if (tpte == 0) {
 					printf(
 					    "TPTE at %p  IS ZERO @ VA %08x\n",
 					    pte, pv->pv_va);
 					panic("bad pte");
 				}
 
 /*
  * We cannot remove wired pages from a process' mapping at this time
  */
 				if (tpte & PG_W) {
 					allfree = 0;
 					continue;
 				}
 
 				m = PHYS_TO_VM_PAGE(tpte & PG_FRAME);
 				KASSERT(m->phys_addr == (tpte & PG_FRAME),
 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
 				    m, (uintmax_t)m->phys_addr,
 				    (uintmax_t)tpte));
 
 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 				    m < &vm_page_array[vm_page_array_size],
 				    ("pmap_remove_pages: bad tpte %#jx",
 				    (uintmax_t)tpte));
 
 				pte_clear(pte);
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 */
 				if ((tpte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 					if ((tpte & PG_PS) != 0) {
 						/* Dirty every 4KB page in the superpage. */
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 							vm_page_dirty(mt);
 					} else
 						vm_page_dirty(m);
 				}
 
 				/* Mark free */
 				PV_STAT(pv_entry_frees++);
 				PV_STAT(pv_entry_spare++);
 				pv_entry_count--;
 				pc->pc_map[field] |= bitmask;
 				if ((tpte & PG_PS) != 0) {
 					/* Superpage: unlink from the pa_to_pvh() list. */
 					pmap->pm_stats.resident_count -= NBPDR / PAGE_SIZE;
 					pvh = pa_to_pvh(tpte & PG_PS_FRAME);
 					TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 					if (TAILQ_EMPTY(&pvh->pv_list)) {
 						for (mt = m; mt < &m[NBPDR / PAGE_SIZE]; mt++)
 							if (TAILQ_EMPTY(&mt->md.pv_list))
 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
 					}
 					/*
 					 * If pmap_remove_pt_page() returns a page,
 					 * zero its wire count and queue it for
 					 * freeing.
 					 */
 					mpte = pmap_remove_pt_page(pmap, pv->pv_va);
 					if (mpte != NULL) {
 						KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
 						    ("pmap_remove_pages: pte page not promoted"));
 						pmap->pm_stats.resident_count--;
 						KASSERT(mpte->wire_count == NPTEPG,
 						    ("pmap_remove_pages: pte page wire count error"));
 						mpte->wire_count = 0;
 						pmap_add_delayed_free_list(mpte, &free, FALSE);
 					}
 				} else {
 					/* 4KB mapping: unlink from the page's own list. */
 					pmap->pm_stats.resident_count--;
 					TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 					if (TAILQ_EMPTY(&m->md.pv_list) &&
 					    (m->flags & PG_FICTITIOUS) == 0) {
 						pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 						if (TAILQ_EMPTY(&pvh->pv_list))
 							vm_page_aflag_clear(m, PGA_WRITEABLE);
 					}
 					pmap_unuse_pt(pmap, pv->pv_va, &free);
 				}
 			}
 		}
 		/* No wired entry was skipped, so the whole chunk can go. */
 		if (allfree) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			free_pv_chunk(pc);
 		}
 	}
 	sched_unpin();
 	pmap_invalidate_all_int(pmap);
 	rw_wunlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, true);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Return whether or not the specified physical page was modified
  *	in any physical maps.
  */
 static boolean_t
 __CONCAT(PMTYPE, is_modified)(vm_page_t m)
 {
 	boolean_t dirty;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 
 	/*
 	 * With the object write-locked and the page not exclusive busied,
 	 * PGA_WRITEABLE cannot be set concurrently.  If it is clear, no PTE
 	 * can have PG_M set and the pv lists need not be consulted.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 		return (FALSE);
 
 	rw_wlock(&pvh_global_lock);
 	/* Check the page's own list, then (if not fictitious) pa_to_pvh(). */
 	dirty = TRUE;
 	if (!pmap_is_modified_pvh(&m->md) &&
 	    ((m->flags & PG_FICTITIOUS) != 0 ||
 	    !pmap_is_modified_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))))
 		dirty = FALSE;
 	rw_wunlock(&pvh_global_lock);
 	return (dirty);
 }
 
 /*
  * Returns TRUE if any of the given mappings were used to modify
  * physical memory.  Otherwise, returns FALSE.  Both page and 2mpage
  * mappings are supported.
  */
 static boolean_t
 pmap_is_modified_pvh(struct md_page *pvh)
 {
 	pv_entry_t pv;
 	pt_entry_t *ptep;
 	pmap_t owner;
 	boolean_t dirty;
 
 	/* The caller must hold pvh_global_lock for writing. */
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	dirty = FALSE;
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		owner = PV_PMAP(pv);
 		PMAP_LOCK(owner);
 		ptep = pmap_pte_quick(owner, pv->pv_va);
 		/* Only a mapping with both PG_M and PG_RW set counts. */
 		if ((*ptep & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			dirty = TRUE;
 		PMAP_UNLOCK(owner);
 		if (dirty)
 			break;
 	}
 	sched_unpin();
 	return (dirty);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault.
  */
 static boolean_t
 __CONCAT(PMTYPE, is_prefaultable)(pmap_t pmap, vm_offset_t addr)
 {
 	pd_entry_t pde;
 	boolean_t ok;
 
 	PMAP_LOCK(pmap);
 	pde = *pmap_pde(pmap, addr);
 	/*
 	 * A zero PDE or a PG_PS (superpage) PDE is never prefaultable.
 	 * Otherwise the address qualifies only if its PTE is zero.
 	 */
 	if (pde == 0 || (pde & PG_PS) != 0)
 		ok = FALSE;
 	else
 		ok = pmap_pte_ufast(pmap, addr, pde) == 0;
 	PMAP_UNLOCK(pmap);
 	return (ok);
 }
 
 /*
  *	pmap_is_referenced:
  *
  *	Return whether or not the specified physical page was referenced
  *	in any physical maps.
  */
 static boolean_t
 __CONCAT(PMTYPE, is_referenced)(vm_page_t m)
 {
 	boolean_t used;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 	rw_wlock(&pvh_global_lock);
 	/* Check the page's own list, then (if not fictitious) pa_to_pvh(). */
 	used = TRUE;
 	if (!pmap_is_referenced_pvh(&m->md) &&
 	    ((m->flags & PG_FICTITIOUS) != 0 ||
 	    !pmap_is_referenced_pvh(pa_to_pvh(VM_PAGE_TO_PHYS(m)))))
 		used = FALSE;
 	rw_wunlock(&pvh_global_lock);
 	return (used);
 }
 
 /*
  * Returns TRUE if any of the given mappings were referenced and FALSE
  * otherwise.  Both page and 4mpage mappings are supported.
  */
 static boolean_t
 pmap_is_referenced_pvh(struct md_page *pvh)
 {
 	pv_entry_t pv;
 	pt_entry_t *ptep;
 	pmap_t owner;
 	boolean_t used;
 
 	/* The caller must hold pvh_global_lock for writing. */
 	rw_assert(&pvh_global_lock, RA_WLOCKED);
 	used = FALSE;
 	sched_pin();
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		owner = PV_PMAP(pv);
 		PMAP_LOCK(owner);
 		ptep = pmap_pte_quick(owner, pv->pv_va);
 		/* Only a mapping with both PG_A and PG_V set counts. */
 		if ((*ptep & (PG_A | PG_V)) == (PG_A | PG_V))
 			used = TRUE;
 		PMAP_UNLOCK(owner);
 		if (used)
 			break;
 	}
 	sched_unpin();
 	return (used);
 }
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  */
 static void
 __CONCAT(PMTYPE, remove_write)(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t next_pv, pv;
 	pmap_t pmap;
 	pd_entry_t *pde;
 	pt_entry_t oldpte, *pte;
 	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 
 	/*
 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 	 * set by another thread while the object is locked.  Thus,
 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	/* Fictitious pages are never looked up through pa_to_pvh(). */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	/*
 	 * Demote every writable superpage mapping of the page.  The result
 	 * of the demotion is deliberately ignored here; whatever 4KB
 	 * mappings it leaves on the page's own list are handled below.
 	 */
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		va = pv->pv_va;
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, va);
 		if ((*pde & PG_RW) != 0)
 			(void)pmap_demote_pde(pmap, pde, va);
 		PMAP_UNLOCK(pmap);
 	}
 small_mappings:
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		/* The message names this function, not a historical one. */
 		KASSERT((*pde & PG_PS) == 0, ("pmap_remove_write: found"
 		    " a 4mpage in page %p's pv list", m));
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 retry:
 		oldpte = *pte;
 		if ((oldpte & PG_RW) != 0) {
 			/*
 			 * Regardless of whether a pte is 32 or 64 bits
 			 * in size, PG_RW and PG_M are among the least
 			 * significant 32 bits.
 			 */
 			if (!atomic_cmpset_int((u_int *)pte, oldpte,
 			    oldpte & ~(PG_RW | PG_M)))
 				goto retry;
 			/* Carry the modified bit over to the vm_page. */
 			if ((oldpte & PG_M) != 0)
 				vm_page_dirty(m);
 			pmap_invalidate_page_int(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	/* No writable mapping remains, so the page flag can be cleared. */
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
  *	to pmap_is_modified().  However, since this function stops after
  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  *	dirty pages.  Those dirty pages will only be detected by a future call
  *	to pmap_is_modified().
  */
 static int
 __CONCAT(PMTYPE, ts_referenced)(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t pv, pvf;
 	pmap_t pmap;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	vm_paddr_t pa;
 	int rtval = 0;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	pa = VM_PAGE_TO_PHYS(m);
 	pvh = pa_to_pvh(pa);
 	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	/* Skip the superpage pass for fictitious pages or an empty list. */
 	if ((m->flags & PG_FICTITIOUS) != 0 ||
 	    (pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 		goto small_mappings;
 	/*
 	 * "pvf" remembers the entry that was first on entry.  Each visited
 	 * entry is rotated to the tail, so the loop ends once "pvf" is at
 	 * the head again.
 	 */
 	pv = pvf;
 	do {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		if ((*pde & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 			/*
 			 * Although "*pde" is mapping a 2/4MB page, because
 			 * this function is called at a 4KB page granularity,
 			 * we only update the 4KB page under test.
 			 */
 			vm_page_dirty(m);
 		}
 		if ((*pde & PG_A) != 0) {
 			/*
 			 * Since this reference bit is shared by either 1024
 			 * or 512 4KB pages, it should not be cleared every
 			 * time it is tested.  Apply a simple "hash" function
 			 * on the physical page number, the virtual superpage
 			 * number, and the pmap address to select one 4KB page
 			 * out of the 1024 or 512 on which testing the
 			 * reference bit will result in clearing that bit.
 			 * This function is designed to avoid the selection of
 			 * the same 4KB page for every 2- or 4MB page mapping.
 			 *
 			 * On demotion, a mapping that hasn't been referenced
 			 * is simply destroyed.  To avoid the possibility of a
 			 * subsequent page fault on a demoted wired mapping,
 			 * always leave its reference bit set.  Moreover,
 			 * since the superpage is wired, the current state of
 			 * its reference bit won't affect page replacement.
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> PDRSHIFT) ^
 			    (uintptr_t)pmap) & (NPTEPG - 1)) == 0 &&
 			    (*pde & PG_W) == 0) {
 				atomic_clear_int((u_int *)pde, PG_A);
 				pmap_invalidate_page_int(pmap, pv->pv_va);
 			}
 			/* Counted whether or not the bit was actually cleared. */
 			rtval++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 		}
 		/* Enough references found: skip the 4KB pass entirely. */
 		if (rtval >= PMAP_TS_REFERENCED_MAX)
 			goto out;
 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 small_mappings:
 	/* Same rotate-until-back-at-"pvf" walk over the page's own list. */
 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 		goto out;
 	pv = pvf;
 	do {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0,
 		    ("pmap_ts_referenced: found a 4mpage in page %p's pv list",
 		    m));
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			vm_page_dirty(m);
 		/* Here the reference bit is always cleared when found set. */
 		if ((*pte & PG_A) != 0) {
 			atomic_clear_int((u_int *)pte, PG_A);
 			pmap_invalidate_page_int(pmap, pv->pv_va);
 			rtval++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		}
 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && rtval <
 	    PMAP_TS_REFERENCED_MAX);
 out:
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 	return (rtval);
 }
 
 /*
  *	Apply the given advice to the specified range of addresses within the
  *	given pmap.  Depending on the advice, clear the referenced and/or
  *	modified flags in each mapping and set the mapped page's dirty field.
  */
 static void
 __CONCAT(PMTYPE, advise)(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
     int advice)
 {
 	pd_entry_t oldpde, *pde;
 	pt_entry_t *pte;
 	vm_offset_t va, pdnxt;
 	vm_page_t m;
 	boolean_t anychanged, pv_lists_locked;
 
 	/* Any other advice value is a no-op here. */
 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
 		return;
 	/*
 	 * For the current pmap, pvh_global_lock is taken lazily, only if a
 	 * superpage has to be demoted.  Otherwise it is taken up front.
 	 * The "resume" label is also the restart point after a failed
 	 * try-lock further down.
 	 */
 	if (pmap_is_current(pmap))
 		pv_lists_locked = FALSE;
 	else {
 		pv_lists_locked = TRUE;
 resume:
 		rw_wlock(&pvh_global_lock);
 		sched_pin();
 	}
 	anychanged = FALSE;
 	PMAP_LOCK(pmap);
 	/* Process the range one page-directory entry's span at a time. */
 	for (; sva < eva; sva = pdnxt) {
 		pdnxt = (sva + NBPDR) & ~PDRMASK;
 		/* The computation wrapped around; stop at "eva". */
 		if (pdnxt < sva)
 			pdnxt = eva;
 		pde = pmap_pde(pmap, sva);
 		oldpde = *pde;
 		if ((oldpde & PG_V) == 0)
 			continue;
 		else if ((oldpde & PG_PS) != 0) {
 			if ((oldpde & PG_MANAGED) == 0)
 				continue;
 			if (!pv_lists_locked) {
 				pv_lists_locked = TRUE;
 				/*
 				 * Try-lock failed: flush what was changed so
 				 * far, drop the pmap lock, and restart at
 				 * "resume", which takes the lock with a
 				 * blocking rw_wlock().  "sva" keeps its
 				 * current value across the restart.
 				 */
 				if (!rw_try_wlock(&pvh_global_lock)) {
 					if (anychanged)
 						pmap_invalidate_all_int(pmap);
 					PMAP_UNLOCK(pmap);
 					goto resume;
 				}
 				sched_pin();
 			}
 			if (!pmap_demote_pde(pmap, pde, sva)) {
 				/*
 				 * The large page mapping was destroyed.
 				 */
 				continue;
 			}
 
 			/*
 			 * Unless the page mappings are wired, remove the
 			 * mapping to a single page so that a subsequent
 			 * access may repromote.  Since the underlying page
 			 * table page is fully populated, this removal never
 			 * frees a page table page.
 			 */
 			if ((oldpde & PG_W) == 0) {
 				pte = pmap_pte_quick(pmap, sva);
 				KASSERT((*pte & PG_V) != 0,
 				    ("pmap_advise: invalid PTE"));
 				pmap_remove_pte(pmap, pte, sva, NULL);
 				anychanged = TRUE;
 			}
 		}
 		if (pdnxt > eva)
 			pdnxt = eva;
 		/*
 		 * "va" is the start of a pending run of changed PG_G
 		 * mappings still to be invalidated; va == pdnxt means no
 		 * run is pending.
 		 */
 		va = pdnxt;
 		for (pte = pmap_pte_quick(pmap, sva); sva != pdnxt; pte++,
 		    sva += PAGE_SIZE) {
 			if ((*pte & (PG_MANAGED | PG_V)) != (PG_MANAGED | PG_V))
 				goto maybe_invlrng;
 			else if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 				if (advice == MADV_DONTNEED) {
 					/*
 					 * Future calls to pmap_is_modified()
 					 * can be avoided by making the page
 					 * dirty now.
 					 */
 					m = PHYS_TO_VM_PAGE(*pte & PG_FRAME);
 					vm_page_dirty(m);
 				}
 				atomic_clear_int((u_int *)pte, PG_M | PG_A);
 			} else if ((*pte & PG_A) != 0)
 				atomic_clear_int((u_int *)pte, PG_A);
 			else
 				goto maybe_invlrng;
 			/*
 			 * The PTE was changed.  A PG_G mapping starts or
 			 * extends the pending range; any other mapping is
 			 * covered by the pmap_invalidate_all_int() at the end.
 			 */
 			if ((*pte & PG_G) != 0) {
 				if (va == pdnxt)
 					va = sva;
 			} else
 				anychanged = TRUE;
 			continue;
 maybe_invlrng:
 			/* An unchanged PTE ends the pending range, if any. */
 			if (va != pdnxt) {
 				pmap_invalidate_range_int(pmap, va, sva);
 				va = pdnxt;
 			}
 		}
 		/* Flush a range still pending at the end of this span. */
 		if (va != pdnxt)
 			pmap_invalidate_range_int(pmap, va, sva);
 	}
 	if (anychanged)
 		pmap_invalidate_all_int(pmap);
 	if (pv_lists_locked) {
 		sched_unpin();
 		rw_wunlock(&pvh_global_lock);
 	}
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Clear the modify bits on the specified physical page.
  */
 static void
 __CONCAT(PMTYPE, clear_modify)(vm_page_t m)
 {
 	struct md_page *pvh;
 	pv_entry_t next_pv, pv;
 	pmap_t pmap;
 	pd_entry_t oldpde, *pde;
-	pt_entry_t oldpte, *pte;
+	pt_entry_t *pte;
 	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	KASSERT(!vm_page_xbusied(m),
 	    ("pmap_clear_modify: page %p is exclusive busied", m));
 
 	/*
 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
 	 * If the object containing the page is locked and the page is not
 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	rw_wlock(&pvh_global_lock);
 	sched_pin();
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto small_mappings;
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		va = pv->pv_va;
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, va);
 		oldpde = *pde;
-		if ((oldpde & PG_RW) != 0) {
-			if (pmap_demote_pde(pmap, pde, va)) {
-				if ((oldpde & PG_W) == 0) {
-					/*
-					 * Write protect the mapping to a
-					 * single page so that a subsequent
-					 * write access may repromote.
-					 */
-					va += VM_PAGE_TO_PHYS(m) - (oldpde &
-					    PG_PS_FRAME);
-					pte = pmap_pte_quick(pmap, va);
-					oldpte = *pte;
-					if ((oldpte & PG_V) != 0) {
-						/*
-						 * Regardless of whether a pte is 32 or 64 bits
-						 * in size, PG_RW and PG_M are among the least
-						 * significant 32 bits.
-						 */
-						while (!atomic_cmpset_int((u_int *)pte,
-						    oldpte,
-						    oldpte & ~(PG_M | PG_RW)))
-							oldpte = *pte;
-						vm_page_dirty(m);
-						pmap_invalidate_page_int(pmap,
-						    va);
-					}
-				}
-			}
+		/* If oldpde has PG_RW set, then it also has PG_M set. */
+		if ((oldpde & PG_RW) != 0 &&
+		    pmap_demote_pde(pmap, pde, va) &&
+		    (oldpde & PG_W) == 0) {
+			/*
+			 * Write protect the mapping to a single page so that
+			 * a subsequent write access may repromote.
+			 */
+			va += VM_PAGE_TO_PHYS(m) - (oldpde & PG_PS_FRAME);
+			pte = pmap_pte_quick(pmap, va);
+			/*
+			 * Regardless of whether a pte is 32 or 64 bits
+			 * in size, PG_RW and PG_M are among the least
+			 * significant 32 bits.
+			 */
+			atomic_clear_int((u_int *)pte, PG_M | PG_RW);
+			vm_page_dirty(m);
+			pmap_invalidate_page_int(pmap, va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 small_mappings:
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pde = pmap_pde(pmap, pv->pv_va);
 		KASSERT((*pde & PG_PS) == 0, ("pmap_clear_modify: found"
 		    " a 4mpage in page %p's pv list", m));
 		pte = pmap_pte_quick(pmap, pv->pv_va);
 		if ((*pte & (PG_M | PG_RW)) == (PG_M | PG_RW)) {
 			/*
 			 * Regardless of whether a pte is 32 or 64 bits
 			 * in size, PG_M is among the least significant
 			 * 32 bits. 
 			 */
 			atomic_clear_int((u_int *)pte, PG_M);
 			pmap_invalidate_page_int(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	sched_unpin();
 	rw_wunlock(&pvh_global_lock);
 }
 
 /*
  * Miscellaneous support routines follow
  */
 
 /* Adjust the cache mode for a 4KB page mapped via a PTE. */
 static __inline void
 pmap_pte_attr(pt_entry_t *pte, int cache_bits)
 {
 	u_int cur, want;
 
 	/*
 	 * Every cache mode bit lives in the low 32 bits of the PTE, so a
 	 * 32-bit compare-and-set loop is sufficient.  The loop ends when the
 	 * entry already holds the requested bits or the swap succeeds.
 	 */
 	for (;;) {
 		cur = *(u_int *)pte;
 		want = (cur & ~PG_PTE_CACHE) | cache_bits;
 		if (want == cur ||
 		    atomic_cmpset_int((u_int *)pte, cur, want))
 			break;
 	}
 }
 
 /* Adjust the cache mode for a 2/4MB page mapped via a PDE. */
 static __inline void
 pmap_pde_attr(pd_entry_t *pde, int cache_bits)
 {
 	u_int cur, want;
 
 	/*
 	 * Every cache mode bit lives in the low 32 bits of the PDE, so a
 	 * 32-bit compare-and-set loop is sufficient.  The loop ends when the
 	 * entry already holds the requested bits or the swap succeeds.
 	 */
 	for (;;) {
 		cur = *(u_int *)pde;
 		want = (cur & ~PG_PDE_CACHE) | cache_bits;
 		if (want == cur ||
 		    atomic_cmpset_int((u_int *)pde, cur, want))
 			break;
 	}
 }
 
 /*
  * Map a set of physical memory pages into the kernel virtual
  * address space. Return a pointer to where it is mapped. This
  * routine is intended to be used for mapping device memory,
  * NOT real memory.
  */
 static void *
 __CONCAT(PMTYPE, mapdev_attr)(vm_paddr_t pa, vm_size_t size, int mode)
 {
 	struct pmap_preinit_mapping *slot;
 	vm_offset_t base, pgoff;
 	vm_size_t done;
 	int n;
 
 	/* Work in whole pages; remember the offset into the first one. */
 	pgoff = pa & PAGE_MASK;
 	size = round_page(pgoff + size);
 	pa = pa & PG_FRAME;
 
 	if (pa < PMAP_MAP_LOW && pa + size <= PMAP_MAP_LOW) {
 		/* The range lies entirely below PMAP_MAP_LOW. */
 		base = pa + PMAP_MAP_LOW;
 	} else if (pmap_initialized) {
 		/*
 		 * Return a matching preinit mapping if one was recorded;
 		 * otherwise allocate fresh KVA.
 		 */
 		for (n = 0; n < PMAP_PREINIT_MAPPING_COUNT; n++) {
 			slot = &pmap_preinit_mapping[n];
 			if (slot->pa == pa && slot->sz == size &&
 			    slot->mode == mode)
 				return ((void *)(slot->va + pgoff));
 		}
 		base = kva_alloc(size);
 		if (base == 0)
 			panic("%s: Couldn't allocate KVA", __func__);
 	} else {
 		/*
 		 * The pmap is not initialized yet: record the request in the
 		 * first free preinit slot and take KVA from virtual_avail.
 		 */
 		base = 0;
 		for (n = 0; n < PMAP_PREINIT_MAPPING_COUNT; n++) {
 			slot = &pmap_preinit_mapping[n];
 			if (slot->va != 0)
 				continue;
 			slot->pa = pa;
 			slot->sz = size;
 			slot->mode = mode;
 			slot->va = virtual_avail;
 			virtual_avail += size;
 			base = slot->va;
 			break;
 		}
 		if (base == 0)
 			panic("%s: too many preinit mappings", __func__);
 	}
 
 	/* Enter each page with the requested mode, then flush TLB and cache. */
 	for (done = 0; done < size; done += PAGE_SIZE)
 		pmap_kenter_attr(base + done, pa + done, mode);
 	pmap_invalidate_range_int(kernel_pmap, base, base + done);
 	pmap_invalidate_cache_range(base, base + size);
 	return ((void *)(base + pgoff));
 }
 
 /*
  * Undo a mapdev_attr() mapping of "size" bytes at "va".
  */
 static void
 __CONCAT(PMTYPE, unmapdev)(vm_offset_t va, vm_size_t size)
 {
 	struct pmap_preinit_mapping *slot;
 	vm_offset_t pgoff;
 	int n;
 
 	/* Nothing to release for a range within [PMAP_MAP_LOW, KERNBASE]. */
 	if (va >= PMAP_MAP_LOW && va <= KERNBASE && va + size <= KERNBASE)
 		return;
 	pgoff = va & PAGE_MASK;
 	size = round_page(pgoff + size);
 	va = trunc_page(va);
 	for (n = 0; n < PMAP_PREINIT_MAPPING_COUNT; n++) {
 		slot = &pmap_preinit_mapping[n];
 		if (slot->va != va || slot->sz != size)
 			continue;
 		/*
 		 * A preinit mapping is kept once the pmap is initialized.
 		 * Before that, the slot is cleared and the KVA is given
 		 * back if it was the most recent allocation.
 		 */
 		if (!pmap_initialized) {
 			slot->pa = 0;
 			slot->va = 0;
 			slot->sz = 0;
 			slot->mode = 0;
 			if (va + size == virtual_avail)
 				virtual_avail = va;
 		}
 		return;
 	}
 	if (pmap_initialized)
 		kva_free(va, size);
 }
 
 /*
  * Sets the memory attribute for the specified page.
  */
 static void
 __CONCAT(PMTYPE, page_set_memattr)(vm_page_t m, vm_memattr_t ma)
 {
 
 	m->md.pat_mode = ma;
 
 	/* A fictitious page only has its attribute recorded. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		return;
 
 	/*
 	 * A normal page is flushed from the cache as well; see
 	 * pmap_invalidate_cache_range().
 	 *
 	 * sf_buf_invalidate_cache() is tried first: if the page is mapped
 	 * by an sf buffer, it modifies that mapping and flushes the cache.
 	 * Otherwise, on a CPU without self snoop, the page is mapped
 	 * transiently and flushed by pmap_flush_page(); in the worst case
 	 * the whole cache is flushed.
 	 */
 	if (!sf_buf_invalidate_cache(m) && (cpu_feature & CPUID_SS) == 0)
 		pmap_flush_page(m);
 }
 
 /*
  * Flush the given page from the CPU caches, line by line when clflush or
  * clflushopt is available and by a full cache invalidation otherwise.
  */
 static void
 __CONCAT(PMTYPE, flush_page)(vm_page_t m)
 {
 	struct pcpu *cpu;
 	pt_entry_t *slot;
 	vm_offset_t line, end;
 	bool have_opt;
 
 	have_opt = (cpu_stdext_feature & CPUID_STDEXT_CLFLUSHOPT) != 0;
 	if (!have_opt && (cpu_feature & CPUID_CLFSH) == 0) {
 		/* No line-flush instruction is available. */
 		pmap_invalidate_cache();
 		return;
 	}
 
 	/* Map the page through this CPU's CMAP2 slot. */
 	sched_pin();
 	cpu = get_pcpu();
 	slot = cpu->pc_cmap_pte2;
 	mtx_lock(&cpu->pc_cmap_lock);
 	if (*slot != 0)
 		panic("pmap_flush_page: CMAP2 busy");
 	*slot = PG_V | PG_RW | VM_PAGE_TO_PHYS(m) |
 	    PG_A | PG_M | pmap_cache_bits(kernel_pmap, m->md.pat_mode,
 	    0);
 	invlcaddr(cpu->pc_cmap_addr2);
 	line = (vm_offset_t)cpu->pc_cmap_addr2;
 	end = line + PAGE_SIZE;
 
 	/*
 	 * Explicit fences bracket the flush loop even though
 	 * mtx_{un,}lock() imply ordering: clflush on non-Intel CPUs and
 	 * clflushopt are not guaranteed to be ordered by any other
 	 * instruction.
 	 */
 	if (have_opt) {
 		sfence();
 		for (; line < end; line += cpu_clflush_line_size)
 			clflushopt(line);
 		sfence();
 	} else {
 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 		for (; line < end; line += cpu_clflush_line_size)
 			clflush(line);
 		if (cpu_vendor_id != CPU_VENDOR_INTEL)
 			mfence();
 	}
 
 	*slot = 0;
 	sched_unpin();
 	mtx_unlock(&cpu->pc_cmap_lock);
 }
 
 /*
  * Changes the specified virtual address range's memory type to that given by
  * the parameter "mode".  The specified virtual address range must be
  * completely contained within the kernel map.
  *
  * Returns zero if the change completed successfully, and either EINVAL or
  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
  * of the virtual address range was not mapped, and ENOMEM is returned if
  * there was insufficient memory available to complete the change.
  */
 static int
 __CONCAT(PMTYPE, change_attr)(vm_offset_t va, vm_size_t size, int mode)
 {
 	vm_offset_t base, offset, tmpva;
 	pd_entry_t *pde;
 	pt_entry_t *pte;
 	int cache_bits_pte, cache_bits_pde;
 	boolean_t changed;
 
 	/* Widen the request to whole pages. */
 	base = trunc_page(va);
 	offset = va & PAGE_MASK;
 	size = round_page(offset + size);
 
 	/*
 	 * Only supported on kernel virtual addresses above the recursive map.
 	 */
 	if (base < VM_MIN_KERNEL_ADDRESS)
 		return (EINVAL);
 
 	/* The PDE and PTE encodings of "mode" are computed separately. */
 	cache_bits_pde = pmap_cache_bits(kernel_pmap, mode, 1);
 	cache_bits_pte = pmap_cache_bits(kernel_pmap, mode, 0);
 	changed = FALSE;
 
 	/*
 	 * Pages that aren't mapped aren't supported.  Also break down
 	 * 2/4MB pages into 4KB pages if required.
 	 */
 	PMAP_LOCK(kernel_pmap);
 	for (tmpva = base; tmpva < base + size; ) {
 		pde = pmap_pde(kernel_pmap, tmpva);
 		if (*pde == 0) {
 			PMAP_UNLOCK(kernel_pmap);
 			return (EINVAL);
 		}
 		if (*pde & PG_PS) {
 			/*
 			 * If the current 2/4MB page already has
 			 * the required memory type, then we need not
 			 * demote this page.  Just increment tmpva to
 			 * the next 2/4MB page frame.
 			 */
 			if ((*pde & PG_PDE_CACHE) == cache_bits_pde) {
 				tmpva = trunc_4mpage(tmpva) + NBPDR;
 				continue;
 			}
 
 			/*
 			 * If the current offset aligns with a 2/4MB
 			 * page frame and there is at least 2/4MB left
 			 * within the range, then we need not break
 			 * down this page into 4KB pages.
 			 */
 			if ((tmpva & PDRMASK) == 0 &&
 			    tmpva + PDRMASK < base + size) {
 				tmpva += NBPDR;
 				continue;
 			}
 			/* Partial coverage: the superpage must be demoted. */
 			if (!pmap_demote_pde(kernel_pmap, pde, tmpva)) {
 				PMAP_UNLOCK(kernel_pmap);
 				return (ENOMEM);
 			}
 		}
 		pte = vtopte(tmpva);
 		if (*pte == 0) {
 			PMAP_UNLOCK(kernel_pmap);
 			return (EINVAL);
 		}
 		tmpva += PAGE_SIZE;
 	}
 	PMAP_UNLOCK(kernel_pmap);
 
 	/*
 	 * Ok, all the pages exist, so run through them updating their
 	 * cache mode if required.
 	 */
 	for (tmpva = base; tmpva < base + size; ) {
 		pde = pmap_pde(kernel_pmap, tmpva);
 		if (*pde & PG_PS) {
 			if ((*pde & PG_PDE_CACHE) != cache_bits_pde) {
 				pmap_pde_attr(pde, cache_bits_pde);
 				changed = TRUE;
 			}
 			tmpva = trunc_4mpage(tmpva) + NBPDR;
 		} else {
 			pte = vtopte(tmpva);
 			if ((*pte & PG_PTE_CACHE) != cache_bits_pte) {
 				pmap_pte_attr(pte, cache_bits_pte);
 				changed = TRUE;
 			}
 			tmpva += PAGE_SIZE;
 		}
 	}
 
 	/*
 	 * Flush CPU caches to make sure any data isn't cached that
 	 * shouldn't be, etc.
 	 */
 	if (changed) {
 		/* "tmpva" is where the second pass stopped. */
 		pmap_invalidate_range_int(kernel_pmap, base, tmpva);
 		pmap_invalidate_cache_range(base, tmpva);
 	}
 	return (0);
 }
 
 /*
  * perform the pmap work for mincore
  */
 static int
 __CONCAT(PMTYPE, mincore)(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 {
 	pd_entry_t pde;
 	pt_entry_t pte;
 	vm_paddr_t pa;
 	int val;
 
 	PMAP_LOCK(pmap);
 retry:
 	/*
 	 * Take a snapshot of the entry mapping "addr".  For a PG_PS PDE the
 	 * PDE itself is used and MINCORE_SUPER is reported.  A zero PDE
 	 * yields an all-zero result.
 	 */
 	pde = *pmap_pde(pmap, addr);
 	if (pde != 0) {
 		if ((pde & PG_PS) != 0) {
 			pte = pde;
 			/* Compute the physical address of the 4KB page. */
 			pa = ((pde & PG_PS_FRAME) | (addr & PDRMASK)) &
 			    PG_FRAME;
 			val = MINCORE_SUPER;
 		} else {
 			pte = pmap_pte_ufast(pmap, addr, pde);
 			pa = pte & PG_FRAME;
 			val = 0;
 		}
 	} else {
 		pte = 0;
 		pa = 0;
 		val = 0;
 	}
 	/* Translate the valid/modified/accessed bits into MINCORE_* flags. */
 	if ((pte & PG_V) != 0) {
 		val |= MINCORE_INCORE;
 		if ((pte & (PG_M | PG_RW)) == (PG_M | PG_RW))
 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 		if ((pte & PG_A) != 0)
 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 	}
 	/*
 	 * For a valid managed mapping that does not already report both
 	 * *_OTHER flags, the page at "pa" is locked via
 	 * vm_page_pa_tryrelock(); a nonzero return from it restarts the
 	 * lookup.  Otherwise any lock recorded in "*locked_pa" is
 	 * released.
 	 */
 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) &&
 	    (pte & (PG_MANAGED | PG_V)) == (PG_MANAGED | PG_V)) {
 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 			goto retry;
 	} else
 		PA_UNLOCK_COND(*locked_pa);
 	PMAP_UNLOCK(pmap);
 	return (val);
 }
 
 static void
 __CONCAT(PMTYPE, activate)(struct thread *td)
 {
 	pmap_t	pmap, oldpmap;
 	u_int	cpuid;
 	u_int32_t  cr3;
 
 	critical_enter();
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	oldpmap = PCPU_GET(curpmap);
 	cpuid = PCPU_GET(cpuid);
 #if defined(SMP)
 	CPU_CLR_ATOMIC(cpuid, &oldpmap->pm_active);
 	CPU_SET_ATOMIC(cpuid, &pmap->pm_active);
 #else
 	CPU_CLR(cpuid, &oldpmap->pm_active);
 	CPU_SET(cpuid, &pmap->pm_active);
 #endif
 #ifdef PMAP_PAE_COMP
 	cr3 = vtophys(pmap->pm_pdpt);
 #else
 	cr3 = vtophys(pmap->pm_pdir);
 #endif
 	/*
 	 * pmap_activate is for the current thread on the current cpu
 	 */
 	td->td_pcb->pcb_cr3 = cr3;
 	PCPU_SET(curpmap, pmap);
 	critical_exit();
 }
 
 /*
  * Mark "pmap" as active on the current CPU and record it as the CPU's
  * current pmap.  Unlike the regular activate method, no previous pmap is
  * deactivated and no PCB is updated.
  */
 static void
 __CONCAT(PMTYPE, activate_boot)(pmap_t pmap)
 {
 	u_int self = PCPU_GET(cpuid);
 
 #if defined(SMP)
 	CPU_SET_ATOMIC(self, &pmap->pm_active);
 #else
 	CPU_SET(self, &pmap->pm_active);
 #endif
 	PCPU_SET(curpmap, pmap);
 }
 
 /*
  *	Advance *addr, if necessary, so that its offset within a superpage
  *	(NBPDR bytes) equals the offset of the backing object position
  *	within a superpage.  Nothing is changed when the mapping is smaller
  *	than a superpage, when no complete superpage would fit after
  *	alignment, or when the address is already suitably aligned.  The
  *	address is never decreased.
  */
 static void
 __CONCAT(PMTYPE, align_superpage)(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 	vm_offset_t base, obj_off, va_off;
 
 	/* A mapping shorter than one superpage can never use one. */
 	if (size < NBPDR)
 		return;
 
 	/* Account for the object's page colour when it has one. */
 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 		offset += ptoa(object->pg_color);
 	obj_off = offset & PDRMASK;
 
 	/* Would a whole superpage remain after skipping to the boundary? */
 	if (size - ((NBPDR - obj_off) & PDRMASK) < NBPDR)
 		return;
 
 	va_off = *addr & PDRMASK;
 	if (va_off == obj_off)
 		return;
 
 	/*
 	 * Stay within the current superpage frame when the wanted offset
 	 * lies ahead of us, otherwise move on to the next frame.
 	 */
 	if (va_off < obj_off)
 		base = *addr & ~PDRMASK;
 	else
 		base = (*addr + PDRMASK) & ~PDRMASK;
 	*addr = base + obj_off;
 }
 
 /*
  * Map page "m" read/write at the per-CPU quick-map address and return
  * that address.  A critical section is entered here and is left again
  * only by the matching quick_remove_page call.
  */
 static vm_offset_t
 __CONCAT(PMTYPE, quick_enter_page)(vm_page_t m)
 {
 	pt_entry_t *ptep, npte;
 	vm_offset_t va;
 
 	critical_enter();
 	va = PCPU_GET(qmap_addr);
 	ptep = vtopte(va);
 
 	/* The slot must be free; nesting is not supported. */
 	KASSERT(*ptep == 0,
 	    ("pmap_quick_enter_page: PTE busy %#jx", (uintmax_t)*ptep));
 
 	npte = VM_PAGE_TO_PHYS(m) | PG_V | PG_RW | PG_A | PG_M |
 	    pmap_cache_bits(kernel_pmap, pmap_page_get_memattr(m), 0);
 	*ptep = npte;
 	invlpg(va);
 
 	return (va);
 }
 
 /*
  * Tear down the per-CPU quick mapping established by quick_enter_page
  * and leave the critical section that it entered.  "addr" must be the
  * address that quick_enter_page returned.
  */
 static void
 __CONCAT(PMTYPE, quick_remove_page)(vm_offset_t addr)
 {
 	pt_entry_t *ptep;
 	vm_offset_t va;
 
 	va = PCPU_GET(qmap_addr);
 	ptep = vtopte(va);
 
 	KASSERT(*ptep != 0, ("pmap_quick_remove_page: PTE not in use"));
 	KASSERT(addr == va, ("pmap_quick_remove_page: invalid address"));
 
 	/* Clear the slot; the TLB entry is not flushed here. */
 	*ptep = 0;
 	critical_exit();
 }
 
 /* vmem arena for trampoline allocations; created in pmap_init_trm(). */
 static vmem_t *pmap_trm_arena;
 /* End of the address range handed out so far by pmap_trm_import(). */
 static vmem_addr_t pmap_trm_arena_last = PMAP_TRM_MIN_ADDRESS;
 /*
  * Size in bytes of the gap left in front of every imported range.  It can
  * be overridden with the "machdep.trm_guard" tunable; pmap_init_trm()
  * resets it to 0 when the value is not a multiple of the page size.
  */
 static int trm_guard = PAGE_SIZE;
 
 /*
  * vmem import callback for pmap_trm_arena (installed by pmap_init_trm()).
  *
  * Reserves round_page(size) + trm_guard bytes of address space by
  * advancing pmap_trm_arena_last, backs everything after the leading guard
  * with freshly allocated wired pages, and returns the start of the backed
  * range in *addrp.  Returns ENOMEM if the reservation would wrap around or
  * exceed PMAP_TRM_MAX_ADDRESS, 0 otherwise.  The "flags" argument is not
  * consulted; page allocation always uses VM_ALLOC_WAITOK.
  */
 static int
 pmap_trm_import(void *unused __unused, vmem_size_t size, int flags,
     vmem_addr_t *addrp)
 {
 	vm_page_t m;
 	vmem_addr_t af, addr, prev_addr;
 	pt_entry_t *trm_pte;
 
 	/*
 	 * NOTE(review): the load uses the "long" flavour while the update
 	 * below uses the "int" flavour of the atomic API -- presumably both
 	 * are the same width on this platform; confirm.
 	 */
 	prev_addr = atomic_load_long(&pmap_trm_arena_last);
 	size = round_page(size) + trm_guard;
 	/*
 	 * Claim [prev_addr, prev_addr + size) by compare-and-set; on failure
 	 * prev_addr is passed by reference to atomic_fcmpset_int() and the
 	 * bounds are re-checked before the next attempt.
 	 */
 	for (;;) {
 		if (prev_addr + size < prev_addr || prev_addr + size < size ||
 		    prev_addr + size > PMAP_TRM_MAX_ADDRESS)
 			return (ENOMEM);
 		addr = prev_addr + size;
 		if (atomic_fcmpset_int(&pmap_trm_arena_last, &prev_addr, addr))
 			break;
 	}
 	/* Skip the guard; it is left without any mapping. */
 	prev_addr += trm_guard;
 	trm_pte = PTmap + atop(prev_addr);
 	/* Install one wired, writable page per PAGE_SIZE step. */
 	for (af = prev_addr; af < addr; af += PAGE_SIZE) {
 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
 		    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK);
 		pte_store(&trm_pte[atop(af - prev_addr)], VM_PAGE_TO_PHYS(m) |
 		    PG_M | PG_A | PG_RW | PG_V | pgeflag |
 		    pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, FALSE));
 	}
 	*addrp = prev_addr;
 	return (0);
 }
 
 /*
  * Set up the trampoline allocator: fetch and sanitize the guard size
  * tunable, create the vmem arena with pmap_trm_import() as its import
  * function, and install a zeroed page as the page table referenced by
  * PTD[TRPTDI].
  */
 void
 pmap_init_trm(void)
 {
 	vm_page_t ptpage;
 
 	/* A guard that is not a whole number of pages is disabled. */
 	TUNABLE_INT_FETCH("machdep.trm_guard", &trm_guard);
 	if ((trm_guard & PAGE_MASK) != 0)
 		trm_guard = 0;
 
 	pmap_trm_arena = vmem_create("i386trampoline", 0, 0, 1, 0, M_WAITOK);
 	vmem_set_import(pmap_trm_arena, pmap_trm_import, NULL, NULL, PAGE_SIZE);
 
 	ptpage = vm_page_alloc(NULL, 0, VM_ALLOC_NOOBJ | VM_ALLOC_NOBUSY |
 	    VM_ALLOC_NORMAL | VM_ALLOC_WIRED | VM_ALLOC_WAITOK | VM_ALLOC_ZERO);
 	/* VM_ALLOC_ZERO is only a hint; zero the page if it was not. */
 	if ((ptpage->flags & PG_ZERO) == 0)
 		pmap_zero_page(ptpage);
 
 	PTD[TRPTDI] = VM_PAGE_TO_PHYS(ptpage) | PG_M | PG_A | PG_RW | PG_V |
 	    pmap_cache_bits(kernel_pmap, VM_MEMATTR_DEFAULT, TRUE);
 }
 
 /*
  * Allocate "size" bytes (rounded up to a multiple of 4, aligned to
  * sizeof(int)) from the trampoline arena.  "flags" may contain only
  * M_WAITOK, M_NOWAIT and M_ZERO.  Returns NULL if the arena allocation
  * fails; with M_ZERO the first "size" bytes are cleared.
  */
 static void *
 __CONCAT(PMTYPE, trm_alloc)(size_t size, int flags)
 {
 	vmem_addr_t va;
 
 	MPASS((flags & ~(M_WAITOK | M_NOWAIT | M_ZERO)) == 0);
 
 	if (vmem_xalloc(pmap_trm_arena, roundup2(size, 4), sizeof(int), 0, 0,
 	    VMEM_ADDR_MIN, VMEM_ADDR_MAX, flags | M_FIRSTFIT, &va) != 0)
 		return (NULL);
 
 	if ((flags & M_ZERO) != 0)
 		bzero((void *)va, size);
 	return ((void *)va);
 }
 
 /*
  * Return a trm_alloc() allocation to the trampoline arena.  "size" is
  * rounded up to a multiple of 4, mirroring the rounding in trm_alloc().
  */
 static void
 __CONCAT(PMTYPE, trm_free)(void *addr, size_t size)
 {
 	vmem_size_t len;
 
 	len = roundup2(size, 4);
 	vmem_free(pmap_trm_arena, (uintptr_t)addr, len);
 }
 
 /*
  * Set PG_RW in the page table entry that maps "va".  No TLB invalidation
  * is performed here.
  */
 static void
 __CONCAT(PMTYPE, ksetrw)(vm_offset_t va)
 {
 	pt_entry_t *ptep;
 
 	ptep = vtopte(va);
 	*ptep |= PG_RW;
 }
 
 /*
  * When "enable" is true, copy the PTD[LOWPTDI] entry into the PTD[KPTDI]
  * slot; otherwise clear that slot.  Either way the TLB, including global
  * entries, is flushed afterwards via invltlb_glob().
  */
 static void
 __CONCAT(PMTYPE, remap_lowptdi)(bool enable)
 {
 
 	if (enable)
 		PTD[KPTDI] = PTD[LOWPTDI];
 	else
 		PTD[KPTDI] = 0;
 	invltlb_glob();
 }
 
 /*
  * Method accessor returning this pmap flavour's PMAP_MAP_LOW constant.
  */
 static vm_offset_t
 __CONCAT(PMTYPE, get_map_low)(void)
 {
 	vm_offset_t low;
 
 	low = PMAP_MAP_LOW;
 	return (low);
 }
 
 /*
  * Method accessor returning this pmap flavour's VM_MAXUSER_ADDRESS
  * constant.
  */
 static vm_offset_t
 __CONCAT(PMTYPE, get_vm_maxuser_address)(void)
 {
 	vm_offset_t maxuser;
 
 	maxuser = VM_MAXUSER_ADDRESS;
 	return (maxuser);
 }
 
 /*
  * Mask "pa" with this pmap flavour's PG_FRAME and return the result.
  */
 static vm_paddr_t
 __CONCAT(PMTYPE, pg_frame)(vm_paddr_t pa)
 {
 
 	pa &= PG_FRAME;
 	return (pa);
 }
 
 /*
  * Point the kernel virtual address of "sf" at the physical page sf->m,
  * using the page's pat_mode for the cache bits, and arrange for stale TLB
  * entries to be discarded if the previous mapping had been used.
  */
 static void
 __CONCAT(PMTYPE, sf_buf_map)(struct sf_buf *sf)
 {
 	pt_entry_t *slot, prev;
 
 	/*
 	 * The old mapping had a reference count of zero, so nobody is
 	 * using it.  That is why a plain read followed by a plain write
 	 * suffices here and no atomic exchange of the old and new PTEs is
 	 * needed, even under PAE.
 	 */
 	slot = vtopte(sf->kva);
 	prev = *slot;
 	*slot = VM_PAGE_TO_PHYS(sf->m) | PG_RW | PG_V |
 	    pmap_cache_bits(kernel_pmap, sf->m->md.pat_mode, 0);
 
 	/*
 	 * Skip the invalidation work when the previous mapping was never
 	 * used (not both valid and accessed): any processor that flushed
 	 * this address since the last used mapping has nothing stale left.
 	 */
 	if ((prev & (PG_V | PG_A)) != (PG_V | PG_A))
 		return;
 #ifdef SMP
 	CPU_ZERO(&sf->cpumask);
 #else
 	pmap_invalidate_page_int(kernel_pmap, sf->kva);
 #endif
 }
 
 /*
  * Map the "plen" pages in "ma" read/write at consecutive page-sized
  * steps starting at kernel address "kaddr", flushing the local TLB entry
  * for each address as it is written.
  */
 static void
 __CONCAT(PMTYPE, cp_slow0_map)(vm_offset_t kaddr, int plen, vm_page_t *ma)
 {
 	pt_entry_t *ptbase;
 	int idx;
 
 	ptbase = vtopte(kaddr);
 	for (idx = 0; idx < plen; idx++) {
 		ptbase[idx] = VM_PAGE_TO_PHYS(ma[idx]) | PG_V | PG_RW | PG_A |
 		    PG_M | pmap_cache_bits(kernel_pmap,
 		    pmap_page_get_memattr(ma[idx]), FALSE);
 		invlpg(kaddr + ptoa(idx));
 	}
 }
 
 static u_int
 __CONCAT(PMTYPE, get_kcr3)(void)
 {
 
 #ifdef PMAP_PAE_COMP
 	return ((u_int)IdlePDPT);
 #else
 	return ((u_int)IdlePTD);
 #endif
 }
 
 /*
  * Return, as a u_int, the physical address of the top-level table of
  * "pmap": pm_pdpt under PMAP_PAE_COMP, pm_pdir otherwise.
  */
 static u_int
 __CONCAT(PMTYPE, get_cr3)(pmap_t pmap)
 {
 	u_int root;
 
 #ifdef PMAP_PAE_COMP
 	root = (u_int)vtophys(pmap->pm_pdpt);
 #else
 	root = (u_int)vtophys(pmap->pm_pdir);
 #endif
 	return (root);
 }
 
 /*
  * Store "pa | pte_bits" into the CMAP3 page table entry, flush the TLB
  * and return CADDR3.
  */
 static caddr_t
 __CONCAT(PMTYPE, cmap3)(vm_paddr_t pa, u_int pte_bits)
 {
 	pt_entry_t *slot;
 
 	slot = CMAP3;
 	*slot = pa | pte_bits;
 	invltlb();
 
 	return (CADDR3);
 }
 
 /*
  * Fill vm86 page table entries [basemem / 4, 160) with identity
  * mappings that are valid, writable and user-accessible.
  */
 static void
 __CONCAT(PMTYPE, basemem_setup)(u_int basemem)
 {
 	pt_entry_t *vm86pt;
 	int pg;
 
 	/*
 	 * Map pages between basemem and ISA_HOLE_START, if any, r/w into
 	 * the vm86 page table so that vm86 can scribble on them using
 	 * the vm86 map too.  XXX: why 2 ways for this and only 1 way for
 	 * page 0, at least as initialized here?
 	 *
 	 * NOTE(review): basemem is presumably in kilobytes (hence / 4 to
 	 * get a page index) and 160 presumably is the page index of
 	 * ISA_HOLE_START -- confirm.
 	 */
 	vm86pt = (pt_entry_t *)vm86paddr;
 	pg = basemem / 4;
 	while (pg < 160) {
 		vm86pt[pg] = (pg << PAGE_SHIFT) | PG_V | PG_RW | PG_U;
 		pg++;
 	}
 }
 
 /*
  * State saved by bios16_enter() and consumed by bios16_leave(); callers
  * only see it as an opaque "void *".
  */
 struct bios16_pmap_handle {
 	pt_entry_t	*pte;		/* temporary page table (malloc'ed) */
 	pd_entry_t	*ptd;		/* page directory slot that was replaced */
 	pt_entry_t	orig_ptd;	/* previous contents of *ptd */
 };
 
 static void *
 __CONCAT(PMTYPE, bios16_enter)(void)
 {
 	struct bios16_pmap_handle *h;
 
 	/*
 	 * no page table, so create one and install it.
 	 */
 	h = malloc(sizeof(struct bios16_pmap_handle), M_TEMP, M_WAITOK);
 	h->pte = (pt_entry_t *)malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
 	h->ptd = IdlePTD;
 	*h->pte = vm86phystk | PG_RW | PG_V;
 	h->orig_ptd = *h->ptd;
 	*h->ptd = vtophys(h->pte) | PG_RW | PG_V;
 	pmap_invalidate_all_int(kernel_pmap);	/* XXX insurance for now */
 	return (h);
 }
 
 /*
  * Undo bios16_enter(): restore the saved page directory entry, flush the
  * TLB, and release both the temporary page table and the handle itself.
  *
  * "arg" is the handle returned by bios16_enter().  The handle is an
  * allocation made by bios16_enter() whose type is private to this file,
  * so it is released here; it must not be used (or freed) by the caller
  * afterwards.
  */
 static void
 __CONCAT(PMTYPE, bios16_leave)(void *arg)
 {
 	struct bios16_pmap_handle *h;
 
 	h = arg;
 	*h->ptd = h->orig_ptd;		/* remove page table */
 	/*
 	 * XXX only needs to be invlpg(0) but that doesn't work on the 386
 	 */
 	pmap_invalidate_all_int(kernel_pmap);
 	free(h->pte, M_TEMP);		/* ... and free it */
 	free(h, M_TEMP);		/* the handle was malloc'ed as well */
 }
 
 /*
  * PMM(a) expands to a designated initializer that binds the "pm_a" member
  * of struct pmap_methods to this file's PMTYPE-prefixed implementation
  * of "a".  The table below therefore needs one PMM() line per method;
  * implementations not visible in this part of the file are expected to
  * be defined earlier in it.
  */
 #define	PMM(a)	\
 	.pm_##a = __CONCAT(PMTYPE, a),
 
 struct pmap_methods __CONCAT(PMTYPE, methods) = {
 	PMM(ksetrw)
 	PMM(remap_lower)
 	PMM(remap_lowptdi)
 	PMM(align_superpage)
 	PMM(quick_enter_page)
 	PMM(quick_remove_page)
 	PMM(trm_alloc)
 	PMM(trm_free)
 	PMM(get_map_low)
 	PMM(get_vm_maxuser_address)
 	PMM(kextract)
 	PMM(pg_frame)
 	PMM(sf_buf_map)
 	PMM(cp_slow0_map)
 	PMM(get_kcr3)
 	PMM(get_cr3)
 	PMM(cmap3)
 	PMM(basemem_setup)
 	PMM(set_nx)
 	PMM(bios16_enter)
 	PMM(bios16_leave)
 	PMM(bootstrap)
 	PMM(is_valid_memattr)
 	PMM(cache_bits)
 	PMM(ps_enabled)
 	PMM(pinit0)
 	PMM(pinit)
 	PMM(activate)
 	PMM(activate_boot)
 	PMM(advise)
 	PMM(clear_modify)
 	PMM(change_attr)
 	PMM(mincore)
 	PMM(copy)
 	PMM(copy_page)
 	PMM(copy_pages)
 	PMM(zero_page)
 	PMM(zero_page_area)
 	PMM(enter)
 	PMM(enter_object)
 	PMM(enter_quick)
 	PMM(kenter_temporary)
 	PMM(object_init_pt)
 	PMM(unwire)
 	PMM(page_exists_quick)
 	PMM(page_wired_mappings)
 	PMM(page_is_mapped)
 	PMM(remove_pages)
 	PMM(is_modified)
 	PMM(is_prefaultable)
 	PMM(is_referenced)
 	PMM(remove_write)
 	PMM(ts_referenced)
 	PMM(mapdev_attr)
 	PMM(unmapdev)
 	PMM(page_set_memattr)
 	PMM(extract)
 	PMM(extract_and_hold)
 	PMM(map)
 	PMM(qenter)
 	PMM(qremove)
 	PMM(release)
 	PMM(remove)
 	PMM(protect)
 	PMM(remove_all)
 	PMM(init)
 	PMM(init_pat)
 	PMM(growkernel)
 	PMM(invalidate_page)
 	PMM(invalidate_range)
 	PMM(invalidate_all)
 	PMM(invalidate_cache)
 	PMM(flush_page)
 	PMM(kenter)
 	PMM(kremove)
 };
Index: projects/nfsv42/sys/net/if_tap.h
===================================================================
--- projects/nfsv42/sys/net/if_tap.h	(revision 350367)
+++ projects/nfsv42/sys/net/if_tap.h	(revision 350368)
@@ -1,74 +1,74 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * BASED ON:
  * -------------------------------------------------------------------------
  *
  * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk>
  * Nottingham University 1987.
  */
 
 /*
  * $FreeBSD$
  * $Id: if_tap.h,v 0.7 2000/07/12 04:12:51 max Exp $
  */
 
 #ifndef _NET_IF_TAP_H_
 #define _NET_IF_TAP_H_
 
 #include <net/if_tun.h>
 
 /* maximum receive packet size (hard limit) */
 #define	TAPMRU		16384
 
 #define	tapinfo		tuninfo
 
 /*
  * ioctl's for get/set debug; these are aliases of TUN* ioctls, see net/if_tun.h
  * for details.
  */
 #define	TAPSDEBUG		TUNSDEBUG
 #define	TAPGDEBUG		TUNGDEBUG
 #define	TAPSIFINFO		TUNSIFINFO
 #define	TAPGIFINFO		TUNGIFINFO
-#define	TAPGIFNAME		_IOR('t', 93, struct ifreq)
+#define	TAPGIFNAME		TUNGIFNAME
 
 /* VMware ioctl's */
 #define VMIO_SIOCSIFFLAGS	_IOWINT('V', 0)
 #define VMIO_SIOCSKEEP		_IO('V', 1)
 #define VMIO_SIOCSIFBR		_IO('V', 2)
 #define VMIO_SIOCSLADRF		_IO('V', 3)
 
 /* XXX -- unimplemented */
 #define VMIO_SIOCSETMACADDR	_IO('V', 4)
 
 /* XXX -- not used? */
 #define VMIO_SIOCPORT		_IO('V', 5)
 #define VMIO_SIOCBRIDGE		_IO('V', 6)
 #define VMIO_SIOCNETIF		_IO('V', 7)
 
 #endif /* !_NET_IF_TAP_H_ */
Index: projects/nfsv42/sys/net/if_tun.h
===================================================================
--- projects/nfsv42/sys/net/if_tun.h	(revision 350367)
+++ projects/nfsv42/sys/net/if_tun.h	(revision 350368)
@@ -1,48 +1,49 @@
 /*	$NetBSD: if_tun.h,v 1.5 1994/06/29 06:36:27 cgd Exp $	*/
 
 /*-
  * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk>
  * Nottingham University 1987.
  *
  * This source may be freely distributed, however I would be interested
  * in any changes that are made.
  *
  * This driver takes packets off the IP i/f and hands them up to a
  * user process to have its wicked way with. This driver has it's
  * roots in a similar driver written by Phil Cockcroft (formerly) at
  * UCL. This driver is based much more on read/write/select mode of
  * operation though.
  *
  * $FreeBSD$
  */
 
 #ifndef _NET_IF_TUN_H_
 #define _NET_IF_TUN_H_
 
 /* Refer to if_tunvar.h for the softc stuff */
 
 /* Maximum transmit packet size (default) */
 #define	TUNMTU		1500
 
 /* Maximum receive packet size (hard limit) */
 #define	TUNMRU		65535
 
 /* Argument structure of the TUNSIFINFO and TUNGIFINFO ioctls. */
 struct tuninfo {
 	int	baudrate;		/* linespeed */
 	unsigned short	mtu;		/* maximum transmission unit */
 	u_char	type;			/* ethernet, tokenring, etc. */
 	u_char	dummy;			/* place holder */
 };
 
 /*
  * ioctl's for get/set debug, interface info and operating modes.
  *
  * NOTE(review): TUNSLMODE and TUNGIFNAME both use command number 93 in
  * group 't'.  They differ in direction (_IOW vs. _IOR) and in argument
  * type, which presumably keeps the encoded request values distinct --
  * confirm against the _IOW/_IOR definitions.
  */
 #define	TUNSDEBUG	_IOW('t', 90, int)
 #define	TUNGDEBUG	_IOR('t', 89, int)
 #define	TUNSIFINFO	_IOW('t', 91, struct tuninfo)
 #define	TUNGIFINFO	_IOR('t', 92, struct tuninfo)
 #define	TUNSLMODE	_IOW('t', 93, int)
+#define	TUNGIFNAME	_IOR('t', 93, struct ifreq)
 #define	TUNSIFMODE	_IOW('t', 94, int)
 #define	TUNSIFPID	_IO('t', 95)
 #define	TUNSIFHEAD	_IOW('t', 96, int)
 #define	TUNGIFHEAD	_IOR('t', 97, int)
 
 #endif /* !_NET_IF_TUN_H_ */
Index: projects/nfsv42/sys/net/if_tuntap.c
===================================================================
--- projects/nfsv42/sys/net/if_tuntap.c	(revision 350367)
+++ projects/nfsv42/sys/net/if_tuntap.c	(revision 350368)
@@ -1,1719 +1,1718 @@
 /*	$NetBSD: if_tun.c,v 1.14 1994/06/29 06:36:25 cgd Exp $	*/
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (C) 1999-2000 by Maksim Yevmenkin <m_evmenkin@yahoo.com>
  * All rights reserved.
  * Copyright (c) 2019 Kyle Evans <kevans@FreeBSD.org>
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * BASED ON:
  * -------------------------------------------------------------------------
  *
  * Copyright (c) 1988, Julian Onions <jpo@cs.nott.ac.uk>
  * Nottingham University 1987.
  *
  * This source may be freely distributed, however I would be interested
  * in any changes that are made.
  *
  * This driver takes packets off the IP i/f and hands them up to a
  * user process to have its wicked way with. This driver has it's
  * roots in a similar driver written by Phil Cockcroft (formerly) at
  * UCL. This driver is based much more on read/write/poll mode of
  * operation though.
  *
  * $FreeBSD$
  */
 
 #include "opt_inet.h"
 #include "opt_inet6.h"
 
 #include <sys/param.h>
 #include <sys/lock.h>
 #include <sys/priv.h>
 #include <sys/proc.h>
 #include <sys/systm.h>
 #include <sys/jail.h>
 #include <sys/mbuf.h>
 #include <sys/module.h>
 #include <sys/socket.h>
 #include <sys/eventhandler.h>
 #include <sys/fcntl.h>
 #include <sys/filio.h>
 #include <sys/sockio.h>
 #include <sys/sx.h>
 #include <sys/ttycom.h>
 #include <sys/poll.h>
 #include <sys/selinfo.h>
 #include <sys/signalvar.h>
 #include <sys/filedesc.h>
 #include <sys/kernel.h>
 #include <sys/sysctl.h>
 #include <sys/conf.h>
 #include <sys/uio.h>
 #include <sys/malloc.h>
 #include <sys/random.h>
 #include <sys/ctype.h>
 
 #include <net/ethernet.h>
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/if_clone.h>
 #include <net/if_dl.h>
 #include <net/if_media.h>
 #include <net/if_types.h>
 #include <net/netisr.h>
 #include <net/route.h>
 #include <net/vnet.h>
 #ifdef INET
 #include <netinet/in.h>
 #endif
 #include <net/bpf.h>
 #include <net/if_tap.h>
 #include <net/if_tun.h>
 
 #include <sys/queue.h>
 #include <sys/condvar.h>
 #include <security/mac/mac_framework.h>
 
 struct tuntap_driver;
 
 /*
  * Per-interface state shared by the tun, tap and vmnet drivers.
  *
  * tun_list is protected by global tunmtx.  Other mutable fields are
  * protected by tun->tun_mtx, or by their owning subsystem.  tun_dev is
  * static for the duration of a tunnel interface.
  */
 struct tuntap_softc {
 	TAILQ_ENTRY(tuntap_softc)	 tun_list;
 	struct cdev			*tun_dev;
 	u_short				 tun_flags;	/* misc flags */
 #define	TUN_OPEN	0x0001
 #define	TUN_INITED	0x0002
 #define	TUN_RCOLL	0x0004
 #define	TUN_IASET	0x0008
 #define	TUN_DSTADDR	0x0010
 #define	TUN_LMODE	0x0020
 #define	TUN_RWAIT	0x0040
 #define	TUN_ASYNC	0x0080
 #define	TUN_IFHEAD	0x0100
 #define	TUN_DYING	0x0200
 #define	TUN_L2		0x0400
 #define	TUN_VMNET	0x0800
 
 /*
  * The bits in TUN_DRIVER_IDENT_MASK select the driver: they are compared
  * against each tuntap_driver's ident_flags in tuntap_driver_from_flags().
  */
 #define	TUN_DRIVER_IDENT_MASK	(TUN_L2 | TUN_VMNET)
 #define	TUN_READY		(TUN_OPEN | TUN_INITED)
 
 	pid_t			 tun_pid;	/* owning pid */
 	struct ifnet		*tun_ifp;	/* the interface */
 	struct sigio		*tun_sigio;	/* async I/O info */
 	struct tuntap_driver	*tun_drv;	/* appropriate driver */
 	struct selinfo		 tun_rsel;	/* read select */
 	struct mtx		 tun_mtx;	/* softc field mutex */
 	struct cv		 tun_cv;	/* for ref'd dev destroy */
 	struct ether_addr	 tun_ether;	/* remote address */
 };
 #define	TUN2IFP(sc)	((sc)->tun_ifp)
 
 #define	TUNDEBUG	if (tundebug) if_printf
 
 #define	TUN_LOCK(tp)	mtx_lock(&(tp)->tun_mtx)
 #define	TUN_UNLOCK(tp)	mtx_unlock(&(tp)->tun_mtx)
 
 #define	TUN_VMIO_FLAG_MASK	0x0fff
 
 /*
  * All mutable global variables in if_tun are locked using tunmtx, with
  * the exception of tundebug, which is used unlocked, and the drivers' *clones,
  * which are static after setup.
  */
 static struct mtx tunmtx;
 static eventhandler_tag tag;
 static const char tunname[] = "tun";
 static const char tapname[] = "tap";
 static const char vmnetname[] = "vmnet";
 static MALLOC_DEFINE(M_TUN, tunname, "Tunnel Interface");
 static int tundebug = 0;
 static int tundclone = 1;
 static int tap_allow_uopen = 0;	/* allow user open() */
 static int tapuponopen = 0;	/* IFF_UP on open() */
 static int tapdclone = 1;	/* enable devfs cloning */
 
 static TAILQ_HEAD(,tuntap_softc)	tunhead = TAILQ_HEAD_INITIALIZER(tunhead);
 SYSCTL_INT(_debug, OID_AUTO, if_tun_debug, CTLFLAG_RW, &tundebug, 0, "");
 
 static struct sx tun_ioctl_sx;
 SX_SYSINIT(tun_ioctl_sx, &tun_ioctl_sx, "tun_ioctl");
 
 SYSCTL_DECL(_net_link);
 /* tun */
 static SYSCTL_NODE(_net_link, OID_AUTO, tun, CTLFLAG_RW, 0,
     "IP tunnel software network interface.");
 SYSCTL_INT(_net_link_tun, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tundclone, 0,
     "Enable legacy devfs interface creation.");
 
 /* tap */
 static SYSCTL_NODE(_net_link, OID_AUTO, tap, CTLFLAG_RW, 0,
     "Ethernet tunnel software network interface");
 SYSCTL_INT(_net_link_tap, OID_AUTO, user_open, CTLFLAG_RW, &tap_allow_uopen, 0,
     "Allow user to open /dev/tap (based on node permissions)");
 SYSCTL_INT(_net_link_tap, OID_AUTO, up_on_open, CTLFLAG_RW, &tapuponopen, 0,
     "Bring interface up when /dev/tap is opened");
 SYSCTL_INT(_net_link_tap, OID_AUTO, devfs_cloning, CTLFLAG_RWTUN, &tapdclone, 0,
     "Enable legacy devfs interface creation");
 SYSCTL_INT(_net_link_tap, OID_AUTO, debug, CTLFLAG_RW, &tundebug, 0, "");
 
 static int	tuntap_name2info(const char *name, int *unit, int *flags);
 static void	tunclone(void *arg, struct ucred *cred, char *name,
 		    int namelen, struct cdev **dev);
 static void	tuncreate(struct cdev *dev, struct tuntap_driver *);
 static int	tunifioctl(struct ifnet *, u_long, caddr_t);
 static void	tuninit(struct ifnet *);
 static void	tunifinit(void *xtp);
 static int	tuntapmodevent(module_t, int, void *);
 static int	tunoutput(struct ifnet *, struct mbuf *,
 		    const struct sockaddr *, struct route *ro);
 static void	tunstart(struct ifnet *);
 static void	tunstart_l2(struct ifnet *);
 
 static int	tun_clone_match(struct if_clone *ifc, const char *name);
 static int	tap_clone_match(struct if_clone *ifc, const char *name);
 static int	vmnet_clone_match(struct if_clone *ifc, const char *name);
 static int	tun_clone_create(struct if_clone *, char *, size_t, caddr_t);
 static int	tun_clone_destroy(struct if_clone *, struct ifnet *);
 
 static d_open_t		tunopen;
 static d_close_t	tunclose;
 static d_read_t		tunread;
 static d_write_t	tunwrite;
 static d_ioctl_t	tunioctl;
 static d_poll_t		tunpoll;
 static d_kqfilter_t	tunkqfilter;
 
 static int		tunkqread(struct knote *, long);
 static int		tunkqwrite(struct knote *, long);
 static void		tunkqdetach(struct knote *);
 
 /*
  * kqueue filter operations for the read side of a tunnel device.  No
  * attach hook is provided; detach and event evaluation are handled by
  * tunkqdetach() and tunkqread().
  */
 static struct filterops tun_read_filterops = {
 	.f_isfd =	1,
 	.f_attach =	NULL,
 	.f_detach =	tunkqdetach,
 	.f_event =	tunkqread,
 };
 
 /*
  * kqueue filter operations for the write side; identical to the read
  * side except that events are evaluated by tunkqwrite().
  */
 static struct filterops tun_write_filterops = {
 	.f_isfd =	1,
 	.f_attach =	NULL,
 	.f_detach =	tunkqdetach,
 	.f_event =	tunkqwrite,
 };
 
 /*
  * Table of the supported drivers.  Each entry carries its own cdevsw (the
  * entries differ only in d_name), the identifying flag combination that
  * tuntap_driver_from_flags() matches on, and the if_clone callbacks used
  * when the cloner is registered in vnet_tun_init().  unrhdr and clones
  * are not initialized here.
  */
 static struct tuntap_driver {
 	struct cdevsw		 cdevsw;
 	int			 ident_flags;
 	struct unrhdr		*unrhdr;
 	struct clonedevs	*clones;
 	ifc_match_t		*clone_match_fn;
 	ifc_create_t		*clone_create_fn;
 	ifc_destroy_t		*clone_destroy_fn;
 } tuntap_drivers[] = {
 	/* tun: neither TUN_L2 nor TUN_VMNET */
 	{
 		.ident_flags =	0,
 		.cdevsw =	{
 		    .d_version =	D_VERSION,
 		    .d_flags =		D_NEEDMINOR,
 		    .d_open =		tunopen,
 		    .d_close =		tunclose,
 		    .d_read =		tunread,
 		    .d_write =		tunwrite,
 		    .d_ioctl =		tunioctl,
 		    .d_poll =		tunpoll,
 		    .d_kqfilter =	tunkqfilter,
 		    .d_name =		tunname,
 		},
 		.clone_match_fn =	tun_clone_match,
 		.clone_create_fn =	tun_clone_create,
 		.clone_destroy_fn =	tun_clone_destroy,
 	},
 	/* tap: TUN_L2 only */
 	{
 		.ident_flags =	TUN_L2,
 		.cdevsw =	{
 		    .d_version =	D_VERSION,
 		    .d_flags =		D_NEEDMINOR,
 		    .d_open =		tunopen,
 		    .d_close =		tunclose,
 		    .d_read =		tunread,
 		    .d_write =		tunwrite,
 		    .d_ioctl =		tunioctl,
 		    .d_poll =		tunpoll,
 		    .d_kqfilter =	tunkqfilter,
 		    .d_name =		tapname,
 		},
 		.clone_match_fn =	tap_clone_match,
 		.clone_create_fn =	tun_clone_create,
 		.clone_destroy_fn =	tun_clone_destroy,
 	},
 	/* vmnet: TUN_L2 and TUN_VMNET */
 	{
 		.ident_flags =	TUN_L2 | TUN_VMNET,
 		.cdevsw =	{
 		    .d_version =	D_VERSION,
 		    .d_flags =		D_NEEDMINOR,
 		    .d_open =		tunopen,
 		    .d_close =		tunclose,
 		    .d_read =		tunread,
 		    .d_write =		tunwrite,
 		    .d_ioctl =		tunioctl,
 		    .d_poll =		tunpoll,
 		    .d_kqfilter =	tunkqfilter,
 		    .d_name =		vmnetname,
 		},
 		.clone_match_fn =	vmnet_clone_match,
 		.clone_create_fn =	tun_clone_create,
 		.clone_destroy_fn =	tun_clone_destroy,
 	},
 };
 
 /*
  * Per-vnet record tying one tuntap_drivers[] entry to the if_clone
  * instance registered for it; allocated in vnet_tun_init() and released
  * in vnet_tun_uninit().
  */
 struct tuntap_driver_cloner {
 	SLIST_ENTRY(tuntap_driver_cloner)	 link;
 	struct tuntap_driver			*drv;
 	struct if_clone				*cloner;
 };
 
 VNET_DEFINE_STATIC(SLIST_HEAD(, tuntap_driver_cloner), tuntap_driver_cloners) =
     SLIST_HEAD_INITIALIZER(tuntap_driver_cloners);
 
 #define	V_tuntap_driver_cloners	VNET(tuntap_driver_cloners)
 
 /*
  * Resolve a device name to its unit number and driver identification
  * flags.  Either output pointer may be NULL if the caller has no use for
  * it.  A bare driver name (no unit suffix) yields a unit of -1.  Returns
  * EINVAL for a NULL name, ENXIO if no registered driver matches, and 0
  * on success.  Must be called with correct vnet context.
  */
 static int
 tuntap_name2info(const char *name, int *outunit, int *outflags)
 {
 	struct tuntap_driver_cloner *entry;
 	struct tuntap_driver *drv;
 	char *dname;
 	int unit;
 
 	if (name == NULL)
 		return (EINVAL);
 
 	/*
 	 * dev_stdclone takes a non-const name because it can hand back a
 	 * char * through its second parameter.  We always pass NULL there
 	 * and it does not modify the string, so dropping the qualifier is
 	 * harmless.
 	 */
 	dname = __DECONST(char *, name);
 
 	KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners),
 	    ("tuntap_driver_cloners failed to initialize"));
 	SLIST_FOREACH(entry, &V_tuntap_driver_cloners, link) {
 		KASSERT(entry->drv != NULL,
 		    ("tuntap_driver_cloners entry not properly initialized"));
 		drv = entry->drv;
 
 		/* Exact driver name first, then "<driver><unit>". */
 		if (strcmp(name, drv->cdevsw.d_name) == 0)
 			unit = -1;
 		else if (dev_stdclone(dname, NULL, drv->cdevsw.d_name,
 		    &unit) != 1)
 			continue;
 
 		if (outunit != NULL)
 			*outunit = unit;
 		if (outflags != NULL)
 			*outflags = drv->ident_flags;
 		return (0);
 	}
 
 	return (ENXIO);
 }
 
 /*
  * Get driver information from a set of flags specified.  Masks the identifying
  * part of the flags and compares it against all of the available
  * tuntap_drivers. Must be called with correct vnet context.
  */
 static struct tuntap_driver *
 tuntap_driver_from_flags(int tun_flags)
 {
 	struct tuntap_driver *drv;
 	struct tuntap_driver_cloner *drvc;
 
 	KASSERT(!SLIST_EMPTY(&V_tuntap_driver_cloners),
 	    ("tuntap_driver_cloners failed to initialize"));
 	SLIST_FOREACH(drvc, &V_tuntap_driver_cloners, link) {
 		KASSERT(drvc->drv != NULL,
 		    ("tuntap_driver_cloners entry not properly initialized"));
 		drv = drvc->drv;
 		if ((tun_flags & TUN_DRIVER_IDENT_MASK) == drv->ident_flags)
 			return (drv);
 	}
 
 	return (NULL);
 }
 
 
 
 /*
  * if_clone match callback for tun: returns 1 when "name" resolves to a
  * driver that does not have TUN_L2 set, 0 otherwise.
  */
 static int
 tun_clone_match(struct if_clone *ifc, const char *name)
 {
 	int tunflags;
 
 	return (tuntap_name2info(name, NULL, &tunflags) == 0 &&
 	    (tunflags & TUN_L2) == 0);
 }
 
 /*
  * if_clone match callback for tap: returns 1 when "name" resolves to a
  * driver with TUN_L2 set and TUN_VMNET clear, 0 otherwise.
  */
 static int
 tap_clone_match(struct if_clone *ifc, const char *name)
 {
 	int tunflags;
 
 	return (tuntap_name2info(name, NULL, &tunflags) == 0 &&
 	    (tunflags & (TUN_L2 | TUN_VMNET)) == TUN_L2);
 }
 
 /*
  * if_clone match callback for vmnet: returns 1 when "name" resolves to
  * a driver with TUN_VMNET set, 0 otherwise.
  */
 static int
 vmnet_clone_match(struct if_clone *ifc, const char *name)
 {
 	int tunflags;
 
 	return (tuntap_name2info(name, NULL, &tunflags) == 0 &&
 	    (tunflags & TUN_VMNET) != 0);
 }
 
 /*
  * if_clone create callback shared by the tun, tap and vmnet cloners.
  *
  * "name" selects the driver and, optionally, the unit.  When no unit is
  * given, a free one is allocated.  On success the final interface name
  * is written back into "name", the character device is looked up or
  * created, and the interface is set up by tuncreate().
  *
  * Returns 0 on success, the error from tuntap_name2info() if the name is
  * not recognized, ENXIO if no driver matches, EEXIST if the requested
  * unit is already taken, or ENOSPC if no unit number is left.
  */
 static int
 tun_clone_create(struct if_clone *ifc, char *name, size_t len, caddr_t params)
 {
 	struct tuntap_driver *drv;
 	struct cdev *dev;
 	int err, i, tunflags, unit;
 
 	tunflags = 0;
 	/* The name here tells us exactly what we're creating */
 	err = tuntap_name2info(name, &unit, &tunflags);
 	if (err != 0)
 		return (err);
 
 	drv = tuntap_driver_from_flags(tunflags);
 	if (drv == NULL)
 		return (ENXIO);
 
 	if (unit != -1) {
 		/* If this unit number is still available that's okay. */
 		if (alloc_unr_specific(drv->unrhdr, unit) == -1)
 			return (EEXIST);
 	} else {
 		/*
 		 * alloc_unr() reports an exhausted unit space with -1;
 		 * without this check we would go on to name and create a
 		 * device with unit -1.
 		 */
 		unit = alloc_unr(drv->unrhdr);
 		if (unit == -1)
 			return (ENOSPC);
 	}
 
 	snprintf(name, IFNAMSIZ, "%s%d", drv->cdevsw.d_name, unit);
 
 	/* find any existing device, or allocate new unit number */
 	i = clone_create(&drv->clones, &drv->cdevsw, &unit, &dev, 0);
 	if (i) {
 		/* No preexisting struct cdev *, create one */
 		dev = make_dev(&drv->cdevsw, unit, UID_UUCP, GID_DIALER, 0600,
 		    "%s%d", drv->cdevsw.d_name, unit);
 	}
 
 	tuncreate(dev, drv);
 
 	return (0);
 }
 
 /*
  * Device-clone handler: when "name" names one of our drivers (with or
  * without a unit number) and the credential is allowed to clone, create
  * or look up the matching character device, return it through *dev, and
  * request creation of the interface via if_clone_create().  Does nothing
  * if *dev is already set.  The whole lookup runs in the vnet of "cred".
  */
 static void
 tunclone(void *arg, struct ucred *cred, char *name, int namelen,
     struct cdev **dev)
 {
 	char devname[SPECNAMELEN + 1];
 	struct tuntap_driver *drv;
 	int append_unit, i, u, tunflags;
 	bool mayclone;
 
 	/* Another handler has already resolved this name. */
 	if (*dev != NULL)
 		return;
 
 	tunflags = 0;
 	CURVNET_SET(CRED_TO_VNET(cred));
 	if (tuntap_name2info(name, &u, &tunflags) != 0)
 		goto out;	/* Not recognized */
 
 	if (u != -1 && u > IF_MAXUNIT)
 		goto out;	/* Unit number too high */
 
 	mayclone = priv_check_cred(cred, PRIV_NET_IFCREATE) == 0;
 	if ((tunflags & TUN_L2) != 0) {
 		/* tap/vmnet allow user open with a sysctl */
 		mayclone = (mayclone || tap_allow_uopen) && tapdclone;
 	} else {
 		mayclone = mayclone && tundclone;
 	}
 
 	/*
 	 * If tun cloning is enabled, only the superuser can create an
 	 * interface.
 	 */
 	if (!mayclone)
 		goto out;
 
 	/* A bare driver name gets the allocated unit appended below. */
 	if (u == -1)
 		append_unit = 1;
 	else
 		append_unit = 0;
 
 	drv = tuntap_driver_from_flags(tunflags);
 	if (drv == NULL)
 		goto out;
 
 	/* find any existing device, or allocate new unit number */
 	i = clone_create(&drv->clones, &drv->cdevsw, &u, dev, 0);
 	if (i) {
 		if (append_unit) {
 			namelen = snprintf(devname, sizeof(devname), "%s%d",
 			    name, u);
 			name = devname;
 		}
 		/* No preexisting struct cdev *, create one */
 		*dev = make_dev_credf(MAKEDEV_REF, &drv->cdevsw, u, cred,
 		    UID_UUCP, GID_DIALER, 0600, "%s", name);
 	}
 
 	if_clone_create(name, namelen, NULL);
 out:
 	/* Every exit after CURVNET_SET() must come through here. */
 	CURVNET_RESTORE();
 }
 
 /*
  * Tear down a tunnel interface and free its softc.  The caller is
  * expected to have removed "tp" from tunhead already (both callers in
  * this file do so before calling).
  */
 static void
 tun_destroy(struct tuntap_softc *tp)
 {
 
 	/*
 	 * Flag the softc as dying.  If the device is still open, sleep on
 	 * tun_cv (which also drops the mutex) until signalled -- presumably
 	 * by the close path; confirm against tunclose().
 	 */
 	TUN_LOCK(tp);
 	tp->tun_flags |= TUN_DYING;
 	if ((tp->tun_flags & TUN_OPEN) != 0)
 		cv_wait_unlock(&tp->tun_cv, &tp->tun_mtx);
 	else
 		TUN_UNLOCK(tp);
 
 	CURVNET_SET(TUN2IFP(tp)->if_vnet);
 
 	/* Device node and select/kqueue state go first. */
 	destroy_dev(tp->tun_dev);
 	seldrain(&tp->tun_rsel);
 	knlist_clear(&tp->tun_rsel.si_note, 0);
 	knlist_destroy(&tp->tun_rsel.si_note);
 	/* Layer-2 interfaces are detached through the ethernet layer. */
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		ether_ifdetach(TUN2IFP(tp));
 	} else {
 		bpfdetach(TUN2IFP(tp));
 		if_detach(TUN2IFP(tp));
 	}
 	/* Clear the back pointer under the ioctl lock. */
 	sx_xlock(&tun_ioctl_sx);
 	TUN2IFP(tp)->if_softc = NULL;
 	sx_xunlock(&tun_ioctl_sx);
 	/* Give the unit number back before the ifnet goes away. */
 	free_unr(tp->tun_drv->unrhdr, TUN2IFP(tp)->if_dunit);
 	if_free(TUN2IFP(tp));
 	mtx_destroy(&tp->tun_mtx);
 	cv_destroy(&tp->tun_cv);
 	free(tp, M_TUN);
 	CURVNET_RESTORE();
 }
 
 /*
  * Interface-cloner destroy callback.  Takes the softc behind ifp off the
  * global tunhead list (under tunmtx) and hands it to tun_destroy().
  * Always returns 0.
  */
 static int
 tun_clone_destroy(struct if_clone *ifc __unused, struct ifnet *ifp)
 {
 	struct tuntap_softc *sc;
 
 	sc = ifp->if_softc;
 
 	mtx_lock(&tunmtx);
 	TAILQ_REMOVE(&tunhead, sc, tun_list);
 	mtx_unlock(&tunmtx);
 
 	tun_destroy(sc);
 	return (0);
 }
 
 /*
  * Per-vnet initialisation: for every entry in tuntap_drivers, register an
  * interface cloner named after the driver's cdevsw and remember it on the
  * vnet-local V_tuntap_driver_cloners list so it can be detached later.
  */
 static void
 vnet_tun_init(const void *unused __unused)
 {
 	struct tuntap_driver_cloner *entry;
 	struct tuntap_driver *d;
 	int idx;
 
 	for (idx = 0; idx < nitems(tuntap_drivers); idx++) {
 		d = &tuntap_drivers[idx];
 
 		entry = malloc(sizeof(*entry), M_TUN, M_WAITOK | M_ZERO);
 		entry->drv = d;
 		entry->cloner = if_clone_advanced(d->cdevsw.d_name, 0,
 		    d->clone_match_fn, d->clone_create_fn,
 		    d->clone_destroy_fn);
 		SLIST_INSERT_HEAD(&V_tuntap_driver_cloners, entry, link);
 	}
 }
 VNET_SYSINIT(vnet_tun_init, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_tun_init, NULL);
 
 /*
  * Per-vnet teardown: pop every entry off V_tuntap_driver_cloners, detach
  * its interface cloner and release the bookkeeping record.
  */
 static void
 vnet_tun_uninit(const void *unused __unused)
 {
 	struct tuntap_driver_cloner *entry;
 
 	while ((entry = SLIST_FIRST(&V_tuntap_driver_cloners)) != NULL) {
 		SLIST_REMOVE_HEAD(&V_tuntap_driver_cloners, link);
 
 		if_clone_detach(entry->cloner);
 		free(entry, M_TUN);
 	}
 }
 VNET_SYSUNINIT(vnet_tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY,
     vnet_tun_uninit, NULL);
 
 /*
  * Global teardown.  Stops new devices from being cloned, destroys every
  * softc still on tunhead, then releases the per-driver unit allocators
  * and clone lists and finally the global mutex.
  */
 static void
 tun_uninit(const void *unused __unused)
 {
 	struct tuntap_softc *sc;
 	struct tuntap_driver *d;
 	int idx;
 
 	EVENTHANDLER_DEREGISTER(dev_clone, tag);
 	drain_dev_clone_events();
 
 	/*
 	 * tunmtx only protects the list manipulation; it is dropped around
 	 * each tun_destroy() call.
 	 */
 	for (;;) {
 		mtx_lock(&tunmtx);
 		sc = TAILQ_FIRST(&tunhead);
 		if (sc == NULL) {
 			mtx_unlock(&tunmtx);
 			break;
 		}
 		TAILQ_REMOVE(&tunhead, sc, tun_list);
 		mtx_unlock(&tunmtx);
 		tun_destroy(sc);
 	}
 
 	for (idx = 0; idx < nitems(tuntap_drivers); idx++) {
 		d = &tuntap_drivers[idx];
 		delete_unrhdr(d->unrhdr);
 		clone_cleanup(&d->clones);
 	}
 	mtx_destroy(&tunmtx);
 }
 SYSUNINIT(tun_uninit, SI_SUB_PROTO_IF, SI_ORDER_ANY, tun_uninit, NULL);
 
 /*
  * Module event handler.
  *
  * MOD_LOAD initialises the global mutex, the per-driver clone lists and
  * unit allocators, and hooks tunclone() into the dev_clone event; it
  * returns ENOMEM if that registration fails.  MOD_UNLOAD is a no-op here
  * because the real work is done in tun_uninit.  Any other event yields
  * EOPNOTSUPP.
  */
 static int
 tuntapmodevent(module_t mod, int type, void *data)
 {
 	struct tuntap_driver *d;
 	int idx;
 
 	if (type == MOD_UNLOAD) {
 		/* See tun_uninit, so it's done after the vnet_sysuninit() */
 		return (0);
 	}
 	if (type != MOD_LOAD)
 		return (EOPNOTSUPP);
 
 	mtx_init(&tunmtx, "tunmtx", NULL, MTX_DEF);
 	for (idx = 0; idx < nitems(tuntap_drivers); idx++) {
 		d = &tuntap_drivers[idx];
 		clone_setup(&d->clones);
 		d->unrhdr = new_unrhdr(0, IF_MAXUNIT, &tunmtx);
 	}
 	tag = EVENTHANDLER_REGISTER(dev_clone, tunclone, 0, 1000);
 	if (tag == NULL)
 		return (ENOMEM);
 
 	return (0);
 }
 
 /*
  * Module descriptor: names the module "if_tuntap" and routes module
  * events to tuntapmodevent().  The trailing 0 is the third initializer
  * of moduledata_t (presumably private data, unused here -- confirm
  * against the moduledata_t definition).
  */
 static moduledata_t tuntap_mod = {
 	"if_tuntap",
 	tuntapmodevent,
 	0
 };
 
 /* Register the module and declare its version as 1. */
 DECLARE_MODULE(if_tuntap, tuntap_mod, SI_SUB_PSEUDO, SI_ORDER_ANY);
 MODULE_VERSION(if_tuntap, 1);
 
 static void
 tunstart(struct ifnet *ifp)
 {
 	struct tuntap_softc *tp = ifp->if_softc;
 	struct mbuf *m;
 
 	TUNDEBUG(ifp, "starting\n");
 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		IFQ_LOCK(&ifp->if_snd);
 		IFQ_POLL_NOLOCK(&ifp->if_snd, m);
 		if (m == NULL) {
 			IFQ_UNLOCK(&ifp->if_snd);
 			return;
 		}
 		IFQ_UNLOCK(&ifp->if_snd);
 	}
 
 	TUN_LOCK(tp);
 	if (tp->tun_flags & TUN_RWAIT) {
 		tp->tun_flags &= ~TUN_RWAIT;
 		wakeup(tp);
 	}
 	selwakeuppri(&tp->tun_rsel, PZERO + 1);
 	KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 	if (tp->tun_flags & TUN_ASYNC && tp->tun_sigio) {
 		TUN_UNLOCK(tp);
 		pgsigio(&tp->tun_sigio, SIGIO, 0);
 	} else
 		TUN_UNLOCK(tp);
 }
 
 /*
  * tunstart_l2
  *
  * if_start handler for the layer-2 (tap/vmnet) personality: queue packets
  * from higher level ready to put out.
  *
  * If the device is not ready (and not in VMnet mode) the send queue is
  * drained and every dropped packet is counted as an output error.
  * Otherwise, when the queue is non-empty, readers, SIGIO recipients and
  * select/kqueue waiters are notified.
  */
 static void
 tunstart_l2(struct ifnet *ifp)
 {
 	struct tuntap_softc	*tp = ifp->if_softc;
 
 	TUNDEBUG(ifp, "starting\n");
 
 	/*
 	 * do not junk pending output if we are in VMnet mode.
 	 * XXX: can this do any harm because of queue overflow?
 	 */
 
 	TUN_LOCK(tp);
 	if (((tp->tun_flags & TUN_VMNET) == 0) &&
 	    ((tp->tun_flags & TUN_READY) != TUN_READY)) {
 		struct mbuf *m;
 
 		/* tun_flags is read with the softc lock held here. */
 		TUNDEBUG(ifp, "not ready, tun_flags = 0x%x\n", tp->tun_flags);
 
 		/* Discard everything queued, one output error per packet. */
 		for (;;) {
 			IF_DEQUEUE(&ifp->if_snd, m);
 			if (m != NULL) {
 				m_freem(m);
 				if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			} else
 				break;
 		}
 		TUN_UNLOCK(tp);
 
 		return;
 	}
 
 	/* OACTIVE is set for the duration of the notification below. */
 	ifp->if_drv_flags |= IFF_DRV_OACTIVE;
 
 	if (!IFQ_IS_EMPTY(&ifp->if_snd)) {
 		if (tp->tun_flags & TUN_RWAIT) {
 			tp->tun_flags &= ~TUN_RWAIT;
 			wakeup(tp);
 		}
 
 		/* The softc lock is dropped around the SIGIO delivery. */
 		if ((tp->tun_flags & TUN_ASYNC) && (tp->tun_sigio != NULL)) {
 			TUN_UNLOCK(tp);
 			pgsigio(&tp->tun_sigio, SIGIO, 0);
 			TUN_LOCK(tp);
 		}
 
 		selwakeuppri(&tp->tun_rsel, PZERO+1);
 		KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 		/* Note: bumped once per call, not once per queued packet. */
 		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); /* obytes are counted in ether_output */
 	}
 
 	ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 	TUN_UNLOCK(tp);
 } /* tunstart_l2 */
 
 
 /*
  * Allocate and attach the softc and network interface backing the
  * character device dev, using driver drv to pick the personality
  * (layer 2 when drv->ident_flags contains TUN_L2, layer 3 otherwise).
  * On return dev->si_drv1 points at the new softc and TUN_INITED is set.
  *
  * XXX: should return an error code so it can fail.  As written it
  * panics when if_alloc() fails.
  */
 static void
 tuncreate(struct cdev *dev, struct tuntap_driver *drv)
 {
 	struct tuntap_softc *sc;
 	struct ifnet *ifp;
 	struct ether_addr eaddr;
 	int iflags;
 	u_char type;
 
 	sc = malloc(sizeof(*sc), M_TUN, M_WAITOK | M_ZERO);
 	mtx_init(&sc->tun_mtx, "tun_mtx", NULL, MTX_DEF);
 	cv_init(&sc->tun_cv, "tun_condvar");
 	sc->tun_flags = drv->ident_flags;
 	sc->tun_dev = dev;
 	sc->tun_drv = drv;
 	/* Make the softc visible on the global list walked by tun_uninit. */
 	mtx_lock(&tunmtx);
 	TAILQ_INSERT_TAIL(&tunhead, sc, tun_list);
 	mtx_unlock(&tunmtx);
 
 	/* Interface type and flags depend on the personality. */
 	iflags = IFF_MULTICAST;
 	if ((sc->tun_flags & TUN_L2) != 0) {
 		type = IFT_ETHER;
 		iflags |= IFF_BROADCAST | IFF_SIMPLEX;
 	} else {
 		type = IFT_PPP;
 		iflags |= IFF_POINTOPOINT;
 	}
 	ifp = sc->tun_ifp = if_alloc(type);
 	if (ifp == NULL)
 		panic("%s%d: failed to if_alloc() interface.\n",
 		    drv->cdevsw.d_name, dev2unit(dev));
 	ifp->if_softc = sc;
 	/* The interface is named after the driver and the device unit. */
 	if_initname(ifp, drv->cdevsw.d_name, dev2unit(dev));
 	ifp->if_ioctl = tunifioctl;
 	ifp->if_flags = iflags;
 	IFQ_SET_MAXLEN(&ifp->if_snd, ifqmaxlen);
 	/* kqueue notes on this device are protected by the softc mutex. */
 	knlist_init_mtx(&sc->tun_rsel.si_note, &sc->tun_mtx);
 	ifp->if_capabilities |= IFCAP_LINKSTATE;
 	ifp->if_capenable |= IFCAP_LINKSTATE;
 
 	if ((sc->tun_flags & TUN_L2) != 0) {
 		/* Layer 2: Ethernet attach with a generated MAC address. */
 		ifp->if_mtu = ETHERMTU;
 		ifp->if_init = tunifinit;
 		ifp->if_start = tunstart_l2;
 
 		ether_gen_addr(ifp, &eaddr);
 		ether_ifattach(ifp, eaddr.octet);
 	} else {
 		/* Layer 3: plain attach plus a DLT_NULL bpf tap. */
 		ifp->if_mtu = TUNMTU;
 		ifp->if_start = tunstart;
 		ifp->if_output = tunoutput;
 
 		ifp->if_snd.ifq_drv_maxlen = 0;
 		IFQ_SET_READY(&ifp->if_snd);
 
 		if_attach(ifp);
 		bpfattach(ifp, DLT_NULL, sizeof(u_int32_t));
 	}
 	dev->si_drv1 = sc;
 
 	TUN_LOCK(sc);
 	sc->tun_flags |= TUN_INITED;
 	TUN_UNLOCK(sc);
 
 	TUNDEBUG(ifp, "interface %s is created, minor = %#x\n",
 	    ifp->if_xname, dev2unit(dev));
 }
 
 /*
  * cdevsw open handler.
  *
  * Derives the personality flags from the device name, applies the tap
  * privilege check, creates the softc on first open, and marks the
  * device open on behalf of the calling process.
  *
  * Returns 0 on success; the error from tuntap_name2info() or
  * priv_check(); ENXIO when no driver matches the flags; EBUSY when the
  * device is already open or is being destroyed.
  */
 static int
 tunopen(struct cdev *dev, int flag, int mode, struct thread *td)
 {
 	struct ifnet	*ifp;
 	struct tuntap_driver *drv;
 	struct tuntap_softc *tp;
 	int error, tunflags;
 
 	tunflags = 0;
 	CURVNET_SET(TD_TO_VNET(td));
 	error = tuntap_name2info(dev->si_name, NULL, &tunflags);
 	if (error != 0) {
 		CURVNET_RESTORE();
 		return (error);	/* Shouldn't happen */
 	}
 
 	if ((tunflags & TUN_L2) != 0) {
 		/* Restrict? */
 		/* Layer-2 opens need PRIV_NET_TAP unless tap_allow_uopen. */
 		if (tap_allow_uopen == 0) {
 			error = priv_check(td, PRIV_NET_TAP);
 			if (error != 0) {
 				CURVNET_RESTORE();
 				return (error);
 			}
 		}
 	}
 
 	/*
 	 * XXXRW: Non-atomic test and set of dev->si_drv1 requires
 	 * synchronization.
 	 */
 	tp = dev->si_drv1;
 	if (!tp) {
 		/* First open of this device node: build softc + interface. */
 		drv = tuntap_driver_from_flags(tunflags);
 		if (drv == NULL) {
 			CURVNET_RESTORE();
 			return (ENXIO);
 		}
 		tuncreate(dev, drv);
 		tp = dev->si_drv1;
 	}
 
 	TUN_LOCK(tp);
 	/* Single opener only, and never while tun_destroy() is running. */
 	if ((tp->tun_flags & (TUN_OPEN | TUN_DYING)) != 0) {
 		TUN_UNLOCK(tp);
 		CURVNET_RESTORE();
 		return (EBUSY);
 	}
 
 	ifp = TUN2IFP(tp);
 
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		/* Seed tun_ether with the interface's link-level address. */
 		bcopy(IF_LLADDR(ifp), tp->tun_ether.octet,
 		    sizeof(tp->tun_ether.octet));
 
 		ifp->if_drv_flags |= IFF_DRV_RUNNING;
 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 
 		if (tapuponopen)
 			ifp->if_flags |= IFF_UP;
 	}
 
 	/* Remember the opener; tunclose() compares against this pid. */
 	tp->tun_pid = td->td_proc->p_pid;
 	tp->tun_flags |= TUN_OPEN;
 
 	if_link_state_change(ifp, LINK_STATE_UP);
 	TUNDEBUG(ifp, "open\n");
 	TUN_UNLOCK(tp);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * tunclose - close the device - mark i/f down & delete
  * routing info
  *
  * Only a close from the process recorded in tun_pid tears anything
  * down; a close from any other process returns immediately.  Always
  * returns 0.
  */
 static	int
 tunclose(struct cdev *dev, int foo, int bar, struct thread *td)
 {
 	struct tuntap_softc *tp;
 	struct ifnet *ifp;
 	bool l2tun;
 
 	tp = dev->si_drv1;
 	ifp = TUN2IFP(tp);
 
 	TUN_LOCK(tp);
 	/*
 	 * Simply close the device if this isn't the controlling process.  This
 	 * may happen if, for instance, the tunnel has been handed off to
 	 * another process.  The original controller should be able to close it
 	 * without putting us into an inconsistent state.
 	 */
 	if (td->td_proc->p_pid != tp->tun_pid) {
 		TUN_UNLOCK(tp);
 		return (0);
 	}
 
 	/*
 	 * junk all pending output
 	 */
 	CURVNET_SET(ifp->if_vnet);
 
 	l2tun = false;
 	if ((tp->tun_flags & TUN_L2) != 0) {
 		l2tun = true;
 		IF_DRAIN(&ifp->if_snd);
 	} else {
 		IFQ_PURGE(&ifp->if_snd);
 	}
 
 	/* For vmnet, we won't do most of the address/route bits */
 	/* The same shortcut applies to an L2 interface with IFF_LINK0 set. */
 	if ((tp->tun_flags & TUN_VMNET) != 0 ||
 	    (l2tun && (ifp->if_flags & IFF_LINK0) != 0))
 		goto out;
 
 	/* The softc lock is dropped around if_down(). */
 	if (ifp->if_flags & IFF_UP) {
 		TUN_UNLOCK(tp);
 		if_down(ifp);
 		TUN_LOCK(tp);
 	}
 
 	/* Delete all addresses and routes which reference this interface. */
 	if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
 		struct ifaddr *ifa;
 
 		ifp->if_drv_flags &= ~IFF_DRV_RUNNING;
 		/* Route and address removal runs without the softc lock. */
 		TUN_UNLOCK(tp);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			/* deal w/IPv4 PtP destination; unlocked read */
 			if (!l2tun && ifa->ifa_addr->sa_family == AF_INET) {
 				rtinit(ifa, (int)RTM_DELETE,
 				    tp->tun_flags & TUN_DSTADDR ? RTF_HOST : 0);
 			} else {
 				rtinit(ifa, (int)RTM_DELETE, 0);
 			}
 		}
 		if_purgeaddrs(ifp);
 		TUN_LOCK(tp);
 	}
 
 out:
 	if_link_state_change(ifp, LINK_STATE_DOWN);
 	CURVNET_RESTORE();
 
 	/* Drop the SIGIO owner and wake anyone still polling the device. */
 	funsetown(&tp->tun_sigio);
 	selwakeuppri(&tp->tun_rsel, PZERO + 1);
 	KNOTE_LOCKED(&tp->tun_rsel.si_note, 0);
 	TUNDEBUG (ifp, "closed\n");
 	tp->tun_flags &= ~TUN_OPEN;
 	tp->tun_pid = 0;
 
 	/* tun_destroy() sleeps on tun_cv while the device is open. */
 	cv_broadcast(&tp->tun_cv);
 	TUN_UNLOCK(tp);
 	return (0);
 }
 
 /*
  * Mark the interface running.
  *
  * Layer 3: also mark it up, stamp if_lastchange and (with INET) scan the
  * address list to record in tun_flags whether a non-zero IPv4 local
  * address (TUN_IASET) and destination address (TUN_DSTADDR) are present.
  * Layer 2: clear OACTIVE and kick the start routine.
  */
 static void
 tuninit(struct ifnet *ifp)
 {
 	struct tuntap_softc *tp = ifp->if_softc;
 #ifdef INET
 	struct ifaddr *ifa;
 #endif
 
 	TUNDEBUG(ifp, "tuninit\n");
 
 	TUN_LOCK(tp);
 	ifp->if_drv_flags |= IFF_DRV_RUNNING;
 	if ((tp->tun_flags & TUN_L2) == 0) {
 		ifp->if_flags |= IFF_UP;
 		getmicrotime(&ifp->if_lastchange);
 #ifdef INET
 		if_addr_rlock(ifp);
 		CK_STAILQ_FOREACH(ifa, &ifp->if_addrhead, ifa_link) {
 			if (ifa->ifa_addr->sa_family == AF_INET) {
 				struct sockaddr_in *si;
 
 				si = (struct sockaddr_in *)ifa->ifa_addr;
 				if (si->sin_addr.s_addr)
 					tp->tun_flags |= TUN_IASET;
 
 				/* ifa_dstaddr may be NULL, hence the check. */
 				si = (struct sockaddr_in *)ifa->ifa_dstaddr;
 				if (si && si->sin_addr.s_addr)
 					tp->tun_flags |= TUN_DSTADDR;
 			}
 		}
 		if_addr_runlock(ifp);
 #endif
 		TUN_UNLOCK(tp);
 	} else {
 		ifp->if_drv_flags &= ~IFF_DRV_OACTIVE;
 		/* tunstart_l2() takes the softc lock itself; drop it first. */
 		TUN_UNLOCK(tp);
 		/* attempt to start output */
 		tunstart_l2(ifp);
 	}
 
 }
 
 /*
  * if_init entry point; tuncreate() installs it for layer-2 interfaces
  * only.  xtp is the softc, and the work is delegated to tuninit().
  */
 static void
 tunifinit(void *xtp)
 {
 	struct tuntap_softc *sc = xtp;
 
 	tuninit(sc->tun_ifp);
 }
 
 /*
  * Process an ioctl request issued against the network interface (as
  * opposed to the character device, which is handled by tunioctl()).
  *
  * The whole handler runs under tun_ioctl_sx; tun_destroy() clears
  * if_softc under the same lock, so a NULL softc here means the interface
  * is going away and ENXIO is returned.
  *
  * Returns 0 on success, ENXIO as above, EINVAL for requests that are not
  * supported on this personality, or whatever ether_ioctl()/copyout()
  * return.
  */
 static int
 tunifioctl(struct ifnet *ifp, u_long cmd, caddr_t data)
 {
 	struct ifreq *ifr = (struct ifreq *)data;
 	struct tuntap_softc *tp;
 	struct ifstat *ifs;
 	struct ifmediareq	*ifmr;
 	int		dummy, error = 0;
 	bool		l2tun;
 
 	ifmr = NULL;
 	sx_xlock(&tun_ioctl_sx);
 	tp = ifp->if_softc;
 	if (tp == NULL) {
 		error = ENXIO;
 		goto bad;
 	}
 	l2tun = (tp->tun_flags & TUN_L2) != 0;
 	switch(cmd) {
 	case SIOCGIFSTATUS:
 		/* Report the pid recorded in tun_pid, or an empty string. */
 		ifs = (struct ifstat *)data;
 		TUN_LOCK(tp);
 		if (tp->tun_pid)
 			snprintf(ifs->ascii, sizeof(ifs->ascii),
 			    "\tOpened by PID %d\n", tp->tun_pid);
 		else
 			ifs->ascii[0] = '\0';
 		TUN_UNLOCK(tp);
 		break;
 	case SIOCSIFADDR:
 		if (l2tun)
 			error = ether_ioctl(ifp, cmd, data);
 		else
 			tuninit(ifp);
 		if (error == 0)
 		    TUNDEBUG(ifp, "address set\n");
 		break;
 	case SIOCSIFMTU:
 		/* The requested MTU is stored without any range check here. */
 		ifp->if_mtu = ifr->ifr_mtu;
 		TUNDEBUG(ifp, "mtu set\n");
 		break;
 	case SIOCSIFFLAGS:
 	case SIOCADDMULTI:
 	case SIOCDELMULTI:
 		/* Accepted, nothing to do. */
 		break;
 	case SIOCGIFMEDIA:
 		/* Media reporting exists only for the layer-2 personality. */
 		if (!l2tun) {
 			error = EINVAL;
 			break;
 		}
 
 		ifmr = (struct ifmediareq *)data;
 		/* dummy holds the caller-supplied ifm_count. */
 		dummy = ifmr->ifm_count;
 		ifmr->ifm_count = 1;
 		ifmr->ifm_status = IFM_AVALID;
 		ifmr->ifm_active = IFM_ETHER;
 		if (tp->tun_flags & TUN_OPEN)
 			ifmr->ifm_status |= IFM_ACTIVE;
 		ifmr->ifm_current = ifmr->ifm_active;
 		/* Copy out the single media word if the caller has room. */
 		if (dummy >= 1) {
 			int media = IFM_ETHER;
 			error = copyout(&media, ifmr->ifm_ulist, sizeof(int));
 		}
 		break;
 	default:
 		if (l2tun) {
 			error = ether_ioctl(ifp, cmd, data);
 		} else {
 			error = EINVAL;
 		}
 	}
 bad:
 	sx_xunlock(&tun_ioctl_sx);
 	return (error);
 }
 
 /*
  * tunoutput - queue packets from higher level ready to put out.
  */
 static int
 tunoutput(struct ifnet *ifp, struct mbuf *m0, const struct sockaddr *dst,
     struct route *ro)
 {
 	struct tuntap_softc *tp = ifp->if_softc;
 	u_short cached_tun_flags;
 	int error;
 	u_int32_t af;
 
 	TUNDEBUG (ifp, "tunoutput\n");
 
 #ifdef MAC
 	error = mac_ifnet_check_transmit(ifp, m0);
 	if (error) {
 		m_freem(m0);
 		return (error);
 	}
 #endif
 
 	/* Could be unlocked read? */
 	TUN_LOCK(tp);
 	cached_tun_flags = tp->tun_flags;
 	TUN_UNLOCK(tp);
 	if ((cached_tun_flags & TUN_READY) != TUN_READY) {
 		TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags);
 		m_freem (m0);
 		return (EHOSTDOWN);
 	}
 
 	if ((ifp->if_flags & IFF_UP) != IFF_UP) {
 		m_freem (m0);
 		return (EHOSTDOWN);
 	}
 
 	/* BPF writes need to be handled specially. */
 	if (dst->sa_family == AF_UNSPEC)
 		bcopy(dst->sa_data, &af, sizeof(af));
 	else
 		af = dst->sa_family;
 
 	if (bpf_peers_present(ifp->if_bpf))
 		bpf_mtap2(ifp->if_bpf, &af, sizeof(af), m0);
 
 	/* prepend sockaddr? this may abort if the mbuf allocation fails */
 	if (cached_tun_flags & TUN_LMODE) {
 		/* allocate space for sockaddr */
 		M_PREPEND(m0, dst->sa_len, M_NOWAIT);
 
 		/* if allocation failed drop packet */
 		if (m0 == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENOBUFS);
 		} else {
 			bcopy(dst, m0->m_data, dst->sa_len);
 		}
 	}
 
 	if (cached_tun_flags & TUN_IFHEAD) {
 		/* Prepend the address family */
 		M_PREPEND(m0, 4, M_NOWAIT);
 
 		/* if allocation failed drop packet */
 		if (m0 == NULL) {
 			if_inc_counter(ifp, IFCOUNTER_IQDROPS, 1);
 			if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
 			return (ENOBUFS);
 		} else
 			*(u_int32_t *)m0->m_data = htonl(af);
 	} else {
 #ifdef INET
 		if (af != AF_INET)
 #endif
 		{
 			m_freem(m0);
 			return (EAFNOSUPPORT);
 		}
 	}
 
 	error = (ifp->if_transmit)(ifp, m0);
 	if (error)
 		return (ENOBUFS);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
 	return (0);
 }
 
 /*
  * the cdevsw interface is now pretty minimal.
  *
  * ioctl handler for the character device.  Requests are dispatched in
  * two stages: first a personality-specific switch (tap/vmnet requests
  * for layer 2, tun mode requests for layer 3), then, for anything not
  * handled there, a switch of requests common to both personalities.
  *
  * Returns 0 on success, ENOTTY for an unknown request, or the specific
  * error noted at each case.
  */
 static	int
 tunioctl(struct cdev *dev, u_long cmd, caddr_t data, int flag,
     struct thread *td)
 {
 	struct ifreq ifr, *ifrp;
 	struct tuntap_softc *tp = dev->si_drv1;
 	struct tuninfo *tunp;
 	int error, iflags;
 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD4)
 	int	ival;
 #endif
 	bool	l2tun;
 
 	l2tun = (tp->tun_flags & TUN_L2) != 0;
 	if (l2tun) {
 		/* tap specific ioctls */
 		switch(cmd) {
-		case TAPGIFNAME:
-			ifrp = (struct ifreq *)data;
-			strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname,
-			    IFNAMSIZ);
-
-			return (0);
 		/* VMware/VMnet port ioctl's */
 #if defined(COMPAT_FREEBSD6) || defined(COMPAT_FREEBSD5) || \
     defined(COMPAT_FREEBSD4)
 		case _IO('V', 0):
 			/* Old form passes the value inline; repoint data at it. */
 			ival = IOCPARM_IVAL(data);
 			data = (caddr_t)&ival;
 			/* FALLTHROUGH */
 #endif
 		case VMIO_SIOCSIFFLAGS: /* VMware/VMnet SIOCSIFFLAGS */
 			/*
 			 * Take the caller's flags, restricted to the VMIO
 			 * mask and minus the unchangeable bits, force
 			 * IFF_UP, and keep the interface's current
 			 * unchangeable bits.
 			 */
 			iflags = *(int *)data;
 			iflags &= TUN_VMIO_FLAG_MASK;
 			iflags &= ~IFF_CANTCHANGE;
 			iflags |= IFF_UP;
 
 			TUN_LOCK(tp);
 			TUN2IFP(tp)->if_flags = iflags |
 			    (TUN2IFP(tp)->if_flags & IFF_CANTCHANGE);
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case SIOCGIFADDR:	/* get MAC address of the remote side */
 			TUN_LOCK(tp);
 			bcopy(&tp->tun_ether.octet, data,
 			    sizeof(tp->tun_ether.octet));
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case SIOCSIFADDR:	/* set MAC address of the remote side */
 			TUN_LOCK(tp);
 			bcopy(data, &tp->tun_ether.octet,
 			    sizeof(tp->tun_ether.octet));
 			TUN_UNLOCK(tp);
 
 			return (0);
 		}
 
 		/* Fall through to the common ioctls if unhandled */
 	} else {
 		switch (cmd) {
 		case TUNSLMODE:
 			/* TUN_LMODE and TUN_IFHEAD are mutually exclusive. */
 			TUN_LOCK(tp);
 			if (*(int *)data) {
 				tp->tun_flags |= TUN_LMODE;
 				tp->tun_flags &= ~TUN_IFHEAD;
 			} else
 				tp->tun_flags &= ~TUN_LMODE;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TUNSIFHEAD:
 			/* Mirror image of TUNSLMODE above. */
 			TUN_LOCK(tp);
 			if (*(int *)data) {
 				tp->tun_flags |= TUN_IFHEAD;
 				tp->tun_flags &= ~TUN_LMODE;
 			} else
 				tp->tun_flags &= ~TUN_IFHEAD;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TUNGIFHEAD:
 			TUN_LOCK(tp);
 			*(int *)data = (tp->tun_flags & TUN_IFHEAD) ? 1 : 0;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		case TUNSIFMODE:
 			/* deny this if UP */
 			/* Note: this IFF_UP test is made without the softc lock. */
 			if (TUN2IFP(tp)->if_flags & IFF_UP)
 				return (EBUSY);
 
 			/*
 			 * Ignoring IFF_MULTICAST, the request must be exactly
 			 * point-to-point or exactly broadcast; else EINVAL.
 			 */
 			switch (*(int *)data & ~IFF_MULTICAST) {
 			case IFF_POINTOPOINT:
 			case IFF_BROADCAST:
 				TUN_LOCK(tp);
 				TUN2IFP(tp)->if_flags &=
 				    ~(IFF_BROADCAST|IFF_POINTOPOINT|IFF_MULTICAST);
 				TUN2IFP(tp)->if_flags |= *(int *)data;
 				TUN_UNLOCK(tp);
 
 				break;
 			default:
 				return (EINVAL);
 			}
 
 			return (0);
 		case TUNSIFPID:
 			/* Re-home the device to the calling process. */
 			TUN_LOCK(tp);
 			tp->tun_pid = curthread->td_proc->p_pid;
 			TUN_UNLOCK(tp);
 
 			return (0);
 		}
 		/* Fall through to the common ioctls if unhandled */
 	}
 
 	/* Requests common to both personalities. */
 	switch (cmd) {
+	case TUNGIFNAME:
+		ifrp = (struct ifreq *)data;
+		strlcpy(ifrp->ifr_name, TUN2IFP(tp)->if_xname, IFNAMSIZ);
+
+		return (0);
 	case TUNSIFINFO:
 		/* The caller must name the interface's current type. */
 		tunp = (struct tuninfo *)data;
 		if (TUN2IFP(tp)->if_type != tunp->type)
 			return (EPROTOTYPE);
 		TUN_LOCK(tp);
 		/* An MTU change goes through the generic SIOCSIFMTU path. */
 		if (TUN2IFP(tp)->if_mtu != tunp->mtu) {
 			strlcpy(ifr.ifr_name, if_name(TUN2IFP(tp)), IFNAMSIZ);
 			ifr.ifr_mtu = tunp->mtu;
 			CURVNET_SET(TUN2IFP(tp)->if_vnet);
 			error = ifhwioctl(SIOCSIFMTU, TUN2IFP(tp),
 			    (caddr_t)&ifr, td);
 			CURVNET_RESTORE();
 			if (error) {
 				TUN_UNLOCK(tp);
 				return (error);
 			}
 		}
 		TUN2IFP(tp)->if_baudrate = tunp->baudrate;
 		TUN_UNLOCK(tp);
 		break;
 	case TUNGIFINFO:
 		tunp = (struct tuninfo *)data;
 		TUN_LOCK(tp);
 		tunp->mtu = TUN2IFP(tp)->if_mtu;
 		tunp->type = TUN2IFP(tp)->if_type;
 		tunp->baudrate = TUN2IFP(tp)->if_baudrate;
 		TUN_UNLOCK(tp);
 		break;
 	case TUNSDEBUG:
 		/* tundebug is a single variable, not per-device state. */
 		tundebug = *(int *)data;
 		break;
 	case TUNGDEBUG:
 		*(int *)data = tundebug;
 		break;
 	case FIONBIO:
 		/* Accepted; tunread() looks at O_NONBLOCK in its flag argument. */
 		break;
 	case FIOASYNC:
 		TUN_LOCK(tp);
 		if (*(int *)data)
 			tp->tun_flags |= TUN_ASYNC;
 		else
 			tp->tun_flags &= ~TUN_ASYNC;
 		TUN_UNLOCK(tp);
 		break;
 	case FIONREAD:
 		/* Report the byte length of the packet at the queue head. */
 		if (!IFQ_IS_EMPTY(&TUN2IFP(tp)->if_snd)) {
 			struct mbuf *mb;
 			IFQ_LOCK(&TUN2IFP(tp)->if_snd);
 			IFQ_POLL_NOLOCK(&TUN2IFP(tp)->if_snd, mb);
 			for (*(int *)data = 0; mb != NULL; mb = mb->m_next)
 				*(int *)data += mb->m_len;
 			IFQ_UNLOCK(&TUN2IFP(tp)->if_snd);
 		} else
 			*(int *)data = 0;
 		break;
 	case FIOSETOWN:
 		return (fsetown(*(int *)data, &tp->tun_sigio));
 
 	case FIOGETOWN:
 		*(int *)data = fgetown(&tp->tun_sigio);
 		return (0);
 
 	/* This is deprecated, FIOSETOWN should be used instead. */
 	case TIOCSPGRP:
 		return (fsetown(-(*(int *)data), &tp->tun_sigio));
 
 	/* This is deprecated, FIOGETOWN should be used instead. */
 	case TIOCGPGRP:
 		*(int *)data = -fgetown(&tp->tun_sigio);
 		return (0);
 
 	default:
 		return (ENOTTY);
 	}
 	return (0);
 }
 
 /*
  * The cdevsw read interface - reads a packet at a time, or at
  * least as much of a packet as can be read.
  *
  * Blocks (interruptibly) until a packet is queued unless O_NONBLOCK is
  * set in flag.  If the caller's buffer is shorter than the packet, the
  * remainder of the packet is discarded.
  *
  * Returns 0 on success, EHOSTDOWN if the device is not ready,
  * EWOULDBLOCK for an empty queue in non-blocking mode, or the error from
  * mtx_sleep()/uiomove().
  */
 static	int
 tunread(struct cdev *dev, struct uio *uio, int flag)
 {
 	struct tuntap_softc *tp = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(tp);
 	struct mbuf	*m;
 	int		error=0, len;
 
 	TUNDEBUG (ifp, "read\n");
 	TUN_LOCK(tp);
 	if ((tp->tun_flags & TUN_READY) != TUN_READY) {
 		TUN_UNLOCK(tp);
 		/* NOTE(review): tun_flags is re-read here after the unlock. */
 		TUNDEBUG (ifp, "not ready 0%o\n", tp->tun_flags);
 		return (EHOSTDOWN);
 	}
 
 	tp->tun_flags &= ~TUN_RWAIT;
 
 	do {
 		IFQ_DEQUEUE(&ifp->if_snd, m);
 		if (m == NULL) {
 			if (flag & O_NONBLOCK) {
 				TUN_UNLOCK(tp);
 				return (EWOULDBLOCK);
 			}
 			/*
 			 * The start routines clear TUN_RWAIT and wakeup(tp)
 			 * when output becomes pending.
 			 */
 			tp->tun_flags |= TUN_RWAIT;
 			error = mtx_sleep(tp, &tp->tun_mtx, PCATCH | (PZERO + 1),
 			    "tunread", 0);
 			if (error != 0) {
 				TUN_UNLOCK(tp);
 				return (error);
 			}
 		}
 	} while (m == NULL);
 	TUN_UNLOCK(tp);
 
 	/* Layer-2 packets are tapped to bpf as they are read. */
 	if ((tp->tun_flags & TUN_L2) != 0)
 		BPF_MTAP(ifp, m);
 
 	/* Copy out mbuf by mbuf, freeing each one as it is consumed. */
 	while (m && uio->uio_resid > 0 && error == 0) {
 		len = min(uio->uio_resid, m->m_len);
 		if (len != 0)
 			error = uiomove(mtod(m, void *), len, uio);
 		m = m_free(m);
 	}
 
 	/* Whatever did not fit (or followed an error) is thrown away. */
 	if (m) {
 		TUNDEBUG(ifp, "Dropping mbuf\n");
 		m_freem(m);
 	}
 	return (error);
 }
 
 static int
 tunwrite_l2(struct tuntap_softc *tp, struct mbuf *m)
 {
 	struct ether_header *eh;
 	struct ifnet *ifp;
 
 	ifp = TUN2IFP(tp);
 
 	/*
 	 * Only pass a unicast frame to ether_input(), if it would
 	 * actually have been received by non-virtual hardware.
 	 */
 	if (m->m_len < sizeof(struct ether_header)) {
 		m_freem(m);
 		return (0);
 	}
 
 	eh = mtod(m, struct ether_header *);
 
 	if (eh && (ifp->if_flags & IFF_PROMISC) == 0 &&
 	    !ETHER_IS_MULTICAST(eh->ether_dhost) &&
 	    bcmp(eh->ether_dhost, IF_LLADDR(ifp), ETHER_ADDR_LEN) != 0) {
 		m_freem(m);
 		return (0);
 	}
 
 	/* Pass packet up to parent. */
 	CURVNET_SET(ifp->if_vnet);
 	(*ifp->if_input)(ifp, m);
 	CURVNET_RESTORE();
 	/* ibytes are counted in parent */
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	return (0);
 }
 
 /*
  * Inject a packet written to a layer-3 device into the network stack.
  *
  * With TUN_IFHEAD set, the packet starts with a 4-byte network-order
  * address family, which is read and stripped; otherwise AF_INET is
  * assumed.  The packet is tapped to bpf and dispatched to the matching
  * netisr.  The mbuf is consumed on every path.
  *
  * Returns 0 on success, ENOBUFS if the family header cannot be pulled
  * up, EAFNOSUPPORT for a family with no compiled-in handler.
  */
 static int
 tunwrite_l3(struct tuntap_softc *tp, struct mbuf *m)
 {
 	struct ifnet *ifp;
 	int family, isr;
 
 	ifp = TUN2IFP(tp);
 	/* Could be unlocked read? */
 	TUN_LOCK(tp);
 	if (tp->tun_flags & TUN_IFHEAD) {
 		TUN_UNLOCK(tp);
 		/* Make the family word contiguous before dereferencing it. */
 		if (m->m_len < sizeof(family) &&
 		(m = m_pullup(m, sizeof(family))) == NULL)
 			return (ENOBUFS);
 		family = ntohl(*mtod(m, u_int32_t *));
 		m_adj(m, sizeof(family));
 	} else {
 		TUN_UNLOCK(tp);
 		family = AF_INET;
 	}
 
 	BPF_MTAP2(ifp, &family, sizeof(family), m);
 
 	/* Pick the netisr queue; unsupported families are dropped. */
 	switch (family) {
 #ifdef INET
 	case AF_INET:
 		isr = NETISR_IP;
 		break;
 #endif
 #ifdef INET6
 	case AF_INET6:
 		isr = NETISR_IPV6;
 		break;
 #endif
 	default:
 		m_freem(m);
 		return (EAFNOSUPPORT);
 	}
 	random_harvest_queue(m, sizeof(*m), RANDOM_NET_TUN);
 	if_inc_counter(ifp, IFCOUNTER_IBYTES, m->m_pkthdr.len);
 	if_inc_counter(ifp, IFCOUNTER_IPACKETS, 1);
 	CURVNET_SET(ifp->if_vnet);
 	/* Tag the packet with the interface's FIB before dispatch. */
 	M_SETFIB(m, ifp->if_fib);
 	netisr_dispatch(isr, m);
 	CURVNET_RESTORE();
 	return (0);
 }
 
 /*
  * the cdevsw write interface - an atomic write is a packet - or else!
  *
  * A write to an interface that is not up, or a zero-length write, is
  * silently accepted.  Otherwise the data is copied into a packet-header
  * mbuf chain and passed to the layer-2 or layer-3 input routine.
  *
  * Returns EIO when the write is larger than the personality's MRU
  * (plus the family word in TUN_IFHEAD mode), ENOBUFS when no mbufs are
  * available, or the result of tunwrite_l2()/tunwrite_l3().
  */
 static	int
 tunwrite(struct cdev *dev, struct uio *uio, int flag)
 {
 	struct tuntap_softc *sc = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(sc);
 	struct mbuf	*pkt;
 	uint32_t	limit;
 	int		pad;
 	bool		is_l2;
 
 	TUNDEBUG(ifp, "tunwrite\n");
 
 	/* ignore silently */
 	if ((ifp->if_flags & IFF_UP) != IFF_UP || uio->uio_resid == 0)
 		return (0);
 
 	is_l2 = (sc->tun_flags & TUN_L2) != 0;
 	if (is_l2) {
 		limit = TAPMRU;
 		pad = ETHER_ALIGN;
 	} else {
 		limit = TUNMRU;
 		pad = 0;
 		if ((sc->tun_flags & TUN_IFHEAD) != 0)
 			limit += sizeof(uint32_t);	/* family */
 	}
 
 	if (uio->uio_resid < 0 || uio->uio_resid > limit) {
 		TUNDEBUG(ifp, "len=%zd!\n", uio->uio_resid);
 		return (EIO);
 	}
 
 	pkt = m_uiotombuf(uio, M_NOWAIT, 0, pad, M_PKTHDR);
 	if (pkt == NULL) {
 		if_inc_counter(ifp, IFCOUNTER_IERRORS, 1);
 		return (ENOBUFS);
 	}
 
 	pkt->m_pkthdr.rcvif = ifp;
 #ifdef MAC
 	mac_ifnet_create_mbuf(ifp, pkt);
 #endif
 
 	return (is_l2 ? tunwrite_l2(sc, pkt) : tunwrite_l3(sc, pkt));
 }
 
 /*
  * tunpoll - the poll interface, this is only useful on reads
  * really. The write detect always returns true, write never blocks
  * anyway, it either accepts the packet or drops it.
  *
  * Read readiness means the interface send queue is non-empty; when it is
  * empty the calling thread is recorded for a later selwakeup.
  */
 static	int
 tunpoll(struct cdev *dev, int events, struct thread *td)
 {
 	struct tuntap_softc *sc = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(sc);
 	int		rd_mask, ready;
 
 	TUNDEBUG(ifp, "tunpoll\n");
 
 	rd_mask = events & (POLLIN | POLLRDNORM);
 	/* Writes are always possible. */
 	ready = events & (POLLOUT | POLLWRNORM);
 
 	if (rd_mask != 0) {
 		IFQ_LOCK(&ifp->if_snd);
 		if (IFQ_IS_EMPTY(&ifp->if_snd)) {
 			TUNDEBUG(ifp, "tunpoll waiting\n");
 			selrecord(td, &sc->tun_rsel);
 		} else {
 			TUNDEBUG(ifp, "tunpoll q=%d\n", ifp->if_snd.ifq_len);
 			ready |= rd_mask;
 		}
 		IFQ_UNLOCK(&ifp->if_snd);
 	}
 
 	return (ready);
 }
 
 /*
  * tunkqfilter - support for the kevent() system call.
  *
  * Attaches kn to the device's knote list with the read or write filter
  * operations.  Returns 0 on success, EINVAL for any other filter type.
  */
 static int
 tunkqfilter(struct cdev *dev, struct knote *kn)
 {
 	struct tuntap_softc	*sc = dev->si_drv1;
 	struct ifnet	*ifp = TUN2IFP(sc);
 
 	if (kn->kn_filter == EVFILT_READ) {
 		TUNDEBUG(ifp, "%s kqfilter: EVFILT_READ, minor = %#x\n",
 		    ifp->if_xname, dev2unit(dev));
 		kn->kn_fop = &tun_read_filterops;
 	} else if (kn->kn_filter == EVFILT_WRITE) {
 		TUNDEBUG(ifp, "%s kqfilter: EVFILT_WRITE, minor = %#x\n",
 		    ifp->if_xname, dev2unit(dev));
 		kn->kn_fop = &tun_write_filterops;
 	} else {
 		TUNDEBUG(ifp, "%s kqfilter: invalid filter, minor = %#x\n",
 		    ifp->if_xname, dev2unit(dev));
 		return(EINVAL);
 	}
 
 	/* The filter callbacks recover the softc from kn_hook. */
 	kn->kn_hook = sc;
 	knlist_add(&sc->tun_rsel.si_note, kn, 0);
 
 	return (0);
 }
 
 /*
  * Return true if there is data in the interface queue.  The current
  * queue length is also stored in kn->kn_data.
  */
 static int
 tunkqread(struct knote *kn, long hint)
 {
 	struct tuntap_softc	*sc = kn->kn_hook;
 	struct ifnet	*ifp = TUN2IFP(sc);
 	struct cdev		*dev = sc->tun_dev;
 
 	kn->kn_data = ifp->if_snd.ifq_len;
 	if (kn->kn_data <= 0) {
 		TUNDEBUG(ifp,
 		    "%s waiting for data, minor = %#x\n", ifp->if_xname,
 		    dev2unit(dev));
 		return (0);
 	}
 
 	TUNDEBUG(ifp,
 	    "%s have data in the queue.  Len = %d, minor = %#x\n",
 	    ifp->if_xname, ifp->if_snd.ifq_len, dev2unit(dev));
 	return (1);
 }
 
 /*
  * Always can write, always return MTU in kn->data.
  */
 static int
 tunkqwrite(struct knote *kn, long hint)
 {
 	struct tuntap_softc	*tp = kn->kn_hook;
 	struct ifnet	*ifp = TUN2IFP(tp);
 
 	kn->kn_data = ifp->if_mtu;
 
 	return (1);
 }
 
 /*
  * Detach callback: take kn back off the knote list of the softc stored
  * in kn_hook by tunkqfilter().
  */
 static void
 tunkqdetach(struct knote *kn)
 {
 	struct tuntap_softc	*sc;
 
 	sc = kn->kn_hook;
 	knlist_remove(&sc->tun_rsel.si_note, kn, 0);
 }
Index: projects/nfsv42/sys/riscv/riscv/pmap.c
===================================================================
--- projects/nfsv42/sys/riscv/riscv/pmap.c	(revision 350367)
+++ projects/nfsv42/sys/riscv/riscv/pmap.c	(revision 350368)
@@ -1,4482 +1,4473 @@
 /*-
  * SPDX-License-Identifier: BSD-4-Clause
  *
  * Copyright (c) 1991 Regents of the University of California.
  * All rights reserved.
  * Copyright (c) 1994 John S. Dyson
  * All rights reserved.
  * Copyright (c) 1994 David Greenman
  * All rights reserved.
  * Copyright (c) 2003 Peter Wemm
  * All rights reserved.
  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
  * All rights reserved.
  * Copyright (c) 2014 Andrew Turner
  * All rights reserved.
  * Copyright (c) 2014 The FreeBSD Foundation
  * All rights reserved.
  * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
  * All rights reserved.
  *
  * This code is derived from software contributed to Berkeley by
  * the Systems Programming Group of the University of Utah Computer
  * Science Department and William Jolitz of UUNET Technologies Inc.
  *
  * Portions of this software were developed by Andrew Turner under
  * sponsorship from The FreeBSD Foundation.
  *
  * Portions of this software were developed by SRI International and the
  * University of Cambridge Computer Laboratory under DARPA/AFRL contract
  * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
  *
  * Portions of this software were developed by the University of Cambridge
  * Computer Laboratory as part of the CTSRD Project, with support from the
  * UK Higher Education Innovation Fund (HEIF).
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. All advertising materials mentioning features or use of this software
  *    must display the following acknowledgement:
  *	This product includes software developed by the University of
  *	California, Berkeley and its contributors.
  * 4. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
  */
 /*-
  * Copyright (c) 2003 Networks Associates Technology, Inc.
  * All rights reserved.
  *
  * This software was developed for the FreeBSD Project by Jake Burkholder,
  * Safeport Network Services, and Network Associates Laboratories, the
  * Security Research Division of Network Associates, Inc. under
  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
  * CHATS research program.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 /*
  *	Manages physical address maps.
  *
  *	Since the information managed by this module is
  *	also stored by the logical address mapping module,
  *	this module may throw away valid virtual-to-physical
  *	mappings at almost any time.  However, invalidations
  *	of virtual-to-physical mappings must be done as
  *	requested.
  *
  *	In order to cope with hardware architectures which
  *	make virtual-to-physical map invalidates expensive,
  *	this module may delay invalidate or reduced protection
  *	operations until such time as they are actually
  *	necessary.  This module is given full information as
  *	to which processors are currently using which maps,
  *	and to when physical maps must be made correct.
  */
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/bitstring.h>
 #include <sys/bus.h>
 #include <sys/cpuset.h>
 #include <sys/kernel.h>
 #include <sys/ktr.h>
 #include <sys/lock.h>
 #include <sys/malloc.h>
 #include <sys/mman.h>
 #include <sys/msgbuf.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 #include <sys/rwlock.h>
 #include <sys/sx.h>
 #include <sys/vmem.h>
 #include <sys/vmmeter.h>
 #include <sys/sched.h>
 #include <sys/sysctl.h>
 #include <sys/smp.h>
 
 #include <vm/vm.h>
 #include <vm/vm_param.h>
 #include <vm/vm_kern.h>
 #include <vm/vm_page.h>
 #include <vm/vm_map.h>
 #include <vm/vm_object.h>
 #include <vm/vm_extern.h>
 #include <vm/vm_pageout.h>
 #include <vm/vm_pager.h>
 #include <vm/vm_phys.h>
 #include <vm/vm_radix.h>
 #include <vm/vm_reserv.h>
 #include <vm/uma.h>
 
 #include <machine/machdep.h>
 #include <machine/md_var.h>
 #include <machine/pcb.h>
 #include <machine/sbi.h>
 
 /*
  * Page-table-page index thresholds.  _pmap_unwire_ptp() treats a page
  * table page whose pindex is >= NUL1E as one referenced from an L1 entry
  * and a page with a smaller pindex as one referenced from an L2 entry.
  */
 #define	NUL1E		(Ln_ENTRIES * Ln_ENTRIES)
 #define	NUL2E		(Ln_ENTRIES * NUL1E)
 
 /*
  * PMAP_INLINE: inline qualifier used for selected pmap functions.  It
  * expands to nothing when DIAGNOSTIC is defined.
  */
 #if !defined(DIAGNOSTIC)
 #ifdef __GNUC_GNU_INLINE__
 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
 #else
 #define PMAP_INLINE	extern inline
 #endif
 #else
 #define PMAP_INLINE
 #endif
 
 /* PV_STAT(x): evaluate statement x only when PV_STATS is defined. */
 #ifdef PV_STATS
 #define PV_STAT(x)	do { x ; } while (0)
 #else
 #define PV_STAT(x)	do { } while (0)
 #endif
 
 /* Index of the L2-sized frame containing address v. */
 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
 /* pv head table entry for physical address pa. */
 #define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
 
 /* Size of the pv list lock pool. */
 #define	NPV_LIST_LOCKS	MAXCPU
 
 /*
  * Select the pool lock for a physical address: the L2 frame index of pa,
  * reduced modulo the pool size.
  */
 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
 			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])
 
 /*
  * Make *lockp refer to the write-locked pv list lock for pa.  If *lockp
  * already is that lock nothing happens; otherwise the lock currently
  * recorded (if non-NULL) is write-unlocked, *lockp is updated and the new
  * lock is write-locked.
  */
 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
 	struct rwlock **_lockp = (lockp);		\
 	struct rwlock *_new_lock;			\
 							\
 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
 	if (_new_lock != *_lockp) {			\
 		if (*_lockp != NULL)			\
 			rw_wunlock(*_lockp);		\
 		*_lockp = _new_lock;			\
 		rw_wlock(*_lockp);			\
 	}						\
 } while (0)
 
 /* Same as CHANGE_PV_LIST_LOCK_TO_PHYS, keyed by a vm_page. */
 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
 
 /* Write-unlock the lock recorded in *lockp, if any, and reset it to NULL. */
 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
 	struct rwlock **_lockp = (lockp);		\
 							\
 	if (*_lockp != NULL) {				\
 		rw_wunlock(*_lockp);			\
 		*_lockp = NULL;				\
 	}						\
 } while (0)
 
 /* Pool lock covering the physical address of vm_page m. */
 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
 
 /*
  * The list of all the user pmaps.  pmap_distribute_l1() walks it, under
  * allpmaps_lock, to copy kernel L1 entries into every user pmap.
  */
 LIST_HEAD(pmaplist, pmap);
 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();
 
 /* Storage for the kernel's own pmap. */
 struct pmap kernel_pmap_store;
 
 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
 vm_offset_t kernel_vm_end = 0;
 
 /* Direct map bounds; all three are set by pmap_bootstrap_dmap(). */
 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
 
 /*
  * This code assumes all L1 DMAP entries will be used: both ends of the
  * DMAP virtual range must be L1-aligned.
  */
 CTASSERT((DMAP_MIN_ADDRESS  & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
 CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_ADDRESS);
 
 /* Initialized in pmap_bootstrap() ("pmap pv global"). */
 static struct rwlock_padalign pvh_global_lock;
 /* Protects the allpmaps list; initialized in pmap_init(). */
 static struct mtx_padalign allpmaps_lock;
 
 /* Root of the vm.pmap sysctl subtree. */
 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD, 0,
     "VM/pmap parameters");
 
 /*
  * Read-only tunable.  pmap_init() registers the L2 page size in
  * pagesizes[1] only when this is non-zero, and pmap_ps_enabled() reports
  * its value.
  */
 static int superpages_enabled = 1;
 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
     CTLFLAG_RDTUN, &superpages_enabled, 0,
     "Enable support for transparent superpages");
 
 /* vm.pmap.l2: read-only counters describing 2MB (L2) mapping activity. */
 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD, 0,
     "2MB page mapping counters");
 
 static u_long pmap_l2_demotions;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
     &pmap_l2_demotions, 0,
     "2MB page demotions");
 
 static u_long pmap_l2_mappings;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
     &pmap_l2_mappings, 0,
     "2MB page mappings");
 
 static u_long pmap_l2_p_failures;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
     &pmap_l2_p_failures, 0,
     "2MB page promotion failures");
 
 static u_long pmap_l2_promotions;
 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
     &pmap_l2_promotions, 0,
     "2MB page promotions");
 
 /*
  * Data for the pv entry allocation mechanism
  *
  * pv_chunks_mutex and pv_list_locks[] are initialized in pmap_init();
  * pv_table is allocated there with one md_page per L2-sized frame of
  * physical memory, and pv_dummy gets an empty pv list.
  */
 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
 static struct mtx pv_chunks_mutex;
 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
 static struct md_page *pv_table;
 static struct md_page pv_dummy;
 
 /* Defined outside this file. */
 extern cpuset_t all_harts;
 
 /*
  * Internal flags for pmap_enter()'s helper functions.
  */
 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
 
 /* Forward declarations of file-local helpers. */
 
 /* pv entry and pv chunk management. */
 static void	free_pv_chunk(struct pv_chunk *pc);
 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
 		    vm_offset_t va);
 /* L2 mapping demotion and entry. */
 static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
 static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
 		    vm_offset_t va, struct rwlock **lockp);
 static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
 		    u_int flags, vm_page_t m, struct rwlock **lockp);
 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
     vm_page_t m, struct rwlock **lockp);
 
 /* Page table page allocation. */
 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
 		struct rwlock **lockp);
 
 /* Page table page release. */
 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct spglist *free);
 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
 
 /*
  * Page table entry accessors.  Every access goes through a 64-bit atomic
  * operation: plain load/store, swap (load_store), and bit set/clear.
  * pmap_clear() stores zero; pmap_load_clear() swaps in zero and yields the
  * previous value.
  */
 #define	pmap_clear(pte)			pmap_store(pte, 0)
 #define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
 #define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
 #define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
 #define	pmap_load(pte)			atomic_load_64(pte)
 #define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
 #define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)
 
 /********************/
 /* Inline functions */
 /********************/
 
 /*
  * Copy one page (PAGE_SIZE bytes) from s to d.  Note the argument order:
  * source first, destination second -- the reverse of memcpy().
  */
 static __inline void
 pagecopy(void *s, void *d)
 {
 
 	memcpy(d, s, PAGE_SIZE);
 }
 
 /*
  * Fill one page (PAGE_SIZE bytes) starting at p with zeroes.
  */
 static __inline void
 pagezero(void *p)
 {
 
 	memset(p, 0, PAGE_SIZE);
 }
 
 /* Index of va's entry within the L1, L2 and L3 page tables respectively. */
 #define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
 #define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
 #define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)
 
 /*
  * Physical address referenced by a page table entry: the page number held
  * in the bits from PTE_PPN0_S upward, scaled by the page size.  The
  * argument is parenthesized so that compound expressions (e.g. "a | b")
  * are shifted as a whole.
  */
 #define	PTE_TO_PHYS(pte)	(((pte) >> PTE_PPN0_S) * PAGE_SIZE)
 
 /*
  * Return a pointer to the L1 entry that covers va in the given pmap.
  */
 static __inline pd_entry_t *
 pmap_l1(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l1_table;
 
 	l1_table = pmap->pm_l1;
 	return (l1_table + pmap_l1_index(va));
 }
 
 /*
  * Given a pointer to an L1 entry, return a pointer to the L2 entry for va
  * in the table that L1 entry references.  The table is reached through the
  * direct map; the L1 entry is not validated here.
  */
 static __inline pd_entry_t *
 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
 {
 	pd_entry_t *l2_table;
 	vm_paddr_t table_pa;
 
 	/* Load the entry once; the result is reused below. */
 	table_pa = PTE_TO_PHYS(pmap_load(l1));
 	l2_table = (pd_entry_t *)PHYS_TO_DMAP(table_pa);
 
 	return (l2_table + pmap_l2_index(va));
 }
 
 /*
  * Return a pointer to the L2 entry for va, or NULL when the covering L1
  * entry is not valid (PTE_V clear) or is itself a leaf mapping (any PTE_RX
  * bit set) and therefore has no L2 table below it.
  */
 static __inline pd_entry_t *
 pmap_l2(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l1e;
 
 	l1e = pmap_l1(pmap, va);
 	if ((pmap_load(l1e) & PTE_V) == 0 ||
 	    (pmap_load(l1e) & PTE_RX) != 0)
 		return (NULL);
 
 	return (pmap_l1_to_l2(l1e, va));
 }
 
 /*
  * Given a pointer to an L2 entry, return a pointer to the L3 entry for va
  * in the table that L2 entry references.  The table is reached through the
  * direct map; the L2 entry is not validated here, so callers must already
  * know that it points to an L3 table.
  */
 static __inline pt_entry_t *
 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
 {
 	vm_paddr_t phys;
 	pt_entry_t *l3;
 
 	phys = PTE_TO_PHYS(pmap_load(l2));
 	/* An L3 table holds pt_entry_t items, so cast to that type. */
 	l3 = (pt_entry_t *)PHYS_TO_DMAP(phys);
 
 	return (&l3[pmap_l3_index(va)]);
 }
 
 /*
  * Return a pointer to the L3 entry for va, or NULL when no L3 table covers
  * it: the L2 entry is unreachable, not valid (PTE_V clear), or is a leaf
  * mapping (any PTE_RX bit set).
  */
 static __inline pt_entry_t *
 pmap_l3(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l2e;
 
 	l2e = pmap_l2(pmap, va);
 	if (l2e == NULL || (pmap_load(l2e) & PTE_V) == 0 ||
 	    (pmap_load(l2e) & PTE_RX) != 0)
 		return (NULL);
 
 	return (pmap_l2_to_l3(l2e, va));
 }
 
 /*
  * Add count to the pmap's resident page counter.  The pmap lock must be
  * held (asserted).
  */
 static __inline void
 pmap_resident_count_inc(pmap_t pmap, int count)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	pmap->pm_stats.resident_count += count;
 }
 
 /*
  * Subtract count from the pmap's resident page counter.  The pmap lock
  * must be held (asserted), and the counter is asserted not to underflow.
  */
 static __inline void
 pmap_resident_count_dec(pmap_t pmap, int count)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(pmap->pm_stats.resident_count >= count,
 	    ("pmap %p resident count underflow %ld %d", pmap,
 	    pmap->pm_stats.resident_count, count));
 	pmap->pm_stats.resident_count -= count;
 }
 
 /*
  * Copy a kernel L1 entry into slot l1index of every user pmap on the
  * allpmaps list.  Calls made for any pmap other than the kernel pmap do
  * nothing.
  */
 static void
 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
     pt_entry_t entry)
 {
 	struct pmap *upmap;
 
 	/* Only kernel L1 entries are shared with the user pmaps. */
 	if (pmap != kernel_pmap)
 		return;
 
 	mtx_lock(&allpmaps_lock);
 	LIST_FOREACH(upmap, &allpmaps, pm_list)
 		pmap_store(&upmap->pm_l1[l1index], entry);
 	mtx_unlock(&allpmaps_lock);
 }
 
 /*
  * Bootstrap helper: compute the L1 and L2 slot numbers for va, storing
  * them through l1_slot and l2_slot, and return the bootstrap L2 table
  * (init_pt_va).  The L1 entry for va is asserted to be a table pointer,
  * not a leaf mapping.
  */
 static pt_entry_t *
 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
     u_int *l2_slot)
 {
 	pd_entry_t *l1_table;
 
 	l1_table = (pd_entry_t *)l1pt;
 	*l1_slot = pmap_l1_index(va);
 
 	/* Check locore has used a table L1 map */
 	KASSERT((l1_table[*l1_slot] & PTE_RX) == 0,
 		("Invalid bootstrap L1 table"));
 
 	*l2_slot = pmap_l2_index(va);
 
 	/* The L2 table is the one set up at init_pt_va. */
 	return ((pt_entry_t *)init_pt_va);
 }
 
 /*
  * Bootstrap helper: translate va to a physical address using the
  * bootstrap L2 table.  The L2 entry is asserted to be a leaf (superpage)
  * mapping.
  */
 static vm_paddr_t
 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
 {
 	u_int l1_idx, l2_idx;
 	pt_entry_t *l2_table;
 	vm_paddr_t frame_pa;
 
 	l2_table = pmap_early_page_idx(l1pt, va, &l1_idx, &l2_idx);
 
 	/* Check locore has used L2 superpages */
 	KASSERT((l2_table[l2_idx] & PTE_RX) != 0,
 		("Invalid bootstrap L2 table"));
 
 	/* Superpage base from the entry, plus the offset within it. */
 	frame_pa = (l2_table[l2_idx] >> PTE_PPN1_S) << L2_SHIFT;
 
 	return (frame_pa + (va & L2_OFFSET));
 }
 
 /*
  * Build the direct map: starting at min_pa rounded down to an L1
  * boundary, install L1 leaf entries in kern_l1 from DMAP_MIN_ADDRESS
  * upward until either the DMAP virtual range or max_pa is exhausted.
  * Records the resulting bounds in dmap_phys_base, dmap_phys_max and
  * dmap_max_addr.
  */
 static void
 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
 {
 	pd_entry_t *l1_table;
 	vm_offset_t va;
 	vm_paddr_t pa;
 	u_int slot;
 	pn_t pn;
 
 	l1_table = (pd_entry_t *)kern_l1;
 	pa = min_pa & ~L1_OFFSET;
 	dmap_phys_base = pa;
 	va = DMAP_MIN_ADDRESS;
 	slot = pmap_l1_index(DMAP_MIN_ADDRESS);
 
 	while (va < DMAP_MAX_ADDRESS && pa < max_pa) {
 		KASSERT(slot < Ln_ENTRIES, ("Invalid L1 index"));
 
 		/* One L1 superpage entry per iteration. */
 		pn = (pa / PAGE_SIZE);
 		pmap_store(&l1_table[slot], PTE_KERN | (pn << PTE_PPN0_S));
 
 		pa += L1_SIZE;
 		va += L1_SIZE;
 		slot++;
 	}
 
 	/* Set the upper limit of the DMAP region */
 	dmap_phys_max = pa;
 	dmap_max_addr = va;
 
 	sfence_vma();
 }
 
 /*
  * Bootstrap helper: create L3 page tables for the kernel virtual range
  * [va, VM_MAX_KERNEL_ADDRESS).  One page is taken from l3_start for each
  * L2 slot in the range and is linked from the kernel's L2 table.  va must
  * be L2-aligned.  Returns the first address past the pages consumed.
  */
 static vm_offset_t
 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
 {
 	vm_offset_t l3pt;
 	pt_entry_t entry;
 	pd_entry_t *l2;
 	vm_paddr_t pa;
 	u_int l2_slot;
 	pn_t pn;
 
 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
 
 	/*
 	 * pmap_l2() returns a pointer to va's entry; mask it down to the
 	 * start of the containing page so that l2 can be indexed by slot.
 	 */
 	l2 = pmap_l2(kernel_pmap, va);
 	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
 	l2_slot = pmap_l2_index(va);
 	l3pt = l3_start;
 
 	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
 		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
 
 		/* Non-leaf entry (PTE_V only) pointing at the next L3 page. */
 		pa = pmap_early_vtophys(l1pt, l3pt);
 		pn = (pa / PAGE_SIZE);
 		entry = (PTE_V);
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(&l2[l2_slot], entry);
 		l3pt += PAGE_SIZE;
 	}
 
 
 	/* Zero the L3 table pages that were just handed out. */
 	memset((void *)l3_start, 0, l3pt - l3_start);
 
 	return (l3pt);
 }
 
 /*
  *	Bootstrap the system enough to run with virtual memory.
  */
 /*
  * Parameters:
  *	l1pt		virtual address of the kernel L1 page table
  *	kernstart	physical address the kernel was loaded at
  *	kernlen		length of the kernel image, measured from KERNBASE
  *
  * Sets up the kernel pmap, the direct map, the L3 tables for the top
  * L2_SIZE of kernel VA, the dynamic per-CPU area and the message buffer,
  * then fills in phys_avail[], dump_avail[], physmem, realmem and Maxmem
  * from physmap[].
  */
 void
 pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
 {
 	u_int l1_slot, l2_slot, avail_slot, map_slot;
 	vm_offset_t freemempos;
 	vm_offset_t dpcpu, msgbufpv;
 	vm_paddr_t end, max_pa, min_pa, pa, start;
 	int i;
 
 	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
 	printf("%lx\n", l1pt);
 	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
 
 	/* Set this early so we can use the pagetable walking functions */
 	kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt;
 	PMAP_LOCK_INIT(kernel_pmap);
 
 	rw_init(&pvh_global_lock, "pmap pv global");
 
 	/* Mark the kernel pmap active on every CPU. */
 	CPU_FILL(&kernel_pmap->pm_active);
 
 	/* Assume the address we were loaded to is a valid physical address. */
 	min_pa = max_pa = kernstart;
 
 	/*
 	 * Find the minimum and maximum physical addresses. physmap is
 	 * sorted, but may contain empty ranges.
 	 */
 	for (i = 0; i < physmap_idx * 2; i += 2) {
 		if (physmap[i] == physmap[i + 1])
 			continue;
 		if (physmap[i] <= min_pa)
 			min_pa = physmap[i];
 		if (physmap[i + 1] > max_pa)
 			max_pa = physmap[i + 1];
 	}
 	printf("physmap_idx %lx\n", physmap_idx);
 	printf("min_pa %lx\n", min_pa);
 	printf("max_pa %lx\n", max_pa);
 
 	/* Create a direct map region early so we can use it for pa -> va */
 	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);
 
 	/*
 	 * Read the page table to find out what is already mapped.
 	 * This assumes we have mapped a block of memory from KERNBASE
 	 * using a single L1 entry.
 	 */
 	(void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
 
 	/* Sanity check the index, KERNBASE should be the first VA */
 	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
 
 	/* First page-aligned virtual address past the kernel image. */
 	freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE);
 
 	/* Create the l3 tables for the early devmap */
 	freemempos = pmap_bootstrap_l3(l1pt,
 	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
 
 	sfence_vma();
 
 	/*
 	 * Bump allocator: hand out np zeroed pages starting at freemempos.
 	 * NOTE(review): expands to three statements and uses np
 	 * unparenthesized in the second one; the two uses below are safe,
 	 * but take care when adding new ones.
 	 */
 #define alloc_pages(var, np)						\
 	(var) = freemempos;						\
 	freemempos += (np * PAGE_SIZE);					\
 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
 
 	/* Allocate dynamic per-cpu area. */
 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
 	dpcpu_init((void *)dpcpu, 0);
 
 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
 	msgbufp = (void *)msgbufpv;
 
 	/*
 	 * Available kernel VA starts at the next L2 boundary and stops
 	 * short of the top L2_SIZE region that received L3 tables above.
 	 */
 	virtual_avail = roundup2(freemempos, L2_SIZE);
 	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
 	kernel_vm_end = virtual_avail;
 	
 	/* Physical address of the first byte not consumed by the bootstrap. */
 	pa = pmap_early_vtophys(l1pt, freemempos);
 
 	/* Initialize phys_avail and dump_avail. */
 	for (avail_slot = map_slot = physmem = 0; map_slot < physmap_idx * 2;
 	    map_slot += 2) {
 		start = physmap[map_slot];
 		end = physmap[map_slot + 1];
 
 		if (start == end)
 			continue;
 		dump_avail[map_slot] = start;
 		dump_avail[map_slot + 1] = end;
 		realmem += atop((vm_offset_t)(end - start));
 
 		/* Range lies entirely within [kernstart, pa): all in use. */
 		if (start >= kernstart && end <= pa)
 			continue;
 
 		/* Trim the part of the range that overlaps [kernstart, pa). */
 		if (start < kernstart && end > kernstart)
 			end = kernstart;
 		else if (start < pa && end > pa)
 			start = pa;
 		phys_avail[avail_slot] = start;
 		phys_avail[avail_slot + 1] = end;
 		physmem += (end - start) >> PAGE_SHIFT;
 		avail_slot += 2;
 
 		/*
 		 * The range was cut at kernstart but also extends beyond
 		 * pa: record the remainder after pa as a second range.
 		 */
 		if (end != physmap[map_slot + 1] && end > pa) {
 			phys_avail[avail_slot] = pa;
 			phys_avail[avail_slot + 1] = physmap[map_slot + 1];
 			physmem += (physmap[map_slot + 1] - pa) >> PAGE_SHIFT;
 			avail_slot += 2;
 		}
 	}
 	/* Terminate the phys_avail list with a zero pair. */
 	phys_avail[avail_slot] = 0;
 	phys_avail[avail_slot + 1] = 0;
 
 	/*
 	 * Maxmem isn't the "maximum memory", it's one larger than the
 	 * highest page of the physical address space.  It should be
 	 * called something like "Maxphyspage".
 	 */
 	Maxmem = atop(phys_avail[avail_slot - 1]);
 }
 
 /*
  *	Initialize a vm_page's machine-dependent fields.
  */
 /*
  * Gives the page an empty pv list and the write-back memory attribute.
  */
 void
 pmap_page_init(vm_page_t m)
 {
 
 	TAILQ_INIT(&m->md.pv_list);
 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
 }
 
 /*
  *	Initialize the pmap module.
  *	Called by vm_init, to initialize any structures that the pmap
  *	system needs to map virtual memory.
  */
 void
 pmap_init(void)
 {
 	vm_size_t table_bytes;
 	int idx, nframes;
 
 	/* Mutexes guarding the pv chunk list and the list of pmaps. */
 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);
 
 	/* The pool of pv list locks. */
 	for (idx = 0; idx < NPV_LIST_LOCKS; idx++)
 		rw_init(&pv_list_locks[idx], "pmap pv list");
 
 	/*
 	 * The superpage pv head table needs one md_page for every L2-sized
 	 * frame up to the end of the last physical segment.
 	 */
 	nframes = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
 
 	/* Allocate the table, zeroed and rounded up to whole pages. */
 	table_bytes = (vm_size_t)(nframes * sizeof(struct md_page));
 	table_bytes = round_page(table_bytes);
 	pv_table = (struct md_page *)kmem_malloc(table_bytes,
 	    M_WAITOK | M_ZERO);
 	for (idx = 0; idx < nframes; idx++)
 		TAILQ_INIT(&pv_table[idx].pv_list);
 	TAILQ_INIT(&pv_dummy.pv_list);
 
 	/* Register the superpage size only when superpages are enabled. */
 	if (superpages_enabled)
 		pagesizes[1] = L2_SIZE;
 }
 
 #ifdef SMP
 /*
  * For SMP, these functions have to use IPIs for coherence.
  *
  * In general, the calling thread uses a plain fence to order the
  * writes to the page tables before invoking an SBI callback to invoke
  * sfence_vma() on remote CPUs.
  */
 /*
  * Invalidate the TLB entry for va on every hart on which pmap is active.
  * The thread is pinned so that the "current hart" removed from the mask
  * stays the hart executing the local sfence_vma_page().
  */
 static void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 	cpuset_t mask;
 
 	sched_pin();
 	mask = pmap->pm_active;
 	/* The local hart is handled directly below, not through the SBI. */
 	CPU_CLR(PCPU_GET(hart), &mask);
 	/* Order the preceding page table writes before the remote request. */
 	fence();
 	if (!CPU_EMPTY(&mask) && smp_started)
 		sbi_remote_sfence_vma(mask.__bits, va, 1);
 	sfence_vma_page(va);
 	sched_unpin();
 }
 
 /*
  * Invalidate TLB entries for the range starting at sva on every hart on
  * which pmap is active.  Remote harts are asked for a ranged fence; the
  * local hart performs a full sfence_vma().
  *
  * NOTE(review): callers in this file pass eva as the first address past
  * the range, so "eva - sva + 1" asks remote harts for one byte more than
  * the range -- confirm whether the "+ 1" is intended.
  */
 static void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	cpuset_t mask;
 
 	sched_pin();
 	mask = pmap->pm_active;
 	/* The local hart is handled directly below, not through the SBI. */
 	CPU_CLR(PCPU_GET(hart), &mask);
 	/* Order the preceding page table writes before the remote request. */
 	fence();
 	if (!CPU_EMPTY(&mask) && smp_started)
 		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);
 
 	/*
 	 * Might consider a loop of sfence_vma_page() for a small
 	 * number of pages in the future.
 	 */
 	sfence_vma();
 	sched_unpin();
 }
 
 /*
  * Invalidate all TLB entries on every hart on which pmap is active.
  * Remote harts get an SBI request with address 0 and size 0; the local
  * hart performs a full sfence_vma().
  */
 static void
 pmap_invalidate_all(pmap_t pmap)
 {
 	cpuset_t mask;
 
 	sched_pin();
 	mask = pmap->pm_active;
 	/* The local hart is handled directly below, not through the SBI. */
 	CPU_CLR(PCPU_GET(hart), &mask);
 
 	/*
 	 * XXX: The SBI doc doesn't detail how to specify x0 as the
 	 * address to perform a global fence.  BBL currently treats
 	 * all sfence_vma requests as global however.
 	 */
 	fence();
 	if (!CPU_EMPTY(&mask) && smp_started)
 		sbi_remote_sfence_vma(mask.__bits, 0, 0);
 	sfence_vma();
 	sched_unpin();
 }
 #else
 /*
  * Normal, non-SMP, invalidation functions.
  * We inline these within pmap.c for speed.
  */
 /* Uniprocessor variant: invalidate the local TLB entry for va only. */
 static __inline void
 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
 {
 
 	sfence_vma_page(va);
 }
 
 /*
  * Uniprocessor variant: sva and eva are ignored and a full local
  * sfence_vma() is issued.
  */
 static __inline void
 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 
 	/*
 	 * Might consider a loop of sfence_vma_page() for a small
 	 * number of pages in the future.
 	 */
 	sfence_vma();
 }
 
 /* Uniprocessor variant: a full local sfence_vma(). */
 static __inline void
 pmap_invalidate_all(pmap_t pmap)
 {
 
 	sfence_vma();
 }
 #endif
 
 /*
  *	Routine:	pmap_extract
  *	Function:
  *		Extract the physical page address associated
  *		with the given map/virtual_address pair.
  */
 vm_paddr_t
 pmap_extract(pmap_t pmap, vm_offset_t va)
 {
 	pd_entry_t *l2_ptr, l2e;
 	pt_entry_t *l3_ptr, l3e;
 	vm_paddr_t result;
 
 	result = 0;
 	PMAP_LOCK(pmap);
 
 	/*
 	 * Start with the l2 table. We are unable to allocate
 	 * pages in the l1 table.
 	 */
 	l2_ptr = pmap_l2(pmap, va);
 	if (l2_ptr == NULL)
 		goto out;
 
 	l2e = pmap_load(l2_ptr);
 	if ((l2e & PTE_RX) != 0) {
 		/* L2 is superpages */
 		result = (l2e >> PTE_PPN1_S) << L2_SHIFT;
 		result |= (va & L2_OFFSET);
 		goto out;
 	}
 
 	/* The L2 entry is not a leaf: take the address from the L3 entry. */
 	l3_ptr = pmap_l2_to_l3(l2_ptr, va);
 	if (l3_ptr != NULL) {
 		l3e = pmap_load(l3_ptr);
 		result = PTE_TO_PHYS(l3e);
 		result |= (va & L3_OFFSET);
 	}
 out:
 	PMAP_UNLOCK(pmap);
 	return (result);
 }
 
 /*
  *	Routine:	pmap_extract_and_hold
  *	Function:
  *		Atomically extract and hold the physical page
  *		with the given pmap and virtual address pair
  *		if that mapping permits the given protection.
  */
 /*
  * Returns the wired vm_page, or NULL when there is no L3 entry for va,
  * the entry is zero, or write access was requested but the entry lacks
  * PTE_W.
  *
  * NOTE(review): the lookup uses pmap_l3(), which returns NULL when the L2
  * entry is a leaf, so addresses mapped by an L2 superpage yield NULL here
  * -- confirm that this is intended.
  */
 vm_page_t
 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
 {
 	pt_entry_t *l3p, l3;
 	vm_paddr_t phys;
 	vm_paddr_t pa;
 	vm_page_t m;
 
 	pa = 0;
 	m = NULL;
 	PMAP_LOCK(pmap);
 retry:
 	l3p = pmap_l3(pmap, va);
 	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
 		/* Writable mapping, or the caller did not ask for write. */
 		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
 			phys = PTE_TO_PHYS(l3);
 			/*
 			 * A non-zero return restarts the lookup; presumably
 			 * the pmap lock was dropped while switching page
 			 * locks, so the entry must be re-read -- confirm
 			 * against vm_page_pa_tryrelock().
 			 */
 			if (vm_page_pa_tryrelock(pmap, phys, &pa))
 				goto retry;
 			m = PHYS_TO_VM_PAGE(phys);
 			vm_page_wire(m);
 		}
 	}
 	PA_UNLOCK_COND(pa);
 	PMAP_UNLOCK(pmap);
 	return (m);
 }
 
 /*
  * Translate a kernel virtual address to a physical address.  Direct-map
  * addresses are converted arithmetically; anything else is looked up in
  * the kernel page tables, handling both L2 superpages and L3 pages.
  * Panics when no L2 entry exists for va.
  */
 vm_paddr_t
 pmap_kextract(vm_offset_t va)
 {
 	pd_entry_t *l2_ptr;
 	pt_entry_t *l3_ptr;
 	vm_paddr_t result;
 
 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
 		result = DMAP_TO_PHYS(va);
 		return (result);
 	}
 
 	l2_ptr = pmap_l2(kernel_pmap, va);
 	if (l2_ptr == NULL)
 		panic("pmap_kextract: No l2");
 
 	if ((pmap_load(l2_ptr) & PTE_RX) != 0) {
 		/* superpages */
 		result = (pmap_load(l2_ptr) >> PTE_PPN1_S) << L2_SHIFT;
 		result |= (va & L2_OFFSET);
 		return (result);
 	}
 
 	l3_ptr = pmap_l2_to_l3(l2_ptr, va);
 	if (l3_ptr == NULL)
 		panic("pmap_kextract: No l3...");
 	result = PTE_TO_PHYS(pmap_load(l3_ptr));
 	result |= (va & PAGE_MASK);
 	return (result);
 }
 
 /***************************************************
  * Low level mapping routines.....
  ***************************************************/
 
 /*
  * Map size bytes of physical memory starting at pa into the kernel at
  * sva, one L3 entry per page, using the PTE_KERN attributes.  sva, pa and
  * size must all be page-aligned, and L3 tables covering the range must
  * already exist.
  */
 void
 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
 {
 	pt_entry_t *l3;
 	vm_offset_t va;
 	pn_t pn;
 
 	KASSERT((pa & L3_OFFSET) == 0,
 	   ("pmap_kenter_device: Invalid physical address"));
 	KASSERT((sva & L3_OFFSET) == 0,
 	   ("pmap_kenter_device: Invalid virtual address"));
 	KASSERT((size & PAGE_MASK) == 0,
 	    ("pmap_kenter_device: Mapping is not page-sized"));
 
 	for (va = sva; size != 0;
 	    va += PAGE_SIZE, pa += PAGE_SIZE, size -= PAGE_SIZE) {
 		l3 = pmap_l3(kernel_pmap, va);
 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
 
 		pn = (pa / PAGE_SIZE);
 		pmap_store(l3, PTE_KERN | (pn << PTE_PPN0_S));
 	}
 
 	/* va now points just past the last page entered. */
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  * Remove a page from the kernel pagetables.
  * Note: not SMP coherent.
  */
 /*
  * Clears the L3 entry for va and issues a local sfence_vma() only; no
  * request is sent to other harts.  An L3 table must already cover va
  * (asserted).
  */
 PMAP_INLINE void
 pmap_kremove(vm_offset_t va)
 {
 	pt_entry_t *l3;
 
 	l3 = pmap_l3(kernel_pmap, va);
 	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
 
 	pmap_clear(l3);
 	sfence_vma();
 }
 
 /*
  * Remove the kernel mappings for size bytes starting at sva by clearing
  * one L3 entry per page, then invalidate the range.  sva and size must be
  * page-aligned, and L3 tables covering the range must exist.
  */
 void
 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
 {
 	pt_entry_t *l3;
 	vm_offset_t va;
 
 	KASSERT((sva & L3_OFFSET) == 0,
 	   ("pmap_kremove_device: Invalid virtual address"));
 	KASSERT((size & PAGE_MASK) == 0,
 	    ("pmap_kremove_device: Mapping is not page-sized"));
 
 	for (va = sva; size != 0; va += PAGE_SIZE, size -= PAGE_SIZE) {
 		l3 = pmap_l3(kernel_pmap, va);
 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
 		pmap_clear(l3);
 	}
 
 	/* va now points just past the last page removed. */
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  *	Used to map a range of physical addresses into kernel
  *	virtual address space.
  *
  *	The value passed in '*virt' is a suggested virtual address for
  *	the mapping. Architectures which can support a direct-mapped
  *	physical to virtual region can return the appropriate address
  *	within that region, leaving '*virt' unchanged. Other
  *	architectures should map the pages starting at '*virt' and
  *	update '*virt' with the first usable address after the mapped
  *	region.
  */
 vm_offset_t
 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
 {
 
 	/*
 	 * The direct map is used here, so '*virt', 'end' and 'prot' are
 	 * not consulted and '*virt' is left unchanged.
 	 */
 	return (PHYS_TO_DMAP(start));
 }
 
 
 /*
  * Add a list of wired pages to the kva
  * this routine is only used for temporary
  * kernel mappings that do not need to have
  * page modification or references recorded.
  * Note that old mappings are simply written
  * over.  The page *must* be wired.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 /*
  * Parameters:
  *	sva	first kernel virtual address to map
  *	ma	array of 'count' pages to map at consecutive page addresses
  *	count	number of pages in 'ma'
  *
  * Each page is entered with the PTE_KERN attributes.  L3 tables covering
  * the range must already exist; as in pmap_qremove() and
  * pmap_kenter_device(), this is asserted.
  */
 void
 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
 {
 	pt_entry_t *l3;
 	vm_paddr_t pa;		/* a physical address, not a PTE */
 	vm_offset_t va;
 	vm_page_t m;
 	pt_entry_t entry;
 	pn_t pn;
 	int i;
 
 	va = sva;
 	for (i = 0; i < count; i++) {
 		m = ma[i];
 		pa = VM_PAGE_TO_PHYS(m);
 		pn = (pa / PAGE_SIZE);
 		l3 = pmap_l3(kernel_pmap, va);
 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
 
 		entry = PTE_KERN;
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l3, entry);
 
 		va += L3_SIZE;
 	}
 	/* va now points just past the last page entered. */
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  * This routine tears out page mappings from the
  * kernel -- it is meant only for temporary mappings.
  * Note: SMP coherent.  Uses a ranged shootdown IPI.
  */
 /*
  * Parameters:
  *	sva	first kernel virtual address to unmap
  *	count	number of consecutive pages to unmap
  *
  * sva must be a kernel address and L3 tables must cover the whole range;
  * both are asserted.
  */
 void
 pmap_qremove(vm_offset_t sva, int count)
 {
 	pt_entry_t *l3;
 	vm_offset_t va;
 
 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
 
 	for (va = sva; count-- > 0; va += PAGE_SIZE) {
 		l3 = pmap_l3(kernel_pmap, va);
 		/* Name this function, not pmap_kremove(), in the message. */
 		KASSERT(l3 != NULL, ("pmap_qremove: Invalid address"));
 		pmap_clear(l3);
 	}
 	/* va now points just past the last page removed. */
 	pmap_invalidate_range(kernel_pmap, sva, va);
 }
 
 /*
  * Report whether superpages are enabled.  The pmap argument is unused;
  * the answer is the global superpages_enabled setting.
  */
 bool
 pmap_ps_enabled(pmap_t pmap __unused)
 {
 
 	return (superpages_enabled);
 }
 
 /***************************************************
  * Page table page management routines.....
  ***************************************************/
 /*
  * Schedule the specified unused page table page to be freed.  Specifically,
  * add the page to the specified list of pages that will be released to the
  * physical memory manager after the TLB has been updated.
  */
 static __inline void
 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
     boolean_t set_PG_ZERO)
 {
 
 	/* Make the page's PG_ZERO flag match the caller's request. */
 	if (!set_PG_ZERO)
 		m->flags &= ~PG_ZERO;
 	else
 		m->flags |= PG_ZERO;
 
 	/* Push the page onto the front of the deferred-free list. */
 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
 }
 
 /*
  * Inserts the specified page table page into the specified pmap's collection
  * of idle page table pages.  Each of a pmap's page table pages is responsible
  * for mapping a distinct range of virtual addresses.  The pmap's collection is
  * ordered by this virtual address range.
  *
  * If "promoted" is false, then the page table page "ml3" must be zero filled.
  */
 /*
  * Returns the result of vm_radix_insert().  The pmap lock must be held
  * (asserted).
  */
 static __inline int
 pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	/* Record in the page's valid bits whether it came from a promotion. */
 	ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0;
 	return (vm_radix_insert(&pmap->pm_root, ml3));
 }
 
 /*
  * Removes the page table page mapping the specified virtual address from the
  * specified pmap's collection of idle page table pages, and returns it.
  * Otherwise, returns NULL if there is no page table page corresponding to the
  * specified virtual address.
  */
 /*
  * The collection is keyed by the L2 frame index of va.  The pmap lock
  * must be held (asserted).
  */
 static __inline vm_page_t
 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
 {
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
 }
 	
 /*
  * Drops one wiring from the page table page "m".  The wire count tracks the
  * number of valid page table entries held in the page.  When the last wiring
  * goes away the page is unmapped via _pmap_unwire_ptp() and TRUE is returned;
  * otherwise the page stays in place and FALSE is returned.
  */
 static inline boolean_t
 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 
 	m->wire_count--;
 	if (m->wire_count != 0)
 		return (FALSE);
 	_pmap_unwire_ptp(pmap, va, m, free);
 	return (TRUE);
 }
 
 /*
  * Unmap the page table page "m", whose wire count has reached zero, and queue
  * it on "free" for release once TLB invalidation is complete.
  *
  * A page index at or above NUL1E identifies an L2 table, referenced from the
  * pmap's L1 table; a lower index identifies an L3 table, referenced from an
  * L2 table.  Removing an L3 table also drops one wiring from the L2 table
  * that pointed to it, which may in turn unmap that table.
  */
 static void
 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
 {
 	pd_entry_t *pde;
 	vm_page_t parent;
 	bool is_l2_table;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	is_l2_table = m->pindex >= NUL1E;
 	if (is_l2_table) {
 		/* Clear the L1 entry and propagate the change. */
 		pde = pmap_l1(pmap, va);
 		pmap_clear(pde);
 		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
 	} else {
 		/* Clear the L2 entry that referenced this L3 table. */
 		pde = pmap_l2(pmap, va);
 		pmap_clear(pde);
 	}
 	pmap_resident_count_dec(pmap, 1);
 	if (!is_l2_table) {
 		/* The parent L2 table just lost one valid entry. */
 		pde = pmap_l1(pmap, va);
 		parent = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(pde)));
 		pmap_unwire_ptp(pmap, va, parent, free);
 	}
 	pmap_invalidate_page(pmap, va);
 
 	vm_wire_sub(1);
 
 	/*
 	 * Defer the actual free: the page goes on a list so that it is
 	 * released only after *ALL* TLB shootdown is done.
 	 */
 	pmap_add_delayed_free_list(m, free, TRUE);
 }
 
 /*
  * Called after a page table entry has been removed.  For addresses below
  * VM_MAXUSER_ADDRESS, drops one wiring from the page table page referenced by
  * "ptepde" and returns the result of pmap_unwire_ptp(), i.e. nonzero when
  * that page table page was unmapped.  For other addresses nothing is done
  * and 0 is returned.
  */
 static int
 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
     struct spglist *free)
 {
 	vm_paddr_t ptpa;
 
 	if (va < VM_MAXUSER_ADDRESS) {
 		KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
 		ptpa = PTE_TO_PHYS(ptepde);
 		return (pmap_unwire_ptp(pmap, va, PHYS_TO_VM_PAGE(ptpa), free));
 	}
 	return (0);
 }
 
 /*
  * Initialize the given pmap so that it shares the kernel pmap's L1 table,
  * with cleared statistics and an empty active CPU set, and then pass it to
  * pmap_activate_boot().
  */
 void
 pmap_pinit0(pmap_t pmap)
 {
 
 	PMAP_LOCK_INIT(pmap);
 	CPU_ZERO(&pmap->pm_active);
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 
 	/* Borrow the kernel's L1 table and encode its page number in satp. */
 	pmap->pm_l1 = kernel_pmap->pm_l1;
 	pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT);
 
 	pmap_activate_boot(pmap);
 }
 
 /*
  * Initialize a pmap: allocate and wire a page for its L1 table, seed that
  * table with a copy of the kernel pmap's L1 entries, register the pmap on the
  * allpmaps list and initialize its radix tree of page table pages.  Waits
  * for memory as long as necessary and always returns 1.
  */
 int
 pmap_pinit(pmap_t pmap)
 {
 	vm_page_t l1pg;
 	vm_paddr_t l1pa;
 
 	/*
 	 * Allocate the L1 page, waiting for free memory whenever the
 	 * allocation fails.
 	 */
 	for (;;) {
 		l1pg = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL |
 		    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 		if (l1pg != NULL)
 			break;
 		vm_wait(NULL);
 	}
 
 	l1pa = VM_PAGE_TO_PHYS(l1pg);
 	pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1pa);
 	pmap->pm_satp = SATP_MODE_SV39 | (l1pa >> PAGE_SHIFT);
 
 	/* Zero the page unless the allocator already handed it back zeroed. */
 	if ((l1pg->flags & PG_ZERO) == 0)
 		pagezero(pmap->pm_l1);
 
 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
 
 	CPU_ZERO(&pmap->pm_active);
 
 	/* Start from the kernel's L1 entries. */
 	memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE);
 
 	/* Make the new pmap visible on the list of all user pmaps. */
 	mtx_lock(&allpmaps_lock);
 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
 	mtx_unlock(&allpmaps_lock);
 
 	vm_radix_init(&pmap->pm_root);
 
 	return (1);
 }
 
 /*
  * This routine is called if the desired page table page does not exist.
  *
  * "ptepindex" selects the page to create: values below NUL1E name an L3
  * page table page, while values at or above NUL1E name an L2 page table
  * page whose slot in the pmap's L1 table is (ptepindex - NUL1E).  On
  * success the new, wired page is returned with its entry installed in the
  * parent table and the pmap's resident count incremented.
  *
  * If page table page allocation fails, this routine may sleep before
  * returning NULL.  It sleeps only if a lock pointer was given.
  *
  * Note: If a page allocation fails at page table level two or three,
  * one or two pages may be held during the wait, only to be released
  * afterwards.  This conservative approach is easily argued to avoid
  * race conditions.
  */
 static vm_page_t
 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
 {
 	vm_page_t m, /*pdppg, */pdpg;
 	pt_entry_t entry;
 	vm_paddr_t phys;
 	pn_t pn;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/*
 	 * Allocate a page table page.
 	 */
 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
 		if (lockp != NULL) {
 			/*
 			 * Drop the PV list lock, the pmap lock and the global
 			 * PV lock before sleeping for memory, then take the
 			 * global PV lock and the pmap lock again.
 			 */
 			RELEASE_PV_LIST_LOCK(lockp);
 			PMAP_UNLOCK(pmap);
 			rw_runlock(&pvh_global_lock);
 			vm_wait(NULL);
 			rw_rlock(&pvh_global_lock);
 			PMAP_LOCK(pmap);
 		}
 
 		/*
 		 * Indicate the need to retry.  While waiting, the page table
 		 * page may have been allocated.
 		 */
 		return (NULL);
 	}
 
 	/* Zero the page unless the allocator already returned it zeroed. */
 	if ((m->flags & PG_ZERO) == 0)
 		pmap_zero_page(m);
 
 	/*
 	 * Map the pagetable page into the process address space, if
 	 * it isn't already there.
 	 */
 
 	if (ptepindex >= NUL1E) {
 		/* New L2 table: install it in the L1 table. */
 		pd_entry_t *l1;
 		vm_pindex_t l1index;
 
 		l1index = ptepindex - NUL1E;
 		l1 = &pmap->pm_l1[l1index];
 
 		/* Build a valid entry holding the new page's page number. */
 		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
 		entry = (PTE_V);
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l1, entry);
 		pmap_distribute_l1(pmap, l1index, entry);
 	} else {
 		/* New L3 table: install it in the covering L2 table. */
 		vm_pindex_t l1index;
 		pd_entry_t *l1, *l2;
 
 		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
 		l1 = &pmap->pm_l1[l1index];
 		if (pmap_load(l1) == 0) {
 			/* recurse for allocating page dir */
 			if (_pmap_alloc_l3(pmap, NUL1E + l1index,
 			    lockp) == NULL) {
 				/* Give back the page allocated above. */
 				vm_page_unwire_noq(m);
 				vm_page_free_zero(m);
 				return (NULL);
 			}
 		} else {
 			/*
 			 * The L2 table already exists; count the entry that
 			 * is about to be added to it.
 			 */
 			phys = PTE_TO_PHYS(pmap_load(l1));
 			pdpg = PHYS_TO_VM_PAGE(phys);
 			pdpg->wire_count++;
 		}
 
 		/* Locate the slot for this page within the L2 table. */
 		phys = PTE_TO_PHYS(pmap_load(l1));
 		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
 
 		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
 		entry = (PTE_V);
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l2, entry);
 	}
 
 	pmap_resident_count_inc(pmap, 1);
 
 	return (m);
 }
 
 /*
  * Returns the L2 page table page covering "va", holding one additional
  * wiring on behalf of the caller.
  *
  * If the L1 entry for "va" is valid and is not a leaf (no R/W/X bits), it
  * already points to an L2 table and that table's wire count is incremented.
  * Otherwise a new L2 page is allocated and installed by _pmap_alloc_l3().
  * An invalid (zero) L1 entry must take the allocation path: it has no R/W/X
  * bits either, so testing those bits alone would treat it as a pointer to a
  * page table at physical address 0.
  *
  * If the allocation fails and "lockp" is non-NULL, the allocator may have
  * slept and the lookup is retried.  With a NULL "lockp", NULL is returned.
  */
 static vm_page_t
 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	pd_entry_t *l1, l1e;
 	vm_page_t l2pg;
 	vm_pindex_t l1index;
 
 retry:
 	l1 = pmap_l1(pmap, va);
 	l1e = (l1 != NULL) ? pmap_load(l1) : 0;
 	if ((l1e & PTE_V) != 0 && (l1e & PTE_RWX) == 0) {
 		/* Add a reference to the existing L2 page. */
 		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l1e));
 		l2pg->wire_count++;
 	} else {
 		/*
 		 * Allocate a L2 page.  _pmap_alloc_l3() and
 		 * _pmap_unwire_ptp() identify L2 page table pages by the
 		 * page index NUL1E + <L1 index>, so use the same encoding.
 		 */
 		l1index = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
 		l2pg = _pmap_alloc_l3(pmap, NUL1E + l1index, lockp);
 		if (l2pg == NULL && lockp != NULL)
 			goto retry;
 	}
 	return (l2pg);
 }
 
 /*
  * Returns the L3 page table page covering "va", holding one additional
  * wiring on behalf of the caller.  An existing page merely has its wire
  * count incremented; otherwise _pmap_alloc_l3() is asked to create one.
  * When that fails and "lockp" is non-NULL the lookup is repeated, because
  * the page may have appeared while the allocator waited.  With a NULL
  * "lockp" the failure is reported by returning NULL.
  */
 static vm_page_t
 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
 {
 	vm_pindex_t ptepindex;
 	pd_entry_t *l2;
 	vm_page_t m;
 
 	/* The page index of the wanted page table page is fixed by "va". */
 	ptepindex = pmap_l2_pindex(va);
 	for (;;) {
 		/* Look up the L2 entry that would reference the page. */
 		l2 = pmap_l2(pmap, va);
 		if (l2 != NULL && pmap_load(l2) != 0) {
 			/* Already mapped: just take another wiring. */
 			m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
 			m->wire_count++;
 			break;
 		}
 
 		/* Not mapped, or deallocated in the meantime: allocate. */
 		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
 		if (m != NULL || lockp == NULL)
 			break;
 	}
 	return (m);
 }
 
 
 /***************************************************
  * Pmap allocation/deallocation routines.
  ***************************************************/
 
 /*
  * Tear down a pmap that was set up by pmap_pinit().  The pmap must hold no
  * valid mappings (its resident count is asserted to be zero) and must not be
  * active on any CPU.  The pmap is unlinked from the allpmaps list and the
  * page backing its L1 table is unwired and freed.
  */
 void
 pmap_release(pmap_t pmap)
 {
 	vm_paddr_t l1pa;
 	vm_page_t l1pg;
 
 	KASSERT(pmap->pm_stats.resident_count == 0,
 	    ("pmap_release: pmap resident count %ld != 0",
 	    pmap->pm_stats.resident_count));
 	KASSERT(CPU_EMPTY(&pmap->pm_active),
 	    ("releasing active pmap %p", pmap));
 
 	mtx_lock(&allpmaps_lock);
 	LIST_REMOVE(pmap, pm_list);
 	mtx_unlock(&allpmaps_lock);
 
 	/* pm_l1 is a direct-map address; recover the page behind it. */
 	l1pa = DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1);
 	l1pg = PHYS_TO_VM_PAGE(l1pa);
 	vm_page_unwire_noq(l1pg);
 	vm_page_free(l1pg);
 }
 
 /*
  * Compiled out: sysctl handlers that would report the size of the kernel
  * virtual address range (vm.kvm_size) and the part of it above
  * kernel_vm_end (vm.kvm_free).
  */
 #if 0
 static int
 kvm_size(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
 
 	return sysctl_handle_long(oidp, &ksize, 0, req);
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG|CTLFLAG_RD, 
     0, 0, kvm_size, "LU", "Size of KVM");
 
 static int
 kvm_free(SYSCTL_HANDLER_ARGS)
 {
 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
 
 	return sysctl_handle_long(oidp, &kfree, 0, req);
 }
 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG|CTLFLAG_RD, 
     0, 0, kvm_free, "LU", "Amount of KVM free");
 #endif /* 0 */
 
 /*
  * Grow the kernel page tables, if needed, so that every L2_SIZE region
  * between kernel_vm_end and "addr" (rounded up to an L2_SIZE boundary and
  * clamped to the end of kernel_map) has an L2 entry that points to a page
  * table page.  kernel_vm_end is advanced as regions are covered.  Missing
  * L2 and L3 tables are allocated on the way; the routine panics if a page
  * cannot be allocated.  The caller must hold kernel_map's system mutex.
  */
 void
 pmap_growkernel(vm_offset_t addr)
 {
 	vm_paddr_t paddr;
 	vm_page_t nkpg;
 	pd_entry_t *l1, *l2;
 	pt_entry_t entry;
 	pn_t pn;
 
 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
 
 	addr = roundup2(addr, L2_SIZE);
 	/* Comparing addr - 1 also clamps an addr that wrapped around to 0. */
 	if (addr - 1 >= vm_map_max(kernel_map))
 		addr = vm_map_max(kernel_map);
 	while (kernel_vm_end < addr) {
 		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
 		if (pmap_load(l1) == 0) {
 			/* We need a new PDP entry */
 			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
 			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 			if (nkpg == NULL)
 				panic("pmap_growkernel: no memory to grow kernel");
 			if ((nkpg->flags & PG_ZERO) == 0)
 				pmap_zero_page(nkpg);
 			paddr = VM_PAGE_TO_PHYS(nkpg);
 
 			/* Install the new L2 table and propagate the L1 entry. */
 			pn = (paddr / PAGE_SIZE);
 			entry = (PTE_V);
 			entry |= (pn << PTE_PPN0_S);
 			pmap_store(l1, entry);
 			pmap_distribute_l1(kernel_pmap,
 			    pmap_l1_index(kernel_vm_end), entry);
 			continue; /* try again */
 		}
 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
 		/*
 		 * A valid L2 entry without R/W/X bits already points to a
 		 * page table page; skip to the next L2_SIZE region.
 		 */
 		if ((pmap_load(l2) & PTE_V) != 0 &&
 		    (pmap_load(l2) & PTE_RWX) == 0) {
 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 				kernel_vm_end = vm_map_max(kernel_map);
 				break;
 			}
 			continue;
 		}
 
 		/* Allocate and install a page table page for this region. */
 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 		    VM_ALLOC_ZERO);
 		if (nkpg == NULL)
 			panic("pmap_growkernel: no memory to grow kernel");
 		if ((nkpg->flags & PG_ZERO) == 0) {
 			pmap_zero_page(nkpg);
 		}
 		paddr = VM_PAGE_TO_PHYS(nkpg);
 
 		pn = (paddr / PAGE_SIZE);
 		entry = (PTE_V);
 		entry |= (pn << PTE_PPN0_S);
 		pmap_store(l2, entry);
 
 		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
 
 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
 			kernel_vm_end = vm_map_max(kernel_map);
 			break;                       
 		}
 	}
 }
 
 
 /***************************************************
  * page management routines.
  ***************************************************/
 
 /*
  * Compile-time checks of the PV chunk layout that the code below relies on:
  * a chunk occupies exactly one page, has a three-word allocation bitmap and
  * holds 168 PV entries.
  */
 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
 CTASSERT(_NPCM == 3);
 CTASSERT(_NPCPV == 168);
 
 /*
  * Returns the PV chunk that contains the given PV entry.  The chunk address
  * is obtained by clearing the page offset bits of the entry's address.
  */
 static __inline struct pv_chunk *
 pv_to_chunk(pv_entry_t pv)
 {
 	uintptr_t chunk_addr;
 
 	chunk_addr = (uintptr_t)pv & ~(uintptr_t)PAGE_MASK;
 	return ((struct pv_chunk *)chunk_addr);
 }
 
 /* The pmap that owns a PV entry, found through the entry's chunk. */
 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
 
 /*
  * Values of the three pc_map words when every entry of a chunk is free.  A
  * set bit marks a free entry.  The masks carry 64 + 64 + 40 = 168 set bits,
  * one per PV entry in a chunk (_NPCPV).
  */
 #define	PC_FREE0	0xfffffffffffffffful
 #define	PC_FREE1	0xfffffffffffffffful
 #define	PC_FREE2	0x000000fffffffffful
 
 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
 
 /*
  * Compiled out: counters and vm.pmap sysctls for PV chunk and PV entry
  * statistics.  The PV_STAT() invocations elsewhere in this file refer to
  * these counters.
  */
 #if 0
 #ifdef PV_STATS
 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
 
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
 	"Current number of pv entry chunks");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
 	"Current number of pv entry chunks allocated");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
 	"Current number of pv entry chunks frees");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
 	"Number of times tried to get a chunk page but failed.");
 
 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
 static int pv_entry_spare;
 
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
 	"Current number of pv entry frees");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
 	"Current number of pv entry allocs");
 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
 	"Current number of pv entries");
 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
 	"Current number of spare pv entries");
 #endif
 #endif /* 0 */
 
 /*
  * We are in a serious low memory condition.  Resort to
  * drastic measures to free some pages so we can allocate
  * another pv entry chunk.
  *
  * Returns NULL if PV entries were reclaimed from the specified pmap.
  *
  * We do not, however, unmap 2mpages because subsequent accesses will
  * allocate per-page pv entries until repromotion occurs, thereby
  * exacerbating the shortage of free pv entries.
  *
  * NOTE: the text above describes the intended contract.  The body is
  * currently a stub that panics unconditionally, so every caller that
  * reaches this function on an allocation failure brings the system down.
  */
 static vm_page_t
 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
 {
 
 	panic("RISCVTODO: reclaim_pv_chunk");
 }
 
 /*
  * Return a PV entry to its chunk by setting the entry's bit in the chunk's
  * free bitmap.  If the chunk thereby becomes completely free, it is removed
  * from the pmap's chunk list and released.  Otherwise the chunk is moved to
  * the head of the pmap's chunk list, unless it is already there.
  */
 static void
 free_pv_entry(pmap_t pmap, pv_entry_t pv)
 {
 	struct pv_chunk *pc;
 	int slot, word, shift;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
 
 	/* Mark the entry's slot as free in the chunk's bitmap. */
 	pc = pv_to_chunk(pv);
 	slot = pv - &pc->pc_pventry[0];
 	word = slot / 64;
 	shift = slot % 64;
 	pc->pc_map[word] |= 1ul << shift;
 
 	if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
 	    pc->pc_map[2] == PC_FREE2) {
 		/* Nothing in the chunk is in use any more; release it. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		free_pv_chunk(pc);
 		return;
 	}
 
 	/* 98% of the time, pc is already at the head of the list. */
 	if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	}
 }
 
 /*
  * Release a PV chunk whose entries are all free: unlink it from the global
  * pv_chunks list, remove its page from the dump set, and unwire and free the
  * page that backs it.  The caller has already removed the chunk from its
  * pmap's chunk list.
  */
 static void
 free_pv_chunk(struct pv_chunk *pc)
 {
 	vm_page_t chunk_pg;
 
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
 
 	/* The chunk lives in the direct map; recover its page and free it. */
 	chunk_pg = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
 	dump_drop_page(chunk_pg->phys_addr);
 	vm_page_unwire_noq(chunk_pg);
 	vm_page_free(chunk_pg);
 }
 
 /*
  * Returns a new PV entry, allocating a new PV chunk from the system when
  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
  * returned.
  *
  * Only the chunk at the head of the pmap's chunk list is searched for a free
  * entry; chunks that become full are moved to the tail of that list.
  *
  * The given PV list lock may be released.
  */
 static pv_entry_t
 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
 {
 	int bit, field;
 	pv_entry_t pv;
 	struct pv_chunk *pc;
 	vm_page_t m;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
 retry:
 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 	if (pc != NULL) {
 		/* Find the first bitmap word with a free (set) bit. */
 		for (field = 0; field < _NPCM; field++) {
 			if (pc->pc_map[field]) {
 				bit = ffsl(pc->pc_map[field]) - 1;
 				break;
 			}
 		}
 		if (field < _NPCM) {
 			/* Claim the entry by clearing its bit. */
 			pv = &pc->pc_pventry[field * 64 + bit];
 			pc->pc_map[field] &= ~(1ul << bit);
 			/* If this was the last item, move it to tail */
 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
 			    pc->pc_map[2] == 0) {
 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
 				    pc_list);
 			}
 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
 			return (pv);
 		}
 	}
 	/* No free items, allocate another chunk */
 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 	    VM_ALLOC_WIRED);
 	if (m == NULL) {
 		/* Without a lock pointer, reclamation is not permitted. */
 		if (lockp == NULL) {
 			PV_STAT(pc_chunk_tryfail++);
 			return (NULL);
 		}
 		m = reclaim_pv_chunk(pmap, lockp);
 		if (m == NULL)
 			goto retry;
 	}
 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
 	dump_add_page(m->phys_addr);
 	/* Set up the new chunk, which is accessed through the direct map. */
 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 	pc->pc_pmap = pmap;
 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
 	pc->pc_map[1] = PC_FREE1;
 	pc->pc_map[2] = PC_FREE2;
 	mtx_lock(&pv_chunks_mutex);
 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
 	mtx_unlock(&pv_chunks_mutex);
 	/* Entry 0 was marked allocated above; it is the one returned. */
 	pv = &pc->pc_pventry[0];
 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
 	return (pv);
 }
 
 /*
  * Ensure that the number of spare PV entries in the specified pmap meets or
  * exceeds the given count, "needed".
  *
  * Free entries are counted in the chunks at the front of the pmap's chunk
  * list, stopping at the first full chunk.  New, entirely free chunks are
  * then allocated until the count is reached.
  *
  * The given PV list lock may be released.
  */
 static void
 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
 {
 	struct pch new_tail;
 	struct pv_chunk *pc;
 	vm_page_t m;
 	int avail, free;
 	bool reclaimed;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
 
 	/*
 	 * Newly allocated PV chunks must be stored in a private list until
 	 * the required number of PV chunks have been allocated.  Otherwise,
 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
 	 * contrast, these chunks must be added to the pmap upon allocation.
 	 */
 	TAILQ_INIT(&new_tail);
 retry:
 	avail = 0;
 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
 		/* Count the set (free) bits in this chunk's bitmap. */
 		bit_count((bitstr_t *)pc->pc_map, 0,
 		    sizeof(pc->pc_map) * NBBY, &free);
 		if (free == 0)
 			break;
 		avail += free;
 		if (avail >= needed)
 			break;
 	}
 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
 		    VM_ALLOC_WIRED);
 		if (m == NULL) {
 			m = reclaim_pv_chunk(pmap, lockp);
 			if (m == NULL)
 				goto retry;
 			reclaimed = true;
 		}
 		/* XXX PV STATS */
 		/*
 		 * NOTE(review): dump_add_page() is disabled here, whereas
 		 * get_pv_entry() calls it and free_pv_chunk() always calls
 		 * dump_drop_page() - confirm this asymmetry is intended.
 		 */
 #if 0
 		dump_add_page(m->phys_addr);
 #endif
 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
 		pc->pc_pmap = pmap;
 		pc->pc_map[0] = PC_FREE0;
 		pc->pc_map[1] = PC_FREE1;
 		pc->pc_map[2] = PC_FREE2;
 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
 
 		/*
 		 * The reclaim might have freed a chunk from the current pmap.
 		 * If that chunk contained available entries, we need to
 		 * re-count the number of available entries.
 		 */
 		if (reclaimed)
 			goto retry;
 	}
 	/* Publish the privately held chunks on the global pv_chunks list. */
 	if (!TAILQ_EMPTY(&new_tail)) {
 		mtx_lock(&pv_chunks_mutex);
 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
 		mtx_unlock(&pv_chunks_mutex);
 	}
 }
 
 /*
  * Searches the given pv list for the entry that belongs to the specified
  * pmap and virtual address.  If one is found, it is unlinked from the list,
  * the list's generation count is incremented and the entry is returned;
  * otherwise NULL is returned.  This operation can be performed on pv lists
  * for either 4KB or 2MB page mappings.
  */
 static __inline pv_entry_t
 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t pv;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		if (pmap != PV_PMAP(pv) || va != pv->pv_va)
 			continue;
 		TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 		pvh->pv_gen++;
 		return (pv);
 	}
 	return (NULL);
 }
 
 /*
  * Removes the pv entry for the specified pmap and virtual address from the
  * given pv list and returns it to its chunk.  The entry is asserted to
  * exist.  This operation can be performed on pv lists for either 4KB or 2MB
  * page mappings.
  */
 static void
 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
 {
 	pv_entry_t found;
 
 	found = pmap_pvh_remove(pvh, pmap, va);
 
 	KASSERT(found != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
 	free_pv_entry(pmap, found);
 }
 
 /*
  * Attempts to create the PV entry for a 4KB page mapping without resorting
  * to reclamation.  On success the entry is appended to the page's pv list
  * and TRUE is returned; FALSE means that no PV entry could be obtained.
  */
 static boolean_t
 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
     struct rwlock **lockp)
 {
 	pv_entry_t pv;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Pass NULL instead of the lock pointer to disable reclamation. */
 	pv = get_pv_entry(pmap, NULL);
 	if (pv == NULL)
 		return (FALSE);
 
 	pv->pv_va = va;
 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 	m->md.pv_gen++;
 	return (TRUE);
 }
 
 /*
  * After demotion from a 2MB page mapping to 512 4KB page mappings,
  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
  * entries for each of the 4KB page mappings.
  *
  * The 511 additional entries are taken directly from the free slots of the
  * chunks at the head of the pmap's chunk list; the KASSERT in the loop
  * requires that the head chunk always has a spare entry, so the caller is
  * expected to have reserved enough entries beforehand.
  */
 static void __unused
 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	struct pv_chunk *pc;
 	pv_entry_t pv;
 	vm_page_t m;
 	vm_offset_t va_last;
 	int bit, field;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/*
 	 * Transfer the 2mpage's pv entry for this mapping to the first
 	 * page's pv list.  Once this transfer begins, the pv list lock
 	 * must not be released until the last pv entry is reinstantiated.
 	 */
 	pvh = pa_to_pvh(pa);
 	va &= ~L2_OFFSET;
 	pv = pmap_pvh_remove(pvh, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
 	m = PHYS_TO_VM_PAGE(pa);
 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 	m->md.pv_gen++;
 	/* Instantiate the remaining 511 pv entries. */
 	va_last = va + L2_SIZE - PAGE_SIZE;
 	for (;;) {
 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
 		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
 		for (field = 0; field < _NPCM; field++) {
 			while (pc->pc_map[field] != 0) {
 				/* Claim the lowest free entry of this word. */
 				bit = ffsl(pc->pc_map[field]) - 1;
 				pc->pc_map[field] &= ~(1ul << bit);
 				pv = &pc->pc_pventry[field * 64 + bit];
 				/* Advance to the next 4KB page and its va. */
 				va += PAGE_SIZE;
 				pv->pv_va = va;
 				m++;
 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 			    ("pmap_pv_demote_l2: page %p is not managed", m));
 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 				m->md.pv_gen++;
 				if (va == va_last)
 					goto out;
 			}
 		}
 		/* This chunk is now full; move it to the tail of the list. */
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 out:
 	/* If the last chunk used ended up full, move it to the tail too. */
 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
 	}
 	/* XXX PV stats */
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * Adjusts the pv entries when 4KB page mappings are promoted to a single 2MB
  * page mapping.  The pv entry of the first 4KB page is moved to the 2MB
  * page's pv list, and the pv entries of the remaining 4KB pages in the range
  * are freed.  "va" must be aligned to L2_SIZE.
  */
 static void
 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
     struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_page_t pg;
 	vm_offset_t last_va;
 
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	KASSERT((va & L2_OFFSET) == 0,
 	    ("pmap_pv_promote_l2: misaligned va %#lx", va));
 
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 
 	/* Re-home the first page's pv entry onto the 2MB page's pv list. */
 	pg = PHYS_TO_VM_PAGE(pa);
 	pv = pmap_pvh_remove(&pg->md, pmap, va);
 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
 	pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 	pvh->pv_gen++;
 
 	/* Free the pv entries of every following 4KB page in the range. */
 	last_va = va + L2_SIZE - PAGE_SIZE;
 	do {
 		va += PAGE_SIZE;
 		pg++;
 		pmap_pvh_free(&pg->md, pmap, va);
 	} while (va < last_va);
 }
 #endif /* VM_NRESERVLEVEL > 0 */
 
 /*
  * Creates the PV entry for a 2MB page mapping and appends it to the pv list
  * of the 2MB page named by "l2e".  Returns true on success.  If the flag
  * PMAP_ENTER_NORECLAIM is given, PV entry reclamation is disabled, and false
  * is returned when no PV entry can be obtained without it.
  */
 static bool
 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
     struct rwlock **lockp)
 {
 	struct rwlock **reclaim_lockp;
 	struct md_page *pvh;
 	pv_entry_t pv;
 	vm_paddr_t pa;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Pass NULL instead of the lock pointer to disable reclamation. */
 	if ((flags & PMAP_ENTER_NORECLAIM) != 0)
 		reclaim_lockp = NULL;
 	else
 		reclaim_lockp = lockp;
 	pv = get_pv_entry(pmap, reclaim_lockp);
 	if (pv == NULL)
 		return (false);
 
 	pv->pv_va = va;
 	pa = PTE_TO_PHYS(l2e);
 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
 	pvh = pa_to_pvh(pa);
 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 	pvh->pv_gen++;
 	return (true);
 }
 
 /*
  * Finish the removal of a kernel 2MB mapping.  The L2 entry, which the
  * caller has already cleared, is repointed at the page table page that was
  * saved in the pmap's collection for "va", so that the kernel keeps a page
  * table page for the region.  The page table page is zeroed first if it
  * still holds the 4KB mappings left over from a promotion.  Panics if no
  * page table page was saved.
  */
 static void
 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
 {
 	pt_entry_t newl2, oldl2;
 	vm_page_t ml3;
 	vm_paddr_t ml3pa;
 
 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	ml3 = pmap_remove_pt_page(pmap, va);
 	if (ml3 == NULL)
 		panic("pmap_remove_kernel_l2: Missing pt page");
 
 	/*
 	 * A page table entry stores the physical page number shifted to
 	 * PTE_PPN0_S, not the physical address itself.  Build the entry the
 	 * same way as the other page table installations in this file so
 	 * that PTE_TO_PHYS() recovers the page table page's address.
 	 */
 	ml3pa = VM_PAGE_TO_PHYS(ml3);
 	newl2 = ((ml3pa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
 
 	/*
 	 * If this page table page was unmapped by a promotion, then it
 	 * contains valid mappings.  Zero it to invalidate those mappings.
 	 */
 	if (ml3->valid != 0)
 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
 
 	/*
 	 * Demote the mapping.
 	 */
 	oldl2 = pmap_load_store(l2, newl2);
 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
 	    __func__, l2, oldl2));
 }
 
 /*
  * pmap_remove_l2: Do the things to unmap a level 2 superpage.
  *
  * Clears the L2 entry, invalidates the whole 2MB range, updates the pmap's
  * wired and resident counts, and, for a managed mapping, frees its pv entry
  * and transfers the dirty and referenced state to each constituent 4KB page.
  * A page table page saved for the range is re-installed for the kernel pmap
  * and queued on "free" for any other pmap.  Returns the result of
  * pmap_unuse_pt() on the page table page referenced by "l1e".
  */
 static int
 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
 {
 	struct md_page *pvh;
 	pt_entry_t oldl2;
 	vm_offset_t eva, va;
 	vm_page_t m, ml3;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
 	oldl2 = pmap_load_clear(l2);
 	KASSERT((oldl2 & PTE_RWX) != 0,
 	    ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
 
 	/*
 	 * The sfence.vma documentation states that it is sufficient to specify
 	 * a single address within a superpage mapping.  However, since we do
 	 * not perform any invalidation upon promotion, TLBs may still be
 	 * caching 4KB mappings within the superpage, so we must invalidate the
 	 * entire range.
 	 */
 	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
 	if ((oldl2 & PTE_SW_WIRED) != 0)
 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
 	if ((oldl2 & PTE_SW_MANAGED) != 0) {
 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
 		pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
 		pmap_pvh_free(pvh, pmap, sva);
 		eva = sva + L2_SIZE;
 		/* Propagate the superpage's state to each 4KB page. */
 		for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2));
 		    va < eva; va += PAGE_SIZE, m++) {
 			if ((oldl2 & PTE_D) != 0)
 				vm_page_dirty(m);
 			if ((oldl2 & PTE_A) != 0)
 				vm_page_aflag_set(m, PGA_REFERENCED);
 			/* No mappings of either size remain for this page. */
 			if (TAILQ_EMPTY(&m->md.pv_list) &&
 			    TAILQ_EMPTY(&pvh->pv_list))
 				vm_page_aflag_clear(m, PGA_WRITEABLE);
 		}
 	}
 	if (pmap == kernel_pmap) {
 		pmap_remove_kernel_l2(pmap, l2, sva);
 	} else {
 		ml3 = pmap_remove_pt_page(pmap, sva);
 		if (ml3 != NULL) {
 			KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
 			    ("pmap_remove_l2: l3 page not promoted"));
 			pmap_resident_count_dec(pmap, 1);
 			KASSERT(ml3->wire_count == Ln_ENTRIES,
 			    ("pmap_remove_l2: l3 page wire count error"));
 			/* Collapse the wirings to one, then drop that one. */
 			ml3->wire_count = 1;
 			vm_page_unwire_noq(ml3);
 			pmap_add_delayed_free_list(ml3, free, FALSE);
 		}
 	}
 	return (pmap_unuse_pt(pmap, sva, l1e, free));
 }
 
 /*
  * pmap_remove_l3: unmap a single 4KB page.
  *
  * Clears the L3 entry, invalidates the address, and updates the pmap's wired
  * and resident counts.  For a managed mapping, the dirty and referenced
  * state is transferred to the page and the mapping's pv entry is freed.
  * Returns the result of pmap_unuse_pt() on the page table page referenced by
  * "l2e", i.e. nonzero when that page table page was unmapped.
  */
 static int
 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
 {
 	pt_entry_t old_l3;
 	vm_page_t m;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 	old_l3 = pmap_load_clear(l3);
 	pmap_invalidate_page(pmap, va);
 	if ((old_l3 & PTE_SW_WIRED) != 0)
 		pmap->pm_stats.wired_count--;
 	pmap_resident_count_dec(pmap, 1);
 	if ((old_l3 & PTE_SW_MANAGED) != 0) {
 		m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(old_l3));
 		if ((old_l3 & PTE_D) != 0)
 			vm_page_dirty(m);
 		if ((old_l3 & PTE_A) != 0)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
 		pmap_pvh_free(&m->md, pmap, va);
 	}
 
 	return (pmap_unuse_pt(pmap, va, l2e, free));
 }
 
 /*
  *	Remove the given range of addresses from the specified map.
  *
  *	It is assumed that the start and end are properly
  *	rounded to the page size.
  *
  *	The range is walked one L2 entry at a time.  A 2MB mapping that lies
  *	entirely within the range is removed as a unit; one that is only
  *	partly covered is demoted first.  Page table pages released along the
  *	way are collected on a local list and freed after the locks have been
  *	dropped.
  */
 void
 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	struct spglist free;
 	struct rwlock *lock;
 	vm_offset_t va, va_next;
 	pd_entry_t *l1, *l2, l2e;
 	pt_entry_t *l3;
 
 	/*
 	 * Perform an unsynchronized read.  This is, however, safe.
 	 */
 	if (pmap->pm_stats.resident_count == 0)
 		return;
 
 	SLIST_INIT(&free);
 
 	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 
 	lock = NULL;
 	for (; sva < eva; sva = va_next) {
 		/* Nothing is left to remove once the pmap is empty. */
 		if (pmap->pm_stats.resident_count == 0)
 			break;
 
 		l1 = pmap_l1(pmap, sva);
 		if (pmap_load(l1) == 0) {
 			/* No L2 table here; skip the whole L1 region. */
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		/*
 		 * Calculate index for next page table.
 		 */
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (l2 == NULL)
 			continue;
 		if ((l2e = pmap_load(l2)) == 0)
 			continue;
 		if ((l2e & PTE_RWX) != 0) {
 			/* The L2 entry is a 2MB mapping. */
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 				(void)pmap_remove_l2(pmap, l2, sva,
 				    pmap_load(l1), &free, &lock);
 				continue;
 			} else if (!pmap_demote_l2_locked(pmap, l2, sva,
 			    &lock)) {
 				/*
 				 * The large page mapping was destroyed.
 				 */
 				continue;
 			}
 			l2e = pmap_load(l2);
 		}
 
 		/*
 		 * Limit our scan to either the end of the va represented
 		 * by the current page table page, or to the end of the
 		 * range being removed.
 		 */
 		if (va_next > eva)
 			va_next = eva;
 
 		/*
 		 * "va" marks the start of the current run of removed pages
 		 * that still needs a ranged invalidation; va == va_next
 		 * means that no such run is pending.
 		 */
 		va = va_next;
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			if (pmap_load(l3) == 0) {
 				/* A hole ends the pending run; flush it. */
 				if (va != va_next) {
 					pmap_invalidate_range(pmap, va, sva);
 					va = va_next;
 				}
 				continue;
 			}
 			if (va == va_next)
 				va = sva;
 			/*
 			 * A nonzero result means the page table page itself
 			 * was unmapped, so nothing else remains in it.
 			 */
 			if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
 				sva += L3_SIZE;
 				break;
 			}
 		}
 		if (va != va_next)
 			pmap_invalidate_range(pmap, va, sva);
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
 	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	vm_page_free_pages_toq(&free, false);
 }
 
 /*
  *	Routine:	pmap_remove_all
  *	Function:
  *		Removes this physical page from
  *		all physical maps in which it resides.
  *		Reflects back modify bits to the pager.
  *
  *	Notes:
  *		Original versions of this routine were very
  *		inefficient because they iteratively called
  *		pmap_remove (slow...)
  *
  *		The page must be managed.  Any 2MB mappings that
  *		include the page are demoted first, after which each
  *		4KB mapping on the page's pv list is destroyed.
  */
 
 void
 pmap_remove_all(vm_page_t m)
 {
 	struct spglist free;
 	struct md_page *pvh;
 	pmap_t pmap;
 	pt_entry_t *l3, l3e;
 	pd_entry_t *l2, l2e;
 	pv_entry_t pv;
 	vm_offset_t va;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_all: page %p is not managed", m));
 	SLIST_INIT(&free);
 	/* Fictitious pages use the dummy pv head for 2MB mappings. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 
 	rw_wlock(&pvh_global_lock);
 	/*
 	 * Demote every 2MB mapping that covers the page.  The loop ends only
 	 * when the 2MB pv list is empty, so it presumably relies on
 	 * pmap_demote_l2() removing the pv entry from that list whether or
 	 * not the demotion succeeds - confirm against pmap_demote_l2().
 	 */
 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		va = pv->pv_va;
 		l2 = pmap_l2(pmap, va);
 		(void)pmap_demote_l2(pmap, l2, va);
 		PMAP_UNLOCK(pmap);
 	}
 	/* Destroy each remaining 4KB mapping of the page. */
 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
 		pmap = PV_PMAP(pv);
 		PMAP_LOCK(pmap);
 		pmap_resident_count_dec(pmap, 1);
 		l2 = pmap_l2(pmap, pv->pv_va);
 		KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
 		l2e = pmap_load(l2);
 
 		KASSERT((l2e & PTE_RX) == 0,
 		    ("pmap_remove_all: found a superpage in %p's pv list", m));
 
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		l3e = pmap_load_clear(l3);
 		pmap_invalidate_page(pmap, pv->pv_va);
 		if (l3e & PTE_SW_WIRED)
 			pmap->pm_stats.wired_count--;
 		if ((l3e & PTE_A) != 0)
 			vm_page_aflag_set(m, PGA_REFERENCED);
 
 		/*
 		 * Update the vm_page_t clean and reference bits.
 		 */
 		if ((l3e & PTE_D) != 0)
 			vm_page_dirty(m);
 		pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		free_pv_entry(pmap, pv);
 		PMAP_UNLOCK(pmap);
 	}
 	/* No mappings remain, so the page can no longer be written. */
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_wunlock(&pvh_global_lock);
 	vm_page_free_pages_toq(&free, false);
 }
 
 /*
  *	Set the physical protection on the
  *	specified range of this map as requested.
  *
  *	If read permission is not requested, the range is removed instead.
  *	If both write and execute permission are requested, nothing is
  *	changed.  Otherwise PTE_W and PTE_D (when write is not requested)
  *	and/or PTE_X (when execute is not requested) are cleared from every
  *	valid mapping in [sva, eva).  A 2MB mapping that lies entirely within
  *	the range is updated in place; one that is only partly covered is
  *	demoted first and its 4KB mappings are updated individually.
  */
 void
 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
 {
 	pd_entry_t *l1, *l2, l2e;
 	pt_entry_t *l3, l3e, mask;
 	vm_page_t m, mt;
 	vm_paddr_t pa;
 	vm_offset_t va_next;
 	bool anychanged, pv_lists_locked;
 
 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
 		pmap_remove(pmap, sva, eva);
 		return;
 	}
 
 	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
 		return;
 
 	anychanged = false;
 	pv_lists_locked = false;
 
 	/* Build the set of PTE bits that must be cleared. */
 	mask = 0;
 	if ((prot & VM_PROT_WRITE) == 0)
 		mask |= PTE_W | PTE_D;
 	if ((prot & VM_PROT_EXECUTE) == 0)
 		mask |= PTE_X;
 resume:
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		l1 = pmap_l1(pmap, sva);
 		if (pmap_load(l1) == 0) {
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if (l2 == NULL || (l2e = pmap_load(l2)) == 0)
 			continue;
 		if ((l2e & PTE_RWX) != 0) {
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 retryl2:
 				/*
 				 * Transfer the dirty state to every 4KB page
 				 * of the superpage before PTE_D is cleared.
 				 */
 				if ((prot & VM_PROT_WRITE) == 0 &&
 				    (l2e & (PTE_SW_MANAGED | PTE_D)) ==
 				    (PTE_SW_MANAGED | PTE_D)) {
 					pa = PTE_TO_PHYS(l2e);
 					m = PHYS_TO_VM_PAGE(pa);
 					for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
 						vm_page_dirty(mt);
 				}
 				if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
 					goto retryl2;
 				anychanged = true;
 
 				/*
 				 * The L2 entry is a leaf, so there is no L3
 				 * table beneath it.  Falling through would
 				 * make pmap_l2_to_l3() interpret the mapped
 				 * data itself as page table entries.
 				 */
 				continue;
 			} else {
 				if (!pv_lists_locked) {
 					pv_lists_locked = true;
 					if (!rw_try_rlock(&pvh_global_lock)) {
 						if (anychanged)
 							pmap_invalidate_all(
 							    pmap);
 						PMAP_UNLOCK(pmap);
 						rw_rlock(&pvh_global_lock);
 						goto resume;
 					}
 				}
 				if (!pmap_demote_l2(pmap, l2, sva)) {
 					/*
 					 * The large page mapping was destroyed.
 					 */
 					continue;
 				}
 			}
 		}
 
 		if (va_next > eva)
 			va_next = eva;
 
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			l3e = pmap_load(l3);
 retryl3:
 			if ((l3e & PTE_V) == 0)
 				continue;
 			if ((prot & VM_PROT_WRITE) == 0 &&
 			    (l3e & (PTE_SW_MANAGED | PTE_D)) ==
 			    (PTE_SW_MANAGED | PTE_D)) {
 				m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e));
 				vm_page_dirty(m);
 			}
 			if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
 				goto retryl3;
 			anychanged = true;
 		}
 	}
 	if (anychanged)
 		pmap_invalidate_all(pmap);
 	if (pv_lists_locked)
 		rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Attempts to resolve a page fault at "va" without involving the VM system
  * by setting the accessed bit, and for write faults the dirty bit, in an
  * existing valid mapping that already grants the faulting access type.
  *
  * Returns 1 if the mapping permitted the access (the fault is considered
  * handled) and 0 otherwise.
  */
 int
 pmap_fault_fixup(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
 {
 	pd_entry_t *l2, l2e;
 	pt_entry_t bits, *pte, oldpte;
 	int rv;
 
 	rv = 0;
 	PMAP_LOCK(pmap);
 	l2 = pmap_l2(pmap, va);
 	if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
 		goto done;
 	if ((l2e & PTE_RWX) == 0) {
 		/*
 		 * The L2 entry points to an L3 table.  Keep the complete L3
 		 * entry in "oldpte": the permission checks below need its
 		 * PTE_U/PTE_R/PTE_W/PTE_X bits, not just PTE_V.
 		 */
 		pte = pmap_l2_to_l3(l2, va);
 		if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0)
 			goto done;
 	} else {
 		/* The L2 entry is itself a leaf (2MB) mapping. */
 		pte = l2;
 		oldpte = l2e;
 	}
 
 	if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) ||
 	    (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) ||
 	    (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) ||
 	    (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0))
 		goto done;
 
 	bits = PTE_A;
 	if (ftype == VM_PROT_WRITE)
 		bits |= PTE_D;
 
 	/*
 	 * Spurious faults can occur if the implementation caches invalid
 	 * entries in the TLB, or if simultaneous accesses on multiple CPUs
 	 * race with each other.
 	 */
 	if ((oldpte & bits) != bits)
 		pmap_store_bits(pte, bits);
 	sfence_vma();
 	rv = 1;
 done:
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  * Convenience wrapper around pmap_demote_l2_locked() for callers that do not
  * track a PV list lock themselves.  A temporary lock pointer is handed to
  * the callee and whatever lock it leaves there is write-unlocked before
  * returning.  The result of the demotion attempt is passed through.
  */
 static bool
 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va)
 {
 	struct rwlock *pvlock = NULL;
 	bool demoted;
 
 	demoted = pmap_demote_l2_locked(pmap, l2, va, &pvlock);
 	if (pvlock == NULL)
 		return (demoted);
 	rw_wunlock(pvlock);
 	return (demoted);
 }
 
 /*
  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
  * mapping is invalidated.
  *
  * The pmap lock must be held.  "l2" must point to a leaf L2 entry that
  * maps "va".  "lockp" is passed through to the PV helpers and to
  * pmap_remove_l2().
  *
  * Returns true if the 2MB mapping was replaced by a page table page holding
  * Ln_ENTRIES 4KB mappings, and false if the mapping was removed instead
  * (the entry was never accessed, or no page table page could be obtained).
  */
 static bool
 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
     struct rwlock **lockp)
 {
 	struct spglist free;
 	vm_page_t mpte;
 	pd_entry_t newl2, oldl2;
 	pt_entry_t *firstl3, newl3;
 	vm_paddr_t mptepa;
 	int i;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	oldl2 = pmap_load(l2);
 	KASSERT((oldl2 & PTE_RWX) != 0,
 	    ("pmap_demote_l2_locked: oldl2 is not a leaf entry"));
 	/*
 	 * Prefer the page table page saved by an earlier promotion.  If the
 	 * mapping was never accessed, or no saved page exists and a new one
 	 * cannot be allocated, remove the 2MB mapping instead of demoting it.
 	 */
 	if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
 	    NULL) {
 		if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL,
 		    pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT :
 		    VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) ==
 		    NULL) {
 			SLIST_INIT(&free);
 			(void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET,
 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
 			vm_page_free_pages_toq(&free, true);
 			CTR2(KTR_PMAP, "pmap_demote_l2_locked: "
 			    "failure for va %#lx in pmap %p", va, pmap);
 			return (false);
 		}
 		/*
 		 * A freshly allocated user page table page is accounted as
 		 * fully populated and counted as resident.
 		 */
 		if (va < VM_MAXUSER_ADDRESS) {
 			mpte->wire_count = Ln_ENTRIES;
 			pmap_resident_count_inc(pmap, 1);
 		}
 	}
 	mptepa = VM_PAGE_TO_PHYS(mpte);
 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
 	/* The new L2 entry carries only PTE_V: it points to the L3 table. */
 	newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
 	KASSERT((oldl2 & PTE_A) != 0,
 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_A"));
 	KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W,
 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_D"));
 	/* Each 4KB entry starts out as a copy of the old 2MB entry. */
 	newl3 = oldl2;
 
 	/*
 	 * If the page table page is not leftover from an earlier promotion,
 	 * initialize it.
 	 */
 	if (mpte->valid == 0) {
 		for (i = 0; i < Ln_ENTRIES; i++)
 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
 	}
 	KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3),
 	    ("pmap_demote_l2_locked: firstl3 and newl3 map different physical "
 	    "addresses"));
 
 	/*
 	 * If the mapping has changed attributes, update the page table
 	 * entries.
 	 */
 	if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE))
 		for (i = 0; i < Ln_ENTRIES; i++)
 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
 
 	/*
 	 * The spare PV entries must be reserved prior to demoting the
 	 * mapping, that is, prior to changing the L2 entry.  Otherwise, the
 	 * state of the L2 entry and the PV lists will be inconsistent, which
 	 * can result in reclaim_pv_chunk() attempting to remove a PV entry from
 	 * the wrong PV list and pmap_pv_demote_l2() failing to find the
 	 * expected PV entry for the 2MB page mapping that is being demoted.
 	 */
 	if ((oldl2 & PTE_SW_MANAGED) != 0)
 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
 
 	/*
 	 * Demote the mapping.
 	 */
 	pmap_store(l2, newl2);
 
 	/*
 	 * Demote the PV entry.
 	 */
 	if ((oldl2 & PTE_SW_MANAGED) != 0)
 		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
 
 	atomic_add_long(&pmap_l2_demotions, 1);
 	CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p",
 	    va, pmap);
 	return (true);
 }
 
 #if VM_NRESERVLEVEL > 0
 /*
  * Tries to replace the L3 table referenced by "l2" with a single 2MB leaf
  * mapping.  Promotion is abandoned, and pmap_l2_p_failures is incremented,
  * unless all of the following hold:
  *   - the physical address in the first L3 entry is 2MB aligned;
  *   - every following L3 entry maps the next consecutive physical page and
  *     has the same PTE_PROMOTE bits as the first entry;
  *   - the page table page can be saved with pmap_insert_pt_page().
  * On success the PV entries are promoted for managed mappings, the first L3
  * entry is stored into the L2 entry, and pmap_l2_promotions is incremented.
  *
  * The pmap lock must be held.
  */
 static void
 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
     struct rwlock **lockp)
 {
 	pt_entry_t *firstl3, *cur;
 	vm_paddr_t expected_pa;
 	vm_page_t ptpage;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	va &= ~L2_OFFSET;
 	KASSERT((pmap_load(l2) & PTE_RWX) == 0,
 	    ("pmap_promote_l2: invalid l2 entry %p", l2));
 
 	/* The run of physical pages must begin on a 2MB boundary. */
 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
 	expected_pa = PTE_TO_PHYS(pmap_load(firstl3));
 	if ((expected_pa & L2_OFFSET) != 0)
 		goto fail;
 
 	/* Every later entry must continue the run with equal attributes. */
 	for (cur = firstl3 + 1; cur < firstl3 + Ln_ENTRIES; cur++) {
 		expected_pa += PAGE_SIZE;
 		if (PTE_TO_PHYS(pmap_load(cur)) != expected_pa ||
 		    (pmap_load(cur) & PTE_PROMOTE) !=
 		    (pmap_load(firstl3) & PTE_PROMOTE))
 			goto fail;
 	}
 
 	/* Keep the page table page around for a later demotion. */
 	ptpage = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
 	KASSERT(ptpage->pindex == pmap_l2_pindex(va),
 	    ("pmap_promote_l2: page table page's pindex is wrong"));
 	if (pmap_insert_pt_page(pmap, ptpage, true))
 		goto fail;
 
 	if ((pmap_load(firstl3) & PTE_SW_MANAGED) != 0)
 		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(pmap_load(firstl3)),
 		    lockp);
 
 	pmap_store(l2, pmap_load(firstl3));
 
 	atomic_add_long(&pmap_l2_promotions, 1);
 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
 	    pmap);
 	return;
 
 fail:
 	CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
 	    va, pmap);
 	atomic_add_long(&pmap_l2_p_failures, 1);
 }
 #endif
 
 /*
  *	Insert the given physical page (p) at
  *	the specified virtual address (v) in the
  *	target physical map with the protection requested.
  *
  *	If specified, the page will be wired down, meaning
  *	that the related pte can not be reclaimed.
  *
  *	NB:  This is the only routine which MAY NOT lazy-evaluate
  *	or lose information.  That is, this routine must actually
  *	insert this page into the given map NOW.
  *
  *	"flags" carries the PMAP_ENTER_* options; its VM_PROT_WRITE bit
  *	is also consulted to pre-set PTE_D in the new entry.  When
  *	"psind" is 1 the request is handed to pmap_enter_l2() and that
  *	function's result is returned.  Otherwise KERN_SUCCESS is
  *	returned, or KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was
  *	given and a page table page could not be allocated.
  */
 int
 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     u_int flags, int8_t psind)
 {
 	struct rwlock *lock;
 	pd_entry_t *l1, *l2, l2e;
 	pt_entry_t new_l3, orig_l3;
 	pt_entry_t *l3;
 	pv_entry_t pv;
 	vm_paddr_t opa, pa, l2_pa, l3_pa;
 	vm_page_t mpte, om, l2_m, l3_m;
 	pt_entry_t entry;
 	pn_t l2_pn, l3_pn, pn;
 	int rv;
 	bool nosleep;
 
 	va = trunc_page(va);
 	if ((m->oflags & VPO_UNMANAGED) == 0 && !vm_page_xbusied(m))
 		VM_OBJECT_ASSERT_LOCKED(m->object);
 	pa = VM_PAGE_TO_PHYS(m);
 	pn = (pa / PAGE_SIZE);
 
 	/* Assemble the new leaf entry from the requested protection. */
 	new_l3 = PTE_V | PTE_R | PTE_A;
 	if (prot & VM_PROT_EXECUTE)
 		new_l3 |= PTE_X;
 	if (flags & VM_PROT_WRITE)
 		new_l3 |= PTE_D;
 	if (prot & VM_PROT_WRITE)
 		new_l3 |= PTE_W;
 	if (va < VM_MAX_USER_ADDRESS)
 		new_l3 |= PTE_U;
 
 	new_l3 |= (pn << PTE_PPN0_S);
 	if ((flags & PMAP_ENTER_WIRED) != 0)
 		new_l3 |= PTE_SW_WIRED;
 
 	/*
 	 * Set modified bit gratuitously for writeable mappings if
 	 * the page is unmanaged. We do not want to take a fault
 	 * to do the dirty bit accounting for these mappings.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) != 0) {
 		if (prot & VM_PROT_WRITE)
 			new_l3 |= PTE_D;
 	} else
 		new_l3 |= PTE_SW_MANAGED;
 
 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
 
 	lock = NULL;
 	mpte = NULL;
 	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	if (psind == 1) {
 		/* Assert the required virtual and physical alignment. */
 		KASSERT((va & L2_OFFSET) == 0,
 		    ("pmap_enter: va %#lx unaligned", va));
 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
 		rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock);
 		goto out;
 	}
 
 	/*
 	 * Locate the L3 entry for "va".  An existing 2MB mapping is demoted
 	 * first; a missing page table page is allocated.
 	 */
 	l2 = pmap_l2(pmap, va);
 	if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 &&
 	    ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2,
 	    va, &lock))) {
 		l3 = pmap_l2_to_l3(l2, va);
 		if (va < VM_MAXUSER_ADDRESS) {
 			/* Take a reference on the user page table page. */
 			mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
 			mpte->wire_count++;
 		}
 	} else if (va < VM_MAXUSER_ADDRESS) {
 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
 		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
 		if (mpte == NULL && nosleep) {
 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
 			if (lock != NULL)
 				rw_wunlock(lock);
 			rw_runlock(&pvh_global_lock);
 			PMAP_UNLOCK(pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 		l3 = pmap_l3(pmap, va);
 	} else {
 		/* Kernel address: build any missing L2/L3 tables by hand. */
 		l3 = pmap_l3(pmap, va);
 		/* TODO: This is not optimal, but should mostly work */
 		if (l3 == NULL) {
 			if (l2 == NULL) {
 				l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
 				    VM_ALLOC_ZERO);
 				if (l2_m == NULL)
 					panic("pmap_enter: l2 pte_m == NULL");
 				if ((l2_m->flags & PG_ZERO) == 0)
 					pmap_zero_page(l2_m);
 
 				l2_pa = VM_PAGE_TO_PHYS(l2_m);
 				l2_pn = (l2_pa / PAGE_SIZE);
 
 				l1 = pmap_l1(pmap, va);
 				entry = (PTE_V);
 				entry |= (l2_pn << PTE_PPN0_S);
 				pmap_store(l1, entry);
 				pmap_distribute_l1(pmap, pmap_l1_index(va), entry);
 				l2 = pmap_l1_to_l2(l1, va);
 			}
 
 			l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
 			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
 			if (l3_m == NULL)
 				panic("pmap_enter: l3 pte_m == NULL");
 			if ((l3_m->flags & PG_ZERO) == 0)
 				pmap_zero_page(l3_m);
 
 			l3_pa = VM_PAGE_TO_PHYS(l3_m);
 			l3_pn = (l3_pa / PAGE_SIZE);
 			entry = (PTE_V);
 			entry |= (l3_pn << PTE_PPN0_S);
 			pmap_store(l2, entry);
 			l3 = pmap_l2_to_l3(l2, va);
 		}
 		pmap_invalidate_page(pmap, va);
 	}
 
 	orig_l3 = pmap_load(l3);
 	opa = PTE_TO_PHYS(orig_l3);
 	pv = NULL;
 
 	/*
 	 * Is the specified virtual address already mapped?
 	 */
 	if ((orig_l3 & PTE_V) != 0) {
 		/*
 		 * Wiring change, just update stats. We don't worry about
 		 * wiring PT pages as they remain resident as long as there
 		 * are valid mappings in them. Hence, if a user page is wired,
 		 * the PT page will be also.
 		 */
 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
 		    (orig_l3 & PTE_SW_WIRED) == 0)
 			pmap->pm_stats.wired_count++;
 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
 		    (orig_l3 & PTE_SW_WIRED) != 0)
 			pmap->pm_stats.wired_count--;
 
 		/*
 		 * Remove the extra PT page reference.
 		 */
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			KASSERT(mpte->wire_count > 0,
 			    ("pmap_enter: missing reference to page table page,"
 			     " va: 0x%lx", va));
 		}
 
 		/*
 		 * Has the physical page changed?
 		 */
 		if (opa == pa) {
 			/*
 			 * No, might be a protection or wiring change.
 			 */
 			if ((orig_l3 & PTE_SW_MANAGED) != 0 &&
 			    (new_l3 & PTE_W) != 0)
 				vm_page_aflag_set(m, PGA_WRITEABLE);
 			goto validate;
 		}
 
 		/*
 		 * The physical page has changed.  Temporarily invalidate
 		 * the mapping.  This ensures that all threads sharing the
 		 * pmap keep a consistent view of the mapping, which is
 		 * necessary for the correct handling of COW faults.  It
 		 * also permits reuse of the old mapping's PV entry,
 		 * avoiding an allocation.
 		 *
 		 * For consistency, handle unmanaged mappings the same way.
 		 */
 		orig_l3 = pmap_load_clear(l3);
 		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
 		    ("pmap_enter: unexpected pa update for %#lx", va));
 		if ((orig_l3 & PTE_SW_MANAGED) != 0) {
 			om = PHYS_TO_VM_PAGE(opa);
 
 			/*
 			 * The pmap lock is sufficient to synchronize with
 			 * concurrent calls to pmap_page_test_mappings() and
 			 * pmap_ts_referenced().
 			 */
 			if ((orig_l3 & PTE_D) != 0)
 				vm_page_dirty(om);
 			if ((orig_l3 & PTE_A) != 0)
 				vm_page_aflag_set(om, PGA_REFERENCED);
 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
 			pv = pmap_pvh_remove(&om->md, pmap, va);
 			KASSERT(pv != NULL,
 			    ("pmap_enter: no PV entry for %#lx", va));
 			if ((new_l3 & PTE_SW_MANAGED) == 0)
 				free_pv_entry(pmap, pv);
 			if ((om->aflags & PGA_WRITEABLE) != 0 &&
 			    TAILQ_EMPTY(&om->md.pv_list))
 				vm_page_aflag_clear(om, PGA_WRITEABLE);
 		}
 		pmap_invalidate_page(pmap, va);
 		/* From here on the slot is treated as previously empty. */
 		orig_l3 = 0;
 	} else {
 		/*
 		 * Increment the counters.
 		 */
 		if ((new_l3 & PTE_SW_WIRED) != 0)
 			pmap->pm_stats.wired_count++;
 		pmap_resident_count_inc(pmap, 1);
 	}
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((new_l3 & PTE_SW_MANAGED) != 0) {
 		if (pv == NULL) {
 			pv = get_pv_entry(pmap, &lock);
 			pv->pv_va = va;
 		}
 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		if ((new_l3 & PTE_W) != 0)
 			vm_page_aflag_set(m, PGA_WRITEABLE);
 	}
 
 validate:
 	/*
 	 * Sync the i-cache on all harts before updating the PTE
 	 * if the new PTE is executable.
 	 */
 	if (prot & VM_PROT_EXECUTE)
 		pmap_sync_icache(pmap, va, PAGE_SIZE);
 
 	/*
 	 * Update the L3 entry.
 	 */
 	if (orig_l3 != 0) {
 		/* Same physical page: swap the entry and harvest PTE_D. */
 		orig_l3 = pmap_load_store(l3, new_l3);
 		pmap_invalidate_page(pmap, va);
 		KASSERT(PTE_TO_PHYS(orig_l3) == pa,
 		    ("pmap_enter: invalid update"));
 		if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) ==
 		    (PTE_D | PTE_SW_MANAGED))
 			vm_page_dirty(m);
 	} else {
 		pmap_store(l3, new_l3);
 	}
 
 #if VM_NRESERVLEVEL > 0
 	/*
 	 * Attempt promotion once the page table page is fully populated and
 	 * the page's reservation is fully populated.
 	 */
 	if (mpte != NULL && mpte->wire_count == Ln_ENTRIES &&
 	    pmap_ps_enabled(pmap) &&
 	    (m->flags & PG_FICTITIOUS) == 0 &&
 	    vm_reserv_level_iffullpop(m) == 0)
 		pmap_promote_l2(pmap, l2, va, &lock);
 #endif
 
 	rv = KERN_SUCCESS;
 out:
 	if (lock != NULL)
 		rw_wunlock(lock);
 	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  * Attempts to install a 2MB mapping of "m" at "va" that grants read access
  * and, if requested in "prot", execute access; write access is never
  * granted here.  The request is forwarded to pmap_enter_l2() with
  * PMAP_ENTER_NOSLEEP, PMAP_ENTER_NOREPLACE and PMAP_ENTER_NORECLAIM, so it
  * fails rather than sleeping, replacing an existing mapping, or reclaiming
  * a PV entry.  Returns true only when pmap_enter_l2() reports KERN_SUCCESS.
  *
  * The pmap lock must be held.
  */
 static bool
 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
     struct rwlock **lockp)
 {
 	pd_entry_t l2e;
 	pn_t ppn;
 	int error;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	ppn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE;
 	l2e = (pd_entry_t)((ppn << PTE_PPN0_S) | PTE_R | PTE_V);
 	if (va < VM_MAXUSER_ADDRESS)
 		l2e |= PTE_U;
 	if ((prot & VM_PROT_EXECUTE) != 0)
 		l2e |= PTE_X;
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		l2e |= PTE_SW_MANAGED;
 
 	error = pmap_enter_l2(pmap, va, l2e, PMAP_ENTER_NOSLEEP |
 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp);
 	return (error == KERN_SUCCESS);
 }
 
 /*
  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
  * a mapping already exists at the specified virtual address.  Returns
  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
  *
  * The parameter "m" is only used when creating a managed, writeable mapping.
  *
  * The pmap lock must be held.  "new_l2" is the complete leaf entry to store.
  */
 static int
 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
     vm_page_t m, struct rwlock **lockp)
 {
 	struct spglist free;
 	pd_entry_t *l2, *l3, oldl2;
 	vm_offset_t sva;
 	vm_page_t l2pg, mt;
 
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	/* Obtain the page that holds the L2 entry for "va". */
 	if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
 	    NULL : lockp)) == NULL) {
 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
 		    va, pmap);
 		return (KERN_RESOURCE_SHORTAGE);
 	}
 
 	l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
 	l2 = &l2[pmap_l2_index(va)];
 	if ((oldl2 = pmap_load(l2)) != 0) {
 		/* Something is already mapped here: refuse or tear it down. */
 		KASSERT(l2pg->wire_count > 1,
 		    ("pmap_enter_l2: l2pg's wire count is too low"));
 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
 			l2pg->wire_count--;
 			CTR2(KTR_PMAP,
 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
 			    va, pmap);
 			return (KERN_FAILURE);
 		}
 		SLIST_INIT(&free);
 		if ((oldl2 & PTE_RWX) != 0)
 			(void)pmap_remove_l2(pmap, l2, va,
 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
 		else
 			/* Remove each valid 4KB mapping under the old table. */
 			for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) {
 				l3 = pmap_l2_to_l3(l2, sva);
 				if ((pmap_load(l3) & PTE_V) != 0 &&
 				    pmap_remove_l3(pmap, l3, sva, oldl2, &free,
 				    lockp) != 0)
 					break;
 			}
 		vm_page_free_pages_toq(&free, true);
 		if (va >= VM_MAXUSER_ADDRESS) {
 			/*
 			 * Both pmap_remove_l2() and pmap_remove_l3() will
 			 * leave the kernel page table page zero filled.
 			 */
 			mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
 			if (pmap_insert_pt_page(pmap, mt, false))
 				panic("pmap_enter_l2: trie insert failed");
 		} else
 			KASSERT(pmap_load(l2) == 0,
 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
 	}
 
 	if ((new_l2 & PTE_SW_MANAGED) != 0) {
 		/*
 		 * Abort this mapping if its PV entry could not be created.
 		 */
 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_ptp(pmap, va, l2pg, &free)) {
 				/*
 				 * Although "va" is not mapped, paging-structure
 				 * caches could nonetheless have entries that
 				 * refer to the freed page table pages.
 				 * Invalidate those entries.
 				 */
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, true);
 			}
 			CTR2(KTR_PMAP,
 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
 			    va, pmap);
 			return (KERN_RESOURCE_SHORTAGE);
 		}
 		/* Flag every constituent 4KB page of a writeable mapping. */
 		if ((new_l2 & PTE_W) != 0)
 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
 				vm_page_aflag_set(mt, PGA_WRITEABLE);
 	}
 
 	/*
 	 * Increment counters.
 	 */
 	if ((new_l2 & PTE_SW_WIRED) != 0)
 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
 
 	/*
 	 * Map the superpage.
 	 */
 	pmap_store(l2, new_l2);
 
 	atomic_add_long(&pmap_l2_mappings, 1);
 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
 	    va, pmap);
 
 	return (KERN_SUCCESS);
 }
 
 /*
  * Maps a run of resident pages from a single object, starting with
  * m_start at virtual address start.  Every later page on the object's
  * list is mapped at start plus its pindex distance from m_start, for as
  * long as that distance stays below the number of pages between start and
  * end.  Virtual pages with no corresponding resident page are left
  * unmapped.
  *
  * When a page begins a 2MB-aligned span that fits below end, has psind 1,
  * and superpages are enabled for the pmap, a 2MB mapping is attempted
  * first; if that succeeds the remaining pages of the span are skipped.
  * Otherwise the page is entered with pmap_enter_quick_locked().
  */
 void
 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
     vm_page_t m_start, vm_prot_t prot)
 {
 	struct rwlock *pvlock;
 	vm_offset_t va;
 	vm_page_t m, ptpage;
 	vm_pindex_t npages, pgoff;
 
 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
 
 	npages = atop(end - start);
 	ptpage = NULL;
 	pvlock = NULL;
 	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	for (m = m_start; m != NULL; m = TAILQ_NEXT(m, listq)) {
 		pgoff = m->pindex - m_start->pindex;
 		if (pgoff >= npages)
 			break;
 		va = start + ptoa(pgoff);
 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
 		    pmap_enter_2mpage(pmap, va, m, prot, &pvlock)) {
 			/* Step to the last 4KB page of the superpage. */
 			m = &m[L2_SIZE / PAGE_SIZE - 1];
 		} else {
 			ptpage = pmap_enter_quick_locked(pmap, va, m, prot,
 			    ptpage, &pvlock);
 		}
 	}
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Locking wrapper around pmap_enter_quick_locked() for a single page.
  *
  * The notes historically attached to this routine list the assumptions
  * it makes in exchange for being much faster than pmap_enter():
  * 1. Current pmap & pmap exists.
  * 2. Not wired.
  * 3. Read access.
  * 4. No page table pages.
  */
 
 void
 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
 {
 	struct rwlock *pvlock = NULL;
 
 	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &pvlock);
 	if (pvlock != NULL)
 		rw_wunlock(pvlock);
 	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  * Creates a read-only (optionally executable) 4KB mapping of "m" at "va"
  * without sleeping.  Nothing is done if a mapping, including a 2MB
  * mapping, already exists at "va", if a page table page cannot be
  * allocated, or if a PV entry cannot be inserted for a managed page.
  *
  * "mpte" is a hint: the page table page used by the previous call, or NULL.
  * The return value is the page table page used for a user address when the
  * mapping was created, and NULL otherwise; callers may pass it back as the
  * hint for the next call.
  *
  * The pvh global lock and the pmap lock must be held.
  */
 static vm_page_t
 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
 {
 	struct spglist free;
 	vm_paddr_t phys;
 	pd_entry_t *l2;
 	pt_entry_t *l3, newl3;
 
 	KASSERT(va < kmi.clean_sva || va >= kmi.clean_eva ||
 	    (m->oflags & VPO_UNMANAGED) != 0,
 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
 	rw_assert(&pvh_global_lock, RA_LOCKED);
 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
 
 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
 	/*
 	 * In the case that a page table page is not
 	 * resident, we are creating it here.
 	 */
 	if (va < VM_MAXUSER_ADDRESS) {
 		vm_pindex_t l2pindex;
 
 		/*
 		 * Calculate pagetable page index
 		 */
 		l2pindex = pmap_l2_pindex(va);
 		if (mpte && (mpte->pindex == l2pindex)) {
 			mpte->wire_count++;
 		} else {
 			/*
 			 * Get the l2 entry
 			 */
 			l2 = pmap_l2(pmap, va);
 
 			/*
 			 * If the page table page is mapped, we just increment
 			 * the hold count, and activate it.  Otherwise, we
 			 * attempt to allocate a page table page.  If this
 			 * attempt fails, we don't retry.  Instead, we give up.
 			 */
 			if (l2 != NULL && pmap_load(l2) != 0) {
 				/*
 				 * A leaf L2 entry is a 2MB mapping, not a
 				 * pointer to a page table page: "va" is
 				 * already mapped, so there is nothing to do.
 				 */
 				if ((pmap_load(l2) & PTE_RWX) != 0)
 					return (NULL);
 				phys = PTE_TO_PHYS(pmap_load(l2));
 				mpte = PHYS_TO_VM_PAGE(phys);
 				mpte->wire_count++;
 			} else {
 				/*
 				 * Pass NULL instead of the PV list lock
 				 * pointer, because we don't intend to sleep.
 				 */
 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
 				if (mpte == NULL)
 					return (mpte);
 			}
 		}
 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
 		l3 = &l3[pmap_l3_index(va)];
 	} else {
 		mpte = NULL;
 		l3 = pmap_l3(kernel_pmap, va);
 	}
 	if (l3 == NULL)
 		panic("pmap_enter_quick_locked: No l3");
 	if (pmap_load(l3) != 0) {
 		/* Already mapped: drop the reference taken above. */
 		if (mpte != NULL) {
 			mpte->wire_count--;
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Enter on the PV list if part of our managed memory.
 	 */
 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
 		if (mpte != NULL) {
 			SLIST_INIT(&free);
 			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
 				pmap_invalidate_page(pmap, va);
 				vm_page_free_pages_toq(&free, false);
 			}
 			mpte = NULL;
 		}
 		return (mpte);
 	}
 
 	/*
 	 * Increment counters
 	 */
 	pmap_resident_count_inc(pmap, 1);
 
 	newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) |
 	    PTE_V | PTE_R;
 	if ((prot & VM_PROT_EXECUTE) != 0)
 		newl3 |= PTE_X;
 	if ((m->oflags & VPO_UNMANAGED) == 0)
 		newl3 |= PTE_SW_MANAGED;
 	if (va < VM_MAX_USER_ADDRESS)
 		newl3 |= PTE_U;
 
 	/*
 	 * Sync the i-cache on all harts before updating the PTE
 	 * if the new PTE is executable.
 	 */
 	if (prot & VM_PROT_EXECUTE)
 		pmap_sync_icache(pmap, va, PAGE_SIZE);
 
 	pmap_store(l3, newl3);
 
 	pmap_invalidate_page(pmap, va);
 	return (mpte);
 }
 
 /*
  * This code maps large physical mmap regions into the
  * processor address space.  Note that some shortcuts
  * are taken, but the code works.
  *
  * NOTE(review): in this implementation the body only asserts that the
  * object is write-locked and is of type OBJT_DEVICE or OBJT_SG; no
  * mappings are created here.
  */
 void
 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
     vm_pindex_t pindex, vm_size_t size)
 {
 
 	VM_OBJECT_ASSERT_WLOCKED(object);
 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
 	    ("pmap_object_init_pt: non-device object"));
 }
 
 /*
  *	Clear the wired attribute from the mappings for the specified range of
  *	addresses in the given pmap.  Every valid mapping within that range
  *	must have the wired attribute set.  In contrast, invalid mappings
  *	cannot have the wired attribute set, so they are ignored.
  *
  *	The wired attribute of the page table entry is not a hardware feature,
  *	so there is no need to invalidate any TLB entries.
  *
  *	A 2MB mapping that lies entirely within the range is unwired in
  *	place; one that is only partly covered is demoted first.
  */
 void
 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
 {
 	vm_offset_t va_next;
 	pd_entry_t *l1, *l2, l2e;
 	pt_entry_t *l3, l3e;
 	bool pv_lists_locked;
 
 	pv_lists_locked = false;
 retry:
 	PMAP_LOCK(pmap);
 	for (; sva < eva; sva = va_next) {
 		l1 = pmap_l1(pmap, sva);
 		if (pmap_load(l1) == 0) {
 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
 			if (va_next < sva)
 				va_next = eva;
 			continue;
 		}
 
 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
 		if (va_next < sva)
 			va_next = eva;
 
 		l2 = pmap_l1_to_l2(l1, sva);
 		if ((l2e = pmap_load(l2)) == 0)
 			continue;
 		if ((l2e & PTE_RWX) != 0) {
 			if (sva + L2_SIZE == va_next && eva >= va_next) {
 				if ((l2e & PTE_SW_WIRED) == 0)
 					panic("pmap_unwire: l2 %#jx is missing "
 					    "PTE_SW_WIRED", (uintmax_t)l2e);
 				pmap_clear_bits(l2, PTE_SW_WIRED);
 
 				/*
 				 * A wired 2MB mapping is accounted as
 				 * L2_SIZE / PAGE_SIZE wired pages when it is
 				 * created by pmap_enter_l2(); give the same
 				 * amount back here.
 				 */
 				pmap->pm_stats.wired_count -=
 				    L2_SIZE / PAGE_SIZE;
 				continue;
 			} else {
 				if (!pv_lists_locked) {
 					pv_lists_locked = true;
 					if (!rw_try_rlock(&pvh_global_lock)) {
 						PMAP_UNLOCK(pmap);
 						rw_rlock(&pvh_global_lock);
 						/* Repeat sva. */
 						goto retry;
 					}
 				}
 				if (!pmap_demote_l2(pmap, l2, sva))
 					panic("pmap_unwire: demotion failed");
 			}
 		}
 
 		if (va_next > eva)
 			va_next = eva;
 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
 		    sva += L3_SIZE) {
 			if ((l3e = pmap_load(l3)) == 0)
 				continue;
 			if ((l3e & PTE_SW_WIRED) == 0)
 				panic("pmap_unwire: l3 %#jx is missing "
 				    "PTE_SW_WIRED", (uintmax_t)l3e);
 
 			/*
 			 * PTE_SW_WIRED must be cleared atomically.  Although
 			 * the pmap lock synchronizes access to PTE_SW_WIRED,
 			 * another processor could be setting PTE_D and/or
 			 * PTE_A concurrently.
 			 */
 			pmap_clear_bits(l3, PTE_SW_WIRED);
 			pmap->pm_stats.wired_count--;
 		}
 	}
 	if (pv_lists_locked)
 		rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 }
 
 /*
  *	Copy the range specified by src_addr/len
  *	from the source map to the range dst_addr/len
  *	in the destination map.
  *
  *	This routine is only advisory and need not do anything.
  *	This implementation intentionally has an empty body.
  */
 
 void
 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
     vm_offset_t src_addr)
 {
 
 }
 
 /*
  *	pmap_zero_page zeros the specified hardware page.  The page is
  *	addressed through the direct map (PHYS_TO_DMAP) and cleared
  *	with pagezero().
  */
 void
 pmap_zero_page(vm_page_t m)
 {
 	vm_paddr_t pa;
 	vm_offset_t dva;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	dva = PHYS_TO_DMAP(pa);
 	pagezero((void *)dva);
 }
 
 /*
  *	pmap_zero_page_area zeros "size" bytes starting "off" bytes into
  *	the specified hardware page, addressing it through the direct map.
  *	A request covering the entire page is served by pagezero();
  *	anything else uses bzero().
  *
  *	off and size may not cover an area beyond a single hardware page.
  */
 void
 pmap_zero_page_area(vm_page_t m, int off, int size)
 {
 	char *base;
 
 	base = (char *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
 	if (off != 0 || size != PAGE_SIZE) {
 		bzero(base + off, size);
 		return;
 	}
 	pagezero((void *)base);
 }
 
 /*
  *	pmap_copy_page copies the contents of the page msrc to the page
  *	mdst.  Both pages are addressed through the direct map and the
  *	copy itself is performed by pagecopy().
  */
 void
 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
 {
 	void *from, *to;
 
 	from = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
 	to = (void *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
 	pagecopy(from, to);
 }
 
 /*
  * NOTE(review): presumably tells machine-independent code that unmapped
  * buffers may be used on this platform; it is set to 1 here.  Confirm
  * against the consumers of this variable.
  */
 int unmapped_buf_allowed = 1;
 
 /*
  * Copies xfersize bytes from the page array ma[], starting at byte offset
  * a_offset, to the page array mb[], starting at byte offset b_offset.  The
  * copy proceeds in chunks that never cross a page boundary on either side.
  * Pages are addressed through the direct map; a page whose physical
  * address is not covered by the direct map causes a panic.
  */
 void
 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
     vm_offset_t b_offset, int xfersize)
 {
 	void *src, *dst;
 	vm_page_t src_pg, dst_pg;
 	vm_paddr_t src_pa, dst_pa;
 	vm_offset_t src_off, dst_off;
 	int chunk;
 
 	for (; xfersize > 0; xfersize -= chunk) {
 		/* Locate the current source page and offset within it. */
 		src_off = a_offset & PAGE_MASK;
 		src_pg = ma[a_offset >> PAGE_SHIFT];
 		src_pa = src_pg->phys_addr;
 
 		/* Likewise for the destination. */
 		dst_off = b_offset & PAGE_MASK;
 		dst_pg = mb[b_offset >> PAGE_SHIFT];
 		dst_pa = dst_pg->phys_addr;
 
 		/* Limit the chunk to what remains in both pages. */
 		chunk = min(xfersize, PAGE_SIZE - src_off);
 		chunk = min(chunk, PAGE_SIZE - dst_off);
 
 		if (__predict_false(!PHYS_IN_DMAP(src_pa)))
 			panic("!DMAP a %lx", src_pa);
 		src = (char *)PHYS_TO_DMAP(src_pa) + src_off;
 
 		if (__predict_false(!PHYS_IN_DMAP(dst_pa)))
 			panic("!DMAP b %lx", dst_pa);
 		dst = (char *)PHYS_TO_DMAP(dst_pa) + dst_off;
 
 		bcopy(src, dst, chunk);
 		a_offset += chunk;
 		b_offset += chunk;
 	}
 }
 
 /*
  * Returns the direct map address of the given page.  No page table is
  * modified here.
  */
 vm_offset_t
 pmap_quick_enter_page(vm_page_t m)
 {
 	vm_paddr_t pa;
 
 	pa = VM_PAGE_TO_PHYS(m);
 	return (PHYS_TO_DMAP(pa));
 }
 
 /*
  * Counterpart of pmap_quick_enter_page().  The body is intentionally empty
  * because pmap_quick_enter_page() only returns a direct map address.
  */
 void
 pmap_quick_remove_page(vm_offset_t addr)
 {
 }
 
 /*
  * Reports whether the given pmap owns one of the first 16 PV entries
  * linked to the page, counting the page's own PV list first and then,
  * for non-fictitious pages, the 2MB PV list that covers it.  The limit
  * of 16 is arbitrary and may change; it is only necessary that true be
  * returned for a small subset of pmaps for proper page aging.
  */
 boolean_t
 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pv_entry_t pv;
 	boolean_t found;
 	int scanned;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_page_exists_quick: page %p is not managed", m));
 	found = FALSE;
 	scanned = 0;
 	rw_rlock(&pvh_global_lock);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 
 	/* 4KB mappings of the page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			goto out;
 		}
 		if (++scanned >= 16)
 			goto out;
 	}
 
 	/* Fictitious pages have no 2MB PV list to examine. */
 	if ((m->flags & PG_FICTITIOUS) != 0)
 		goto out;
 
 	/* 2MB mappings that include the page. */
 	pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 		if (PV_PMAP(pv) == pmap) {
 			found = TRUE;
 			break;
 		}
 		if (++scanned >= 16)
 			break;
 	}
 out:
 	rw_runlock(lock);
 	rw_runlock(&pvh_global_lock);
 	return (found);
 }
 
 /*
  *	pmap_page_wired_mappings:
  *
  *	Return the number of managed mappings to the given physical page
  *	that are wired.
  *
  *	Unmanaged pages always yield 0.  Entries on the page's own pv list
  *	are examined at L3; for non-fictitious pages, entries on the pv list
  *	of the containing superpage are examined at L2.
  */
 int
 pmap_page_wired_mappings(vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pmap_t pmap;
 	pd_entry_t *l2;
 	pt_entry_t *l3;
 	pv_entry_t pv;
 	int count, md_gen, pvh_gen;
 
 	if ((m->oflags & VPO_UNMANAGED) != 0)
 		return (0);
 	rw_rlock(&pvh_global_lock);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	/* Every restart recounts from scratch. */
 	count = 0;
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * The pmap lock could not be taken without blocking.
 			 * Drop the pv list lock, block on the pmap lock and
 			 * retake the pv list lock.  A changed generation
 			 * count means the list was modified in between, so
 			 * the scan starts over.
 			 */
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		l3 = pmap_l3(pmap, pv->pv_va);
 		if ((pmap_load(l3) & PTE_SW_WIRED) != 0)
 			count++;
 		PMAP_UNLOCK(pmap);
 	}
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			if (!PMAP_TRYLOCK(pmap)) {
 				/*
 				 * Same protocol as above, but both the page's
 				 * and the superpage's generation counts must
 				 * be unchanged.
 				 */
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			l2 = pmap_l2(pmap, pv->pv_va);
 			if ((pmap_load(l2) & PTE_SW_WIRED) != 0)
 				count++;
 			PMAP_UNLOCK(pmap);
 		}
 	}
 	rw_runlock(lock);
 	rw_runlock(&pvh_global_lock);
 	return (count);
 }
 
 /*
  * Helper for pmap_remove_pages(): unlink pv from the pv list it lives on
  * and adjust the pmap's resident count.
  *
  * For a small mapping the entry is removed from the page's own list.  For
  * a superpage mapping it is removed from the superpage list, and the page
  * table page returned by pmap_remove_pt_page(), if any, is queued on
  * "free".  In both cases PGA_WRITEABLE is cleared on pages that are left
  * with no pv entries.
  */
 static void
 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv,
     struct spglist *free, bool superpage)
 {
 	struct md_page *pvh;
 	vm_page_t ptpage, subpage;
 
 	if (!superpage) {
 		pmap_resident_count_dec(pmap, 1);
 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 		m->md.pv_gen++;
 		if (!TAILQ_EMPTY(&m->md.pv_list) ||
 		    (m->aflags & PGA_WRITEABLE) == 0)
 			return;
 		/* Last small mapping gone; check the superpage list too. */
 		pvh = pa_to_pvh(m->phys_addr);
 		if (TAILQ_EMPTY(&pvh->pv_list))
 			vm_page_aflag_clear(m, PGA_WRITEABLE);
 		return;
 	}
 
 	pmap_resident_count_dec(pmap, Ln_ENTRIES);
 	pvh = pa_to_pvh(m->phys_addr);
 	TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 	pvh->pv_gen++;
 	if (TAILQ_EMPTY(&pvh->pv_list)) {
 		/* Visit every constituent page of the superpage. */
 		for (subpage = m; subpage < &m[Ln_ENTRIES]; subpage++) {
 			if (TAILQ_EMPTY(&subpage->md.pv_list) &&
 			    (subpage->aflags & PGA_WRITEABLE) != 0)
 				vm_page_aflag_clear(subpage, PGA_WRITEABLE);
 		}
 	}
 
 	ptpage = pmap_remove_pt_page(pmap, pv->pv_va);
 	if (ptpage == NULL)
 		return;
 	KASSERT(ptpage->valid == VM_PAGE_BITS_ALL,
 	    ("pmap_remove_pages: pte page not promoted"));
 	pmap_resident_count_dec(pmap, 1);
 	KASSERT(ptpage->wire_count == Ln_ENTRIES,
 	    ("pmap_remove_pages: pte page wire count error"));
 	ptpage->wire_count = 0;
 	pmap_add_delayed_free_list(ptpage, free, FALSE);
 }
 
 /*
  * Destroy all managed, non-wired mappings in the given user-space
  * pmap.  This pmap cannot be active on any processor besides the
  * caller.
  *
  * This function cannot be applied to the kernel pmap.  Moreover, it
  * is not intended for general use.  It is only to be used during
  * process termination.  Consequently, it can be implemented in ways
  * that make it faster than pmap_remove().  First, it can more quickly
  * destroy mappings by iterating over the pmap's collection of PV
  * entries, rather than searching the page table.  Second, it doesn't
  * have to test and clear the page table entries atomically, because
  * no processor is currently accessing the user address space.  In
  * particular, a page table entry's dirty bit won't change state once
  * this function starts.
  */
 void
 pmap_remove_pages(pmap_t pmap)
 {
 	struct spglist free;
 	pd_entry_t ptepde;
 	pt_entry_t *pte, tpte;
 	vm_page_t m, mt;
 	pv_entry_t pv;
 	struct pv_chunk *pc, *npc;
 	struct rwlock *lock;
 	int64_t bit;
 	uint64_t inuse, bitmask;
 	int allfree, field, freed, idx;
 	bool superpage;
 
 	lock = NULL;
 
 	SLIST_INIT(&free);
 	rw_rlock(&pvh_global_lock);
 	PMAP_LOCK(pmap);
 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
 		/* allfree stays set unless a wired mapping is skipped. */
 		allfree = 1;
 		freed = 0;
 		for (field = 0; field < _NPCM; field++) {
 			/*
 			 * A set bit in pc_map marks a free pv entry (see
 			 * "Mark free" below), so the complement, limited to
 			 * the valid bits, selects the entries in use.
 			 */
 			inuse = ~pc->pc_map[field] & pc_freemask[field];
 			while (inuse != 0) {
 				bit = ffsl(inuse) - 1;
 				bitmask = 1UL << bit;
 				idx = field * 64 + bit;
 				pv = &pc->pc_pventry[idx];
 				inuse &= ~bitmask;
 
 				/*
 				 * Walk down to the leaf entry.  An L2 entry
 				 * with any of PTE_RWX set is handled as a
 				 * superpage mapping; otherwise descend to L3.
 				 * ptepde ends up holding the entry one level
 				 * above the leaf.
 				 */
 				pte = pmap_l1(pmap, pv->pv_va);
 				ptepde = pmap_load(pte);
 				pte = pmap_l1_to_l2(pte, pv->pv_va);
 				tpte = pmap_load(pte);
 				if ((tpte & PTE_RWX) != 0) {
 					superpage = true;
 				} else {
 					ptepde = tpte;
 					pte = pmap_l2_to_l3(pte, pv->pv_va);
 					tpte = pmap_load(pte);
 					superpage = false;
 				}
 
 				/*
 				 * We cannot remove wired pages from a
 				 * process' mapping at this time.
 				 */
 				if (tpte & PTE_SW_WIRED) {
 					allfree = 0;
 					continue;
 				}
 
 				m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte));
 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
 				    m < &vm_page_array[vm_page_array_size],
 				    ("pmap_remove_pages: bad pte %#jx",
 				    (uintmax_t)tpte));
 
 				pmap_clear(pte);
 
 				/*
 				 * Update the vm_page_t clean/reference bits.
 				 */
 				if ((tpte & (PTE_D | PTE_W)) ==
 				    (PTE_D | PTE_W)) {
 					if (superpage)
 						for (mt = m;
 						    mt < &m[Ln_ENTRIES]; mt++)
 							vm_page_dirty(mt);
 					else
 						vm_page_dirty(m);
 				}
 
 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 
 				/* Mark free */
 				pc->pc_map[field] |= bitmask;
 
 				pmap_remove_pages_pv(pmap, m, pv, &free,
 				    superpage);
 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
 				freed++;
 			}
 		}
 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
 		/* Release the chunk only if no wired entry was left in it. */
 		if (allfree) {
 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
 			free_pv_chunk(pc);
 		}
 	}
 	if (lock != NULL)
 		rw_wunlock(lock);
 	pmap_invalidate_all(pmap);
 	rw_runlock(&pvh_global_lock);
 	PMAP_UNLOCK(pmap);
 	/* Pages queued on "free" are released after the locks are dropped. */
 	vm_page_free_pages_toq(&free, false);
 }
 
 /*
  * Return true if some mapping of page m has every requested bit set in
  * its page table entry: PTE_A when "accessed" is set and PTE_D when
  * "modified" is set.  Entries on the page's own pv list are tested at L3;
  * for non-fictitious pages, entries on the superpage pv list are tested
  * at L2.  The scan stops at the first matching mapping.
  */
 static bool
 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pd_entry_t *l2;
 	pt_entry_t *l3, mask;
 	pv_entry_t pv;
 	pmap_t pmap;
 	int md_gen, pvh_gen;
 	bool rv;
 
 	/* Build the set of PTE bits that must all be present. */
 	mask = 0;
 	if (modified)
 		mask |= PTE_D;
 	if (accessed)
 		mask |= PTE_A;
 
 	rv = FALSE;
 	rw_rlock(&pvh_global_lock);
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(lock);
 restart:
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Drop the pv list lock before blocking on the pmap
 			 * lock, then retake it.  If the generation count
 			 * moved, the list changed and the scan restarts.
 			 */
 			md_gen = m->md.pv_gen;
 			rw_runlock(lock);
 			PMAP_LOCK(pmap);
 			rw_rlock(lock);
 			if (md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		l3 = pmap_l3(pmap, pv->pv_va);
 		rv = (pmap_load(l3) & mask) == mask;
 		PMAP_UNLOCK(pmap);
 		if (rv)
 			goto out;
 	}
 	if ((m->flags & PG_FICTITIOUS) == 0) {
 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
 			pmap = PV_PMAP(pv);
 			if (!PMAP_TRYLOCK(pmap)) {
 				/* As above, watching both generations. */
 				md_gen = m->md.pv_gen;
 				pvh_gen = pvh->pv_gen;
 				rw_runlock(lock);
 				PMAP_LOCK(pmap);
 				rw_rlock(lock);
 				if (md_gen != m->md.pv_gen ||
 				    pvh_gen != pvh->pv_gen) {
 					PMAP_UNLOCK(pmap);
 					goto restart;
 				}
 			}
 			l2 = pmap_l2(pmap, pv->pv_va);
 			rv = (pmap_load(l2) & mask) == mask;
 			PMAP_UNLOCK(pmap);
 			if (rv)
 				goto out;
 		}
 	}
 out:
 	rw_runlock(lock);
 	rw_runlock(&pvh_global_lock);
 	return (rv);
 }
 
 /*
  *	pmap_is_modified:
  *
  *	Report whether any mapping of the given managed page has its
  *	dirty bit set.  The page's object must be write locked.
  */
 boolean_t
 pmap_is_modified(vm_page_t m)
 {
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_modified: page %p is not managed", m));
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 
 	/*
 	 * With the object locked and the page not exclusive busied,
 	 * PGA_WRITEABLE cannot be set concurrently.  A page without
 	 * PGA_WRITEABLE has no writable mapping and therefore no dirty
 	 * PTE, so the pv lists only need scanning in the other cases.
 	 */
 	if (vm_page_xbusied(m) || (m->aflags & PGA_WRITEABLE) != 0)
 		return (pmap_page_test_mappings(m, FALSE, TRUE));
 	return (FALSE);
 }
 
 /*
  *	pmap_is_prefaultable:
  *
  *	Return whether or not the specified virtual address is eligible
  *	for prefault.  An address is eligible only when pmap_l3() finds an
  *	L3 entry for it (returns non-NULL) and that entry is still empty;
  *	an address that already has an L3 entry installed has nothing left
  *	to prefault.
  *
  *	Returns TRUE if the address is eligible, FALSE otherwise.
  */
 boolean_t
 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
 {
 	pt_entry_t *l3;
 	boolean_t rv;
 
 	rv = FALSE;
 	PMAP_LOCK(pmap);
 	l3 = pmap_l3(pmap, addr);
 	/* Eligible only if the slot exists and holds no mapping yet. */
 	if (l3 != NULL && pmap_load(l3) == 0)
 		rv = TRUE;
 	PMAP_UNLOCK(pmap);
 	return (rv);
 }
 
 /*
  *	pmap_is_referenced:
  *
  *	Report whether any mapping of the given managed page has its
  *	accessed bit set.
  */
 boolean_t
 pmap_is_referenced(vm_page_t m)
 {
 	boolean_t referenced;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_is_referenced: page %p is not managed", m));
 	referenced = pmap_page_test_mappings(m, TRUE, FALSE);
 	return (referenced);
 }
 
 /*
  * Clear the write and modified bits in each of the given page's mappings.
  *
  * The page must be managed and its object write locked.  The work is done
  * in two passes: writable superpage (L2) mappings are demoted first, then
  * every L3 mapping on the page's own pv list is write protected.  A page
  * found dirty through a writable mapping is marked with vm_page_dirty().
  * Finally PGA_WRITEABLE is cleared on the page.
  */
 void
 pmap_remove_write(vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pmap_t pmap;
 	pd_entry_t *l2;
 	pt_entry_t *l3, oldl3, newl3;
 	pv_entry_t next_pv, pv;
 	vm_offset_t va;
 	int md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_remove_write: page %p is not managed", m));
 
 	/*
 	 * If the page is not exclusive busied, then PGA_WRITEABLE cannot be
 	 * set by another thread while the object is locked.  Thus,
 	 * if PGA_WRITEABLE is clear, no page table entries need updating.
 	 */
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	if (!vm_page_xbusied(m) && (m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	/* Fictitious pages use the dummy list in place of a superpage list. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	rw_rlock(&pvh_global_lock);
 retry_pv_loop:
 	rw_wlock(lock);
 	/* Pass 1: demote every writable superpage mapping of the page. */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Drop the pv list lock before blocking on the pmap
 			 * lock.  If the list changed meanwhile, start over
 			 * from retry_pv_loop, which retakes the lock.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		va = pv->pv_va;
 		l2 = pmap_l2(pmap, va);
 		if ((pmap_load(l2) & PTE_W) != 0)
 			(void)pmap_demote_l2_locked(pmap, l2, va, &lock);
 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
 		    ("inconsistent pv lock %p %p for page %p",
 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: write protect every L3 mapping of the page. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				rw_wunlock(lock);
 				goto retry_pv_loop;
 			}
 		}
 		l3 = pmap_l3(pmap, pv->pv_va);
 		oldl3 = pmap_load(l3);
 retry:
 		if ((oldl3 & PTE_W) != 0) {
 			/*
 			 * Swap in the entry without PTE_D and PTE_W.  On
 			 * failure the test above is repeated with oldl3,
 			 * which atomic_fcmpset_long() presumably refreshed
 			 * with the current entry.
 			 */
 			newl3 = oldl3 & ~(PTE_D | PTE_W);
 			if (!atomic_fcmpset_long(l3, &oldl3, newl3))
 				goto retry;
 			if ((oldl3 & PTE_D) != 0)
 				vm_page_dirty(m);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 	vm_page_aflag_clear(m, PGA_WRITEABLE);
 	rw_runlock(&pvh_global_lock);
 }
 
 /*
  *	pmap_ts_referenced:
  *
  *	Return a count of reference bits for a page, clearing those bits.
  *	It is not necessary for every reference bit to be cleared, but it
  *	is necessary that 0 only be returned when there are truly no
  *	reference bits set.
  *
  *	As an optimization, update the page's dirty field if a modified bit is
  *	found while counting reference bits.  This opportunistic update can be
  *	performed at low cost and can eliminate the need for some future calls
  *	to pmap_is_modified().  However, since this function stops after
  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
  *	dirty pages.  Those dirty pages will only be detected by a future call
  *	to pmap_is_modified().
  *
  *	The returned value is the sum of the reference bits that were cleared
  *	and those that were found set but deliberately left in place.
  */
 int
 pmap_ts_referenced(vm_page_t m)
 {
 	struct spglist free;
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pv_entry_t pv, pvf;
 	pmap_t pmap;
 	pd_entry_t *l2, l2e;
 	pt_entry_t *l3, l3e;
 	vm_paddr_t pa;
 	vm_offset_t va;
 	int cleared, md_gen, not_cleared, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_ts_referenced: page %p is not managed", m));
 	/*
 	 * NOTE(review): nothing in this function adds pages to "free"; the
 	 * list is still empty when it reaches vm_page_free_pages_toq().
 	 */
 	SLIST_INIT(&free);
 	cleared = 0;
 	pa = VM_PAGE_TO_PHYS(m);
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
 
 	lock = PHYS_TO_PV_LIST_LOCK(pa);
 	rw_rlock(&pvh_global_lock);
 	rw_wlock(lock);
 retry:
 	/* "cleared" survives a retry; "not_cleared" is recounted. */
 	not_cleared = 0;
 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
 		goto small_mappings;
 	pv = pvf;
 	do {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Drop the pv list lock before blocking on the pmap
 			 * lock; retry if the list changed in the meantime.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		va = pv->pv_va;
 		l2 = pmap_l2(pmap, va);
 		l2e = pmap_load(l2);
 		if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
 			/*
 			 * Although l2e is mapping a 2MB page, because
 			 * this function is called at a 4KB page granularity,
 			 * we only update the 4KB page under test.
 			 */
 			vm_page_dirty(m);
 		}
 		if ((l2e & PTE_A) != 0) {
 			/*
 			 * Since this reference bit is shared by 512 4KB
 			 * pages, it should not be cleared every time it is
 			 * tested.  Apply a simple "hash" function on the
 			 * physical page number, the virtual superpage number,
 			 * and the pmap address to select one 4KB page out of
 			 * the 512 on which testing the reference bit will
 			 * result in clearing that reference bit.  This
 			 * function is designed to avoid the selection of the
 			 * same 4KB page for every 2MB page mapping.
 			 *
 			 * On demotion, a mapping that hasn't been referenced
 			 * is simply destroyed.  To avoid the possibility of a
 			 * subsequent page fault on a demoted wired mapping,
 			 * always leave its reference bit set.  Moreover,
 			 * since the superpage is wired, the current state of
 			 * its reference bit won't affect page replacement.
 			 */
 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
 			    (l2e & PTE_SW_WIRED) == 0) {
 				pmap_clear_bits(l2, PTE_A);
 				pmap_invalidate_page(pmap, va);
 				cleared++;
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
 			pvh->pv_gen++;
 		}
 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
 			goto out;
 		/* Stop once the rotation brings the first entry back. */
 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
 small_mappings:
 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
 		goto out;
 	pv = pvf;
 	do {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			pvh_gen = pvh->pv_gen;
 			md_gen = m->md.pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto retry;
 			}
 		}
 		l2 = pmap_l2(pmap, pv->pv_va);
 
 		KASSERT((pmap_load(l2) & PTE_RX) == 0,
 		    ("pmap_ts_referenced: found an invalid l2 table"));
 
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		l3e = pmap_load(l3);
 		if ((l3e & PTE_D) != 0)
 			vm_page_dirty(m);
 		if ((l3e & PTE_A) != 0) {
 			if ((l3e & PTE_SW_WIRED) == 0) {
 				/*
 				 * Wired pages cannot be paged out so
 				 * doing accessed bit emulation for
 				 * them is wasted effort. We do the
 				 * hard work for unwired pages only.
 				 */
 				pmap_clear_bits(l3, PTE_A);
 				pmap_invalidate_page(pmap, pv->pv_va);
 				cleared++;
 			} else
 				not_cleared++;
 		}
 		PMAP_UNLOCK(pmap);
 		/* Rotate the PV list if it has more than one entry. */
 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
 			m->md.pv_gen++;
 		}
 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
 	    not_cleared < PMAP_TS_REFERENCED_MAX);
 out:
 	rw_wunlock(lock);
 	rw_runlock(&pvh_global_lock);
 	vm_page_free_pages_toq(&free, false);
 	return (cleared + not_cleared);
 }
 
 /*
  *	Apply the given advice to the specified range of addresses within the
  *	given pmap.  Depending on the advice, clear the referenced and/or
  *	modified flags in each mapping and set the mapped page's dirty field.
  *
  *	NOTE: the paragraph above states the intended contract only.  The
  *	body below is empty, so every argument is ignored and no mapping or
  *	page state is changed.
  */
 void
 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
 {
 }
 
 /*
  *	Clear the modify bits on the specified physical page.
  *
  *	The page must be managed, its object write locked and the page not
  *	exclusive busied.  Writable superpage mappings are demoted first;
  *	then each L3 mapping that is both dirty and writable has PTE_D and
  *	PTE_W cleared.
  */
 void
 pmap_clear_modify(vm_page_t m)
 {
 	struct md_page *pvh;
 	struct rwlock *lock;
 	pmap_t pmap;
 	pv_entry_t next_pv, pv;
 	pd_entry_t *l2, oldl2;
-	pt_entry_t *l3, oldl3;
+	pt_entry_t *l3;
 	vm_offset_t va;
 	int md_gen, pvh_gen;
 
 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
 	    ("pmap_clear_modify: page %p is not managed", m));
 	VM_OBJECT_ASSERT_WLOCKED(m->object);
 	KASSERT(!vm_page_xbusied(m),
 	    ("pmap_clear_modify: page %p is exclusive busied", m));
 
 	/*
 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
 	 * If the object containing the page is locked and the page is not
 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
 	 */
 	if ((m->aflags & PGA_WRITEABLE) == 0)
 		return;
 	/* Fictitious pages use the dummy list in place of a superpage list. */
 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
 	rw_rlock(&pvh_global_lock);
 	rw_wlock(lock);
 restart:
 	/* Pass 1: superpage mappings of the page. */
 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			/*
 			 * Drop the pv list lock before blocking on the pmap
 			 * lock; restart if the list changed in the meantime.
 			 */
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		va = pv->pv_va;
 		l2 = pmap_l2(pmap, va);
 		oldl2 = pmap_load(l2);
-		if ((oldl2 & PTE_W) != 0) {
-			if (pmap_demote_l2_locked(pmap, l2, va, &lock)) {
-				if ((oldl2 & PTE_SW_WIRED) == 0) {
-					/*
-					 * Write protect the mapping to a
-					 * single page so that a subsequent
-					 * write access may repromote.
-					 */
-					va += VM_PAGE_TO_PHYS(m) -
-					    PTE_TO_PHYS(oldl2);
-					l3 = pmap_l2_to_l3(l2, va);
-					oldl3 = pmap_load(l3);
-					if ((oldl3 & PTE_V) != 0) {
-						while (!atomic_fcmpset_long(l3,
-						    &oldl3, oldl3 & ~(PTE_D |
-						    PTE_W)))
-							cpu_spinwait();
-						vm_page_dirty(m);
-						pmap_invalidate_page(pmap, va);
-					}
-				}
-			}
+		/* If oldl2 has PTE_W set, then it also has PTE_D set. */
+		if ((oldl2 & PTE_W) != 0 &&
+		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
+		    (oldl2 & PTE_SW_WIRED) == 0) {
+			/*
+			 * Write protect the mapping to a single page so that
+			 * a subsequent write access may repromote.
+			 */
+			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
+			l3 = pmap_l2_to_l3(l2, va);
+			pmap_clear_bits(l3, PTE_D | PTE_W);
+			vm_page_dirty(m);
+			pmap_invalidate_page(pmap, va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	/* Pass 2: L3 mappings on the page's own pv list. */
 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
 		pmap = PV_PMAP(pv);
 		if (!PMAP_TRYLOCK(pmap)) {
 			md_gen = m->md.pv_gen;
 			pvh_gen = pvh->pv_gen;
 			rw_wunlock(lock);
 			PMAP_LOCK(pmap);
 			rw_wlock(lock);
 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
 				PMAP_UNLOCK(pmap);
 				goto restart;
 			}
 		}
 		l2 = pmap_l2(pmap, pv->pv_va);
 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
 		    ("pmap_clear_modify: found a 2mpage in page %p's pv list",
 		    m));
 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
 		/* Only entries that are both dirty and writable are changed. */
 		if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
 			pmap_clear_bits(l3, PTE_D | PTE_W);
 			pmap_invalidate_page(pmap, pv->pv_va);
 		}
 		PMAP_UNLOCK(pmap);
 	}
 	rw_wunlock(lock);
 	rw_runlock(&pvh_global_lock);
 }
 
 /*
  * Return the direct-map address corresponding to physical address pa.
  * The size argument is not used.
  */
 void *
 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
 {
 
 	return ((void *)PHYS_TO_DMAP(pa));
 }
 
 /*
  * Presumably the counterpart of pmap_mapbios().  That function only
  * returns a direct-map address, so there is nothing to release: the body
  * is empty and both arguments are ignored.
  */
 void
 pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
 {
 }
 
 /*
  * Sets the memory attribute for the specified page.
  *
  * Only the value stored in the page's machine-dependent data
  * (md.pv_memattr) is updated; no page table entry is touched here.
  */
 void
 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
 {
 
 	m->md.pv_memattr = ma;
 }
 
 /*
  * perform the pmap work for mincore
  *
  * Look up addr in pmap and return a mask of MINCORE_* flags describing
  * its mapping; 0 is returned when there is no valid mapping.  For a
  * managed mapping that is not reported as both modified and referenced,
  * the page lock for its physical address is acquired through
  * vm_page_pa_tryrelock() with *locked_pa; otherwise *locked_pa is
  * conditionally unlocked.
  */
 int
 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *locked_pa)
 {
 	pt_entry_t *l2, *l3, tpte;
 	vm_paddr_t pa;
 	int val;
 	bool managed;
 
 	PMAP_LOCK(pmap);
 retry:
 	managed = false;
 	val = 0;
 
 	l2 = pmap_l2(pmap, addr);
 	if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
 		if ((tpte & PTE_RWX) != 0) {
 			/* The L2 entry itself maps memory: a superpage. */
 			pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
 			val = MINCORE_INCORE | MINCORE_SUPER;
 		} else {
 			l3 = pmap_l2_to_l3(l2, addr);
 			tpte = pmap_load(l3);
 			if ((tpte & PTE_V) == 0)
 				goto done;
 			pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
 			val = MINCORE_INCORE;
 		}
 
 		if ((tpte & PTE_D) != 0)
 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
 		if ((tpte & PTE_A) != 0)
 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
 		managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
 	}
 
 done:
 	/* pa is only read when managed is true, i.e. after it was set. */
 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
 		/* Ensure that "PHYS_TO_VM_PAGE(pa)->object" doesn't change. */
 		if (vm_page_pa_tryrelock(pmap, pa, locked_pa))
 			goto retry;
 	} else
 		PA_UNLOCK_COND(*locked_pa);
 	PMAP_UNLOCK(pmap);
 	return (val);
 }
 
 /*
  * Make the pmap of td's process the current pmap on this hart.  Nothing is
  * done when it already is the per-CPU curpmap.  Otherwise the new pmap's
  * satp value is loaded, this hart is added to the new pmap's active set
  * and removed from the old one, and curpmap is updated.
  */
 void
 pmap_activate_sw(struct thread *td)
 {
 	pmap_t oldpmap, pmap;
 	u_int hart;
 
 	oldpmap = PCPU_GET(curpmap);
 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
 	if (pmap == oldpmap)
 		return;
 	load_satp(pmap->pm_satp);
 
 	hart = PCPU_GET(hart);
 #ifdef SMP
 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
 	CPU_CLR_ATOMIC(hart, &oldpmap->pm_active);
 #else
 	CPU_SET(hart, &pmap->pm_active);
 	CPU_CLR(hart, &oldpmap->pm_active);
 #endif
 	PCPU_SET(curpmap, pmap);
 
 	/*
 	 * NOTE(review): presumably discards translations cached from the
 	 * previous address space -- confirm against sfence_vma().
 	 */
 	sfence_vma();
 }
 
 /*
  * Activate the pmap of td's process by running pmap_activate_sw() inside
  * a critical section.
  */
 void
 pmap_activate(struct thread *td)
 {
 
 	critical_enter();
 	pmap_activate_sw(td);
 	critical_exit();
 }
 
 /*
  * Record the given pmap as active on the calling hart and install it as
  * the per-CPU current pmap.  Unlike pmap_activate_sw(), no previous pmap
  * is deactivated and satp is not loaded here.
  */
 void
 pmap_activate_boot(pmap_t pmap)
 {
 	u_int self;
 
 	self = PCPU_GET(hart);
 #ifdef SMP
 	CPU_SET_ATOMIC(self, &pmap->pm_active);
 #else
 	CPU_SET(self, &pmap->pm_active);
 #endif
 	PCPU_SET(curpmap, pmap);
 }
 
 /*
  * Make earlier stores to instruction memory visible to the other harts.
  * The pmap, va and sz arguments are not used: the request always targets
  * every hart except the caller.
  */
 void
 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
 {
 	cpuset_t others;
 
 	/*
 	 * The RISC-V User-Level ISA V2.2 states:
 	 *
 	 * "To make a store to instruction memory visible to all
 	 * RISC-V harts, the writing hart has to execute a data FENCE
 	 * before requesting that all remote RISC-V harts execute a
 	 * FENCE.I."
 	 */
 	sched_pin();
 
 	/* Target set: all harts minus the one running this code. */
 	others = all_harts;
 	CPU_CLR(PCPU_GET(hart), &others);
 
 	fence();
 	if (!CPU_EMPTY(&others) && smp_started)
 		sbi_remote_fence_i(others.__bits);
 
 	sched_unpin();
 }
 
 /*
  *	Possibly move *addr upward so that its offset within an L2-sized
  *	region equals that of the (colour-adjusted) object offset, which
  *	may allow more of the mapping to use superpages.  *addr is left
  *	alone when the mapping is smaller than L2_SIZE, when it would not
  *	contain a full superpage after alignment, or when the offsets
  *	already agree.
  */
 void
 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
     vm_offset_t *addr, vm_size_t size)
 {
 	vm_offset_t wanted, current;
 
 	if (size < L2_SIZE)
 		return;
 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
 		offset += ptoa(object->pg_color);
 	wanted = offset & L2_OFFSET;
 
 	/* Too small to hold a whole superpage once aligned. */
 	if (size - ((L2_SIZE - wanted) & L2_OFFSET) < L2_SIZE)
 		return;
 
 	current = *addr & L2_OFFSET;
 	if (current == wanted)
 		return;
 	if (current < wanted) {
 		/* The target offset lies ahead in the same L2 region. */
 		*addr = (*addr & ~L2_OFFSET) + wanted;
 	} else {
 		/* Otherwise use the target offset in the next region. */
 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + wanted;
 	}
 }
 
 /**
  * Obtain kernel virtual addresses for a set of physical pages.  Pages
  * covered by the DMAP get their DMAP address.  For a page beyond the DMAP,
  * KVA is allocated, but the transient mapping itself is not implemented
  * yet and the function panics.
  *
  * \param page        The pages the caller wishes to obtain the virtual
  *                    address on the kernel memory map.
  * \param vaddr       On return contains the kernel virtual memory address
  *                    of the pages passed in the page parameter.
  * \param count       Number of pages passed in.
  * \param can_fault   TRUE if the thread using the mapped pages can take
  *                    page faults, FALSE otherwise.
  *
  * \returns TRUE if the caller must call pmap_unmap_io_transient when
  *          finished or FALSE otherwise.
  *
  */
 boolean_t
 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	vm_paddr_t pa;
 	boolean_t transient;
 	int err, idx;
 
 	/*
 	 * First pass: hand out DMAP addresses and reserve KVA for the rest.
 	 * This is kept apart from the second pass so that vmem_alloc() is
 	 * never called while pinned.
 	 */
 	transient = FALSE;
 	for (idx = 0; idx < count; idx++) {
 		pa = VM_PAGE_TO_PHYS(page[idx]);
 		if (__predict_false(pa >= DMAP_MAX_PHYSADDR)) {
 			err = vmem_alloc(kernel_arena, PAGE_SIZE,
 			    M_BESTFIT | M_WAITOK, &vaddr[idx]);
 			KASSERT(err == 0, ("vmem_alloc failed: %d", err));
 			transient = TRUE;
 			continue;
 		}
 		vaddr[idx] = PHYS_TO_DMAP(pa);
 	}
 
 	/* Everything was covered by the DMAP: nothing more to do. */
 	if (!transient)
 		return (FALSE);
 
 	if (!can_fault)
 		sched_pin();
 
 	/* Second pass: pages outside the DMAP cannot be mapped yet. */
 	for (idx = 0; idx < count; idx++) {
 		if (VM_PAGE_TO_PHYS(page[idx]) >= DMAP_MAX_PHYSADDR)
 			panic(
 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
 	}
 
 	return (transient);
 }
 
 /*
  * Undo pmap_map_io_transient(): unpin the thread when can_fault is FALSE.
  * Removing a transient mapping is not implemented, so a page beyond the
  * DMAP causes a panic.  The vaddr argument is not used.
  */
 void
 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
     boolean_t can_fault)
 {
 	int idx;
 
 	if (!can_fault)
 		sched_unpin();
 	for (idx = 0; idx < count; idx++) {
 		if (VM_PAGE_TO_PHYS(page[idx]) >= DMAP_MAX_PHYSADDR)
 			panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
 	}
 }
 
 /*
  * Report whether mode lies in the closed range
  * [VM_MEMATTR_DEVICE, VM_MEMATTR_WRITE_BACK].  The pmap is not consulted.
  */
 boolean_t
 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
 {
 
 	if (mode < VM_MEMATTR_DEVICE)
 		return (FALSE);
 	return (mode <= VM_MEMATTR_WRITE_BACK);
 }
 
 /*
  * Walk the page tables of pmap for va and report the entry found at each
  * level through *l1, *l2 and *l3.
  *
  * Returns false as soon as an entry is missing or invalid; output
  * pointers for levels not reached are then left unwritten.  Returns true
  * otherwise.  When the walk ends early at an entry with a read or execute
  * bit set, the pointers for the lower levels are set to NULL.
  */
 bool
 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2,
     pt_entry_t **l3)
 {
 	pd_entry_t *l1e, *l2e;
 
 	/* Level 1. */
 	l1e = pmap_l1(pmap, va);
 	*l1 = l1e;
 	if (l1e == NULL)
 		return (false);
 	if ((pmap_load(l1e) & PTE_V) == 0)
 		return (false);
 	if ((pmap_load(l1e) & PTE_RX) != 0) {
 		/* The walk ends at L1; there are no lower levels. */
 		*l2 = NULL;
 		*l3 = NULL;
 		return (true);
 	}
 
 	/* Level 2. */
 	l2e = pmap_l1_to_l2(l1e, va);
 	*l2 = l2e;
 	if (l2e == NULL)
 		return (false);
 	if ((pmap_load(l2e) & PTE_V) == 0)
 		return (false);
 	if ((pmap_load(l2e) & PTE_RX) != 0) {
 		/* The walk ends at L2; there is no L3 entry. */
 		*l3 = NULL;
 		return (true);
 	}
 
 	/* Level 3. */
 	*l3 = pmap_l2_to_l3(l2e, va);
 	return (true);
 }
Index: projects/nfsv42/sys/sys/ata.h
===================================================================
--- projects/nfsv42/sys/sys/ata.h	(revision 350367)
+++ projects/nfsv42/sys/sys/ata.h	(revision 350368)
@@ -1,1053 +1,1056 @@
 /*-
  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
  *
  * Copyright (c) 2000 - 2008 Søren Schmidt <sos@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer,
  *    without modification, immediately at the beginning of the file.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
  * $FreeBSD$
  */
 
 #ifndef _SYS_ATA_H_
 #define _SYS_ATA_H_
 
 #include <sys/ioccom.h>
 
 /*
  * ATA/ATAPI device parameters.
  *
  * The numeric tag in front of a member is its offset in 16-bit words:
  * for example serial[20] is tagged 010 and the member after its 20 bytes
  * is tagged 020.  The last member, integrity, is tagged as word 255.
  * Members without a tag follow directly after the previous member.
  * The structure is marked __packed.
  *
  * The #define groups placed after a member presumably name the bits and
  * values of that member; NOTE(review): confirm each against the ATA
  * specification before relying on it.
  */
 struct ata_params {
 /*000*/ u_int16_t       config;         /* configuration info */
 #define ATA_PROTO_MASK                  0x8003
 #define ATA_PROTO_ATAPI                 0x8000
 #define ATA_PROTO_ATAPI_12              0x8000
 #define ATA_PROTO_ATAPI_16              0x8001
 #define ATA_PROTO_CFA                   0x848a
 #define ATA_ATAPI_TYPE_MASK             0x1f00
 #define ATA_ATAPI_TYPE_DIRECT           0x0000  /* disk/floppy */
 #define ATA_ATAPI_TYPE_TAPE             0x0100  /* streaming tape */
 #define ATA_ATAPI_TYPE_CDROM            0x0500  /* CD-ROM device */
 #define ATA_ATAPI_TYPE_OPTICAL          0x0700  /* optical disk */
 #define ATA_DRQ_MASK                    0x0060
 #define ATA_DRQ_SLOW                    0x0000  /* cpu 3 ms delay */
 #define ATA_DRQ_INTR                    0x0020  /* interrupt 10 ms delay */
 #define ATA_DRQ_FAST                    0x0040  /* accel 50 us delay */
 #define ATA_RESP_INCOMPLETE             0x0004
 
 /*001*/ u_int16_t       cylinders;              /* # of cylinders */
 /*002*/ u_int16_t       specconf;		/* specific configuration */
 /*003*/ u_int16_t       heads;                  /* # heads */
 	u_int16_t       obsolete4;
 	u_int16_t       obsolete5;
 /*006*/ u_int16_t       sectors;                /* # sectors/track */
 /*007*/ u_int16_t       vendor7[3];
 /*010*/ u_int8_t        serial[20];             /* serial number */
 /*020*/ u_int16_t       retired20;
 	u_int16_t       retired21;
 	u_int16_t       obsolete22;
 /*023*/ u_int8_t        revision[8];            /* firmware revision */
 /*027*/ u_int8_t        model[40];              /* model name */
 /*047*/ u_int16_t       sectors_intr;           /* sectors per interrupt */
 /*048*/ u_int16_t       tcg;                    /* Trusted Computing Group */
 #define ATA_SUPPORT_TCG                 0x0001
 /*049*/ u_int16_t       capabilities1;
 #define ATA_SUPPORT_DMA                 0x0100
 #define ATA_SUPPORT_LBA                 0x0200
 #define ATA_SUPPORT_IORDYDIS            0x0400
 #define ATA_SUPPORT_IORDY               0x0800
 #define ATA_SUPPORT_OVERLAP             0x4000
 
 /*050*/ u_int16_t       capabilities2;
 /*051*/ u_int16_t       retired_piomode;        /* PIO modes 0-2 */
 #define ATA_RETIRED_PIO_MASK            0x0300
 
 /*052*/ u_int16_t       retired_dmamode;        /* DMA modes */
 #define ATA_RETIRED_DMA_MASK            0x0003
 
 /*053*/ u_int16_t       atavalid;               /* fields valid */
 #define ATA_FLAG_54_58                  0x0001  /* words 54-58 valid */
 #define ATA_FLAG_64_70                  0x0002  /* words 64-70 valid */
 #define ATA_FLAG_88                     0x0004  /* word 88 valid */
 
 /*054*/ u_int16_t       current_cylinders;
 /*055*/ u_int16_t       current_heads;
 /*056*/ u_int16_t       current_sectors;
 /*057*/ u_int16_t       current_size_1;
 /*058*/ u_int16_t       current_size_2;
 /*059*/ u_int16_t       multi;
 #define ATA_SUPPORT_BLOCK_ERASE_EXT     0x8000
 #define ATA_SUPPORT_OVERWRITE_EXT       0x4000
 #define ATA_SUPPORT_CRYPTO_SCRAMBLE_EXT 0x2000
 #define ATA_SUPPORT_SANITIZE            0x1000
+#define	ATA_SUPPORT_SANITIZE_ALLOWED	0x0800
+#define	ATA_SUPPORT_ANTIFREEZE_LOCK_EXT	0x0400
 #define ATA_MULTI_VALID                 0x0100
 
 /*060*/ u_int16_t       lba_size_1;
 	u_int16_t       lba_size_2;
 	u_int16_t       obsolete62;
 /*063*/ u_int16_t       mwdmamodes;             /* multiword DMA modes */
 /*064*/ u_int16_t       apiomodes;              /* advanced PIO modes */
 
 /*065*/ u_int16_t       mwdmamin;               /* min. M/W DMA time/word ns */
 /*066*/ u_int16_t       mwdmarec;               /* rec. M/W DMA time ns */
 /*067*/ u_int16_t       pioblind;               /* min. PIO cycle w/o flow */
 /*068*/ u_int16_t       pioiordy;               /* min. PIO cycle IORDY flow */
 /*069*/ u_int16_t       support3;
 #define ATA_SUPPORT_RZAT                0x0020
 #define ATA_SUPPORT_DRAT                0x4000
 #define ATA_ENCRYPTS_ALL_USER_DATA      0x0010  /* Self-encrypting drive */
 #define	ATA_SUPPORT_ZONE_MASK		0x0003
 #define	ATA_SUPPORT_ZONE_NR		0x0000
 #define	ATA_SUPPORT_ZONE_HOST_AWARE	0x0001
 #define	ATA_SUPPORT_ZONE_DEV_MANAGED	0x0002
 	u_int16_t       reserved70;
 /*071*/ u_int16_t       rlsovlap;               /* rel time (us) for overlap */
 /*072*/ u_int16_t       rlsservice;             /* rel time (us) for service */
 	u_int16_t       reserved73;
 	u_int16_t       reserved74;
 /*075*/ u_int16_t       queue;
 #define ATA_QUEUE_LEN(x)                ((x) & 0x001f)
 
 /*76*/  u_int16_t       satacapabilities;
 #define ATA_SATA_GEN1                   0x0002
 #define ATA_SATA_GEN2                   0x0004
 #define ATA_SATA_GEN3                   0x0008
 #define ATA_SUPPORT_NCQ                 0x0100
 #define ATA_SUPPORT_IFPWRMNGTRCV        0x0200
 #define ATA_SUPPORT_PHYEVENTCNT         0x0400
 #define ATA_SUPPORT_NCQ_UNLOAD          0x0800
 #define ATA_SUPPORT_NCQ_PRIO            0x1000
 #define ATA_SUPPORT_HAPST               0x2000
 #define ATA_SUPPORT_DAPST               0x4000
 #define ATA_SUPPORT_READLOGDMAEXT       0x8000
 
 /*77*/  u_int16_t       satacapabilities2;
 #define ATA_SATA_CURR_GEN_MASK          0x0006
 #define ATA_SUPPORT_NCQ_STREAM          0x0010
 #define ATA_SUPPORT_NCQ_QMANAGEMENT     0x0020
 #define ATA_SUPPORT_RCVSND_FPDMA_QUEUED 0x0040
 /*78*/  u_int16_t       satasupport;
 #define ATA_SUPPORT_NONZERO             0x0002
 #define ATA_SUPPORT_AUTOACTIVATE        0x0004
 #define ATA_SUPPORT_IFPWRMNGT           0x0008
 #define ATA_SUPPORT_INORDERDATA         0x0010
 #define ATA_SUPPORT_ASYNCNOTIF          0x0020
 #define ATA_SUPPORT_SOFTSETPRESERVE     0x0040
 /*79*/  u_int16_t       sataenabled;
 #define ATA_ENABLED_DAPST               0x0080
 
 /*080*/ u_int16_t       version_major;
 /*081*/ u_int16_t       version_minor;
 
 	/*
 	 * The same three-word layout is declared twice (see the two
 	 * declarators after the closing brace).  In each "a/b" tag the
 	 * first number is the word offset of the member inside "support"
 	 * and the second its offset inside "enabled".
 	 */
 	struct {
 /*082/085*/ u_int16_t   command1;
 #define ATA_SUPPORT_SMART               0x0001
 #define ATA_SUPPORT_SECURITY            0x0002
 #define ATA_SUPPORT_REMOVABLE           0x0004
 #define ATA_SUPPORT_POWERMGT            0x0008
 #define ATA_SUPPORT_PACKET              0x0010
 #define ATA_SUPPORT_WRITECACHE          0x0020
 #define ATA_SUPPORT_LOOKAHEAD           0x0040
 #define ATA_SUPPORT_RELEASEIRQ          0x0080
 #define ATA_SUPPORT_SERVICEIRQ          0x0100
 #define ATA_SUPPORT_RESET               0x0200
 #define ATA_SUPPORT_PROTECTED           0x0400
 #define ATA_SUPPORT_WRITEBUFFER         0x1000
 #define ATA_SUPPORT_READBUFFER          0x2000
 #define ATA_SUPPORT_NOP                 0x4000
 
 /*083/086*/ u_int16_t   command2;
 #define ATA_SUPPORT_MICROCODE           0x0001
 #define ATA_SUPPORT_QUEUED              0x0002
 #define ATA_SUPPORT_CFA                 0x0004
 #define ATA_SUPPORT_APM                 0x0008
 #define ATA_SUPPORT_NOTIFY              0x0010
 #define ATA_SUPPORT_STANDBY             0x0020
 #define ATA_SUPPORT_SPINUP              0x0040
 #define ATA_SUPPORT_MAXSECURITY         0x0100
 #define ATA_SUPPORT_AUTOACOUSTIC        0x0200
 #define ATA_SUPPORT_ADDRESS48           0x0400
 #define ATA_SUPPORT_OVERLAY             0x0800
 #define ATA_SUPPORT_FLUSHCACHE          0x1000
 #define ATA_SUPPORT_FLUSHCACHE48        0x2000
 
 /*084/087*/ u_int16_t   extension;
 #define ATA_SUPPORT_SMARTLOG		0x0001
 #define ATA_SUPPORT_SMARTTEST		0x0002
 #define ATA_SUPPORT_MEDIASN		0x0004
 #define ATA_SUPPORT_MEDIAPASS		0x0008
 #define ATA_SUPPORT_STREAMING		0x0010
 #define ATA_SUPPORT_GENLOG		0x0020
 #define ATA_SUPPORT_WRITEDMAFUAEXT	0x0040
 #define ATA_SUPPORT_WRITEDMAQFUAEXT	0x0080
 #define ATA_SUPPORT_64BITWWN		0x0100
 #define ATA_SUPPORT_UNLOAD		0x2000
 	} __packed support, enabled;
 
 /*088*/ u_int16_t       udmamodes;              /* UltraDMA modes */
 /*089*/ u_int16_t       erase_time;             /* time req'd in 2min units */
 /*090*/ u_int16_t       enhanced_erase_time;    /* time req'd in 2min units */
 /*091*/ u_int16_t       apm_value;
 /*092*/ u_int16_t       master_passwd_revision; /* password revision code */
 /*093*/ u_int16_t       hwres;
 #define ATA_CABLE_ID                    0x2000
 
 /*094*/ u_int16_t       acoustic;
 #define ATA_ACOUSTIC_CURRENT(x)         ((x) & 0x00ff)
 #define ATA_ACOUSTIC_VENDOR(x)          (((x) & 0xff00) >> 8)
 
 /*095*/ u_int16_t       stream_min_req_size;
 /*096*/ u_int16_t       stream_transfer_time;
 /*097*/ u_int16_t       stream_access_latency;
 /*098*/ u_int32_t       stream_granularity;
 /*100*/ u_int16_t       lba_size48_1;
 	u_int16_t       lba_size48_2;
 	u_int16_t       lba_size48_3;
 	u_int16_t       lba_size48_4;
 	u_int16_t       reserved104;
 /*105*/	u_int16_t       max_dsm_blocks;
 /*106*/	u_int16_t       pss;
 #define ATA_PSS_LSPPS			0x000F
 #define ATA_PSS_LSSABOVE512		0x1000
 #define ATA_PSS_MULTLS			0x2000
 #define ATA_PSS_VALID_MASK		0xC000
 #define ATA_PSS_VALID_VALUE		0x4000
 /*107*/ u_int16_t       isd;
 /*108*/ u_int16_t       wwn[4];
 	u_int16_t       reserved112[5];
 /*117*/ u_int16_t       lss_1;
 /*118*/ u_int16_t       lss_2;
 /*119*/ u_int16_t       support2;
 #define ATA_SUPPORT_WRITEREADVERIFY	0x0002
 #define ATA_SUPPORT_WRITEUNCORREXT	0x0004
 #define ATA_SUPPORT_RWLOGDMAEXT		0x0008
 #define ATA_SUPPORT_MICROCODE3		0x0010
 #define ATA_SUPPORT_FREEFALL		0x0020
 #define ATA_SUPPORT_SENSE_REPORT	0x0040
 #define ATA_SUPPORT_EPC			0x0080
 #define ATA_SUPPORT_AMAX_ADDR		0x0100
 #define ATA_SUPPORT_DSN			0x0200
 /*120*/ u_int16_t       enabled2;
 #define ATA_ENABLED_WRITEREADVERIFY	0x0002
 #define ATA_ENABLED_WRITEUNCORREXT	0x0004
 #define ATA_ENABLED_FREEFALL		0x0020
 #define ATA_ENABLED_SENSE_REPORT	0x0040
 #define ATA_ENABLED_EPC			0x0080
 #define ATA_ENABLED_DSN			0x0200
 	u_int16_t       reserved121[6];
 /*127*/ u_int16_t       removable_status;
 /*128*/ u_int16_t       security_status;
 #define ATA_SECURITY_LEVEL		0x0100	/* 0: high, 1: maximum */
 #define ATA_SECURITY_ENH_SUPP		0x0020	/* enhanced erase supported */
 #define ATA_SECURITY_COUNT_EXP		0x0010	/* count expired */
 #define ATA_SECURITY_FROZEN		0x0008	/* security config is frozen */
 #define ATA_SECURITY_LOCKED		0x0004	/* drive is locked */
 #define ATA_SECURITY_ENABLED		0x0002	/* ATA Security is enabled */
 #define ATA_SECURITY_SUPPORTED		0x0001	/* ATA Security is supported */
 
 	u_int16_t       reserved129[31];
 /*160*/ u_int16_t       cfa_powermode1;
 	u_int16_t       reserved161;
 /*162*/ u_int16_t       cfa_kms_support;
 /*163*/ u_int16_t       cfa_trueide_modes;
 /*164*/ u_int16_t       cfa_memory_modes;
 	u_int16_t       reserved165[3];
 /*168*/ u_int16_t       form_factor;
 #define ATA_FORM_FACTOR_MASK		0x000f
 #define ATA_FORM_FACTOR_NOT_REPORTED	0x0000
 #define ATA_FORM_FACTOR_5_25		0x0001
 #define ATA_FORM_FACTOR_3_5		0x0002
 #define ATA_FORM_FACTOR_2_5		0x0003
 #define ATA_FORM_FACTOR_1_8		0x0004
 #define ATA_FORM_FACTOR_SUB_1_8		0x0005
 #define ATA_FORM_FACTOR_MSATA		0x0006
 #define ATA_FORM_FACTOR_M_2		0x0007
 #define ATA_FORM_FACTOR_MICRO_SSD	0x0008
 #define ATA_FORM_FACTOR_C_FAST		0x0009
 /*169*/	u_int16_t       support_dsm;
 #define ATA_SUPPORT_DSM_TRIM		0x0001
 	u_int16_t       reserved170[6];
 /*176*/ u_int8_t        media_serial[60];
 /*206*/ u_int16_t       sct;
 	u_int16_t       reserved207[2];
 /*209*/ u_int16_t       lsalign;
 /*210*/ u_int16_t       wrv_sectors_m3_1;
 	u_int16_t       wrv_sectors_m3_2;
 /*212*/ u_int16_t       wrv_sectors_m2_1;
 	u_int16_t       wrv_sectors_m2_2;
 /*214*/ u_int16_t       nv_cache_caps;
 /*215*/ u_int16_t       nv_cache_size_1;
 	u_int16_t       nv_cache_size_2;
 /*217*/ u_int16_t       media_rotation_rate;
 #define ATA_RATE_NOT_REPORTED		0x0000
 #define ATA_RATE_NON_ROTATING		0x0001
 	u_int16_t       reserved218;
 /*219*/ u_int16_t       nv_cache_opt;
 /*220*/ u_int16_t       wrv_mode;
 	u_int16_t       reserved221;
 /*222*/ u_int16_t       transport_major;
 /*223*/ u_int16_t       transport_minor;
 	u_int16_t       reserved224[31];
 /*255*/ u_int16_t       integrity;
 } __packed;
 
 /*
  * ATA Dataset Management
  *
  * Note that ATA_DSM_BLK_RANGES * ATA_DSM_RANGE_SIZE == ATA_DSM_BLK_SIZE
  * (64 * 8 == 512), i.e. presumably one block holds ATA_DSM_BLK_RANGES
  * range entries of ATA_DSM_RANGE_SIZE bytes each.  ATA_DSM_RANGE_MAX is
  * the largest value that fits in 16 bits; NOTE(review): presumably the
  * maximum length of a single range entry -- confirm against the users.
  */
 #define ATA_DSM_BLK_SIZE	512
 #define ATA_DSM_BLK_RANGES	64
 #define ATA_DSM_RANGE_SIZE	8
 #define ATA_DSM_RANGE_MAX	65535
 
 /*
  * ATA Device Register
  *
  * bit 7 Obsolete (was 1 in early ATA specs)
  * bit 6 Sets LBA/CHS mode. 1=LBA, 0=CHS
  * bit 5 Obsolete (was 1 in early ATA specs)
  * bit 4 1 = Slave Drive, 0 = Master Drive
  * bit 3-0 In LBA mode, 27-24 of address. In CHS mode, head number
  *
  * The definitions below cover bit 4 (ATA_DEV_MASTER/ATA_DEV_SLAVE) and
  * bit 6 (ATA_DEV_LBA) of this layout.
  */
 
 #define ATA_DEV_MASTER		0x00
 #define ATA_DEV_SLAVE		0x10
 #define ATA_DEV_LBA		0x40
 
 /* ATA limits */
 /* Largest value representable in 28 bits: 2^28 - 1 == 0x0fffffff. */
 #define ATA_MAX_28BIT_LBA	268435455UL
 
 /*
  * ATA Status Register
  *
  * One single-bit mask per bit; together the eight masks cover 0xff.
  */
 #define ATA_STATUS_ERROR		0x01
 #define ATA_STATUS_SENSE_AVAIL		0x02
 #define ATA_STATUS_ALIGN_ERR		0x04
 #define ATA_STATUS_DATA_REQ		0x08
 #define ATA_STATUS_DEF_WRITE_ERR	0x10
 #define ATA_STATUS_DEVICE_FAULT		0x20
 #define ATA_STATUS_DEVICE_READY		0x40
 #define ATA_STATUS_BUSY			0x80
 
 /*
  * ATA Error Register
  *
  * Single-bit masks; only these two bits are given names here.
  */
 #define ATA_ERROR_ABORT		0x04
 #define ATA_ERROR_ID_NOT_FOUND	0x10
 
 /*
  * ATA HPA Features
  *
  * Consecutive values 0x00-0x04 (an enumeration, not bit masks).
  * NOTE(review): presumably feature-register values selecting the HPA
  * sub-operation -- confirm against the code that issues them.
  */
 #define ATA_HPA_FEAT_MAX_ADDR	0x00
 #define ATA_HPA_FEAT_SET_PWD	0x01
 #define ATA_HPA_FEAT_LOCK	0x02
 #define ATA_HPA_FEAT_UNLOCK	0x03
 #define ATA_HPA_FEAT_FREEZE	0x04
 
 /*
  * ATA transfer modes
  *
  * ATA_MODE_MASK selects the low nibble of a mode value and ATA_DMA_MASK
  * the high nibble.  The values are grouped by high nibble: 0x0x for the
  * PIO modes, 0x1x/0x2x for ATA_DMA and the WDMA modes, 0x4x for the UDMA
  * and ATA_SA* modes.  ATA_PIO_MAX and ATA_DMA_MAX are the upper bounds
  * of the 0x0x and 0x4x groups respectively.
  */
 #define ATA_MODE_MASK           0x0f
 #define ATA_DMA_MASK            0xf0
 #define ATA_PIO                 0x00
 #define ATA_PIO0                0x08
 #define ATA_PIO1                0x09
 #define ATA_PIO2                0x0a
 #define ATA_PIO3                0x0b
 #define ATA_PIO4                0x0c
 #define ATA_PIO_MAX             0x0f
 #define ATA_DMA                 0x10
 #define ATA_WDMA0               0x20
 #define ATA_WDMA1               0x21
 #define ATA_WDMA2               0x22
 #define ATA_UDMA0               0x40
 #define ATA_UDMA1               0x41
 #define ATA_UDMA2               0x42
 #define ATA_UDMA3               0x43
 #define ATA_UDMA4               0x44
 #define ATA_UDMA5               0x45
 #define ATA_UDMA6               0x46
 #define ATA_SA150               0x47
 #define ATA_SA300               0x48
 #define ATA_SA600               0x49
 #define ATA_DMA_MAX             0x4f
 
 
 /*
  * ATA commands
  *
  * Definitions that start right after "#define" are command opcodes.
  * The definitions indented beneath some of them appear to be
  * sub-command / feature values belonging to the preceding command
  * (NOTE(review): confirm against the ATA command set); their values
  * live in a separate number space from the opcodes and may repeat.
  */
 #define ATA_NOP                         0x00    /* NOP */
 #define         ATA_NF_FLUSHQUEUE       0x00    /* flush queued cmd's */
 #define         ATA_NF_AUTOPOLL         0x01    /* start autopoll function */
 #define ATA_DATA_SET_MANAGEMENT		0x06
 #define 	ATA_DSM_TRIM		0x01
 #define ATA_DEVICE_RESET                0x08    /* reset device */
 #define ATA_READ                        0x20    /* read */
 #define ATA_READ48                      0x24    /* read 48bit LBA */
 #define ATA_READ_DMA48                  0x25    /* read DMA 48bit LBA */
 #define ATA_READ_DMA_QUEUED48           0x26    /* read DMA QUEUED 48bit LBA */
 #define ATA_READ_NATIVE_MAX_ADDRESS48   0x27    /* read native max addr 48bit */
 #define ATA_READ_MUL48                  0x29    /* read multi 48bit LBA */
 #define ATA_READ_STREAM_DMA48           0x2a    /* read DMA stream 48bit LBA */
 #define ATA_READ_LOG_EXT                0x2f    /* read log ext - PIO Data-In */
 #define ATA_READ_STREAM48               0x2b    /* read stream 48bit LBA */
 #define ATA_WRITE                       0x30    /* write */
 #define ATA_WRITE48                     0x34    /* write 48bit LBA */
 #define ATA_WRITE_DMA48                 0x35    /* write DMA 48bit LBA */
 #define ATA_WRITE_DMA_QUEUED48          0x36    /* write DMA QUEUED 48bit LBA */
 #define ATA_SET_MAX_ADDRESS48           0x37    /* set max address 48bit */
 #define ATA_WRITE_MUL48                 0x39    /* write multi 48bit LBA */
 #define ATA_WRITE_STREAM_DMA48          0x3a
 #define ATA_WRITE_STREAM48              0x3b
 #define ATA_WRITE_DMA_FUA48             0x3d
 #define ATA_WRITE_DMA_QUEUED_FUA48      0x3e
 #define ATA_WRITE_LOG_EXT               0x3f
 #define ATA_READ_VERIFY                 0x40
 #define ATA_READ_VERIFY48               0x42
 #define ATA_WRITE_UNCORRECTABLE48       0x45    /* write uncorrectable 48bit LBA */
 #define         ATA_WU_PSEUDO           0x55    /* pseudo-uncorrectable error */
 #define         ATA_WU_FLAGGED          0xaa    /* flagged-uncorrectable error */
 #define ATA_READ_LOG_DMA_EXT            0x47    /* read log DMA ext */
 #define	ATA_ZAC_MANAGEMENT_IN		0x4a	/* ZAC management in */
 #define		ATA_ZM_REPORT_ZONES	0x00	/* report zones */
 #define	ATA_WRITE_LOG_DMA_EXT		0x57	/* WRITE LOG DMA EXT */
 #define	ATA_TRUSTED_NON_DATA		0x5b	/* TRUSTED NON-DATA */
 #define	ATA_TRUSTED_RECEIVE		0x5c	/* TRUSTED RECEIVE */
 #define	ATA_TRUSTED_RECEIVE_DMA		0x5d	/* TRUSTED RECEIVE DMA */
 #define	ATA_TRUSTED_SEND		0x5e	/* TRUSTED SEND */
 #define	ATA_TRUSTED_SEND_DMA		0x5f	/* TRUSTED SEND DMA */
 #define ATA_READ_FPDMA_QUEUED           0x60    /* read DMA NCQ */
 #define ATA_WRITE_FPDMA_QUEUED          0x61    /* write DMA NCQ */
 #define ATA_NCQ_NON_DATA		0x63	/* NCQ non-data command */
 #define		ATA_ABORT_NCQ_QUEUE	0x00	/* abort NCQ queue */
 #define		ATA_DEADLINE_HANDLING	0x01	/* deadline handling */
 /* Not to be confused with the ATA_SETFEATURES (0xef) opcode further down. */
 #define		ATA_SET_FEATURES	0x05	/* set features */
 #define		ATA_ZERO_EXT		0x06	/* zero ext */
 #define		ATA_NCQ_ZAC_MGMT_OUT	0x07	/* NCQ ZAC mgmt out no data */
 #define ATA_SEND_FPDMA_QUEUED           0x64    /* send DMA NCQ */
 #define		ATA_SFPDMA_DSM		0x00	/* Data set management */
 #define			ATA_SFPDMA_DSM_TRIM	0x01	/* Set trim bit in auxiliary */
 #define		ATA_SFPDMA_HYBRID_EVICT	0x01	/* Hybrid Evict */
 #define		ATA_SFPDMA_WLDMA	0x02	/* Write Log DMA EXT */
 #define		ATA_SFPDMA_ZAC_MGMT_OUT	0x03	/* NCQ ZAC mgmt out w/data */
 #define ATA_RECV_FPDMA_QUEUED           0x65    /* receive DMA NCQ */
 #define		ATA_RFPDMA_RL_DMA_EXT	0x00	/* Read Log DMA EXT */
 #define		ATA_RFPDMA_ZAC_MGMT_IN	0x02	/* NCQ ZAC mgmt in w/data */
 #define ATA_SEP_ATTN                    0x67    /* SEP request */
 #define ATA_SEEK                        0x70    /* seek */
 #define	ATA_AMAX_ADDR			0x78	/* Accessible Max Address */
 #define		ATA_AMAX_ADDR_GET	0x00	/* GET NATIVE MAX ADDRESS EXT */
 #define		ATA_AMAX_ADDR_SET	0x01	/* SET ACCESSIBLE MAX ADDRESS EXT */
 #define		ATA_AMAX_ADDR_FREEZE	0x02	/* FREEZE ACCESSIBLE MAX ADDRESS EXT */
 #define	ATA_ZAC_MANAGEMENT_OUT		0x9f	/* ZAC management out */
 #define		ATA_ZM_CLOSE_ZONE	0x01	/* close zone */
 #define		ATA_ZM_FINISH_ZONE	0x02	/* finish zone */
 #define		ATA_ZM_OPEN_ZONE	0x03	/* open zone */
 #define		ATA_ZM_RWP		0x04	/* reset write pointer */
 #define	ATA_DOWNLOAD_MICROCODE		0x92	/* DOWNLOAD MICROCODE */
 #define	ATA_DOWNLOAD_MICROCODE_DMA	0x93	/* DOWNLOAD MICROCODE DMA */
 #define ATA_PACKET_CMD                  0xa0    /* packet command */
 #define ATA_ATAPI_IDENTIFY              0xa1    /* get ATAPI params */
 #define ATA_SERVICE                     0xa2    /* service command */
 #define ATA_SMART_CMD                   0xb0    /* SMART command */
+#define	ATA_SANITIZE			0xb4	/* sanitize device */
 #define ATA_CFA_ERASE                   0xc0    /* CFA erase */
 #define ATA_READ_MUL                    0xc4    /* read multi */
 #define ATA_WRITE_MUL                   0xc5    /* write multi */
 #define ATA_SET_MULTI                   0xc6    /* set multi size */
 #define ATA_READ_DMA_QUEUED             0xc7    /* read DMA QUEUED */
 #define ATA_READ_DMA                    0xc8    /* read DMA */
 #define ATA_WRITE_DMA                   0xca    /* write DMA */
 #define ATA_WRITE_DMA_QUEUED            0xcc    /* write DMA QUEUED */
 #define ATA_WRITE_MUL_FUA48             0xce
 #define ATA_STANDBY_IMMEDIATE           0xe0    /* standby immediate */
 #define ATA_IDLE_IMMEDIATE              0xe1    /* idle immediate */
 #define ATA_STANDBY_CMD                 0xe2    /* standby */
 #define ATA_IDLE_CMD                    0xe3    /* idle */
 /*
  * Opcodes 0xe4 and 0xe8 are each listed under two names: once for the
  * buffer command and once for the port multiplier command.
  */
 #define ATA_READ_BUFFER                 0xe4    /* read buffer */
 #define ATA_READ_PM                     0xe4    /* read portmultiplier */
 #define ATA_CHECK_POWER_MODE            0xe5    /* device power mode */
 #define ATA_SLEEP                       0xe6    /* sleep */
 #define ATA_FLUSHCACHE                  0xe7    /* flush cache to disk */
 #define	ATA_WRITE_BUFFER		0xe8    /* write buffer */
 #define ATA_WRITE_PM                    0xe8    /* write portmultiplier */
 #define	ATA_READ_BUFFER_DMA		0xe9    /* read buffer DMA */
 #define ATA_FLUSHCACHE48                0xea    /* flush cache to disk */
 #define	ATA_WRITE_BUFFER_DMA		0xeb    /* write buffer DMA */
 #define ATA_ATA_IDENTIFY                0xec    /* get ATA params */
 #define ATA_SETFEATURES                 0xef    /* features command */
 #define         ATA_SF_ENAB_WCACHE      0x02    /* enable write cache */
 #define         ATA_SF_DIS_WCACHE       0x82    /* disable write cache */
 #define         ATA_SF_SETXFER          0x03    /* set transfer mode */
 #define		ATA_SF_APM		0x05	/* Enable APM feature set */
 #define         ATA_SF_ENAB_PUIS        0x06    /* enable PUIS */
 #define         ATA_SF_DIS_PUIS         0x86    /* disable PUIS */
 #define         ATA_SF_PUIS_SPINUP      0x07    /* PUIS spin-up */
 #define		ATA_SF_WRV		0x0b	/* Enable Write-Read-Verify */
 #define 	ATA_SF_DLC		0x0c	/* Enable device life control */
 #define 	ATA_SF_SATA		0x10	/* Enable use of SATA feature */
 #define 	ATA_SF_FFC		0x41	/* Free-fall Control */
 #define 	ATA_SF_MHIST		0x43	/* Set Max Host Sect. Times */
 #define 	ATA_SF_RATE		0x45	/* Set Rate Basis */
 #define 	ATA_SF_EPC		0x4A	/* Extended Power Conditions */
 #define         ATA_SF_ENAB_RCACHE      0xaa    /* enable readahead cache */
 #define         ATA_SF_DIS_RCACHE       0x55    /* disable readahead cache */
 #define         ATA_SF_ENAB_RELIRQ      0x5d    /* enable release interrupt */
 #define         ATA_SF_DIS_RELIRQ       0xdd    /* disable release interrupt */
 #define         ATA_SF_ENAB_SRVIRQ      0x5e    /* enable service interrupt */
 #define         ATA_SF_DIS_SRVIRQ       0xde    /* disable service interrupt */
 #define 	ATA_SF_LPSAERC		0x62	/* Long Phys Sect Align ErrRep*/
 #define 	ATA_SF_DSN		0x63	/* Device Stats Notification */
 /* NOTE: repeats the ATA_CHECK_POWER_MODE definition above (same value). */
 #define ATA_CHECK_POWER_MODE		0xe5	/* Check Power Mode */
 #define ATA_SECURITY_SET_PASSWORD       0xf1    /* set drive password */
 #define ATA_SECURITY_UNLOCK             0xf2    /* unlock drive using passwd */
 #define ATA_SECURITY_ERASE_PREPARE      0xf3    /* prepare to erase drive */
 #define ATA_SECURITY_ERASE_UNIT         0xf4    /* erase all blocks on drive */
 #define ATA_SECURITY_FREEZE_LOCK        0xf5    /* freeze security config */
 #define ATA_SECURITY_DISABLE_PASSWORD   0xf6    /* disable drive password */
 #define ATA_READ_NATIVE_MAX_ADDRESS     0xf8    /* read native max address */
 #define ATA_SET_MAX_ADDRESS             0xf9    /* set max address */
 
 
 /*
  * ATAPI commands
  *
  * Definitions that start right after "#define" are command opcodes; the
  * indented ones (ATAPI_WF_*, ATAPI_SP_*, ATAPI_SS_*) appear to be
  * parameter values for the command listed just above them.
  */
 #define ATAPI_TEST_UNIT_READY           0x00    /* check if device is ready */
 #define ATAPI_REZERO                    0x01    /* rewind */
 #define ATAPI_REQUEST_SENSE             0x03    /* get sense data */
 #define ATAPI_FORMAT                    0x04    /* format unit */
 #define ATAPI_READ                      0x08    /* read data */
 #define ATAPI_WRITE                     0x0a    /* write data */
 #define ATAPI_WEOF                      0x10    /* write filemark */
 #define         ATAPI_WF_WRITE          0x01
 #define ATAPI_SPACE                     0x11    /* space command */
 #define         ATAPI_SP_FM             0x01
 #define         ATAPI_SP_EOD            0x03
 #define ATAPI_INQUIRY			0x12	/* get inquiry data */
 #define ATAPI_MODE_SELECT               0x15    /* mode select */
 #define ATAPI_ERASE                     0x19    /* erase */
 #define ATAPI_MODE_SENSE                0x1a    /* mode sense */
 #define ATAPI_START_STOP                0x1b    /* start/stop unit */
 #define         ATAPI_SS_LOAD           0x01
 #define         ATAPI_SS_RETENSION      0x02
 #define         ATAPI_SS_EJECT          0x04
 #define ATAPI_PREVENT_ALLOW             0x1e    /* media removal */
 #define ATAPI_READ_FORMAT_CAPACITIES    0x23    /* get format capacities */
 #define ATAPI_READ_CAPACITY             0x25    /* get volume capacity */
 #define ATAPI_READ_BIG                  0x28    /* read data */
 #define ATAPI_WRITE_BIG                 0x2a    /* write data */
 #define ATAPI_LOCATE                    0x2b    /* locate to position */
 #define ATAPI_READ_POSITION             0x34    /* read position */
 #define ATAPI_SYNCHRONIZE_CACHE         0x35    /* flush buf, close channel */
 #define ATAPI_WRITE_BUFFER              0x3b    /* write device buffer */
 #define ATAPI_READ_BUFFER               0x3c    /* read device buffer */
 #define ATAPI_READ_SUBCHANNEL           0x42    /* get subchannel info */
 #define ATAPI_READ_TOC                  0x43    /* get table of contents */
 #define ATAPI_PLAY_10                   0x45    /* play by lba */
 #define ATAPI_PLAY_MSF                  0x47    /* play by MSF address */
 #define ATAPI_PLAY_TRACK                0x48    /* play by track number */
 #define ATAPI_PAUSE                     0x4b    /* pause audio operation */
 #define ATAPI_READ_DISK_INFO            0x51    /* get disk info structure */
 #define ATAPI_READ_TRACK_INFO           0x52    /* get track info structure */
 #define ATAPI_RESERVE_TRACK             0x53    /* reserve track */
 #define ATAPI_SEND_OPC_INFO             0x54    /* send OPC structure */
 #define ATAPI_MODE_SELECT_BIG           0x55    /* set device parameters */
 #define ATAPI_REPAIR_TRACK              0x58    /* repair track */
 #define ATAPI_READ_MASTER_CUE           0x59    /* read master CUE info */
 #define ATAPI_MODE_SENSE_BIG            0x5a    /* get device parameters */
 #define ATAPI_CLOSE_TRACK               0x5b    /* close track/session */
 #define ATAPI_READ_BUFFER_CAPACITY      0x5c    /* get buffer capacity */
 #define ATAPI_SEND_CUE_SHEET            0x5d    /* send CUE sheet */
 #define ATAPI_SERVICE_ACTION_IN         0x96	/* get service data */
 #define ATAPI_BLANK                     0xa1    /* blank the media */
 #define ATAPI_SEND_KEY                  0xa3    /* send DVD key structure */
 #define ATAPI_REPORT_KEY                0xa4    /* get DVD key structure */
 #define ATAPI_PLAY_12                   0xa5    /* play by lba */
 #define ATAPI_LOAD_UNLOAD               0xa6    /* changer control command */
 #define ATAPI_READ_STRUCTURE            0xad    /* get DVD structure */
 #define ATAPI_PLAY_CD                   0xb4    /* universal play command */
 #define ATAPI_SET_SPEED                 0xbb    /* set drive speed */
 #define ATAPI_MECH_STATUS               0xbd    /* get changer status */
 #define ATAPI_READ_CD                   0xbe    /* read data */
 #define ATAPI_POLL_DSC                  0xff    /* poll DSC status bit */
 
 
 /*
  * Argument structure of the IOCATADEVICES ioctl.
  *
  * name[] and params[] each have two slots; NOTE(review): presumably one
  * per device position on the channel (master/slave) -- confirm against
  * the ioctl handler.
  */
 struct ata_ioc_devices {
     int                 channel;	/* channel number */
     char                name[2][32];	/* two names, up to 32 bytes each */
     struct ata_params   params[2];	/* two device parameter blocks */
 };
 
 /*
  * Per-channel ATA ioctl calls.
  *
  * All use ioctl group 'a'.  Requests 1-4 carry an int; request 5
  * carries a struct ata_ioc_devices in both directions (_IOWR).
  */
 #define IOCATAGMAXCHANNEL       _IOR('a',  1, int)
 #define IOCATAREINIT            _IOW('a',  2, int)
 #define IOCATAATTACH            _IOW('a',  3, int)
 #define IOCATADETACH            _IOW('a',  4, int)
 #define IOCATADEVICES           _IOWR('a',  5, struct ata_ioc_devices)
 
 /* ATAPI request sense structure */
 struct atapi_sense {
     u_int8_t	error;				/* current or deferred errors */
 #define	ATA_SENSE_VALID			0x80
 
     u_int8_t	segment;			/* segment number */
     u_int8_t	key;				/* sense key */
 #define ATA_SENSE_KEY_MASK		0x0f    /* sense key mask */
 #define ATA_SENSE_NO_SENSE		0x00    /* no specific sense key info */
 #define ATA_SENSE_RECOVERED_ERROR 	0x01    /* command OK, data recovered */
 #define ATA_SENSE_NOT_READY		0x02    /* no access to drive */
 #define ATA_SENSE_MEDIUM_ERROR		0x03    /* non-recovered data error */
 #define ATA_SENSE_HARDWARE_ERROR	0x04    /* non-recoverable HW failure */
 #define ATA_SENSE_ILLEGAL_REQUEST	0x05    /* invalid command param(s) */
 #define ATA_SENSE_UNIT_ATTENTION	0x06    /* media changed */
 #define ATA_SENSE_DATA_PROTECT		0x07    /* write protect */
 #define ATA_SENSE_BLANK_CHECK		0x08    /* blank check */
 #define ATA_SENSE_VENDOR_SPECIFIC	0x09    /* vendor specific skey */
 #define ATA_SENSE_COPY_ABORTED		0x0a    /* copy aborted */
 #define ATA_SENSE_ABORTED_COMMAND	0x0b    /* command aborted, try again */
 #define ATA_SENSE_EQUAL			0x0c    /* equal */
 #define ATA_SENSE_VOLUME_OVERFLOW	0x0d    /* volume overflow */
 #define ATA_SENSE_MISCOMPARE		0x0e    /* data dont match the medium */
 #define ATA_SENSE_RESERVED		0x0f
 #define	ATA_SENSE_ILI			0x20;
 #define	ATA_SENSE_EOM			0x40;
 #define	ATA_SENSE_FILEMARK		0x80;
 
     u_int32_t   cmd_info;		/* cmd information */
     u_int8_t	sense_length;		/* additional sense len (n-7) */
     u_int32_t   cmd_specific_info;	/* additional cmd spec info */
     u_int8_t    asc;			/* additional sense code */
     u_int8_t    ascq;			/* additional sense code qual */
     u_int8_t    replaceable_unit_code;	/* replaceable unit code */
     u_int8_t	specific;		/* sense key specific */
 #define	ATA_SENSE_SPEC_VALID	0x80
 #define	ATA_SENSE_SPEC_MASK	0x7f
 	
     u_int8_t	specific1;		/* sense key specific */
     u_int8_t	specific2;		/* sense key specific */
 } __packed;
 
 /*
  * SET FEATURES subcommands
  */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * These values go in the LBA 3:0.
  */
 #define ATA_SF_EPC_RESTORE	0x00	/* Restore Power Condition Settings */
 #define ATA_SF_EPC_GOTO		0x01	/* Go To Power Condition */
 #define ATA_SF_EPC_SET_TIMER	0x02	/* Set Power Condition Timer */
 #define ATA_SF_EPC_SET_STATE	0x03	/* Set Power Condition State */
 #define ATA_SF_EPC_ENABLE	0x04	/* Enable the EPC feature set */
 #define ATA_SF_EPC_DISABLE	0x05	/* Disable the EPC feature set */
 #define ATA_SF_EPC_SET_SOURCE	0x06	/* Set EPC Power Source */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Power Condition ID field
  * These values go in the count register.
  */
 #define ATA_EPC_STANDBY_Z	0x00	/* Substate of PM2:Standby */
 #define ATA_EPC_STANDBY_Y	0x01	/* Substate of PM2:Standby */
 #define ATA_EPC_IDLE_A		0x81	/* Substate of PM1:Idle */
 #define ATA_EPC_IDLE_B		0x82	/* Substate of PM1:Idle */
 #define ATA_EPC_IDLE_C		0x83	/* Substate of PM1:Idle */
 #define ATA_EPC_ALL		0xff	/* All supported power conditions */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Restore Power Conditions Settings subcommand
  * These values go in the LBA register.
  */
 #define ATA_SF_EPC_RST_DFLT	0x40	/* 1=Rst from Default, 0= from Saved */
 #define ATA_SF_EPC_RST_SAVE	0x10	/* 1=Save on completion */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Go To Power Condition subcommand
  * These values go in the LBA register.
  */
 #define ATA_SF_EPC_GOTO_DELAY	0x02000000	/* Delayed entry bit */
 #define ATA_SF_EPC_GOTO_HOLD	0x01000000	/* Hold Power Cond bit */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Set Power Condition Timer subcommand
  * These values go in the LBA register.
  */
 #define ATA_SF_EPC_TIMER_MASK	0x00ffff00	/* Timer field */
 #define ATA_SF_EPC_TIMER_SHIFT	8		/* Bit offset of timer field */
 #define ATA_SF_EPC_TIMER_SEC	0x00000080	/* Timer units, 1=sec, 0=.1s */
 #define ATA_SF_EPC_TIMER_EN	0x00000020	/* Enable/disable cond. */
 #define ATA_SF_EPC_TIMER_SAVE	0x00000010	/* Save settings on comp.  */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Set Power Condition State subcommand
  * These values go in the LBA register.
  */
 #define ATA_SF_EPC_SETCON_EN	0x00000020	/* Enable power cond. */
 #define ATA_SF_EPC_SETCON_SAVE	0x00000010	/* Save settings on comp */
 
 /*
  * SET FEATURES command
  * Extended Power Conditions subcommand -- ATA_SF_EPC (0x4A)
  * Set EPC Power Source subcommand
  * These values go in the count register.
  */
 #define ATA_SF_EPC_SRC_UNKNOWN	0x0000	/* Unknown source */
 #define ATA_SF_EPC_SRC_BAT	0x0001	/* battery source */
 #define ATA_SF_EPC_SRC_NOT_BAT	0x0002	/* not battery source */
 
 /*
  * Log addresses, each followed by the page numbers defined within that log.
  */
 #define	ATA_LOG_DIRECTORY	0x00	/* Directory of all logs */
 /* Power Conditions log (0x08) and its pages (ATA_PCL_*). */
 #define	ATA_POWER_COND_LOG	0x08	/* Power Conditions Log */
 #define	ATA_PCL_IDLE		0x00	/* Idle Power Conditions Page */
 #define	ATA_PCL_STANDBY		0x01	/* Standby Power Conditions Page */
 /* IDENTIFY DEVICE data log (0x30) and its pages (ATA_IDL_*). */
 #define	ATA_IDENTIFY_DATA_LOG	0x30	/* Identify Device Data Log */
 #define	ATA_IDL_PAGE_LIST	0x00	/* List of supported pages */
 #define	ATA_IDL_IDENTIFY_DATA	0x01	/* Copy of Identify Device data */
 #define	ATA_IDL_CAPACITY	0x02	/* Capacity */
 #define	ATA_IDL_SUP_CAP		0x03	/* Supported Capabilities */
 #define	ATA_IDL_CUR_SETTINGS	0x04	/* Current Settings */
 #define	ATA_IDL_ATA_STRINGS	0x05	/* ATA Strings */
 #define	ATA_IDL_SECURITY	0x06	/* Security */
 #define	ATA_IDL_PARALLEL_ATA	0x07	/* Parallel ATA */
 #define	ATA_IDL_SERIAL_ATA	0x08	/* Serial ATA */
 #define	ATA_IDL_ZDI		0x09	/* Zoned Device Information */
 
 /*
  * General Purpose Log directory: a 2-byte header followed by 255 two-byte
  * page counts, 512 bytes in total.
  * NOTE(review): presumably the data returned for ATA_LOG_DIRECTORY, with
  * num_pages[] ordered by log address -- confirm against the ATA spec.
  */
 struct ata_gp_log_dir {
 	uint8_t header[2];
 #define	ATA_GP_LOG_DIR_VERSION		0x0001
 	uint8_t num_pages[255*2];	/* Number of log pages at address */
 };
 
 /*
  * ATA Power Conditions log descriptor
  *
  * One 64-byte record describing a single power condition.  The Idle and
  * Standby power conditions pages are built from these records.  All
  * multi-byte values are kept as raw byte arrays.
  * NOTE(review): the byte order of the 4-byte fields is not visible here --
  * confirm before decoding.
  */
 struct ata_power_cond_log_desc {
 	uint8_t reserved1;
 	uint8_t flags;			/* ATA_PCL_* bits below */
 #define ATA_PCL_COND_SUPPORTED		0x80
 #define ATA_PCL_COND_SAVEABLE		0x40
 #define ATA_PCL_COND_CHANGEABLE		0x20
 #define ATA_PCL_DEFAULT_TIMER_EN	0x10
 #define ATA_PCL_SAVED_TIMER_EN		0x08
 #define ATA_PCL_CURRENT_TIMER_EN	0x04
 #define ATA_PCL_HOLD_PC_NOT_SUP		0x02
 	uint8_t reserved2[2];
 	uint8_t default_timer[4];
 	uint8_t saved_timer[4];
 	uint8_t current_timer[4];
 	uint8_t nom_time_to_active[4];
 	uint8_t min_timer[4];
 	uint8_t max_timer[4];
 	uint8_t num_transitions_to_pc[4];
 	uint8_t hours_in_pc[4];
 	uint8_t reserved3[28];		/* pads the descriptor to 64 bytes */
 };
 
 /*
  * ATA Power Conditions Log (0x08), Idle power conditions page (0x00)
  *
  * Three 64-byte descriptors (Idle_a, Idle_b, Idle_c) followed by reserved
  * space that brings the page to 512 bytes.
  */
 struct ata_power_cond_log_idle {
 	struct ata_power_cond_log_desc idle_a_desc;
 	struct ata_power_cond_log_desc idle_b_desc;
 	struct ata_power_cond_log_desc idle_c_desc;
 	uint8_t reserved[320];
 };
 
 /*
  * ATA Power Conditions Log (0x08), Standby power conditions page (0x01)
  *
  * Unlike the Idle page, the reserved space comes first; the Standby_y and
  * Standby_z descriptors occupy the last 128 bytes of the 512-byte page.
  */
 struct ata_power_cond_log_standby {
 	uint8_t reserved[384];
 	struct ata_power_cond_log_desc standby_y_desc;
 	struct ata_power_cond_log_desc standby_z_desc;
 };
 
 /*
  * ATA IDENTIFY DEVICE data log (0x30) page 0x00
  * List of Supported IDENTIFY DEVICE data pages.
  *
  * An 8-byte header, a one-byte entry count and up to 503 one-byte entries
  * (512 bytes in total).
  * NOTE(review): each entry is presumably one of the ATA_IDL_* page
  * numbers -- confirm against the ATA spec.
  */
 struct ata_identify_log_pages {
 	uint8_t header[8];
 #define	ATA_IDLOG_REVISION	0x0000000000000001
 	uint8_t entry_count;
 	uint8_t entries[503];
 };
 
 /*
  * ATA IDENTIFY DEVICE data log (0x30)
  * Capacity (Page 0x02).
  *
  * Five 8-byte fields followed by reserved space, 512 bytes in total.  The
  * masks listed under each field are 64-bit values and apply to that field
  * once its bytes have been assembled into an integer.
  * NOTE(review): the byte order of the fields is not visible here -- confirm
  * before decoding.
  */
 struct ata_identify_log_capacity {
 	uint8_t header[8];
 #define	ATA_CAP_HEADER_VALID	0x8000000000000000
 #define	ATA_CAP_PAGE_NUM_MASK	0x0000000000ff0000
 #define	ATA_CAP_PAGE_NUM_SHIFT	16
 #define ATA_CAP_REV_MASK	0x00000000000000ff
 	uint8_t capacity[8];
 #define	ATA_CAP_CAPACITY_VALID	0x8000000000000000
 #define	ATA_CAP_ACCESSIBLE_CAP	0x0000ffffffffffff
 	uint8_t phys_logical_sect_size[8];
 #define	ATA_CAP_PL_VALID	0x8000000000000000
 #define	ATA_CAP_LTOP_REL_SUP	0x4000000000000000
 #define	ATA_CAP_LOG_SECT_SUP	0x2000000000000000
 #define	ATA_CAP_ALIGN_ERR_MASK	0x0000000000300000
 #define	ATA_CAP_LTOP_MASK	0x00000000000f0000
 #define	ATA_CAP_LOG_SECT_OFF	0x000000000000ffff
 	uint8_t logical_sect_size[8];
 #define	ATA_CAP_LOG_SECT_VALID	0x8000000000000000
 #define	ATA_CAP_LOG_SECT_SIZE	0x00000000ffffffff
 	uint8_t nominal_buffer_size[8];
 #define	ATA_CAP_NOM_BUF_VALID	0x8000000000000000
 #define	ATA_CAP_NOM_BUF_SIZE	0x7fffffffffffffff
 	uint8_t reserved[472];
 };
 
 /*
  * ATA IDENTIFY DEVICE data log (0x30)
  * Supported Capabilities (Page 0x03).
  *
  * Every field is a raw byte array.  The masks listed under an 8-byte field
  * are 64-bit values and apply to that field once its bytes have been
  * assembled into an integer; bit 63 of each such field is its "valid" bit.
  * The fields plus the reserved tail add up to 512 bytes.
  * NOTE(review): the byte order of the fields is not visible here -- confirm
  * before decoding.
  */
 
 struct ata_identify_log_sup_cap {
 	uint8_t header[8];
 #define	ATA_SUP_CAP_HEADER_VALID	0x8000000000000000
 #define	ATA_SUP_CAP_PAGE_NUM_MASK	0x0000000000ff0000
 #define	ATA_SUP_CAP_PAGE_NUM_SHIFT	16
 #define ATA_SUP_CAP_REV_MASK		0x00000000000000ff
 	uint8_t sup_cap[8];
 #define	ATA_SUP_CAP_VALID		0x8000000000000000
 #define	ATA_SC_SET_SECT_CONFIG_SUP	0x0002000000000000 /* Set Sect Conf*/
 #define	ATA_SC_ZERO_EXT_SUP		0x0001000000000000 /* Zero EXT */
 #define	ATA_SC_SUCC_NCQ_SENSE_SUP	0x0000800000000000 /* Succ. NCQ Sns */
 #define	ATA_SC_DLC_SUP			0x0000400000000000 /* DLC */
 #define	ATA_SC_RQSN_DEV_FAULT_SUP	0x0000200000000000 /* Req Sns Dev Flt*/
 #define	ATA_SC_DSN_SUP			0x0000100000000000 /* DSN */
 #define	ATA_SC_LP_STANDBY_SUP		0x0000080000000000 /* LP Standby */
 #define	ATA_SC_SET_EPC_PS_SUP		0x0000040000000000 /* Set EPC PS */
 #define	ATA_SC_AMAX_ADDR_SUP		0x0000020000000000 /* AMAX Addr */
 #define	ATA_SC_DRAT_SUP			0x0000008000000000 /* DRAT */
 #define	ATA_SC_LPS_MISALGN_SUP		0x0000004000000000 /* LPS Misalign */
 #define	ATA_SC_RB_DMA_SUP		0x0000001000000000 /* Read Buf DMA */
 #define	ATA_SC_WB_DMA_SUP		0x0000000800000000 /* Write Buf DMA */
 #define	ATA_SC_DNLD_MC_DMA_SUP		0x0000000200000000 /* DL MCode DMA */
 #define	ATA_SC_28BIT_SUP		0x0000000100000000 /* 28-bit */
 #define	ATA_SC_RZAT_SUP			0x0000000080000000 /* RZAT */
 #define	ATA_SC_NOP_SUP			0x0000000020000000 /* NOP */
 #define	ATA_SC_READ_BUFFER_SUP		0x0000000010000000 /* Read Buffer */
 #define	ATA_SC_WRITE_BUFFER_SUP		0x0000000008000000 /* Write Buffer */
 #define	ATA_SC_READ_LOOK_AHEAD_SUP	0x0000000002000000 /* Read Look-Ahead*/
 #define	ATA_SC_VOLATILE_WC_SUP		0x0000000001000000 /* Volatile WC */
 #define	ATA_SC_SMART_SUP		0x0000000000800000 /* SMART */
 #define	ATA_SC_FLUSH_CACHE_EXT_SUP	0x0000000000400000 /* Flush Cache Ext */
 #define	ATA_SC_48BIT_SUP		0x0000000000100000 /* 48-Bit */
 #define	ATA_SC_SPINUP_SUP		0x0000000000040000 /* Spin-Up */
 #define	ATA_SC_PUIS_SUP			0x0000000000020000 /* PUIS */
 #define	ATA_SC_APM_SUP			0x0000000000010000 /* APM */
 #define	ATA_SC_DL_MICROCODE_SUP		0x0000000000004000 /* DL Microcode */
 #define	ATA_SC_UNLOAD_SUP		0x0000000000002000 /* Unload */
 #define	ATA_SC_WRITE_FUA_EXT_SUP	0x0000000000001000 /* Write FUA EXT */
 #define	ATA_SC_GPL_SUP			0x0000000000000800 /* GPL */
 #define	ATA_SC_STREAMING_SUP		0x0000000000000400 /* Streaming */
 #define	ATA_SC_SMART_SELFTEST_SUP	0x0000000000000100 /* SMART self-test */
 #define	ATA_SC_SMART_ERR_LOG_SUP	0x0000000000000080 /* SMART Err Log */
 #define	ATA_SC_EPC_SUP			0x0000000000000040 /* EPC */
 #define	ATA_SC_SENSE_SUP		0x0000000000000020 /* Sense data */
 #define	ATA_SC_FREEFALL_SUP		0x0000000000000010 /* Free-Fall */
 #define	ATA_SC_DM_MODE3_SUP		0x0000000000000008 /* DM Mode 3 */
 #define	ATA_SC_GPL_DMA_SUP		0x0000000000000004 /* GPL DMA */
 #define ATA_SC_WRITE_UNCOR_SUP		0x0000000000000002 /* Write uncorr.  */
 #define ATA_SC_WRV_SUP			0x0000000000000001 /* WRV */
 	uint8_t download_code_cap[8];
 #define ATA_DL_CODE_VALID		0x8000000000000000
 #define	ATA_DLC_DM_OFFSETS_DEFER_SUP	0x0000000400000000
 #define	ATA_DLC_DM_IMMED_SUP		0x0000000200000000
 #define	ATA_DLC_DM_OFF_IMMED_SUP	0x0000000100000000
 #define	ATA_DLC_DM_MAX_XFER_SIZE_MASK	0x00000000ffff0000
 #define	ATA_DLC_DM_MAX_XFER_SIZE_SHIFT	16
 #define	ATA_DLC_DM_MIN_XFER_SIZE_MASK	0x000000000000ffff
 	uint8_t nom_media_rotation_rate[8];
 #define	ATA_NOM_MEDIA_ROTATION_VALID	0x8000000000000000
 #define	ATA_ROTATION_MASK		0x000000000000ffff
 	uint8_t form_factor[8];
 #define	ATA_FORM_FACTOR_VALID		0x8000000000000000
 #define	ATA_FF_MASK			0x000000000000000f
 #define	ATA_FF_NOT_REPORTED		0x0000000000000000 /* Not reported */
 #define	ATA_FF_525_IN			0x0000000000000001 /* 5.25 inch */
 #define	ATA_FF_35_IN			0x0000000000000002 /* 3.5 inch */
 #define	ATA_FF_25_IN			0x0000000000000003 /* 2.5 inch */
 #define	ATA_FF_18_IN			0x0000000000000004 /* 1.8 inch */
 #define	ATA_FF_LT_18_IN			0x0000000000000005 /* < 1.8 inch */
 #define	ATA_FF_MSATA			0x0000000000000006 /* mSATA */
 #define	ATA_FF_M2			0x0000000000000007 /* M.2 */
 #define	ATA_FF_MICROSSD			0x0000000000000008 /* MicroSSD */
 #define	ATA_FF_CFAST			0x0000000000000009 /* CFast */
 	uint8_t wrv_sec_cnt_mode3[8];
 #define ATA_WRV_MODE3_VALID		0x8000000000000000
 #define ATA_WRV_MODE3_COUNT		0x00000000ffffffff
 	uint8_t wrv_sec_cnt_mode2[8];
 #define	ATA_WRV_MODE2_VALID		0x8000000000000000
 #define ATA_WRV_MODE2_COUNT		0x00000000ffffffff
 	uint8_t wwn[16];
 	/* XXX KDM need to figure out how to handle 128-bit fields */
 	uint8_t dsm[8];
 #define	ATA_DSM_VALID			0x8000000000000000
 #define	ATA_LB_MARKUP_SUP		0x000000000000ff00
 #define	ATA_TRIM_SUP			0x0000000000000001
 	uint8_t util_per_unit_time[16];
 	/* XXX KDM need to figure out how to handle 128-bit fields */
 	uint8_t util_usage_rate_sup[8];
 #define	ATA_UTIL_USAGE_RATE_VALID	0x8000000000000000
 #define	ATA_SETTING_RATE_SUP		0x0000000000800000
 #define	ATA_SINCE_POWERON_SUP		0x0000000000000100
 #define	ATA_POH_RATE_SUP		0x0000000000000010
 #define	ATA_DATE_TIME_RATE_SUP		0x0000000000000001
 	uint8_t zoned_cap[8];
 #define	ATA_ZONED_VALID			0x8000000000000000
 #define	ATA_ZONED_MASK			0x0000000000000003
 	uint8_t sup_zac_cap[8];
 #define	ATA_SUP_ZAC_CAP_VALID		0x8000000000000000
 #define	ATA_ND_RWP_SUP			0x0000000000000010 /* Reset Write Ptr*/
 #define	ATA_ND_FINISH_ZONE_SUP		0x0000000000000008 /* Finish Zone */
 #define	ATA_ND_CLOSE_ZONE_SUP		0x0000000000000004 /* Close Zone */
 #define	ATA_ND_OPEN_ZONE_SUP		0x0000000000000002 /* Open Zone */
 #define	ATA_REPORT_ZONES_SUP		0x0000000000000001 /* Report Zones */
 	uint8_t reserved[392];
 };
 
 /*
  * ATA Identify Device Data Log Zoned Device Information Page (0x09).
  * Current as of ZAC r04a, August 25, 2015.
  *
  * Seven 8-byte fields followed by reserved space, 512 bytes in total.  The
  * masks listed under each field are 64-bit values and apply to that field
  * once its bytes have been assembled into an integer.
  * NOTE(review): the byte order of the fields is not visible here -- confirm
  * before decoding.
  */
 struct ata_zoned_info_log {
 	uint8_t header[8];
 #define	ATA_ZDI_HEADER_VALID	0x8000000000000000
 #define	ATA_ZDI_PAGE_NUM_MASK	0x0000000000ff0000
 #define	ATA_ZDI_PAGE_NUM_SHIFT	16
 #define ATA_ZDI_REV_MASK	0x00000000000000ff
 	uint8_t zoned_cap[8];
 #define	ATA_ZDI_CAP_VALID	0x8000000000000000
 #define	ATA_ZDI_CAP_URSWRZ	0x0000000000000001
 	uint8_t zoned_settings[8];
 #define	ATA_ZDI_SETTINGS_VALID	0x8000000000000000
 	uint8_t optimal_seq_zones[8];
 #define	ATA_ZDI_OPT_SEQ_VALID	0x8000000000000000
 #define	ATA_ZDI_OPT_SEQ_MASK	0x00000000ffffffff
 	uint8_t optimal_nonseq_zones[8];
 #define	ATA_ZDI_OPT_NS_VALID	0x8000000000000000
 #define	ATA_ZDI_OPT_NS_MASK	0x00000000ffffffff
 	uint8_t max_seq_req_zones[8];
 #define	ATA_ZDI_MAX_SEQ_VALID	0x8000000000000000
 #define	ATA_ZDI_MAX_SEQ_MASK	0x00000000ffffffff
 	uint8_t version_info[8];
 #define	ATA_ZDI_VER_VALID	0x8000000000000000
 #define	ATA_ZDI_VER_ZAC_SUP	0x0100000000000000
 #define	ATA_ZDI_VER_ZAC_MASK	0x00000000000000ff
 	uint8_t reserved[456];
 };
 
 /*
  * Argument of the IOCATAREQUEST ioctl.
  *
  * The union carries either a set of ATA register values (u.ata) or an
  * ATAPI command block plus sense data (u.atapi); the ATA_CMD_* bits in
  * "flags" describe the request.
  * NOTE(review): which fields the driver fills in on return (e.g. "error",
  * u.atapi.sense) is not visible here -- confirm against the ioctl handler.
  */
 struct ata_ioc_request {
     union {
 	struct {
 	    u_int8_t            command;
 	    u_int8_t            feature;
 	    u_int64_t           lba;
 	    u_int16_t           count;
 	} ata;
 	struct {
 	    char                ccb[16];
 	    struct atapi_sense	sense;
 	} atapi;
     } u;
     caddr_t             data;
     int                 count;
     int                 flags;		/* ATA_CMD_* bits below */
 #define ATA_CMD_CONTROL                 0x01
 #define ATA_CMD_READ                    0x02
 #define ATA_CMD_WRITE                   0x04
 #define ATA_CMD_ATAPI                   0x08
 
     int                 timeout;
     int                 error;
 };
 
 /*
  * Password block: a control word, a 32-byte password, a revision word and
  * reserved words (2 + 32 + 2 + 476 = 512 bytes, given 16-bit u_int16_t).
  * NOTE(review): presumably the data payload of the ATA SECURITY commands;
  * which ctrl bits apply to which command is not visible here -- confirm.
  */
 struct ata_security_password {
 	u_int16_t		ctrl;	/* ATA_SECURITY_* bits below */
 #define ATA_SECURITY_PASSWORD_USER	0x0000
 #define ATA_SECURITY_PASSWORD_MASTER	0x0001
 #define ATA_SECURITY_ERASE_NORMAL	0x0000
 #define ATA_SECURITY_ERASE_ENHANCED	0x0002
 #define ATA_SECURITY_LEVEL_HIGH		0x0000
 #define ATA_SECURITY_LEVEL_MAXIMUM	0x0100
 
 	u_int8_t		password[32];
 	u_int16_t		revision;
 	u_int16_t		reserved[238];
 };
 
 /* per-device ATA ioctl calls */
 #define IOCATAREQUEST           _IOWR('a', 100, struct ata_ioc_request)
 #define IOCATAGPARM             _IOR('a', 101, struct ata_params)
 #define IOCATAGMODE             _IOR('a', 102, int)
 #define IOCATASMODE             _IOW('a', 103, int)
 
 #define IOCATAGSPINDOWN		_IOR('a', 104, int)
 #define IOCATASSPINDOWN		_IOW('a', 105, int)
 
 
 /*
  * RAID array description; the argument type named by the IOCATARAIDCREATE
  * and IOCATARAIDADDSPARE ioctls.  "type" takes one of the AR_* array types
  * and "status" the AR_READY/AR_DEGRADED/AR_REBUILDING bits defined below.
  */
 struct ata_ioc_raid_config {
 	    int                 lun;
 	    int                 type;
 #define AR_JBOD                         0x0001
 #define AR_SPAN                         0x0002
 #define AR_RAID0                        0x0004
 #define AR_RAID1                        0x0008
 #define AR_RAID01                       0x0010
 #define AR_RAID3                        0x0020
 #define AR_RAID4                        0x0040
 #define AR_RAID5                        0x0080
 
 	    int                 interleave;
 	    int                 status;
 #define AR_READY                        1
 #define AR_DEGRADED                     2
 #define AR_REBUILDING                   4
 
 	    int                 progress;
 	    int                 total_disks;
 	    int                 disks[16];
 };
 
 /*
  * RAID array status; the argument type named by the IOCATARAIDSTATUS ioctl.
  * It has the same leading fields as struct ata_ioc_raid_config, but each of
  * the 16 disk slots carries a state word (AR_DISK_* bits) and a lun.
  */
 struct ata_ioc_raid_status {
 	    int                 lun;
 	    int                 type;
 	    int                 interleave;
 	    int                 status;
 	    int                 progress;
 	    int                 total_disks;
 	    struct {
 		    int		state;
 #define AR_DISK_ONLINE			0x01
 #define AR_DISK_PRESENT			0x02
 #define AR_DISK_SPARE			0x04
 		    int		lun;
 	    } disks[16];
 };
 
 /* ATA RAID ioctl calls */
 #define IOCATARAIDCREATE        _IOWR('a', 200, struct ata_ioc_raid_config)
 #define IOCATARAIDDELETE        _IOW('a', 201, int)
 #define IOCATARAIDSTATUS        _IOWR('a', 202, struct ata_ioc_raid_status)
 #define IOCATARAIDADDSPARE      _IOW('a', 203, struct ata_ioc_raid_config)
 #define IOCATARAIDREBUILD       _IOW('a', 204, int)
 
 #endif /* _SYS_ATA_H_ */
Index: projects/nfsv42/sys/ufs/ufs/ufs_vnops.c
===================================================================
--- projects/nfsv42/sys/ufs/ufs/ufs_vnops.c	(revision 350367)
+++ projects/nfsv42/sys/ufs/ufs/ufs_vnops.c	(revision 350368)
@@ -1,2795 +1,2802 @@
 /*-
  * SPDX-License-Identifier: BSD-3-Clause
  *
  * Copyright (c) 1982, 1986, 1989, 1993, 1995
  *	The Regents of the University of California.  All rights reserved.
  * (c) UNIX System Laboratories, Inc.
  * All or some portions of this file are derived from material licensed
  * to the University of California by American Telephone and Telegraph
  * Co. or Unix System Laboratories, Inc. and are reproduced herein with
  * the permission of UNIX System Laboratories, Inc.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  * 3. Neither the name of the University nor the names of its contributors
  *    may be used to endorse or promote products derived from this software
  *    without specific prior written permission.
  *
  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  *	@(#)ufs_vnops.c	8.27 (Berkeley) 5/27/95
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include "opt_quota.h"
 #include "opt_suiddir.h"
 #include "opt_ufs.h"
 #include "opt_ffs.h"
 
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/namei.h>
 #include <sys/kernel.h>
 #include <sys/fcntl.h>
 #include <sys/filio.h>
 #include <sys/stat.h>
 #include <sys/bio.h>
 #include <sys/buf.h>
 #include <sys/mount.h>
 #include <sys/priv.h>
 #include <sys/refcount.h>
 #include <sys/unistd.h>
 #include <sys/vnode.h>
 #include <sys/dirent.h>
 #include <sys/lockf.h>
 #include <sys/conf.h>
 #include <sys/acl.h>
 
 #include <security/mac/mac_framework.h>
 
 #include <sys/file.h>		/* XXX */
 
 #include <vm/vm.h>
 #include <vm/vm_extern.h>
 
 #include <ufs/ufs/acl.h>
 #include <ufs/ufs/extattr.h>
 #include <ufs/ufs/quota.h>
 #include <ufs/ufs/inode.h>
 #include <ufs/ufs/dir.h>
 #include <ufs/ufs/ufsmount.h>
 #include <ufs/ufs/ufs_extern.h>
 #ifdef UFS_DIRHASH
 #include <ufs/ufs/dirhash.h>
 #endif
 #ifdef UFS_GJOURNAL
 #include <ufs/ufs/gjournal.h>
 FEATURE(ufs_gjournal, "Journaling support through GEOM for UFS");
 #endif
 
 #ifdef QUOTA
 FEATURE(ufs_quota, "UFS disk quotas support");
 FEATURE(ufs_quota64, "64bit UFS disk quotas support");
 #endif
 
 #ifdef SUIDDIR
 FEATURE(suiddir,
     "Give all new files in directory the same ownership as the directory");
 #endif
 
 
 #include <ufs/ffs/ffs_extern.h>
 
 /*
  * Forward declarations of the file-local vnode operation handlers and
  * their helper routines.
  */
 static vop_accessx_t	ufs_accessx;
 static int ufs_chmod(struct vnode *, int, struct ucred *, struct thread *);
 static int ufs_chown(struct vnode *, uid_t, gid_t, struct ucred *, struct thread *);
 static vop_close_t	ufs_close;
 static vop_create_t	ufs_create;
 static vop_getattr_t	ufs_getattr;
 static vop_ioctl_t	ufs_ioctl;
 static vop_link_t	ufs_link;
 static int ufs_makeinode(int mode, struct vnode *, struct vnode **, struct componentname *, const char *);
 static vop_markatime_t	ufs_markatime;
 static vop_mkdir_t	ufs_mkdir;
 static vop_mknod_t	ufs_mknod;
 static vop_open_t	ufs_open;
 static vop_pathconf_t	ufs_pathconf;
 static vop_print_t	ufs_print;
 static vop_readlink_t	ufs_readlink;
 static vop_remove_t	ufs_remove;
 static vop_rename_t	ufs_rename;
 static vop_rmdir_t	ufs_rmdir;
 static vop_setattr_t	ufs_setattr;
 static vop_strategy_t	ufs_strategy;
 static vop_symlink_t	ufs_symlink;
 static vop_whiteout_t	ufs_whiteout;
 static vop_close_t	ufsfifo_close;
 static vop_kqfilter_t	ufsfifo_kqfilter;
 
 SYSCTL_NODE(_vfs, OID_AUTO, ufs, CTLFLAG_RD, 0, "UFS filesystem");
 
 /*
  * A virgin directory (no blushing please).
  *
  * Each template holds two entries, "." and "..".
  * NOTE(review): the initializers are positional; presumably 12 is the
  * record length of "." and DIRBLKSIZ - 12 lets ".." claim the rest of the
  * directory block, and omastertemplate is the variant for the older entry
  * format without the DT_DIR type field -- confirm against struct
  * dirtemplate and struct odirtemplate.
  */
 static struct dirtemplate mastertemplate = {
 	0, 12, DT_DIR, 1, ".",
 	0, DIRBLKSIZ - 12, DT_DIR, 2, ".."
 };
 static struct odirtemplate omastertemplate = {
 	0, 12, 1, ".",
 	0, DIRBLKSIZ - 12, 2, ".."
 };
 
 static void
 ufs_itimes_locked(struct vnode *vp)
 {
 	struct inode *ip;
 	struct timespec ts;
 
 	ASSERT_VI_LOCKED(vp, __func__);
 
 	ip = VTOI(vp);
 	if (UFS_RDONLY(ip))
 		goto out;
 	if ((ip->i_flag & (IN_ACCESS | IN_CHANGE | IN_UPDATE)) == 0)
 		return;
 
 	if ((vp->v_type == VBLK || vp->v_type == VCHR) && !DOINGSOFTDEP(vp))
 		ip->i_flag |= IN_LAZYMOD;
 	else if (((vp->v_mount->mnt_kern_flag &
 		    (MNTK_SUSPENDED | MNTK_SUSPEND)) == 0) ||
 		    (ip->i_flag & (IN_CHANGE | IN_UPDATE)))
 		ip->i_flag |= IN_MODIFIED;
 	else if (ip->i_flag & IN_ACCESS)
 		ip->i_flag |= IN_LAZYACCESS;
 	vfs_timestamp(&ts);
 	if (ip->i_flag & IN_ACCESS) {
 		DIP_SET(ip, i_atime, ts.tv_sec);
 		DIP_SET(ip, i_atimensec, ts.tv_nsec);
 	}
 	if (ip->i_flag & IN_UPDATE) {
 		DIP_SET(ip, i_mtime, ts.tv_sec);
 		DIP_SET(ip, i_mtimensec, ts.tv_nsec);
 	}
 	if (ip->i_flag & IN_CHANGE) {
 		DIP_SET(ip, i_ctime, ts.tv_sec);
 		DIP_SET(ip, i_ctimensec, ts.tv_nsec);
 		DIP_SET(ip, i_modrev, DIP(ip, i_modrev) + 1);
 	}
 
  out:
 	ip->i_flag &= ~(IN_ACCESS | IN_CHANGE | IN_UPDATE);
 }
 
 /*
  * Apply the pending timestamp updates for vp.  Wrapper around
  * ufs_itimes_locked() that takes and releases the vnode interlock itself.
  */
 void
 ufs_itimes(struct vnode *vp)
 {
 
 	VI_LOCK(vp);
 	ufs_itimes_locked(vp);
 	VI_UNLOCK(vp);
 }
 
 /*
  * Create a regular file
  */
 static int
 ufs_create(ap)
 	struct vop_create_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	int error;
 
 	error =
 	    ufs_makeinode(MAKEIMODE(ap->a_vap->va_type, ap->a_vap->va_mode),
 	    ap->a_dvp, ap->a_vpp, ap->a_cnp, "ufs_create");
 	if (error != 0)
 		return (error);
 	if ((ap->a_cnp->cn_flags & MAKEENTRY) != 0)
 		cache_enter(ap->a_dvp, *ap->a_vpp, ap->a_cnp);
 	return (0);
 }
 
 /*
  * Mknod vnode call
  */
 /* ARGSUSED */
 static int
 ufs_mknod(ap)
 	struct vop_mknod_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode **vpp = ap->a_vpp;
 	struct inode *ip;
 	ino_t ino;
 	int error;
 
 	error = ufs_makeinode(MAKEIMODE(vap->va_type, vap->va_mode),
 	    ap->a_dvp, vpp, ap->a_cnp, "ufs_mknod");
 	if (error)
 		return (error);
 	ip = VTOI(*vpp);
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	if (vap->va_rdev != VNOVAL) {
 		/*
 		 * Want to be able to use this to make badblock
 		 * inodes, so don't truncate the dev number.
 		 */
 		DIP_SET(ip, i_rdev, vap->va_rdev);
 	}
 	/*
 	 * Remove inode, then reload it through VFS_VGET so it is
 	 * checked to see if it is an alias of an existing entry in
 	 * the inode cache.  XXX I don't believe this is necessary now.
 	 */
 	(*vpp)->v_type = VNON;
 	ino = ip->i_number;	/* Save this before vgone() invalidates ip. */
 	vgone(*vpp);
 	vput(*vpp);
 	error = VFS_VGET(ap->a_dvp->v_mount, ino, LK_EXCLUSIVE, vpp);
 	if (error) {
 		*vpp = NULL;
 		return (error);
 	}
 	return (0);
 }
 
 /*
  * Open called.
  *
  * Character and block device vnodes are refused with EOPNOTSUPP.  A file
  * whose inode carries the APPEND flag may only be opened for writing when
  * O_APPEND is given as well, otherwise EPERM is returned.  On success the
  * VM object backing the vnode is set up with the current file size.
  */
 /* ARGSUSED */
 static int
 ufs_open(struct vop_open_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip;
 
 	switch (vp->v_type) {
 	case VCHR:
 	case VBLK:
 		return (EOPNOTSUPP);
 	default:
 		break;
 	}
 
 	ip = VTOI(vp);
 	/*
 	 * Files marked append-only must be opened for appending.
 	 */
 	if ((ip->i_flags & APPEND) != 0 &&
 	    (ap->a_mode & (FWRITE | O_APPEND)) == FWRITE)
 		return (EPERM);
 
 	vnode_create_vobject(vp, DIP(ip, i_size), ap->a_td);
 	return (0);
 }
 
 /*
  * Close called.
  *
  * Arguments (struct vop_close_args):
  *	a_vp	vnode being closed
  *	a_fflag, a_cred, a_td	unused here
  *
  * While holding the vnode interlock, apply the pending timestamp updates
  * to the inode, but only if the vnode's use count is greater than one.
  * Always returns 0.
  */
 /* ARGSUSED */
 static int
 ufs_close(struct vop_close_args *ap)
 {
 	struct vnode *vp = ap->a_vp;
 	int nusers;
 
 	VI_LOCK(vp);
 	nusers = vp->v_usecount;
 	if (nusers > 1) {
 		ufs_itimes_locked(vp);
 	}
 	VI_UNLOCK(vp);
 
 	return (0);
 }
 
 /*
  * Check whether credential a_cred may access vnode a_vp in the ways
  * requested by a_accmode.
  *
  * The checks run in this order: read-only file system (plus quota
  * attachment when QUOTA is configured), immutable/snapshot flags, and
  * finally either the ACL-based check (when UFS_ACL is configured and the
  * mount has ACLs enabled) or the plain mode-bit check done by vaccess().
  *
  * Returns 0 when access is granted, otherwise an errno value: EROFS,
  * EPERM, or whatever the quota, ACL or vaccess helpers report.
  */
 static int
 ufs_accessx(ap)
 	struct vop_accessx_args /* {
 		struct vnode *a_vp;
 		accmode_t a_accmode;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	accmode_t accmode = ap->a_accmode;
 	int error;
 #ifdef UFS_ACL
 	struct acl *acl;
 	acl_type_t type;
 #endif
 
 	/*
 	 * Disallow write attempts on read-only filesystems;
 	 * unless the file is a socket, fifo, or a block or
 	 * character device resident on the filesystem.
 	 */
 	if (accmode & VMODIFY_PERMS) {
 		switch (vp->v_type) {
 		case VDIR:
 		case VLNK:
 		case VREG:
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 #ifdef QUOTA
 			/*
 			 * Inode is accounted in the quotas only if struct
 			 * dquot is attached to it. VOP_ACCESS() is called
 			 * from vn_open_cred() and provides a convenient
 			 * point to call getinoquota().  The lock mode is
 			 * exclusive when the file is opening for write.
 			 */
 			if (VOP_ISLOCKED(vp) == LK_EXCLUSIVE) {
 				error = getinoquota(ip);
 				if (error != 0)
 					return (error);
 			}
 #endif
 			break;
 		default:
 			break;
 		}
 	}
 
 	/*
 	 * If immutable bit set, nobody gets to write it.  "& ~VADMIN_PERMS"
 	 * permits the owner of the file to remove the IMMUTABLE flag.
 	 */
 	if ((accmode & (VMODIFY_PERMS & ~VADMIN_PERMS)) &&
 	    (ip->i_flags & (IMMUTABLE | SF_SNAPSHOT)))
 		return (EPERM);
 
 #ifdef UFS_ACL
 	if ((vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS)) != 0) {
 		/* NFSv4 ACLs take precedence when both mount flags are set. */
 		if (vp->v_mount->mnt_flag & MNT_NFS4ACLS)
 			type = ACL_TYPE_NFS4;
 		else
 			type = ACL_TYPE_ACCESS;
 
 		acl = acl_alloc(M_WAITOK);
 		if (type == ACL_TYPE_NFS4)
 			error = ufs_getacl_nfs4_internal(vp, acl, ap->a_td);
 		else
 			error = VOP_GETACL(vp, type, acl, ap->a_cred, ap->a_td);
 		switch (error) {
 		case 0:
 			/* The ACL was read; evaluate the request against it. */
 			if (type == ACL_TYPE_NFS4) {
 				error = vaccess_acl_nfs4(vp->v_type, ip->i_uid,
 				    ip->i_gid, acl, accmode, ap->a_cred, NULL);
 			} else {
 				error = vfs_unixify_accmode(&accmode);
 				if (error == 0)
 					error = vaccess_acl_posix1e(vp->v_type, ip->i_uid,
 					    ip->i_gid, acl, accmode, ap->a_cred, NULL);
 			}
 			break;
 		default:
 			/* EOPNOTSUPP is not reported; other errors are logged. */
 			if (error != EOPNOTSUPP)
 				printf(
 "ufs_accessx(): Error retrieving ACL on object (%d).\n",
 				    error);
 			/*
 			 * XXX: Fall back until debugged.  Should
 			 * eventually possibly log an error, and return
 			 * EPERM for safety.
 			 */
 			error = vfs_unixify_accmode(&accmode);
 			if (error == 0)
 				error = vaccess(vp->v_type, ip->i_mode, ip->i_uid,
 				    ip->i_gid, accmode, ap->a_cred, NULL);
 		}
 		acl_free(acl);
 
 		return (error);
 	}
 #endif /* UFS_ACL */
 	/* No ACLs in use: fall back to the traditional mode-bit check. */
 	error = vfs_unixify_accmode(&accmode);
 	if (error == 0)
 		error = vaccess(vp->v_type, ip->i_mode, ip->i_uid, ip->i_gid,
 		    accmode, ap->a_cred, NULL);
 	return (error);
 }
 
 /* ARGSUSED */
 static int
 ufs_getattr(ap)
 	struct vop_getattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct vattr *vap = ap->a_vap;
 
 	VI_LOCK(vp);
 	ufs_itimes_locked(vp);
 	if (I_IS_UFS1(ip)) {
 		vap->va_atime.tv_sec = ip->i_din1->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din1->di_atimensec;
 	} else {
 		vap->va_atime.tv_sec = ip->i_din2->di_atime;
 		vap->va_atime.tv_nsec = ip->i_din2->di_atimensec;
 	}
 	VI_UNLOCK(vp);
 	/*
 	 * Copy from inode table
 	 */
 	vap->va_fsid = dev2udev(ITOUMP(ip)->um_dev);
 	vap->va_fileid = ip->i_number;
 	vap->va_mode = ip->i_mode & ~IFMT;
 	vap->va_nlink = ip->i_effnlink;
 	vap->va_uid = ip->i_uid;
 	vap->va_gid = ip->i_gid;
 	if (I_IS_UFS1(ip)) {
 		vap->va_rdev = ip->i_din1->di_rdev;
 		vap->va_size = ip->i_din1->di_size;
 		vap->va_mtime.tv_sec = ip->i_din1->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din1->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din1->di_ctime;
 		vap->va_ctime.tv_nsec = ip->i_din1->di_ctimensec;
 		vap->va_bytes = dbtob((u_quad_t)ip->i_din1->di_blocks);
 		vap->va_filerev = ip->i_din1->di_modrev;
 	} else {
 		vap->va_rdev = ip->i_din2->di_rdev;
 		vap->va_size = ip->i_din2->di_size;
 		vap->va_mtime.tv_sec = ip->i_din2->di_mtime;
 		vap->va_mtime.tv_nsec = ip->i_din2->di_mtimensec;
 		vap->va_ctime.tv_sec = ip->i_din2->di_ctime;
 		vap->va_ctime.tv_nsec = ip->i_din2->di_ctimensec;
 		vap->va_birthtime.tv_sec = ip->i_din2->di_birthtime;
 		vap->va_birthtime.tv_nsec = ip->i_din2->di_birthnsec;
 		vap->va_bytes = dbtob((u_quad_t)ip->i_din2->di_blocks);
 		vap->va_filerev = ip->i_din2->di_modrev;
 	}
 	vap->va_flags = ip->i_flags;
 	vap->va_gen = ip->i_gen;
 	vap->va_blocksize = vp->v_mount->mnt_stat.f_iosize;
 	vap->va_type = IFTOVT(ip->i_mode);
 	return (0);
 }
 
 /*
  * Set attribute vnode op. called from several syscalls
  */
 static int
 ufs_setattr(ap)
 	struct vop_setattr_args /* {
 		struct vnode *a_vp;
 		struct vattr *a_vap;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vattr *vap = ap->a_vap;
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 	struct ucred *cred = ap->a_cred;
 	struct thread *td = curthread;
 	int error;
 
 	/*
 	 * Check for unsettable attributes.
 	 */
 	if ((vap->va_type != VNON) || (vap->va_nlink != VNOVAL) ||
 	    (vap->va_fsid != VNOVAL) || (vap->va_fileid != VNOVAL) ||
 	    (vap->va_blocksize != VNOVAL) || (vap->va_rdev != VNOVAL) ||
 	    ((int)vap->va_bytes != VNOVAL) || (vap->va_gen != VNOVAL)) {
 		return (EINVAL);
 	}
 	if (vap->va_flags != VNOVAL) {
 		if ((vap->va_flags & ~(SF_APPEND | SF_ARCHIVED | SF_IMMUTABLE |
 		    SF_NOUNLINK | SF_SNAPSHOT | UF_APPEND | UF_ARCHIVE |
 		    UF_HIDDEN | UF_IMMUTABLE | UF_NODUMP | UF_NOUNLINK |
 		    UF_OFFLINE | UF_OPAQUE | UF_READONLY | UF_REPARSE |
 		    UF_SPARSE | UF_SYSTEM)) != 0)
 			return (EOPNOTSUPP);
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		/*
 		 * Callers may only modify the file flags on objects they
 		 * have VADMIN rights for.
 		 */
 		if ((error = VOP_ACCESS(vp, VADMIN, cred, td)))
 			return (error);
 		/*
 		 * Unprivileged processes are not permitted to unset system
 		 * flags, or modify flags if any system flags are set.
 		 * Privileged non-jail processes may not modify system flags
 		 * if securelevel > 0 and any existing system flags are set.
 		 * Privileged jail processes behave like privileged non-jail
 		 * processes if the PR_ALLOW_CHFLAGS permission bit is set;
 		 * otherwise, they behave like unprivileged processes.
 		 */
 		if (!priv_check_cred(cred, PRIV_VFS_SYSFLAGS)) {
 			if (ip->i_flags &
 			    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND)) {
 				error = securelevel_gt(cred, 0);
 				if (error)
 					return (error);
 			}
 			/* The snapshot flag cannot be toggled. */
 			if ((vap->va_flags ^ ip->i_flags) & SF_SNAPSHOT)
 				return (EPERM);
 		} else {
 			if (ip->i_flags &
 			    (SF_NOUNLINK | SF_IMMUTABLE | SF_APPEND) ||
 			    ((vap->va_flags ^ ip->i_flags) & SF_SETTABLE))
 				return (EPERM);
 		}
 		ip->i_flags = vap->va_flags;
 		DIP_SET(ip, i_flags, vap->va_flags);
 		ip->i_flag |= IN_CHANGE;
 		error = UFS_UPDATE(vp, 0);
 		if (ip->i_flags & (IMMUTABLE | APPEND))
 			return (error);
 	}
 	/*
 	 * If immutable or append, no one can change any of its attributes
 	 * except the ones already handled (in some cases, file flags
 	 * including the immutability flags themselves for the superuser).
 	 */
 	if (ip->i_flags & (IMMUTABLE | APPEND))
 		return (EPERM);
 	/*
 	 * Go through the fields and update iff not VNOVAL.
 	 */
 	if (vap->va_uid != (uid_t)VNOVAL || vap->va_gid != (gid_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((error = ufs_chown(vp, vap->va_uid, vap->va_gid, cred,
 		    td)) != 0)
 			return (error);
 	}
 	if (vap->va_size != VNOVAL) {
 		/*
 		 * XXX most of the following special cases should be in
 		 * callers instead of in N filesystems.  The VDIR check
 		 * mostly already is.
 		 */
 		switch (vp->v_type) {
 		case VDIR:
 			return (EISDIR);
 		case VLNK:
 		case VREG:
 			/*
 			 * Truncation should have an effect in these cases.
 			 * Disallow it if the filesystem is read-only or
 			 * the file is being snapshotted.
 			 */
 			if (vp->v_mount->mnt_flag & MNT_RDONLY)
 				return (EROFS);
 			if ((ip->i_flags & SF_SNAPSHOT) != 0)
 				return (EPERM);
 			break;
 		default:
 			/*
 			 * According to POSIX, the result is unspecified
 			 * for file types other than regular files,
 			 * directories and shared memory objects.  We
 			 * don't support shared memory objects in the file
 			 * system, and have dubious support for truncating
 			 * symlinks.  Just ignore the request in other cases.
 			 */
 			return (0);
 		}
 		if ((error = UFS_TRUNCATE(vp, vap->va_size, IO_NORMAL |
 		    ((vap->va_vaflags & VA_SYNC) != 0 ? IO_SYNC : 0),
 		    cred)) != 0)
 			return (error);
 	}
 	if (vap->va_atime.tv_sec != VNOVAL ||
 	    vap->va_mtime.tv_sec != VNOVAL ||
 	    vap->va_birthtime.tv_sec != VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((ip->i_flags & SF_SNAPSHOT) != 0)
 			return (EPERM);
 		error = vn_utimes_perm(vp, vap, cred, td);
 		if (error != 0)
 			return (error);
 		ip->i_flag |= IN_CHANGE | IN_MODIFIED;
 		if (vap->va_atime.tv_sec != VNOVAL) {
 			ip->i_flag &= ~IN_ACCESS;
 			DIP_SET(ip, i_atime, vap->va_atime.tv_sec);
 			DIP_SET(ip, i_atimensec, vap->va_atime.tv_nsec);
 		}
 		if (vap->va_mtime.tv_sec != VNOVAL) {
 			ip->i_flag &= ~IN_UPDATE;
 			DIP_SET(ip, i_mtime, vap->va_mtime.tv_sec);
 			DIP_SET(ip, i_mtimensec, vap->va_mtime.tv_nsec);
 		}
 		if (vap->va_birthtime.tv_sec != VNOVAL && I_IS_UFS2(ip)) {
 			ip->i_din2->di_birthtime = vap->va_birthtime.tv_sec;
 			ip->i_din2->di_birthnsec = vap->va_birthtime.tv_nsec;
 		}
 		error = UFS_UPDATE(vp, 0);
 		if (error)
 			return (error);
 	}
 	error = 0;
 	if (vap->va_mode != (mode_t)VNOVAL) {
 		if (vp->v_mount->mnt_flag & MNT_RDONLY)
 			return (EROFS);
 		if ((ip->i_flags & SF_SNAPSHOT) != 0 && (vap->va_mode &
 		   (S_IXUSR | S_IWUSR | S_IXGRP | S_IWGRP | S_IXOTH | S_IWOTH)))
 			return (EPERM);
 		error = ufs_chmod(vp, (int)vap->va_mode, cred, td);
 	}
 	return (error);
 }
 
 #ifdef UFS_ACL
 /*
  * Re-derive the NFSv4 ACL of vp after its mode has changed.
  *
  * The current ACL is read with ufs_getacl_nfs4_internal(), adjusted by
  * acl_nfs4_sync_acl_from_mode() for the given mode and owner id, and
  * written back with ufs_setacl_nfs4_internal().  Returns 0, or the error
  * reported by the read or write step.  The cred argument is not used.
  */
 static int
 ufs_update_nfs4_acl_after_mode_change(struct vnode *vp, int mode,
     int file_owner_id, struct ucred *cred, struct thread *td)
 {
 	struct acl *aclp;
 	int error;
 
 	aclp = acl_alloc(M_WAITOK);
 	/*
 	 * EOPNOTSUPP needs no special handling here, as the filesystem
 	 * claims it supports ACLs.
 	 */
 	error = ufs_getacl_nfs4_internal(vp, aclp, td);
 	if (error == 0) {
 		acl_nfs4_sync_acl_from_mode(aclp, mode, file_owner_id);
 		error = ufs_setacl_nfs4_internal(vp, aclp, td);
 	}
 	acl_free(aclp);
 	return (error);
 }
 #endif /* UFS_ACL */
 
 /*
  * Flag this file's access time as needing an update, on behalf of
  * vfs_mark_atime().  This is called from execve() and mmap().
  * Always returns 0.
  */
 static int
 ufs_markatime(ap)
 	struct vop_markatime_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct inode *ip;
 
 	vp = ap->a_vp;
 	ip = VTOI(vp);
 
 	/* IN_ACCESS is set between VI_LOCK() and VI_UNLOCK(). */
 	VI_LOCK(vp);
 	ip->i_flag |= IN_ACCESS;
 	VI_UNLOCK(vp);
 	/*
 	 * XXXKIB The inode is only flagged; there is no
 	 * UFS_UPDATE(ap->a_vp, 0) here.
 	 */
 	return (0);
 }
 
 /*
  * Change the mode on a file.
  * Inode must be locked before calling.
  */
 static int
 ufs_chmod(vp, mode, cred, td)
 	struct vnode *vp;
 	int mode;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct inode *ip = VTOI(vp);
 	int error;
 
 	/*
 	 * To modify the permissions on a file, must possess VADMIN
 	 * for that file.
 	 */
 	if ((error = VOP_ACCESSX(vp, VWRITE_ACL, cred, td)))
 		return (error);
 	/*
 	 * Privileged processes may set the sticky bit on non-directories,
 	 * as well as set the setgid bit on a file with a group that the
 	 * process is not a member of.  Both of these are allowed in
 	 * jail(8).
 	 */
 	if (vp->v_type != VDIR && (mode & S_ISTXT)) {
 		if (priv_check_cred(cred, PRIV_VFS_STICKYFILE))
 			return (EFTYPE);
 	}
 	if (!groupmember(ip->i_gid, cred) && (mode & ISGID)) {
 		error = priv_check_cred(cred, PRIV_VFS_SETGID);
 		if (error)
 			return (error);
 	}
 
 	/*
 	 * Deny setting setuid if we are not the file owner.
 	 */
 	if ((mode & ISUID) && ip->i_uid != cred->cr_uid) {
 		error = priv_check_cred(cred, PRIV_VFS_ADMIN);
 		if (error)
 			return (error);
 	}
 
 	ip->i_mode &= ~ALLPERMS;
 	ip->i_mode |= (mode & ALLPERMS);
 	DIP_SET(ip, i_mode, ip->i_mode);
 	ip->i_flag |= IN_CHANGE;
 #ifdef UFS_ACL
 	if ((vp->v_mount->mnt_flag & MNT_NFS4ACLS) != 0)
 		error = ufs_update_nfs4_acl_after_mode_change(vp, mode, ip->i_uid, cred, td);
 #endif
 	if (error == 0 && (ip->i_flag & IN_CHANGE) != 0)
 		error = UFS_UPDATE(vp, 0);
 
 	return (error);
 }
 
 /*
  * Perform chown operation on inode ip;
  * inode must be locked prior to call.
  */
 static int
 ufs_chown(vp, uid, gid, cred, td)
 	struct vnode *vp;
 	uid_t uid;
 	gid_t gid;
 	struct ucred *cred;
 	struct thread *td;
 {
 	struct inode *ip = VTOI(vp);
 	uid_t ouid;
 	gid_t ogid;
 	int error = 0;
 #ifdef QUOTA
 	int i;
 	ufs2_daddr_t change;
 #endif
 
 	if (uid == (uid_t)VNOVAL)
 		uid = ip->i_uid;
 	if (gid == (gid_t)VNOVAL)
 		gid = ip->i_gid;
 	/*
 	 * To modify the ownership of a file, must possess VADMIN for that
 	 * file.
 	 */
 	if ((error = VOP_ACCESSX(vp, VWRITE_OWNER, cred, td)))
 		return (error);
 	/*
 	 * To change the owner of a file, or change the group of a file to a
 	 * group of which we are not a member, the caller must have
 	 * privilege.
 	 */
 	if (((uid != ip->i_uid && uid != cred->cr_uid) || 
 	    (gid != ip->i_gid && !groupmember(gid, cred))) &&
 	    (error = priv_check_cred(cred, PRIV_VFS_CHOWN)))
 		return (error);
 	ogid = ip->i_gid;
 	ouid = ip->i_uid;
 #ifdef QUOTA
 	if ((error = getinoquota(ip)) != 0)
 		return (error);
 	if (ouid == uid) {
 		dqrele(vp, ip->i_dquot[USRQUOTA]);
 		ip->i_dquot[USRQUOTA] = NODQUOT;
 	}
 	if (ogid == gid) {
 		dqrele(vp, ip->i_dquot[GRPQUOTA]);
 		ip->i_dquot[GRPQUOTA] = NODQUOT;
 	}
 	change = DIP(ip, i_blocks);
 	(void) chkdq(ip, -change, cred, CHOWN);
 	(void) chkiq(ip, -1, cred, CHOWN);
 	for (i = 0; i < MAXQUOTAS; i++) {
 		dqrele(vp, ip->i_dquot[i]);
 		ip->i_dquot[i] = NODQUOT;
 	}
 #endif
 	ip->i_gid = gid;
 	DIP_SET(ip, i_gid, gid);
 	ip->i_uid = uid;
 	DIP_SET(ip, i_uid, uid);
 #ifdef QUOTA
 	if ((error = getinoquota(ip)) == 0) {
 		if (ouid == uid) {
 			dqrele(vp, ip->i_dquot[USRQUOTA]);
 			ip->i_dquot[USRQUOTA] = NODQUOT;
 		}
 		if (ogid == gid) {
 			dqrele(vp, ip->i_dquot[GRPQUOTA]);
 			ip->i_dquot[GRPQUOTA] = NODQUOT;
 		}
 		if ((error = chkdq(ip, change, cred, CHOWN)) == 0) {
 			if ((error = chkiq(ip, 1, cred, CHOWN)) == 0)
 				goto good;
 			else
 				(void) chkdq(ip, -change, cred, CHOWN|FORCE);
 		}
 		for (i = 0; i < MAXQUOTAS; i++) {
 			dqrele(vp, ip->i_dquot[i]);
 			ip->i_dquot[i] = NODQUOT;
 		}
 	}
 	ip->i_gid = ogid;
 	DIP_SET(ip, i_gid, ogid);
 	ip->i_uid = ouid;
 	DIP_SET(ip, i_uid, ouid);
 	if (getinoquota(ip) == 0) {
 		if (ouid == uid) {
 			dqrele(vp, ip->i_dquot[USRQUOTA]);
 			ip->i_dquot[USRQUOTA] = NODQUOT;
 		}
 		if (ogid == gid) {
 			dqrele(vp, ip->i_dquot[GRPQUOTA]);
 			ip->i_dquot[GRPQUOTA] = NODQUOT;
 		}
 		(void) chkdq(ip, change, cred, FORCE|CHOWN);
 		(void) chkiq(ip, 1, cred, FORCE|CHOWN);
 		(void) getinoquota(ip);
 	}
 	return (error);
 good:
 	if (getinoquota(ip))
 		panic("ufs_chown: lost quota");
 #endif /* QUOTA */
 	ip->i_flag |= IN_CHANGE;
 	if ((ip->i_mode & (ISUID | ISGID)) && (ouid != uid || ogid != gid)) {
 		if (priv_check_cred(cred, PRIV_VFS_RETAINSUGID)) {
 			ip->i_mode &= ~(ISUID | ISGID);
 			DIP_SET(ip, i_mode, ip->i_mode);
 		}
 	}
 	error = UFS_UPDATE(vp, 0);
 	return (error);
 }
 
 /*
  * Remove the directory entry described by a_cnp, which names a_vp,
  * from directory a_dvp.
  *
  * Returns EPERM when the file has any of NOUNLINK, IMMUTABLE or APPEND
  * set, or when the directory has APPEND set; otherwise returns the
  * result of ufs_dirremove().
  */
 static int
 ufs_remove(ap)
 	struct vop_remove_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *dvp, *vp;
 	struct inode *ip;
 	struct thread *td;
 	int error;
 
 	td = curthread;
 	vp = ap->a_vp;
 	dvp = ap->a_dvp;
 	ip = VTOI(vp);
 
 	if ((ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) != 0 ||
 	    (VTOI(dvp)->i_flags & APPEND) != 0)
 		return (EPERM);
 #ifdef UFS_GJOURNAL
 	ufs_gjournal_orphan(vp);
 #endif
 	error = ufs_dirremove(dvp, ip, ap->a_cnp->cn_flags, 0);
 	/* Once no links remain the vnode is marked VV_NOSYNC. */
 	if (ip->i_nlink <= 0)
 		vp->v_vflag |= VV_NOSYNC;
 	if ((ip->i_flags & SF_SNAPSHOT) != 0) {
 		/*
 		 * Avoid a deadlock where another thread is trying to
 		 * update the inodeblock for dvp and is waiting on
 		 * snaplk.  Temporarily drop the vnode lock of the
 		 * unlinked file and sync the directory.  This should
 		 * allow vput() of the directory to not block later on
 		 * while holding the snapshot vnode locked, assuming
 		 * that the directory hasn't been unlinked too.
 		 */
 		VOP_UNLOCK(vp, 0);
 		(void)VOP_FSYNC(dvp, MNT_WAIT, td);
 		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
 	}
 	return (error);
 }
 
 static void
 print_bad_link_count(const char *funcname, struct vnode *dvp)
 {
 	struct inode *dip;
 
 	dip = VTOI(dvp);
 	uprintf("%s: Bad link count %d on parent inode %jd in file system %s\n",
 	    funcname, dip->i_effnlink, (intmax_t)dip->i_number,
 	    dvp->v_mount->mnt_stat.f_mntonname);
 }
 
 /*
  * link vnode call
  *
  * Enter a new name (a_cnp) for a_vp in directory a_tdvp.  Returns:
  *	EINVAL	the target directory's effective link count is below 2
  *	EMLINK	the file already has UFS_LINK_MAX links
  *	ENOENT	the file's effective link count is 0
  *	EPERM	the file is IMMUTABLE or APPEND
  * or the error from UFS_UPDATE() / ufs_direnter(); 0 on success.
  */
 static int
 ufs_link(ap)
 	struct vop_link_args /* {
 		struct vnode *a_tdvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct componentname *cnp;
 	struct vnode *tdvp, *vp;
 	struct inode *ip;
 	struct direct newdir;
 	int error;
 
 	vp = ap->a_vp;
 	tdvp = ap->a_tdvp;
 	cnp = ap->a_cnp;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ufs_link: no name");
 #endif
 	if (VTOI(tdvp)->i_effnlink < 2) {
 		print_bad_link_count("ufs_link", tdvp);
 		return (EINVAL);
 	}
 	ip = VTOI(vp);
 	if (ip->i_nlink >= UFS_LINK_MAX)
 		return (EMLINK);
 	/*
 	 * The file may have been removed after namei dropped the original
 	 * lock.
 	 */
 	if (ip->i_effnlink == 0)
 		return (ENOENT);
 	if ((ip->i_flags & (IMMUTABLE | APPEND)) != 0)
 		return (EPERM);
 
 	/* Count the new link first, then create the directory entry. */
 	ip->i_effnlink++;
 	ip->i_nlink++;
 	DIP_SET(ip, i_nlink, ip->i_nlink);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(vp))
 		softdep_setup_link(VTOI(tdvp), ip);
 	error = UFS_UPDATE(vp, !DOINGSOFTDEP(vp) && !DOINGASYNC(vp));
 	if (error == 0) {
 		ufs_makedirentry(ip, cnp, &newdir);
 		error = ufs_direnter(tdvp, vp, &newdir, cnp, NULL, 0);
 	}
 
 	/* On any failure take the link count bump back. */
 	if (error != 0) {
 		ip->i_effnlink--;
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(vp))
 			softdep_revert_link(VTOI(tdvp), ip);
 	}
 	return (error);
 }
 
 /*
  * whiteout vnode call
  *
  * a_flags selects the operation on directory a_dvp:
  *	LOOKUP	report whether whiteouts are supported (0 or EOPNOTSUPP)
  *	CREATE	enter a whiteout entry named by a_cnp
  *	DELETE	remove an existing whiteout entry
  * Any other value panics.
  */
 static int
 ufs_whiteout(ap)
 	struct vop_whiteout_args /* {
 		struct vnode *a_dvp;
 		struct componentname *a_cnp;
 		int a_flags;
 	} */ *ap;
 {
 	struct componentname *cnp;
 	struct vnode *dvp;
 	struct direct newdir;
 	int error;
 
 	dvp = ap->a_dvp;
 	cnp = ap->a_cnp;
 	error = 0;
 
 	switch (ap->a_flags) {
 	case LOOKUP:
 		/* 4.4 format directories support whiteout operations */
 		if (dvp->v_mount->mnt_maxsymlinklen <= 0)
 			return (EOPNOTSUPP);
 		return (0);
 
 	case CREATE:
 		/* create a new directory whiteout */
 #ifdef INVARIANTS
 		if ((cnp->cn_flags & SAVENAME) == 0)
 			panic("ufs_whiteout: missing name");
 		if (dvp->v_mount->mnt_maxsymlinklen <= 0)
 			panic("ufs_whiteout: old format filesystem");
 #endif
 
 		/* The entry uses inode UFS_WINO and type DT_WHT. */
 		newdir.d_ino = UFS_WINO;
 		newdir.d_type = DT_WHT;
 		newdir.d_namlen = cnp->cn_namelen;
 		/* Copy cn_namelen + 1 bytes of the name. */
 		bcopy(cnp->cn_nameptr, newdir.d_name,
 		    (unsigned)cnp->cn_namelen + 1);
 		error = ufs_direnter(dvp, NULL, &newdir, cnp, NULL, 0);
 		break;
 
 	case DELETE:
 		/* remove an existing directory whiteout */
 #ifdef INVARIANTS
 		if (dvp->v_mount->mnt_maxsymlinklen <= 0)
 			panic("ufs_whiteout: old format filesystem");
 #endif
 
 		cnp->cn_flags &= ~DOWHITEOUT;
 		error = ufs_dirremove(dvp, NULL, cnp->cn_flags, 0);
 		break;
 
 	default:
 		panic("ufs_whiteout: unknown op");
 	}
 	return (error);
 }
 
 /*
  * Count of ufs_rename() restarts.  ufs_rename() bumps it with
  * atomic_add_int() each time it jumps back to its relock label.  The
  * counter is exported through SYSCTL_INT with CTLFLAG_RD under the
  * _vfs_ufs node; __DEVOLATILE strips the volatile qualifier for the
  * sysctl pointer argument.
  */
 static volatile int rename_restarts;
 SYSCTL_INT(_vfs_ufs, OID_AUTO, rename_restarts, CTLFLAG_RD,
     __DEVOLATILE(int *, &rename_restarts), 0,
     "Times rename had to restart due to lock contention");
 
 /*
  * Rename system call.
  * 	rename("foo", "bar");
  * is essentially
  *	unlink("bar");
  *	link("foo", "bar");
  *	unlink("foo");
  * but ``atomically''.  Can't do full commit without saving state in the
  * inode on disk which isn't feasible at this time.  Best we can do is
  * always guarantee the target exists.
  *
  * Basic algorithm is:
  *
  * 1) Bump link count on source while we're linking it to the
  *    target.  This also ensure the inode won't be deleted out
  *    from underneath us while we work (it may be truncated by
  *    a concurrent `trunc' or `open' for creation).
  * 2) Link source to destination.  If destination already exists,
  *    delete it first.
  * 3) Unlink source reference to inode if still around. If a
  *    directory was moved and the parent of the destination
  *    is different from the source, patch the ".." entry in the
  *    directory.
  */
 static int
 ufs_rename(ap)
 	struct vop_rename_args  /* {
 		struct vnode *a_fdvp;
 		struct vnode *a_fvp;
 		struct componentname *a_fcnp;
 		struct vnode *a_tdvp;
 		struct vnode *a_tvp;
 		struct componentname *a_tcnp;
 	} */ *ap;
 {
 	struct vnode *tvp = ap->a_tvp;
 	struct vnode *tdvp = ap->a_tdvp;
 	struct vnode *fvp = ap->a_fvp;
 	struct vnode *fdvp = ap->a_fdvp;
 	struct vnode *nvp;
 	struct componentname *tcnp = ap->a_tcnp;
 	struct componentname *fcnp = ap->a_fcnp;
 	struct thread *td = fcnp->cn_thread;
 	struct inode *fip, *tip, *tdp, *fdp;
 	struct direct newdir;
 	off_t endoff;
 	int doingdirectory, newparent;
 	int error = 0;
 	struct mount *mp;
 	ino_t ino;
 
 #ifdef INVARIANTS
 	if ((tcnp->cn_flags & HASBUF) == 0 ||
 	    (fcnp->cn_flags & HASBUF) == 0)
 		panic("ufs_rename: no name");
 #endif
 	endoff = 0;
 	mp = tdvp->v_mount;
 	VOP_UNLOCK(tdvp, 0);
 	if (tvp && tvp != tdvp)
 		VOP_UNLOCK(tvp, 0);
 	/*
 	 * Check for cross-device rename.
 	 */
 	if ((fvp->v_mount != tdvp->v_mount) ||
 	    (tvp && (fvp->v_mount != tvp->v_mount))) {
 		error = EXDEV;
 		mp = NULL;
 		goto releout;
 	}
 relock:
 	/* 
 	 * We need to acquire 2 to 4 locks depending on whether tvp is NULL
 	 * and fdvp and tdvp are the same directory.  Subsequently we need
 	 * to double-check all paths and in the directory rename case we
 	 * need to verify that we are not creating a directory loop.  To
 	 * handle this we acquire all but fdvp using non-blocking
 	 * acquisitions.  If we fail to acquire any lock in the path we will
 	 * drop all held locks, acquire the new lock in a blocking fashion,
 	 * and then release it and restart the rename.  This acquire/release
 	 * step ensures that we do not spin on a lock waiting for release.
 	 */
 	error = vn_lock(fdvp, LK_EXCLUSIVE);
 	if (error)
 		goto releout;
 	if (vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT) != 0) {
 		VOP_UNLOCK(fdvp, 0);
 		error = vn_lock(tdvp, LK_EXCLUSIVE);
 		if (error)
 			goto releout;
 		VOP_UNLOCK(tdvp, 0);
 		atomic_add_int(&rename_restarts, 1);
 		goto relock;
 	}
 	/*
 	 * Re-resolve fvp to be certain it still exists and fetch the
 	 * correct vnode.
 	 */
 	error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
 	if (error) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		goto releout;
 	}
 	error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
 	if (error) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		if (error != EBUSY)
 			goto releout;
 		error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
 		if (error != 0)
 			goto releout;
 		VOP_UNLOCK(nvp, 0);
 		vrele(fvp);
 		fvp = nvp;
 		atomic_add_int(&rename_restarts, 1);
 		goto relock;
 	}
 	vrele(fvp);
 	fvp = nvp;
 	/*
 	 * Re-resolve tvp and acquire the vnode lock if present.
 	 */
 	error = ufs_lookup_ino(tdvp, NULL, tcnp, &ino);
 	if (error != 0 && error != EJUSTRETURN) {
 		VOP_UNLOCK(fdvp, 0);
 		VOP_UNLOCK(tdvp, 0);
 		VOP_UNLOCK(fvp, 0);
 		goto releout;
 	}
 	/*
 	 * If tvp disappeared we just carry on.
 	 */
 	if (error == EJUSTRETURN && tvp != NULL) {
 		vrele(tvp);
 		tvp = NULL;
 	}
 	/*
 	 * Get the tvp ino if the lookup succeeded.  We may have to restart
 	 * if the non-blocking acquire fails.
 	 */
 	if (error == 0) {
 		nvp = NULL;
 		error = VFS_VGET(mp, ino, LK_EXCLUSIVE | LK_NOWAIT, &nvp);
 		if (tvp)
 			vrele(tvp);
 		tvp = nvp;
 		if (error) {
 			VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(tdvp, 0);
 			VOP_UNLOCK(fvp, 0);
 			if (error != EBUSY)
 				goto releout;
 			error = VFS_VGET(mp, ino, LK_EXCLUSIVE, &nvp);
 			if (error != 0)
 				goto releout;
 			vput(nvp);
 			atomic_add_int(&rename_restarts, 1);
 			goto relock;
 		}
 	}
 	fdp = VTOI(fdvp);
 	fip = VTOI(fvp);
 	tdp = VTOI(tdvp);
 	tip = NULL;
 	if (tvp)
 		tip = VTOI(tvp);
 	if (tvp && ((VTOI(tvp)->i_flags & (NOUNLINK | IMMUTABLE | APPEND)) ||
 	    (VTOI(tdvp)->i_flags & APPEND))) {
 		error = EPERM;
 		goto unlockout;
 	}
 	/*
 	 * Renaming a file to itself has no effect.  The upper layers should
 	 * not call us in that case.  However, things could change after
 	 * we drop the locks above.
 	 */
 	if (fvp == tvp) {
 		error = 0;
 		goto unlockout;
 	}
 	doingdirectory = 0;
 	newparent = 0;
 	ino = fip->i_number;
 	if (fip->i_nlink >= UFS_LINK_MAX) {
 		error = EMLINK;
 		goto unlockout;
 	}
 	if ((fip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))
 	    || (fdp->i_flags & APPEND)) {
 		error = EPERM;
 		goto unlockout;
 	}
 	if ((fip->i_mode & IFMT) == IFDIR) {
 		/*
 		 * Avoid ".", "..", and aliases of "." for obvious reasons.
 		 */
 		if ((fcnp->cn_namelen == 1 && fcnp->cn_nameptr[0] == '.') ||
 		    fdp == fip ||
 		    (fcnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) {
 			error = EINVAL;
 			goto unlockout;
 		}
 		if (fdp->i_number != tdp->i_number)
 			newparent = tdp->i_number;
 		doingdirectory = 1;
 	}
 	if ((fvp->v_type == VDIR && fvp->v_mountedhere != NULL) ||
 	    (tvp != NULL && tvp->v_type == VDIR &&
 	    tvp->v_mountedhere != NULL)) {
 		error = EXDEV;
 		goto unlockout;
 	}
 
 	/*
 	 * If ".." must be changed (ie the directory gets a new
 	 * parent) then the source directory must not be in the
 	 * directory hierarchy above the target, as this would
 	 * orphan everything below the source directory. Also
 	 * the user must have write permission in the source so
 	 * as to be able to change "..".
 	 */
 	if (doingdirectory && newparent) {
 		error = VOP_ACCESS(fvp, VWRITE, tcnp->cn_cred, tcnp->cn_thread);
 		if (error)
 			goto unlockout;
 		error = ufs_checkpath(ino, fdp->i_number, tdp, tcnp->cn_cred,
 		    &ino);
 		/*
 		 * We encountered a lock that we have to wait for.  Unlock
 		 * everything else and VGET before restarting.
 		 */
 		if (ino) {
 			VOP_UNLOCK(fdvp, 0);
 			VOP_UNLOCK(fvp, 0);
 			VOP_UNLOCK(tdvp, 0);
 			if (tvp)
 				VOP_UNLOCK(tvp, 0);
 			error = VFS_VGET(mp, ino, LK_SHARED, &nvp);
 			if (error == 0)
 				vput(nvp);
 			atomic_add_int(&rename_restarts, 1);
 			goto relock;
 		}
 		if (error)
 			goto unlockout;
 		if ((tcnp->cn_flags & SAVESTART) == 0)
 			panic("ufs_rename: lost to startdir");
 	}
 	if (fip->i_effnlink == 0 || fdp->i_effnlink == 0 ||
 	    tdp->i_effnlink == 0)
 		panic("Bad effnlink fip %p, fdp %p, tdp %p", fip, fdp, tdp);
 
 	/*
 	 * 1) Bump link count while we're moving stuff
 	 *    around.  If we crash somewhere before
 	 *    completing our work, the link count
 	 *    may be wrong, but correctable.
 	 */
 	fip->i_effnlink++;
 	fip->i_nlink++;
 	DIP_SET(fip, i_nlink, fip->i_nlink);
 	fip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(fvp))
 		softdep_setup_link(tdp, fip);
 	error = UFS_UPDATE(fvp, !DOINGSOFTDEP(fvp) && !DOINGASYNC(fvp));
 	if (error)
 		goto bad;
 
 	/*
 	 * 2) If target doesn't exist, link the target
 	 *    to the source and unlink the source.
 	 *    Otherwise, rewrite the target directory
 	 *    entry to reference the source inode and
 	 *    expunge the original entry's existence.
 	 */
 	if (tip == NULL) {
 		if (ITODEV(tdp) != ITODEV(fip))
 			panic("ufs_rename: EXDEV");
 		if (doingdirectory && newparent) {
 			/*
 			 * Account for ".." in new directory.
 			 * When source and destination have the same
 			 * parent we don't adjust the link count.  The
 			 * actual link modification is completed when
 			 * .. is rewritten below.
 			 */
 			if (tdp->i_nlink >= UFS_LINK_MAX) {
 				error = EMLINK;
 				goto bad;
 			}
 		}
 		ufs_makedirentry(fip, tcnp, &newdir);
 		error = ufs_direnter(tdvp, NULL, &newdir, tcnp, NULL, 1);
 		if (error)
 			goto bad;
 		/* Setup tdvp for directory compaction if needed. */
 		if (tdp->i_count && tdp->i_endoff &&
 		    tdp->i_endoff < tdp->i_size)
 			endoff = tdp->i_endoff;
 	} else {
 		if (ITODEV(tip) != ITODEV(tdp) || ITODEV(tip) != ITODEV(fip))
 			panic("ufs_rename: EXDEV");
 		/*
 		 * Short circuit rename(foo, foo).
 		 */
 		if (tip->i_number == fip->i_number)
 			panic("ufs_rename: same file");
 		/*
 		 * If the parent directory is "sticky", then the caller
 		 * must possess VADMIN for the parent directory, or the
 		 * destination of the rename.  This implements append-only
 		 * directories.
 		 */
 		if ((tdp->i_mode & S_ISTXT) &&
 		    VOP_ACCESS(tdvp, VADMIN, tcnp->cn_cred, td) &&
 		    VOP_ACCESS(tvp, VADMIN, tcnp->cn_cred, td)) {
 			error = EPERM;
 			goto bad;
 		}
 		/*
 		 * Target must be empty if a directory and have no links
 		 * to it. Also, ensure source and target are compatible
 		 * (both directories, or both not directories).
 		 */
 		if ((tip->i_mode & IFMT) == IFDIR) {
 			if ((tip->i_effnlink > 2) ||
 			    !ufs_dirempty(tip, tdp->i_number, tcnp->cn_cred)) {
 				error = ENOTEMPTY;
 				goto bad;
 			}
 			if (!doingdirectory) {
 				error = ENOTDIR;
 				goto bad;
 			}
 			cache_purge(tdvp);
 		} else if (doingdirectory) {
 			error = EISDIR;
 			goto bad;
 		}
 		if (doingdirectory) {
 			if (!newparent) {
 				tdp->i_effnlink--;
 				if (DOINGSOFTDEP(tdvp))
 					softdep_change_linkcnt(tdp);
 			}
 			tip->i_effnlink--;
 			if (DOINGSOFTDEP(tvp))
 				softdep_change_linkcnt(tip);
 		}
 		error = ufs_dirrewrite(tdp, tip, fip->i_number,
 		    IFTODT(fip->i_mode),
 		    (doingdirectory && newparent) ? newparent : doingdirectory);
 		if (error) {
 			if (doingdirectory) {
 				if (!newparent) {
 					tdp->i_effnlink++;
 					if (DOINGSOFTDEP(tdvp))
 						softdep_change_linkcnt(tdp);
 				}
 				tip->i_effnlink++;
 				if (DOINGSOFTDEP(tvp))
 					softdep_change_linkcnt(tip);
 			}
 		}
 		if (doingdirectory && !DOINGSOFTDEP(tvp)) {
 			/*
 			 * The only stuff left in the directory is "."
 			 * and "..". The "." reference is inconsequential
 			 * since we are quashing it. We have removed the "."
 			 * reference and the reference in the parent directory,
 			 * but there may be other hard links. The soft
 			 * dependency code will arrange to do these operations
 			 * after the parent directory entry has been deleted on
 			 * disk, so when running with that code we avoid doing
 			 * them now.
 			 */
 			if (!newparent) {
 				tdp->i_nlink--;
 				DIP_SET(tdp, i_nlink, tdp->i_nlink);
 				tdp->i_flag |= IN_CHANGE;
 			}
 			tip->i_nlink--;
 			DIP_SET(tip, i_nlink, tip->i_nlink);
 			tip->i_flag |= IN_CHANGE;
 		}
 	}
 
 	/*
 	 * 3) Unlink the source.  We have to resolve the path again to
 	 * fixup the directory offset and count for ufs_dirremove.
 	 */
 	if (fdvp == tdvp) {
 		error = ufs_lookup_ino(fdvp, NULL, fcnp, &ino);
 		if (error)
 			panic("ufs_rename: from entry went away!");
 		if (ino != fip->i_number)
 			panic("ufs_rename: ino mismatch %ju != %ju\n",
 			    (uintmax_t)ino, (uintmax_t)fip->i_number);
 	}
 	/*
 	 * If the source is a directory with a
 	 * new parent, the link count of the old
 	 * parent directory must be decremented
 	 * and ".." set to point to the new parent.
 	 */
 	if (doingdirectory && newparent) {
 		/*
 		 * If tip exists we simply use its link, otherwise we must
 		 * add a new one.
 		 */
 		if (tip == NULL) {
 			tdp->i_effnlink++;
 			tdp->i_nlink++;
 			DIP_SET(tdp, i_nlink, tdp->i_nlink);
 			tdp->i_flag |= IN_CHANGE;
 			if (DOINGSOFTDEP(tdvp))
 				softdep_setup_dotdot_link(tdp, fip);
 			error = UFS_UPDATE(tdvp, !DOINGSOFTDEP(tdvp) &&
 			    !DOINGASYNC(tdvp));
 			/* Don't go to bad here as the new link exists. */
 			if (error)
 				goto unlockout;
 		} else if (DOINGSUJ(tdvp))
 			/* Journal must account for each new link. */
 			softdep_setup_dotdot_link(tdp, fip);
 		fip->i_offset = mastertemplate.dot_reclen;
 		ufs_dirrewrite(fip, fdp, newparent, DT_DIR, 0);
 		cache_purge(fdvp);
 	}
 	error = ufs_dirremove(fdvp, fip, fcnp->cn_flags, 0);
 	/*
 	 * The kern_renameat() looks up the fvp using the DELETE flag, which
 	 * causes the removal of the name cache entry for fvp.
 	 * As the relookup of the fvp is done in two steps:
 	 * ufs_lookup_ino() and then VFS_VGET(), another thread might do a
 	 * normal lookup of the from name just before the VFS_VGET() call,
 	 * causing the cache entry to be re-instantiated.
 	 *
 	 * The same issue also applies to tvp if it exists as
 	 * otherwise we may have a stale name cache entry for the new
 	 * name that references the old i-node if it has other links
 	 * or open file descriptors.
 	 */
 	cache_purge(fvp);
 	if (tvp)
 		cache_purge(tvp);
 	cache_purge_negative(tdvp);
 
 unlockout:
 	vput(fdvp);
 	vput(fvp);
 	if (tvp)
 		vput(tvp);
 	/*
 	 * If compaction or fsync was requested do it now that other locks
 	 * are no longer needed.
 	 */
 	if (error == 0 && endoff != 0) {
 		error = UFS_TRUNCATE(tdvp, endoff, IO_NORMAL |
 		    (DOINGASYNC(tdvp) ? 0 : IO_SYNC), tcnp->cn_cred);
 		if (error != 0)
 			vn_printf(tdvp,
 			    "ufs_rename: failed to truncate, error %d\n",
 			    error);
 #ifdef UFS_DIRHASH
 		else if (tdp->i_dirhash != NULL)
 			ufsdirhash_dirtrunc(tdp, endoff);
 #endif
 		/*
 		 * Even if the directory compaction failed, rename was
 		 * succesful.  Do not propagate a UFS_TRUNCATE() error
 		 * to the caller.
 		 */
 		error = 0;
 	}
 	if (error == 0 && tdp->i_flag & IN_NEEDSYNC)
 		error = VOP_FSYNC(tdvp, MNT_WAIT, td);
 	vput(tdvp);
 	return (error);
 
 bad:
 	fip->i_effnlink--;
 	fip->i_nlink--;
 	DIP_SET(fip, i_nlink, fip->i_nlink);
 	fip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(fvp))
 		softdep_revert_link(tdp, fip);
 	goto unlockout;
 
 releout:
 	vrele(fdvp);
 	vrele(fvp);
 	vrele(tdvp);
 	if (tvp)
 		vrele(tvp);
 
 	return (error);
 }
 
 #ifdef UFS_ACL
 /*
  * POSIX.1e ACL inheritance for a new directory tvp created in dvp.
  *
  * The default ACL of dvp is fetched.  If there is none (EOPNOTSUPP, or an
  * empty ACL), tvp simply gets mode dmode.  Otherwise the mode is merged
  * with the ACL via acl_posix1e_newfilemode() and tvp receives both an
  * access ACL and a copy of the parent's default ACL.  Returns 0 or the
  * error from VOP_GETACL()/VOP_SETACL().
  */
 static int
 ufs_do_posix1e_acl_inheritance_dir(struct vnode *dvp, struct vnode *tvp,
     mode_t dmode, struct ucred *cred, struct thread *td)
 {
 	struct acl *access_acl, *default_acl;
 	struct inode *ip;
 	int error;
 
 	ip = VTOI(tvp);
 	access_acl = acl_alloc(M_WAITOK);
 	default_acl = acl_alloc(M_WAITOK);
 
 	/*
 	 * Retrieve default ACL from parent, if any.
 	 */
 	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, access_acl, cred, td);
 	if (error != 0 && error != EOPNOTSUPP)
 		goto out;
 	if (error == EOPNOTSUPP || access_acl->acl_cnt == 0) {
 		/*
 		 * No default ACL is defined or available (an empty ACL
 		 * counts as none): just use the mode as-is.
 		 */
 		ip->i_mode = dmode;
 		DIP_SET(ip, i_mode, dmode);
 		error = 0;
 		goto out;
 	}
 
 	/*
 	 * Retrieved a non-empty default ACL, so merge mode and ACL.  The
 	 * unmodified ACL is copied aside first; it becomes the new
 	 * directory's own default ACL.
 	 */
 	dmode = acl_posix1e_newfilemode(dmode, access_acl);
 	ip->i_mode = dmode;
 	DIP_SET(ip, i_mode, dmode);
 	*default_acl = *access_acl;
 	ufs_sync_acl_from_inode(ip, access_acl);
 
 	/*
 	 * XXX: If we abort now, will Soft Updates notify the extattr
 	 * code that the EAs for the file need to be released?
 	 */
 	error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, access_acl, cred, td);
 	if (error == 0) {
 		error = VOP_SETACL(tvp, ACL_TYPE_DEFAULT, default_acl, cred,
 		    td);
 	}
 	if (error == EOPNOTSUPP) {
 		/*
 		 * XXX: This should not happen, as EOPNOTSUPP above
 		 * was supposed to free acl.  It is reported with
 		 * printf() rather than a panic.
 		 */
 		printf("ufs_mkdir: VOP_GETACL() but no VOP_SETACL()\n");
 	}
 
 out:
 	acl_free(access_acl);
 	acl_free(default_acl);
 
 	return (error);
 }
 
 /*
  * POSIX.1e ACL inheritance for a new non-directory tvp created in dvp.
  *
  * The default ACL of dvp is fetched.  If there is none (EOPNOTSUPP, or an
  * empty ACL), tvp simply gets the requested mode.  Otherwise the mode is
  * merged with the ACL via acl_posix1e_newfilemode() and the result is
  * stored as the access ACL of tvp.  Returns 0 or the error from
  * VOP_GETACL()/VOP_SETACL().
  */
 static int
 ufs_do_posix1e_acl_inheritance_file(struct vnode *dvp, struct vnode *tvp,
     mode_t mode, struct ucred *cred, struct thread *td)
 {
 	struct acl *access_acl;
 	struct inode *ip;
 	int error;
 
 	ip = VTOI(tvp);
 	access_acl = acl_alloc(M_WAITOK);
 
 	/*
 	 * Retrieve default ACL for parent, if any.
 	 */
 	error = VOP_GETACL(dvp, ACL_TYPE_DEFAULT, access_acl, cred, td);
 	if (error != 0 && error != EOPNOTSUPP)
 		goto out;
 	if (error == EOPNOTSUPP || access_acl->acl_cnt == 0) {
 		/*
 		 * Two possible ways for the default ACL to not be
 		 * present: the EA can be undefined, or the default ACL
 		 * can be blank.  Either way, just use the mode as-is.
 		 */
 		ip->i_mode = mode;
 		DIP_SET(ip, i_mode, mode);
 		error = 0;
 		goto out;
 	}
 
 	/*
 	 * Retrieved a non-empty default ACL, so merge mode and ACL.
 	 */
 	mode = acl_posix1e_newfilemode(mode, access_acl);
 	ip->i_mode = mode;
 	DIP_SET(ip, i_mode, mode);
 	ufs_sync_acl_from_inode(ip, access_acl);
 
 	/*
 	 * XXX: If we abort now, will Soft Updates notify the extattr
 	 * code that the EAs for the file need to be released?
 	 */
 	error = VOP_SETACL(tvp, ACL_TYPE_ACCESS, access_acl, cred, td);
 	if (error == EOPNOTSUPP) {
 		/*
 		 * XXX: This should not happen, as EOPNOTSUPP above was
 		 * supposed to free acl.  It is reported with printf()
 		 * rather than a panic.
 		 */
 		printf("ufs_do_posix1e_acl_inheritance_file: VOP_GETACL() "
 		    "but no VOP_SETACL()\n");
 	}
 
 out:
 	acl_free(access_acl);
 
 	return (error);
 }
 
 /*
  * NFSv4 ACL inheritance for a new object tvp created in directory dvp.
  *
  * The ACL of dvp is read with ufs_getacl_nfs4_internal(), the inherited
  * ACL for the child is computed from it by
  * acl_nfs4_compute_inherited_acl() and stored on tvp with
  * ufs_setacl_nfs4_internal().  Returns 0 or the error from the read or
  * store step.  The cred argument is not used.
  */
 static int
 ufs_do_nfs4_acl_inheritance(struct vnode *dvp, struct vnode *tvp,
     mode_t child_mode, struct ucred *cred, struct thread *td)
 {
 	struct acl *dir_acl, *new_acl;
 	int error;
 
 	dir_acl = acl_alloc(M_WAITOK);
 	/* The child's ACL buffer is allocated zeroed (M_ZERO). */
 	new_acl = acl_alloc(M_WAITOK | M_ZERO);
 
 	error = ufs_getacl_nfs4_internal(dvp, dir_acl, td);
 	if (error == 0) {
 		acl_nfs4_compute_inherited_acl(dir_acl, new_acl,
 		    child_mode, VTOI(tvp)->i_uid, tvp->v_type == VDIR);
 		error = ufs_setacl_nfs4_internal(tvp, new_acl, td);
 	}
 
 	acl_free(dir_acl);
 	acl_free(new_acl);
 
 	return (error);
 }
 #endif
 
 /*
  * Mkdir system call
  *
  * Creates the directory named by ap->a_cnp in the parent ap->a_dvp,
  * using the permission bits of ap->a_vap->va_mode.  A new inode is
  * allocated and initialized with link count 2, the parent's link count
  * is bumped, the first directory block is filled with "." and ".."
  * from a template, and finally the name is entered in the parent.
  *
  * Returns 0 and stores the new vnode in *ap->a_vpp on success.  On
  * failure returns an errno value: EMLINK when the parent already has
  * UFS_LINK_MAX links, EINVAL when the parent's effective link count is
  * below 2, or whatever error the allocation, quota, MAC, ACL, update or
  * directory-entry steps produced.
  */
 static int
 ufs_mkdir(ap)
 	struct vop_mkdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 	} */ *ap;
 {
 	struct vnode *dvp = ap->a_dvp;
 	struct vattr *vap = ap->a_vap;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	struct vnode *tvp;
 	struct buf *bp;
 	struct dirtemplate dirtemplate, *dtp;
 	struct direct newdir;
 	int error, dmode;
 	long blkoff;
 
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("ufs_mkdir: no name");
 #endif
 	dp = VTOI(dvp);
 	/* The new directory's ".." will add one more link to the parent. */
 	if (dp->i_nlink >= UFS_LINK_MAX) {
 		error = EMLINK;
 		goto out;
 	}
 	/* Keep only the permission bits of the requested mode. */
 	dmode = vap->va_mode & 0777;
 	dmode |= IFDIR;
 	/*
 	 * Must simulate part of ufs_makeinode here to acquire the inode,
 	 * but not have it entered in the parent directory. The entry is
 	 * made later after writing "." and ".." entries.
 	 */
 	if (dp->i_effnlink < 2) {
 		print_bad_link_count("ufs_mkdir", dvp);
 		error = EINVAL;
 		goto out;
 	}
 	error = UFS_VALLOC(dvp, dmode, cnp->cn_cred, &tvp);
 	if (error)
 		goto out;
 	ip = VTOI(tvp);
 	/* The new directory takes its group from the parent directory. */
 	ip->i_gid = dp->i_gid;
 	DIP_SET(ip, i_gid, dp->i_gid);
 #ifdef SUIDDIR
 	{
 #ifdef QUOTA
 		struct ucred ucred, *ucp;
 		gid_t ucred_group;
 		ucp = cnp->cn_cred;
 #endif
 		/*
 		 * If we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * The new directory also inherits the SUID bit.
 		 * If user's UID and dir UID are the same,
 		 * 'give it away' so that the SUID is still forced on.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (dp->i_mode & ISUID) && dp->i_uid) {
 			dmode |= ISUID;
 			ip->i_uid = dp->i_uid;
 			DIP_SET(ip, i_uid, dp->i_uid);
 #ifdef QUOTA
 			if (dp->i_uid != cnp->cn_cred->cr_uid) {
 				/*
 				 * Make sure the correct user gets charged
 				 * for the space.
 				 * Make a dummy credential for the victim.
 				 * XXX This seems to never be accessed out of
 				 * our context so a stack variable is ok.
 				 */
 				refcount_init(&ucred.cr_ref, 1);
 				ucred.cr_uid = ip->i_uid;
 				ucred.cr_ngroups = 1;
 				ucred.cr_groups = &ucred_group;
 				ucred.cr_groups[0] = dp->i_gid;
 				ucp = &ucred;
 			}
 #endif
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 		}
 #ifdef QUOTA
 		/*
 		 * Quota lookup or inode charge failed: give the freshly
 		 * allocated inode back and return without touching the parent.
 		 */
 		if ((error = getinoquota(ip)) ||
 	    	    (error = chkiq(ip, 1, ucp, 0))) {
 			if (DOINGSOFTDEP(tvp))
 				softdep_revert_link(dp, ip);
 			UFS_VFREE(tvp, ip->i_number, dmode);
 			vput(tvp);
 			return (error);
 		}
 #endif
 	}
 #else	/* !SUIDDIR */
 	ip->i_uid = cnp->cn_cred->cr_uid;
 	DIP_SET(ip, i_uid, ip->i_uid);
 #ifdef QUOTA
 	/*
 	 * Quota lookup or inode charge failed: give the freshly allocated
 	 * inode back and return without touching the parent.
 	 */
 	if ((error = getinoquota(ip)) ||
 	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
 		if (DOINGSOFTDEP(tvp))
 			softdep_revert_link(dp, ip);
 		UFS_VFREE(tvp, ip->i_number, dmode);
 		vput(tvp);
 		return (error);
 	}
 #endif
 #endif	/* !SUIDDIR */
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = dmode;
 	DIP_SET(ip, i_mode, dmode);
 	tvp->v_type = VDIR;	/* Rest init'd in getnewvnode(). */
 	/* The new directory starts out with a link count of 2. */
 	ip->i_effnlink = 2;
 	ip->i_nlink = 2;
 	DIP_SET(ip, i_nlink, 2);
 
 	if (cnp->cn_flags & ISWHITEOUT) {
 		ip->i_flags |= UF_OPAQUE;
 		DIP_SET(ip, i_flags, ip->i_flags);
 	}
 
 	/*
 	 * Bump link count in parent directory to reflect work done below.
 	 * Should be done before reference is created so cleanup is
 	 * possible if we crash.
 	 */
 	dp->i_effnlink++;
 	dp->i_nlink++;
 	DIP_SET(dp, i_nlink, dp->i_nlink);
 	dp->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(dvp))
 		softdep_setup_mkdir(dp, ip);
 	/* Wait for the parent update only without soft updates and async. */
 	error = UFS_UPDATE(dvp, !DOINGSOFTDEP(dvp) && !DOINGASYNC(dvp));
 	if (error)
 		goto bad;
 #ifdef MAC
 	if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) {
 		error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount,
 		    dvp, tvp, cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 #ifdef UFS_ACL
 	/* POSIX.1e ACLs take precedence over NFSv4 ACLs if both flags are set. */
 	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
 		error = ufs_do_posix1e_acl_inheritance_dir(dvp, tvp, dmode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	} else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) {
 		error = ufs_do_nfs4_acl_inheritance(dvp, tvp, dmode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	}
 #endif /* !UFS_ACL */
 
 	/*
 	 * Initialize directory with "." and ".." from static template.
 	 */
 	if (dvp->v_mount->mnt_maxsymlinklen > 0)
 		dtp = &mastertemplate;
 	else
 		dtp = (struct dirtemplate *)&omastertemplate;
 	/* Work on a private copy so the shared template stays untouched. */
 	dirtemplate = *dtp;
 	dirtemplate.dot_ino = ip->i_number;
 	dirtemplate.dotdot_ino = dp->i_number;
 	vnode_pager_setsize(tvp, DIRBLKSIZ);
 	if ((error = UFS_BALLOC(tvp, (off_t)0, DIRBLKSIZ, cnp->cn_cred,
 	    BA_CLRBUF, &bp)) != 0)
 		goto bad;
 	ip->i_size = DIRBLKSIZ;
 	DIP_SET(ip, i_size, DIRBLKSIZ);
 	ip->i_flag |= IN_CHANGE | IN_UPDATE;
 	bcopy((caddr_t)&dirtemplate, (caddr_t)bp->b_data, sizeof dirtemplate);
 	if (DOINGSOFTDEP(tvp)) {
 		/*
 		 * Ensure that the entire newly allocated block is a
 		 * valid directory so that future growth within the
 		 * block does not have to ensure that the block is
 		 * written before the inode.
 		 */
 		blkoff = DIRBLKSIZ;
 		while (blkoff < bp->b_bcount) {
 			((struct direct *)
 			   (bp->b_data + blkoff))->d_reclen = DIRBLKSIZ;
 			blkoff += DIRBLKSIZ;
 		}
 	}
 	if ((error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) &&
 	    !DOINGASYNC(tvp))) != 0) {
 		/* Still hand the buffer to bwrite(); its result is ignored. */
 		(void)bwrite(bp);
 		goto bad;
 	}
 	/*
 	 * Directory set up, now install its entry in the parent directory.
 	 *
 	 * If we are not doing soft dependencies, then we must write out the
 	 * buffer containing the new directory body before entering the new
 	 * name in the parent. If we are doing soft dependencies, then the
 	 * buffer containing the new directory body will be passed to and
 	 * released in the soft dependency code after the code has attached
 	 * an appropriate ordering dependency to the buffer which ensures that
 	 * the buffer is written before the new name is written in the parent.
 	 */
 	if (DOINGASYNC(dvp))
 		bdwrite(bp);
 	else if (!DOINGSOFTDEP(dvp) && ((error = bwrite(bp))))
 		goto bad;
 	ufs_makedirentry(ip, cnp, &newdir);
 	error = ufs_direnter(dvp, tvp, &newdir, cnp, bp, 0);
 
 	/*
 	 * Reached on success as well as on failure; "error" selects between
 	 * publishing the new vnode and undoing the link count changes.
 	 */
 bad:
 	if (error == 0) {
 		*ap->a_vpp = tvp;
 	} else {
 		dp->i_effnlink--;
 		dp->i_nlink--;
 		DIP_SET(dp, i_nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
 		/*
 		 * No need to do an explicit VOP_TRUNCATE here, vrele will
 		 * do this for us because we set the link count to 0.
 		 */
 		ip->i_effnlink = 0;
 		ip->i_nlink = 0;
 		DIP_SET(ip, i_nlink, 0);
 		ip->i_flag |= IN_CHANGE;
 		if (DOINGSOFTDEP(tvp))
 			softdep_revert_mkdir(dp, ip);
 
 		vput(tvp);
 	}
 out:
 	return (error);
 }
 
 /*
  * Rmdir system call.
  *
  * Removes the directory ap->a_vp, named by ap->a_cnp, from its parent
  * ap->a_dvp.  Returns 0 on success or an errno value:
  *   EINVAL    - the parent's effective link count is 2 or less, or
  *               something is mounted on the directory;
  *   ENOTEMPTY - ufs_dirempty() reports the directory is not empty;
  *   EPERM     - the parent is APPEND, or the directory is NOUNLINK,
  *               IMMUTABLE or APPEND;
  *   otherwise the error from ufs_dirremove() or, when soft updates are
  *   not in use, from UFS_UPDATE() on the parent.
  */
 static int
 ufs_rmdir(ap)
 	struct vop_rmdir_args /* {
 		struct vnode *a_dvp;
 		struct vnode *a_vp;
 		struct componentname *a_cnp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct vnode *dvp = ap->a_dvp;
 	struct componentname *cnp = ap->a_cnp;
 	struct inode *ip, *dp;
 	int error;
 
 	ip = VTOI(vp);
 	dp = VTOI(dvp);
 
 	/*
 	 * Do not remove a directory that is in the process of being renamed.
 	 * Verify the directory is empty (and valid). Rmdir ".." will not be
 	 * valid since ".." will contain a reference to the current directory
 	 * and thus be non-empty. Do not allow the removal of mounted on
 	 * directories (this can happen when an NFS exported filesystem
 	 * tries to remove a locally mounted on directory).
 	 */
 	error = 0;
 	if (dp->i_effnlink <= 2) {
 		/* Exactly 2 is reported as a bad link count; below 2 is not. */
 		if (dp->i_effnlink == 2)
 			print_bad_link_count("ufs_rmdir", dvp);
 		error = EINVAL;
 		goto out;
 	}
 	if (!ufs_dirempty(ip, dp->i_number, cnp->cn_cred)) {
 		error = ENOTEMPTY;
 		goto out;
 	}
 	if ((dp->i_flags & APPEND)
 	    || (ip->i_flags & (NOUNLINK | IMMUTABLE | APPEND))) {
 		error = EPERM;
 		goto out;
 	}
 	if (vp->v_mountedhere != 0) {
 		error = EINVAL;
 		goto out;
 	}
 #ifdef UFS_GJOURNAL
 	ufs_gjournal_orphan(vp);
 #endif
 	/*
 	 * Delete reference to directory before purging
 	 * inode.  If we crash in between, the directory
 	 * will be reattached to lost+found,
 	 */
 	dp->i_effnlink--;
 	ip->i_effnlink--;
 	if (DOINGSOFTDEP(vp))
 		softdep_setup_rmdir(dp, ip);
 	error = ufs_dirremove(dvp, ip, cnp->cn_flags, 1);
 	if (error) {
 		/* Entry removal failed: restore both effective link counts. */
 		dp->i_effnlink++;
 		ip->i_effnlink++;
 		if (DOINGSOFTDEP(vp))
 			softdep_revert_rmdir(dp, ip);
 		goto out;
 	}
 	cache_purge(dvp);
 	/*
 	 * The only stuff left in the directory is "." and "..". The "."
 	 * reference is inconsequential since we are quashing it. The soft
 	 * dependency code will arrange to do these operations after
 	 * the parent directory entry has been deleted on disk, so
 	 * when running with that code we avoid doing them now.
 	 */
 	if (!DOINGSOFTDEP(vp)) {
 		dp->i_nlink--;
 		DIP_SET(dp, i_nlink, dp->i_nlink);
 		dp->i_flag |= IN_CHANGE;
 		/* This result becomes the function's return value. */
 		error = UFS_UPDATE(dvp, 0);
 		ip->i_nlink--;
 		DIP_SET(ip, i_nlink, ip->i_nlink);
 		ip->i_flag |= IN_CHANGE;
 	}
 	cache_purge(vp);
 #ifdef UFS_DIRHASH
 	/* Kill any active hash; i_effnlink == 0, so it will not come back. */
 	if (ip->i_dirhash != NULL)
 		ufsdirhash_free(ip);
 #endif
 out:
 	return (error);
 }
 
 /*
  * symlink -- make a symbolic link
  */
 static int
 ufs_symlink(ap)
 	struct vop_symlink_args /* {
 		struct vnode *a_dvp;
 		struct vnode **a_vpp;
 		struct componentname *a_cnp;
 		struct vattr *a_vap;
 		const char *a_target;
 	} */ *ap;
 {
 	struct vnode *vp, **vpp = ap->a_vpp;
 	struct inode *ip;
 	int len, error;
 
 	error = ufs_makeinode(IFLNK | ap->a_vap->va_mode, ap->a_dvp,
 	    vpp, ap->a_cnp, "ufs_symlink");
 	if (error)
 		return (error);
 	vp = *vpp;
 	len = strlen(ap->a_target);
 	if (len < vp->v_mount->mnt_maxsymlinklen) {
 		ip = VTOI(vp);
 		bcopy(ap->a_target, SHORTLINK(ip), len);
 		ip->i_size = len;
 		DIP_SET(ip, i_size, len);
 		ip->i_flag |= IN_CHANGE | IN_UPDATE;
 		error = UFS_UPDATE(vp, 0);
 	} else
 		error = vn_rdwr(UIO_WRITE, vp, __DECONST(void *, ap->a_target),
 		    len, (off_t)0, UIO_SYSSPACE, IO_NODELOCKED | IO_NOMACCHECK,
 		    ap->a_cnp->cn_cred, NOCRED, NULL, NULL);
 	if (error)
 		vput(vp);
 	return (error);
 }
 
 /*
  * Vnode op for reading directories.
  *
  * Walks the on-disk struct direct records of ap->a_vp starting at
  * ap->a_uio->uio_offset, converts each live one into a struct dirent
  * and copies it out through ap->a_uio.  Entries before the starting
  * offset and entries with d_ino == 0 are skipped.  uio_offset is left at
  * the offset of the first entry that was not processed.
  *
  * If ap->a_ncookies is not NULL, a cookie array is allocated with
  * malloc(M_TEMP) and returned in *ap->a_cookies; each cookie is the
  * directory offset following the corresponding returned entry, and
  * *ap->a_ncookies is set to the number of cookies actually stored.  On
  * error the array is freed and both outputs are cleared.
  *
  * If ap->a_eofflag is not NULL it is set, on success, to whether the
  * final offset has reached the directory size.
  *
  * Returns 0 on success (including a directory whose effective link
  * count is 0, for which nothing is returned), EINVAL for a negative
  * offset or when not even the first entry fits in the caller's buffer,
  * EIO for a malformed directory record, or the error from
  * ffs_blkatoff() or uiomove().
  */
 int
 ufs_readdir(ap)
 	struct vop_readdir_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 		int *a_eofflag;
 		int *a_ncookies;
 		u_long **a_cookies;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct uio *uio = ap->a_uio;
 	struct buf *bp;
 	struct inode *ip;
 	struct direct *dp, *edp;
 	u_long *cookies;
 	struct dirent dstdp;
 	off_t offset, startoffset;
 	size_t readcnt, skipcnt;
 	ssize_t startresid;
 	u_int ncookies;
 	int error;
 
 	if (uio->uio_offset < 0)
 		return (EINVAL);
 	ip = VTOI(vp);
 	if (ip->i_effnlink == 0)
 		return (0);
 	if (ap->a_ncookies != NULL) {
 		/*
 		 * Size the cookie array from the number of bytes that can be
 		 * read: the smaller of the caller's buffer and what is left
 		 * of the directory, divided by the smallest record size used
 		 * here, plus one.
 		 */
 		if (uio->uio_resid < 0)
 			ncookies = 0;
 		else
 			ncookies = uio->uio_resid;
 		if (uio->uio_offset >= ip->i_size)
 			ncookies = 0;
 		else if (ip->i_size - uio->uio_offset < ncookies)
 			ncookies = ip->i_size - uio->uio_offset;
 		ncookies = ncookies / (offsetof(struct direct, d_name) + 4) + 1;
 		cookies = malloc(ncookies * sizeof(*cookies), M_TEMP, M_WAITOK);
 		*ap->a_ncookies = ncookies;
 		*ap->a_cookies = cookies;
 	} else {
 		ncookies = 0;
 		cookies = NULL;
 	}
 	offset = startoffset = uio->uio_offset;
 	startresid = uio->uio_resid;
 	error = 0;
 	while (error == 0 && uio->uio_resid > 0 &&
 	    uio->uio_offset < ip->i_size) {
 		error = ffs_blkatoff(vp, uio->uio_offset, NULL, &bp);
 		if (error)
 			break;
 		/* Never look past the end of the directory within the buffer. */
 		if (bp->b_offset + bp->b_bcount > ip->i_size)
 			readcnt = ip->i_size - bp->b_offset;
 		else
 			readcnt = bp->b_bcount;
 		/*
 		 * Start parsing at the DIRBLKSIZ boundary at or below the
 		 * requested offset; entries before startoffset are skipped
 		 * in the loop below.
 		 */
 		skipcnt = (size_t)(uio->uio_offset - bp->b_offset) &
 		    ~(size_t)(DIRBLKSIZ - 1);
 		offset = bp->b_offset + skipcnt;
 		dp = (struct direct *)&bp->b_data[skipcnt];
 		edp = (struct direct *)&bp->b_data[readcnt];
 		while (error == 0 && uio->uio_resid > 0 && dp < edp) {
 			/* Reject records that are too short or overrun. */
 			if (dp->d_reclen <= offsetof(struct direct, d_name) ||
 			    (caddr_t)dp + dp->d_reclen > (caddr_t)edp) {
 				error = EIO;
 				break;
 			}
 #if BYTE_ORDER == LITTLE_ENDIAN
 			/* Old filesystem format. */
 			if (vp->v_mount->mnt_maxsymlinklen <= 0) {
 				dstdp.d_namlen = dp->d_type;
 				dstdp.d_type = dp->d_namlen;
 			} else
 #endif
 			{
 				dstdp.d_namlen = dp->d_namlen;
 				dstdp.d_type = dp->d_type;
 			}
 			/* The name must fit inside the on-disk record. */
 			if (offsetof(struct direct, d_name) + dstdp.d_namlen >
 			    dp->d_reclen) {
 				error = EIO;
 				break;
 			}
 			if (offset < startoffset || dp->d_ino == 0)
 				goto nextentry;
 			dstdp.d_fileno = dp->d_ino;
 			dstdp.d_reclen = GENERIC_DIRSIZ(&dstdp);
 			bcopy(dp->d_name, dstdp.d_name, dstdp.d_namlen);
 			/* NOTE: d_off is the offset of the *next* entry. */
 			dstdp.d_off = offset + dp->d_reclen;
 			dirent_terminate(&dstdp);
 			if (dstdp.d_reclen > uio->uio_resid) {
 				/*
 				 * Out of room.  That is an error only if
 				 * nothing at all has been copied out yet;
 				 * EJUSTRETURN just stops both loops and is
 				 * turned back into success below.
 				 */
 				if (uio->uio_resid == startresid)
 					error = EINVAL;
 				else
 					error = EJUSTRETURN;
 				break;
 			}
 			/* Advance dp. */
 			error = uiomove((caddr_t)&dstdp, dstdp.d_reclen, uio);
 			if (error)
 				break;
 			if (cookies != NULL) {
 				KASSERT(ncookies > 0,
 				    ("ufs_readdir: cookies buffer too small"));
 				*cookies = offset + dp->d_reclen;
 				cookies++;
 				ncookies--;
 			}
 nextentry:
 			offset += dp->d_reclen;
 			dp = (struct direct *)((caddr_t)dp + dp->d_reclen);
 		}
 		bqrelse(bp);
 		uio->uio_offset = offset;
 	}
 	/* We need to correct uio_offset. */
 	uio->uio_offset = offset;
 	if (error == EJUSTRETURN)
 		error = 0;
 	if (ap->a_ncookies != NULL) {
 		if (error == 0) {
 			/*
 			 * ncookies now counts the slots left unused; take
 			 * them off the count reported to the caller.  This
 			 * must go through the pointer: adjusting
 			 * ap->a_ncookies itself would only move the local
 			 * pointer and leave the caller with the allocated
 			 * size instead of the number of valid cookies.
 			 */
 			*ap->a_ncookies -= ncookies;
 		} else {
 			free(*ap->a_cookies, M_TEMP);
 			*ap->a_ncookies = 0;
 			*ap->a_cookies = NULL;
 		}
 	}
 	if (error == 0 && ap->a_eofflag)
 		*ap->a_eofflag = ip->i_size <= uio->uio_offset;
 	return (error);
 }
 
 /*
  * Return target name of a symbolic link
  *
  * A link whose size is below the mount's mnt_maxsymlinklen, or whose
  * inode has no blocks, is copied out directly from the inode's
  * SHORTLINK() area; any other link is read with VOP_READ().
  * Returns the result of uiomove() or VOP_READ().
  */
 static int
 ufs_readlink(ap)
 	struct vop_readlink_args /* {
 		struct vnode *a_vp;
 		struct uio *a_uio;
 		struct ucred *a_cred;
 	} */ *ap;
 {
 	struct vnode *lvp;
 	struct inode *lip;
 	doff_t linklen;
 
 	lvp = ap->a_vp;
 	lip = VTOI(lvp);
 	linklen = lip->i_size;
 
 	/* Long link with data blocks: go through the regular read path. */
 	if (linklen >= lvp->v_mount->mnt_maxsymlinklen &&
 	    DIP(lip, i_blocks) != 0)
 		return (VOP_READ(lvp, ap->a_uio, 0, ap->a_cred));
 
 	/* XXX - the i_blocks == 0 case is for old fastlink support */
 	return (uiomove(SHORTLINK(lip), linklen, ap->a_uio));
 }
 
 /*
  * Calculate the logical to physical mapping if not done already,
  * then call the device strategy routine.
  *
  * In order to be able to swap to a file, the ufs_bmaparray() operation may not
  * deadlock on memory.  See ufs_bmap() for details.
  *
  * Always returns 0; a mapping failure is reported through the buffer
  * (b_error and BIO_ERROR) before the buffer is completed with bufdone().
  */
 static int
 ufs_strategy(ap)
 	struct vop_strategy_args /* {
 		struct vnode *a_vp;
 		struct buf *a_bp;
 	} */ *ap;
 {
 	struct vnode *vp;
 	struct buf *bp;
 	ufs2_daddr_t physblk;
 	int rv;
 
 	bp = ap->a_bp;
 	vp = ap->a_vp;
 
 	/* b_blkno still equal to b_lblkno means it has not been mapped yet. */
 	if (bp->b_blkno == bp->b_lblkno) {
 		rv = ufs_bmaparray(vp, bp->b_lblkno, &physblk, bp, NULL,
 		    NULL);
 		bp->b_blkno = physblk;
 		if (rv != 0) {
 			bp->b_error = rv;
 			bp->b_ioflags |= BIO_ERROR;
 			bufdone(bp);
 			return (0);
 		}
 		if ((long)bp->b_blkno == -1)
 			vfs_bio_clrbuf(bp);
 	}
 
 	/* A block number of -1 gets no device I/O; just finish the buffer. */
 	if ((long)bp->b_blkno != -1) {
 		bp->b_iooffset = dbtob(bp->b_blkno);
 		BO_STRATEGY(VFSTOUFS(vp->v_mount)->um_bo, bp);
 	} else {
 		bufdone(bp);
 	}
 	return (0);
 }
 
 /*
  * Print out the contents of an inode.
  */
 static int
 ufs_print(ap)
 	struct vop_print_args /* {
 		struct vnode *a_vp;
 	} */ *ap;
 {
 	struct vnode *vp = ap->a_vp;
 	struct inode *ip = VTOI(vp);
 
 	printf("\tnlink=%d, effnlink=%d, size=%jd", ip->i_nlink,
 	    ip->i_effnlink, (intmax_t)ip->i_size);
 	if (I_IS_UFS2(ip))
 		printf(", extsize %d", ip->i_din2->di_extsize);
 	printf("\n\tgeneration=%jx, uid=%d, gid=%d, flags=0x%b\n",
 	    (uintmax_t)ip->i_gen, ip->i_uid, ip->i_gid,
 	    (u_int)ip->i_flags, PRINT_INODE_FLAGS);
 	printf("\tino %lu, on dev %s", (u_long)ip->i_number,
 	    devtoname(ITODEV(ip)));
 	if (vp->v_type == VFIFO)
 		fifo_printinfo(vp);
 	printf("\n");
 	return (0);
 }
 
 /*
  * Close wrapper for fifos.
  *
  * Update the times on the inode then do device close.
  *
  * The times are only updated, under the vnode interlock, while the vnode
  * has more than one use reference.  Returns whatever the fifo close
  * operation returns.
  */
 static int
 ufsfifo_close(ap)
 	struct vop_close_args /* {
 		struct vnode *a_vp;
 		int  a_fflag;
 		struct ucred *a_cred;
 		struct thread *a_td;
 	} */ *ap;
 {
 	struct vnode *fvp;
 	int refs;
 
 	fvp = ap->a_vp;
 
 	VI_LOCK(fvp);
 	refs = fvp->v_usecount;
 	if (refs > 1) {
 		ufs_itimes_locked(fvp);
 	}
 	VI_UNLOCK(fvp);
 
 	return (fifo_specops.vop_close(ap));
 }
 
 /*
  * Kqfilter wrapper for fifos.
  *
  * Try the fifo kqfilter first; only if it reports an error fall
  * through to the generic vfs_kqfilter() and return its result.
  */
 static int
 ufsfifo_kqfilter(ap)
 	struct vop_kqfilter_args *ap;
 {
 	int rv;
 
 	rv = fifo_specops.vop_kqfilter(ap);
 	if (rv == 0)
 		return (0);
 	return (vfs_kqfilter(ap));
 }
 
 /*
  * Return POSIX pathconf information applicable to ufs filesystems.
  */
 static int
 ufs_pathconf(ap)
 	struct vop_pathconf_args /* {
 		struct vnode *a_vp;
 		int a_name;
 		int *a_retval;
 	} */ *ap;
 {
 	int error;
 
 	error = 0;
 	switch (ap->a_name) {
 	case _PC_LINK_MAX:
 		*ap->a_retval = UFS_LINK_MAX;
 		break;
 	case _PC_NAME_MAX:
 		*ap->a_retval = UFS_MAXNAMLEN;
 		break;
 	case _PC_PIPE_BUF:
 		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO)
 			*ap->a_retval = PIPE_BUF;
 		else
 			error = EINVAL;
 		break;
 	case _PC_CHOWN_RESTRICTED:
 		*ap->a_retval = 1;
 		break;
 	case _PC_NO_TRUNC:
 		*ap->a_retval = 1;
 		break;
 #ifdef UFS_ACL
 	case _PC_ACL_EXTENDED:
 		if (ap->a_vp->v_mount->mnt_flag & MNT_ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 	case _PC_ACL_NFS4:
 		if (ap->a_vp->v_mount->mnt_flag & MNT_NFS4ACLS)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 #endif
 	case _PC_ACL_PATH_MAX:
 #ifdef UFS_ACL
 		if (ap->a_vp->v_mount->mnt_flag & (MNT_ACLS | MNT_NFS4ACLS))
 			*ap->a_retval = ACL_MAX_ENTRIES;
 		else
 			*ap->a_retval = 3;
 #else
 		*ap->a_retval = 3;
 #endif
 		break;
 #ifdef MAC
 	case _PC_MAC_PRESENT:
 		if (ap->a_vp->v_mount->mnt_flag & MNT_MULTILABEL)
 			*ap->a_retval = 1;
 		else
 			*ap->a_retval = 0;
 		break;
 #endif
 	case _PC_MIN_HOLE_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_PRIO_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_SYNC_IO:
 		*ap->a_retval = 0;
 		break;
 	case _PC_ALLOC_SIZE_MIN:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_bsize;
 		break;
 	case _PC_FILESIZEBITS:
 		*ap->a_retval = 64;
 		break;
 	case _PC_REC_INCR_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_MAX_XFER_SIZE:
 		*ap->a_retval = -1; /* means ``unlimited'' */
 		break;
 	case _PC_REC_MIN_XFER_SIZE:
 		*ap->a_retval = ap->a_vp->v_mount->mnt_stat.f_iosize;
 		break;
 	case _PC_REC_XFER_ALIGN:
 		*ap->a_retval = PAGE_SIZE;
 		break;
 	case _PC_SYMLINK_MAX:
 		*ap->a_retval = MAXPATHLEN;
 		break;
 
 	default:
 		error = vop_stdpathconf(ap);
 		break;
 	}
 	return (error);
 }
 
 /*
  * Initialize the vnode associated with a new inode, handle aliased
  * vnodes.
  *
  * Derives the vnode type of *vpp from its inode's mode, switches fifo
  * vnodes over to the supplied fifoops vector and marks the vnode of the
  * root inode with VV_ROOT.  Returns EINVAL when a non-zero mode does not
  * map to a vnode type, otherwise 0.  mntp is not used here.
  */
 int
 ufs_vinit(mntp, fifoops, vpp)
 	struct mount *mntp;
 	struct vop_vector *fifoops;
 	struct vnode **vpp;
 {
 	struct vnode *vp = *vpp;
 	struct inode *ip;
 
 	ASSERT_VOP_LOCKED(vp, "ufs_vinit");
 	ip = VTOI(vp);
 	vp->v_type = IFTOVT(ip->i_mode);
 
 	/* Only unallocated inodes should be of type VNON. */
 	if (vp->v_type == VNON && ip->i_mode != 0)
 		return (EINVAL);
 
 	if (ip->i_number == UFS_ROOTINO)
 		vp->v_vflag |= VV_ROOT;
 	if (vp->v_type == VFIFO)
 		vp->v_op = fifoops;
 
 	*vpp = vp;
 	return (0);
 }
 
 /*
  * Allocate a new inode.
  * Vnode dvp must be locked.
  *
  * Allocates an inode with the given mode in directory dvp, initializes
  * its ownership, mode and link count (1), and enters the name cnp into
  * dvp.  A mode without any IFMT bits is treated as a regular file.
  * callfunc is the caller's name and is only used in diagnostics.
  *
  * On success returns 0 with the new vnode in *vpp.  On failure returns
  * an errno value and leaves *vpp NULL; the new vnode, if one was
  * allocated, has been released with vput().
  */
 static int
 ufs_makeinode(mode, dvp, vpp, cnp, callfunc)
 	int mode;
 	struct vnode *dvp;
 	struct vnode **vpp;
 	struct componentname *cnp;
 	const char *callfunc;
 {
 	struct inode *ip, *pdir;
 	struct direct newdir;
 	struct vnode *tvp;
 	int error;
 
 	pdir = VTOI(dvp);
 #ifdef INVARIANTS
 	if ((cnp->cn_flags & HASBUF) == 0)
 		panic("%s: no name", callfunc);
 #endif
 	*vpp = NULL;
 	/* No file type requested: default to a regular file. */
 	if ((mode & IFMT) == 0)
 		mode |= IFREG;
 
 	if (pdir->i_effnlink < 2) {
 		print_bad_link_count(callfunc, dvp);
 		return (EINVAL);
 	}
 	error = UFS_VALLOC(dvp, mode, cnp->cn_cred, &tvp);
 	if (error)
 		return (error);
 	ip = VTOI(tvp);
 	/* The new inode takes its group from the parent directory. */
 	ip->i_gid = pdir->i_gid;
 	DIP_SET(ip, i_gid, pdir->i_gid);
 #ifdef SUIDDIR
 	{
 #ifdef QUOTA
 		struct ucred ucred, *ucp;
 		gid_t ucred_group;
 		ucp = cnp->cn_cred;
 #endif
 		/*
 		 * If we are not the owner of the directory,
 		 * and we are hacking owners here, (only do this where told to)
 		 * and we are not giving it TO root, (would subvert quotas)
 		 * then go ahead and give it to the other user.
 		 * Note that this drops off the execute bits for security.
 		 */
 		if ((dvp->v_mount->mnt_flag & MNT_SUIDDIR) &&
 		    (pdir->i_mode & ISUID) &&
 		    (pdir->i_uid != cnp->cn_cred->cr_uid) && pdir->i_uid) {
 			ip->i_uid = pdir->i_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 			mode &= ~07111;
 #ifdef QUOTA
 			/*
 			 * Make sure the correct user gets charged
 			 * for the space.
 			 * Quickly knock up a dummy credential for the victim.
 			 * XXX This seems to never be accessed out of our
 			 * context so a stack variable is ok.
 			 */
 			refcount_init(&ucred.cr_ref, 1);
 			ucred.cr_uid = ip->i_uid;
 			ucred.cr_ngroups = 1;
 			ucred.cr_groups = &ucred_group;
 			ucred.cr_groups[0] = pdir->i_gid;
 			ucp = &ucred;
 #endif
 		} else {
 			ip->i_uid = cnp->cn_cred->cr_uid;
 			DIP_SET(ip, i_uid, ip->i_uid);
 		}
 
 #ifdef QUOTA
 		/*
 		 * Quota lookup or inode charge failed: give the freshly
 		 * allocated inode back and return.
 		 */
 		if ((error = getinoquota(ip)) ||
 	    	    (error = chkiq(ip, 1, ucp, 0))) {
 			if (DOINGSOFTDEP(tvp))
 				softdep_revert_link(pdir, ip);
 			UFS_VFREE(tvp, ip->i_number, mode);
 			vput(tvp);
 			return (error);
 		}
 #endif
 	}
 #else	/* !SUIDDIR */
 	ip->i_uid = cnp->cn_cred->cr_uid;
 	DIP_SET(ip, i_uid, ip->i_uid);
 #ifdef QUOTA
 	/*
 	 * Quota lookup or inode charge failed: give the freshly allocated
 	 * inode back and return.
 	 */
 	if ((error = getinoquota(ip)) ||
 	    (error = chkiq(ip, 1, cnp->cn_cred, 0))) {
 		if (DOINGSOFTDEP(tvp))
 			softdep_revert_link(pdir, ip);
 		UFS_VFREE(tvp, ip->i_number, mode);
 		vput(tvp);
 		return (error);
 	}
 #endif
 #endif	/* !SUIDDIR */
 	ip->i_flag |= IN_ACCESS | IN_CHANGE | IN_UPDATE;
 	ip->i_mode = mode;
 	DIP_SET(ip, i_mode, mode);
 	tvp->v_type = IFTOVT(mode);	/* Rest init'd in getnewvnode(). */
 	ip->i_effnlink = 1;
 	ip->i_nlink = 1;
 	DIP_SET(ip, i_nlink, 1);
 	if (DOINGSOFTDEP(tvp))
 		softdep_setup_create(VTOI(dvp), ip);
 	/*
 	 * Drop the set-group-ID bit when the caller is not a member of the
 	 * inode's group and the PRIV_VFS_SETGID privilege check fails.
 	 */
 	if ((ip->i_mode & ISGID) && !groupmember(ip->i_gid, cnp->cn_cred) &&
 	    priv_check_cred(cnp->cn_cred, PRIV_VFS_SETGID)) {
 		ip->i_mode &= ~ISGID;
 		DIP_SET(ip, i_mode, ip->i_mode);
 	}
 
 	if (cnp->cn_flags & ISWHITEOUT) {
 		ip->i_flags |= UF_OPAQUE;
 		DIP_SET(ip, i_flags, ip->i_flags);
 	}
 
 	/*
 	 * Make sure inode goes to disk before directory entry.
 	 */
 	error = UFS_UPDATE(tvp, !DOINGSOFTDEP(tvp) && !DOINGASYNC(tvp));
 	if (error)
 		goto bad;
 #ifdef MAC
 	if (dvp->v_mount->mnt_flag & MNT_MULTILABEL) {
 		error = mac_vnode_create_extattr(cnp->cn_cred, dvp->v_mount,
 		    dvp, tvp, cnp);
 		if (error)
 			goto bad;
 	}
 #endif
 #ifdef UFS_ACL
 	/* POSIX.1e ACLs take precedence over NFSv4 ACLs if both flags are set. */
 	if (dvp->v_mount->mnt_flag & MNT_ACLS) {
 		error = ufs_do_posix1e_acl_inheritance_file(dvp, tvp, mode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	} else if (dvp->v_mount->mnt_flag & MNT_NFS4ACLS) {
 		error = ufs_do_nfs4_acl_inheritance(dvp, tvp, mode,
 		    cnp->cn_cred, cnp->cn_thread);
 		if (error)
 			goto bad;
 	}
 #endif /* !UFS_ACL */
 	ufs_makedirentry(ip, cnp, &newdir);
 	error = ufs_direnter(dvp, tvp, &newdir, cnp, NULL, 0);
 	if (error)
 		goto bad;
 	/* Only now is the new vnode handed to the caller. */
 	*vpp = tvp;
 	return (0);
 
 bad:
 	/*
 	 * Write error occurred trying to update the inode
 	 * or the directory so must deallocate the inode.
 	 */
 	ip->i_effnlink = 0;
 	ip->i_nlink = 0;
 	DIP_SET(ip, i_nlink, 0);
 	ip->i_flag |= IN_CHANGE;
 	if (DOINGSOFTDEP(tvp))
 		softdep_revert_create(VTOI(dvp), ip);
 	vput(tvp);
 	return (error);
 }
 
 /*
  * Ioctl vnode op for ufs.
  *
  * Handles FIOSEEKDATA through ufs_bmap_seekdata() and FIOSEEKHOLE
  * through vn_bmap_seekhole(); in both cases ap->a_data is used as the
  * off_t argument.  Any other command returns ENOTTY.
  */
 static int
 ufs_ioctl(struct vop_ioctl_args *ap)
 {
 	struct vnode *vp;
+	int error;
 
 	vp = ap->a_vp;
 	switch (ap->a_command) {
 	case FIOSEEKDATA:
-		return (ufs_bmap_seekdata(vp, (off_t *)ap->a_data));
+		error = vn_lock(vp, LK_SHARED);
+		if (error == 0) {
+			error = ufs_bmap_seekdata(vp, (off_t *)ap->a_data);
+			VOP_UNLOCK(vp, 0);
+		} else
+			error = EBADF;
+		return (error);
 	case FIOSEEKHOLE:
 		return (vn_bmap_seekhole(vp, ap->a_command, (off_t *)ap->a_data,
 		    ap->a_cred));
 	default:
 		return (ENOTTY);
 	}
 }
 
 /*
  * Global vfs data structures for ufs.
  *
  * ufs_vnodeops is the vnode operations vector for ufs vnodes.  Any
  * operation not listed here is taken from default_vnodeops through
  * .vop_default.  The fsync, read, reallocblks and write slots are
  * VOP_PANIC in this vector (presumably they are meant to be supplied by
  * the concrete filesystem built on top of ufs -- confirm against the
  * users of this vector).  The MAC, extended attribute and ACL entries
  * are present only when the corresponding kernel options are compiled
  * in.
  */
 struct vop_vector ufs_vnodeops = {
 	.vop_default =		&default_vnodeops,
 	.vop_fsync =		VOP_PANIC,
 	.vop_read =		VOP_PANIC,
 	.vop_reallocblks =	VOP_PANIC,
 	.vop_write =		VOP_PANIC,
 	.vop_accessx =		ufs_accessx,
 	.vop_bmap =		ufs_bmap,
 	.vop_cachedlookup =	ufs_lookup,
 	.vop_close =		ufs_close,
 	.vop_create =		ufs_create,
 	.vop_getattr =		ufs_getattr,
 	.vop_inactive =		ufs_inactive,
 	.vop_ioctl =		ufs_ioctl,
 	.vop_link =		ufs_link,
 	.vop_lookup =		vfs_cache_lookup,
 	.vop_markatime =	ufs_markatime,
 	.vop_mkdir =		ufs_mkdir,
 	.vop_mknod =		ufs_mknod,
 	.vop_open =		ufs_open,
 	.vop_pathconf =		ufs_pathconf,
 	.vop_poll =		vop_stdpoll,
 	.vop_print =		ufs_print,
 	.vop_readdir =		ufs_readdir,
 	.vop_readlink =		ufs_readlink,
 	.vop_reclaim =		ufs_reclaim,
 	.vop_remove =		ufs_remove,
 	.vop_rename =		ufs_rename,
 	.vop_rmdir =		ufs_rmdir,
 	.vop_setattr =		ufs_setattr,
 #ifdef MAC
 	.vop_setlabel =		vop_stdsetlabel_ea,
 #endif
 	.vop_strategy =		ufs_strategy,
 	.vop_symlink =		ufs_symlink,
 	.vop_whiteout =		ufs_whiteout,
 #ifdef UFS_EXTATTR
 	.vop_getextattr =	ufs_getextattr,
 	.vop_deleteextattr =	ufs_deleteextattr,
 	.vop_setextattr =	ufs_setextattr,
 #endif
 #ifdef UFS_ACL
 	.vop_getacl =		ufs_getacl,
 	.vop_setacl =		ufs_setacl,
 	.vop_aclcheck =		ufs_aclcheck,
 #endif
 };
 
 /*
  * Vnode operations vector for fifos that live on a ufs filesystem.
  * Operations not listed here are taken from fifo_specops through
  * .vop_default; close and kqfilter go through the ufsfifo_* wrappers,
  * and the attribute, label, extended attribute and ACL operations are
  * the regular ufs ones.  The fsync, read and write slots are VOP_PANIC
  * in this vector.
  */
 struct vop_vector ufs_fifoops = {
 	.vop_default =		&fifo_specops,
 	.vop_fsync =		VOP_PANIC,
 	.vop_accessx =		ufs_accessx,
 	.vop_close =		ufsfifo_close,
 	.vop_getattr =		ufs_getattr,
 	.vop_inactive =		ufs_inactive,
 	.vop_kqfilter =		ufsfifo_kqfilter,
 	.vop_markatime =	ufs_markatime,
 	.vop_pathconf = 	ufs_pathconf,
 	.vop_print =		ufs_print,
 	.vop_read =		VOP_PANIC,
 	.vop_reclaim =		ufs_reclaim,
 	.vop_setattr =		ufs_setattr,
 #ifdef MAC
 	.vop_setlabel =		vop_stdsetlabel_ea,
 #endif
 	.vop_write =		VOP_PANIC,
 #ifdef UFS_EXTATTR
 	.vop_getextattr =	ufs_getextattr,
 	.vop_deleteextattr =	ufs_deleteextattr,
 	.vop_setextattr =	ufs_setextattr,
 #endif
 #ifdef UFS_ACL
 	.vop_getacl =		ufs_getacl,
 	.vop_setacl =		ufs_setacl,
 	.vop_aclcheck =		ufs_aclcheck,
 #endif
 };
Index: projects/nfsv42/sys
===================================================================
--- projects/nfsv42/sys	(revision 350367)
+++ projects/nfsv42/sys	(revision 350368)

Property changes on: projects/nfsv42/sys
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head/sys:r350326-350367