Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/fasttrap.c	(revision 271001)
@@ -1,2724 +1,2724 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  *
  * Portions Copyright 2010 The FreeBSD Foundation
  *
  * $FreeBSD$
  */
 
 /*
  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/atomic.h>
 #include <sys/errno.h>
 #include <sys/stat.h>
 #include <sys/modctl.h>
 #include <sys/conf.h>
 #include <sys/systm.h>
 #if defined(sun)
 #include <sys/ddi.h>
 #endif
 #include <sys/sunddi.h>
 #include <sys/cpuvar.h>
 #include <sys/kmem.h>
 #if defined(sun)
 #include <sys/strsubr.h>
 #endif
 #include <sys/fasttrap.h>
 #include <sys/fasttrap_impl.h>
 #include <sys/fasttrap_isa.h>
 #include <sys/dtrace.h>
 #include <sys/dtrace_impl.h>
 #include <sys/sysmacros.h>
 #include <sys/proc.h>
 #include <sys/policy.h>
 #if defined(sun)
 #include <util/qsort.h>
 #endif
 #include <sys/mutex.h>
 #include <sys/kernel.h>
 #if !defined(sun)
 #include <sys/dtrace_bsd.h>
 #include <sys/eventhandler.h>
 #include <sys/user.h>
 #include <vm/vm.h>
 #include <vm/pmap.h>
 #include <vm/vm_map.h>
 #include <vm/vm_param.h>
 #include <sys/u8_textprep.h>
 #include <cddl/dev/dtrace/dtrace_cddl.h>
 #endif
 
 /*
  * User-Land Trap-Based Tracing
  * ----------------------------
  *
  * The fasttrap provider allows DTrace consumers to instrument any user-level
  * instruction to gather data; this includes probes with semantic
  * significance like entry and return as well as simple offsets into the
  * function. While the specific techniques used are very ISA specific, the
  * methodology is generalizable to any architecture.
  *
  *
  * The General Methodology
  * -----------------------
  *
  * With the primary goal of tracing every user-land instruction and the
  * limitation that we can't trust user space so don't want to rely on much
  * information there, we begin by replacing the instructions we want to trace
  * with trap instructions. Each instruction we overwrite is saved into a hash
  * table keyed by process ID and pc address. When we enter the kernel due to
  * this trap instruction, we need the effects of the replaced instruction to
  * appear to have occurred before we proceed with the user thread's
  * execution.
  *
  * Each user level thread is represented by a ulwp_t structure which is
  * always easily accessible through a register. The most basic way to produce
  * the effects of the instruction we replaced is to copy that instruction out
  * to a bit of scratch space reserved in the user thread's ulwp_t structure
  * (a sort of kernel-private thread local storage), set the PC to that
  * scratch space and single step. When we reenter the kernel after single
  * stepping the instruction we must then adjust the PC to point to what would
  * normally be the next instruction. Of course, special care must be taken
  * for branches and jumps, but these represent such a small fraction of any
  * instruction set that writing the code to emulate these in the kernel is
  * not too difficult.
  *
  * Return probes may require several tracepoints to trace every return site,
  * and, conversely, each tracepoint may activate several probes (the entry
  * and offset 0 probes, for example). To solve this multiplexing problem,
  * tracepoints contain lists of probes to activate and probes contain lists
  * of tracepoints to enable. If a probe is activated, it adds its ID to
  * existing tracepoints or creates new ones as necessary.
  *
  * Most probes are activated _before_ the instruction is executed, but return
  * probes are activated _after_ the effects of the last instruction of the
  * function are visible. Return probes must be fired _after_ we have
  * single-stepped the instruction whereas all other probes are fired
  * beforehand.
  *
  *
  * Lock Ordering
  * -------------
  *
  * The lock ordering below -- both internally and with respect to the DTrace
  * framework -- is a little tricky and bears some explanation. Each provider
  * has a lock (ftp_mtx) that protects its members including reference counts
  * for enabled probes (ftp_rcount), consumers actively creating probes
  * (ftp_ccount) and USDT consumers (ftp_mcount); all three prevent a provider
  * from being freed. A provider is looked up by taking the bucket lock for the
  * provider hash table, and is returned with its lock held. The provider lock
  * may be taken in functions invoked by the DTrace framework, but may not be
  * held while calling functions in the DTrace framework.
  *
  * To ensure consistency over multiple calls to the DTrace framework, the
  * creation lock (ftp_cmtx) should be held. Naturally, the creation lock may
  * not be taken when holding the provider lock as that would create a cyclic
  * lock ordering. In situations where one would naturally take the provider
  * lock and then the creation lock, we instead up a reference count to prevent
  * the provider from disappearing, drop the provider lock, and acquire the
  * creation lock.
  *
  * Briefly:
  * 	bucket lock before provider lock
  *	DTrace before provider lock
  *	creation lock before DTrace
  *	never hold the provider lock and creation lock simultaneously
  */
 
 /* Character device entry points; only open and ioctl are provided. */
 static d_open_t fasttrap_open;
 static d_ioctl_t fasttrap_ioctl;
 
 static struct cdevsw fasttrap_cdevsw = {
 	.d_version	= D_VERSION,
 	.d_open		= fasttrap_open,
 	.d_ioctl	= fasttrap_ioctl,
 	.d_name		= "fasttrap",
 };
 static struct cdev *fasttrap_cdev;
 static dtrace_meta_provider_id_t fasttrap_meta_id;
 
 /*
  * State shared with the asynchronous cleanup thread, all protected by
  * fasttrap_cleanup_mtx:
  *
  *	fasttrap_cleanup_work	set by fasttrap_pid_cleanup() to request a pass
  *				over the provider table
  *	fasttrap_cleanup_drain	when set, the cleanup loop terminates once no
  *				retries are pending; its address is also the
  *				wakeup channel signalled when the thread exits
  *	fasttrap_cleanup_cv	never read as a value; its address is the
  *				sleep/wakeup channel the cleanup thread waits on
  */
 static struct proc *fasttrap_cleanup_proc;
 static struct mtx fasttrap_cleanup_mtx;
 static uint_t fasttrap_cleanup_work, fasttrap_cleanup_drain, fasttrap_cleanup_cv;
 
 /*
  * Generation count on modifications to the global tracepoint lookup table.
  */
 static volatile uint64_t fasttrap_mod_gen;
 
 /*
  * When the fasttrap provider is loaded, fasttrap_max is set to either
  * FASTTRAP_MAX_DEFAULT or the value for fasttrap-max-probes in the
  * fasttrap.conf file. Each time a probe is created, fasttrap_total is
  * incremented by the number of tracepoints that may be associated with that
  * probe; fasttrap_total is capped at fasttrap_max.
  */
 #define	FASTTRAP_MAX_DEFAULT		250000
 static uint32_t fasttrap_max;
 static uint32_t fasttrap_total;
 
 /*
  * Copyright (c) 2011, Joyent, Inc. All rights reserved.
  */
 
 /*
  * Default sizes for the three hash tables declared below.
  * NOTE(review): presumably consumed by the module load path, which is not
  * visible in this part of the file -- confirm.
  */
 #define	FASTTRAP_TPOINTS_DEFAULT_SIZE	0x4000
 #define	FASTTRAP_PROVIDERS_DEFAULT_SIZE	0x100
 #define	FASTTRAP_PROCS_DEFAULT_SIZE	0x100
 
 /* Provider name handed to fasttrap_provider_retire() on exec/exit. */
 #define	FASTTRAP_PID_NAME		"pid"
 
 /*
  * Global hash tables: tracepoints (indexed by pid and pc), providers (indexed
  * by pid and name) and per-process handles (indexed by pid).
  */
 fasttrap_hash_t			fasttrap_tpoints;
 static fasttrap_hash_t		fasttrap_provs;
 static fasttrap_hash_t		fasttrap_procs;
 
 static uint64_t			fasttrap_pid_count;	/* pid ref count */
 static kmutex_t			fasttrap_count_mtx;	/* lock on ref count */
 
 /*
  * Non-zero results of fasttrap_tracepoint_enable(): FAIL means the tracepoint
  * could not be initialized; PARTIAL means it was added to the hash table but
  * could not be installed in the process and must still be disabled.
  */
 #define	FASTTRAP_ENABLE_FAIL	1
 #define	FASTTRAP_ENABLE_PARTIAL	2
 
 static int fasttrap_tracepoint_enable(proc_t *, fasttrap_probe_t *, uint_t);
 static void fasttrap_tracepoint_disable(proc_t *, fasttrap_probe_t *, uint_t);
 
 static fasttrap_provider_t *fasttrap_provider_lookup(pid_t, const char *,
     const dtrace_pattr_t *);
 static void fasttrap_provider_retire(pid_t, const char *, int);
 static void fasttrap_provider_free(fasttrap_provider_t *);
 
 static fasttrap_proc_t *fasttrap_proc_lookup(pid_t);
 static void fasttrap_proc_release(fasttrap_proc_t *);
 
 #if !defined(sun)
 static void fasttrap_thread_dtor(void *, struct thread *);
 #endif
 
 /*
  * Bucket index into fasttrap_provs for a (pid, provider name) pair: the
  * string hash of the name plus the pid, masked to the table size.
  */
 #define	FASTTRAP_PROVS_INDEX(pid, name) \
 	((fasttrap_hash_str(name) + (pid)) & fasttrap_provs.fth_mask)
 
 /* Bucket index into fasttrap_procs: the pid masked to the table size. */
 #define	FASTTRAP_PROCS_INDEX(pid) ((pid) & fasttrap_procs.fth_mask)
 
 #if !defined(sun)
 /*
  * One lock per CPU slot; fasttrap_mod_barrier() acquires and releases each
  * of them in turn to wait out threads currently holding one.
  */
 static kmutex_t fasttrap_cpuc_pid_lock[MAXCPU];
 /* NOTE(review): presumably the registration tag for fasttrap_thread_dtor() -- confirm. */
 static eventhandler_tag fasttrap_thread_dtor_tag;
 #endif
 
 /*
  * Return the 1-based position of the most significant set bit in i, or 0
  * when i is zero.
  */
 static int
 fasttrap_highbit(ulong_t i)
 {
 	ulong_t upper;
 	int pos, width;
 
 	if (i == 0)
 		return (0);
 
 	pos = 1;
 #ifdef _LP64
 	if ((i & 0xffffffff00000000ul) != 0) {
 		pos += 32;
 		i >>= 32;
 	}
 #endif
 	/*
 	 * Binary search the low 32 bits. At each step the upper half of the
 	 * current window is tested (the masks are 0xffff0000, 0xff00, 0xf0,
 	 * 0xc and 0x2) and shifted down when it holds a set bit.
 	 */
 	for (width = 16; width > 0; width >>= 1) {
 		upper = ((1ul << width) - 1) << width;
 		if ((i & upper) != 0) {
 			pos += width;
 			i >>= width;
 		}
 	}
 
 	return (pos);
 }
 
 /*
  * Hash a NUL-terminated string: each character is folded in after shifting
  * the running value left by four bits, and the top nibble is mixed back into
  * the low bits and then cleared.
  */
 static uint_t
 fasttrap_hash_str(const char *p)
 {
 	uint_t hash = 0;
 	unsigned int top;
 
 	for (; *p != '\0'; p++) {
 		hash = (hash << 4) + *p;
 		top = hash & 0xf0000000;
 		if (top != 0)
 			hash ^= top >> 24;
 		hash &= ~top;
 	}
 
 	return (hash);
 }
 
 /*
  * Post a SIGTRAP carrying si_code TRAP_DTRACE and si_addr pc.
  *
  *	p	target process; its process lock is taken and dropped here
  *	t	thread to direct the signal at, or NULL to signal the process
  *		as a whole (fasttrap_tracepoint_disable() passes NULL)
  *	pc	address reported in the signal information
  */
 void
 fasttrap_sigtrap(proc_t *p, kthread_t *t, uintptr_t pc)
 {
 #if defined(sun)
 	sigqueue_t *sqp = kmem_zalloc(sizeof (sigqueue_t), KM_SLEEP);
 
 	sqp->sq_info.si_signo = SIGTRAP;
 	sqp->sq_info.si_code = TRAP_DTRACE;
 	sqp->sq_info.si_addr = (caddr_t)pc;
 
 	mutex_enter(&p->p_lock);
 	sigaddqa(p, t, sqp);
 	mutex_exit(&p->p_lock);
 
 	if (t != NULL)
 		aston(t);
 #else
 	/*
 	 * The signal code copies the ksiginfo it is handed into its own
 	 * queue entry, so a heap allocation here would never be freed. Build
 	 * the structure on the stack instead.
 	 */
 	ksiginfo_t ksi;
 
 	ksiginfo_init(&ksi);
 	ksi.ksi_signo = SIGTRAP;
 	ksi.ksi_code = TRAP_DTRACE;
 	ksi.ksi_addr = (caddr_t)pc;
 	PROC_LOCK(p);
 	/*
 	 * tdksignal() derives the process from the thread and so cannot be
 	 * used without one; fall back to a process-directed signal.
 	 */
 	if (t != NULL)
 		(void) tdksignal(t, SIGTRAP, &ksi);
 	else
 		(void) pksignal(p, SIGTRAP, &ksi);
 	PROC_UNLOCK(p);
 #endif
 }
 
 #if !defined(sun)
 /*
  * Obtain a chunk of scratch space in the address space of the target process.
  *
  * Returns the chunk already reserved for td if there is one. Otherwise a
  * chunk is taken from the process' free list, which is first refilled by
  * mapping a new scratch block into the process when it is empty. Returns
  * NULL if that mapping fails.
  */
 fasttrap_scrspace_t *
 fasttrap_scraddr(struct thread *td, fasttrap_proc_t *fprc)
 {
 	fasttrap_scrblock_t *blk;
 	fasttrap_scrspace_t *chunk;
 	struct proc *p;
 	vm_offset_t base;
 	int n, rv;
 
 	/* A thread that already owns a chunk keeps using it. */
 	if (td->t_dtrace_sscr != NULL)
 		return ((fasttrap_scrspace_t *)td->t_dtrace_sscr);
 
 	p = td->td_proc;
 
 	mutex_enter(&fprc->ftpc_mtx);
 	if (LIST_EMPTY(&fprc->ftpc_fscr)) {
 		/*
 		 * The free list is exhausted: map a fresh scratch block into
 		 * the traced process' address space.
 		 */
 		base = 0;
 		rv = vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &base,
 		    FASTTRAP_SCRBLOCK_SIZE, 0, VMFS_ANY_SPACE, VM_PROT_ALL,
 		    VM_PROT_ALL, 0);
 		if (rv != KERN_SUCCESS) {
 			mutex_exit(&fprc->ftpc_mtx);
 			return (NULL);
 		}
 
 		blk = malloc(sizeof(*blk), M_SOLARIS, M_WAITOK);
 		blk->ftsb_addr = base;
 		LIST_INSERT_HEAD(&fprc->ftpc_scrblks, blk, ftsb_next);
 
 		/*
 		 * Split the block into fixed-size chunks and queue each one
 		 * on the free list.
 		 */
 		for (n = 0;
 		    n < FASTTRAP_SCRBLOCK_SIZE / FASTTRAP_SCRSPACE_SIZE; n++) {
 			chunk = malloc(sizeof(*chunk), M_SOLARIS, M_WAITOK);
 			chunk->ftss_addr = base +
 			    n * FASTTRAP_SCRSPACE_SIZE;
 			LIST_INSERT_HEAD(&fprc->ftpc_fscr, chunk,
 			    ftss_next);
 		}
 	}
 
 	/*
 	 * Move the head of the free list onto the allocated list and hand it
 	 * to td, which keeps it until the thread exits.
 	 */
 	chunk = LIST_FIRST(&fprc->ftpc_fscr);
 	LIST_REMOVE(chunk, ftss_next);
 	LIST_INSERT_HEAD(&fprc->ftpc_ascr, chunk, ftss_next);
 	td->t_dtrace_sscr = chunk;
 
 	mutex_exit(&fprc->ftpc_mtx);
 
 	return (chunk);
 }
 
 /*
  * Thread destructor hook: hand the scratch chunk held by td, if any, back to
  * the free list of its process. Nothing is done when the thread holds no
  * chunk or when no fasttrap process handle exists for its pid.
  */
 static void
 fasttrap_thread_dtor(void *arg __unused, struct thread *td)
 {
 	fasttrap_bucket_t *bkt;
 	fasttrap_proc_t *fprc;
 	fasttrap_scrspace_t *chunk;
 	pid_t pid;
 
 	if (td->t_dtrace_sscr == NULL)
 		return;
 
 	pid = td->td_proc->p_pid;
 	bkt = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
 
 	/* Walk the bucket chain for the handle that matches this pid. */
 	mutex_enter(&bkt->ftb_mtx);
 	fprc = bkt->ftb_data;
 	while (fprc != NULL && fprc->ftpc_pid != pid)
 		fprc = fprc->ftpc_next;
 
 	if (fprc == NULL) {
 		mutex_exit(&bkt->ftb_mtx);
 		return;
 	}
 
 	/* Take the handle's lock before letting go of the bucket lock. */
 	mutex_enter(&fprc->ftpc_mtx);
 	mutex_exit(&bkt->ftb_mtx);
 
 	/* Move the chunk from whichever list it is on to the free list. */
 	chunk = (fasttrap_scrspace_t *)td->t_dtrace_sscr;
 	LIST_REMOVE(chunk, ftss_next);
 	LIST_INSERT_HEAD(&fprc->ftpc_fscr, chunk, ftss_next);
 
 	mutex_exit(&fprc->ftpc_mtx);
 }
 #endif
 
 /*
  * Make sure no thread is still using memory that belonged to probes which
  * were live in generation gen. If gen is older than the current generation a
  * barrier has already been imposed and there is nothing to do. Otherwise the
  * generation is advanced and every per-CPU lock is acquired and released in
  * turn.
  */
 static void
 fasttrap_mod_barrier(uint64_t gen)
 {
 	int cpu;
 
 	if (gen >= fasttrap_mod_gen) {
 		fasttrap_mod_gen++;
 
 		CPU_FOREACH(cpu) {
 			mutex_enter(&fasttrap_cpuc_pid_lock[cpu]);
 			mutex_exit(&fasttrap_cpuc_pid_lock[cpu]);
 		}
 	}
 }
 
 /*
  * This function performs asynchronous cleanup of fasttrap providers. The
  * Solaris implementation of this mechanism uses a timeout that's activated in
  * fasttrap_pid_cleanup(), but this doesn't work in FreeBSD: one may sleep while
  * holding the DTrace mutexes, but it is unsafe to sleep in a callout handler.
  * Thus we use a dedicated process to perform the cleanup when requested.
  *
  * The loop runs until fasttrap_cleanup_drain is set and no provider is left
  * waiting for a retry; the thread then wakes any sleeper on
  * &fasttrap_cleanup_drain and exits. The data argument is unused.
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_cleanup_cb(void *data)
 {
 	fasttrap_provider_t **fpp, *fp;
 	fasttrap_bucket_t *bucket;
 	dtrace_provider_id_t provid;
 	int i, later = 0, rval;
 
 	mtx_lock(&fasttrap_cleanup_mtx);
 	while (!fasttrap_cleanup_drain || later > 0) {
 		/*
 		 * Clear the request flag before scanning so that a request
 		 * posted during the scan is seen afterwards.
 		 */
 		fasttrap_cleanup_work = 0;
 		mtx_unlock(&fasttrap_cleanup_mtx);
 
 		later = 0;
 
 		/*
 		 * Iterate over all the providers trying to remove the marked
 		 * ones. If a provider is marked but not retired, we just
 		 * have to take a crack at removing it -- it's no big deal if
 		 * we can't.
 		 */
 		for (i = 0; i < fasttrap_provs.fth_nent; i++) {
 			bucket = &fasttrap_provs.fth_table[i];
 			mutex_enter(&bucket->ftb_mtx);
 			fpp = (fasttrap_provider_t **)&bucket->ftb_data;
 
 			while ((fp = *fpp) != NULL) {
 				if (!fp->ftp_marked) {
 					fpp = &fp->ftp_next;
 					continue;
 				}
 
 				mutex_enter(&fp->ftp_mtx);
 
 				/*
 				 * If this provider has consumers actively
 				 * creating probes (ftp_ccount) or is a USDT
 				 * provider (ftp_mcount), we can't unregister
 				 * or even condense.
 				 */
 				if (fp->ftp_ccount != 0 ||
 				    fp->ftp_mcount != 0) {
 					mutex_exit(&fp->ftp_mtx);
 					/*
 					 * fpp is not advanced here; the next
 					 * iteration sees the cleared mark and
 					 * steps past this provider.
 					 */
 					fp->ftp_marked = 0;
 					continue;
 				}
 
 				if (!fp->ftp_retired || fp->ftp_rcount != 0)
 					fp->ftp_marked = 0;
 
 				mutex_exit(&fp->ftp_mtx);
 
 				/*
 				 * If we successfully unregister this
 				 * provider we can remove it from the hash
 				 * chain and free the memory. If our attempt
 				 * to unregister fails and this is a retired
 				 * provider, increment our flag to try again
 				 * pretty soon. If we've consumed more than
 				 * half of our total permitted number of
 				 * probes call dtrace_condense() to try to
 				 * clean out the unenabled probes.
 				 */
 				provid = fp->ftp_provid;
 				if ((rval = dtrace_unregister(provid)) != 0) {
 					if (fasttrap_total > fasttrap_max / 2)
 						(void) dtrace_condense(provid);
 
 					if (rval == EAGAIN)
 						fp->ftp_marked = 1;
 
 					later += fp->ftp_marked;
 					fpp = &fp->ftp_next;
 				} else {
 					*fpp = fp->ftp_next;
 					fasttrap_provider_free(fp);
 				}
 			}
 			mutex_exit(&bucket->ftb_mtx);
 		}
 		mtx_lock(&fasttrap_cleanup_mtx);
 
 		/*
 		 * If we were unable to retire a provider, try again after a
 		 * second. This situation can occur in certain circumstances
 		 * where providers cannot be unregistered even though they have
 		 * no probes enabled because of an execution of dtrace -l or
 		 * something similar.
 		 */
 		if (later > 0 || fasttrap_cleanup_work ||
 		    fasttrap_cleanup_drain) {
 			mtx_unlock(&fasttrap_cleanup_mtx);
 			pause("ftclean", hz);
 			mtx_lock(&fasttrap_cleanup_mtx);
 		} else
 			/* Idle until fasttrap_pid_cleanup() posts new work. */
 			mtx_sleep(&fasttrap_cleanup_cv, &fasttrap_cleanup_mtx,
 			    0, "ftcl", 0);
 	}
 
 	/*
 	 * Wake up the thread in fasttrap_unload() now that we're done.
 	 */
 	wakeup(&fasttrap_cleanup_drain);
 	mtx_unlock(&fasttrap_cleanup_mtx);
 
 	kthread_exit();
 }
 
 /*
  * Request a pass of the asynchronous cleanup thread. If a request is
  * already pending this is a no-op; otherwise the request flag is raised and
  * the thread sleeping on &fasttrap_cleanup_cv is woken.
  */
 static void
 fasttrap_pid_cleanup(void)
 {
 
 	mtx_lock(&fasttrap_cleanup_mtx);
 	if (fasttrap_cleanup_work == 0) {
 		fasttrap_cleanup_work = 1;
 		wakeup(&fasttrap_cleanup_cv);
 	}
 	mtx_unlock(&fasttrap_cleanup_mtx);
 }
 
 /*
  * This is called from cfork() via dtrace_fasttrap_fork(). The child
  * process's address space is (roughly) a copy of the parent process's so
  * we have to remove all the instrumentation we had previously enabled in the
  * parent.
  *
  *	p	parent process
  *	cp	child process being created
  *
  * On FreeBSD the parent's process lock must be held on entry. Both process
  * locks are dropped while the child's address space is modified and are held
  * again on return. The child is also expected to be locked on entry, as it
  * is unlocked here.
  */
 static void
 fasttrap_fork(proc_t *p, proc_t *cp)
 {
 #if !defined(sun)
 	fasttrap_scrblock_t *scrblk;
 	fasttrap_proc_t *fprc = NULL;
 #endif
 	pid_t ppid = p->p_pid;
 	int i;
 
 #if defined(sun)
 	ASSERT(curproc == p);
 	ASSERT(p->p_proc_flag & P_PR_LOCK);
 #else
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 #endif
 #if defined(sun)
 	ASSERT(p->p_dtrace_count > 0);
 #else
 	if (p->p_dtrace_helpers) {
 		/*
 		 * dtrace_helpers_duplicate() allocates memory.
 		 */
 		_PHOLD(cp);
 		PROC_UNLOCK(p);
 		PROC_UNLOCK(cp);
 		dtrace_helpers_duplicate(p, cp);
 		PROC_LOCK(cp);
 		PROC_LOCK(p);
 		_PRELE(cp);
 	}
 	/*
 	 * This check is purposely here instead of in kern_fork.c because,
 	 * for legal reasons, we cannot include the dtrace_cddl.h header
 	 * inside kern_fork.c and insert if-clause there.
 	 */
 	if (p->p_dtrace_count == 0)
 		return;
 #endif
 	ASSERT(cp->p_dtrace_count == 0);
 
 	/*
 	 * This would be simpler and faster if we maintained per-process
 	 * hash tables of enabled tracepoints. It could, however, potentially
 	 * slow down execution of a tracepoint since we'd need to go
 	 * through two levels of indirection. In the future, we should
 	 * consider either maintaining per-process ancillary lists of
 	 * enabled tracepoints or hanging a pointer to a per-process hash
 	 * table of enabled tracepoints off the proc structure.
 	 */
 
 	/*
 	 * We don't have to worry about the child process disappearing
 	 * because we're in fork().
 	 */
 #if defined(sun)
 	mtx_lock_spin(&cp->p_slock);
 	sprlock_proc(cp);
 	mtx_unlock_spin(&cp->p_slock);
 #else
 	/*
 	 * fasttrap_tracepoint_remove() expects the child process to be
 	 * unlocked and the VM then expects curproc to be unlocked.
 	 */
 	_PHOLD(cp);
 	PROC_UNLOCK(cp);
 	PROC_UNLOCK(p);
 #endif
 
 	/*
 	 * Iterate over every tracepoint looking for ones that belong to the
 	 * parent process, and remove each from the child process.
 	 */
 	for (i = 0; i < fasttrap_tpoints.fth_nent; i++) {
 		fasttrap_tracepoint_t *tp;
 		fasttrap_bucket_t *bucket = &fasttrap_tpoints.fth_table[i];
 
 		mutex_enter(&bucket->ftb_mtx);
 		for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
 			if (tp->ftt_pid == ppid &&
 			    tp->ftt_proc->ftpc_acount != 0) {
 				int ret = fasttrap_tracepoint_remove(cp, tp);
 				ASSERT(ret == 0);
 
 				/*
 				 * The count of active providers can only be
 				 * decremented (i.e. to zero) during exec,
 				 * exit, and removal of a meta provider so it
 				 * should be impossible to drop the count
 				 * mid-fork.
 				 */
 				ASSERT(tp->ftt_proc->ftpc_acount != 0);
 #if !defined(sun)
 				/* Remember the parent's handle for below. */
 				fprc = tp->ftt_proc;
 #endif
 			}
 		}
 		mutex_exit(&bucket->ftb_mtx);
 	}
 
 #if !defined(sun)
 	/*
 	 * Unmap any scratch space inherited from the parent's address
 	 * space. This is done once, after the scan, rather than after every
 	 * bucket: the set of scratch blocks does not depend on the bucket
 	 * being visited, so repeating the pass only re-removes ranges that
 	 * are already gone.
 	 */
 	if (fprc != NULL) {
 		mutex_enter(&fprc->ftpc_mtx);
 		LIST_FOREACH(scrblk, &fprc->ftpc_scrblks, ftsb_next) {
 			vm_map_remove(&cp->p_vmspace->vm_map,
 			    scrblk->ftsb_addr,
 			    scrblk->ftsb_addr + FASTTRAP_SCRBLOCK_SIZE);
 		}
 		mutex_exit(&fprc->ftpc_mtx);
 	}
 #endif
 
 #if defined(sun)
 	mutex_enter(&cp->p_lock);
 	sprunlock(cp);
 #else
 	PROC_LOCK(p);
 	PROC_LOCK(cp);
 	_PRELE(cp);
 #endif
 }
 
 /*
  * This is called from proc_exit() or from exec_common() if p_dtrace_probes
  * is set on the proc structure to indicate that there is a pid provider
  * associated with this process.
  *
  * On FreeBSD the process lock must be held on entry; it is dropped while the
  * provider is retired and helpers are destroyed, and is held again on return.
  */
 static void
 fasttrap_exec_exit(proc_t *p)
 {
 #if !defined(sun)
 	struct thread *td;
 #endif
 
 #if defined(sun)
 	ASSERT(p == curproc);
 #else
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	/* Hold the process across the window in which it is unlocked. */
 	_PHOLD(p);
 	/*
 	 * Since struct threads may be recycled, we cannot rely on t_dtrace_sscr
 	 * fields to be zeroed by kdtrace_thread_ctor. Thus we must zero it
 	 * ourselves when a process exits.
 	 */
 	FOREACH_THREAD_IN_PROC(p, td)
 		td->t_dtrace_sscr = NULL;
 	PROC_UNLOCK(p);
 #endif
 
 	/*
 	 * We clean up the pid provider for this process here; user-land
 	 * static probes are handled by the meta-provider remove entry point.
 	 */
 	fasttrap_provider_retire(p->p_pid, FASTTRAP_PID_NAME, 0);
 #if !defined(sun)
 	if (p->p_dtrace_helpers)
 		dtrace_helpers_destroy(p);
 	/* Restore the locking state the caller handed us. */
 	PROC_LOCK(p);
 	_PRELE(p);
 #endif
 }
 
 
 /*
  * Provide entry point for the pid provider. Both arguments are ignored and
  * the function intentionally does nothing.
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_provide(void *arg, dtrace_probedesc_t *desc)
 {
 	/*
 	 * There are no "default" pid probes.
 	 */
 }
 
 /*
  * Enable the tracepoint at position index of probe in process p.
  *
  * If a live tracepoint for the same pid and pc is already in the hash table,
  * the probe's id is simply linked onto it. Otherwise the tracepoint that was
  * preallocated with the probe is initialized, inserted into the table and
  * installed in the process.
  *
  * Returns 0 on success, FASTTRAP_ENABLE_FAIL if the tracepoint could not be
  * initialized, or FASTTRAP_ENABLE_PARTIAL if it was inserted into the table
  * but could not be installed in the process.
  */
 static int
 fasttrap_tracepoint_enable(proc_t *p, fasttrap_probe_t *probe, uint_t index)
 {
 	fasttrap_tracepoint_t *tp, *new_tp = NULL;
 	fasttrap_bucket_t *bucket;
 	fasttrap_id_t *id;
 	pid_t pid;
 	uintptr_t pc;
 
 	ASSERT(index < probe->ftp_ntps);
 
 	pid = probe->ftp_pid;
 	pc = probe->ftp_tps[index].fit_tp->ftt_pc;
 	id = &probe->ftp_tps[index].fit_id;
 
 	ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid);
 
 #if defined(sun)
 	ASSERT(!(p->p_flag & SVFORK));
 #endif
 
 	/*
 	 * Before we make any modifications, make sure we've imposed a barrier
 	 * on the generation in which this probe was last modified.
 	 */
 	fasttrap_mod_barrier(probe->ftp_gen);
 
 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
 
 	/*
 	 * If the tracepoint has already been enabled, just add our id to the
 	 * list of interested probes. This may be our second time through
 	 * this path in which case we'll have constructed the tracepoint we'd
 	 * like to install. If we can't find a match, and have an allocated
 	 * tracepoint ready to go, enable that one now.
 	 *
 	 * A tracepoint whose process is defunct is also considered defunct.
 	 */
 again:
 	mutex_enter(&bucket->ftb_mtx);
 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
 		/*
 		 * Note that it's safe to access the active count on the
 		 * associated proc structure because we know that at least one
 		 * provider (this one) will still be around throughout this
 		 * operation.
 		 */
 		if (tp->ftt_pid != pid || tp->ftt_pc != pc ||
 		    tp->ftt_proc->ftpc_acount == 0)
 			continue;
 
 		/*
 		 * Now that we've found a matching tracepoint, it would be
 		 * a decent idea to confirm that the tracepoint is still
 		 * enabled and the trap instruction hasn't been overwritten.
 		 * Since this is a little hairy, we'll punt for now.
 		 */
 
 		/*
 		 * This can't be the first interested probe. We don't have
 		 * to worry about another thread being in the midst of
 		 * deleting this tracepoint (which would be the only valid
 		 * reason for a tracepoint to have no interested probes)
 		 * since we're holding P_PR_LOCK for this process.
 		 */
 		ASSERT(tp->ftt_ids != NULL || tp->ftt_retids != NULL);
 
 		/*
 		 * Link the id at the head of the matching list. The id's next
 		 * pointer is set before the head is updated, with a producer
 		 * barrier in between.
 		 */
 		switch (id->fti_ptype) {
 		case DTFTP_ENTRY:
 		case DTFTP_OFFSETS:
 		case DTFTP_IS_ENABLED:
 			id->fti_next = tp->ftt_ids;
 			membar_producer();
 			tp->ftt_ids = id;
 			membar_producer();
 			break;
 
 		case DTFTP_RETURN:
 		case DTFTP_POST_OFFSETS:
 			id->fti_next = tp->ftt_retids;
 			membar_producer();
 			tp->ftt_retids = id;
 			membar_producer();
 			break;
 
 		default:
 			ASSERT(0);
 		}
 
 		mutex_exit(&bucket->ftb_mtx);
 
 		/*
 		 * A tracepoint prepared on a previous pass is not needed
 		 * after all; reset its id lists so it is unused again.
 		 */
 		if (new_tp != NULL) {
 			new_tp->ftt_ids = NULL;
 			new_tp->ftt_retids = NULL;
 		}
 
 		return (0);
 	}
 
 	/*
 	 * If we have a good tracepoint ready to go, install it now while
 	 * we have the lock held and no one can screw with us.
 	 */
 	if (new_tp != NULL) {
 		int rc = 0;
 
 		new_tp->ftt_next = bucket->ftb_data;
 		membar_producer();
 		bucket->ftb_data = new_tp;
 		membar_producer();
 		mutex_exit(&bucket->ftb_mtx);
 
 		/*
 		 * Activate the tracepoint in the ISA-specific manner.
 		 * If this fails, we need to report the failure, but
 		 * indicate that this tracepoint must still be disabled
 		 * by calling fasttrap_tracepoint_disable().
 		 */
 		if (fasttrap_tracepoint_install(p, new_tp) != 0)
 			rc = FASTTRAP_ENABLE_PARTIAL;
 
 		/*
 		 * Increment the count of the number of tracepoints active in
 		 * the victim process.
 		 */
 #if defined(sun)
 		ASSERT(p->p_proc_flag & P_PR_LOCK);
 #endif
 		p->p_dtrace_count++;
 
 		return (rc);
 	}
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	/*
 	 * Initialize the tracepoint that's been preallocated with the probe.
 	 */
 	new_tp = probe->ftp_tps[index].fit_tp;
 
 	ASSERT(new_tp->ftt_pid == pid);
 	ASSERT(new_tp->ftt_pc == pc);
 	ASSERT(new_tp->ftt_proc == probe->ftp_prov->ftp_proc);
 	ASSERT(new_tp->ftt_ids == NULL);
 	ASSERT(new_tp->ftt_retids == NULL);
 
 	switch (id->fti_ptype) {
 	case DTFTP_ENTRY:
 	case DTFTP_OFFSETS:
 	case DTFTP_IS_ENABLED:
 		id->fti_next = NULL;
 		new_tp->ftt_ids = id;
 		break;
 
 	case DTFTP_RETURN:
 	case DTFTP_POST_OFFSETS:
 		id->fti_next = NULL;
 		new_tp->ftt_retids = id;
 		break;
 
 	default:
 		ASSERT(0);
 	}
 
 	/*
 	 * If the ISA-dependent initialization goes to plan, go back to the
 	 * beginning and try to install this freshly made tracepoint.
 	 */
 	if (fasttrap_tracepoint_init(p, new_tp, pc, id->fti_ptype) == 0)
 		goto again;
 
 	/* Initialization failed: leave the preallocated tracepoint unused. */
 	new_tp->ftt_ids = NULL;
 	new_tp->ftt_retids = NULL;
 
 	return (FASTTRAP_ENABLE_FAIL);
 }
 
 /*
  * Disable the tracepoint at position index of probe.
  *
  * The probe's id is unlinked from the tracepoint. If other ids remain, the
  * tracepoint stays in place. Otherwise the original instruction is restored
  * in process p (when p is not NULL) and the tracepoint is removed from the
  * hash table. In both cases the probe is tagged with the current
  * modification generation.
  */
 static void
 fasttrap_tracepoint_disable(proc_t *p, fasttrap_probe_t *probe, uint_t index)
 {
 	fasttrap_bucket_t *bucket;
 	fasttrap_provider_t *provider = probe->ftp_prov;
 	fasttrap_tracepoint_t **pp, *tp;
 	fasttrap_id_t *id, **idp = NULL;
 	pid_t pid;
 	uintptr_t pc;
 
 	ASSERT(index < probe->ftp_ntps);
 
 	pid = probe->ftp_pid;
 	pc = probe->ftp_tps[index].fit_tp->ftt_pc;
 	id = &probe->ftp_tps[index].fit_id;
 
 	ASSERT(probe->ftp_tps[index].fit_tp->ftt_pid == pid);
 
 	/*
 	 * Find the tracepoint and make sure that our id is one of the
 	 * ones registered with it.
 	 */
 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
 	mutex_enter(&bucket->ftb_mtx);
 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
 		if (tp->ftt_pid == pid && tp->ftt_pc == pc &&
 		    tp->ftt_proc == provider->ftp_proc)
 			break;
 	}
 
 	/*
 	 * If we somehow lost this tracepoint, we're in a world of hurt.
 	 */
 	ASSERT(tp != NULL);
 
 	/* Pick the id list that matches this probe's type. */
 	switch (id->fti_ptype) {
 	case DTFTP_ENTRY:
 	case DTFTP_OFFSETS:
 	case DTFTP_IS_ENABLED:
 		ASSERT(tp->ftt_ids != NULL);
 		idp = &tp->ftt_ids;
 		break;
 
 	case DTFTP_RETURN:
 	case DTFTP_POST_OFFSETS:
 		ASSERT(tp->ftt_retids != NULL);
 		idp = &tp->ftt_retids;
 		break;
 
 	default:
 		ASSERT(0);
 	}
 
 	/* Walk the list to the entry that belongs to this probe. */
 	while ((*idp)->fti_probe != probe) {
 		idp = &(*idp)->fti_next;
 		ASSERT(*idp != NULL);
 	}
 
 	/* Unlink it. */
 	id = *idp;
 	*idp = id->fti_next;
 	membar_producer();
 
 	ASSERT(id->fti_probe == probe);
 
 	/*
 	 * If there are other registered enablings of this tracepoint, we're
 	 * all done, but if this was the last probe associated with this
 	 * tracepoint, we need to remove and free it.
 	 */
 	if (tp->ftt_ids != NULL || tp->ftt_retids != NULL) {
 
 		/*
 		 * If the current probe's tracepoint is in use, swap it
 		 * for an unused tracepoint.
 		 */
 		if (tp == probe->ftp_tps[index].fit_tp) {
 			fasttrap_probe_t *tmp_probe;
 			fasttrap_tracepoint_t **tmp_tp;
 			uint_t tmp_index;
 
 			if (tp->ftt_ids != NULL) {
 				tmp_probe = tp->ftt_ids->fti_probe;
 				/* LINTED - alignment */
 				tmp_index = FASTTRAP_ID_INDEX(tp->ftt_ids);
 				tmp_tp = &tmp_probe->ftp_tps[tmp_index].fit_tp;
 			} else {
 				tmp_probe = tp->ftt_retids->fti_probe;
 				/* LINTED - alignment */
 				tmp_index = FASTTRAP_ID_INDEX(tp->ftt_retids);
 				tmp_tp = &tmp_probe->ftp_tps[tmp_index].fit_tp;
 			}
 
 			ASSERT(*tmp_tp != NULL);
 			ASSERT(*tmp_tp != probe->ftp_tps[index].fit_tp);
 			ASSERT((*tmp_tp)->ftt_ids == NULL);
 			ASSERT((*tmp_tp)->ftt_retids == NULL);
 
 			probe->ftp_tps[index].fit_tp = *tmp_tp;
 			*tmp_tp = tp;
 		}
 
 		mutex_exit(&bucket->ftb_mtx);
 
 		/*
 		 * Tag the modified probe with the generation in which it was
 		 * changed.
 		 */
 		probe->ftp_gen = fasttrap_mod_gen;
 		return;
 	}
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	/*
 	 * We can't safely remove the tracepoint from the set of active
 	 * tracepoints until we've actually removed the fasttrap instruction
 	 * from the process's text. We can, however, operate on this
 	 * tracepoint secure in the knowledge that no other thread is going to
 	 * be looking at it since we hold P_PR_LOCK on the process if it's
 	 * live or we hold the provider lock on the process if it's dead and
 	 * gone.
 	 */
 
 	/*
 	 * We only need to remove the actual instruction if we're looking
 	 * at an existing process
 	 */
 	if (p != NULL) {
 		/*
 		 * If we fail to restore the instruction we need to kill
 		 * this process since it's in a completely unrecoverable
 		 * state.
 		 */
 		if (fasttrap_tracepoint_remove(p, tp) != 0)
 			fasttrap_sigtrap(p, NULL, pc);
 
 		/*
 		 * Decrement the count of the number of tracepoints active
 		 * in the victim process.
 		 */
 #if defined(sun)
 		ASSERT(p->p_proc_flag & P_PR_LOCK);
 #endif
 		p->p_dtrace_count--;
 	}
 
 	/*
 	 * Remove the probe from the hash table of active tracepoints.
 	 */
 	mutex_enter(&bucket->ftb_mtx);
 	pp = (fasttrap_tracepoint_t **)&bucket->ftb_data;
 	ASSERT(*pp != NULL);
 	while (*pp != tp) {
 		pp = &(*pp)->ftt_next;
 		ASSERT(*pp != NULL);
 	}
 
 	*pp = tp->ftt_next;
 	membar_producer();
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	/*
 	 * Tag the modified probe with the generation in which it was changed.
 	 */
 	probe->ftp_gen = fasttrap_mod_gen;
 }
 
 /*
  * Take a reference on the pid/return probe entry points. The first
  * reference publishes fasttrap_pid_probe() and fasttrap_return_probe()
  * through dtrace_pid_probe_ptr and dtrace_return_probe_ptr. The reference
  * count is protected by fasttrap_count_mtx.
  */
 static void
 fasttrap_enable_callbacks(void)
 {
 	/*
 	 * We don't have to play the rw lock game here because we're
 	 * providing something rather than taking something away --
 	 * we can be sure that no threads have tried to follow this
 	 * function pointer yet.
 	 */
 	mutex_enter(&fasttrap_count_mtx);
 	if (fasttrap_pid_count == 0) {
 		ASSERT(dtrace_pid_probe_ptr == NULL);
 		ASSERT(dtrace_return_probe_ptr == NULL);
 		dtrace_pid_probe_ptr = &fasttrap_pid_probe;
 		dtrace_return_probe_ptr = &fasttrap_return_probe;
 	}
 	ASSERT(dtrace_pid_probe_ptr == &fasttrap_pid_probe);
 	ASSERT(dtrace_return_probe_ptr == &fasttrap_return_probe);
 	fasttrap_pid_count++;
 	mutex_exit(&fasttrap_count_mtx);
 }
 
 /*
  * Drop a reference on the pid/return probe entry points. When the last
  * reference goes away, dtrace_pid_probe_ptr and dtrace_return_probe_ptr are
  * reset to NULL. On Solaris the per-CPU cpu_ft_lock of every other online
  * CPU is held as writer while the pointers are cleared; the FreeBSD build
  * clears them under fasttrap_count_mtx only.
  */
 static void
 fasttrap_disable_callbacks(void)
 {
 #if defined(sun)
 	ASSERT(MUTEX_HELD(&cpu_lock));
 #endif
 
 
 	mutex_enter(&fasttrap_count_mtx);
 	ASSERT(fasttrap_pid_count > 0);
 	fasttrap_pid_count--;
 	if (fasttrap_pid_count == 0) {
 #if defined(sun)
 		cpu_t *cur, *cpu = CPU;
 
 		for (cur = cpu->cpu_next_onln; cur != cpu;
 		    cur = cur->cpu_next_onln) {
 			rw_enter(&cur->cpu_ft_lock, RW_WRITER);
 		}
 #endif
 		dtrace_pid_probe_ptr = NULL;
 		dtrace_return_probe_ptr = NULL;
 #if defined(sun)
 		for (cur = cpu->cpu_next_onln; cur != cpu;
 		    cur = cur->cpu_next_onln) {
 			rw_exit(&cur->cpu_ft_lock);
 		}
 #endif
 	}
 	mutex_exit(&fasttrap_count_mtx);
 }
 
 /*
  * DTrace enable entry point; installed in both pid_pops and usdt_pops.
  *
  * parg is the fasttrap_probe_t to enable and id must match its ftp_id.
  * The provider's ftp_rcount is incremented unconditionally.  The probe is
  * marked enabled (ftp_enabled = 1) only if every one of its ftp_ntps
  * tracepoints was enabled; on any failure the tracepoints enabled so far
  * are disabled again, the callback reference is dropped, and the function
  * returns with ftp_enabled still clear.  Nothing is reported to the caller
  * (the function returns void).
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg)
 {
 	fasttrap_probe_t *probe = parg;
 	proc_t *p = NULL;
 	int i, rc;
 
 	ASSERT(probe != NULL);
 	ASSERT(!probe->ftp_enabled);
 	ASSERT(id == probe->ftp_id);
 #if defined(sun)
 	ASSERT(MUTEX_HELD(&cpu_lock));
 #endif
 
 	/*
 	 * Increment the count of enabled probes on this probe's provider;
 	 * the provider can't go away while the probe still exists. We
 	 * must increment this even if we aren't able to properly enable
 	 * this probe.
 	 */
 	mutex_enter(&probe->ftp_prov->ftp_mtx);
 	probe->ftp_prov->ftp_rcount++;
 	mutex_exit(&probe->ftp_prov->ftp_mtx);
 
 	/*
 	 * If this probe's provider is retired (meaning it was valid in a
 	 * previously exec'ed incarnation of this address space), bail out. The
 	 * provider can't go away while we're in this code path.
 	 */
 	if (probe->ftp_prov->ftp_retired)
 		return;
 
 	/*
 	 * If we can't find the process, it may be that we're in the context of
 	 * a fork in which the traced process is being born and we're copying
 	 * USDT probes. Otherwise, the process is gone so bail.
 	 */
 #if defined(sun)
 	if ((p = sprlock(probe->ftp_pid)) == NULL) {
 		if ((curproc->p_flag & SFORKING) == 0)
 			return;
 
 		mutex_enter(&pidlock);
 		p = prfind(probe->ftp_pid);
 
 		/*
 		 * Confirm that curproc is indeed forking the process in which
 		 * we're trying to enable probes.
 		 */
 		ASSERT(p != NULL);
 		ASSERT(p->p_parent == curproc);
 		ASSERT(p->p_stat == SIDL);
 
 		mutex_enter(&p->p_lock);
 		mutex_exit(&pidlock);
 
 		sprlock_proc(p);
 	}
 
 	ASSERT(!(p->p_flag & SVFORK));
 	mutex_exit(&p->p_lock);
 #else
 	/* Non-Solaris builds have no fork special case: just look it up. */
 	if ((p = pfind(probe->ftp_pid)) == NULL)
 		return;
 #endif
 
 	/*
 	 * We have to enable the trap entry point before any user threads have
 	 * the chance to execute the trap instruction we're about to place
 	 * in their process's text.
 	 */
 #ifdef __FreeBSD__
 	/*
 	 * pfind() returns a locked process.  Take a hold on it and drop the
 	 * process lock; the hold is released with PRELE() on every exit
 	 * path below.
 	 */
 	_PHOLD(p);
 	PROC_UNLOCK(p);
 #endif
 	fasttrap_enable_callbacks();
 
 	/*
 	 * Enable all the tracepoints and add this probe's id to each
 	 * tracepoint's list of active probes.
 	 */
 	for (i = 0; i < probe->ftp_ntps; i++) {
 		if ((rc = fasttrap_tracepoint_enable(p, probe, i)) != 0) {
 			/*
 			 * If enabling the tracepoint failed completely,
 			 * we don't have to disable it; if the failure
 			 * was only partial we must disable it.
 			 */
 			if (rc == FASTTRAP_ENABLE_FAIL)
 				i--;
 			else
 				ASSERT(rc == FASTTRAP_ENABLE_PARTIAL);
 
 			/*
 			 * Back up and pull out all the tracepoints we've
 			 * created so far for this probe.
 			 */
 			while (i >= 0) {
 				fasttrap_tracepoint_disable(p, probe, i);
 				i--;
 			}
 
 #if defined(sun)
 			mutex_enter(&p->p_lock);
 			sprunlock(p);
 #else
 			PRELE(p);
 #endif
 
 			/*
 			 * Since we're not actually enabling this probe,
 			 * drop our reference on the trap table entry.
 			 */
 			fasttrap_disable_callbacks();
 			return;
 		}
 	}
 #if defined(sun)
 	mutex_enter(&p->p_lock);
 	sprunlock(p);
 #else
 	PRELE(p);
 #endif
 
 	/* Every tracepoint is in place; the probe is now fully enabled. */
 	probe->ftp_enabled = 1;
 }
 
 /*
  * DTrace disable entry point; installed in both pid_pops and usdt_pops.
  *
  * parg is the fasttrap_probe_t to disable.  If the probe was fully enabled
  * its tracepoints are disabled, and the callback reference taken at enable
  * time is dropped at the end.  The provider's ftp_rcount is decremented in
  * every case (matching the unconditional increment in
  * fasttrap_pid_enable()).  If this leaves the provider eligible for removal
  * it is marked and fasttrap_pid_cleanup() is called.
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_disable(void *arg, dtrace_id_t id, void *parg)
 {
 	fasttrap_probe_t *probe = parg;
 	fasttrap_provider_t *provider = probe->ftp_prov;
 	proc_t *p;
 	int i, whack = 0;
 
 	ASSERT(id == probe->ftp_id);
 
 	mutex_enter(&provider->ftp_mtx);
 
 	/*
 	 * We won't be able to acquire a /proc-esque lock on the process
 	 * iff the process is dead and gone. In this case, we rely on the
 	 * provider lock as a point of mutual exclusion to prevent other
 	 * DTrace consumers from disabling this probe.
 	 */
 	if ((p = pfind(probe->ftp_pid)) != NULL) {
 #ifdef __FreeBSD__
 		/* Swap the process lock for a hold; released via PRELE(). */
 		_PHOLD(p);
 		PROC_UNLOCK(p);
 #endif
 	}
 
 	/*
 	 * Disable all the associated tracepoints (for fully enabled probes).
 	 * Note that p may be NULL here if the process is gone.
 	 */
 	if (probe->ftp_enabled) {
 		for (i = 0; i < probe->ftp_ntps; i++) {
 			fasttrap_tracepoint_disable(p, probe, i);
 		}
 	}
 
 	ASSERT(provider->ftp_rcount > 0);
 	provider->ftp_rcount--;
 
 	if (p != NULL) {
 		/*
 		 * Even though we may not be able to remove it entirely, we
 		 * mark this retired provider to get a chance to remove some
 		 * of the associated probes.
 		 */
 		if (provider->ftp_retired && !provider->ftp_marked)
 			whack = provider->ftp_marked = 1;
 		mutex_exit(&provider->ftp_mtx);
 	} else {
 		/*
 		 * If the process is dead, we're just waiting for the
 		 * last probe to be disabled to be able to free it.
 		 */
 		if (provider->ftp_rcount == 0 && !provider->ftp_marked)
 			whack = provider->ftp_marked = 1;
 		mutex_exit(&provider->ftp_mtx);
 	}
 
 	/* Cleanup is kicked off only after the provider lock is dropped. */
 	if (whack)
 		fasttrap_pid_cleanup();
 
 #ifdef __FreeBSD__
 	if (p != NULL)
 		PRELE(p);
 #endif
 	/*
 	 * A probe that never became fully enabled holds no callback
 	 * reference, so there is nothing further to undo.
 	 */
 	if (!probe->ftp_enabled)
 		return;
 
 	probe->ftp_enabled = 0;
 
 #if defined(sun)
 	ASSERT(MUTEX_HELD(&cpu_lock));
 #endif
 	fasttrap_disable_callbacks();
 }
 
 /*
  * DTrace getargdesc entry point; installed in both pid_pops and usdt_pops.
  *
  * Fills in desc->dtargd_native and desc->dtargd_xlate for the argument
  * selected by desc->dtargd_ndx.  If the provider has been retired or the
  * index is out of range, desc->dtargd_ndx is set to DTRACE_ARGNONE and both
  * strings are left empty.
  *
  * ftp_ntypes and ftp_xtypes are walked as sequences of back-to-back
  * NUL-terminated strings.  The native type is selected through ftp_argmap
  * (when present), the translated type directly by dtargd_ndx.
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_getargdesc(void *arg, dtrace_id_t id, void *parg,
     dtrace_argdesc_t *desc)
 {
 	fasttrap_probe_t *probe = parg;
 	char *str;
 	int i, ndx;
 
 	desc->dtargd_native[0] = '\0';
 	desc->dtargd_xlate[0] = '\0';
 
 	if (probe->ftp_prov->ftp_retired != 0 ||
 	    desc->dtargd_ndx >= probe->ftp_nargs) {
 		desc->dtargd_ndx = DTRACE_ARGNONE;
 		return;
 	}
 
 	ndx = (probe->ftp_argmap != NULL) ?
 	    probe->ftp_argmap[desc->dtargd_ndx] : desc->dtargd_ndx;
 
 	/* Skip ahead to the ndx'th native type string. */
 	str = probe->ftp_ntypes;
 	for (i = 0; i < ndx; i++) {
 		str += strlen(str) + 1;
 	}
 
 	/*
 	 * The destination must hold the whole string plus its terminator.
 	 * (This previously tested strlen(str + 1), which measures from the
 	 * second character and reads past the NUL of an empty string.)
 	 */
 	ASSERT(strlen(str) < sizeof (desc->dtargd_native));
 	(void) strcpy(desc->dtargd_native, str);
 
 	if (probe->ftp_xtypes == NULL)
 		return;
 
 	/* Skip ahead to the dtargd_ndx'th translated type string. */
 	str = probe->ftp_xtypes;
 	for (i = 0; i < desc->dtargd_ndx; i++) {
 		str += strlen(str) + 1;
 	}
 
 	ASSERT(strlen(str) < sizeof (desc->dtargd_xlate));
 	(void) strcpy(desc->dtargd_xlate, str);
 }
 
 /*
  * DTrace destroy entry point; installed in both pid_pops and usdt_pops.
  * Releases a disabled probe: gives its tracepoints back to the global
  * fasttrap_total budget and frees every tracepoint plus the probe itself.
  */
 /*ARGSUSED*/
 static void
 fasttrap_pid_destroy(void *arg, dtrace_id_t id, void *parg)
 {
 	fasttrap_probe_t *probe = parg;
 	size_t probe_size;
 	int ndx;
 
 	ASSERT(probe != NULL);
 	ASSERT(!probe->ftp_enabled);
 	ASSERT(fasttrap_total >= probe->ftp_ntps);
 
 	atomic_add_32(&fasttrap_total, -probe->ftp_ntps);
 
 	/*
 	 * A probe modified in the current or previous generation goes
 	 * through fasttrap_mod_barrier() before any of its memory is freed.
 	 */
 	if (probe->ftp_gen + 1 >= fasttrap_mod_gen)
 		fasttrap_mod_barrier(probe->ftp_gen);
 
 	ndx = 0;
 	while (ndx < probe->ftp_ntps) {
 		kmem_free(probe->ftp_tps[ndx].fit_tp,
 		    sizeof (fasttrap_tracepoint_t));
 		ndx++;
 	}
 
 	/* The probe was allocated with room for ftp_ntps tracepoint slots. */
 	probe_size = offsetof(fasttrap_probe_t, ftp_tps[probe->ftp_ntps]);
 	kmem_free(probe, probe_size);
 }
 
 
 /*
  * Stability attributes for the pid provider.  fasttrap_add_probe() passes
  * this table to fasttrap_provider_lookup(), which also compares against its
  * address to choose pid_pops over usdt_pops.
  *
  * NOTE(review): the five rows presumably correspond, in order, to the
  * provider, module, function, name and args attributes of dtrace_pattr_t --
  * confirm against the structure definition.
  */
 static const dtrace_pattr_t pid_attr = {
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 };
 
 /*
  * Provider operations vector registered for the pid provider (chosen by
  * fasttrap_provider_lookup() when pattr == &pid_attr).  It differs from
  * usdt_pops only in the argument-fetch routine, fasttrap_pid_getarg.
  * NULL slots are operations this provider does not supply.
  */
 static dtrace_pops_t pid_pops = {
 	fasttrap_pid_provide,
 	NULL,
 	fasttrap_pid_enable,
 	fasttrap_pid_disable,
 	NULL,
 	NULL,
 	fasttrap_pid_getargdesc,
 	fasttrap_pid_getarg,
 	NULL,
 	fasttrap_pid_destroy
 };
 
 /*
  * Provider operations vector registered for every provider other than the
  * pid provider (chosen by fasttrap_provider_lookup() when
  * pattr != &pid_attr).  It differs from pid_pops only in the argument-fetch
  * routine, fasttrap_usdt_getarg.  NULL slots are operations this provider
  * does not supply.
  */
 static dtrace_pops_t usdt_pops = {
 	fasttrap_pid_provide,
 	NULL,
 	fasttrap_pid_enable,
 	fasttrap_pid_disable,
 	NULL,
 	NULL,
 	fasttrap_pid_getargdesc,
 	fasttrap_usdt_getarg,
 	NULL,
 	fasttrap_pid_destroy
 };
 
 /*
  * Find or create the fasttrap_proc_t for pid and return it with one more
  * reference.
  *
  * Only entries whose ftpc_acount is non-zero are considered matches.  On a
  * match, both ftpc_rcount and ftpc_acount are incremented.  Otherwise a new
  * entry is allocated (with both counts set to 1) and pushed onto the head
  * of its hash bucket.  Because the allocation may sleep, the bucket lock is
  * dropped around it and the bucket is searched a second time afterwards.
  * Never returns NULL.
  */
 static fasttrap_proc_t *
 fasttrap_proc_lookup(pid_t pid)
 {
 	fasttrap_bucket_t *bucket;
 	fasttrap_proc_t *fprc, *new_fprc;
 
 
 	bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
 	mutex_enter(&bucket->ftb_mtx);
 
 	for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
 		if (fprc->ftpc_pid == pid && fprc->ftpc_acount != 0) {
 			/* Take the entry's lock before letting go of the bucket. */
 			mutex_enter(&fprc->ftpc_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			fprc->ftpc_rcount++;
-			atomic_add_64(&fprc->ftpc_acount, 1);
+			atomic_inc_64(&fprc->ftpc_acount);
 			ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
 			mutex_exit(&fprc->ftpc_mtx);
 
 			return (fprc);
 		}
 	}
 
 	/*
 	 * Drop the bucket lock so we don't try to perform a sleeping
 	 * allocation under it.
 	 */
 	mutex_exit(&bucket->ftb_mtx);
 
 	new_fprc = kmem_zalloc(sizeof (fasttrap_proc_t), KM_SLEEP);
 	new_fprc->ftpc_pid = pid;
 	new_fprc->ftpc_rcount = 1;
 	new_fprc->ftpc_acount = 1;
 #if !defined(sun)
 	mutex_init(&new_fprc->ftpc_mtx, "fasttrap proc mtx", MUTEX_DEFAULT,
 	    NULL);
 #endif
 
 	mutex_enter(&bucket->ftb_mtx);
 
 	/*
 	 * Take another lap through the list to make sure a proc hasn't
 	 * been created for this pid while we weren't under the bucket lock.
 	 */
 	for (fprc = bucket->ftb_data; fprc != NULL; fprc = fprc->ftpc_next) {
 		if (fprc->ftpc_pid == pid && fprc->ftpc_acount != 0) {
 			mutex_enter(&fprc->ftpc_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			fprc->ftpc_rcount++;
-			atomic_add_64(&fprc->ftpc_acount, 1);
+			atomic_inc_64(&fprc->ftpc_acount);
 			ASSERT(fprc->ftpc_acount <= fprc->ftpc_rcount);
 			mutex_exit(&fprc->ftpc_mtx);
 
 			/*
 			 * We lost the race; discard our unused entry.
 			 * NOTE(review): on non-sun builds ftpc_mtx was
 			 * initialized above but is not destroyed before
 			 * this free -- confirm whether mutex_destroy() is
 			 * needed here.
 			 */
 			kmem_free(new_fprc, sizeof (fasttrap_proc_t));
 
 			return (fprc);
 		}
 	}
 
 	/* No one beat us to it: publish the new entry at the bucket head. */
 	new_fprc->ftpc_next = bucket->ftb_data;
 	bucket->ftb_data = new_fprc;
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	return (new_fprc);
 }
 
 /*
  * Drop one reference (ftpc_rcount) on proc.  When the last reference goes
  * away the entry is unlinked from its hash bucket and freed; on non-sun
  * builds the scratch-space bookkeeping hanging off the entry is freed first
  * and, if the process can still be found, each of its threads has
  * t_dtrace_sscr cleared.
  */
 static void
 fasttrap_proc_release(fasttrap_proc_t *proc)
 {
 	fasttrap_bucket_t *bucket;
 	fasttrap_proc_t *fprc, **fprcp;
 	pid_t pid = proc->ftpc_pid;
 #if !defined(sun)
 	fasttrap_scrblock_t *scrblk, *scrblktmp;
 	fasttrap_scrspace_t *scrspc, *scrspctmp;
 	struct proc *p;
 	struct thread *td;
 #endif
 
 	mutex_enter(&proc->ftpc_mtx);
 
 	ASSERT(proc->ftpc_rcount != 0);
 	ASSERT(proc->ftpc_acount <= proc->ftpc_rcount);
 
 	/* Other references remain: nothing more to do. */
 	if (--proc->ftpc_rcount != 0) {
 		mutex_exit(&proc->ftpc_mtx);
 		return;
 	}
 
 #if !defined(sun)
 	/*
 	 * Free all structures used to manage per-thread scratch space.
 	 */
 	LIST_FOREACH_SAFE(scrblk, &proc->ftpc_scrblks, ftsb_next,
 	    scrblktmp) {
 		LIST_REMOVE(scrblk, ftsb_next);
 		free(scrblk, M_SOLARIS);
 	}
 	LIST_FOREACH_SAFE(scrspc, &proc->ftpc_fscr, ftss_next, scrspctmp) {
 		LIST_REMOVE(scrspc, ftss_next);
 		free(scrspc, M_SOLARIS);
 	}
 	LIST_FOREACH_SAFE(scrspc, &proc->ftpc_ascr, ftss_next, scrspctmp) {
 		LIST_REMOVE(scrspc, ftss_next);
 		free(scrspc, M_SOLARIS);
 	}
 
 	/*
 	 * The scratch records are gone, so clear each thread's
 	 * t_dtrace_sscr pointer.  pfind() hands back the process locked.
 	 */
 	if ((p = pfind(pid)) != NULL) {
 		FOREACH_THREAD_IN_PROC(p, td)
 			td->t_dtrace_sscr = NULL;
 		PROC_UNLOCK(p);
 	}
 #endif
 
 	mutex_exit(&proc->ftpc_mtx);
 
 	/*
 	 * There should definitely be no live providers associated with this
 	 * process at this point.
 	 */
 	ASSERT(proc->ftpc_acount == 0);
 
 	bucket = &fasttrap_procs.fth_table[FASTTRAP_PROCS_INDEX(pid)];
 	mutex_enter(&bucket->ftb_mtx);
 
 	/*
 	 * Walk the chain keeping a pointer to the link that refers to the
 	 * current entry, so the match can be spliced out in place.
 	 */
 	fprcp = (fasttrap_proc_t **)&bucket->ftb_data;
 	while ((fprc = *fprcp) != NULL) {
 		if (fprc == proc)
 			break;
 
 		fprcp = &fprc->ftpc_next;
 	}
 
 	/*
 	 * Something strange has happened if we can't find the proc.
 	 */
 	ASSERT(fprc != NULL);
 
 	*fprcp = fprc->ftpc_next;
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	kmem_free(fprc, sizeof (fasttrap_proc_t));
 }
 
 /*
  * Look up a fasttrap-managed provider by name and associated pid, creating
  * and registering it with DTrace if no live (non-retired) one exists.
  *
  * pattr must be non-NULL; it supplies the stability attributes for a newly
  * registered provider and selects the operations vector (pid_pops when
  * pattr is &pid_attr, usdt_pops otherwise).
  *
  * On success the provider is returned with its ftp_mtx held.  NULL is
  * returned if the process cannot be found, if the composed provider name
  * ("<name><pid>") does not fit in DTRACE_PROVNAMELEN, or if
  * dtrace_register() fails.
  */
 static fasttrap_provider_t *
 fasttrap_provider_lookup(pid_t pid, const char *name,
     const dtrace_pattr_t *pattr)
 {
 	fasttrap_provider_t *fp, *new_fp = NULL;
 	fasttrap_bucket_t *bucket;
 	char provname[DTRACE_PROVNAMELEN];
 	proc_t *p;
 	cred_t *cred;
 
 	ASSERT(strlen(name) < sizeof (fp->ftp_name));
 	ASSERT(pattr != NULL);
 
 	bucket = &fasttrap_provs.fth_table[FASTTRAP_PROVS_INDEX(pid, name)];
 	mutex_enter(&bucket->ftb_mtx);
 
 	/*
 	 * Take a lap through the list and return the match if we find it.
 	 */
 	for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
 		if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
 		    !fp->ftp_retired) {
 			mutex_enter(&fp->ftp_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			return (fp);
 		}
 	}
 
 	/*
 	 * Drop the bucket lock so we don't try to perform a sleeping
 	 * allocation under it.
 	 */
 	mutex_exit(&bucket->ftb_mtx);
 
 	/*
 	 * Make sure the process exists, isn't a child created as the result
 	 * of a vfork(2), and isn't a zombie (but may be in fork).
 	 */
 	if ((p = pfind(pid)) == NULL)
 		return (NULL);
 
 	/*
 	 * Increment p_dtrace_probes so that the process knows to inform us
 	 * when it exits or execs. fasttrap_provider_free() decrements this
 	 * when we're done with this provider.
 	 */
 	p->p_dtrace_probes++;
 
 	/*
 	 * Grab the credentials for this process so we have
 	 * something to pass to dtrace_register().  The reference taken here
 	 * is dropped with crfree() on every return path below.
 	 */
 	PROC_LOCK_ASSERT(p, MA_OWNED);
 	crhold(p->p_ucred);
 	cred = p->p_ucred;
 	PROC_UNLOCK(p);
 
 	new_fp = kmem_zalloc(sizeof (fasttrap_provider_t), KM_SLEEP);
 	new_fp->ftp_pid = pid;
 	new_fp->ftp_proc = fasttrap_proc_lookup(pid);
 #if !defined(sun)
 	mutex_init(&new_fp->ftp_mtx, "provider mtx", MUTEX_DEFAULT, NULL);
 	mutex_init(&new_fp->ftp_cmtx, "lock on creating", MUTEX_DEFAULT, NULL);
 #endif
 
 	ASSERT(new_fp->ftp_proc != NULL);
 
 	mutex_enter(&bucket->ftb_mtx);
 
 	/*
 	 * Take another lap through the list to make sure a provider hasn't
 	 * been created for this pid while we weren't under the bucket lock.
 	 */
 	for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
 		if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
 		    !fp->ftp_retired) {
 			mutex_enter(&fp->ftp_mtx);
 			mutex_exit(&bucket->ftb_mtx);
 			/* Lost the race: discard ours, return the winner. */
 			fasttrap_provider_free(new_fp);
 			crfree(cred);
 			return (fp);
 		}
 	}
 
 	/* Length was checked by the ASSERT at the top of the function. */
 	(void) strcpy(new_fp->ftp_name, name);
 
 	/*
 	 * Fail and return NULL if either the provider name is too long
 	 * or we fail to register this new provider with the DTrace
 	 * framework. Note that this is the only place we ever construct
 	 * the full provider name -- we keep it in pieces in the provider
 	 * structure.
 	 */
 	if (snprintf(provname, sizeof (provname), "%s%u", name, (uint_t)pid) >=
 	    sizeof (provname) ||
 	    dtrace_register(provname, pattr,
 	    DTRACE_PRIV_PROC | DTRACE_PRIV_OWNER | DTRACE_PRIV_ZONEOWNER, cred,
 	    pattr == &pid_attr ? &pid_pops : &usdt_pops, new_fp,
 	    &new_fp->ftp_provid) != 0) {
 		mutex_exit(&bucket->ftb_mtx);
 		fasttrap_provider_free(new_fp);
 		crfree(cred);
 		return (NULL);
 	}
 
 	/* Publish the new provider at the head of its bucket. */
 	new_fp->ftp_next = bucket->ftb_data;
 	bucket->ftb_data = new_fp;
 
 	mutex_enter(&new_fp->ftp_mtx);
 	mutex_exit(&bucket->ftb_mtx);
 
 	crfree(cred);
 	return (new_fp);
 }
 
 /*
  * Free a provider structure that has no remaining references.
  *
  * Drops the provider's hold on its fasttrap_proc_t (first decrementing the
  * proc's active count if the provider was never retired), destroys the
  * provider's mutexes on non-sun builds, frees the structure, and finally
  * decrements p_dtrace_probes on the owning process if that process can
  * still be found.
  */
 static void
 fasttrap_provider_free(fasttrap_provider_t *provider)
 {
 	/* Saved up front: provider is freed before pid is last used. */
 	pid_t pid = provider->ftp_pid;
 	proc_t *p;
 
 	/*
 	 * There need to be no associated enabled probes, no consumers
 	 * creating probes, and no meta providers referencing this provider.
 	 */
 	ASSERT(provider->ftp_rcount == 0);
 	ASSERT(provider->ftp_ccount == 0);
 	ASSERT(provider->ftp_mcount == 0);
 
 	/*
 	 * If this provider hasn't been retired, we need to explicitly drop the
 	 * count of active providers on the associated process structure.
 	 */
 	if (!provider->ftp_retired) {
-		atomic_add_64(&provider->ftp_proc->ftpc_acount, -1);
+		atomic_dec_64(&provider->ftp_proc->ftpc_acount);
 		ASSERT(provider->ftp_proc->ftpc_acount <
 		    provider->ftp_proc->ftpc_rcount);
 	}
 
 	fasttrap_proc_release(provider->ftp_proc);
 
 #if !defined(sun)
 	mutex_destroy(&provider->ftp_mtx);
 	mutex_destroy(&provider->ftp_cmtx);
 #endif
 	kmem_free(provider, sizeof (fasttrap_provider_t));
 
 	/*
 	 * Decrement p_dtrace_probes on the process whose provider we're
 	 * freeing. We don't have to worry about clobbering someone else's
 	 * modifications to it because we have locked the bucket that
 	 * corresponds to this process's hash chain in the provider hash
 	 * table. Don't sweat it if we can't find the process.
 	 */
 	if ((p = pfind(pid)) == NULL) {
 		return;
 	}
 
 	p->p_dtrace_probes--;
 #if !defined(sun)
 	PROC_UNLOCK(p);
 #endif
 }
 
 /*
  * Retire the live provider identified by (pid, name), if there is one.
  *
  * When mprov is non-zero the call is on behalf of a meta provider: the
  * provider's ftp_mcount is decremented and nothing further happens unless
  * it reaches zero.  Otherwise (or once it reaches zero) the provider is
  * flagged retired and marked, the active count on its proc is dropped, the
  * provider is invalidated with dtrace_invalidate(), and
  * fasttrap_pid_cleanup() is called.  A provider that cannot be found is
  * silently ignored.
  */
 static void
 fasttrap_provider_retire(pid_t pid, const char *name, int mprov)
 {
 	fasttrap_provider_t *fp;
 	fasttrap_bucket_t *bucket;
 	dtrace_provider_id_t provid;
 
 	ASSERT(strlen(name) < sizeof (fp->ftp_name));
 
 	bucket = &fasttrap_provs.fth_table[FASTTRAP_PROVS_INDEX(pid, name)];
 	mutex_enter(&bucket->ftb_mtx);
 
 	/* Already-retired providers are deliberately skipped. */
 	for (fp = bucket->ftb_data; fp != NULL; fp = fp->ftp_next) {
 		if (fp->ftp_pid == pid && strcmp(fp->ftp_name, name) == 0 &&
 		    !fp->ftp_retired)
 			break;
 	}
 
 	if (fp == NULL) {
 		mutex_exit(&bucket->ftb_mtx);
 		return;
 	}
 
 	mutex_enter(&fp->ftp_mtx);
 	ASSERT(!mprov || fp->ftp_mcount > 0);
 	if (mprov && --fp->ftp_mcount != 0)  {
 		mutex_exit(&fp->ftp_mtx);
 		mutex_exit(&bucket->ftb_mtx);
 		return;
 	}
 
 	/*
 	 * Mark the provider to be removed in our post-processing step, mark it
 	 * retired, and drop the active count on its proc. Marking it indicates
 	 * that we should try to remove it; setting the retired flag indicates
 	 * that we're done with this provider; dropping the active count on
 	 * the proc releases our hold, and when this reaches zero (as it will
 	 * during exit or exec) the proc and associated providers become
 	 * defunct.
 	 *
 	 * We obviously need to take the bucket lock before the provider lock
 	 * to perform the lookup, but we need to drop the provider lock
 	 * before calling into the DTrace framework since we acquire the
 	 * provider lock in callbacks invoked from the DTrace framework. The
 	 * bucket lock therefore protects the integrity of the provider hash
 	 * table.
 	 */
-	atomic_add_64(&fp->ftp_proc->ftpc_acount, -1);
+	atomic_dec_64(&fp->ftp_proc->ftpc_acount);
 	ASSERT(fp->ftp_proc->ftpc_acount < fp->ftp_proc->ftpc_rcount);
 
 	fp->ftp_retired = 1;
 	fp->ftp_marked = 1;
 	provid = fp->ftp_provid;
 	mutex_exit(&fp->ftp_mtx);
 
 	/*
 	 * We don't have to worry about invalidating the same provider twice
 	 * since fasttrap_provider_lookup() will ignore providers that have
 	 * been marked as retired.
 	 */
 	dtrace_invalidate(provid);
 
 	mutex_exit(&bucket->ftb_mtx);
 
 	fasttrap_pid_cleanup();
 }
 
 /*
  * qsort() comparator for uint32_t values.  Returns a negative, zero or
  * positive value when *ap is less than, equal to or greater than *bp.
  *
  * The values are compared rather than subtracted: an unsigned difference
  * converted to int has the wrong sign whenever the operands are more than
  * INT_MAX apart.
  */
 static int
 fasttrap_uint32_cmp(const void *ap, const void *bp)
 {
 	const uint32_t a = *(const uint32_t *)ap;
 	const uint32_t b = *(const uint32_t *)bp;
 
 	return ((a > b) - (a < b));
 }
 
 /*
  * qsort() comparator for uint64_t values.  Returns a negative, zero or
  * positive value when *ap is less than, equal to or greater than *bp.
  *
  * The values are compared rather than subtracted: truncating a 64-bit
  * difference to int loses the high bits, so values differing by a multiple
  * of 2^32 would compare equal and others could get the wrong sign.
  */
 static int
 fasttrap_uint64_cmp(const void *ap, const void *bp)
 {
 	const uint64_t a = *(const uint64_t *)ap;
 	const uint64_t b = *(const uint64_t *)bp;
 
 	return ((a > b) - (a < b));
 }
 
 /*
  * Create pid-provider probe(s) from a probe specification.
  *
  * For DTFTP_ENTRY and DTFTP_RETURN a single probe named "entry" or "return"
  * is created that carries all ftps_noffs tracepoints.  For DTFTP_OFFSETS
  * one single-tracepoint probe is created per offset, named by the offset in
  * hexadecimal.  Probes that already exist are left alone.
  *
  * Returns 0 on success, EINVAL for an empty offset list or an unknown
  * probe type, ESRCH if the provider cannot be looked up or created, and
  * ENOMEM if the fasttrap_max tracepoint budget would be exceeded.  Note
  * that duplicate offsets in an entry/return specification also take the
  * no_mem path and are therefore reported as ENOMEM.
  */
 static int
 fasttrap_add_probe(fasttrap_probe_spec_t *pdata)
 {
 	fasttrap_provider_t *provider;
 	fasttrap_probe_t *pp;
 	fasttrap_tracepoint_t *tp;
 	char *name;
 	int i, aframes = 0, whack;
 
 	/*
 	 * There needs to be at least one desired trace point.
 	 */
 	if (pdata->ftps_noffs == 0)
 		return (EINVAL);
 
 	/* name stays NULL for offset probes, which are named per offset. */
 	switch (pdata->ftps_type) {
 	case DTFTP_ENTRY:
 		name = "entry";
 		aframes = FASTTRAP_ENTRY_AFRAMES;
 		break;
 	case DTFTP_RETURN:
 		name = "return";
 		aframes = FASTTRAP_RETURN_AFRAMES;
 		break;
 	case DTFTP_OFFSETS:
 		name = NULL;
 		break;
 	default:
 		return (EINVAL);
 	}
 
 	/* On success the provider comes back with ftp_mtx held. */
 	if ((provider = fasttrap_provider_lookup(pdata->ftps_pid,
 	    FASTTRAP_PID_NAME, &pid_attr)) == NULL)
 		return (ESRCH);
 
 	/*
 	 * Increment this reference count to indicate that a consumer is
 	 * actively adding a new probe associated with this provider. This
 	 * prevents the provider from being deleted -- we'll need to check
 	 * for pending deletions when we drop this reference count.
 	 */
 	provider->ftp_ccount++;
 	mutex_exit(&provider->ftp_mtx);
 
 	/*
 	 * Grab the creation lock to ensure consistency between calls to
 	 * dtrace_probe_lookup() and dtrace_probe_create() in the face of
 	 * other threads creating probes. We must drop the provider lock
 	 * before taking this lock to avoid a three-way deadlock with the
 	 * DTrace framework.
 	 */
 	mutex_enter(&provider->ftp_cmtx);
 
 	if (name == NULL) {
 		for (i = 0; i < pdata->ftps_noffs; i++) {
 			/* 16 hex digits of a 64-bit value plus the NUL. */
 			char name_str[17];
 
 			(void) sprintf(name_str, "%llx",
 			    (unsigned long long)pdata->ftps_offs[i]);
 
 			/* Skip offsets that already have a probe. */
 			if (dtrace_probe_lookup(provider->ftp_provid,
 			    pdata->ftps_mod, pdata->ftps_func, name_str) != 0)
 				continue;
 
 			/*
 			 * Reserve one tracepoint from the global budget,
 			 * backing the reservation out if it overshoots.
 			 */
-			atomic_add_32(&fasttrap_total, 1);
+			atomic_inc_32(&fasttrap_total);
 
 			if (fasttrap_total > fasttrap_max) {
-				atomic_add_32(&fasttrap_total, -1);
+				atomic_dec_32(&fasttrap_total);
 				goto no_mem;
 			}
 
 			pp = kmem_zalloc(sizeof (fasttrap_probe_t), KM_SLEEP);
 
 			pp->ftp_prov = provider;
 			pp->ftp_faddr = pdata->ftps_pc;
 			pp->ftp_fsize = pdata->ftps_size;
 			pp->ftp_pid = pdata->ftps_pid;
 			pp->ftp_ntps = 1;
 
 			tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t),
 			    KM_SLEEP);
 
 			tp->ftt_proc = provider->ftp_proc;
 			tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc;
 			tp->ftt_pid = pdata->ftps_pid;
 
 			pp->ftp_tps[0].fit_tp = tp;
 			pp->ftp_tps[0].fit_id.fti_probe = pp;
 			pp->ftp_tps[0].fit_id.fti_ptype = pdata->ftps_type;
 
 			pp->ftp_id = dtrace_probe_create(provider->ftp_provid,
 			    pdata->ftps_mod, pdata->ftps_func, name_str,
 			    FASTTRAP_OFFSET_AFRAMES, pp);
 		}
 
 	} else if (dtrace_probe_lookup(provider->ftp_provid, pdata->ftps_mod,
 	    pdata->ftps_func, name) == 0) {
 		/* Reserve all of the probe's tracepoints in one step. */
 		atomic_add_32(&fasttrap_total, pdata->ftps_noffs);
 
 		if (fasttrap_total > fasttrap_max) {
 			atomic_add_32(&fasttrap_total, -pdata->ftps_noffs);
 			goto no_mem;
 		}
 
 		/*
 		 * Make sure all tracepoint program counter values are unique.
 		 * We later assume that each probe has exactly one tracepoint
 		 * for a given pc.
 		 */
 		qsort(pdata->ftps_offs, pdata->ftps_noffs,
 		    sizeof (uint64_t), fasttrap_uint64_cmp);
 		for (i = 1; i < pdata->ftps_noffs; i++) {
 			if (pdata->ftps_offs[i] > pdata->ftps_offs[i - 1])
 				continue;
 
 			/* Duplicate offset: undo the reservation and fail. */
 			atomic_add_32(&fasttrap_total, -pdata->ftps_noffs);
 			goto no_mem;
 		}
 
 		ASSERT(pdata->ftps_noffs > 0);
 		pp = kmem_zalloc(offsetof(fasttrap_probe_t,
 		    ftp_tps[pdata->ftps_noffs]), KM_SLEEP);
 
 		pp->ftp_prov = provider;
 		pp->ftp_faddr = pdata->ftps_pc;
 		pp->ftp_fsize = pdata->ftps_size;
 		pp->ftp_pid = pdata->ftps_pid;
 		pp->ftp_ntps = pdata->ftps_noffs;
 
 		for (i = 0; i < pdata->ftps_noffs; i++) {
 			tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t),
 			    KM_SLEEP);
 
 			tp->ftt_proc = provider->ftp_proc;
 			tp->ftt_pc = pdata->ftps_offs[i] + pdata->ftps_pc;
 			tp->ftt_pid = pdata->ftps_pid;
 
 			pp->ftp_tps[i].fit_tp = tp;
 			pp->ftp_tps[i].fit_id.fti_probe = pp;
 			pp->ftp_tps[i].fit_id.fti_ptype = pdata->ftps_type;
 		}
 
 		pp->ftp_id = dtrace_probe_create(provider->ftp_provid,
 		    pdata->ftps_mod, pdata->ftps_func, name, aframes, pp);
 	}
 
 	mutex_exit(&provider->ftp_cmtx);
 
 	/*
 	 * We know that the provider is still valid since we incremented the
 	 * creation reference count. If someone tried to clean up this provider
 	 * while we were using it (e.g. because the process called exec(2) or
 	 * exit(2)), take note of that and try to clean it up now.
 	 */
 	mutex_enter(&provider->ftp_mtx);
 	provider->ftp_ccount--;
 	whack = provider->ftp_retired;
 	mutex_exit(&provider->ftp_mtx);
 
 	if (whack)
 		fasttrap_pid_cleanup();
 
 	return (0);
 
 no_mem:
 	/*
 	 * If we've exhausted the allowable resources, we'll try to remove
 	 * this provider to free some up. This is to cover the case where
 	 * the user has accidentally created many more probes than was
 	 * intended (e.g. pid123:::).
 	 */
 	mutex_exit(&provider->ftp_cmtx);
 	mutex_enter(&provider->ftp_mtx);
 	provider->ftp_ccount--;
 	provider->ftp_marked = 1;
 	mutex_exit(&provider->ftp_mtx);
 
 	fasttrap_pid_cleanup();
 
 	return (ENOMEM);
 }
 
 /*
  * Meta-provider "provide" operation (see fasttrap_mops).  Validates the
  * helper's provider description, caps its stability classes at
  * DTRACE_CLASS_ISA, then looks up or creates the provider for pid and takes
  * a meta-provider reference (ftp_mcount) on it.
  *
  * Returns the provider, or NULL (after logging a message) if the name is
  * too long, collides with FASTTRAP_PID_NAME, or the lookup fails.
  */
 /*ARGSUSED*/
 static void *
 fasttrap_meta_provide(void *arg, dtrace_helper_provdesc_t *dhpv, pid_t pid)
 {
 	const char *provname = dhpv->dthpv_provname;
 	dtrace_pattr_t *pattr = &dhpv->dthpv_pattr;
 	fasttrap_provider_t *provider;
 
 	/*
 	 * A pid is a 32-bit unsigned integer and so prints in at most ten
 	 * decimal digits; the name must leave room for them.
 	 */
 	if (strlen(provname) + 10 >= sizeof (provider->ftp_name)) {
 		printf("failed to instantiate provider %s: "
 		    "name too long to accomodate pid", provname);
 		return (NULL);
 	}
 
 	/*
 	 * The true pid provider's name is reserved and may not be spoofed.
 	 */
 	if (strcmp(provname, FASTTRAP_PID_NAME) == 0) {
 		printf("failed to instantiate provider %s: "
 		    "%s is an invalid name", provname,
 		    FASTTRAP_PID_NAME);
 		return (NULL);
 	}
 
 	/*
 	 * fasttrap supports no stability class above ISA, so clamp each of
 	 * the five attribute classes to that ceiling.
 	 */
 	if (pattr->dtpa_provider.dtat_class > DTRACE_CLASS_ISA)
 		pattr->dtpa_provider.dtat_class = DTRACE_CLASS_ISA;
 	if (pattr->dtpa_mod.dtat_class > DTRACE_CLASS_ISA)
 		pattr->dtpa_mod.dtat_class = DTRACE_CLASS_ISA;
 	if (pattr->dtpa_func.dtat_class > DTRACE_CLASS_ISA)
 		pattr->dtpa_func.dtat_class = DTRACE_CLASS_ISA;
 	if (pattr->dtpa_name.dtat_class > DTRACE_CLASS_ISA)
 		pattr->dtpa_name.dtat_class = DTRACE_CLASS_ISA;
 	if (pattr->dtpa_args.dtat_class > DTRACE_CLASS_ISA)
 		pattr->dtpa_args.dtat_class = DTRACE_CLASS_ISA;
 
 	provider = fasttrap_provider_lookup(pid, provname, pattr);
 	if (provider == NULL) {
 		printf("failed to instantiate provider %s for "
 		    "process %u",  provname, (uint_t)pid);
 		return (NULL);
 	}
 
 	/*
 	 * The lookup returned with the provider lock held.  Bump the meta
 	 * provider count so the provider survives until the meta provider
 	 * is told to remove it, then release the lock.
 	 */
 	provider->ftp_mcount++;
 	mutex_exit(&provider->ftp_mtx);
 
 	return (provider);
 }
 
 /*
  * Meta-provider "create probe" operation (see fasttrap_mops).
  *
  * parg is the provider returned by fasttrap_meta_provide(); dhpb describes
  * one probe.  A single probe is created carrying one tracepoint per regular
  * offset (dthpb_offs) followed by one per is-enabled offset (dthpb_enoffs).
  *
  * The function returns without creating anything -- and without reporting
  * an error, since it returns void -- when either offset list contains
  * duplicates, when the probe already exists, or when the fasttrap_max
  * tracepoint budget would be exceeded.  Both offset arrays in dhpb are
  * sorted in place as a side effect.
  */
 /*ARGSUSED*/
 static void
 fasttrap_meta_create_probe(void *arg, void *parg,
     dtrace_helper_probedesc_t *dhpb)
 {
 	fasttrap_provider_t *provider = parg;
 	fasttrap_probe_t *pp;
 	fasttrap_tracepoint_t *tp;
 	int i, j;
 	uint32_t ntps;
 
 	/*
 	 * Since the meta provider count is non-zero we don't have to worry
 	 * about this provider disappearing.
 	 */
 	ASSERT(provider->ftp_mcount > 0);
 
 	/*
 	 * The offsets must be unique.
 	 */
 	qsort(dhpb->dthpb_offs, dhpb->dthpb_noffs, sizeof (uint32_t),
 	    fasttrap_uint32_cmp);
 	for (i = 1; i < dhpb->dthpb_noffs; i++) {
 		if (dhpb->dthpb_base + dhpb->dthpb_offs[i] <=
 		    dhpb->dthpb_base + dhpb->dthpb_offs[i - 1])
 			return;
 	}
 
 	/* The is-enabled offsets must be unique as well. */
 	qsort(dhpb->dthpb_enoffs, dhpb->dthpb_nenoffs, sizeof (uint32_t),
 	    fasttrap_uint32_cmp);
 	for (i = 1; i < dhpb->dthpb_nenoffs; i++) {
 		if (dhpb->dthpb_base + dhpb->dthpb_enoffs[i] <=
 		    dhpb->dthpb_base + dhpb->dthpb_enoffs[i - 1])
 			return;
 	}
 
 	/*
 	 * Grab the creation lock to ensure consistency between calls to
 	 * dtrace_probe_lookup() and dtrace_probe_create() in the face of
 	 * other threads creating probes.
 	 */
 	mutex_enter(&provider->ftp_cmtx);
 
 	if (dtrace_probe_lookup(provider->ftp_provid, dhpb->dthpb_mod,
 	    dhpb->dthpb_func, dhpb->dthpb_name) != 0) {
 		mutex_exit(&provider->ftp_cmtx);
 		return;
 	}
 
 	ntps = dhpb->dthpb_noffs + dhpb->dthpb_nenoffs;
 	ASSERT(ntps > 0);
 
 	/*
 	 * Reserve the tracepoints from the global budget, backing the
 	 * reservation out again if it overshoots fasttrap_max.
 	 */
 	atomic_add_32(&fasttrap_total, ntps);
 
 	if (fasttrap_total > fasttrap_max) {
 		atomic_add_32(&fasttrap_total, -ntps);
 		mutex_exit(&provider->ftp_cmtx);
 		return;
 	}
 
 	pp = kmem_zalloc(offsetof(fasttrap_probe_t, ftp_tps[ntps]), KM_SLEEP);
 
 	pp->ftp_prov = provider;
 	pp->ftp_pid = provider->ftp_pid;
 	pp->ftp_ntps = ntps;
 	pp->ftp_nargs = dhpb->dthpb_xargc;
 	pp->ftp_xtypes = dhpb->dthpb_xtypes;
 	pp->ftp_ntypes = dhpb->dthpb_ntypes;
 
 	/*
 	 * First create a tracepoint for each actual point of interest.
 	 */
 	for (i = 0; i < dhpb->dthpb_noffs; i++) {
 		tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP);
 
 		tp->ftt_proc = provider->ftp_proc;
 		tp->ftt_pc = dhpb->dthpb_base + dhpb->dthpb_offs[i];
 		tp->ftt_pid = provider->ftp_pid;
 
 		pp->ftp_tps[i].fit_tp = tp;
 		pp->ftp_tps[i].fit_id.fti_probe = pp;
 #ifdef __sparc
 		pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_POST_OFFSETS;
 #else
 		pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_OFFSETS;
 #endif
 	}
 
 	/*
 	 * Then create a tracepoint for each is-enabled point.  Note that i
 	 * deliberately carries on from the loop above, so these fill the
 	 * remaining ftp_tps slots while j indexes dthpb_enoffs.
 	 */
 	for (j = 0; i < ntps; i++, j++) {
 		tp = kmem_zalloc(sizeof (fasttrap_tracepoint_t), KM_SLEEP);
 
 		tp->ftt_proc = provider->ftp_proc;
 		tp->ftt_pc = dhpb->dthpb_base + dhpb->dthpb_enoffs[j];
 		tp->ftt_pid = provider->ftp_pid;
 
 		pp->ftp_tps[i].fit_tp = tp;
 		pp->ftp_tps[i].fit_id.fti_probe = pp;
 		pp->ftp_tps[i].fit_id.fti_ptype = DTFTP_IS_ENABLED;
 	}
 
 	/*
 	 * If the arguments are shuffled around we set the argument remapping
 	 * table. Later, when the probe fires, we only remap the arguments
 	 * if the table is non-NULL.
 	 */
 	for (i = 0; i < dhpb->dthpb_xargc; i++) {
 		if (dhpb->dthpb_args[i] != i) {
 			pp->ftp_argmap = dhpb->dthpb_args;
 			break;
 		}
 	}
 
 	/*
 	 * The probe is fully constructed -- register it with DTrace.
 	 */
 	pp->ftp_id = dtrace_probe_create(provider->ftp_provid, dhpb->dthpb_mod,
 	    dhpb->dthpb_func, dhpb->dthpb_name, FASTTRAP_OFFSET_AFRAMES, pp);
 
 	mutex_exit(&provider->ftp_cmtx);
 }
 
 /*
  * Meta-provider "remove" operation (see fasttrap_mops): retire the named
  * USDT provider of process pid on behalf of the meta provider.
  */
 /*ARGSUSED*/
 static void
 fasttrap_meta_remove(void *arg, dtrace_helper_provdesc_t *dhpv, pid_t pid)
 {
 	const char *provname = dhpv->dthpv_provname;
 	const int from_meta = 1;
 
 	/*
 	 * This only puts the provider on death row.  Consumers may still be
 	 * busy adding probes to it; no damage is done to the provider until
 	 * that count has dropped to zero.
 	 */
 	fasttrap_provider_retire(pid, provname, from_meta);
 }
 
 /*
  * Meta-provider operations vector: create-probe, provide and remove
  * handlers, in that order.
  * NOTE(review): presumably handed to the DTrace framework when the meta
  * provider is registered elsewhere in this file -- not visible here.
  */
 static dtrace_mops_t fasttrap_mops = {
 	fasttrap_meta_create_probe,
 	fasttrap_meta_provide,
 	fasttrap_meta_remove
 };
 
 /*
  * Character-device open handler.  All arguments are unused and the open
  * always succeeds (returns 0).
  */
 /*ARGSUSED*/
 static int
 fasttrap_open(struct cdev *dev __unused, int oflags __unused,
     int devtype __unused, struct thread *td __unused)
 {
 	return (0);
 }
 
 /*ARGSUSED*/
 static int
 fasttrap_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int fflag,
     struct thread *td)
 {
 #ifdef notyet
 	struct kinfo_proc kp;
 	const cred_t *cr = td->td_ucred;
 #endif
 	if (!dtrace_attached())
 		return (EAGAIN);
 
 	if (cmd == FASTTRAPIOC_MAKEPROBE) {
 		fasttrap_probe_spec_t *uprobe = *(fasttrap_probe_spec_t **)arg;
 		fasttrap_probe_spec_t *probe;
 		uint64_t noffs;
 		size_t size;
 		int ret, err;
 
 		if (copyin(&uprobe->ftps_noffs, &noffs,
 		    sizeof (uprobe->ftps_noffs)))
 			return (EFAULT);
 
 		/*
 		 * Probes must have at least one tracepoint.
 		 */
 		if (noffs == 0)
 			return (EINVAL);
 
 		size = sizeof (fasttrap_probe_spec_t) +
 		    sizeof (probe->ftps_offs[0]) * (noffs - 1);
 
 		if (size > 1024 * 1024)
 			return (ENOMEM);
 
 		probe = kmem_alloc(size, KM_SLEEP);
 
 		if (copyin(uprobe, probe, size) != 0 ||
 		    probe->ftps_noffs != noffs) {
 			kmem_free(probe, size);
 			return (EFAULT);
 		}
 
 		/*
 		 * Verify that the function and module strings contain no
 		 * funny characters.
 		 */
 		if (u8_validate(probe->ftps_func, strlen(probe->ftps_func),
 		    NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
 			ret = EINVAL;
 			goto err;
 		}
 
 		if (u8_validate(probe->ftps_mod, strlen(probe->ftps_mod),
 		    NULL, U8_VALIDATE_ENTIRE, &err) < 0) {
 			ret = EINVAL;
 			goto err;
 		}
 
 #ifdef notyet
 		if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) {
 			proc_t *p;
 			pid_t pid = probe->ftps_pid;
 
 #if defined(sun)
 			mutex_enter(&pidlock);
 #endif
 			/*
 			 * Report an error if the process doesn't exist
 			 * or is actively being birthed.
 			 */
 			p = pfind(pid);
 			if (p)
 				fill_kinfo_proc(p, &kp);
 			if (p == NULL || kp.ki_stat == SIDL) {
 #if defined(sun)
 				mutex_exit(&pidlock);
 #endif
 				return (ESRCH);
 			}
 #if defined(sun)
 			mutex_enter(&p->p_lock);
 			mutex_exit(&pidlock);
 #else
 			PROC_LOCK_ASSERT(p, MA_OWNED);
 #endif
 
 #ifdef notyet
 			if ((ret = priv_proc_cred_perm(cr, p, NULL,
 			    VREAD | VWRITE)) != 0) {
 #if defined(sun)
 				mutex_exit(&p->p_lock);
 #else
 				PROC_UNLOCK(p);
 #endif
 				return (ret);
 			}
 #endif /* notyet */
 #if defined(sun)
 			mutex_exit(&p->p_lock);
 #else
 			PROC_UNLOCK(p);
 #endif
 		}
 #endif /* notyet */
 
 		ret = fasttrap_add_probe(probe);
 err:
 		kmem_free(probe, size);
 
 		return (ret);
 
 	} else if (cmd == FASTTRAPIOC_GETINSTR) {
 		fasttrap_instr_query_t instr;
 		fasttrap_tracepoint_t *tp;
 		uint_t index;
 #if defined(sun)
 		int ret;
 #endif
 
 #if defined(sun)
 		if (copyin((void *)arg, &instr, sizeof (instr)) != 0)
 			return (EFAULT);
 #endif
 
 #ifdef notyet
 		if (!PRIV_POLICY_CHOICE(cr, PRIV_ALL, B_FALSE)) {
 			proc_t *p;
 			pid_t pid = instr.ftiq_pid;
 
 #if defined(sun)
 			mutex_enter(&pidlock);
 #endif
 			/*
 			 * Report an error if the process doesn't exist
 			 * or is actively being birthed.
 			 */
 			p = pfind(pid);
 			if (p)
 				fill_kinfo_proc(p, &kp);
 			if (p == NULL || kp.ki_stat == SIDL) {
 #if defined(sun)
 				mutex_exit(&pidlock);
 #endif
 				return (ESRCH);
 			}
 #if defined(sun)
 			mutex_enter(&p->p_lock);
 			mutex_exit(&pidlock);
 #else
 			PROC_LOCK_ASSERT(p, MA_OWNED);
 #endif
 
 #ifdef notyet
 			if ((ret = priv_proc_cred_perm(cr, p, NULL,
 			    VREAD)) != 0) {
 #if defined(sun)
 				mutex_exit(&p->p_lock);
 #else
 				PROC_UNLOCK(p);
 #endif
 				return (ret);
 			}
 #endif /* notyet */
 
 #if defined(sun)
 			mutex_exit(&p->p_lock);
 #else
 			PROC_UNLOCK(p);
 #endif
 		}
 #endif /* notyet */
 
 		index = FASTTRAP_TPOINTS_INDEX(instr.ftiq_pid, instr.ftiq_pc);
 
 		mutex_enter(&fasttrap_tpoints.fth_table[index].ftb_mtx);
 		tp = fasttrap_tpoints.fth_table[index].ftb_data;
 		while (tp != NULL) {
 			if (instr.ftiq_pid == tp->ftt_pid &&
 			    instr.ftiq_pc == tp->ftt_pc &&
 			    tp->ftt_proc->ftpc_acount != 0)
 				break;
 
 			tp = tp->ftt_next;
 		}
 
 		if (tp == NULL) {
 			mutex_exit(&fasttrap_tpoints.fth_table[index].ftb_mtx);
 			return (ENOENT);
 		}
 
 		bcopy(&tp->ftt_instr, &instr.ftiq_instr,
 		    sizeof (instr.ftiq_instr));
 		mutex_exit(&fasttrap_tpoints.fth_table[index].ftb_mtx);
 
 		if (copyout(&instr, (void *)arg, sizeof (instr)) != 0)
 			return (EFAULT);
 
 		return (0);
 	}
 
 	return (EINVAL);
 }
 
 static int
 fasttrap_load(void)
 {
 	ulong_t nent;
 	int i, ret;
 
         /* Create the /dev/dtrace/fasttrap entry. */
         fasttrap_cdev = make_dev(&fasttrap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0600,
             "dtrace/fasttrap");
 
 	mtx_init(&fasttrap_cleanup_mtx, "fasttrap clean", "dtrace", MTX_DEF);
 	mutex_init(&fasttrap_count_mtx, "fasttrap count mtx", MUTEX_DEFAULT,
 	    NULL);
 
 #if defined(sun)
 	fasttrap_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
 	    "fasttrap-max-probes", FASTTRAP_MAX_DEFAULT);
 #else
 	fasttrap_max = FASTTRAP_MAX_DEFAULT;
 #endif
 	fasttrap_total = 0;
 
 	/*
 	 * Conjure up the tracepoints hashtable...
 	 */
 #if defined(sun)
 	nent = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
 	    "fasttrap-hash-size", FASTTRAP_TPOINTS_DEFAULT_SIZE);
 #else
 	nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
 #endif
 
 	if (nent == 0 || nent > 0x1000000)
 		nent = FASTTRAP_TPOINTS_DEFAULT_SIZE;
 
 	if ((nent & (nent - 1)) == 0)
 		fasttrap_tpoints.fth_nent = nent;
 	else
 		fasttrap_tpoints.fth_nent = 1 << fasttrap_highbit(nent);
 	ASSERT(fasttrap_tpoints.fth_nent > 0);
 	fasttrap_tpoints.fth_mask = fasttrap_tpoints.fth_nent - 1;
 	fasttrap_tpoints.fth_table = kmem_zalloc(fasttrap_tpoints.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
 #if !defined(sun)
 	for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
 		mutex_init(&fasttrap_tpoints.fth_table[i].ftb_mtx,
 		    "tracepoints bucket mtx", MUTEX_DEFAULT, NULL);
 #endif
 
 	/*
 	 * ... and the providers hash table...
 	 */
 	nent = FASTTRAP_PROVIDERS_DEFAULT_SIZE;
 	if ((nent & (nent - 1)) == 0)
 		fasttrap_provs.fth_nent = nent;
 	else
 		fasttrap_provs.fth_nent = 1 << fasttrap_highbit(nent);
 	ASSERT(fasttrap_provs.fth_nent > 0);
 	fasttrap_provs.fth_mask = fasttrap_provs.fth_nent - 1;
 	fasttrap_provs.fth_table = kmem_zalloc(fasttrap_provs.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
 #if !defined(sun)
 	for (i = 0; i < fasttrap_provs.fth_nent; i++)
 		mutex_init(&fasttrap_provs.fth_table[i].ftb_mtx, 
 		    "providers bucket mtx", MUTEX_DEFAULT, NULL);
 #endif
 
 	ret = kproc_create(fasttrap_pid_cleanup_cb, NULL,
 	    &fasttrap_cleanup_proc, 0, 0, "ftcleanup");
 	if (ret != 0) {
 		destroy_dev(fasttrap_cdev);
 #if !defined(sun)
 		for (i = 0; i < fasttrap_provs.fth_nent; i++)
 			mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
 		for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
 			mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
 #endif
 		kmem_free(fasttrap_provs.fth_table, fasttrap_provs.fth_nent *
 		    sizeof (fasttrap_bucket_t));
 		mtx_destroy(&fasttrap_cleanup_mtx);
 		mutex_destroy(&fasttrap_count_mtx);
 		return (ret);
 	}
 
 
 	/*
 	 * ... and the procs hash table.
 	 */
 	nent = FASTTRAP_PROCS_DEFAULT_SIZE;
 	if ((nent & (nent - 1)) == 0)
 		fasttrap_procs.fth_nent = nent;
 	else
 		fasttrap_procs.fth_nent = 1 << fasttrap_highbit(nent);
 	ASSERT(fasttrap_procs.fth_nent > 0);
 	fasttrap_procs.fth_mask = fasttrap_procs.fth_nent - 1;
 	fasttrap_procs.fth_table = kmem_zalloc(fasttrap_procs.fth_nent *
 	    sizeof (fasttrap_bucket_t), KM_SLEEP);
 #if !defined(sun)
 	for (i = 0; i < fasttrap_procs.fth_nent; i++)
 		mutex_init(&fasttrap_procs.fth_table[i].ftb_mtx,
 		    "processes bucket mtx", MUTEX_DEFAULT, NULL);
 
 	CPU_FOREACH(i) {
 		mutex_init(&fasttrap_cpuc_pid_lock[i], "fasttrap barrier",
 		    MUTEX_DEFAULT, NULL);
 	}
 
 	/*
 	 * This event handler must run before kdtrace_thread_dtor() since it
 	 * accesses the thread's struct kdtrace_thread.
 	 */
 	fasttrap_thread_dtor_tag = EVENTHANDLER_REGISTER(thread_dtor,
 	    fasttrap_thread_dtor, NULL, EVENTHANDLER_PRI_FIRST);
 #endif
 
 	/*
 	 * Install our hooks into fork(2), exec(2), and exit(2).
 	 */
 	dtrace_fasttrap_fork = &fasttrap_fork;
 	dtrace_fasttrap_exit = &fasttrap_exec_exit;
 	dtrace_fasttrap_exec = &fasttrap_exec_exit;
 
 	(void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
 	    &fasttrap_meta_id);
 
 	return (0);
 }
 
 /*
  * Tear down the fasttrap provider.
  *
  * Returns 0 on success.  Returns -1 if the meta-provider cannot be
  * unregistered, or if any provider fails dtrace_unregister(); in the
  * latter case the meta-provider is registered again before returning and
  * the providers that did unregister have already been freed.
  */
 static int
 fasttrap_unload(void)
 {
 	int i, fail = 0;
 
 	/*
 	 * Unregister the meta-provider to make sure no new fasttrap-
 	 * managed providers come along while we're trying to close up
 	 * shop. If we fail to detach, we'll need to re-register as a
 	 * meta-provider. We can fail to unregister as a meta-provider
 	 * if providers we manage still exist.
 	 */
 	if (fasttrap_meta_id != DTRACE_METAPROVNONE &&
 	    dtrace_meta_unregister(fasttrap_meta_id) != 0)
 		return (-1);
 
 	/*
 	 * Iterate over all of our providers. If there's still a process
 	 * that corresponds to that pid, fail to detach.
 	 */
 	for (i = 0; i < fasttrap_provs.fth_nent; i++) {
 		fasttrap_provider_t **fpp, *fp;
 		fasttrap_bucket_t *bucket = &fasttrap_provs.fth_table[i];
 
 		mutex_enter(&bucket->ftb_mtx);
 		fpp = (fasttrap_provider_t **)&bucket->ftb_data;
 		while ((fp = *fpp) != NULL) {
 			/*
 			 * Acquire and release the lock as a simple way of
 			 * waiting for any other consumer to finish with
 			 * this provider. A thread must first acquire the
 			 * bucket lock so there's no chance of another thread
 			 * blocking on the provider's lock.
 			 */
 			mutex_enter(&fp->ftp_mtx);
 			mutex_exit(&fp->ftp_mtx);
 
 			if (dtrace_unregister(fp->ftp_provid) != 0) {
 				/* Leave it on the chain and step past it. */
 				fail = 1;
 				fpp = &fp->ftp_next;
 			} else {
 				/* Unlink from the chain, then free. */
 				*fpp = fp->ftp_next;
 				fasttrap_provider_free(fp);
 			}
 		}
 
 		mutex_exit(&bucket->ftb_mtx);
 	}
 
 	if (fail) {
 		(void) dtrace_meta_register("fasttrap", &fasttrap_mops, NULL,
 		    &fasttrap_meta_id);
 
 		return (-1);
 	}
 
 	/*
 	 * Stop new processes from entering these hooks now, before the
 	 * fasttrap_cleanup thread runs.  That way all processes will hopefully
 	 * be out of these hooks before we free fasttrap_provs.fth_table
 	 */
 	ASSERT(dtrace_fasttrap_fork == &fasttrap_fork);
 	dtrace_fasttrap_fork = NULL;
 
 	ASSERT(dtrace_fasttrap_exec == &fasttrap_exec_exit);
 	dtrace_fasttrap_exec = NULL;
 
 	ASSERT(dtrace_fasttrap_exit == &fasttrap_exec_exit);
 	dtrace_fasttrap_exit = NULL;
 
 	mtx_lock(&fasttrap_cleanup_mtx);
 	fasttrap_cleanup_drain = 1;
 	/* Wait for the cleanup thread to finish up and signal us. */
 	wakeup(&fasttrap_cleanup_cv);
 	mtx_sleep(&fasttrap_cleanup_drain, &fasttrap_cleanup_mtx, 0, "ftcld",
 	    0);
 	fasttrap_cleanup_proc = NULL;
 	mtx_destroy(&fasttrap_cleanup_mtx);
 
 #ifdef DEBUG
 	mutex_enter(&fasttrap_count_mtx);
 	ASSERT(fasttrap_pid_count == 0);
 	mutex_exit(&fasttrap_count_mtx);
 #endif
 
 #if !defined(sun)
 	EVENTHANDLER_DEREGISTER(thread_dtor, fasttrap_thread_dtor_tag);
 
 	/* Destroy the bucket locks before freeing the tables holding them. */
 	for (i = 0; i < fasttrap_tpoints.fth_nent; i++)
 		mutex_destroy(&fasttrap_tpoints.fth_table[i].ftb_mtx);
 	for (i = 0; i < fasttrap_provs.fth_nent; i++)
 		mutex_destroy(&fasttrap_provs.fth_table[i].ftb_mtx);
 	for (i = 0; i < fasttrap_procs.fth_nent; i++)
 		mutex_destroy(&fasttrap_procs.fth_table[i].ftb_mtx);
 #endif
 	/* Free each table and zero its entry count. */
 	kmem_free(fasttrap_tpoints.fth_table,
 	    fasttrap_tpoints.fth_nent * sizeof (fasttrap_bucket_t));
 	fasttrap_tpoints.fth_nent = 0;
 
 	kmem_free(fasttrap_provs.fth_table,
 	    fasttrap_provs.fth_nent * sizeof (fasttrap_bucket_t));
 	fasttrap_provs.fth_nent = 0;
 
 	kmem_free(fasttrap_procs.fth_table,
 	    fasttrap_procs.fth_nent * sizeof (fasttrap_bucket_t));
 	fasttrap_procs.fth_nent = 0;
 
 #if !defined(sun)
 	destroy_dev(fasttrap_cdev);
 	mutex_destroy(&fasttrap_count_mtx);
 	CPU_FOREACH(i) {
 		mutex_destroy(&fasttrap_cpuc_pid_lock[i]);
 	}
 #endif
 
 	return (0);
 }
 
 /*
  * Module event handler.  Load, unload and shutdown events are accepted
  * without doing any work here; every other event type is rejected with
  * EOPNOTSUPP.
  */
 /* ARGSUSED */
 static int
 fasttrap_modevent(module_t mod __unused, int type, void *data __unused)
 {
 	if (type == MOD_LOAD || type == MOD_UNLOAD || type == MOD_SHUTDOWN)
 		return (0);
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * fasttrap_load() and fasttrap_unload() are hooked in as SYSINIT and
  * SYSUNINIT routines at SI_SUB_DTRACE_PROVIDER; fasttrap_modevent()
  * itself performs no setup or teardown.
  */
 SYSINIT(fasttrap_load, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY, fasttrap_load,
     NULL);
 SYSUNINIT(fasttrap_unload, SI_SUB_DTRACE_PROVIDER, SI_ORDER_ANY,
     fasttrap_unload, NULL);
 
 DEV_MODULE(fasttrap, fasttrap_modevent, NULL);
 MODULE_VERSION(fasttrap, 1);
 /* The module declares dependencies on dtrace and opensolaris. */
 MODULE_DEPEND(fasttrap, dtrace, 1, 1, 1);
 MODULE_DEPEND(fasttrap, opensolaris, 1, 1, 1);
Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/dtrace/profile.c	(revision 271001)
@@ -1,576 +1,576 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #pragma ident	"%Z%%M%	%I%	%E% SMI"
 
 #include <sys/errno.h>
 #include <sys/stat.h>
 #include <sys/modctl.h>
 #include <sys/conf.h>
 #include <sys/systm.h>
 #include <sys/ddi.h>
 #include <sys/sunddi.h>
 #include <sys/cpuvar.h>
 #include <sys/kmem.h>
 #include <sys/strsubr.h>
 #include <sys/dtrace.h>
 #include <sys/cyclic.h>
 #include <sys/atomic.h>
 
 static dev_info_t *profile_devi;
 static dtrace_provider_id_t profile_id;
 
 /*
  * Regardless of platform, the stack frames look like this in the case of the
  * profile provider:
  *
  *	profile_fire
  *	cyclic_expire
  *	cyclic_fire
  *	[ cbe ]
  *	[ interrupt code ]
  *
  * On x86, there are five frames from the generic interrupt code; further, the
  * interrupted instruction appears as its own stack frame, giving us a total of
  * 10.
  *
  * On SPARC, the picture is further complicated because the compiler
  * optimizes away tail-calls -- so the following frames are optimized away:
  *
  * 	profile_fire
  *	cyclic_expire
  *
  * This gives three frames.  However, on DEBUG kernels, the cyclic_expire
  * frame cannot be tail-call eliminated, yielding four frames in this case.
  *
  * All of the above constraints lead to the mess below.  Yes, the profile
  * provider should ideally figure this out on-the-fly by hitting one of its own
  * probes and then walking its own stack trace.  This is complicated, however,
  * and the static definition doesn't seem to be overly brittle.  Still, we
  * allow for a manual override in case we get it completely wrong.
  */
 #ifdef __x86
 #define	PROF_ARTIFICIAL_FRAMES	10
 #else
 #ifdef __sparc
 #ifdef DEBUG
 #define	PROF_ARTIFICIAL_FRAMES	4
 #else
 #define	PROF_ARTIFICIAL_FRAMES	3
 #endif
 #endif
 #endif
 
 #define	PROF_NAMELEN		15
 
 #define	PROF_PROFILE		0
 #define	PROF_TICK		1
 #define	PROF_PREFIX_PROFILE	"profile-"
 #define	PROF_PREFIX_TICK	"tick-"
 
 /*
  * State for one profile or tick probe; allocated in profile_create() and
  * freed in profile_destroy().
  */
 typedef struct profile_probe {
 	char		prof_name[PROF_NAMELEN];	/* copy of probe name */
 	dtrace_id_t	prof_id;	/* id from dtrace_probe_create() */
 	int		prof_kind;	/* PROF_PROFILE or PROF_TICK */
 	hrtime_t	prof_interval;	/* firing interval */
 	cyclic_id_t	prof_cyclic;	/* CYCLIC_NONE while not enabled */
 } profile_probe_t;
 
 /*
  * State handed to profile_fire(); allocated in profile_online() and freed
  * in profile_offline().
  */
 typedef struct profile_probe_percpu {
 	hrtime_t	profc_expected;	/* time the next firing is due */
 	hrtime_t	profc_interval;	/* added to profc_expected per firing */
 	profile_probe_t	*profc_probe;	/* probe this state belongs to */
 } profile_probe_percpu_t;
 
 hrtime_t	profile_interval_min = NANOSEC / 5000;		/* 5000 hz */
 int		profile_aframes = 0;				/* override */
 
 /*
  * Rates, in firings per second, of the profile-<rate> probes that
  * profile_provide() creates when called without a probe description.
  * Entries that are zero are skipped.
  */
 static int profile_rates[] = {
     97, 199, 499, 997, 1999,
     4001, 4999, 0, 0, 0,
     0, 0, 0, 0, 0,
     0, 0, 0, 0, 0
 };
 
 /*
  * Rates, in firings per second, of the tick-<rate> probes that
  * profile_provide() creates when called without a probe description.
  * Entries that are zero are skipped.
  */
 static int profile_ticks[] = {
     1, 10, 100, 500, 1000,
     5000, 0, 0, 0, 0,
     0, 0, 0, 0, 0
 };
 
 /*
  * profile_max defines the upper bound on the number of profile probes that
  * can exist (this is to prevent malicious or clumsy users from exhausting
  * system resources by creating a slew of profile probes). At mod load time,
  * this gets its value from PROFILE_MAX_DEFAULT or profile-max-probes if it's
  * present in the profile.conf file.
  */
 #define	PROFILE_MAX_DEFAULT	1000	/* default max. number of probes */
 static uint32_t profile_max;		/* maximum number of profile probes */
 static uint32_t profile_total;	/* current number of profile probes */
 
 /*
  * Cyclic handler installed by profile_online(); arg is the
  * profile_probe_percpu_t set up there.  Fires the probe with the amount by
  * which this firing is late as its third argument, and moves the expected
  * time forward by one interval.
  */
 static void
 profile_fire(void *arg)
 {
 	profile_probe_percpu_t *percpu = arg;
 	hrtime_t now, lateness;
 
 	now = dtrace_gethrtime();
 	lateness = now - percpu->profc_expected;
 	percpu->profc_expected += percpu->profc_interval;
 
 	dtrace_probe(percpu->profc_probe->prof_id, CPU->cpu_profile_pc,
 	    CPU->cpu_profile_upc, lateness, 0, 0);
 }
 
 static void
 profile_tick(void *arg)
 {
 	profile_probe_t *prof = arg;
 
 	dtrace_probe(prof->prof_id, CPU->cpu_profile_pc,
 	    CPU->cpu_profile_upc, 0, 0, 0);
 }
 
 /*
  * Create the probe `name' of the given kind (PROF_PROFILE or PROF_TICK)
  * with the given firing interval.  Nothing is created when the interval
  * is below profile_interval_min, when dtrace_probe_lookup() already
  * returns a probe for the name, or when the probe would push
  * profile_total above profile_max.
  */
 static void
 profile_create(hrtime_t interval, const char *name, int kind)
 {
 	profile_probe_t *prof;
 	int nr_frames = PROF_ARTIFICIAL_FRAMES + dtrace_mach_aframes();
 
 	/* A nonzero profile_aframes overrides the computed frame count. */
 	if (profile_aframes)
 		nr_frames = profile_aframes;
 
 	if (interval < profile_interval_min)
 		return;
 
 	if (dtrace_probe_lookup(profile_id, NULL, NULL, name) != 0)
 		return;
 
-	atomic_add_32(&profile_total, 1);
+	atomic_inc_32(&profile_total);
 	if (profile_total > profile_max) {
-		atomic_add_32(&profile_total, -1);
+		atomic_dec_32(&profile_total);
 		return;
 	}
 
 	prof = kmem_zalloc(sizeof (profile_probe_t), KM_SLEEP);
 	/*
 	 * The name may come straight from a probe description and can be
 	 * longer than PROF_NAMELEN - 1 characters, so bound the copy to
 	 * the size of prof_name.  The probe itself is still created under
 	 * the full name below.
 	 */
 	(void) snprintf(prof->prof_name, sizeof (prof->prof_name), "%s",
 	    name);
 	prof->prof_interval = interval;
 	prof->prof_cyclic = CYCLIC_NONE;
 	prof->prof_kind = kind;
 	prof->prof_id = dtrace_probe_create(profile_id,
 	    NULL, NULL, name, nr_frames, prof);
 }
 
 /*
  * Provide profile probes.
  *
  * With no description, create the default profile-<rate> and tick-<rate>
  * probes listed in profile_rates[] and profile_ticks[].
  *
  * Otherwise parse desc->dtpd_name, which must have the form
  * <prefix><digits>[<suffix>], where <prefix> is "profile-" or "tick-" and
  * <suffix> is one of the names in suffixes[] (compared without regard to
  * case).  With no suffix, or with "hz", the number is a frequency per
  * second; otherwise it is multiplied by the suffix's multiplier.  Names
  * that do not parse, or whose numeric value is zero, are ignored.
  */
 /*ARGSUSED*/
 static void
 profile_provide(void *arg, const dtrace_probedesc_t *desc)
 {
 	int i, j, rate, kind;
 	hrtime_t val = 0, mult = 1, len;
 	const char *name, *suffix = NULL;
 
 	const struct {
 		char *prefix;
 		int kind;
 	} types[] = {
 		{ PROF_PREFIX_PROFILE, PROF_PROFILE },
 		{ PROF_PREFIX_TICK, PROF_TICK },
 		{ NULL, 0 }
 	};
 
 	const struct {
 		char *name;
 		hrtime_t mult;
 	} suffixes[] = {
 		{ "ns", 	NANOSEC / NANOSEC },
 		{ "nsec",	NANOSEC / NANOSEC },
 		{ "us",		NANOSEC / MICROSEC },
 		{ "usec",	NANOSEC / MICROSEC },
 		{ "ms",		NANOSEC / MILLISEC },
 		{ "msec",	NANOSEC / MILLISEC },
 		{ "s",		NANOSEC / SEC },
 		{ "sec",	NANOSEC / SEC },
 		{ "m",		NANOSEC * (hrtime_t)60 },
 		{ "min",	NANOSEC * (hrtime_t)60 },
 		{ "h",		NANOSEC * (hrtime_t)(60 * 60) },
 		{ "hour",	NANOSEC * (hrtime_t)(60 * 60) },
 		{ "d",		NANOSEC * (hrtime_t)(24 * 60 * 60) },
 		{ "day",	NANOSEC * (hrtime_t)(24 * 60 * 60) },
 		{ "hz",		0 },
 		{ NULL }
 	};
 
 	if (desc == NULL) {
 		char n[PROF_NAMELEN];
 
 		/*
 		 * If no description was provided, provide all of our probes.
 		 */
 		for (i = 0; i < sizeof (profile_rates) / sizeof (int); i++) {
 			if ((rate = profile_rates[i]) == 0)
 				continue;
 
 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
 			    PROF_PREFIX_PROFILE, rate);
 			profile_create(NANOSEC / rate, n, PROF_PROFILE);
 		}
 
 		for (i = 0; i < sizeof (profile_ticks) / sizeof (int); i++) {
 			if ((rate = profile_ticks[i]) == 0)
 				continue;
 
 			(void) snprintf(n, PROF_NAMELEN, "%s%d",
 			    PROF_PREFIX_TICK, rate);
 			profile_create(NANOSEC / rate, n, PROF_TICK);
 		}
 
 		return;
 	}
 
 	name = desc->dtpd_name;
 
 	/*
 	 * Find which prefix the name starts with; len is left holding the
 	 * length of the last prefix that was tried.
 	 */
 	for (i = 0; types[i].prefix != NULL; i++) {
 		len = strlen(types[i].prefix);
 
 		if (strncmp(name, types[i].prefix, len) != 0)
 			continue;
 		break;
 	}
 
 	if (types[i].prefix == NULL)
 		return;
 
 	kind = types[i].kind;
 
 	/*
 	 * We need to start before any time suffix.  The scan begins at the
 	 * terminating NUL, so suffix is always set (possibly to the empty
 	 * string) by the time the loop ends.
 	 */
 	for (j = strlen(name); j >= len; j--) {
 		if (name[j] >= '0' && name[j] <= '9')
 			break;
 		suffix = &name[j];
 	}
 
 	ASSERT(suffix != NULL);
 
 	/*
 	 * Now determine the numerical value present in the probe name.
 	 * Digits are consumed from least to most significant.
 	 * NOTE: val and mult are not checked for overflow here.
 	 */
 	for (; j >= len; j--) {
 		if (name[j] < '0' || name[j] > '9')
 			return;
 
 		val += (name[j] - '0') * mult;
 		mult *= (hrtime_t)10;
 	}
 
 	if (val == 0)
 		return;
 
 	/*
 	 * Look-up the suffix to determine the multiplier.
 	 */
 	for (i = 0, mult = 0; suffixes[i].name != NULL; i++) {
 		if (strcasecmp(suffixes[i].name, suffix) == 0) {
 			mult = suffixes[i].mult;
 			break;
 		}
 	}
 
 	/* An unrecognized, non-empty suffix makes the name invalid. */
 	if (suffixes[i].name == NULL && *suffix != '\0')
 		return;
 
 	if (mult == 0) {
 		/*
 		 * The default is frequency-per-second.
 		 */
 		val = NANOSEC / val;
 	} else {
 		val *= mult;
 	}
 
 	profile_create(val, name, kind);
 }
 
 /*
  * Destroy entry point of profile_pops: free the profile_probe_t passed as
  * parg and drop the probe count.  The probe must not have a cyclic
  * (prof_cyclic must be CYCLIC_NONE).
  */
 /*ARGSUSED*/
 static void
 profile_destroy(void *arg, dtrace_id_t id, void *parg)
 {
 	profile_probe_t *prof = parg;
 
 	ASSERT(prof->prof_cyclic == CYCLIC_NONE);
 	kmem_free(prof, sizeof (profile_probe_t));
 
 	ASSERT(profile_total >= 1);
-	atomic_add_32(&profile_total, -1);
+	atomic_dec_32(&profile_total);
 }
 
 /*
  * Online callback of the omni cyclic set up in profile_enable().  Allocates
  * the per-CPU state for the probe passed as arg and fills in the handler
  * (profile_fire at CY_HIGH_LEVEL) and the timing (first firing one interval
  * from now) for the caller.
  */
 /*ARGSUSED*/
 static void
 profile_online(void *arg, cpu_t *cpu, cyc_handler_t *hdlr, cyc_time_t *when)
 {
 	profile_probe_t *probe = arg;
 	profile_probe_percpu_t *state;
 
 	state = kmem_zalloc(sizeof (*state), KM_SLEEP);
 	state->profc_probe = probe;
 
 	hdlr->cyh_level = CY_HIGH_LEVEL;
 	hdlr->cyh_arg = state;
 	hdlr->cyh_func = profile_fire;
 
 	when->cyt_interval = probe->prof_interval;
 	when->cyt_when = dtrace_gethrtime() + when->cyt_interval;
 
 	/* Seed the lateness tracking used by profile_fire(). */
 	state->profc_interval = when->cyt_interval;
 	state->profc_expected = when->cyt_when;
 }
 
 /*
  * Offline callback of the omni cyclic set up in profile_enable(): free the
  * per-CPU state (oarg) that profile_online() allocated.  arg is the probe
  * the state must belong to.
  */
 /*ARGSUSED*/
 static void
 profile_offline(void *arg, cpu_t *cpu, void *oarg)
 {
 	profile_probe_percpu_t *pcpu = oarg;
 
 	ASSERT(pcpu->profc_probe == arg);
 	kmem_free(pcpu, sizeof (profile_probe_percpu_t));
 }
 
 /*
  * Enable entry point of profile_pops.  A tick probe gets a single cyclic
  * running profile_tick(); a profile probe gets an omni cyclic whose
  * online/offline callbacks are profile_online()/profile_offline().  The
  * resulting cyclic id is stored in prof_cyclic.  cpu_lock must be held.
  */
 /*ARGSUSED*/
 static void
 profile_enable(void *arg, dtrace_id_t id, void *parg)
 {
 	profile_probe_t *prof = parg;
 	cyc_omni_handler_t omni;
 
 	ASSERT(prof->prof_interval != 0);
 	ASSERT(MUTEX_HELD(&cpu_lock));
 
 	if (prof->prof_kind == PROF_TICK) {
 		cyc_handler_t hdlr;
 		cyc_time_t when;
 
 		hdlr.cyh_func = profile_tick;
 		hdlr.cyh_arg = prof;
 		hdlr.cyh_level = CY_HIGH_LEVEL;
 
 		when.cyt_interval = prof->prof_interval;
 		when.cyt_when = dtrace_gethrtime() + when.cyt_interval;
 
 		prof->prof_cyclic = cyclic_add(&hdlr, &when);
 		return;
 	}
 
 	ASSERT(prof->prof_kind == PROF_PROFILE);
 	omni.cyo_online = profile_online;
 	omni.cyo_offline = profile_offline;
 	omni.cyo_arg = prof;
 
 	prof->prof_cyclic = cyclic_add_omni(&omni);
 }
 
 /*
  * Disable entry point of profile_pops: remove the probe's cyclic and mark
  * the probe as having none.  cpu_lock must be held and the probe must
  * currently have a cyclic.
  */
 /*ARGSUSED*/
 static void
 profile_disable(void *arg, dtrace_id_t id, void *parg)
 {
 	profile_probe_t *prof = parg;
 
 	ASSERT(prof->prof_cyclic != CYCLIC_NONE);
 	ASSERT(MUTEX_HELD(&cpu_lock));
 
 	cyclic_remove(prof->prof_cyclic);
 	prof->prof_cyclic = CYCLIC_NONE;
 }
 
 /*
  * Usermode entry point of profile_pops: returns 1 when the current CPU's
  * cpu_profile_pc is zero and 0 otherwise.
  */
 /*ARGSUSED*/
 static int
 profile_usermode(void *arg, dtrace_id_t id, void *parg)
 {
 	if (CPU->cpu_profile_pc == 0)
 		return (1);
 
 	return (0);
 }
 
 static dtrace_pattr_t profile_attr = {
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 { DTRACE_STABILITY_UNSTABLE, DTRACE_STABILITY_UNSTABLE, DTRACE_CLASS_UNKNOWN },
 { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN },
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_COMMON },
 };
 
 /*
  * Provider operations vector handed to dtrace_register() in
  * profile_attach().  Slots this provider does not implement are NULL.
  */
 static dtrace_pops_t profile_pops = {
 	profile_provide,
 	NULL,
 	profile_enable,
 	profile_disable,
 	NULL,
 	NULL,
 	NULL,
 	NULL,
 	profile_usermode,
 	profile_destroy
 };
 
 /*
  * Attach entry point.  DDI_RESUME succeeds without doing anything and any
  * command other than DDI_ATTACH fails.  For DDI_ATTACH, create the
  * "profile" minor node and register the provider; if either step fails the
  * minor node is removed and DDI_FAILURE is returned.  On success
  * profile_max is read from the "profile-max-probes" property (default
  * PROFILE_MAX_DEFAULT) and devi is remembered in profile_devi.
  */
 static int
 profile_attach(dev_info_t *devi, ddi_attach_cmd_t cmd)
 {
 	int failed;
 
 	if (cmd == DDI_RESUME)
 		return (DDI_SUCCESS);
 	if (cmd != DDI_ATTACH)
 		return (DDI_FAILURE);
 
 	/* Registration is only attempted once the minor node exists. */
 	failed = (ddi_create_minor_node(devi, "profile", S_IFCHR, 0,
 	    DDI_PSEUDO, NULL) == DDI_FAILURE);
 	if (!failed) {
 		failed = (dtrace_register("profile", &profile_attr,
 		    DTRACE_PRIV_KERNEL | DTRACE_PRIV_USER, NULL,
 		    &profile_pops, NULL, &profile_id) != 0);
 	}
 
 	if (failed) {
 		ddi_remove_minor_node(devi, NULL);
 		return (DDI_FAILURE);
 	}
 
 	profile_max = ddi_getprop(DDI_DEV_T_ANY, devi, DDI_PROP_DONTPASS,
 	    "profile-max-probes", PROFILE_MAX_DEFAULT);
 
 	ddi_report_dev(devi);
 	profile_devi = devi;
 	return (DDI_SUCCESS);
 }
 
 /*
  * Detach entry point.  DDI_SUSPEND succeeds without doing anything and any
  * command other than DDI_DETACH fails.  For DDI_DETACH, unregister the
  * provider and, only if that succeeds, remove the minor node.
  */
 static int
 profile_detach(dev_info_t *devi, ddi_detach_cmd_t cmd)
 {
 	if (cmd == DDI_SUSPEND)
 		return (DDI_SUCCESS);
 	if (cmd != DDI_DETACH)
 		return (DDI_FAILURE);
 
 	if (dtrace_unregister(profile_id) != 0)
 		return (DDI_FAILURE);
 
 	ddi_remove_minor_node(devi, NULL);
 	return (DDI_SUCCESS);
 }
 
 /*
  * get_dev_info entry point.  DDI_INFO_DEVT2DEVINFO stores profile_devi in
  * *result and DDI_INFO_DEVT2INSTANCE stores 0; both return DDI_SUCCESS.
  * Any other command returns DDI_FAILURE and leaves *result untouched.
  */
 /*ARGSUSED*/
 static int
 profile_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
 {
 	if (infocmd == DDI_INFO_DEVT2DEVINFO) {
 		*result = (void *)profile_devi;
 		return (DDI_SUCCESS);
 	}
 
 	if (infocmd == DDI_INFO_DEVT2INSTANCE) {
 		*result = (void *)0;
 		return (DDI_SUCCESS);
 	}
 
 	return (DDI_FAILURE);
 }
 
 /*
  * Open entry point of profile_cb_ops: always succeeds and ignores all of
  * its arguments.
  */
 /*ARGSUSED*/
 static int
 profile_open(dev_t *devp, int flag, int otyp, cred_t *cred_p)
 {
 	return (0);
 }
 
 static struct cb_ops profile_cb_ops = {
 	profile_open,		/* open */
 	nodev,			/* close */
 	nulldev,		/* strategy */
 	nulldev,		/* print */
 	nodev,			/* dump */
 	nodev,			/* read */
 	nodev,			/* write */
 	nodev,			/* ioctl */
 	nodev,			/* devmap */
 	nodev,			/* mmap */
 	nodev,			/* segmap */
 	nochpoll,		/* poll */
 	ddi_prop_op,		/* cb_prop_op */
 	0,			/* streamtab  */
 	D_NEW | D_MP		/* Driver compatibility flag */
 };
 
 static struct dev_ops profile_ops = {
 	DEVO_REV,		/* devo_rev, */
 	0,			/* refcnt  */
 	profile_info,		/* get_dev_info */
 	nulldev,		/* identify */
 	nulldev,		/* probe */
 	profile_attach,		/* attach */
 	profile_detach,		/* detach */
 	nodev,			/* reset */
 	&profile_cb_ops,	/* driver operations */
 	NULL,			/* bus operations */
 	nodev			/* dev power */
 };
 
 /*
  * Module linkage information for the kernel.
  */
 static struct modldrv modldrv = {
 	&mod_driverops,		/* module type (this is a pseudo driver) */
 	"Profile Interrupt Tracing",	/* name of module */
 	&profile_ops,		/* driver ops */
 };
 
 static struct modlinkage modlinkage = {
 	MODREV_1,
 	(void *)&modldrv,
 	NULL
 };
 
 /*
  * Module entry point: passes modlinkage to mod_install() and returns its
  * result.
  */
 int
 _init(void)
 {
 	return (mod_install(&modlinkage));
 }
 
 /*
  * Module entry point: passes modlinkage and modinfop to mod_info() and
  * returns its result.
  */
 int
 _info(struct modinfo *modinfop)
 {
 	return (mod_info(&modlinkage, modinfop));
 }
 
 /*
  * Module entry point: passes modlinkage to mod_remove() and returns its
  * result.
  */
 int
 _fini(void)
 {
 	return (mod_remove(&modlinkage));
 }
Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c	(revision 271001)
@@ -1,2858 +1,2858 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_tx.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/blkptr.h>
 #include <sys/range_tree.h>
 
 /*
  * Number of times that zfs_free_range() took the slow path while doing
  * a zfs receive.  A nonzero value indicates a potential performance problem.
  */
 uint64_t zfs_free_range_recv_miss;
 
 static void dbuf_destroy(dmu_buf_impl_t *db);
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
 /*
  * Global data structures and functions for the dbuf cache.
  */
 static kmem_cache_t *dbuf_cache;
 
 /*
  * Constructor for dbuf_cache objects: zero the dbuf, then initialize its
  * mutex, condition variable and hold count.  Always returns 0.
  */
 /* ARGSUSED */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
 {
 	dmu_buf_impl_t *dbuf = vdb;
 
 	bzero(dbuf, sizeof (*dbuf));
 
 	mutex_init(&dbuf->db_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dbuf->db_changed, NULL, CV_DEFAULT, NULL);
 	refcount_create(&dbuf->db_holds);
 
 	return (0);
 }
 
 /*
  * Destructor for dbuf_cache objects: tear down what dbuf_cons() set up
  * (mutex, condition variable, hold count).
  */
 /* ARGSUSED */
 static void
 dbuf_dest(void *vdb, void *unused)
 {
 	dmu_buf_impl_t *dbuf = vdb;
 
 	mutex_destroy(&dbuf->db_mtx);
 	cv_destroy(&dbuf->db_changed);
 	refcount_destroy(&dbuf->db_holds);
 }
 
 /*
  * dbuf hash table routines
  */
 static dbuf_hash_table_t dbuf_hash_table;
 
 static uint64_t dbuf_hash_count;
 
 /*
  * Hash an (objset, object, level, blkid) tuple.  Six bytes taken from the
  * inputs are run through zfs_crc64_table, and the higher bits of the
  * objset pointer, object number and block id are then XORed in.
  */
 static uint64_t
 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
 {
 	uintptr_t osv = (uintptr_t)os;
 	/* Only the low byte of each entry takes part in the table step. */
 	const uint64_t input[] = {
 		lvl, osv >> 6, obj >> 0, obj >> 8, blkid >> 0, blkid >> 8
 	};
 	uint64_t crc = -1ULL;
 	size_t k;
 
 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 	for (k = 0; k < sizeof (input) / sizeof (input[0]); k++)
 		crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ input[k]) & 0xFF];
 
 	crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16);
 
 	return (crc);
 }
 
 /*
  * Hash of a dbuf identity.  The expansion deliberately has no trailing
  * semicolon so the macro can be used inside any expression; callers
  * supply their own statement terminator.
  */
 #define	DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid)
 
 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
 	((dbuf)->db.db_object == (obj) &&		\
 	(dbuf)->db_objset == (os) &&			\
 	(dbuf)->db_level == (level) &&			\
 	(dbuf)->db_blkid == (blkid))
 
 /*
  * Look up the dbuf for (dn's objset, dn's object, level, blkid) in the
  * hash table.
  *
  * Returns the matching dbuf with its db_mtx held (the caller must release
  * it), or NULL if there is no match.  A match in the DB_EVICTING state is
  * skipped.
  */
 dmu_buf_impl_t *
 dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	objset_t *os = dn->dn_objset;
 	uint64_t obj = dn->dn_object;
 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *db;
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
 			/* db_mtx is taken while the hash mutex is held. */
 			mutex_enter(&db->db_mtx);
 			if (db->db_state != DB_EVICTING) {
 				/* Return with db_mtx still held. */
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (db);
 			}
 			mutex_exit(&db->db_mtx);
 		}
 	}
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	return (NULL);
 }
 
 /*
  * Insert an entry into the hash table.  If there is already an element
  * equal to elem in the hash table, then the already existing element
  * will be returned and the new element will not be inserted.
  * Otherwise returns NULL.
  *
  * In both cases a db_mtx is held on return: that of the existing element
  * when one is returned, and that of db when db was inserted.  An existing
  * element in the DB_EVICTING state does not count as a match.
  */
 static dmu_buf_impl_t *
 dbuf_hash_insert(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	objset_t *os = db->db_objset;
 	uint64_t obj = db->db.db_object;
 	int level = db->db_level;
 	uint64_t blkid = db->db_blkid;
 	uint64_t hv = DBUF_HASH(os, obj, level, blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *dbf;
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
 			mutex_enter(&dbf->db_mtx);
 			if (dbf->db_state != DB_EVICTING) {
 				/* Return the existing dbuf, db_mtx held. */
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (dbf);
 			}
 			mutex_exit(&dbf->db_mtx);
 		}
 	}
 
 	/* No usable match: push db onto the head of the chain. */
 	mutex_enter(&db->db_mtx);
 	db->db_hash_next = h->hash_table[idx];
 	h->hash_table[idx] = db;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
-	atomic_add_64(&dbuf_hash_count, 1);
+	atomic_inc_64(&dbuf_hash_count);
 
 	return (NULL);
 }
 
 /*
  * Remove an entry from the hash table.  It must be in the EVICTING state,
  * have no holds, and the caller must not hold its db_mtx.  The entry must
  * be present in the table.
  */
 static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object,
 	    db->db_level, db->db_blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *dbf, **dbp;
 
 	/*
 	 * We mustn't hold db_mtx to maintain lock ordering:
 	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_state == DB_EVICTING);
 	ASSERT(!MUTEX_HELD(&db->db_mtx));
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	dbp = &h->hash_table[idx];
 	while ((dbf = *dbp) != db) {
 		/*
 		 * Running off the end of the chain means db was not in the
 		 * table; check for that before stepping through dbf.
 		 */
 		ASSERT(dbf != NULL);
 		dbp = &dbf->db_hash_next;
 	}
 	/* Unlink db from the chain. */
 	*dbp = db->db_hash_next;
 	db->db_hash_next = NULL;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
-	atomic_add_64(&dbuf_hash_count, -1);
+	atomic_dec_64(&dbuf_hash_count);
 }
 
 static arc_evict_func_t dbuf_do_evict;
 
 /*
  * Run the user eviction callback of a level-0 dbuf, if it has one, and
  * then clear the user pointer, user data pointer and callback.  The caller
  * must hold db_mtx.
  */
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (db->db_level == 0 && db->db_evict_func != NULL) {
 		/* Hand the data pointer to the user before calling out. */
 		if (db->db_user_data_ptr_ptr)
 			*db->db_user_data_ptr_ptr = db->db.db_data;
 
 		db->db_evict_func(&db->db, db->db_user_ptr);
 
 		db->db_user_ptr = NULL;
 		db->db_user_data_ptr_ptr = NULL;
 		db->db_evict_func = NULL;
 	}
 }
 
 /*
  * Returns B_TRUE for any dbuf above level 0.  For a level-0 dbuf, returns
  * whether DMU_OT_IS_METADATA() holds for the type of its dnode.
  */
 boolean_t
 dbuf_is_metadata(dmu_buf_impl_t *db)
 {
 	boolean_t is_metadata;
 
 	if (db->db_level > 0)
 		return (B_TRUE);
 
 	DB_DNODE_ENTER(db);
 	is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
 	DB_DNODE_EXIT(db);
 
 	return (is_metadata);
 }
 
 /*
  * Tear down a dbuf that has neither an ARC buffer nor pending dirty data
  * (both asserted): dbuf_clear() followed by dbuf_destroy().  The caller
  * must hold db_mtx on entry.
  *
  * NOTE(review): callers in this file do not touch db or db_mtx after this
  * returns, so db_mtx is presumably released along the way -- confirm
  * against dbuf_clear().
  */
 void
 dbuf_evict(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db_data_pending == NULL);
 
 	dbuf_clear(db);
 	dbuf_destroy(db);
 }
 
 void
 dbuf_init(void)
 {
 	uint64_t hsize = 1ULL << 16;
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	int i;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average 4K block size.  The table will take up
 	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
 	 */
 	while (hsize * 4096 < (uint64_t)physmem * PAGESIZE)
 		hsize <<= 1;
 
 retry:
 	h->hash_table_mask = hsize - 1;
 	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
 	if (h->hash_table == NULL) {
 		/* XXX - we should really return an error instead of assert */
 		ASSERT(hsize > (1ULL << 10));
 		hsize >>= 1;
 		goto retry;
 	}
 
 	dbuf_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 }
 
 /*
  * Undo dbuf_init(): destroy the hash bucket mutexes, free the hash table
  * (its size is recovered from the mask), and destroy the dbuf kmem cache.
  */
 void
 dbuf_fini(void)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	int m;
 
 	for (m = 0; m < DBUF_MUTEXES; m++) {
 		mutex_destroy(&h->hash_mutexes[m]);
 	}
 
 	/* hash_table_mask is (number of buckets - 1). */
 	kmem_free(h->hash_table,
 	    (h->hash_table_mask + 1) * sizeof (void *));
 
 	kmem_cache_destroy(dbuf_cache);
 }
 
 /*
  * Other stuff.
  */
 
 #ifdef ZFS_DEBUG
 /*
  * Debug-only consistency checks on a dbuf.  Compiled only under ZFS_DEBUG
  * and a no-op unless ZFS_DEBUG_DBUF_VERIFY is set in zfs_flags.  The caller
  * must hold db_mtx.  Every check is an assertion; nothing is returned.
  *
  * The checks cover, in order: agreement between the dbuf and its dnode,
  * size/offset rules for bonus, spill and regular blocks, ownership of the
  * dirty records, the location db_blkptr points at, and that a clean dbuf
  * with no (or a hole) block pointer contains only zeroes.
  */
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dbuf_dirty_record_t *dr;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
 		return;
 
 	ASSERT(db->db_objset != NULL);
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn == NULL) {
 		ASSERT(db->db_parent == NULL);
 		ASSERT(db->db_blkptr == NULL);
 	} else {
 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 		    db->db_blkid == DMU_SPILL_BLKID ||
 		    !avl_is_empty(&dn->dn_dbufs));
 	}
 	/* Size and offset rules differ for bonus, spill and regular blocks. */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
 	} else if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT0(db->db.db_offset);
 	} else {
 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
 	}
 
 	/* Every dirty record reachable from this dbuf must point back at it. */
 	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
 		ASSERT(dr->dr_dbuf == db);
 
 	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
 		ASSERT(dr->dr_dbuf == db);
 
 	/*
 	 * We can't assert that db_size matches dn_datablksz because it
 	 * can be momentarily different when another thread is doing
 	 * dnode_set_blksz().
 	 */
 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
 		dr = db->db_data_pending;
 		/*
 		 * It should only be modified in syncing context, so
 		 * make sure we only have one copy of the data.
 		 */
 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
 	}
 
 	/* verify db->db_blkptr */
 	if (db->db_blkptr) {
 		if (db->db_parent == dn->dn_dbuf) {
 			/* db is pointed to by the dnode */
 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
 			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
 				ASSERT(db->db_parent == NULL);
 			else
 				ASSERT(db->db_parent != NULL);
 			if (db->db_blkid != DMU_SPILL_BLKID)
 				ASSERT3P(db->db_blkptr, ==,
 				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		} else {
 			/* db is pointed to by an indirect block */
 			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
 			ASSERT3U(db->db_parent->db.db_object, ==,
 			    db->db.db_object);
 			/*
 			 * dnode_grow_indblksz() can make this fail if we don't
 			 * have the struct_rwlock.  XXX indblksz no longer
 			 * grows.  safe to do this now?
 			 */
 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 				ASSERT3P(db->db_blkptr, ==,
 				    ((blkptr_t *)db->db_parent->db.db_data +
 				    db->db_blkid % epb));
 			}
 		}
 	}
 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
 	    (db->db_buf == NULL || db->db_buf->b_data) &&
 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_state != DB_FILL && !dn->dn_free_txg) {
 		/*
 		 * If the blkptr isn't set but they have nonzero data,
 		 * it had better be dirty, otherwise we'll lose that
 		 * data when we evict this buffer.
 		 */
 		if (db->db_dirtycnt == 0) {
 			uint64_t *buf = db->db.db_data;
 			int i;
 
 			/* The buffer is scanned as 64-bit words. */
 			for (i = 0; i < db->db.db_size >> 3; i++) {
 				ASSERT(buf[i] == 0);
 			}
 		}
 	}
 	DB_DNODE_EXIT(db);
 }
 #endif
 
 /*
  * Publish the dbuf's current data pointer through the user's registered
  * data-pointer slot.  Only level-0 dbufs with such a slot are affected,
  * and they must be held.  The caller must hold db_mtx.
  */
 static void
 dbuf_update_data(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/* Nothing to publish for indirect blocks or slot-less dbufs. */
 	if (db->db_level != 0 || db->db_user_data_ptr_ptr == NULL)
 		return;
 
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	*db->db_user_data_ptr_ptr = db->db.db_data;
 }
 
 /*
  * Attach an ARC buffer to a dbuf, or detach the current one when buf is
  * NULL.  The caller must hold db_mtx.
  *
  * Attaching points db_data at the buffer's data, registers dbuf_do_evict
  * as the buffer's callback unless the buffer has been released from the
  * ARC, and refreshes the user's data pointer.  Detaching runs the user
  * eviction callback, clears db_data and moves the dbuf to DB_UNCACHED
  * unless it is in DB_NOFILL.
  */
 static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	db->db_buf = buf;
 
 	if (buf == NULL) {
 		dbuf_evict_user(db);
 		db->db.db_data = NULL;
 		if (db->db_state != DB_NOFILL)
 			db->db_state = DB_UNCACHED;
 		return;
 	}
 
 	ASSERT(buf->b_data != NULL);
 	db->db.db_data = buf->b_data;
 	if (!arc_released(buf))
 		arc_set_callback(buf, dbuf_do_evict, db);
 	dbuf_update_data(db);
 }
 
 /*
  * Loan out an arc_buf for read.  Return the loaned arc_buf.
  *
  * If the dbuf's own buffer is still in the ARC and the dbuf has at most
  * one hold, that buffer itself is loaned and detached from the dbuf.
  * Otherwise a fresh loan buffer of the same size is obtained and the
  * dbuf's data is copied into it.
  */
 arc_buf_t *
 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 {
 	arc_buf_t *abuf;
 	spa_t *spa;
 	int blksz;
 
 	mutex_enter(&db->db_mtx);
 	if (!arc_released(db->db_buf) && refcount_count(&db->db_holds) <= 1) {
 		/* Sole user: hand over the dbuf's own buffer. */
 		abuf = db->db_buf;
 		arc_loan_inuse_buf(abuf, db);
 		dbuf_set_data(db, NULL);
 		mutex_exit(&db->db_mtx);
 		return (abuf);
 	}
 
 	/* Size and pool are sampled under db_mtx; the copy is done after. */
 	blksz = db->db.db_size;
 	spa = db->db_objset->os_spa;
 	mutex_exit(&db->db_mtx);
 
 	abuf = arc_loan_buf(spa, blksz);
 	bcopy(db->db.db_data, abuf->b_data, blksz);
 	return (abuf);
 }
 
 /*
  * Map a byte offset within a dnode to a level-0 block id.  When the dnode
  * has a nonzero data block shift the id is the offset shifted down by it;
  * otherwise the only block is block 0 and the offset must lie inside it
  * (asserted).
  */
 uint64_t
 dbuf_whichblock(dnode_t *dn, uint64_t offset)
 {
 	if (dn->dn_datablkshift == 0) {
 		ASSERT3U(offset, <, dn->dn_datablksz);
 		return (0);
 	}
 
 	return (offset >> dn->dn_datablkshift);
 }
 
 /*
  * Completion callback passed to arc_read() by dbuf_read_impl().  vdb is
  * the dbuf being read and buf the ARC buffer produced by the read.
  *
  * On success the buffer is attached and the dbuf becomes DB_CACHED.  If
  * the block was freed while the read was in flight the buffer is zeroed
  * first and any I/O error is ignored.  On failure the buffer reference is
  * dropped and the dbuf returns to DB_UNCACHED.  In every case waiters on
  * db_changed are woken and one hold is released together with db_mtx.
  */
 static void
 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	boolean_t freed;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT3U(db->db_state, ==, DB_READ);
 	/*
 	 * All reads are synchronous, so we must have a hold on the dbuf
 	 */
 	ASSERT(refcount_count(&db->db_holds) > 0);
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 
 	freed = (db->db_level == 0 && db->db_freed_in_flight);
 
 	if (!freed && zio != NULL && zio->io_error != 0) {
 		/* The read failed and the data is still wanted. */
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
 		VERIFY(arc_buf_remove_ref(buf, db));
 		db->db_state = DB_UNCACHED;
 	} else {
 		if (freed) {
 			/* we were freed in flight; disregard any error */
 			arc_release(buf, db);
 			bzero(buf->b_data, db->db.db_size);
 			arc_buf_freeze(buf);
 			db->db_freed_in_flight = FALSE;
 		}
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	}
 
 	cv_broadcast(&db->db_changed);
 	dbuf_rele_and_unlock(db, NULL);
 }
 
 /*
  * Start, or synthesize, the read of an UNCACHED dbuf.
  *
  * Entered with db_mtx held; db_mtx is dropped on every return path.  The
  * caller must hold the dbuf and the dnode's struct_rwlock (both asserted).
  *
  *	db	dbuf to fill; must be DB_UNCACHED with no buffer attached.
  *	zio	parent zio handed to arc_read().
  *	flags	in: DB_RF_CANFAIL selects ZIO_FLAG_CANFAIL over
  *		ZIO_FLAG_MUSTSUCCEED; out: DB_RF_CACHED is OR-ed in when no
  *		disk read was needed.
  *
  * Three cases are handled:
  *  - bonus buffer: copied from the dnode's on-disk bonus area;
  *  - no block pointer, a hole, or a freed level-0 block: a zero-filled
  *    buffer is attached;
  *  - anything else: the dbuf goes to DB_READ and arc_read() is issued with
  *    dbuf_read_done() as the completion callback.
  */
 static void
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
 {
 	dnode_t *dn;
 	zbookmark_phys_t zb;
 	uint32_t aflags = ARC_NOWAIT;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_state == DB_UNCACHED);
 	ASSERT(db->db_buf == NULL);
 
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 
 		ASSERT3U(bonuslen, <=, db->db.db_size);
 		/*
 		 * The bonus buffer is always allocated at DN_MAX_BONUSLEN;
 		 * whatever the copy below does not cover is zeroed.
 		 */
 		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		if (bonuslen < DN_MAX_BONUSLEN)
 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
 		if (bonuslen)
 			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
 		DB_DNODE_EXIT(db);
 		dbuf_update_data(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	/*
 	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
 	 * processes the delete record and clears the bp while we are waiting
 	 * for the dn_mtx (resulting in a "no" from block_freed).
 	 */
 	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
 	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
 	    BP_IS_HOLE(db->db_blkptr)))) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 		DB_DNODE_EXIT(db);
 		/* Nothing on disk to read: attach a zero-filled buffer. */
 		dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa,
 		    db->db.db_size, db, type));
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_state = DB_CACHED;
 		*flags |= DB_RF_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	DB_DNODE_EXIT(db);
 
 	db->db_state = DB_READ;
 	mutex_exit(&db->db_mtx);
 
 	if (DBUF_IS_L2CACHEABLE(db))
 		aflags |= ARC_L2CACHE;
 	if (DBUF_IS_L2COMPRESSIBLE(db))
 		aflags |= ARC_L2COMPRESS;
 
 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	/*
 	 * Extra hold for the in-flight read; presumably the one released by
 	 * dbuf_read_done() through dbuf_rele_and_unlock(db, NULL).
 	 */
 	dbuf_add_ref(db, NULL);
 
 	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 	    (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 	    &aflags, &zb);
 	/* Tell the caller when the ARC satisfied the read from cache. */
 	if (aflags & ARC_CACHED)
 		*flags |= DB_RF_CACHED;
 }
 
 /*
  * Make sure the contents of a held dbuf are (being) read in.
  *
  *	db	held dbuf to read.
  *	zio	optional parent zio.  When NULL and a read has to be issued,
  *		a root zio is created here and waited on before returning.
  *		When non-NULL the read is only attached to it and the caller
  *		is left to wait.
  *	flags	DB_RF_* bits examined here: DB_RF_HAVESTRUCT (caller already
  *		holds dn_struct_rwlock), DB_RF_NOPREFETCH, DB_RF_NEVERWAIT;
  *		the flags are also passed on to dbuf_read_impl().
  *
  * Returns 0 on success, EIO if the dbuf is DB_NOFILL or if a read/fill
  * that was waited for left the dbuf DB_UNCACHED, or the zio_wait() result
  * when the read was issued and waited for here.
  */
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	int err = 0;
 	boolean_t havepzio = (zio != NULL);
 	boolean_t prefetch;
 	dnode_t *dn;
 
 	/*
 	 * We don't have to hold the mutex to check db_state because it
 	 * can't be freed while we have a hold on the buffer.
 	 */
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	if (db->db_state == DB_NOFILL)
 		return (SET_ERROR(EIO));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	/* Prefetch only cacheable level-0 data blocks, unless told not to. */
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
 	    DBUF_IS_CACHEABLE(db);
 
 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_CACHED) {
 		/* Already cached: only the prefetcher needs to hear of it. */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 			    db->db.db_size, TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 	} else if (db->db_state == DB_UNCACHED) {
 		spa_t *spa = dn->dn_objset->os_spa;
 
 		if (zio == NULL)
 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 		dbuf_read_impl(db, zio, &flags);
 
 		/* dbuf_read_impl has dropped db_mtx for us */
 
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 			    db->db.db_size, flags & DB_RF_CACHED);
 
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 
 		/* Wait only for a root zio that was created above. */
 		if (!havepzio)
 			err = zio_wait(zio);
 	} else {
 		/*
 		 * Another reader came in while the dbuf was in flight
 		 * between UNCACHED and CACHED.  Either a writer will finish
 		 * writing the buffer (sending the dbuf to CACHED) or the
 		 * first reader's request will reach the read_done callback
 		 * and send the dbuf to CACHED.  Otherwise, a failure
 		 * occurred and the dbuf went to UNCACHED.
 		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
 			    db->db.db_size, TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 
 		/* Skip the wait per the caller's request. */
 		mutex_enter(&db->db_mtx);
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL) {
 				ASSERT(db->db_state == DB_READ ||
 				    (flags & DB_RF_HAVESTRUCT) == 0);
 				cv_wait(&db->db_changed, &db->db_mtx);
 			}
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 		}
 		mutex_exit(&db->db_mtx);
 	}
 
 	ASSERT(err || havepzio || db->db_state == DB_CACHED);
 	return (err);
 }
 
 /*
  * Prepare a held, non-bonus dbuf for being overwritten without reading
  * its old contents.  Any read or fill already in progress is waited out
  * first.  After that:
  *  - a DB_UNCACHED dbuf gets a fresh buffer and moves to DB_FILL;
  *  - a DB_NOFILL dbuf has its buffer detached;
  *  - any other dbuf must already be DB_CACHED and is left alone.
  */
 static void
 dbuf_noread(dmu_buf_impl_t *db)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	mutex_enter(&db->db_mtx);
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 
 	switch (db->db_state) {
 	case DB_UNCACHED: {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 
 		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
 		dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type));
 		db->db_state = DB_FILL;
 		break;
 	}
 	case DB_NOFILL:
 		dbuf_set_data(db, NULL);
 		break;
 	default:
 		ASSERT3U(db->db_state, ==, DB_CACHED);
 		break;
 	}
 	mutex_exit(&db->db_mtx);
 }
 
 /*
  * Just-in-time copy of a buffer that was modified in an earlier
  * transaction group, performed before the buffer is modified in the
  * currently active group.
  *
  * There are two users: dirtying a buffer for the first time in a txg,
  * and freeing a range of a dnode that includes this buffer.
  *
  * Note that when we are called from dbuf_free_range() we do not put a
  * hold on the buffer, we just traverse the active dbuf list for the
  * dnode.
  *
  * The caller must hold db_mtx; db must be a level-0 dbuf with data that
  * does not belong to the meta-dnode object.
  */
 static void
 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 {
 	dbuf_dirty_record_t *dr = db->db_last_dirty;
 	boolean_t is_bonus;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
 	ASSERT(db->db_level == 0);
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
 
 	/* Not dirty at all: nothing to protect. */
 	if (dr == NULL)
 		return;
 
 	/*
 	 * The newest dirty record already has its own copy of the data
 	 * (bonus buffers are compared by raw data pointer, all others by
 	 * arc buf): nothing to do.
 	 */
 	is_bonus = (db->db_blkid == DMU_BONUS_BLKID);
 	if (dr->dt.dl.dr_data != (is_bonus ? db->db.db_data : db->db_buf))
 		return;
 
 	/*
 	 * The last dirty record for this dbuf has not yet synced and it
 	 * is referencing the dbuf's data, so either:
 	 *	point the record at a new copy of the data,
 	 * or (if there are no active holders)
 	 *	just null out the current db_data pointer.
 	 */
 	ASSERT(dr->dr_txg >= txg - 2);
 
 	if (is_bonus) {
 		/* Note that the data bufs here are zio_bufs */
 		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
 		return;
 	}
 
 	if (refcount_count(&db->db_holds) <= db->db_dirtycnt) {
 		/* Only dirty records hold the dbuf: just let go of the data. */
 		dbuf_set_data(db, NULL);
 		return;
 	}
 
 	/* Somebody else holds the dbuf: give the record a private copy. */
 	{
 		int size = db->db.db_size;
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 
 		dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 	}
 }
 
 /*
  * Undo an override recorded in a level-0 dirty record: free the block the
  * record was overridden with (unless it is a hole or was a nopwrite),
  * reset the override state, and release the record's buffer so that it is
  * back in an ordinary dirty state.  Bonus buffers and records that are
  * not overridden are left untouched.  The caller must hold db_mtx.
  */
 void
 dbuf_unoverride(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 	uint64_t txg = dr->dr_txg;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
 	ASSERT(db->db_level == 0);
 
 	if (db->db_blkid == DMU_BONUS_BLKID)
 		return;
 	if (dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 		return;
 
 	ASSERT(db->db_data_pending != dr);
 
 	/* free this block */
 	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) {
 		zio_free(db->db_objset->os_spa, txg, bp);
 	}
 
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	dr->dt.dl.dr_nopwrite = B_FALSE;
 
 	/*
 	 * Release the already-written buffer so that it is left in a
 	 * consistent dirty state.  All callers are about to modify the
 	 * buffer and will immediately do another (redundant)
 	 * arc_release(), so the buf is left thawed to save the effort of
 	 * freezing and immediately re-thawing it.
 	 */
 	arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
  * Evict (if it's unreferenced) or clear (if it's referenced) any level-0
  * data blocks in the free range, so that any future readers will find
  * empty blocks.
  *
  * This is a no-op if the dataset is in the middle of an incremental
  * receive; see comment below for details.
  *
  *	dn		dnode whose cached dbufs are examined.
  *	start_blkid	first level-0 block id of the range.
  *	end_blkid	last block id of the range (inclusive); clamped to
  *			dn_maxblkid unless it is DMU_SPILL_BLKID.
  *	tx		transaction whose txg the free belongs to.
  *
  * dn_dbufs_mtx is taken for the whole walk; each dbuf's db_mtx is taken
  * while that dbuf is examined.
  */
 void
 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db, *db_next, db_search;
 	uint64_t txg = tx->tx_txg;
 	avl_index_t where;
 
 	if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
 		end_blkid = dn->dn_maxblkid;
 	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
 
 	/*
 	 * db_search is a stack-allocated search key for the AVL lookups
 	 * below; only the fields set here are initialized.
 	 */
 	db_search.db_level = 0;
 	db_search.db_blkid = start_blkid;
 	db_search.db_state = DB_SEARCH;
 
 	mutex_enter(&dn->dn_dbufs_mtx);
 	if (start_blkid >= dn->dn_unlisted_l0_blkid) {
 		/* There can't be any dbufs in this range; no need to search. */
 #ifdef DEBUG
 		db = avl_find(&dn->dn_dbufs, &db_search, &where);
 		ASSERT3P(db, ==, NULL);
 		db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 		ASSERT(db == NULL || db->db_level > 0);
 #endif
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return;
 	} else if (dmu_objset_is_receiving(dn->dn_objset)) {
 		/*
 		 * If we are receiving, we expect there to be no dbufs in
 		 * the range to be freed, because receive modifies each
 		 * block at most once, and in offset order.  If this is
 		 * not the case, it can lead to performance problems,
 		 * so note that we unexpectedly took the slow path.
 		 */
 		atomic_inc_64(&zfs_free_range_recv_miss);
 	}
 
 	/*
 	 * The search key is expected never to match a real dbuf (asserted);
 	 * the lookup only yields the insertion point, from which the first
 	 * dbuf at or after start_blkid is taken.
 	 */
 	db = avl_find(&dn->dn_dbufs, &db_search, &where);
 	ASSERT3P(db, ==, NULL);
 	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 
 	for (; db != NULL; db = db_next) {
 		/* Fetch the successor first: db may be destroyed below. */
 		db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 		if (db->db_level != 0 || db->db_blkid > end_blkid) {
 			break;
 		}
 		ASSERT3U(db->db_blkid, >=, start_blkid);
 
 		/* found a level 0 buffer in the range */
 		mutex_enter(&db->db_mtx);
 		if (dbuf_undirty(db, tx)) {
 			/* mutex has been dropped and dbuf destroyed */
 			continue;
 		}
 
 		if (db->db_state == DB_UNCACHED ||
 		    db->db_state == DB_NOFILL ||
 		    db->db_state == DB_EVICTING) {
 			ASSERT(db->db.db_data == NULL);
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
 			/* will be handled in dbuf_read_done or dbuf_rele */
 			db->db_freed_in_flight = TRUE;
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (refcount_count(&db->db_holds) == 0) {
 			ASSERT(db->db_buf);
 			/*
 			 * NOTE(review): db_mtx is not released here, so
 			 * dbuf_clear() presumably drops it -- confirm.
 			 */
 			dbuf_clear(db);
 			continue;
 		}
 		/* The dbuf is referenced */
 
 		if (db->db_last_dirty != NULL) {
 			dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 			if (dr->dr_txg == txg) {
 				/*
 				 * This buffer is "in-use", re-adjust the file
 				 * size to reflect that this buffer may
 				 * contain new data when we sync.
 				 */
 				if (db->db_blkid != DMU_SPILL_BLKID &&
 				    db->db_blkid > dn->dn_maxblkid)
 					dn->dn_maxblkid = db->db_blkid;
 				dbuf_unoverride(dr);
 			} else {
 				/*
 				 * This dbuf is not dirty in the open context.
 				 * Either uncache it (if it's not referenced in
 				 * the open context) or reset its contents to
 				 * empty.
 				 */
 				dbuf_fix_old_data(db, txg);
 			}
 		}
 		/* clear the contents if it's cached */
 		if (db->db_state == DB_CACHED) {
 			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
 			bzero(db->db.db_data, db->db.db_size);
 			arc_buf_freeze(db->db_buf);
 		}
 
 		mutex_exit(&db->db_mtx);
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
 }
 
 /*
  * Decide whether dirtying this dbuf will free a block on disk.  Returns
  * nonzero when the dbuf has a filled block whose birth txg the dataset
  * reports as freeable (or when there is no dataset at all), and B_FALSE
  * when no birth txg can be determined.  The caller must hold db_mtx.
  */
 static int
 dbuf_block_freeable(dmu_buf_impl_t *db)
 {
 	dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
 	uint64_t birth_txg = 0;
 
 	/*
 	 * No locking is needed to protect db_blkptr: if the dbuf is
 	 * syncing then db_last_dirty will be set, and db_blkptr is
 	 * ignored in that case.
 	 *
 	 * The tests below make sure that only block births for filled
 	 * blocks are considered.
 	 */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	if (db->db_last_dirty != NULL && (db->db_blkptr == NULL ||
 	    !BP_IS_HOLE(db->db_blkptr))) {
 		birth_txg = db->db_last_dirty->dr_txg;
 	} else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
 		birth_txg = db->db_blkptr->blk_birth;
 	}
 
 	/* A block that does not exist cannot be freed. */
 	if (birth_txg == 0)
 		return (B_FALSE);
 
 	/*
 	 * A block that is in a snapshot can't be freed either; that is
 	 * what dsl_dataset_block_freeable() decides.  The bp is not passed
 	 * to it since we are holding the db_mtx lock and might deadlock if
 	 * we are prefetching a dedup-ed block.
 	 */
 	return (ds == NULL ||
 	    dsl_dataset_block_freeable(ds, NULL, birth_txg));
 }
 
 /*
  * Change the size of a non-bonus dbuf to `size' bytes within transaction
  * tx.  The dbuf is dirtied, a new buffer of the requested size is
  * allocated, the old contents are copied over (truncated, or zero-padded
  * when growing), and the new buffer replaces the old one both in the dbuf
  * and, for level-0 dbufs, in the current dirty record.  The change in
  * size is charged to the transaction.  The caller must hold the dnode's
  * struct_rwlock as writer.
  */
 void
 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 {
 	arc_buf_t *nbuf, *obuf;
 	int osize = db->db.db_size;
 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 	dnode_t *dn;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	/* XXX does *this* func really need the lock? */
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	/*
 	 * Calling dmu_buf_will_dirty() with the dn_struct_rwlock held is
 	 * OK here: there can be no other references to the db while its
 	 * size is being changed, so no concurrent DB_FILL can be
 	 * happening.
 	 *
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
 	 */
 	dmu_buf_will_dirty(&db->db, tx);
 
 	/* Build the replacement buffer: old data first, zeroes after. */
 	nbuf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type);
 	obuf = db->db_buf;
 	bcopy(obuf->b_data, nbuf->b_data, MIN(osize, size));
 	if (size > osize) {
 		bzero((uint8_t *)nbuf->b_data + osize, size - osize);
 	}
 
 	/* Swap the buffers and record the new size under db_mtx. */
 	mutex_enter(&db->db_mtx);
 	dbuf_set_data(db, nbuf);
 	VERIFY(arc_buf_remove_ref(obuf, db));
 	db->db.db_size = size;
 
 	if (db->db_level == 0) {
 		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
 		db->db_last_dirty->dt.dl.dr_data = nbuf;
 	}
 	mutex_exit(&db->db_mtx);
 
 	dnode_willuse_space(dn, size - osize, tx);
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * Release db's ARC buffer with arc_release(); the return value is
  * deliberately ignored.  The assertions state the expected context: the
  * caller is in the pool's sync context, the objset's phys buffer has
  * already been released (or its dataset is on a synced list), and the
  * parent dbuf's buffer, if there is a parent, has been released too.
  */
 void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
 	objset_t *os = db->db_objset;
 
 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 	ASSERT(arc_released(os->os_phys_buf) ||
 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
 
 	(void) arc_release(db->db_buf, db);
 }
 
 /*
  * Mark a held dbuf dirty in the transaction group of tx and return its
  * dirty record for that txg.
  *
  * If the dbuf is already dirty in this txg the existing record is
  * returned (after undoing any override and thawing the buffer for level-0
  * non-bonus dbufs).  Otherwise a new record is allocated and linked into
  * db_last_dirty, which is kept ordered by descending txg; the dbuf gains a
  * hold tagged with the txg; and the record is attached either to the
  * dnode's per-txg dirty list (bonus, spill and top-level blocks) or to
  * the children list of the parent indirect block's dirty record, which is
  * obtained by dirtying the parent recursively.
  *
  * db_mtx is taken and dropped internally.  The dnode's struct_rwlock is
  * taken as reader here unless the caller already holds it as writer.
  */
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	objset_t *os;
 	dbuf_dirty_record_t **drp, *dr;
 	int drop_struct_lock = FALSE;
 	boolean_t do_free_accounting = B_FALSE;
 	int txgoff = tx->tx_txg & TXG_MASK;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	DMU_TX_DIRTY_BUF(tx, db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/*
 	 * Shouldn't dirty a regular buffer in syncing context.  Private
 	 * objects may be dirtied in syncing context, but only if they
 	 * were already pre-dirtied in open context.
 	 */
 	ASSERT(!dmu_tx_is_syncing(tx) ||
 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    dn->dn_objset->os_dsl_dataset == NULL);
 	/*
 	 * We make this assert for private objects as well, but after we
 	 * check if we're already dirty.  They are allowed to re-dirty
 	 * in syncing context.
 	 */
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * XXX make this true for indirects too?  The problem is that
 	 * transactions created with dmu_tx_create_assigned() from
 	 * syncing context don't bother holding ahead.
 	 */
 	ASSERT(db->db_level != 0 ||
 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
 	    db->db_state == DB_NOFILL);
 
 	mutex_enter(&dn->dn_mtx);
 	/*
 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
 	 * initialize the objset.
 	 */
 	if (dn->dn_dirtyctx == DN_UNDIRTIED &&
 	    !BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
 		dn->dn_dirtyctx =
 		    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN);
 		ASSERT(dn->dn_dirtyctx_firstset == NULL);
 		dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		dn->dn_have_spill = B_TRUE;
 
 	/*
 	 * If this buffer is already dirty, we're done.
 	 */
 	drp = &db->db_last_dirty;
 	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
 	    db->db.db_object == DMU_META_DNODE_OBJECT);
 	/* Skip records for newer txgs; drp ends up at the insertion point. */
 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
 		drp = &dr->dr_next;
 	if (dr && dr->dr_txg == tx->tx_txg) {
 		DB_DNODE_EXIT(db);
 
 		if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
 			/*
 			 * If this buffer has already been written out,
 			 * we now need to reset its state.
 			 */
 			dbuf_unoverride(dr);
 			if (db->db.db_object != DMU_META_DNODE_OBJECT &&
 			    db->db_state != DB_NOFILL)
 				arc_buf_thaw(db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
 		return (dr);
 	}
 
 	/*
 	 * Only valid if not already dirty.
 	 */
 	ASSERT(dn->dn_object == 0 ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
 	    dn->dn_phys->dn_nlevels > db->db_level ||
 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
 
 	/*
 	 * We should only be dirtying in syncing context if it's the
 	 * mos or we're initializing the os or it's a special object.
 	 * However, we are allowed to dirty in syncing context provided
 	 * we already dirtied it in open context.  Hence we must make
 	 * this assertion only if we're not already dirty.
 	 */
 	os = dn->dn_objset;
 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
 	ASSERT(db->db.db_size != 0);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		/*
 		 * Update the accounting.
 		 * Note: we delay "free accounting" until after we drop
 		 * the db_mtx.  This keeps us from grabbing other locks
 		 * (and possibly deadlocking) in bp_get_dsize() while
 		 * also holding the db_mtx.
 		 */
 		dnode_willuse_space(dn, db->db.db_size, tx);
 		do_free_accounting = dbuf_block_freeable(db);
 	}
 
 	/*
 	 * If this buffer is dirty in an old transaction group we need
 	 * to make a copy of it so that the changes we make in this
 	 * transaction group won't leak out when we sync the older txg.
 	 */
 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
 	if (db->db_level == 0) {
 		void *data_old = db->db_buf;
 
 		if (db->db_state != DB_NOFILL) {
 			if (db->db_blkid == DMU_BONUS_BLKID) {
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db.db_data;
 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
 				/*
 				 * Release the data buffer from the cache so
 				 * that we can modify it without impacting
 				 * possible other users of this cached data
 				 * block.  Note that indirect blocks and
 				 * private objects are not released until the
 				 * syncing state (since they are only modified
 				 * then).
 				 */
 				arc_release(db->db_buf, db);
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db_buf;
 			}
 			ASSERT(data_old != NULL);
 		}
 		dr->dt.dl.dr_data = data_old;
 	} else {
 		/* Indirect blocks track the dirty records of their children. */
 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
 		list_create(&dr->dt.di.dr_children,
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
 		dr->dr_accounted = db->db.db_size;
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	/* Link the record in at the position found above. */
 	dr->dr_next = *drp;
 	*drp = dr;
 
 	/*
 	 * We could have been freed_in_flight between the dbuf_noread
 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
 	 * happened after the free.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_blkid != DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_free_ranges[txgoff] != NULL) {
 			range_tree_clear(dn->dn_free_ranges[txgoff],
 			    db->db_blkid, 1);
 		}
 		mutex_exit(&dn->dn_mtx);
 		db->db_freed_in_flight = FALSE;
 	}
 
 	/*
 	 * This buffer is now part of this txg
 	 */
 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
 	db->db_dirtycnt += 1;
 	ASSERT3U(db->db_dirtycnt, <=, 3);
 
 	mutex_exit(&db->db_mtx);
 
 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    db->db_blkid == DMU_SPILL_BLKID) {
 		/* Bonus and spill records go straight onto the dnode's list. */
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		dnode_setdirty(dn, tx);
 		DB_DNODE_EXIT(db);
 		return (dr);
 	} else if (do_free_accounting) {
 		blkptr_t *bp = db->db_blkptr;
 		int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
 		    bp_get_dsize(os->os_spa, bp) : db->db.db_size;
 		/*
 		 * This is only a guess -- if the dbuf is dirty
 		 * in a previous txg, we don't know how much
 		 * space it will use on disk yet.  We should
 		 * really have the struct_rwlock to access
 		 * db_blkptr, but since this is just a guess,
 		 * it's OK if we get an odd answer.
 		 */
 		ddt_prefetch(os->os_spa, bp);
 		dnode_willuse_space(dn, -willfree, tx);
 	}
 
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		drop_struct_lock = TRUE;
 	}
 
 	if (db->db_level == 0) {
 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
 	}
 
 	if (db->db_level+1 < dn->dn_nlevels) {
 		dmu_buf_impl_t *parent = db->db_parent;
 		dbuf_dirty_record_t *di;
 		int parent_held = FALSE;
 
 		/*
 		 * When the parent is not known, or is the dnode's own dbuf,
 		 * look up and hold the indirect block one level up.
 		 */
 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 			parent = dbuf_hold_level(dn, db->db_level+1,
 			    db->db_blkid >> epbs, FTAG);
 			ASSERT(parent != NULL);
 			parent_held = TRUE;
 		}
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 		ASSERT3U(db->db_level+1, ==, parent->db_level);
 		di = dbuf_dirty(parent, tx);
 		if (parent_held)
 			dbuf_rele(parent, FTAG);
 
 		mutex_enter(&db->db_mtx);
 		/*
 		 * Since we've dropped the mutex, it's possible that
 		 * dbuf_undirty() might have changed this out from under us.
 		 */
 		if (db->db_last_dirty == dr ||
 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
 			mutex_enter(&di->dt.di.dr_mtx);
 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
 			ASSERT(!list_link_active(&dr->dr_dirty_node));
 			list_insert_tail(&di->dt.di.dr_children, dr);
 			mutex_exit(&di->dt.di.dr_mtx);
 			dr->dr_parent = di;
 		}
 		mutex_exit(&db->db_mtx);
 	} else {
 		/* Top-level block: the record hangs directly off the dnode. */
 		ASSERT(db->db_level+1 == dn->dn_nlevels);
 		ASSERT(db->db_blkid < dn->dn_nblkptr);
 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 	}
 
 	dnode_setdirty(dn, tx);
 	DB_DNODE_EXIT(db);
 	return (dr);
 }
 
 /*
  * Undirty a buffer in the transaction group referenced by the given
  * transaction.  Return whether this evicted the dbuf.
  *
  * Only level-0, non-bonus dbufs are accepted (asserted) and the caller
  * must hold db_mtx.  B_FALSE is returned when the dbuf is not dirty in
  * this txg, or when it was undirtied but other holds remain.  B_TRUE is
  * returned when the hold tagged with this txg was the last one; in that
  * case the dbuf has been passed to dbuf_evict() and must not be used by
  * the caller any more.
  */
 static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	uint64_t txg = tx->tx_txg;
 	dbuf_dirty_record_t *dr, **drp;
 
 	ASSERT(txg != 0);
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/*
 	 * If this buffer is not dirty, we're done.
 	 */
 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
 		if (dr->dr_txg <= txg)
 			break;
 	if (dr == NULL || dr->dr_txg < txg)
 		return (B_FALSE);
 	ASSERT(dr->dr_txg == txg);
 	ASSERT(dr->dr_dbuf == db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	ASSERT(db->db.db_size != 0);
 
 	/*
 	 * Any space we accounted for in dp_dirty_* will be cleaned up by
 	 * dsl_pool_sync().  This is relatively rare so the discrepancy
 	 * is not a big deal.
 	 */
 
 	/* Unlink the record from the dbuf's own dirty list. */
 	*drp = dr->dr_next;
 
 	/*
 	 * Note that there are three places in dbuf_dirty()
 	 * where this dirty record may be put on a list.
 	 * Make sure to do a list_remove corresponding to
 	 * every one of those list_insert calls.
 	 */
 	if (dr->dr_parent) {
 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
 	    db->db_level+1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 		mutex_exit(&dn->dn_mtx);
 	}
 	DB_DNODE_EXIT(db);
 
 	if (db->db_state != DB_NOFILL) {
 		dbuf_unoverride(dr);
 
 		ASSERT(db->db_buf != NULL);
 		ASSERT(dr->dt.dl.dr_data != NULL);
 		/* Drop the record's private copy of the data, if it has one. */
 		if (dr->dt.dl.dr_data != db->db_buf)
 			VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db));
 	}
 
 	/*
 	 * NOTE(review): db_level was asserted to be 0 on entry, so this
 	 * branch looks unreachable for valid callers.
 	 */
 	if (db->db_level != 0) {
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
 
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 
 	/*
 	 * Drop the hold that dbuf_dirty() took with the txg as its tag.  If
 	 * that was the last hold, the dbuf is evicted here.
 	 */
 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
 		arc_buf_t *buf = db->db_buf;
 
 		ASSERT(db->db_state == DB_NOFILL || arc_released(buf));
 		dbuf_set_data(db, NULL);
 		VERIFY(arc_buf_remove_ref(buf, db));
 		dbuf_evict(db);
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Read the dbuf's current contents and then mark it dirty in tx.
  *
  * The read is issued with DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, plus
  * DB_RF_HAVESTRUCT when the dnode's dn_struct_rwlock is already held for
  * writing.  tx must be assigned (non-zero txg) and db must be held.
  */
 void
 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	int struct_write_held;
 	int read_flags;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	/* Sample the lock state while the dnode handle is pinned. */
 	DB_DNODE_ENTER(db);
 	struct_write_held = RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock);
 	DB_DNODE_EXIT(db);
 
 	read_flags = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
 	if (struct_write_held)
 		read_flags |= DB_RF_HAVESTRUCT;
 
 	(void) dbuf_read(db, NULL, read_flags);
 	(void) dbuf_dirty(db, tx);
 }
 
 /*
  * Put the dbuf into the DB_NOFILL state and then run the normal
  * "will fill" path for it in tx.
  */
 void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	((dmu_buf_impl_t *)db_fake)->db_state = DB_NOFILL;
 	dmu_buf_will_fill(db_fake, tx);
 }
 
 /*
  * Prepare a level-0, non-bonus dbuf to be overwritten in tx without first
  * reading its old contents: dbuf_noread() is called and the dbuf is then
  * dirtied in tx.
  *
  * The caller must hold the dbuf and tx must be assigned (non-zero txg).
  * Buffers of the meta-dnode object additionally require a tx for which
  * dmu_tx_private_ok() is true.
  */
 void
 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(db->db_level == 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
 	    dmu_tx_private_ok(tx));
 
 	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);
 }
 
 #pragma weak dmu_buf_fill_done = dbuf_fill_done
 /*
  * Finish a fill: if the dbuf is in DB_FILL, move it to DB_CACHED and wake
  * every thread waiting on db_changed.  A level-0 dbuf that was freed while
  * the fill was in flight has its data zeroed first.  Any other state is
  * left untouched.  tx is unused.
  */
 /* ARGSUSED */
 void
 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 
 	/* Nothing to finish unless a fill is actually in progress. */
 	if (db->db_state != DB_FILL) {
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	if (db->db_level == 0 && db->db_freed_in_flight) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		/*
 		 * The block was freed during the fill, so discard what was
 		 * written.  XXX should this call dbuf_undirty() instead?
 		 */
 		bzero(db->db.db_data, db->db.db_size);
 		db->db_freed_in_flight = FALSE;
 	}
 
 	db->db_state = DB_CACHED;
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 }
 
 /*
  * Dirty a level-0, non-bonus dbuf in tx without filling it, and record an
  * embedded block pointer built from the supplied payload as the override
  * for its dirty record.
  *
  * data, comp, uncompressed_size and compressed_size are passed through to
  * encode_embedded_bp_compressed().  etype, the owning dnode's type, level 0
  * and byteorder are then stamped into the resulting block pointer, whose
  * birth txg is set to the dirty record's txg.
  */
 void
 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
 	struct dirty_leaf *dl;
 	dmu_object_type_t type;
 
 	/* Capture the dnode's object type while the handle is pinned. */
 	DB_DNODE_ENTER(db);
 	type = DB_DNODE(db)->dn_type;
 	DB_DNODE_EXIT(db);
 
 	ASSERT0(db->db_level);
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	dmu_buf_will_not_fill(dbuf, tx);
 
 	/* The newest dirty record must now belong to this tx's txg. */
 	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
 	dl = &db->db_last_dirty->dt.dl;
 	encode_embedded_bp_compressed(&dl->dr_overridden_by,
 	    data, comp, uncompressed_size, compressed_size);
 	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
 	BP_SET_TYPE(&dl->dr_overridden_by, type);
 	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
 	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
 
 	/* Mark the record overridden so dr_overridden_by is what is used. */
 	dl->dr_override_state = DR_OVERRIDDEN;
 	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
 }
 
 /*
  * Directly assign a provided arc buf to a given dbuf if it's not referenced
  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
  *
  * db must be a held, level-0, non-bonus data dbuf whose size equals the
  * size of buf, and tx must be assigned.  In both paths the dbuf is dirtied
  * in tx.  On the copy path this function drops its reference on buf after
  * copying; on the assign path buf becomes the dbuf's data buffer.
  */
 void
 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(db->db_level == 0);
 	ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA);
 	ASSERT(buf != NULL);
 	ASSERT(arc_buf_size(buf) == db->db.db_size);
 	ASSERT(tx->tx_txg != 0);
 
 	arc_return_buf(buf, db);
 	ASSERT(arc_released(buf));
 
 	mutex_enter(&db->db_mtx);
 
 	/* Wait for any in-progress read or fill of this dbuf to finish. */
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 
 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
 
 	/*
 	 * Copy path: the dbuf is cached and has more holds than the caller's
 	 * own hold plus one per dirty record, so its current buffer cannot
 	 * be swapped out.  Dirty it and copy the new contents in instead.
 	 */
 	if (db->db_state == DB_CACHED &&
 	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_dirty(db, tx);
 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
 		VERIFY(arc_buf_remove_ref(buf, db));
 		xuio_stat_wbuf_copied();
 		return;
 	}
 
 	/* Assign path: replace the dbuf's buffer with the caller's buf. */
 	xuio_stat_wbuf_nocopy();
 	if (db->db_state == DB_CACHED) {
 		dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 		ASSERT(db->db_buf != NULL);
 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
 			/*
 			 * Already dirty in this txg: point the dirty record
 			 * at the new buffer and drop the old one.
 			 */
 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
 			if (!arc_released(db->db_buf)) {
 				ASSERT(dr->dt.dl.dr_override_state ==
 				    DR_OVERRIDDEN);
 				arc_release(db->db_buf, db);
 			}
 			dr->dt.dl.dr_data = buf;
 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
 			/*
 			 * No dirty record references the current buffer, so
 			 * it can be released and dropped.
 			 */
 			arc_release(db->db_buf, db);
 			VERIFY(arc_buf_remove_ref(db->db_buf, db));
 		}
 		/*
 		 * In the remaining case an older dirty record still points
 		 * at the old buffer; it is only detached from the dbuf here.
 		 */
 		db->db_buf = NULL;
 	}
 	ASSERT(db->db_buf == NULL);
 	dbuf_set_data(db, buf);
 	/*
 	 * Enter DB_FILL before dirtying; dmu_buf_fill_done() below moves the
 	 * dbuf to DB_CACHED and wakes waiters.
 	 */
 	db->db_state = DB_FILL;
 	mutex_exit(&db->db_mtx);
 	(void) dbuf_dirty(db, tx);
 	dmu_buf_fill_done(&db->db, tx);
 }
 
 /*
  * "Clear" the contents of this dbuf.  This will mark the dbuf
  * EVICTING and clear *most* of its references.  Unfortunately,
  * when we are not holding the dn_dbufs_mtx, we can't clear the
  * entry in the dn_dbufs list.  We have to wait until dbuf_destroy()
  * in this case.  For callers from the DMU we will usually see:
  *	dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy()
  * For the arc callback, we will usually see:
  *	dbuf_do_evict()->dbuf_clear();dbuf_destroy()
  * Sometimes, though, we will get a mix of these two:
  *	DMU: dbuf_clear()->arc_clear_callback()
  *	ARC: dbuf_do_evict()->dbuf_destroy()
  *
  * This routine will dissociate the dbuf from the arc, by calling
  * arc_clear_callback(), but will not evict the data from the ARC.
  *
  * The caller must hold db->db_mtx and the dbuf must have no holds.  The
  * mutex is released here unless arc_clear_callback() reports that the dbuf
  * is already gone.
  */
 void
 dbuf_clear(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	/* Captured up front; db may no longer be usable at the end. */
 	dmu_buf_impl_t *parent = db->db_parent;
 	dmu_buf_impl_t *dndb;
 	boolean_t dbuf_gone = B_FALSE;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(refcount_is_zero(&db->db_holds));
 
 	dbuf_evict_user(db);
 
 	if (db->db_state == DB_CACHED) {
 		ASSERT(db->db.db_data != NULL);
 		/* Bonus data comes from zio_buf; free it and its accounting. */
 		if (db->db_blkid == DMU_BONUS_BLKID) {
 			zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		}
 		db->db.db_data = NULL;
 		db->db_state = DB_UNCACHED;
 	}
 
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_data_pending == NULL);
 
 	db->db_state = DB_EVICTING;
 	db->db_blkptr = NULL;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dndb = dn->dn_dbuf;
 	/*
 	 * The dbuf can only be taken off dn_dbufs here when the caller
 	 * already holds dn_dbufs_mtx; otherwise dbuf_destroy() does it later.
 	 */
 	if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
 		avl_remove(&dn->dn_dbufs, db);
 		(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
 		membar_producer();
 		DB_DNODE_EXIT(db);
 		/*
 		 * Decrementing the dbuf count means that the hold corresponding
 		 * to the removed dbuf is no longer discounted in dnode_move(),
 		 * so the dnode cannot be moved until after we release the hold.
 		 * The membar_producer() ensures visibility of the decremented
 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
 		 * release any lock.
 		 */
 		dnode_rele(dn, db);
 		/* A NULL handle tells dbuf_destroy() the removal is done. */
 		db->db_dnode_handle = NULL;
 	} else {
 		DB_DNODE_EXIT(db);
 	}
 
 	if (db->db_buf)
 		dbuf_gone = arc_clear_callback(db->db_buf);
 
 	/* When the dbuf is already gone its mutex must not be touched. */
 	if (!dbuf_gone)
 		mutex_exit(&db->db_mtx);
 
 	/*
 	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
 	if (parent && parent != dndb)
 		dbuf_rele(parent, db);
 }
 
 /*
  * Locate the block pointer for block (level, blkid) of dnode dn.
  *
  * On success (return 0) *bpp points at the block pointer, or is NULL for a
  * spill block when the dnode has no spill block pointer.  *parentp is the
  * dbuf that contains the pointer, with a hold (tag NULL) added that the
  * caller must release.  *parentp stays NULL when the pointer lives in the
  * dnode and dn->dn_dbuf is NULL.
  *
  * Returns ENOENT when the block has no parent yet, or the error from
  * holding or reading the parent indirect block; in both cases *parentp and
  * *bpp are NULL.  fail_sparse is only passed on to dbuf_hold_impl() for the
  * parent.  Except for spill blocks, dn_struct_rwlock must be held.
  */
 static int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
     dmu_buf_impl_t **parentp, blkptr_t **bpp)
 {
 	int nlevels, epbs;
 
 	*parentp = NULL;
 	*bpp = NULL;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 
 	/* The spill block pointer lives in the dnode itself. */
 	if (blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_have_spill &&
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 			*bpp = &dn->dn_phys->dn_spill;
 		else
 			*bpp = NULL;
 		dbuf_add_ref(dn->dn_dbuf, NULL);
 		*parentp = dn->dn_dbuf;
 		mutex_exit(&dn->dn_mtx);
 		return (0);
 	}
 
 	/* Treat an on-disk level count of zero as a single level. */
 	if (dn->dn_phys->dn_nlevels == 0)
 		nlevels = 1;
 	else
 		nlevels = dn->dn_phys->dn_nlevels;
 
 	/* log2 of the number of block pointers per indirect block. */
 	epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 	ASSERT3U(level * epbs, <, 64);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	if (level >= nlevels ||
 	    (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
 		/* the buffer has no parent yet */
 		return (SET_ERROR(ENOENT));
 	} else if (level < nlevels-1) {
 		/* this block is referenced from an indirect block */
 		int err = dbuf_hold_impl(dn, level+1,
 		    blkid >> epbs, fail_sparse, NULL, parentp);
 		if (err)
 			return (err);
 		err = dbuf_read(*parentp, NULL,
 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
 		if (err) {
 			/* Drop the hold just taken; report no parent. */
 			dbuf_rele(*parentp, NULL);
 			*parentp = NULL;
 			return (err);
 		}
 		/* Index into the parent by the low epbs bits of blkid. */
 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
 		    (blkid & ((1ULL << epbs) - 1));
 		return (0);
 	} else {
 		/* the block is referenced from the dnode */
 		ASSERT3U(level, ==, nlevels-1);
 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
 		    blkid < dn->dn_phys->dn_nblkptr);
 		if (dn->dn_dbuf) {
 			dbuf_add_ref(dn->dn_dbuf, NULL);
 			*parentp = dn->dn_dbuf;
 		}
 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
 		return (0);
 	}
 }
 
 /*
  * Allocate and initialise a dbuf for block (level, blkid) of dn, recording
  * the given parent dbuf and block pointer.  dn_struct_rwlock must be held.
  *
  * A bonus dbuf is returned right after initialisation and is not placed in
  * the hash table or on dn_dbufs.  Any other dbuf is inserted into the hash
  * table and dn_dbufs under dn_dbufs_mtx.  If dbuf_hash_insert() reports
  * that an equivalent dbuf already exists, the new allocation is freed and
  * the existing dbuf is returned instead.
  *
  * For a newly inserted dbuf a hold is added on the parent (unless the
  * parent is the dnode's own dbuf) and on the dnode, both tagged with the
  * new dbuf.
  *
  * NOTE(review): dbuf_hold_impl() drops db_mtx on the dbuf returned from
  * here, so the dbuf presumably comes back with db_mtx held by
  * dbuf_hash_insert() -- confirm.
  */
 static dmu_buf_impl_t *
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     dmu_buf_impl_t *parent, blkptr_t *blkptr)
 {
 	objset_t *os = dn->dn_objset;
 	dmu_buf_impl_t *db, *odb;
 
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_type != DMU_OT_NONE);
 
 	db = kmem_cache_alloc(dbuf_cache, KM_SLEEP);
 
 	db->db_objset = os;
 	db->db.db_object = dn->dn_object;
 	db->db_level = level;
 	db->db_blkid = blkid;
 	db->db_last_dirty = NULL;
 	db->db_dirtycnt = 0;
 	db->db_dnode_handle = dn->dn_handle;
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;
 
 	db->db_user_ptr = NULL;
 	db->db_user_data_ptr_ptr = NULL;
 	db->db_evict_func = NULL;
 	db->db_immediate_evict = 0;
 	db->db_freed_in_flight = 0;
 
 	if (blkid == DMU_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
 		/*
 		 * The bonus area shrinks by one blkptr_t for every block
 		 * pointer the dnode has beyond the first.
 		 */
 		db->db.db_size = DN_MAX_BONUSLEN -
 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		db->db.db_offset = DMU_BONUS_BLKID;
 		db->db_state = DB_UNCACHED;
 		/* the bonus dbuf is not placed in the hash table */
 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 		return (db);
 	} else if (blkid == DMU_SPILL_BLKID) {
 		/* Size comes from the block pointer when there is one. */
 		db->db.db_size = (blkptr != NULL) ?
 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
 		db->db.db_offset = 0;
 	} else {
 		/* Indirect blocks use the indirect size, leaves the data size. */
 		int blocksize =
 		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
 		db->db.db_size = blocksize;
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
 
 	/*
 	 * Hold the dn_dbufs_mtx while we get the new dbuf
 	 * in the hash table *and* added to the dbufs list.
 	 * This prevents a possible deadlock with someone
 	 * trying to look up this dbuf before its added to the
 	 * dn_dbufs list.
 	 */
 	mutex_enter(&dn->dn_dbufs_mtx);
 	/*
 	 * NOTE(review): the dbuf is marked DB_EVICTING while it is being
 	 * inserted, presumably so that concurrent lookups skip it until it
 	 * is also on dn_dbufs -- confirm against dbuf_find().
 	 */
 	db->db_state = DB_EVICTING;
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
 		kmem_cache_free(dbuf_cache, db);
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return (odb);
 	}
 	avl_add(&dn->dn_dbufs, db);
 	/* Keep dn_unlisted_l0_blkid above every level-0 blkid created. */
 	if (db->db_level == 0 && db->db_blkid >=
 	    dn->dn_unlisted_l0_blkid)
 		dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
 	db->db_state = DB_UNCACHED;
 	mutex_exit(&dn->dn_dbufs_mtx);
 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 
 	/* The dnode's own dbuf is not separately held as a parent. */
 	if (parent && parent != dn->dn_dbuf)
 		dbuf_add_ref(parent, db);
 
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    refcount_count(&dn->dn_holds) > 0);
 	(void) refcount_add(&dn->dn_holds, db);
 	(void) atomic_inc_32_nv(&dn->dn_dbufs_count);
 
 	dprintf_dbuf(db, "db=%p\n", db);
 
 	return (db);
 }
 
 /*
  * Eviction entry point for a dbuf with no holds; private is the dbuf.
  * db_mtx is taken here unless the calling thread already holds it.
  *
  * A dbuf already marked DB_EVICTING only needs to be destroyed.  Otherwise
  * it must be DB_CACHED: its ARC buffer pointer is dropped and the dbuf is
  * evicted.  Always returns 0.
  */
 static int
 dbuf_do_evict(void *private)
 {
 	dmu_buf_impl_t *db = private;
 
 	if (!MUTEX_HELD(&db->db_mtx))
 		mutex_enter(&db->db_mtx);
 
 	ASSERT(refcount_is_zero(&db->db_holds));
 
 	if (db->db_state == DB_EVICTING) {
 		/* Clearing already happened; finish tearing it down. */
 		mutex_exit(&db->db_mtx);
 		dbuf_destroy(db);
 		return (0);
 	}
 
 	ASSERT(db->db_state == DB_CACHED);
 	DBUF_VERIFY(db);
 	db->db_buf = NULL;
 	dbuf_evict(db);
 	return (0);
 }
 
 /*
  * Free a dbuf that has no holds.
  *
  * For non-bonus dbufs this first removes the dbuf from the dnode's dn_dbufs
  * tree when that has not already happened (signalled by a non-NULL
  * db_dnode_handle), drops the dbuf's hold on the dnode, and removes the
  * dbuf from the hash table.  The dbuf is then returned to dbuf_cache and
  * its ARC space accounting is given back.
  */
 static void
 dbuf_destroy(dmu_buf_impl_t *db)
 {
 	ASSERT(refcount_is_zero(&db->db_holds));
 
 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		/*
 		 * If this dbuf is still on the dn_dbufs list,
 		 * remove it from that list.
 		 */
 		if (db->db_dnode_handle != NULL) {
 			dnode_t *dn;
 
 			DB_DNODE_ENTER(db);
 			dn = DB_DNODE(db);
 			mutex_enter(&dn->dn_dbufs_mtx);
 			avl_remove(&dn->dn_dbufs, db);
 			(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
 			mutex_exit(&dn->dn_dbufs_mtx);
 			DB_DNODE_EXIT(db);
 			/*
 			 * Decrementing the dbuf count means that the hold
 			 * corresponding to the removed dbuf is no longer
 			 * discounted in dnode_move(), so the dnode cannot be
 			 * moved until after we release the hold.
 			 */
 			dnode_rele(dn, db);
 			db->db_dnode_handle = NULL;
 		}
 		dbuf_hash_remove(db);
 	}
 	db->db_parent = NULL;
 	db->db_buf = NULL;
 
 	/* Everything else must already have been torn down. */
 	ASSERT(db->db.db_data == NULL);
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);
 	ASSERT(db->db_data_pending == NULL);
 
 	kmem_cache_free(dbuf_cache, db);
 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 }
 
 /*
  * Start a prefetch of level-0 block blkid of dn at I/O priority prio.
  *
  * Nothing is done when the block is being freed, when a dbuf for it
  * already exists, or when its block pointer cannot be found, is a hole, or
  * is embedded.  Otherwise an arc_read() is issued with
  * ARC_NOWAIT | ARC_PREFETCH and ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
  * with no completion callback.  dn_struct_rwlock must be held.
  */
 void
 dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio)
 {
 	dmu_buf_impl_t *db;
 	blkptr_t *bp = NULL;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
 	if (dnode_block_freed(dn, blkid))
 		return;
 
 	/* dbuf_find() returns with db_mtx held */
 	db = dbuf_find(dn, 0, blkid);
 	if (db != NULL) {
 		/*
 		 * This dbuf is already in the cache.  We assume that
 		 * it is already CACHED, or else about to be either
 		 * read or filled.
 		 */
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	/*
 	 * From here on db is reused for the parent dbuf that dbuf_findbp()
 	 * returns held; it is released once the read has been issued.
 	 */
 	if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
 		if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) {
 			dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 			uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
 			zbookmark_phys_t zb;
 
 			SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 			    dn->dn_object, 0, blkid);
 
 			(void) arc_read(NULL, dn->dn_objset->os_spa,
 			    bp, NULL, NULL, prio,
 			    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 			    &aflags, &zb);
 		}
 		if (db)
 			dbuf_rele(db, NULL);
 	}
 }
 
 /*
  * Returns with db_holds incremented, and db_mtx not held.
  * Note: dn_struct_rwlock must be held.
  *
  * Finds or creates the dbuf for block (level, blkid) of dn, adds a hold
  * tagged with tag, and stores the dbuf in *dbp.  Returns 0 on success.
  * On failure *dbp is NULL and the return value is the error from
  * dbuf_findbp(); with fail_sparse set, ENOENT is also returned when the
  * block pointer is a hole.  Without fail_sparse, ENOENT from dbuf_findbp()
  * is not an error: the dbuf is created anyway.
  */
 int
 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse,
     void *tag, dmu_buf_impl_t **dbp)
 {
 	dmu_buf_impl_t *db, *parent = NULL;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT3U(dn->dn_nlevels, >, level);
 
 	*dbp = NULL;
 top:
 	/* dbuf_find() returns with db_mtx held */
 	db = dbuf_find(dn, level, blkid);
 
 	if (db == NULL) {
 		blkptr_t *bp = NULL;
 		int err;
 
 		ASSERT3P(parent, ==, NULL);
 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 		if (fail_sparse) {
 			if (err == 0 && bp && BP_IS_HOLE(bp))
 				err = SET_ERROR(ENOENT);
 			if (err) {
 				if (parent)
 					dbuf_rele(parent, NULL);
 				return (err);
 			}
 		}
 		if (err && err != ENOENT)
 			return (err);
 		db = dbuf_create(dn, level, blkid, parent, bp);
 	}
 
 	/*
 	 * The dbuf has a buffer but no holds: take a reference on the
 	 * buffer.  If the buffer has no data, clear the dbuf, drop any
 	 * parent hold taken above and start over.
 	 */
 	if (db->db_buf && refcount_is_zero(&db->db_holds)) {
 		arc_buf_add_ref(db->db_buf, db);
 		if (db->db_buf->b_data == NULL) {
 			dbuf_clear(db);
 			if (parent) {
 				dbuf_rele(parent, NULL);
 				parent = NULL;
 			}
 			goto top;
 		}
 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
 	}
 
 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
 
 	/*
 	 * If this buffer is currently syncing out, and we are
 	 * still referencing it from db_data, we need to make a copy
 	 * of it in case we decide we want to dirty it again in this txg.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    db->db_state == DB_CACHED && db->db_data_pending) {
 		dbuf_dirty_record_t *dr = db->db_data_pending;
 
 		if (dr->dt.dl.dr_data == db->db_buf) {
 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 			/* Give the dbuf its own buffer, then copy the data. */
 			dbuf_set_data(db,
 			    arc_buf_alloc(dn->dn_objset->os_spa,
 			    db->db.db_size, db, type));
 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
 			    db->db.db_size);
 		}
 	}
 
 	(void) refcount_add(&db->db_holds, tag);
 	dbuf_update_data(db);
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
 	if (parent)
 		dbuf_rele(parent, NULL);
 
 	ASSERT3P(DB_DNODE(db), ==, dn);
 	ASSERT3U(db->db_blkid, ==, blkid);
 	ASSERT3U(db->db_level, ==, level);
 	*dbp = db;
 
 	return (0);
 }
 
 /*
  * Hold the level-0 dbuf for block blkid of dn, tagged with tag.
  * Returns the held dbuf, or NULL when dbuf_hold_impl() fails.
  */
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *held;
 
 	if (dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &held) != 0)
 		return (NULL);
 
 	return (held);
 }
 
 /*
  * Hold the dbuf for block blkid at the given level of dn, tagged with tag.
  * Returns the held dbuf, or NULL when dbuf_hold_impl() fails.
  */
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *held;
 
 	if (dbuf_hold_impl(dn, level, blkid, FALSE, tag, &held) != 0)
 		return (NULL);
 
 	return (held);
 }
 
 /*
  * Create the bonus dbuf for dn and store it in dn->dn_bonus.  The caller
  * must hold dn_struct_rwlock for writing and dn must not have a bonus dbuf
  * yet.
  */
 void
 dbuf_create_bonus(dnode_t *dn)
 {
 	dmu_buf_impl_t *bonus;
 
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_bonus == NULL);
 
 	/* The bonus dbuf's parent is the dnode's own dbuf. */
 	bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
 	dn->dn_bonus = bonus;
 }
 
 /*
  * Change the size of a spill-block dbuf in tx.
  *
  * Returns ENOTSUP when db_fake is not the spill block, otherwise 0.  A
  * requested size of zero becomes SPA_MINBLOCKSIZE; a size above
  * SPA_MAXBLOCKSIZE is clamped to it; anything else is rounded up to a
  * multiple of SPA_MINBLOCKSIZE.  The resize itself runs with the dnode's
  * dn_struct_rwlock held for writing.
  */
 int
 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *spill = (dmu_buf_impl_t *)db_fake;
 	dnode_t *owner;
 
 	if (spill->db_blkid != DMU_SPILL_BLKID)
 		return (SET_ERROR(ENOTSUP));
 
 	/* Normalise the requested size. */
 	if (blksz == 0)
 		blksz = SPA_MINBLOCKSIZE;
 	blksz = (blksz > SPA_MAXBLOCKSIZE) ?
 	    SPA_MAXBLOCKSIZE : P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
 	DB_DNODE_ENTER(spill);
 	owner = DB_DNODE(spill);
 	rw_enter(&owner->dn_struct_rwlock, RW_WRITER);
 	dbuf_new_size(spill, blksz, tx);
 	rw_exit(&owner->dn_struct_rwlock);
 	DB_DNODE_EXIT(spill);
 
 	return (0);
 }
 
 /*
  * Free the spill block of dn in tx by freeing the one-block range that
  * starts and ends at DMU_SPILL_BLKID.
  */
 void
 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
 	const uint64_t spill_blkid = DMU_SPILL_BLKID;
 
 	dbuf_free_range(dn, spill_blkid, spill_blkid, tx);
 }
 
 #pragma weak dmu_buf_add_ref = dbuf_add_ref
 /*
  * Add a hold tagged with tag to a dbuf that is already held: the hold
  * count after the addition is asserted to be greater than one.
  */
 void
 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds;
 
 	holds = refcount_add(&db->db_holds, tag);
 	ASSERT(holds > 1);
 }
 
 /*
  * If you call dbuf_rele() you had better not be referencing the dnode handle
  * unless you have some other direct or indirect hold on the dnode. (An indirect
  * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
  * dnode's parent dbuf evicting its dnode handles.
  *
  * Releases the hold on db that was taken with tag.  db_mtx is acquired here
  * and is released by dbuf_rele_and_unlock().
  */
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
 	mutex_enter(&db->db_mtx);
 	dbuf_rele_and_unlock(db, tag);
 }
 
 /*
  * Public-handle wrapper around dbuf_rele(): release the hold on db that
  * was taken with tag.
  */
 void
 dmu_buf_rele(dmu_buf_t *db, void *tag)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)db;
 
 	dbuf_rele(impl, tag);
 }
 
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
  * db_dirtycnt and db_holds to be updated atomically.
  *
  * The caller must hold db->db_mtx.  The hold tagged with tag is removed,
  * and on every path db_mtx is either released here or handed to
  * dbuf_evict()/dbuf_clear().  When the last hold goes away the dbuf is
  * evicted, cleared, or left cached, depending on its kind and on the state
  * of its ARC buffer.
  */
 void
 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	DBUF_VERIFY(db);
 
 	/*
 	 * Remove the reference to the dbuf before removing its hold on the
 	 * dnode so we can guarantee in dnode_move() that a referenced bonus
 	 * buffer has a corresponding dnode hold.
 	 */
 	holds = refcount_remove(&db->db_holds, tag);
 	ASSERT(holds >= 0);
 
 	/*
 	 * We can't freeze indirects if there is a possibility that they
 	 * may be modified in the current syncing context.
 	 *
 	 * Hence level-0 buffers are frozen once only the dirty-record holds
 	 * remain, while indirect buffers are frozen only with no holds left.
 	 */
 	if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0))
 		arc_buf_freeze(db->db_buf);
 
 	/* Immediate-evict users are told as soon as only dirty holds remain. */
 	if (holds == db->db_dirtycnt &&
 	    db->db_level == 0 && db->db_immediate_evict)
 		dbuf_evict_user(db);
 
 	if (holds == 0) {
 		if (db->db_blkid == DMU_BONUS_BLKID) {
 			mutex_exit(&db->db_mtx);
 
 			/*
 			 * If the dnode moves here, we cannot cross this barrier
 			 * until the move completes.
 			 */
 			DB_DNODE_ENTER(db);
 			(void) atomic_dec_32_nv(&DB_DNODE(db)->dn_dbufs_count);
 			DB_DNODE_EXIT(db);
 			/*
 			 * The bonus buffer's dnode hold is no longer discounted
 			 * in dnode_move(). The dnode cannot move until after
 			 * the dnode_rele().
 			 */
 			dnode_rele(DB_DNODE(db), db);
 		} else if (db->db_buf == NULL) {
 			/*
 			 * This is a special case: we never associated this
 			 * dbuf with any data allocated from the ARC.
 			 */
 			ASSERT(db->db_state == DB_UNCACHED ||
 			    db->db_state == DB_NOFILL);
 			dbuf_evict(db);
 		} else if (arc_released(db->db_buf)) {
 			arc_buf_t *buf = db->db_buf;
 			/*
 			 * This dbuf has anonymous data associated with it.
 			 */
 			dbuf_set_data(db, NULL);
 			VERIFY(arc_buf_remove_ref(buf, db));
 			dbuf_evict(db);
 		} else {
 			/*
 			 * Drop the reference on the buffer; the call is
 			 * required to return false here.
 			 */
 			VERIFY(!arc_buf_remove_ref(db->db_buf, db));
 
 			/*
 			 * A dbuf will be eligible for eviction if either the
 			 * 'primarycache' property is set or a duplicate
 			 * copy of this buffer is already cached in the arc.
 			 *
 			 * In the case of the 'primarycache' a buffer
 			 * is considered for eviction if it matches the
 			 * criteria set in the property.
 			 *
 			 * To decide if our buffer is considered a
 			 * duplicate, we must call into the arc to determine
 			 * if multiple buffers are referencing the same
 			 * block on-disk. If so, then we simply evict
 			 * ourselves.
 			 */
 			if (!DBUF_IS_CACHEABLE(db)) {
 				if (db->db_blkptr != NULL &&
 				    !BP_IS_HOLE(db->db_blkptr) &&
 				    !BP_IS_EMBEDDED(db->db_blkptr)) {
 					/*
 					 * Copy the spa and block pointer
 					 * first: dbuf_clear() resets
 					 * db_blkptr and may leave db gone.
 					 */
 					spa_t *spa =
 					    dmu_objset_spa(db->db_objset);
 					blkptr_t bp = *db->db_blkptr;
 					dbuf_clear(db);
 					arc_freed(spa, &bp);
 				} else {
 					dbuf_clear(db);
 				}
 			} else if (arc_buf_eviction_needed(db->db_buf)) {
 				dbuf_clear(db);
 			} else {
 				/* Leave the dbuf cached with no holds. */
 				mutex_exit(&db->db_mtx);
 			}
 		}
 	} else {
 		mutex_exit(&db->db_mtx);
 	}
 }
 
 #pragma weak dmu_buf_refcount = dbuf_refcount
 /*
  * Return the current number of holds on db.
  */
 uint64_t
 dbuf_refcount(dmu_buf_impl_t *db)
 {
 	uint64_t nholds = refcount_count(&db->db_holds);
 
 	return (nholds);
 }
 
 /*
  * Install user data on a dbuf that is expected to have none: equivalent to
  * dmu_buf_update_user() with an old user pointer of NULL.  Returns what
  * dmu_buf_update_user() returns.
  */
 void *
 dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
     dmu_buf_evict_func_t *evict_func)
 {
 	void *prev;
 
 	prev = dmu_buf_update_user(db_fake, NULL, user_ptr,
 	    user_data_ptr_ptr, evict_func);
 
 	return (prev);
 }
 
 /*
  * Same as dmu_buf_set_user(), but first flags the dbuf for immediate
  * eviction of its user data (db_immediate_evict).
  */
 void *
 dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, void *user_data_ptr_ptr,
     dmu_buf_evict_func_t *evict_func)
 {
 	((dmu_buf_impl_t *)db_fake)->db_immediate_evict = TRUE;
 
 	return (dmu_buf_update_user(db_fake, NULL, user_ptr,
 	    user_data_ptr_ptr, evict_func));
 }
 
 /*
  * Compare-and-swap the user data attached to a level-0 dbuf.
  *
  * When the dbuf's current user pointer equals old_user_ptr, the user
  * pointer, user data pointer-pointer and eviction callback are replaced
  * under db_mtx and old_user_ptr is returned.  Otherwise nothing changes and
  * the user pointer currently installed is returned.  user_ptr and
  * evict_func must both be NULL or both be non-NULL.
  */
 void *
 dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr,
     void *user_data_ptr_ptr, dmu_buf_evict_func_t *evict_func)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	void *installed;
 
 	ASSERT(db->db_level == 0);
 
 	ASSERT((user_ptr == NULL) == (evict_func == NULL));
 
 	mutex_enter(&db->db_mtx);
 
 	installed = db->db_user_ptr;
 	if (installed == old_user_ptr) {
 		db->db_user_ptr = user_ptr;
 		db->db_user_data_ptr_ptr = user_data_ptr_ptr;
 		db->db_evict_func = evict_func;
 
 		dbuf_update_data(db);
 	}
 
 	mutex_exit(&db->db_mtx);
 
 	/* Equals old_user_ptr exactly when the swap took place. */
 	return (installed);
 }
 
 /*
  * Return the user pointer currently attached to a held dbuf.
  */
 void *
 dmu_buf_get_user(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	void *user;
 
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	user = db->db_user_ptr;
 	return (user);
 }
 
 /*
  * Report whether the block behind dbuf is freeable according to
  * dsl_dataset_block_freeable(), using the block pointer's birth txg.
  * A dbuf without a block pointer is reported as not freeable.
  */
 boolean_t
 dmu_buf_freeable(dmu_buf_t *dbuf)
 {
 	dmu_buf_impl_t *impl = (dmu_buf_impl_t *)dbuf;
 	blkptr_t *bp = impl->db_blkptr;
 
 	if (bp == NULL)
 		return (B_FALSE);
 
 	return (dsl_dataset_block_freeable(impl->db_objset->os_dsl_dataset,
 	    bp, bp->blk_birth));
 }
 
 /*
  * Return the dbuf's block pointer (db_blkptr), which may be NULL.
  */
 blkptr_t *
 dmu_buf_get_blkptr(dmu_buf_t *db)
 {
 	return (((dmu_buf_impl_t *)db)->db_blkptr);
 }
 
 /*
  * Make sure db->db_blkptr is set, hooking the dbuf up to its parent when
  * that was not done at creation time.  Does nothing when db_blkptr is
  * already non-NULL.
  *
  * The caller must hold db->db_mtx.  The mutex may be dropped and retaken
  * inside this function (when the parent indirect dbuf has to be held), so
  * state read under db_mtx before the call must be re-checked afterwards.
  */
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
 	/* ASSERT(dmu_tx_is_syncing(tx)); -- disabled: no tx is passed in */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (db->db_blkptr != NULL)
 		return;
 
 	/* The spill block pointer lives in the dnode; start it out zeroed. */
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		db->db_blkptr = &dn->dn_phys->dn_spill;
 		BP_ZERO(db->db_blkptr);
 		return;
 	}
 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
 		/*
 		 * This buffer was allocated at a time when there was
 		 * no available blkptrs from the dnode, or it was
 		 * inappropriate to hook it in (i.e., nlevels mis-match).
 		 */
 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
 		ASSERT(db->db_parent == NULL);
 		db->db_parent = dn->dn_dbuf;
 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
 		DBUF_VERIFY(db);
 	} else {
 		dmu_buf_impl_t *parent = db->db_parent;
 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 		ASSERT(dn->dn_phys->dn_nlevels > 1);
 		if (parent == NULL) {
 			/*
 			 * db_mtx is dropped around the hold of the parent
 			 * and retaken afterwards.
 			 *
 			 * NOTE(review): the result of dbuf_hold_impl() is
 			 * ignored and parent is dereferenced below, so a
 			 * failed hold would leave parent NULL -- confirm
 			 * that the hold cannot fail here.
 			 */
 			mutex_exit(&db->db_mtx);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			(void) dbuf_hold_impl(dn, db->db_level+1,
 			    db->db_blkid >> epbs, FALSE, db, &parent);
 			rw_exit(&dn->dn_struct_rwlock);
 			mutex_enter(&db->db_mtx);
 			db->db_parent = parent;
 		}
 		/* Index into the parent by the low epbs bits of db_blkid. */
 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
 		    (db->db_blkid & ((1ULL << epbs) - 1));
 		DBUF_VERIFY(db);
 	}
 }
 
 /*
  * Sync the dirty record of an indirect (level > 0) dbuf in syncing context.
  *
  * The block is read in if it has no buffer yet, its block pointer is hooked
  * up, and dr is published as db_data_pending.  The write is then set up
  * with dbuf_write(), all child dirty records are synced via
  * dbuf_sync_list(), and only afterwards is this record's zio issued with
  * zio_nowait().
  */
 static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	zio_t *zio;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 
 	ASSERT(db->db_level > 0);
 	DBUF_VERIFY(db);
 
 	/* Read the block if it hasn't been read yet. */
 	if (db->db_buf == NULL) {
 		/* The read is issued with db_mtx dropped. */
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 		mutex_enter(&db->db_mtx);
 	}
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 	ASSERT(db->db_buf != NULL);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/* Indirect block size must match what the dnode thinks it is. */
 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
 	DB_DNODE_EXIT(db);
 
 	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 	dbuf_write(dr, db->db_buf, tx);
 
 	/*
 	 * Keep a local copy of the zio, sync every child under dr_mtx, and
 	 * only then issue this record's own zio.
 	 */
 	zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
 	dbuf_sync_list(&dr->dt.di.dr_children, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	zio_nowait(zio);
 }
 
 /*
  * Sync the dirty record of a level-0 dbuf in syncing context.
  *
  * Bonus buffers are copied straight into the dnode's bonus area; their
  * dirty record is then freed and the txg-tagged hold released, with no
  * I/O issued here.  For every other dbuf the block pointer is hooked up,
  * any in-progress dmu_sync of the record is waited for, the data is copied
  * when the buffer is still in use by other holders, and a write is set up
  * with dbuf_write().  That write is issued immediately, except for
  * meta-dnode dbufs, whose record is put back on the dnode's dirty list
  * instead.
  */
 static void
 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	arc_buf_t **datap = &dr->dt.dl.dr_data;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	objset_t *os;
 	uint64_t txg = tx->tx_txg;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * To be synced, we must be dirtied.  But we
 	 * might have been freed after the dirty.
 	 */
 	if (db->db_state == DB_UNCACHED) {
 		/* This buffer has been freed since it was dirtied */
 		ASSERT(db->db.db_data == NULL);
 	} else if (db->db_state == DB_FILL) {
 		/* This buffer was freed and is now being re-filled */
 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
 	} else {
 		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
 	}
 	DBUF_VERIFY(db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	/* Syncing a spill block records that the dnode has a spill blkptr. */
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
 		mutex_exit(&dn->dn_mtx);
 	}
 
 	/*
 	 * If this is a bonus buffer, simply copy the bonus data into the
 	 * dnode.  It will be written out when the dnode is synced (and it
 	 * will be synced, since it must have been dirty for dbuf_sync to
 	 * be called).
 	 *
 	 * For bonus buffers *datap is used as a raw data pointer: it is
 	 * handed directly to bcopy() and zio_buf_free() below.
 	 */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		dbuf_dirty_record_t **drp;
 
 		ASSERT(*datap != NULL);
 		ASSERT0(db->db_level);
 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
 		DB_DNODE_EXIT(db);
 
 		/* Free the record's private copy if it has one. */
 		if (*datap != db->db.db_data) {
 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		}
 		db->db_data_pending = NULL;
 		/* Find the link that points at dr and unlink the record. */
 		drp = &db->db_last_dirty;
 		while (*drp != dr)
 			drp = &(*drp)->dr_next;
 		ASSERT(dr->dr_next == NULL);
 		ASSERT(dr->dr_dbuf == db);
 		*drp = dr->dr_next;
 		/*
 		 * NOTE(review): db_level was asserted to be 0 above, so this
 		 * branch looks unreachable when the assertion holds --
 		 * confirm whether it can be removed.
 		 */
 		if (dr->dr_dbuf->db_level != 0) {
 			list_destroy(&dr->dt.di.dr_children);
 			mutex_destroy(&dr->dt.di.dr_mtx);
 		}
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		/* Releases the txg-tagged hold and drops db_mtx. */
 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 		return;
 	}
 
 	os = dn->dn_objset;
 
 	/*
 	 * dbuf_check_blkptr() may drop the db_mtx lock, allowing a dmu_sync
 	 * operation to sneak in. As a result, we need to ensure that we
 	 * don't check the dr_override_state until we have returned from
 	 * dbuf_check_blkptr.
 	 */
 	dbuf_check_blkptr(dn, db);
 
 	/*
 	 * If this buffer is in the middle of an immediate write,
 	 * wait for the synchronous IO to complete.
 	 */
 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		cv_wait(&db->db_changed, &db->db_mtx);
 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
 	}
 
 	if (db->db_state != DB_NOFILL &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    refcount_count(&db->db_holds) > 1 &&
 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
 	    *datap == db->db_buf) {
 		/*
 		 * If this buffer is currently "in use" (i.e., there
 		 * are active holds and db_data still references it),
 		 * then make a copy before we start the write so that
 		 * any modifications from the open txg will not leak
 		 * into this write.
 		 *
 		 * NOTE: this copy does not need to be made for
 		 * objects only modified in the syncing context (e.g.
 		 * the meta-dnode's blocks, which the
 		 * DMU_META_DNODE_OBJECT test above excludes).
 		 */
 		int blksz = arc_buf_size(*datap);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
 		bcopy(db->db.db_data, (*datap)->b_data, blksz);
 	}
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 
 	dbuf_write(dr, *datap, tx);
 
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
 		/*
 		 * Meta-dnode records go back on the dirty list with their
 		 * zio set up but not issued; dbuf_sync_list() stops when it
 		 * finds such a record.
 		 */
 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
 		DB_DNODE_EXIT(db);
 	} else {
 		/*
 		 * Although zio_nowait() does not "wait for an IO", it does
 		 * initiate the IO. If this is an empty write it seems plausible
 		 * that the IO could actually be completed before the nowait
 		 * returns. We need to DB_DNODE_EXIT() first in case
 		 * zio_nowait() invalidates the dbuf.
 		 */
 		DB_DNODE_EXIT(db);
 		zio_nowait(dr->dr_zio);
 	}
 }
 
 /*
  * Sync every dirty record on list in tx.
  *
  * Records are taken from the head of the list one at a time and handed to
  * dbuf_sync_indirect() or dbuf_sync_leaf() according to the level of their
  * dbuf.  Processing stops early at the first record that already has a zio
  * (see the comment in the loop).
  */
 void
 dbuf_sync_list(list_t *list, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
 	while ((dr = list_head(list)) != NULL) {
 		if (dr->dr_zio != NULL) {
 			/*
 			 * If we find an already initialized zio then we
 			 * are processing the meta-dnode, and we have finished.
 			 * The dbufs for all dnodes are put back on the list
 			 * during processing, so that we can zio_wait()
 			 * these IOs after initiating all child IOs.
 			 */
 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
 		/* Take the record off the list before syncing it. */
 		list_remove(list, dr);
 		if (dr->dr_dbuf->db_level > 0)
 			dbuf_sync_indirect(dr, tx);
 		else
 			dbuf_sync_leaf(dr, tx);
 	}
 }
 
 /*
  * "Ready" callback for a dbuf write zio.  It charges the change in the
  * block's on-disk size to the owning dnode, raises dn_maxblkid when a
  * level-0 block lies beyond it, and computes the fill count that is
  * stored in the block pointer (unless the bp is embedded).
  *
  *	zio - the write zio; zio->io_bp must be db->db_blkptr
  *	buf - unused
  *	vdb - the dmu_buf_impl_t being written
  */
 /* ARGSUSED */
 static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	dnode_t *dn;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	spa_t *spa = zio->io_spa;
 	int64_t delta;
 	uint64_t fill = 0;
 	int i;
 
 	ASSERT3P(db->db_blkptr, ==, bp);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/*
 	 * Charge only the part of the size delta that has not already been
 	 * recorded in io_prev_space_delta, then remember the new delta.
 	 */
 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
 	if (bp->blk_birth != 0) {
 		/*
 		 * The bp type must match the dnode's type (or its bonus
 		 * type for the spill block) unless the bp is embedded.
 		 */
 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_type) ||
 		    (db->db_blkid == DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
 		    BP_IS_EMBEDDED(bp));
 		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 	}
 #endif
 
 	if (db->db_level == 0) {
 		/* Grow dn_maxblkid (under dn_mtx) if this block is past it. */
 		mutex_enter(&dn->dn_mtx);
 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
 		    db->db_blkid != DMU_SPILL_BLKID)
 			dn->dn_phys->dn_maxblkid = db->db_blkid;
 		mutex_exit(&dn->dn_mtx);
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			/*
 			 * A block of dnodes: fill is the number of
 			 * dnode_phys_t slots whose type is not DMU_OT_NONE.
 			 */
 			dnode_phys_t *dnp = db->db.db_data;
 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
 			    i--, dnp++) {
 				if (dnp->dn_type != DMU_OT_NONE)
 					fill++;
 			}
 		} else {
 			/* Any other level-0 block: 0 for a hole, else 1. */
 			if (BP_IS_HOLE(bp)) {
 				fill = 0;
 			} else {
 				fill = 1;
 			}
 		}
 	} else {
 		/*
 		 * Indirect block: fill is the sum of the fill counts of
 		 * all non-hole child block pointers.
 		 */
 		blkptr_t *ibp = db->db.db_data;
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
 			if (BP_IS_HOLE(ibp))
 				continue;
 			fill += BP_GET_FILL(ibp);
 		}
 	}
 	DB_DNODE_EXIT(db);
 
 	/* Embedded block pointers do not get a fill count written here. */
 	if (!BP_IS_EMBEDDED(bp))
 		bp->blk_fill = fill;
 
 	mutex_exit(&db->db_mtx);
 }
 
 /*
  * The SPA will call this callback several times for each zio - once
  * for every physical child i/o (zio->io_phys_children times).  This
  * allows the DMU to monitor the progress of each logical i/o.  For example,
  * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
  * block.  There may be a long delay before all copies/fragments are completed,
  * so this callback allows us to retire dirty space gradually, as the physical
  * i/os complete.
  */
 /* ARGSUSED */
 static void
 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
 {
 	dmu_buf_impl_t *db = arg;
 	dbuf_dirty_record_t *pending = db->db_data_pending;
 	dsl_pool_t *pool = dmu_objset_pool(db->db_objset);
 	int share;
 
 	ASSERT3U(pending->dr_txg, ==, zio->io_txg);
 
 	/*
 	 * This callback fires io_phys_children times, so each invocation
 	 * retires an equal share of the record's accounted dirty space.
 	 * Whatever is lost to integer division is cleaned up later by
 	 * dsl_pool_sync()'s call to dsl_pool_undirty_space().
 	 */
 	share = pending->dr_accounted / zio->io_phys_children;
 	dsl_pool_undirty_space(pool, share, zio->io_txg);
 }
 
 /*
  * "Done" callback for a dbuf write zio.  Performs the dataset block
  * born/kill accounting (skipped for rewrites and nopwrites), unlinks and
  * frees the dirty record that was pending for this write, and drops the
  * dirty count on the dbuf.
  *
  *	zio - the completed write zio (io_error is asserted to be 0)
  *	buf - unused
  *	vdb - the dmu_buf_impl_t that was written
  */
 /* ARGSUSED */
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	blkptr_t *bp = db->db_blkptr;
 	objset_t *os = db->db_objset;
 	dmu_tx_t *tx = os->os_synctx;
 	dbuf_dirty_record_t **drp, *dr;
 
 	ASSERT0(zio->io_error);
 	ASSERT(db->db_blkptr == bp);
 
 	/*
 	 * For nopwrites and rewrites we ensure that the bp matches our
 	 * original and bypass all the accounting.
 	 */
 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 	DBUF_VERIFY(db);
 
 	/*
 	 * Find the pending record in the db_last_dirty chain and unlink it.
 	 * It is asserted to be the last record in the chain.
 	 */
 	drp = &db->db_last_dirty;
 	while ((dr = *drp) != db->db_data_pending)
 		drp = &dr->dr_next;
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	ASSERT(dr->dr_dbuf == db);
 	ASSERT(dr->dr_next == NULL);
 	*drp = dr->dr_next;
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 		DB_DNODE_EXIT(db);
 	}
 #endif
 
 	if (db->db_level == 0) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 		if (db->db_state != DB_NOFILL) {
 			/*
 			 * If the record wrote a buffer other than db_buf,
 			 * drop the reference on that buffer; otherwise
 			 * install the eviction callback on db_buf unless
 			 * it has been released.
 			 */
 			if (dr->dt.dl.dr_data != db->db_buf)
 				VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data,
 				    db));
 			else if (!arc_released(db->db_buf))
 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
 	} else {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 		if (!BP_IS_HOLE(db->db_blkptr)) {
 			int epbs =
 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_blkid, <=,
 			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
 			    db->db.db_size);
 			if (!arc_released(db->db_buf))
 				arc_set_callback(db->db_buf, dbuf_do_evict, db);
 		}
 		DB_DNODE_EXIT(db);
 		/* Tear down the indirect-only parts of the dirty record. */
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	cv_broadcast(&db->db_changed);
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
 	/*
 	 * NOTE(review): there is no mutex_exit() for db_mtx in this
 	 * function; presumably dbuf_rele_and_unlock() drops it, as its name
 	 * suggests -- confirm against its definition.
 	 */
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
 }
 
 /*
  * zio "ready" hook with the single-argument signature; forwards to
  * dbuf_write_ready() using the zio's private pointer as the dbuf and no
  * arc buffer.
  */
 static void
 dbuf_write_nofill_ready(zio_t *zio)
 {
 	void *vdb = zio->io_private;
 
 	dbuf_write_ready(zio, NULL, vdb);
 }
 
 /*
  * zio "done" hook with the single-argument signature; forwards to
  * dbuf_write_done() using the zio's private pointer as the dbuf and no
  * arc buffer.
  */
 static void
 dbuf_write_nofill_done(zio_t *zio)
 {
 	void *vdb = zio->io_private;
 
 	dbuf_write_done(zio, NULL, vdb);
 }
 
 /*
  * zio "ready" hook for overridden writes.  Here the zio's private pointer
  * is the dirty record, so the dbuf is taken from it before forwarding to
  * dbuf_write_ready().
  */
 static void
 dbuf_write_override_ready(zio_t *zio)
 {
 	dbuf_dirty_record_t *rec = zio->io_private;
 
 	dbuf_write_ready(zio, NULL, rec->dr_dbuf);
 }
 
 /*
  * zio "done" hook for overridden writes.  The zio's private pointer is
  * the dirty record.  If the bp that was finally written differs from the
  * override bp, the override bp is freed (unless it is a hole) and the
  * record's data buffer is released; then the normal completion path in
  * dbuf_write_done() runs.
  */
 static void
 dbuf_write_override_done(zio_t *zio)
 {
 	dbuf_dirty_record_t *rec = zio->io_private;
 	dmu_buf_impl_t *dbuf = rec->dr_dbuf;
 	blkptr_t *override_bp = &rec->dt.dl.dr_overridden_by;
 
 	mutex_enter(&dbuf->db_mtx);
 	if (!BP_EQUAL(zio->io_bp, override_bp)) {
 		if (!BP_IS_HOLE(override_bp)) {
 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg,
 			    override_bp);
 		}
 		arc_release(rec->dt.dl.dr_data, dbuf);
 	}
 	mutex_exit(&dbuf->db_mtx);
 
 	dbuf_write_done(zio, NULL, dbuf);
 }
 
 /*
  * Issue I/O to commit a dirty buffer to disk.
  *
  *	dr   - the dirty record being synced; dr->dr_zio is set to the
  *	       newly created write zio
  *	data - the arc buffer holding the contents to write (only
  *	       dereferenced in the override path when non-NULL)
  *	tx   - the syncing transaction; supplies the txg for the write
  *
  * The new zio is created as a child of the parent's zio: either the
  * parent indirect dbuf's pending zio or the dnode's dn_zio.  This
  * function only creates the zio; it does not issue or wait for it.
  */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	objset_t *os;
 	dmu_buf_impl_t *parent = db->db_parent;
 	uint64_t txg = tx->tx_txg;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	int wp_flag = 0;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	os = dn->dn_objset;
 
 	if (db->db_state != DB_NOFILL) {
 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
 			/*
 			 * Private object buffers are released here rather
 			 * than in dbuf_dirty() since they are only modified
 			 * in the syncing context and we don't want the
 			 * overhead of making multiple copies of the data.
 			 */
 			if (BP_IS_HOLE(db->db_blkptr)) {
 				arc_buf_thaw(data);
 			} else {
 				dbuf_release_bp(db);
 			}
 		}
 	}
 
 	/* Pick the parent zio that the new write will be a child of. */
 	if (parent != dn->dn_dbuf) {
 		/* Our parent is an indirect block. */
 		/* We have a dirty parent that has been scheduled for write. */
 		ASSERT(parent && parent->db_data_pending);
 		/* Our parent's buffer is one level closer to the dnode. */
 		ASSERT(db->db_level == parent->db_level-1);
 		/*
 		 * We're about to modify our parent's db_data by modifying
 		 * our block pointer, so the parent must be released.
 		 */
 		ASSERT(arc_released(parent->db_buf));
 		zio = parent->db_data_pending->dr_zio;
 	} else {
 		/* Our parent is the dnode itself. */
 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
 		    db->db_blkid != DMU_SPILL_BLKID) ||
 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
 		if (db->db_blkid != DMU_SPILL_BLKID)
 			ASSERT3P(db->db_blkptr, ==,
 			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		zio = dn->dn_zio;
 	}
 
 	ASSERT(db->db_level == 0 || data == db->db_buf);
 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
 	ASSERT(zio);
 
 	/*
 	 * Bookmark identifying this block; objsets without a dataset use
 	 * DMU_META_OBJSET as the objset id.
 	 */
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		wp_flag = WP_SPILL;
 	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
 
 	/* Fill in the write properties (zp) while the dnode is still held. */
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 	DB_DNODE_EXIT(db);
 
 	if (db->db_level == 0 &&
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * The BP for this block has been provided by open context
 		 * (by dmu_sync() or dmu_buf_write_embedded()).
 		 */
 		void *contents = (data != NULL) ? data->b_data : NULL;
 
 		/* The callbacks' private argument is the dirty record. */
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    db->db_blkptr, contents, db->db.db_size, &zp,
 		    dbuf_write_override_ready, NULL, dbuf_write_override_done,
 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		/*
 		 * Under db_mtx, clear the override state and hand the
 		 * pre-computed bp to the zio.
 		 */
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
 		/* NOFILL: write with no data buffer (ZIO_FLAG_NODATA). */
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    db->db_blkptr, NULL, db->db.db_size, &zp,
 		    dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		/* Ordinary case: write the (released) arc buffer. */
 		ASSERT(arc_released(data));
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
 		    db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db),
 		    DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready,
 		    dbuf_write_physdone, dbuf_write_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }
Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c	(revision 271001)
@@ -1,6944 +1,6944 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright (c) 2013, 2014, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2013 Martin Matuska <mm@FreeBSD.org>. All rights reserved.
  */
 
 /*
  * SPA: Storage Pool Allocator
  *
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_objset.h>
 #include <sys/unique.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/callb.h>
 #include <sys/spa_boot.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_scan.h>
 #include <sys/dmu_send.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_userhold.h>
 #include <sys/zfeature.h>
 #include <sys/zvol.h>
 #include <sys/trim_map.h>
 
 #ifdef	_KERNEL
 #include <sys/callb.h>
 #include <sys/cpupart.h>
 #include <sys/zone.h>
 #endif	/* _KERNEL */
 
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 
 /*
  * Check hostid on import?  Defaults to 1; registered below as the
  * read-write tunable/sysctl "vfs.zfs.check_hostid".
  */
 static int check_hostid = 1;
 
 SYSCTL_DECL(_vfs_zfs);
 TUNABLE_INT("vfs.zfs.check_hostid", &check_hostid);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RW, &check_hostid, 0,
     "Check hostid on import?");
 
 /*
  * The interval, in seconds, at which failed configuration cache file writes
  * should be retried.
  */
 static int zfs_ccw_retry_interval = 300;
 
 /* How the taskq(s) for one (I/O type, taskq type) pair are sized. */
 typedef enum zti_modes {
 	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
 	ZTI_MODE_BATCH,			/* cpu-intensive; value is ignored */
 	ZTI_MODE_NULL,			/* don't create a taskq */
 	ZTI_NMODES
 } zti_modes_t;
 
 /*
  * Initializers for zio_taskq_info_t, in field order
  * { zti_mode, zti_value, zti_count }.
  */
 #define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
 #define	ZTI_BATCH	{ ZTI_MODE_BATCH, 0, 1 }
 #define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
 
 /* Fixed mode with a single taskq of n threads / of one thread. */
 #define	ZTI_N(n)	ZTI_P(n, 1)
 #define	ZTI_ONE		ZTI_N(1)
 
 typedef struct zio_taskq_info {
 	zti_modes_t zti_mode;	/* sizing mode, see zti_modes_t */
 	uint_t zti_value;	/* second initializer; threads in FIXED mode */
 	uint_t zti_count;	/* third initializer; number of taskqs */
 } zio_taskq_info_t;
 
 /* One name per taskq type (ZIO_TASKQ_TYPES entries). */
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 	"issue", "issue_high", "intr", "intr_high"
 };
 
 /*
  * This table defines the taskq settings for each ZFS I/O type. When
  * initializing a pool, we use this table to create an appropriately sized
  * taskq. Some operations are low volume and therefore have a small, static
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
  * macros. Other operations process a large amount of data; the ZTI_BATCH
  * macro causes us to create a taskq oriented for throughput. Some operations
  * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
  * particular taskq is chosen at random.
  *
  * The different taskq priorities are to handle the different contexts (issue
  * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that
  * need to be handled with minimum delay.
  *
  * Rows are indexed by I/O type and columns by taskq type, in the order
  * given by the per-row and per-column comments below.
  */
 const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
 	{ ZTI_N(8),	ZTI_NULL,	ZTI_BATCH,	ZTI_NULL }, /* READ */
 	{ ZTI_BATCH,	ZTI_N(5),	ZTI_N(8),	ZTI_N(5) }, /* WRITE */
 	{ ZTI_P(12, 8),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* IOCTL */
 };
 
 /* Prototypes for static functions that are referenced before they are defined. */
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
 /*
  * Global taskq tuning knobs.
  * NOTE(review): the "1 thread per cpu" remark on zio_taskq_batch_pct does
  * not obviously match the value 75 -- confirm against the code that
  * consumes it.
  */
 uint_t		zio_taskq_batch_pct = 75;	/* 1 thread per cpu in pset */
 #ifdef PSRSET_BIND
 id_t		zio_taskq_psrset_bind = PS_NONE;
 #endif
 #ifdef SYSDC
 boolean_t	zio_taskq_sysdc = B_TRUE;	/* use SDC scheduling class */
 #endif
 uint_t		zio_taskq_basedc = 80;		/* base duty cycle */
 
 boolean_t	spa_create_process = B_TRUE;	/* no process ==> no sysdc */
 extern int	zfs_sync_pass_deferred_free;
 
 /* spa_deadman() is only declared here when not building for illumos. */
 #ifndef illumos
 extern void spa_deadman(void *arg);
 #endif
 
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
  */
 #define	TRYIMPORT_NAME	"$import"
 
 /*
  * ==========================================================================
  * SPA properties routines
  * ==========================================================================
  */
 
 /*
  * Publish one pool property into 'nvl'.  The property is stored under its
  * name as a nested nvlist holding the source (ZPROP_SOURCE) and the value
  * (ZPROP_VALUE).  The value is the string 'strval' when that is non-NULL,
  * otherwise the integer 'intval'.  Any nvlist failure trips VERIFY.
  */
 static void
 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval,
     uint64_t intval, zprop_source_t src)
 {
 	const char *name = zpool_prop_to_name(prop);
 	nvlist_t *entry;
 
 	VERIFY(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64(entry, ZPROP_SOURCE, src) == 0);
 
 	if (strval == NULL) {
 		VERIFY(nvlist_add_uint64(entry, ZPROP_VALUE, intval) == 0);
 	} else {
 		VERIFY(nvlist_add_string(entry, ZPROP_VALUE, strval) == 0);
 	}
 
 	VERIFY(nvlist_add_nvlist(nvl, name, entry) == 0);
 
 	/* The temporary nested list is freed once it has been added. */
 	nvlist_free(entry);
 }
 
 /*
  * Get property values from the spa configuration.
  *
  * Adds to *nvp the properties that come from in-core state rather than
  * from the MOS property object: name, size, allocated, free,
  * fragmentation, expandsz, readonly, capacity, dedupratio, health and
  * version (only when a root vdev exists); freeing and leaked (only when
  * the DSL pool exists); guid; and, when set, comment, altroot and
  * cachefile.  The caller must hold spa_props_lock.
  */
 static void
 spa_prop_get_config(spa_t *spa, nvlist_t **nvp)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
 	uint64_t size, alloc, cap, version;
 	zprop_source_t src = ZPROP_SRC_NONE;
 	spa_config_dirent_t *dp;
 	metaslab_class_t *mc = spa_normal_class(spa);
 
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
 	if (rvd != NULL) {
 		alloc = metaslab_class_get_alloc(spa_normal_class(spa));
 		size = metaslab_class_get_space(spa_normal_class(spa));
 		spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL,
 		    size - alloc, src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL,
 		    metaslab_class_fragmentation(mc), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL,
 		    metaslab_class_expandable_space(mc), src);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL,
 		    (spa_mode(spa) == FREAD), src);
 
 		/* Capacity is an integer percentage; 0 for an empty class. */
 		cap = (size == 0) ? 0 : (alloc * 100 / size);
 		spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL,
 		    ddt_get_pool_dedup_ratio(spa), src);
 
 		spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL,
 		    rvd->vdev_state, src);
 
 		/*
 		 * From here on 'src' holds the version's source
 		 * (DEFAULT or LOCAL), and that value is also what the
 		 * freeing, leaked and guid properties below are tagged
 		 * with when a root vdev exists.
 		 */
 		version = spa_version(spa);
 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
 			src = ZPROP_SRC_DEFAULT;
 		else
 			src = ZPROP_SRC_LOCAL;
 		spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src);
 	}
 
 	if (pool != NULL) {
 		/*
 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
 		 * when opening pools before this version freedir will be NULL.
 		 */
 		if (pool->dp_free_dir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL,
 			    pool->dp_free_dir->dd_phys->dd_used_bytes, src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING,
 			    NULL, 0, src);
 		}
 
 		/* Likewise report 0 leaked bytes when there is no leak dir. */
 		if (pool->dp_leak_dir != NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL,
 			    pool->dp_leak_dir->dd_phys->dd_used_bytes, src);
 		} else {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED,
 			    NULL, 0, src);
 		}
 	}
 
 	spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
 	if (spa->spa_comment != NULL) {
 		spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment,
 		    0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_root != NULL)
 		spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root,
 		    0, ZPROP_SRC_LOCAL);
 
 	/*
 	 * Only the head of the config list is consulted.  A NULL path is
 	 * reported as "none"; a path equal to spa_config_path is not
 	 * reported at all.
 	 */
 	if ((dp = list_head(&spa->spa_config_list)) != NULL) {
 		if (dp->scd_path == NULL) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 			    "none", 0, ZPROP_SRC_LOCAL);
 		} else if (strcmp(dp->scd_path, spa_config_path) != 0) {
 			spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE,
 			    dp->scd_path, 0, ZPROP_SRC_LOCAL);
 		}
 	}
 }
 
 /*
  * Get zpool property values.
  *
  * Allocates a new nvlist in *nvp and fills it first with the properties
  * derived from in-core state (spa_prop_get_config()) and then with those
  * stored in the MOS pool property ZAP object, if there is one.
  *
  * Returns 0 on success.  If the ZAP walk ends with an error other than
  * ENOENT, the nvlist is freed, *nvp is set to NULL and that error is
  * returned.
  */
 int
 spa_prop_get(spa_t *spa, nvlist_t **nvp)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	int err;
 
 	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	mutex_enter(&spa->spa_props_lock);
 
 	/*
 	 * Get properties from the spa config.
 	 */
 	spa_prop_get_config(spa, nvp);
 
 	/* If no pool property object, no more prop to get. */
 	if (mos == NULL || spa->spa_pool_props_object == 0) {
 		mutex_exit(&spa->spa_props_lock);
 		return (0);
 	}
 
 	/*
 	 * Get properties from the MOS pool property object.
 	 *
 	 * NOTE(review): the 'break' statements inside the switch below only
 	 * leave the switch, not this loop, so an error from
 	 * dsl_dataset_hold_obj() or zap_lookup() just skips that one
 	 * property; 'err' is then overwritten by the next
 	 * zap_cursor_retrieve().  Confirm this best-effort behavior is
 	 * intended.
 	 */
 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 	    (err = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t intval = 0;
 		char *strval = NULL;
 		zprop_source_t src = ZPROP_SRC_DEFAULT;
 		zpool_prop_t prop;
 
 		/* Ignore ZAP entries that are not known pool properties. */
 		if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL)
 			continue;
 
 		switch (za.za_integer_length) {
 		case 8:
 			/* integer property */
 			if (za.za_first_integer !=
 			    zpool_prop_default_numeric(prop))
 				src = ZPROP_SRC_LOCAL;
 
 			if (prop == ZPOOL_PROP_BOOTFS) {
 				/*
 				 * bootfs is stored as a dataset object
 				 * number but reported as the dataset name.
 				 */
 				dsl_pool_t *dp;
 				dsl_dataset_t *ds = NULL;
 
 				dp = spa_get_dsl(spa);
 				dsl_pool_config_enter(dp, FTAG);
 				if (err = dsl_dataset_hold_obj(dp,
 				    za.za_first_integer, FTAG, &ds)) {
 					dsl_pool_config_exit(dp, FTAG);
 					break;
 				}
 
 				strval = kmem_alloc(
 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1,
 				    KM_SLEEP);
 				dsl_dataset_name(ds, strval);
 				dsl_dataset_rele(ds, FTAG);
 				dsl_pool_config_exit(dp, FTAG);
 			} else {
 				strval = NULL;
 				intval = za.za_first_integer;
 			}
 
 			spa_prop_add_list(*nvp, prop, strval, intval, src);
 
 			/* Free with the same size used for the allocation. */
 			if (strval != NULL)
 				kmem_free(strval,
 				    MAXNAMELEN + strlen(MOS_DIR_NAME) + 1);
 
 			break;
 
 		case 1:
 			/* string property */
 			strval = kmem_alloc(za.za_num_integers, KM_SLEEP);
 			err = zap_lookup(mos, spa->spa_pool_props_object,
 			    za.za_name, 1, za.za_num_integers, strval);
 			if (err) {
 				kmem_free(strval, za.za_num_integers);
 				break;
 			}
 			spa_prop_add_list(*nvp, prop, strval, 0, src);
 			kmem_free(strval, za.za_num_integers);
 			break;
 
 		default:
 			break;
 		}
 	}
 	zap_cursor_fini(&zc);
 	mutex_exit(&spa->spa_props_lock);
 	/* NOTE(review): no 'goto out' is visible in this function. */
 out:
 	/* ENOENT is the normal end-of-cursor result, not a failure. */
 	if (err && err != ENOENT) {
 		nvlist_free(*nvp);
 		*nvp = NULL;
 		return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Validate the given pool properties nvlist and modify the list
  * for the property values to be set.
  *
  * Every pair in 'props' is checked according to its property; the walk
  * stops at the first failure.  If a bootfs property was present and
  * everything validated, its string value in 'props' is replaced by the
  * uint64 object number of the named dataset (or the property's default
  * numeric value for an empty name).
  *
  * Returns 0 if all properties are acceptable, otherwise an errno value
  * (EINVAL, ENOTSUP, E2BIG, EIO, or an error from the nvpair/dataset
  * routines).  EIO is returned deliberately for failmode on a suspended
  * pool; see the comment in that case.
  */
 static int
 spa_prop_validate(spa_t *spa, nvlist_t *props)
 {
 	nvpair_t *elem;
 	int error = 0, reset_bootfs = 0;
 	uint64_t objnum = 0;
 	boolean_t has_feature = B_FALSE;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 		uint64_t intval;
 		char *strval, *slash, *check, *fname;
 		const char *propname = nvpair_name(elem);
 		zpool_prop_t prop = zpool_name_to_prop(propname);
 
 		switch (prop) {
 		case ZPROP_INVAL:
 			/* Not a named pool property: must be a feature. */
 			if (!zpool_prop_feature(propname)) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			/*
 			 * Sanitize the input.
 			 */
 			if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			if (nvpair_value_uint64(elem, &intval) != 0) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			/* The only value accepted for a feature is 0. */
 			if (intval != 0) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			/* The feature name is the part after the '@'. */
 			fname = strchr(propname, '@') + 1;
 			if (zfeature_lookup_name(fname, NULL) != 0) {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			has_feature = B_TRUE;
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			/*
 			 * The version may not go backwards, may not exceed
 			 * the last pre-feature-flags version, and may not
 			 * follow a feature property in the same list.
 			 */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error &&
 			    (intval < spa_version(spa) ||
 			    intval > SPA_VERSION_BEFORE_FEATURES ||
 			    has_feature))
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_DELEGATION:
 		case ZPOOL_PROP_AUTOREPLACE:
 		case ZPOOL_PROP_LISTSNAPS:
 		case ZPOOL_PROP_AUTOEXPAND:
 			/* Boolean properties: only 0 and 1 are valid. */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_BOOTFS:
 			/*
 			 * If the pool version is less than SPA_VERSION_BOOTFS,
 			 * or the pool is still being created (version == 0),
 			 * the bootfs property cannot be set.
 			 */
 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			/*
 			 * Make sure the vdev config is bootable
 			 */
 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			reset_bootfs = 1;
 
 			error = nvpair_value_string(elem, &strval);
 
 			if (!error) {
 				objset_t *os;
 				uint64_t compress;
 
 				/* An empty name resets bootfs to its default. */
 				if (strval == NULL || strval[0] == '\0') {
 					objnum = zpool_prop_default_numeric(
 					    ZPOOL_PROP_BOOTFS);
 					break;
 				}
 
 				error = dmu_objset_hold(strval, FTAG, &os);
 				if (error != 0)
 					break;
 
 				/* Must be ZPL and not gzip compressed. */
 
 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
 					error = SET_ERROR(ENOTSUP);
 				} else if ((error =
 				    dsl_prop_get_int_ds(dmu_objset_ds(os),
 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 				    &compress)) == 0 &&
 				    !BOOTFS_COMPRESS_VALID(compress)) {
 					error = SET_ERROR(ENOTSUP);
 				} else {
 					objnum = dmu_objset_id(os);
 				}
 				dmu_objset_rele(os, FTAG);
 			}
 			break;
 
 		case ZPOOL_PROP_FAILUREMODE:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && (intval < ZIO_FAILURE_MODE_WAIT ||
 			    intval > ZIO_FAILURE_MODE_PANIC))
 				error = SET_ERROR(EINVAL);
 
 			/*
 			 * This is a special case which only occurs when
 			 * the pool has completely failed. This allows
 			 * the user to change the in-core failmode property
 			 * without syncing it out to disk (I/Os might
 			 * currently be blocked). We do this by returning
 			 * EIO to the caller (spa_prop_set) to trick it
 			 * into thinking we encountered a property validation
 			 * error.
 			 */
 			if (!error && spa_suspended(spa)) {
 				spa->spa_failmode = intval;
 				error = SET_ERROR(EIO);
 			}
 			break;
 
 		case ZPOOL_PROP_CACHEFILE:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 
 			/* "" and "none" are accepted as-is. */
 			if (strval[0] == '\0')
 				break;
 
 			if (strcmp(strval, "none") == 0)
 				break;
 
 			/* Anything else must be an absolute path ... */
 			if (strval[0] != '/') {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			slash = strrchr(strval, '/');
 			ASSERT(slash != NULL);
 
 			/* ... whose last component is not empty, "." or "..". */
 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 			    strcmp(slash, "/..") == 0)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_COMMENT:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 			/*
 			 * Examine every character exactly once.  The loop
 			 * header is the only place 'check' is advanced; a
 			 * second increment in the body would skip every
 			 * other character and could step over the
 			 * terminating NUL.
 			 */
 			for (check = strval; *check != '\0'; check++) {
 				/*
 				 * The kernel doesn't have an easy isprint()
 				 * check.  For this kernel check, we merely
 				 * check ASCII apart from DEL.  Fix this if
 				 * there is an easy-to-use kernel isprint().
 				 *
 				 * NOTE(review): where plain char is signed,
 				 * bytes >= 0x80 compare as negative and are
 				 * accepted here -- confirm whether that is
 				 * intended.
 				 */
 				if (*check >= 0x7f) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 			}
 			if (strlen(strval) > ZPROP_MAX_COMMENT)
 				error = SET_ERROR(E2BIG);
 			break;
 
 		case ZPOOL_PROP_DEDUPDITTO:
 			if (spa_version(spa) < SPA_VERSION_DEDUP)
 				error = SET_ERROR(ENOTSUP);
 			else
 				error = nvpair_value_uint64(elem, &intval);
 			/* 0 is allowed; otherwise enforce the minimum. */
 			if (error == 0 &&
 			    intval != 0 && intval < ZIO_DEDUPDITTO_MIN)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		default:
 			/* Other properties get no validation here. */
 			break;
 		}
 
 		if (error)
 			break;
 	}
 
 	/*
 	 * Swap the bootfs dataset name for its object number so that the
 	 * list carries the value that is to be stored.
 	 */
 	if (!error && reset_bootfs) {
 		error = nvlist_remove(props,
 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 
 		if (!error) {
 			error = nvlist_add_uint64(props,
 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Record a new cache file location for the pool from the cachefile
  * property in 'nvp', if that property is present.  A new entry is put at
  * the head of spa_config_list: "none" yields a NULL path, an empty string
  * yields a copy of spa_config_path, and anything else is copied verbatim.
  * When 'need_sync' is set, an asynchronous config update is requested.
  */
 void
 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 {
 	spa_config_dirent_t *entry;
 	char *cachefile;
 	int lookup_err;
 
 	lookup_err = nvlist_lookup_string(nvp,
 	    zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile);
 	if (lookup_err != 0)
 		return;
 
 	entry = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);
 
 	if (strcmp(cachefile, "none") == 0) {
 		entry->scd_path = NULL;
 	} else if (cachefile[0] == '\0') {
 		entry->scd_path = spa_strdup(spa_config_path);
 	} else {
 		entry->scd_path = spa_strdup(cachefile);
 	}
 
 	list_insert_head(&spa->spa_config_list, entry);
 
 	if (need_sync) {
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 }
 
 /*
  * Validate and apply the pool properties in 'nvp'.
  *
  * cachefile, altroot and readonly are skipped here.  A version property
  * (or a feature property, which implies SPA_VERSION_FEATURES) is applied
  * immediately through its own sync task unless the pool is already at
  * that version.  The first property of any other kind ends the scan and
  * causes the whole list to be handed to spa_sync_props().
  *
  * Returns 0 on success or the first error from validation or from a
  * sync task.
  */
 int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
 	nvpair_t *pair = NULL;
 	boolean_t sync_props = B_FALSE;
 	int error;
 
 	error = spa_prop_validate(spa, nvp);
 	if (error != 0)
 		return (error);
 
 	while ((pair = nvlist_next_nvpair(nvp, pair)) != NULL) {
 		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(pair));
 		uint64_t ver;
 
 		/* These properties are not handled by this loop. */
 		if (prop == ZPOOL_PROP_CACHEFILE ||
 		    prop == ZPOOL_PROP_ALTROOT ||
 		    prop == ZPOOL_PROP_READONLY)
 			continue;
 
 		/* Any ordinary property: sync the whole list and stop. */
 		if (prop != ZPOOL_PROP_VERSION && prop != ZPROP_INVAL) {
 			sync_props = B_TRUE;
 			break;
 		}
 
 		if (prop == ZPOOL_PROP_VERSION) {
 			VERIFY(nvpair_value_uint64(pair, &ver) == 0);
 		} else {
 			ASSERT(zpool_prop_feature(nvpair_name(pair)));
 			ver = SPA_VERSION_FEATURES;
 			sync_props = B_TRUE;
 		}
 
 		/* Save time if the version is already set. */
 		if (ver == spa_version(spa))
 			continue;
 
 		/*
 		 * In addition to the pool directory object, we might
 		 * create the pool properties object, the features for
 		 * read object, the features for write object, or the
 		 * feature descriptions object.
 		 */
 		error = dsl_sync_task(spa->spa_name, NULL,
 		    spa_sync_version, &ver,
 		    6, ZFS_SPACE_CHECK_RESERVED);
 		if (error)
 			return (error);
 	}
 
 	if (!sync_props)
 		return (0);
 
 	return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
 	    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
 }
 
 /*
  * Forget the pool's bootfs setting if it currently refers to 'dsobj'.
  * Does nothing when bootfs names a different dataset or when the pool
  * has no properties object; otherwise the ZAP entry is removed (failure
  * trips VERIFY) and the in-core value is zeroed.
  */
 void
 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 {
 	if (spa->spa_bootfs != dsobj || spa->spa_pool_props_object == 0)
 		return;
 
 	VERIFY(zap_remove(spa->spa_meta_objset,
 	    spa->spa_pool_props_object,
 	    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 	spa->spa_bootfs = 0;
 }
 
 /*
  * Sync-task check function for a pool GUID change: the change is refused
  * with ENXIO unless the root vdev is in the HEALTHY state.  'arg' points
  * at the proposed new GUID.
  */
 /*ARGSUSED*/
 static int
 spa_change_guid_check(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t *proposed = arg;
 	uint64_t state;
 
 	/* Sample the root vdev state under the SCL_STATE config lock. */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	state = root->vdev_state;
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (state != VDEV_STATE_HEALTHY) {
 		return (SET_ERROR(ENXIO));
 	}
 
 	ASSERT3U(spa_guid(spa), !=, *proposed);
 
 	return (0);
 }
 
 /*
  * Sync-task function for a pool GUID change.  Installs the new GUID
  * (pointed to by 'arg') on the root vdev, adjusts the root's guid sum by
  * the difference between new and old, marks the vdev config dirty, and
  * records the change in the pool history.
  */
 static void
 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t *newguid = arg;
 	uint64_t prev;
 
 	prev = spa_guid(spa);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	root->vdev_guid = *newguid;
 	root->vdev_guid_sum += (*newguid - prev);
 	vdev_config_dirty(root);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
 	    prev, *newguid);
 }
 
 /*
  * Give the pool a new GUID so that a pool built from a clone of our own
  * vdevs can later be imported alongside it.  The root vdev's guid and
  * the pool guid are changed and all vdevs are marked dirty.  All vdevs
  * must be online when this is done, otherwise any that were absent would
  * be orphaned from the pool.  On success the config is synced and a
  * sysevent is posted for any watchers.
  *
  * Returns 0 on success or the error from the sync task.
  */
 int
 spa_change_guid(spa_t *spa)
 {
 	uint64_t newguid;
 	int rc;
 
 	/* spa_vdev_top_lock is taken before spa_namespace_lock. */
 	mutex_enter(&spa->spa_vdev_top_lock);
 	mutex_enter(&spa_namespace_lock);
 
 	newguid = spa_generate_guid(NULL);
 	rc = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 	    spa_change_guid_sync, &newguid, 5, ZFS_SPACE_CHECK_RESERVED);
 
 	if (rc == 0) {
 		spa_config_sync(spa, B_FALSE, B_TRUE);
 		spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID);
 	}
 
 	/* Release in the reverse order of acquisition. */
 	mutex_exit(&spa_namespace_lock);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (rc);
 }
 
 /*
  * ==========================================================================
  * SPA state manipulation (open/create/destroy/import/export)
  * ==========================================================================
  */
 
 /*
  * AVL comparator for spa_error_entry_t nodes.  Entries are ordered by the
  * raw bytes of their bookmark (se_bookmark); returns -1, 0 or 1.
  */
 static int
 spa_error_entry_compare(const void *a, const void *b)
 {
 	const spa_error_entry_t *sa = a;
 	const spa_error_entry_t *sb = b;
 	int ret;
 
 	/*
 	 * memcmp() is required here, not bcmp(): bcmp() only promises
 	 * "zero or non-zero", whereas the tree needs a consistent ordering
 	 * (the sign of the result) to place and find nodes.
 	 */
 	ret = memcmp(&sa->se_bookmark, &sb->se_bookmark,
 	    sizeof (zbookmark_phys_t));
 
 	if (ret < 0)
 		return (-1);
 	else if (ret > 0)
 		return (1);
 	else
 		return (0);
 }
 
 /*
  * Hand the caller copies of the current "last" and "scrub" error trees
  * and re-create both trees in the spa as empty.  The caller must hold
  * spa_errlist_lock.
  */
 void
 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 {
 	const size_t node_size = sizeof (spa_error_entry_t);
 	const size_t node_off = offsetof(spa_error_entry_t, se_avl);
 
 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 
 	/* Structure copies: the caller's trees take over the existing nodes. */
 	bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t));
 	bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t));
 
 	avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare,
 	    node_size, node_off);
 	avl_create(&spa->spa_errlist_last, spa_error_entry_compare,
 	    node_size, node_off);
 }
 
 /*
  * Create the taskqs for zio type 't' / taskq type 'q' as described by
  * zio_taskqs[t][q], and record them in spa->spa_zio_taskq[t][q].
  *
  * ZTI_MODE_NULL entries get no taskqs (count 0, NULL array).  Otherwise
  * 'zti_count' taskqs are created; when there is more than one, each name
  * gets a numeric suffix.  Panics on an unrecognized mode.
  */
 static void
 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 	enum zti_modes mode = ztip->zti_mode;
 	uint_t value = ztip->zti_value;
 	uint_t count = ztip->zti_count;
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	char name[32];
 	uint_t flags = 0;
 	boolean_t batch = B_FALSE;
 
 	if (mode == ZTI_MODE_NULL) {
 		tqs->stqs_count = 0;
 		tqs->stqs_taskq = NULL;
 		return;
 	}
 
 	ASSERT3U(count, >, 0);
 
 	tqs->stqs_count = count;
 	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 
 	switch (mode) {
 	case ZTI_MODE_FIXED:
 		/* Clamp to at least 1 even where the ASSERT compiles out. */
 		ASSERT3U(value, >=, 1);
 		value = MAX(value, 1);
 		break;
 
 	case ZTI_MODE_BATCH:
 		/*
 		 * 'value' is replaced by the zio_taskq_batch_pct tunable;
 		 * presumably TASKQ_THREADS_CPU_PCT makes the taskq code
 		 * treat it as a percentage of CPUs -- confirm in taskq code.
 		 */
 		batch = B_TRUE;
 		flags |= TASKQ_THREADS_CPU_PCT;
 		value = zio_taskq_batch_pct;
 		break;
 
 	default:
 		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 		    "spa_activate()",
 		    zio_type_name[t], zio_taskq_types[q], mode, value);
 		break;
 	}
 
 	for (uint_t i = 0; i < count; i++) {
 		taskq_t *tq;
 
 		if (count > 1) {
 			(void) snprintf(name, sizeof (name), "%s_%s_%u",
 			    zio_type_name[t], zio_taskq_types[q], i);
 		} else {
 			(void) snprintf(name, sizeof (name), "%s_%s",
 			    zio_type_name[t], zio_taskq_types[q]);
 		}
 
 		/*
 		 * With SYSDC compiled in, zio_taskq_sysdc set and a dedicated
 		 * pool process, create a sysdc taskq; in every other case
 		 * fall through to a plain taskq_create_proc() taskq.
 		 */
 #ifdef SYSDC
 		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 			if (batch)
 				flags |= TASKQ_DC_BATCH;
 
 			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 			    spa->spa_proc, zio_taskq_basedc, flags);
 		} else {
 #endif
 			pri_t pri = maxclsyspri;
 			/*
 			 * The write issue taskq can be extremely CPU
 			 * intensive.  Run it at slightly lower priority
 			 * than the other taskqs.
 			 */
 			if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE)
 				pri--;
 
 			tq = taskq_create_proc(name, value, pri, 50,
 			    INT_MAX, spa->spa_proc, flags);
 #ifdef SYSDC
 		}
 #endif
 
 		tqs->stqs_taskq[i] = tq;
 	}
 }
 
 /*
  * Destroy every taskq recorded in spa->spa_zio_taskq[t][q] and free the
  * array that held them.  A slot with no taskq array is left untouched.
  */
 static void
 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	uint_t i;
 
 	if (tqs->stqs_taskq == NULL) {
 		/* Nothing was created for this slot. */
 		ASSERT0(tqs->stqs_count);
 		return;
 	}
 
 	for (i = 0; i < tqs->stqs_count; i++) {
 		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
 		taskq_destroy(tqs->stqs_taskq[i]);
 	}
 
 	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
 	tqs->stqs_taskq = NULL;
 }
 
 /*
  * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority.
  * Note that a type may have multiple discrete taskqs to avoid lock contention
  * on the taskq itself. In that case we choose which taskq to use from the
  * current value of a time counter modulo the number of taskqs: cpu_ticks()
  * in the kernel, gethrtime() otherwise.
  *
  * The slot must have been initialized with at least one taskq.
  */
 void
 spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	taskq_t *tq;
 
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
 	if (tqs->stqs_count == 1) {
 		/* Only one taskq: no need to read the counter. */
 		tq = tqs->stqs_taskq[0];
 	} else {
 #ifdef _KERNEL
 		tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count];
 #else
 		tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count];
 #endif
 	}
 
 	taskq_dispatch_ent(tq, func, arg, flags, ent);
 }
 
 /*
  * Initialize the taskqs for every (zio type, taskq type) combination.
  */
 static void
 spa_create_zio_taskqs(spa_t *spa)
 {
 	int t, q;
 
 	for (t = 0; t < ZIO_TYPES; t++)
 		for (q = 0; q < ZIO_TASKQ_TYPES; q++)
 			spa_taskqs_init(spa, t, q);
 }
 
 #ifdef _KERNEL
 #ifdef SPA_PROCESS
 /*
  * Main function of the per-pool process started by spa_activate() (only
  * built when SPA_PROCESS is defined).  'arg' is the spa_t.
  *
  * It names the process "zpool-<poolname>", optionally binds to a processor
  * set and enters the SDC class, creates the zio taskqs on behalf of this
  * process, and announces SPA_PROC_ACTIVE.  It then sleeps until the state
  * is moved away from ACTIVE (spa_deactivate() sets SPA_PROC_DEACTIVATE),
  * marks itself SPA_PROC_GONE, hands spa_proc back to &p0 and exits.
  */
 static void
 spa_thread(void *arg)
 {
 	callb_cpr_t cprinfo;
 
 	spa_t *spa = arg;
 	user_t *pu = PTOU(curproc);
 
 	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
 	    spa->spa_name);
 
 	ASSERT(curproc != &p0);
 	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
 	    "zpool-%s", spa->spa_name);
 	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
 
 #ifdef PSRSET_BIND
 	/* bind this thread to the requested psrset */
 	if (zio_taskq_psrset_bind != PS_NONE) {
 		pool_lock();
 		mutex_enter(&cpu_lock);
 		mutex_enter(&pidlock);
 		mutex_enter(&curproc->p_lock);
 
 		/* A failed bind is only warned about; the thread carries on. */
 		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
 		    0, NULL, NULL) == 0)  {
 			curthread->t_bind_pset = zio_taskq_psrset_bind;
 		} else {
 			cmn_err(CE_WARN,
 			    "Couldn't bind process for zfs pool \"%s\" to "
 			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
 		}
 
 		mutex_exit(&curproc->p_lock);
 		mutex_exit(&pidlock);
 		mutex_exit(&cpu_lock);
 		pool_unlock();
 	}
 #endif
 
 #ifdef SYSDC
 	if (zio_taskq_sysdc) {
 		sysdc_thread_enter(curthread, 100, 0);
 	}
 #endif
 
 	/* Publish our identity before creating taskqs that use spa_proc. */
 	spa->spa_proc = curproc;
 	spa->spa_did = curthread->t_did;
 
 	spa_create_zio_taskqs(spa);
 
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
 
 	/* Wake spa_activate(), which waits for the state to leave CREATED. */
 	spa->spa_proc_state = SPA_PROC_ACTIVE;
 	cv_broadcast(&spa->spa_proc_cv);
 
 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
 	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
 		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
 
 	/* Wake spa_deactivate(), which waits for the state to leave DEACTIVATE. */
 	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
 	spa->spa_proc_state = SPA_PROC_GONE;
 	spa->spa_proc = &p0;
 	cv_broadcast(&spa->spa_proc_cv);
 	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
 
 	mutex_enter(&curproc->p_lock);
 	lwp_exit();
 }
 #endif	/* SPA_PROCESS */
 #endif
 
 /*
  * Activate an uninitialized pool.
  *
  * Moves the spa to POOL_STATE_ACTIVE with the given open 'mode', creates
  * the normal and log metaslab classes, sets up the zio taskqs (inside a
  * dedicated pool process when SPA_PROCESS is compiled in and that process
  * can be created, otherwise directly here), starts the TRIM thread and
  * initializes the dirty-vdev lists, the vdev txg list and the error-log
  * AVL trees.
  */
 static void
 spa_activate(spa_t *spa, int mode)
 {
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_mode = mode;
 
 	spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops);
 	spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops);
 
 	/* Try to create a covering process */
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
 	ASSERT(spa->spa_proc == &p0);
 	spa->spa_did = 0;
 
 #ifdef SPA_PROCESS
 	/* Only create a process if we're going to be around a while. */
 	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
 		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
 		    NULL, 0) == 0) {
 			/* Wait for spa_thread() to report SPA_PROC_ACTIVE. */
 			spa->spa_proc_state = SPA_PROC_CREATED;
 			while (spa->spa_proc_state == SPA_PROC_CREATED) {
 				cv_wait(&spa->spa_proc_cv,
 				    &spa->spa_proc_lock);
 			}
 			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 			ASSERT(spa->spa_proc != &p0);
 			ASSERT(spa->spa_did != 0);
 		} else {
 #ifdef _KERNEL
 			cmn_err(CE_WARN,
 			    "Couldn't create process for zfs pool \"%s\"\n",
 			    spa->spa_name);
 #endif
 		}
 	}
 #endif	/* SPA_PROCESS */
 	mutex_exit(&spa->spa_proc_lock);
 
 	/* If we didn't create a process, we need to create our taskqs. */
 	/*
 	 * NOTE(review): this ASSERT would fire if SPA_PROCESS were defined
 	 * and the process above had been created (spa_proc != &p0 then);
 	 * presumably it documents that SPA_PROCESS is never defined in this
 	 * build -- confirm before enabling SPA_PROCESS.
 	 */
 	ASSERT(spa->spa_proc == &p0);
 	if (spa->spa_proc == &p0) {
 		spa_create_zio_taskqs(spa);
 	}
 
 	/*
 	 * Start TRIM thread.
 	 */
 	trim_thread_create(spa);
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_state_dirty_node));
 
 	txg_list_create(&spa->spa_vdev_txg_list,
 	    offsetof(struct vdev, vdev_txg_node));
 
 	avl_create(&spa->spa_errlist_scrub,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_last,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 }
 
 /*
  * Opposite of spa_activate().
  *
  * The pool must already be quiesced: syncing off, no dsl pool, no root
  * vdev and no async zio root.  Tears down the TRIM thread, the txg and
  * dirty lists, all zio taskqs, the metaslab classes and the error-log
  * trees, returns the spa to POOL_STATE_UNINITIALIZED and, if a pool
  * process is running, asks it to exit and waits for it.
  */
 static void
 spa_deactivate(spa_t *spa)
 {
 	ASSERT(spa->spa_sync_on == B_FALSE);
 	ASSERT(spa->spa_dsl_pool == NULL);
 	ASSERT(spa->spa_root_vdev == NULL);
 	ASSERT(spa->spa_async_zio_root == NULL);
 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
 	/*
 	 * Stop TRIM thread in case spa_unload() wasn't called directly
 	 * before spa_deactivate().
 	 */
 	trim_thread_destroy(spa);
 
 	txg_list_destroy(&spa->spa_vdev_txg_list);
 
 	list_destroy(&spa->spa_config_dirty_list);
 	list_destroy(&spa->spa_state_dirty_list);
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 			spa_taskqs_fini(spa, t, q);
 		}
 	}
 
 	metaslab_class_destroy(spa->spa_normal_class);
 	spa->spa_normal_class = NULL;
 
 	metaslab_class_destroy(spa->spa_log_class);
 	spa->spa_log_class = NULL;
 
 	/*
 	 * If this was part of an import or the open otherwise failed, we may
 	 * still have errors left in the queues.  Empty them just in case.
 	 */
 	spa_errlog_drain(spa);
 
 	avl_destroy(&spa->spa_errlist_scrub);
 	avl_destroy(&spa->spa_errlist_last);
 
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 
 	/*
 	 * Handshake with spa_thread(): request DEACTIVATE, then wait until
 	 * the thread has moved the state on to GONE.
 	 */
 	mutex_enter(&spa->spa_proc_lock);
 	if (spa->spa_proc_state != SPA_PROC_NONE) {
 		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
 		cv_broadcast(&spa->spa_proc_cv);
 		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
 			ASSERT(spa->spa_proc != &p0);
 			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 		}
 		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
 		spa->spa_proc_state = SPA_PROC_NONE;
 	}
 	ASSERT(spa->spa_proc == &p0);
 	mutex_exit(&spa->spa_proc_lock);
 
 #ifdef SPA_PROCESS
 	/*
 	 * We want to make sure spa_thread() has actually exited the ZFS
 	 * module, so that the module can't be unloaded out from underneath
 	 * it.
 	 */
 	if (spa->spa_did != 0) {
 		thread_join(spa->spa_did);
 		spa->spa_did = 0;
 	}
 #endif	/* SPA_PROCESS */
 }
 
 /*
  * Build a vdev tree from the nvlist configuration 'nv'.  The vdev for
  * 'nv' itself is allocated through vdev_alloc() (which performs all vdev
  * validation) as child 'id' of 'parent' and returned through '*vdp';
  * every entry of its ZPOOL_CONFIG_CHILDREN array is then parsed
  * recursively beneath it.  This preps the pool before open/creation/import,
  * with each vdev in the CLOSED state.
  *
  * Returns 0 on success.  If the children cannot be looked up or parsed,
  * the partially built vdev is freed, '*vdp' is set to NULL and an error
  * is returned.
  */
 static int
 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
     uint_t id, int atype)
 {
 	nvlist_t **kids;
 	uint_t nkids;
 	int err;
 
 	err = vdev_alloc(spa, vdp, nv, parent, id, atype);
 	if (err != 0)
 		return (err);
 
 	/* Leaf vdevs have nothing beneath them. */
 	if ((*vdp)->vdev_ops->vdev_op_leaf)
 		return (0);
 
 	err = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &kids, &nkids);
 
 	/* An interior vdev without a children array is accepted as-is. */
 	if (err == ENOENT)
 		return (0);
 
 	if (err) {
 		vdev_free(*vdp);
 		*vdp = NULL;
 		return (SET_ERROR(EINVAL));
 	}
 
 	for (int c = 0; c < nkids; c++) {
 		vdev_t *cvd;
 
 		err = spa_config_parse(spa, &cvd, kids[c], *vdp, c, atype);
 		if (err != 0) {
 			vdev_free(*vdp);
 			*vdp = NULL;
 			return (err);
 		}
 	}
 
 	ASSERT(*vdp != NULL);
 
 	return (0);
 }
 
 /*
  * Opposite of spa_load().
  *
  * Caller must hold spa_namespace_lock.  Quiesces the pool (TRIM thread,
  * async tasks, txg sync, outstanding async I/O), then, under SCL_ALL as
  * writer, frees the root vdev tree, closes the dsl pool, unloads the DDT
  * and releases all spare and l2cache vdevs together with their configs.
  */
 static void
 spa_unload(spa_t *spa)
 {
 	int i;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	/*
 	 * Stop TRIM thread.
 	 */
 	trim_thread_destroy(spa);
 
 	/*
 	 * Stop async tasks.
 	 */
 	spa_async_suspend(spa);
 
 	/*
 	 * Stop syncing.
 	 */
 	if (spa->spa_sync_on) {
 		txg_sync_stop(spa->spa_dsl_pool);
 		spa->spa_sync_on = B_FALSE;
 	}
 
 	/*
 	 * Wait for any outstanding async I/O to complete.
 	 */
 	if (spa->spa_async_zio_root != NULL) {
 		(void) zio_wait(spa->spa_async_zio_root);
 		spa->spa_async_zio_root = NULL;
 	}
 
 	bpobj_close(&spa->spa_deferred_bpobj);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * Close all vdevs.
 	 */
 	if (spa->spa_root_vdev)
 		vdev_free(spa->spa_root_vdev);
 	/* vdev_free() on the root is expected to clear spa_root_vdev. */
 	ASSERT(spa->spa_root_vdev == NULL);
 
 	/*
 	 * Close the dsl pool.
 	 */
 	if (spa->spa_dsl_pool) {
 		dsl_pool_close(spa->spa_dsl_pool);
 		spa->spa_dsl_pool = NULL;
 		spa->spa_meta_objset = NULL;
 	}
 
 	ddt_unload(spa);
 
 
 	/*
 	 * Drop and purge level 2 cache
 	 */
 	spa_l2cache_drop(spa);
 
 	/* Release the spare vdevs, their array and their config nvlist. */
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		vdev_free(spa->spa_spares.sav_vdevs[i]);
 	if (spa->spa_spares.sav_vdevs) {
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 		spa->spa_spares.sav_vdevs = NULL;
 	}
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 	}
 	spa->spa_spares.sav_count = 0;
 
 	/* Same for the l2cache vdevs, clearing their stats first. */
 	for (i = 0; i < spa->spa_l2cache.sav_count; i++) {
 		vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
 		vdev_free(spa->spa_l2cache.sav_vdevs[i]);
 	}
 	if (spa->spa_l2cache.sav_vdevs) {
 		kmem_free(spa->spa_l2cache.sav_vdevs,
 		    spa->spa_l2cache.sav_count * sizeof (void *));
 		spa->spa_l2cache.sav_vdevs = NULL;
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 	}
 	spa->spa_l2cache.sav_count = 0;
 
 	spa->spa_async_suspended = 0;
 
 	if (spa->spa_comment != NULL) {
 		spa_strfree(spa->spa_comment);
 		spa->spa_comment = NULL;
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active spares for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  *
  * The caller must hold SCL_ALL as writer.
  */
 static void
 spa_load_spares(spa_t *spa)
 {
 	nvlist_t **spares;
 	uint_t nspares;
 	int i;
 	vdev_t *vd, *tvd;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * First, close and free any existing spare vdevs.
 	 */
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		vd = spa->spa_spares.sav_vdevs[i];
 
 		/*
 		 * Undo the spare registration (spa_spare_add() /
 		 * spa_spare_activate()) performed further below.
 		 */
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL && tvd->vdev_isspare)
 			spa_spare_remove(tvd);
 		vdev_close(vd);
 		vdev_free(vd);
 	}
 
 	if (spa->spa_spares.sav_vdevs)
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 
 	if (spa->spa_spares.sav_config == NULL)
 		nspares = 0;
 	else
 		VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0);
 
 	spa->spa_spares.sav_count = (int)nspares;
 	spa->spa_spares.sav_vdevs = NULL;
 
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Construct the array of vdevs, opening them to get status in the
 	 * process.  For each spare, there are potentially two different
 	 * vdev_t structures associated with it: one in the list of spares
 	 * (used only for basic validation purposes) and one in the active
 	 * vdev configuration (if it's spared in).  During this phase we open
 	 * and validate each vdev on the spare list.  If the vdev also exists
 	 * in the active configuration, then we also mark this vdev as an
 	 * active spare.
 	 */
 	spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0,
 		    VDEV_ALLOC_SPARE) == 0);
 		ASSERT(vd != NULL);
 
 		spa->spa_spares.sav_vdevs[i] = vd;
 
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL) {
 			if (!tvd->vdev_isspare)
 				spa_spare_add(tvd);
 
 			/*
 			 * We only mark the spare active if we were successfully
 			 * able to load the vdev.  Otherwise, importing a pool
 			 * with a bad active spare would result in strange
 			 * behavior, because multiple pools would think the
 			 * spare is actively in use.
 			 *
 			 * There is a vulnerability here to an equally bizarre
 			 * circumstance, where a dead active spare is later
 			 * brought back to life (onlined or otherwise).  Given
 			 * the rarity of this scenario, and the extra complexity
 			 * it adds, we ignore the possibility.
 			 */
 			if (!vdev_is_dead(tvd))
 				spa_spare_activate(tvd);
 		}
 
 		vd->vdev_top = vd;
 		vd->vdev_aux = &spa->spa_spares;
 
 		/* A spare that fails to open stays listed but is not added. */
 		if (vdev_open(vd) != 0)
 			continue;
 
 		if (vdev_validate_aux(vd) == 0)
 			spa_spare_add(vd);
 	}
 
 	/*
 	 * Recompute the stashed list of spares, with status information
 	 * this time.
 	 */
 	VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 
 	/* 'spares' is reused here for a temporary array of generated configs. */
 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		spares[i] = vdev_config_generate(spa,
 		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
 	VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		nvlist_free(spares[i]);
 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active l2cache for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  * Devices which are already active have their details maintained, and are
  * not re-opened.
  *
  * The caller must hold SCL_ALL as writer.
  */
 static void
 spa_load_l2cache(spa_t *spa)
 {
 	nvlist_t **l2cache;
 	uint_t nl2cache;
 	int i, j, oldnvdevs;
 	uint64_t guid;
 	vdev_t *vd, **oldvdevs, **newvdevs;
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	if (sav->sav_config != NULL) {
 		VERIFY(nvlist_lookup_nvlist_array(sav->sav_config,
 		    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0);
 		newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
 	} else {
 		nl2cache = 0;
 		newvdevs = NULL;
 	}
 
 	/* Detach the old array; entries carried over are NULLed out below. */
 	oldvdevs = sav->sav_vdevs;
 	oldnvdevs = sav->sav_count;
 	sav->sav_vdevs = NULL;
 	sav->sav_count = 0;
 
 	/*
 	 * Process new nvlist of vdevs.
 	 */
 	for (i = 0; i < nl2cache; i++) {
 		VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID,
 		    &guid) == 0);
 
 		newvdevs[i] = NULL;
 		for (j = 0; j < oldnvdevs; j++) {
 			vd = oldvdevs[j];
 			if (vd != NULL && guid == vd->vdev_guid) {
 				/*
 				 * Retain previous vdev for add/remove ops.
 				 */
 				newvdevs[i] = vd;
 				oldvdevs[j] = NULL;
 				break;
 			}
 		}
 
 		if (newvdevs[i] == NULL) {
 			/*
 			 * Create new vdev
 			 */
 			VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
 			    VDEV_ALLOC_L2CACHE) == 0);
 			ASSERT(vd != NULL);
 			newvdevs[i] = vd;
 
 			/*
 			 * Commit this vdev as an l2cache device,
 			 * even if it fails to open.
 			 */
 			spa_l2cache_add(vd);
 
 			vd->vdev_top = vd;
 			vd->vdev_aux = sav;
 
 			spa_l2cache_activate(vd);
 
 			if (vdev_open(vd) != 0)
 				continue;
 
 			(void) vdev_validate_aux(vd);
 
 			if (!vdev_is_dead(vd))
 				l2arc_add_vdev(spa, vd);
 		}
 	}
 
 	/*
 	 * Purge vdevs that were dropped
 	 */
 	for (i = 0; i < oldnvdevs; i++) {
 		uint64_t pool;
 
 		vd = oldvdevs[i];
 		if (vd != NULL) {
 			ASSERT(vd->vdev_isl2cache);
 
 			if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 			    pool != 0ULL && l2arc_vdev_present(vd))
 				l2arc_remove_vdev(vd);
 			vdev_clear_stats(vd);
 			vdev_free(vd);
 		}
 	}
 
 	if (oldvdevs)
 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
 
 	/*
 	 * With no config, sav_count is still 0 here, so the cleanup at 'out'
 	 * never touches the (uninitialized) 'l2cache' pointer.
 	 */
 	if (sav->sav_config == NULL)
 		goto out;
 
 	sav->sav_vdevs = newvdevs;
 	sav->sav_count = (int)nl2cache;
 
 	/*
 	 * Recompute the stashed list of l2cache devices, with status
 	 * information this time.
 	 */
 	VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 
 	/* 'l2cache' is reused for a temporary array of generated configs. */
 	l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 	for (i = 0; i < sav->sav_count; i++)
 		l2cache[i] = vdev_config_generate(spa,
 		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0);
 out:
 	for (i = 0; i < sav->sav_count; i++)
 		nvlist_free(l2cache[i]);
 	if (sav->sav_count)
 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
 }
 
 /*
  * Read the packed nvlist stored in MOS object 'obj' and unpack it into
  * '*value'.  The packed size is taken from the first 64-bit word of the
  * object's bonus buffer.  '*value' is NULL on entry to the read, so it
  * stays NULL if the bonus hold or the read fails.  Returns 0 or an error.
  */
 static int
 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 {
 	dmu_buf_t *bonus;
 	size_t len;
 	char *buf;
 	int err;
 
 	*value = NULL;
 
 	err = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &bonus);
 	if (err != 0)
 		return (err);
 	len = *(uint64_t *)bonus->db_data;
 	dmu_buf_rele(bonus, FTAG);
 
 	/* Pull the packed bytes into a scratch buffer and unpack from it. */
 	buf = kmem_alloc(len, KM_SLEEP);
 	err = dmu_read(spa->spa_meta_objset, obj, 0, len, buf,
 	    DMU_READ_PREFETCH);
 	if (err == 0)
 		err = nvlist_unpack(buf, len, value, 0);
 	kmem_free(buf, len);
 
 	return (err);
 }
 
 /*
  * Walk the vdev tree rooted at 'vd' (children first).  For every leaf
  * that is dead and is not a hole, post an autoreplace request and an
  * ESC_ZFS_VDEV_CHECK sysevent so the autoreplace code learns that the
  * device has been removed.
  */
 static void
 spa_check_removed(vdev_t *vd)
 {
 	for (int c = 0; c < vd->vdev_children; c++)
 		spa_check_removed(vd->vdev_child[c]);
 
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_dead(vd) ||
 	    vd->vdev_ishole)
 		return;
 
 	zfs_post_autoreplace(vd->vdev_spa, vd);
 	spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK);
 }
 
 /*
  * Validate the current config against the MOS config.
  *
  * The vdev tree in 'config' (the MOS config) is parsed into a temporary
  * tree and compared, top-level vdev by top-level vdev, with the pool's
  * current root vdev.  Missing log devices are either reported through
  * spa_load_info (normal import) or swapped in from the MOS tree (when
  * ZFS_IMPORT_MISSING_LOG is set).  Returns B_TRUE if, afterwards, the
  * root vdev's guid sum matches the one recorded in the uberblock.
  */
 static boolean_t
 spa_config_valid(spa_t *spa, nvlist_t *config)
 {
 	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
 	nvlist_t *nv;
 
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0);
 
 	ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children);
 
 	/*
 	 * If we're doing a normal import, then build up any additional
 	 * diagnostic information about missing devices in this config.
 	 * We'll pass this up to the user for further processing.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
 		nvlist_t **child, *missing;
 		uint64_t idx = 0;
 
 		/* Worst case: every top-level vdev is a missing log. */
 		child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *),
 		    KM_SLEEP);
 		VERIFY(nvlist_alloc(&missing, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 			vdev_t *mtvd  = mrvd->vdev_child[c];
 
 			if (tvd->vdev_ops == &vdev_missing_ops &&
 			    mtvd->vdev_ops != &vdev_missing_ops &&
 			    mtvd->vdev_islog)
 				child[idx++] = vdev_config_generate(spa, mtvd,
 				    B_FALSE, 0);
 		}
 
 		if (idx) {
 			VERIFY(nvlist_add_nvlist_array(missing,
 			    ZPOOL_CONFIG_CHILDREN, child, idx) == 0);
 			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_MISSING_DEVICES, missing) == 0);
 
 			for (uint64_t i = 0; i < idx; i++)
 				nvlist_free(child[i]);
 		}
 		nvlist_free(missing);
 		kmem_free(child, rvd->vdev_children * sizeof (nvlist_t *));
 	}
 
 	/*
 	 * Compare the root vdev tree with the information we have
 	 * from the MOS config (mrvd). Check each top-level vdev
 	 * with the corresponding MOS config top-level (mtvd).
 	 */
 	for (int c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 		vdev_t *mtvd  = mrvd->vdev_child[c];
 
 		/*
 		 * Resolve any "missing" vdevs in the current configuration.
 		 * If we find that the MOS config has more accurate information
 		 * about the top-level vdev then use that vdev instead.
 		 */
 		if (tvd->vdev_ops == &vdev_missing_ops &&
 		    mtvd->vdev_ops != &vdev_missing_ops) {
 
 			if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG))
 				continue;
 
 			/*
 			 * Device specific actions.
 			 */
 			if (mtvd->vdev_islog) {
 				spa_set_log_state(spa, SPA_LOG_CLEAR);
 			} else {
 				/*
 				 * XXX - once we have 'readonly' pool
 				 * support we should be able to handle
 				 * missing data devices by transitioning
 				 * the pool to readonly.
 				 */
 				continue;
 			}
 
 			/*
 			 * Swap the missing vdev with the data we were
 			 * able to obtain from the MOS config.
 			 */
 			vdev_remove_child(rvd, tvd);
 			vdev_remove_child(mrvd, mtvd);
 
 			vdev_add_child(rvd, mtvd);
 			vdev_add_child(mrvd, tvd);
 
 			/* vdev_load() is called with the config lock dropped. */
 			spa_config_exit(spa, SCL_ALL, FTAG);
 			vdev_load(mtvd);
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 			vdev_reopen(rvd);
 		} else if (mtvd->vdev_islog) {
 			/*
 			 * Load the slog device's state from the MOS config
 			 * since it's possible that the label does not
 			 * contain the most up-to-date information.
 			 */
 			vdev_load_log_state(tvd, mtvd);
 			vdev_reopen(tvd);
 		}
 	}
 	vdev_free(mrvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * Ensure we were able to validate the config.
 	 */
 	return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum);
 }
 
 /*
  * Check for missing log devices.  When the log state is MISSING or
  * UNKNOWN, every dataset's log chain is checked; if any check fails the
  * log state is set to SPA_LOG_MISSING and B_TRUE is returned.  In every
  * other log state nothing is checked and B_FALSE is returned.
  */
 static boolean_t
 spa_check_logs(spa_t *spa)
 {
 	boolean_t missing = B_FALSE;
 
 	switch (spa->spa_log_state) {
 	case SPA_LOG_MISSING:
 		/* need to recheck in case slog has been restored */
 	case SPA_LOG_UNKNOWN:
 		missing = (dmu_objset_find(spa->spa_name, zil_check_log_chain,
 		    NULL, DS_FIND_CHILDREN) != 0);
 		if (missing)
 			spa_set_log_state(spa, SPA_LOG_MISSING);
 		break;
 	default:
 		/* Any other state needs no check. */
 		break;
 	}
 
 	return (missing);
 }
 
 /*
  * Passivate the metaslab group of every top-level log vdev.  The caller
  * must hold SCL_ALLOC as writer.  Returns B_TRUE if at least one log vdev
  * was passivated, B_FALSE if the pool has no slogs.
  */
 static boolean_t
 spa_passivate_log(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	boolean_t found = B_FALSE;
 	int c;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	if (!spa_has_slogs(spa))
 		return (B_FALSE);
 
 	for (c = 0; c < root->vdev_children; c++) {
 		vdev_t *top = root->vdev_child[c];
 
 		if (!top->vdev_islog)
 			continue;
 
 		metaslab_group_passivate(top->vdev_mg);
 		found = B_TRUE;
 	}
 
 	return (found);
 }
 
 /*
  * Activate the metaslab group of every top-level log vdev.  The caller
  * must hold SCL_ALLOC as writer.
  */
 static void
 spa_activate_log(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int c;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	for (c = 0; c < root->vdev_children; c++) {
 		vdev_t *top = root->vdev_child[c];
 
 		if (top->vdev_islog)
 			metaslab_group_activate(top->vdev_mg);
 	}
 }
 
 /*
  * Run zil_vdev_offline over every dataset in the pool.  On success, wait
  * for the current txg to sync.  Returns 0 or the error from the walk.
  */
 int
 spa_offline_log(spa_t *spa)
 {
 	int err = dmu_objset_find(spa_name(spa), zil_vdev_offline,
 	    NULL, DS_FIND_CHILDREN);
 
 	if (err != 0)
 		return (err);
 
 	/*
 	 * We successfully offlined the log device, sync out the
 	 * current txg so that the "stubby" block can be removed
 	 * by zil_sync().
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	return (err);
 }
 
 /*
  * Run spa_check_removed() on each vdev of the auxiliary vdev list 'sav'.
  */
 static void
 spa_aux_check_removed(spa_aux_vdev_t *sav)
 {
 	for (int i = 0; i < sav->sav_count; i++)
 		spa_check_removed(sav->sav_vdevs[i]);
 }
 
 /*
  * zio completion hook: on success, raise spa_claim_max_txg to the birth
  * txg of the zio's block pointer if that is larger.  Failed zios are
  * ignored.
  */
 void
 spa_claim_notify(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
 	if (zio->io_error != 0)
 		return;
 
 	mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
 	if (zio->io_bp->blk_birth > spa->spa_claim_max_txg)
 		spa->spa_claim_max_txg = zio->io_bp->blk_birth;
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Error tallies gathered while verifying a pool at load time.  Both
  * counters are bumped atomically by spa_load_verify_done().
  */
 typedef struct spa_load_error {
 	uint64_t	sle_meta_count;	/* failed metadata / indirect reads */
 	uint64_t	sle_data_count;	/* all other failed reads */
 } spa_load_error_t;
 
 /*
  * Completion callback for the reads issued by spa_load_verify_cb().
  * A failed read is counted against sle_meta_count when the block is an
  * indirect (level != 0) or metadata block other than an intent log block,
  * and against sle_data_count otherwise.  The read buffer is then freed and
  * the in-flight scrub I/O count is dropped, waking any waiters.
  */
 static void
 spa_load_verify_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	spa_load_error_t *sle = zio->io_private;
 	dmu_object_type_t type = BP_GET_TYPE(bp);
 	int error = zio->io_error;
 	spa_t *spa = zio->io_spa;
 
 	if (error) {
 		if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) &&
 		    type != DMU_OT_INTENT_LOG)
-			atomic_add_64(&sle->sle_meta_count, 1);
+			atomic_inc_64(&sle->sle_meta_count);
 		else
-			atomic_add_64(&sle->sle_data_count, 1);
+			atomic_inc_64(&sle->sle_data_count);
 	}
 	zio_data_buf_free(zio->io_data, zio->io_size);
 
 	/* Pairs with the spa_scrub_inflight++ in spa_load_verify_cb(). */
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_scrub_inflight--;
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Tunables for the verification pass run while importing a pool.
  *
  * spa_load_verify_maxinflight: maximum number of concurrent scrub i/os
  * to create while verifying a pool while importing it.
  * spa_load_verify_metadata: when clear, spa_load_verify() skips the pool
  * traversal entirely and spa_load_verify_cb() issues no reads.
  * spa_load_verify_data: when clear, spa_load_verify_cb() skips blocks
  * whose buffer type is ARC_BUFC_DATA.
  */
 int spa_load_verify_maxinflight = 10000;
 boolean_t spa_load_verify_metadata = B_TRUE;
 boolean_t spa_load_verify_data = B_TRUE;
 
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN,
     &spa_load_verify_maxinflight, 0,
     "Maximum number of concurrent scrub I/Os to create while verifying a "
     "pool while importing it");
 
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN,
     &spa_load_verify_metadata, 0,
     "Check metadata on import?");
  
 SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN,
     &spa_load_verify_data, 0,
     "Check user data on import?");
  
 /*
  * traverse_pool() callback used by spa_load_verify().  For each block that
  * is neither a hole nor embedded (and not filtered out by the tunables),
  * issue an asynchronous raw scrub read as a child of the root zio passed
  * in 'arg'; spa_load_verify_done() runs on completion.  The number of
  * reads in flight is throttled to spa_load_verify_maxinflight.
  * Always returns 0.
  */
 /*ARGSUSED*/
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return (0);
 	/*
 	 * Note: normally this routine will not be called if
 	 * spa_load_verify_metadata is not set.  However, it may be useful
 	 * to manually set the flag after the traversal has begun.
 	 */
 	if (!spa_load_verify_metadata)
 		return (0);
 	if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data)
 		return (0);
 
 	zio_t *rio = arg;
 	size_t size = BP_GET_PSIZE(bp);
 	void *data = zio_data_buf_alloc(size);
 
 	/* Block until we are below the in-flight limit, then take a slot. */
 	mutex_enter(&spa->spa_scrub_lock);
 	while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight)
 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 	spa->spa_scrub_inflight++;
 	mutex_exit(&spa->spa_scrub_lock);
 
 	/* 'data' is freed by spa_load_verify_done(). */
 	zio_nowait(zio_read(rio, spa, bp, data, size,
 	    spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB,
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 	return (0);
 }
 
 /*
  * Verify a pool being loaded by traversing it from spa_verify_min_txg and
  * reading every block through spa_load_verify_cb(), then compare the error
  * counts with the limits of the pool's rewind policy.
  *
  * Returns 0 immediately if the policy requests ZPOOL_NEVER_REWIND.
  * Otherwise returns 0 when the traversal succeeded and both error counts
  * are within the policy limits (load txg, load time, rewind time and data
  * error count are then recorded), the traversal error (ENXIO or EIO; any
  * other value is mapped to EIO) when it failed, or EIO when the error
  * counts exceed the limits.
  */
 static int
 spa_load_verify(spa_t *spa)
 {
 	zio_t *rio;
 	spa_load_error_t sle = { 0 };
 	zpool_rewind_policy_t policy;
 	boolean_t verify_ok = B_FALSE;
 	int error = 0;
 
 	zpool_get_rewind_policy(spa->spa_config, &policy);
 
 	if (policy.zrp_request & ZPOOL_NEVER_REWIND)
 		return (0);
 
 	/* Root zio; its private data (&sle) is handed to every child read. */
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
 	if (spa_load_verify_metadata) {
 		error = traverse_pool(spa, spa->spa_verify_min_txg,
 		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA,
 		    spa_load_verify_cb, rio);
 	}
 
 	/* Wait for all reads issued by the callback to finish. */
 	(void) zio_wait(rio);
 
 	spa->spa_load_meta_errors = sle.sle_meta_count;
 	spa->spa_load_data_errors = sle.sle_data_count;
 
 	if (!error && sle.sle_meta_count <= policy.zrp_maxmeta &&
 	    sle.sle_data_count <= policy.zrp_maxdata) {
 		int64_t loss = 0;
 
 		verify_ok = B_TRUE;
 		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
 		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 		/* Time between the last synced uberblock and the loaded one. */
 		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
 		VERIFY(nvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0);
 		VERIFY(nvlist_add_int64(spa->spa_load_info,
 		    ZPOOL_CONFIG_REWIND_TIME, loss) == 0);
 		VERIFY(nvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0);
 	} else {
 		/* Record the failing uberblock's txg as the new load maximum. */
 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
 	}
 
 	if (error) {
 		if (error != ENXIO && error != EIO)
 			error = SET_ERROR(EIO);
 		return (error);
 	}
 
 	return (verify_ok ? 0 : EIO);
 }
 
 /*
  * Look up the uint64_t value of pool property 'prop' in the pool props
  * object and store it through 'val'.  The zap_lookup() result is
  * deliberately discarded, so the caller gets no indication of whether the
  * lookup succeeded.
  */
 static void
 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
 {
 	uint64_t props_obj = spa->spa_pool_props_object;
 
 	(void) zap_lookup(spa->spa_meta_objset, props_obj,
 	    zpool_prop_to_name(prop), sizeof (uint64_t), 1, val);
 }
 
 /*
  * Look up the single uint64_t entry called 'name' in the pool directory
  * object (DMU_POOL_DIRECTORY_OBJECT) of the meta objset and store it
  * through 'val'.  Returns whatever zap_lookup() returns.
  */
 static int
 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val)
 {
 	int err;
 
 	err = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    name, sizeof (uint64_t), 1, val);
 
 	return (err);
 }
 
 /*
  * Failure-path helper: put 'vdev' into VDEV_STATE_CANT_OPEN with the
  * auxiliary state 'aux' and hand 'err' back unchanged, so callers can
  * write "return (spa_vdev_err(...))".
  */
 static int
 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 {
 	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
 
 	return (err);
 }
 
 /*
  * Repair the config after a split that did not run to completion.
  *
  * A pending split is recorded in the ZPOOL_CONFIG_SPLIT nvlist.  Both the
  * splitting pool and the split-off pool carry that entry, but only the
  * splitting pool's copy holds ZPOOL_CONFIG_SPLIT_LIST, the guids of the
  * vdevs being split off.
  *
  * Given that list there are two outcomes: rejoin the disks to this pool,
  * or finish the split.  To try the rejoin, every listed vdev that can be
  * found is marked online and the root vdev is reopened.  If, after the
  * reopen, every such vdev reports VDEV_AUX_SPLIT_POOL (its label says it
  * was successfully split off), vdev_split() is called on each and the
  * split is completed.  The split is also completed, without attempting the
  * reopen, when any listed guid cannot be found in this pool.
  *
  * In every other case the config is left alone, with all the vdevs still
  * in place in the original pool.
  */
 static void
 spa_try_repair(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *nvl;
 	uint64_t *glist;
 	uint_t gcount;
 	uint_t extracted;
 	uint_t i;
 	vdev_t **vd;
 	boolean_t attempt_reopen = B_TRUE;
 
 	/* Nothing to do unless a complete split record is present. */
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0 ||
 	    nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
 	    &glist, &gcount) != 0)
 		return;
 
 	vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP);
 
 	/*
 	 * Resolve each guid to a vdev and try to bring it back online.  A
 	 * guid of zero denotes a hole and leaves its slot NULL.
 	 */
 	for (i = 0; i < gcount; i++) {
 		if (glist[i] != 0) {
 			vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE);
 			if (vd[i] != NULL) {
 				/* attempt to re-online it */
 				vd[i]->vdev_offline = B_FALSE;
 			} else {
 				/*
 				 * A listed vdev is gone, so reopening the
 				 * disks is pointless; just do the split.
 				 */
 				attempt_reopen = B_FALSE;
 			}
 		}
 	}
 
 	if (attempt_reopen) {
 		vdev_reopen(spa->spa_root_vdev);
 
 		/*
 		 * Count the leading slots that are either holes or vdevs
 		 * reporting VDEV_AUX_SPLIT_POOL; stop at the first vdev in
 		 * any other state.
 		 */
 		extracted = 0;
 		i = 0;
 		while (i < gcount && (vd[i] == NULL ||
 		    vd[i]->vdev_stat.vs_aux == VDEV_AUX_SPLIT_POOL)) {
 			extracted++;
 			i++;
 		}
 	}
 
 	/*
 	 * Split the vdevs off for good if every one of them has moved to the
 	 * new pool, or if we never even tried to look at them.  'extracted'
 	 * is only read when the reopen was attempted.
 	 */
 	if (!attempt_reopen || extracted == gcount) {
 		for (i = 0; i < gcount; i++) {
 			if (vd[i] != NULL)
 				vdev_split(vd[i]);
 		}
 		vdev_reopen(spa->spa_root_vdev);
 	}
 
 	kmem_free(vd, gcount * sizeof (vdev_t *));
 }
 
 /*
  * Load a pool from spa->spa_config.
  *
  * This wrapper pulls the pool guid, comment, version and config txg out of
  * the config, refuses an import whose guid already exists, and otherwise
  * hands the real work to spa_load_impl().  Afterwards it records the
  * reference count baseline, posts an ereport on failure (except for
  * EBADF), and sets the final load state.
  *
  * Returns 0 on success, EINVAL if the config has no pool guid, EEXIST if
  * an import or tryimport collides with an existing guid, or the error from
  * spa_load_impl().
  */
 static int
 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type,
     boolean_t mosconfig)
 {
 	nvlist_t *config = spa->spa_config;
 	nvlist_t *split;
 	char *ereport = FM_EREPORT_ZFS_POOL;
 	char *comment;
 	uint64_t pool_guid;
 	int error;
 
 	/* A config without a pool guid cannot be loaded at all. */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    &pool_guid) != 0)
 		return (SET_ERROR(EINVAL));
 
 	ASSERT(spa->spa_comment == NULL);
 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
 		spa->spa_comment = spa_strdup(comment);
 
 	/*
 	 * The version was not explicitly stored in the label until later,
 	 * so a config without one is treated as the initial version.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 	    &spa->spa_ubsync.ub_version) != 0)
 		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    &spa->spa_config_txg);
 
 	if ((state != SPA_LOAD_IMPORT && state != SPA_LOAD_TRYIMPORT) ||
 	    !spa_guid_exists(pool_guid, 0)) {
 		spa->spa_config_guid = pool_guid;
 
 		/* Keep a private copy of any pending split record. */
 		if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT,
 		    &split) == 0) {
 			VERIFY(nvlist_dup(split, &spa->spa_config_splitting,
 			    KM_SLEEP) == 0);
 		}
 
 		/* Every load attempt starts with a fresh load-info list. */
 		nvlist_free(spa->spa_load_info);
 		spa->spa_load_info = fnvlist_alloc();
 
 		gethrestime(&spa->spa_loaded_ts);
 		error = spa_load_impl(spa, pool_guid, config, state, type,
 		    mosconfig, &ereport);
 	} else {
 		/* An import may not reuse a guid that already exists. */
 		error = SET_ERROR(EEXIST);
 	}
 
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
 
 	if (error != 0) {
 		/* The load timestamp is cleared for all errors but EEXIST. */
 		if (error != EEXIST) {
 			spa->spa_loaded_ts.tv_sec = 0;
 			spa->spa_loaded_ts.tv_nsec = 0;
 		}
 		if (error != EBADF) {
 			zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0);
 		}
 		/* The load state is updated only after the ereport is posted. */
 		spa->spa_load_state = SPA_LOAD_ERROR;
 	} else {
 		spa->spa_load_state = SPA_LOAD_NONE;
 	}
 	spa->spa_ena = 0;
 
 	return (error);
 }
 
 /*
  * Load an existing storage pool, using the pool's builtin spa_config as a
  * source of configuration information.
  *
  * In outline: parse 'config' into a vdev tree, open and (unless assembling
  * a split-off pool) validate the vdevs, pick the best uberblock, check the
  * pool version and features, initialize and open the DSL pool, read the
  * pool directory entries, load spares, l2cache devices, properties, vdev
  * state and DDTs, validate the config against the MOS config, verify the
  * pool with spa_load_verify(), and finally -- for a writeable pool that is
  * not being loaded with a txg limit, or that is being recovered -- claim
  * log blocks and call txg_sync_start().
  *
  * When 'mosconfig' is false the config is untrusted: the pool is accessed
  * with FREAD, and once the MOS config has been read it replaces
  * spa_config, the pool is unloaded and reactivated with its original mode,
  * and spa_load() is called again with mosconfig set to B_TRUE.
  *
  * Must be called with spa_namespace_lock held (asserted below).
  *
  * Returns 0 on success or an errno value.  '*ereport' is changed to
  * FM_EREPORT_ZFS_LOG_REPLAY when spa_check_logs() reports a problem.
  */
 static int
 spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config,
     spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig,
     char **ereport)
 {
 	int error = 0;
 	nvlist_t *nvroot = NULL;
 	nvlist_t *label;
 	vdev_t *rvd;
 	uberblock_t *ub = &spa->spa_uberblock;
 	uint64_t children, config_cache_txg = spa->spa_config_txg;
 	int orig_mode = spa->spa_mode;
 	int parse;
 	uint64_t obj;
 	boolean_t missing_feat_write = B_FALSE;
 
 	/*
 	 * If this is an untrusted config, access the pool in read-only mode.
 	 * This prevents things like resilvering recently removed devices.
 	 */
 	if (!mosconfig)
 		spa->spa_mode = FREAD;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 
 	spa->spa_load_state = state;
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * SPA_IMPORT_EXISTING parses the vdev tree with VDEV_ALLOC_LOAD; any
 	 * other import type uses VDEV_ALLOC_SPLIT.
 	 */
 	parse = (type == SPA_IMPORT_EXISTING ?
 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
 
 	/*
 	 * Parse the configuration into a vdev tree.  We explicitly set the
 	 * value that will be returned by spa_version() since parsing the
 	 * configuration requires knowing the version number.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0)
 		return (error);
 
 	ASSERT(spa->spa_root_vdev == rvd);
 
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_guid(spa) == pool_guid);
 	}
 
 	/*
 	 * Try to open all vdevs, loading each label in the process.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = vdev_open(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * We need to validate the vdev labels against the configuration that
 	 * we have in hand, which is dependent on the setting of mosconfig. If
 	 * mosconfig is true then we're validating the vdev labels based on
 	 * that config.  Otherwise, we're validating against the cached config
 	 * (zpool.cache) that was read when we loaded the zfs module, and then
 	 * later we will recursively call spa_load() and validate against
 	 * the vdev config.
 	 *
 	 * If we're assembling a new pool that's been split off from an
 	 * existing pool, the labels haven't yet been updated so we skip
 	 * validation for now.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		error = vdev_validate(rvd, mosconfig);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 
 		if (error != 0)
 			return (error);
 
 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
 			return (SET_ERROR(ENXIO));
 	}
 
 	/*
 	 * Find the best uberblock.
 	 */
 	vdev_uberblock_load(rvd, ub, &label);
 
 	/*
 	 * If we weren't able to find a single valid uberblock, return failure.
 	 */
 	if (ub->ub_txg == 0) {
 		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
 	}
 
 	/*
 	 * If the pool has an unsupported version we can't open it.
 	 */
 	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
 		nvlist_free(label);
 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
 	}
 
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *features;
 
 		/*
 		 * If we weren't able to find what's necessary for reading the
 		 * MOS in the label, return failure.
 		 */
 		if (label == NULL || nvlist_lookup_nvlist(label,
 		    ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) {
 			nvlist_free(label);
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    ENXIO));
 		}
 
 		/*
 		 * Update our in-core representation with the definitive values
 		 * from the label.
 		 */
 		nvlist_free(spa->spa_label_features);
 		VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0);
 	}
 
 	/* 'features' was duplicated above, so the label can be released. */
 	nvlist_free(label);
 
 	/*
 	 * Look through entries in the label nvlist's features_for_read. If
 	 * there is a feature listed there which we don't understand then we
 	 * cannot open a pool.
 	 */
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *unsup_feat;
 
 		VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) ==
 		    0);
 
 		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
 		    NULL); nvp != NULL;
 		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
 			if (!zfeature_is_supported(nvpair_name(nvp))) {
 				VERIFY(nvlist_add_string(unsup_feat,
 				    nvpair_name(nvp), "") == 0);
 			}
 		}
 
 		/* Report the unsupported features through the load info. */
 		if (!nvlist_empty(unsup_feat)) {
 			VERIFY(nvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0);
 			nvlist_free(unsup_feat);
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		nvlist_free(unsup_feat);
 	}
 
 	/*
 	 * If the vdev guid sum doesn't match the uberblock, we have an
 	 * incomplete configuration.  We first check to see if the pool
 	 * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN).
 	 * If it is, defer the vdev_guid_sum check till later so we
 	 * can handle missing vdevs.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
 	    &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE &&
 	    rvd->vdev_guid_sum != ub->ub_guid_sum)
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
 
 	/*
 	 * A split was in progress: rejoin the disks or finish the split (see
 	 * spa_try_repair()), then drop the saved split record.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_try_repair(spa, config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		nvlist_free(spa->spa_config_splitting);
 		spa->spa_config_splitting = NULL;
 	}
 
 	/*
 	 * Initialize internal SPA structures.
 	 */
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa->spa_verify_min_txg = spa->spa_extreme_rewind ?
 	    TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
 	spa->spa_first_txg = spa->spa_last_ubsync_txg ?
 	    spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1;
 	spa->spa_claim_max_txg = spa->spa_first_txg;
 	spa->spa_prev_software_version = ub->ub_software_version;
 
 	error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (error)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
 
 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
 		boolean_t missing_feat_read = B_FALSE;
 		nvlist_t *unsup_feat, *enabled_feat;
 
 		/* All three feature objects must exist in the directory. */
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
 		    &spa->spa_feat_for_read_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
 		    &spa->spa_feat_for_write_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
 		    &spa->spa_feat_desc_obj) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		enabled_feat = fnvlist_alloc();
 		unsup_feat = fnvlist_alloc();
 
 		if (!spa_features_check(spa, B_FALSE,
 		    unsup_feat, enabled_feat))
 			missing_feat_read = B_TRUE;
 
 		/*
 		 * The for-write features are only checked when the pool is
 		 * writeable or when this is a tryimport.
 		 */
 		if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) {
 			if (!spa_features_check(spa, B_TRUE,
 			    unsup_feat, enabled_feat)) {
 				missing_feat_write = B_TRUE;
 			}
 		}
 
 		fnvlist_add_nvlist(spa->spa_load_info,
 		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
 
 		if (!nvlist_empty(unsup_feat)) {
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 		}
 
 		fnvlist_free(enabled_feat);
 		fnvlist_free(unsup_feat);
 
 		if (!missing_feat_read) {
 			fnvlist_add_boolean(spa->spa_load_info,
 			    ZPOOL_CONFIG_CAN_RDONLY);
 		}
 
 		/*
 		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
 		 * twofold: to determine whether the pool is available for
 		 * import in read-write mode and (if it is not) whether the
 		 * pool is available for import in read-only mode. If the pool
 		 * is available for import in read-write mode, it is displayed
 		 * as available in userland; if it is not available for import
 		 * in read-only mode, it is displayed as unavailable in
 		 * userland. If the pool is available for import in read-only
 		 * mode but not read-write mode, it is displayed as unavailable
 		 * in userland with a special note that the pool is actually
 		 * available for open in read-only mode.
 		 *
 		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
 		 * missing a feature for write, we must first determine whether
 		 * the pool can be opened read-only before returning to
 		 * userland in order to know whether to display the
 		 * abovementioned note.
 		 */
 		if (missing_feat_read || (missing_feat_write &&
 		    spa_writeable(spa))) {
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		/*
 		 * Load refcounts for ZFS features from disk into an in-memory
 		 * cache during SPA initialization.
 		 */
 		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
 			uint64_t refcount;
 
 			error = feature_get_refcount_from_disk(spa,
 			    &spa_feature_table[i], &refcount);
 			if (error == 0) {
 				spa->spa_feat_refcount_cache[i] = refcount;
 			} else if (error == ENOTSUP) {
 				spa->spa_feat_refcount_cache[i] =
 				    SPA_FEATURE_DISABLED;
 			} else {
 				return (spa_vdev_err(rvd,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 		}
 	}
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
 		    &spa->spa_feat_enabled_txg_obj) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/* spa_is_initializing is set only for the duration of dsl_pool_open(). */
 	spa->spa_is_initializing = B_TRUE;
 	error = dsl_pool_open(spa->spa_dsl_pool);
 	spa->spa_is_initializing = B_FALSE;
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Untrusted config: read the config stored in the MOS, check the
 	 * hostid recorded there, and reload the pool using that config.
 	 */
 	if (!mosconfig) {
 		uint64_t hostid;
 		nvlist_t *policy = NULL, *nvconfig;
 
 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig,
 		    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
 			char *hostname;
 			unsigned long myhostid = 0;
 
 			VERIFY(nvlist_lookup_string(nvconfig,
 			    ZPOOL_CONFIG_HOSTNAME, &hostname) == 0);
 
 #ifdef	_KERNEL
 			myhostid = zone_get_hostid(NULL);
 #else	/* _KERNEL */
 			/*
 			 * We're emulating the system's hostid in userland, so
 			 * we can't use zone_get_hostid().
 			 */
 			(void) ddi_strtoul(hw_serial, NULL, 10, &myhostid);
 #endif	/* _KERNEL */
 			/*
 			 * The mismatch is only enforced when check_hostid is
 			 * set and both hostids are non-zero.
 			 */
 			if (check_hostid && hostid != 0 && myhostid != 0 &&
 			    hostid != myhostid) {
 				nvlist_free(nvconfig);
 				cmn_err(CE_WARN, "pool '%s' could not be "
 				    "loaded as it was last accessed by "
 				    "another system (host: %s hostid: 0x%lx). "
 				    "See: http://illumos.org/msg/ZFS-8000-EY",
 				    spa_name(spa), hostname,
 				    (unsigned long)hostid);
 				return (SET_ERROR(EBADF));
 			}
 		}
 		/* Carry any rewind policy over into the MOS config. */
 		if (nvlist_lookup_nvlist(spa->spa_config,
 		    ZPOOL_REWIND_POLICY, &policy) == 0)
 			VERIFY(nvlist_add_nvlist(nvconfig,
 			    ZPOOL_REWIND_POLICY, policy) == 0);
 
 		/*
 		 * Install the MOS config, tear the pool down, reactivate it
 		 * with the mode it had on entry, and load it again with
 		 * mosconfig set.
 		 */
 		spa_config_set(spa, nvconfig);
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_activate(spa, orig_mode);
 
 		return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE));
 	}
 
 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the bit that tells us to use the new accounting function
 	 * (raid-z deflation).  If we have an older pool, this will not
 	 * be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
 	    &spa->spa_creation_version);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the persistent error log.  If we have an older pool, this will
 	 * not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
 	    &spa->spa_errlog_scrub);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the history object.  If we have an older pool, this
 	 * will not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * If we're assembling the pool from the split-off vdevs of
 	 * an existing pool, we don't want to attach the spares & cache
 	 * devices.
 	 */
 
 	/*
 	 * Load any hot spares for this pool.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
 		if (load_nvlist(spa, spa->spa_spares.sav_object,
 		    &spa->spa_spares.sav_config) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	} else if (error == 0) {
 		/* Assembling a split-off pool: only flag the spares for sync. */
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Load any level 2 ARC devices for this pool.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_L2CACHE,
 	    &spa->spa_l2cache.sav_object);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	if (error == 0 && type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
 		if (load_nvlist(spa, spa->spa_l2cache.sav_object,
 		    &spa->spa_l2cache.sav_config) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	} else if (error == 0) {
 		/* Assembling a split-off pool: only flag the l2cache for sync. */
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/* Start from the default; overridden below if the props object exists. */
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 
 	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object);
 	if (error && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (error == 0) {
 		uint64_t autoreplace;
 
 		/*
 		 * NOTE(review): spa_prop_find() discards the lookup result,
 		 * so 'autoreplace' is presumably left unset when the
 		 * property is absent -- confirm zap_lookup() behavior.
 		 */
 		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
 		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
 		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
 		spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO,
 		    &spa->spa_dedup_ditto);
 
 		spa->spa_autoreplace = (autoreplace != 0);
 	}
 
 	/*
 	 * If the 'autoreplace' property is set, then post a resource notifying
 	 * the ZFS DE that it should not issue any faults for unopenable
 	 * devices.  We also iterate over the vdevs, and post a sysevent for any
 	 * unopenable vdevs so that the normal autoreplace handler can take
 	 * over.
 	 */
 	if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) {
 		spa_check_removed(spa->spa_root_vdev);
 		/*
 		 * For the import case, this is done in spa_import(), because
 		 * at this point we're using the spare definitions from
 		 * the MOS config, not necessarily from the userland config.
 		 */
 		if (state != SPA_LOAD_IMPORT) {
 			spa_aux_check_removed(&spa->spa_spares);
 			spa_aux_check_removed(&spa->spa_l2cache);
 		}
 	}
 
 	/*
 	 * Load the vdev state for all toplevel vdevs.
 	 */
 	vdev_load(rvd);
 
 	/*
 	 * Propagate the leaf DTLs we just loaded all the way up the tree.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_dtl_reassess(rvd, 0, 0, B_FALSE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * Load the DDTs (dedup tables).
 	 */
 	error = ddt_load(spa);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	spa_update_dspace(spa);
 
 	/*
 	 * Validate the config, using the MOS config to fill in any
 	 * information which might be missing.  If we fail to validate
 	 * the config then declare the pool unfit for use. If we're
 	 * assembling a pool from a split, the log is not transferred
 	 * over.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		nvlist_t *nvconfig;
 
 		if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 		if (!spa_config_valid(spa, nvconfig)) {
 			nvlist_free(nvconfig);
 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
 			    ENXIO));
 		}
 		nvlist_free(nvconfig);
 
 		/*
 		 * Now that we've validated the config, check the state of the
 		 * root vdev.  If it can't be opened, it indicates one or
 		 * more toplevel vdevs are faulted.
 		 */
 		if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN)
 			return (SET_ERROR(ENXIO));
 
 		/* A log problem is reported with its own ereport class. */
 		if (spa_check_logs(spa)) {
 			*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
 			return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO));
 		}
 	}
 
 	if (missing_feat_write) {
 		ASSERT(state == SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * At this point, we know that we can open the pool in
 		 * read-only mode but not read-write mode. We now have enough
 		 * information and can return to userland.
 		 */
 		return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP));
 	}
 
 	/*
 	 * We've successfully opened the pool, verify that we're ready
 	 * to start pushing transactions.
 	 */
 	if (state != SPA_LOAD_TRYIMPORT) {
 		/* Intentional assignment: keep the verify error to return. */
 		if (error = spa_load_verify(spa))
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    error));
 	}
 
 	/*
 	 * The remaining work modifies the pool, so it is only done for a
 	 * writeable pool that is either being recovered or being loaded
 	 * without a txg limit (spa_load_max_txg == UINT64_MAX).
 	 */
 	if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER ||
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		dmu_tx_t *tx;
 		int need_update = B_FALSE;
 
 		ASSERT(state != SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * Claim log blocks that haven't been committed yet.
 		 * This must all happen in a single txg.
 		 * Note: spa_claim_max_txg is updated by spa_claim_notify(),
 		 * invoked from zil_claim_log_block()'s i/o done callback.
 		 * Price of rollback is that we abandon the log.
 		 */
 		spa->spa_claiming = B_TRUE;
 
 		tx = dmu_tx_create_assigned(spa_get_dsl(spa),
 		    spa_first_txg(spa));
 		(void) dmu_objset_find(spa_name(spa),
 		    zil_claim, tx, DS_FIND_CHILDREN);
 		dmu_tx_commit(tx);
 
 		spa->spa_claiming = B_FALSE;
 
 		spa_set_log_state(spa, SPA_LOG_GOOD);
 		spa->spa_sync_on = B_TRUE;
 		txg_sync_start(spa->spa_dsl_pool);
 
 		/*
 		 * Wait for all claims to sync.  We sync up to the highest
 		 * claimed log block birth time so that claimed log blocks
 		 * don't appear to be from the future.  spa_claim_max_txg
 		 * will have been set for us by either zil_check_log_chain()
 		 * (invoked from spa_check_logs()) or zil_claim() above.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 
 		/*
 		 * If the config cache is stale, or we have uninitialized
 		 * metaslabs (see spa_vdev_add()), then update the config.
 		 *
 		 * If this is a verbatim import, trust the current
 		 * in-core spa_config and update the disk labels.
 		 */
 		if (config_cache_txg != spa->spa_config_txg ||
 		    state == SPA_LOAD_IMPORT ||
 		    state == SPA_LOAD_RECOVER ||
 		    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM))
 			need_update = B_TRUE;
 
 		/* A toplevel vdev with vdev_ms_array == 0 forces an update. */
 		for (int c = 0; c < rvd->vdev_children; c++)
 			if (rvd->vdev_child[c]->vdev_ms_array == 0)
 				need_update = B_TRUE;
 
 		/*
 		 * Update the config cache asynchronously in case we're the
 		 * root pool, in which case the config cache isn't writable yet.
 		 */
 		if (need_update)
 			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 
 		/*
 		 * Check all DTLs to see if anything needs resilvering.
 		 */
 		if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    vdev_resilver_needed(rvd, NULL, NULL))
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 
 		/*
 		 * Log the fact that we booted up (so that we can detect if
 		 * we rebooted in the middle of an operation).
 		 */
 		spa_history_log_version(spa, "open");
 
 		/*
 		 * Delete any inconsistent datasets.
 		 */
 		(void) dmu_objset_find(spa_name(spa),
 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
 
 		/*
 		 * Clean up any stale temporary dataset userrefs.
 		 */
 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 	}
 
 	return (0);
 }
 
 /*
  * Retry a failed load with an older uberblock: unload and deactivate the
  * pool, lower spa_load_max_txg to one less than the txg of the uberblock
  * used by the previous attempt, reactivate the pool with the same mode
  * (with async tasks suspended), and call spa_load() again.
  *
  * Returns the result of that spa_load() call.
  */
 static int
 spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig)
 {
 	int saved_mode = spa->spa_mode;
 	int error;
 
 	spa_unload(spa);
 	spa_deactivate(spa);
 
 	/* Exclude the uberblock that the last attempt was based on. */
 	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
 
 	spa_activate(spa, saved_mode);
 	spa_async_suspend(spa);
 
 	error = spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig);
 
 	return (error);
 }
 
 /*
  * If spa_load() fails this function will try loading prior txg's. If
  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
  * function will not rewind the pool and will return the same error as
  * spa_load().
  *
  * spa		pool to load
  * state	load state passed through to spa_load()
  * mosconfig	passed through to spa_load()
  * max_request	highest txg to consider; UINT64_MAX means no limit, any
  *		other value also turns on spa_extreme_rewind
  * rewind_flags	rewind policy request bits; ZPOOL_NEVER_REWIND suppresses
  *		the retries and ZPOOL_EXTREME_REWIND widens the rewind
  *		range down to TXG_INITIAL
  *
  * Returns 0 if the first load succeeds.  Otherwise returns the result of
  * the rewind attempts for SPA_LOAD_RECOVER, and the original load error
  * for every other state.
  */
 static int
 spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig,
     uint64_t max_request, int rewind_flags)
 {
 	nvlist_t *loadinfo = NULL;
 	nvlist_t *config = NULL;
 	int load_error, rewind_error;
 	uint64_t safe_rewind_txg;
 	uint64_t min_txg;
 
 	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
 		spa->spa_load_max_txg = spa->spa_load_txg;
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		spa->spa_load_max_txg = max_request;
 		if (max_request != UINT64_MAX)
 			spa->spa_extreme_rewind = B_TRUE;
 	}
 
 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING,
 	    mosconfig);
 	if (load_error == 0)
 		return (0);
 
 	/*
 	 * Snapshot the config produced by the failed load so that it can be
 	 * put back after the rewind attempts.  This function owns 'config'
 	 * until it is either handed to spa_config_set() or freed.
 	 */
 	if (spa->spa_root_vdev != NULL)
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
 	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 	if (rewind_flags & ZPOOL_NEVER_REWIND) {
 		nvlist_free(config);
 		return (load_error);
 	}
 
 	if (state == SPA_LOAD_RECOVER) {
 		/* Price of rolling back is discarding txgs, including log */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		/*
 		 * If we aren't rolling back save the load info from our first
 		 * import attempt so that we can restore it after attempting
 		 * to rewind.
 		 */
 		loadinfo = spa->spa_load_info;
 		spa->spa_load_info = fnvlist_alloc();
 	}
 
 	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
 	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
 	    TXG_INITIAL : safe_rewind_txg;
 
 	/*
 	 * Continue as long as we're finding errors, we're still within
 	 * the acceptable rewind range, and we're still finding uberblocks
 	 */
 	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
 	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
 		if (spa->spa_load_max_txg < safe_rewind_txg)
 			spa->spa_extreme_rewind = B_TRUE;
 		rewind_error = spa_load_retry(spa, state, mosconfig);
 	}
 
 	spa->spa_extreme_rewind = B_FALSE;
 	spa->spa_load_max_txg = UINT64_MAX;
 
 	/*
 	 * Restore the saved config unless a recovery rewind succeeded.  In
 	 * that one case the saved config is no longer wanted and must be
 	 * freed here, as on the ZPOOL_NEVER_REWIND path above; previously it
 	 * was leaked.  spa_config_set() is assumed to take ownership of the
 	 * nvlist it is given.  nvlist_free() is also reached with a NULL
 	 * config, which is harmless.
 	 */
 	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
 		spa_config_set(spa, config);
 	else
 		nvlist_free(config);
 
 	if (state == SPA_LOAD_RECOVER) {
 		ASSERT3P(loadinfo, ==, NULL);
 		return (rewind_error);
 	} else {
 		/* Store the rewind info as part of the initial load info */
 		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
 		    spa->spa_load_info);
 
 		/* Restore the initial load info */
 		fnvlist_free(spa->spa_load_info);
 		spa->spa_load_info = loadinfo;
 
 		return (load_error);
 	}
 }
 
 /*
  * Pool Open/Import
  *
  * The import case is identical to an open except that the configuration is sent
  * down from userland, instead of grabbed from the configuration cache.  For the
  * case of an open, the pool configuration will exist in the
  * POOL_STATE_UNINITIALIZED state.
  *
  * The stats information (gen/count/ustats) is used to gather vdev statistics at
  * the same time open the pool, without having to keep around the spa_t in some
  * ambiguous state.
  *
  * pool		name of the pool to open
  * spapp	set to the opened spa_t on success and to NULL on failure
  * tag		holder tag passed to spa_open_ref()
  * nvpolicy	optional rewind policy; when NULL the policy is taken from
  *		the pool's spa_config
  * config	optional; when non-NULL it receives a generated config on
  *		success, or a copy of spa_config with the load info
  *		attached when the load fails
  *
  * Returns 0 on success, ENOENT if the pool is unknown or the load failed
  * with EBADF, or the error from spa_load_best().
  */
 static int
 spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
     nvlist_t **config)
 {
 	spa_t *spa;
 	spa_load_state_t state = SPA_LOAD_OPEN;
 	int error;
 	int locked = B_FALSE;
 	int firstopen = B_FALSE;
 
 	*spapp = NULL;
 
 	/*
 	 * As disgusting as this is, we need to support recursive calls to this
 	 * function because dsl_dir_open() is called during spa_load(), and ends
 	 * up calling spa_open() again.  The real fix is to figure out how to
 	 * avoid dsl_dir_open() calling this in the first place.
 	 */
 	if (mutex_owner(&spa_namespace_lock) != curthread) {
 		mutex_enter(&spa_namespace_lock);
 		locked = B_TRUE;
 	}
 
 	if ((spa = spa_lookup(pool)) == NULL) {
 		if (locked)
 			mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/* The pool is only loaded on the first open. */
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 		zpool_rewind_policy_t policy;
 
 		firstopen = B_TRUE;
 
 		zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config,
 		    &policy);
 		if (policy.zrp_request & ZPOOL_DO_REWIND)
 			state = SPA_LOAD_RECOVER;
 
 		spa_activate(spa, spa_mode_global);
 
 		if (state != SPA_LOAD_RECOVER)
 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 
 		error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg,
 		    policy.zrp_request);
 
 		if (error == EBADF) {
 			/*
 			 * If vdev_validate() returns failure (indicated by
 			 * EBADF), it indicates that one of the vdevs indicates
 			 * that the pool has been exported or destroyed.  If
 			 * this is the case, the config cache is out of sync and
 			 * we should remove the pool from the namespace.
 			 */
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa_config_sync(spa, B_TRUE, B_TRUE);
 			spa_remove(spa);
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(ENOENT));
 		}
 
 		if (error) {
 			/*
 			 * We can't open the pool, but we still have useful
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
 			if (config != NULL && spa->spa_config) {
 				VERIFY(nvlist_dup(spa->spa_config, config,
 				    KM_SLEEP) == 0);
 				VERIFY(nvlist_add_nvlist(*config,
 				    ZPOOL_CONFIG_LOAD_INFO,
 				    spa->spa_load_info) == 0);
 			}
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa->spa_last_open_failed = error;
 			if (locked)
 				mutex_exit(&spa_namespace_lock);
 			*spapp = NULL;
 			return (error);
 		}
 	}
 
 	spa_open_ref(spa, tag);
 
 	if (config != NULL)
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	/*
 	 * If we've recovered the pool, pass back any information we
 	 * gathered while doing the load.  The caller may not have asked for
 	 * a config (spa_open() passes NULL) while the rewind policy came
 	 * from spa_config, so 'config' must be checked before it is
 	 * dereferenced.
 	 */
 	if (config != NULL && state == SPA_LOAD_RECOVER) {
 		VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info) == 0);
 	}
 
 	/* Only the outermost (non-recursive) call resets state and unlocks. */
 	if (locked) {
 		spa->spa_last_open_failed = 0;
 		spa->spa_last_ubsync_txg = 0;
 		spa->spa_load_txg = 0;
 		mutex_exit(&spa_namespace_lock);
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 		if (firstopen)
 			zvol_create_minors(spa->spa_name);
 #endif
 #endif
 	}
 
 	*spapp = spa;
 
 	return (0);
 }
 
 /*
  * Open 'name' through spa_open_common(), forwarding the caller's rewind
  * policy and config out-pointer unchanged.
  */
 int
 spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
     nvlist_t **config)
 {
 	int rc;
 
 	rc = spa_open_common(name, spapp, tag, policy, config);
 	return (rc);
 }
 
 /*
  * Open 'name' through spa_open_common() with no rewind policy and no
  * config out-pointer.
  */
 int
 spa_open(const char *name, spa_t **spapp, void *tag)
 {
 	int rc;
 
 	rc = spa_open_common(name, spapp, tag, NULL, NULL);
 	return (rc);
 }
 
 /*
  * Look up the pool called 'name' and, if it is present in the namespace,
  * increment its inject reference count.  Both the lookup and the increment
  * happen with spa_namespace_lock held.  The inject count is what keeps the
  * pool from being exported or destroyed while it is referenced.
  *
  * Returns the spa_t, or NULL when no such pool exists.
  */
 spa_t *
 spa_inject_addref(char *name)
 {
 	spa_t *found;
 
 	mutex_enter(&spa_namespace_lock);
 	found = spa_lookup(name);
 	if (found != NULL)
 		found->spa_inject_ref++;
 	mutex_exit(&spa_namespace_lock);
 
 	return (found);
 }
 
 /*
  * Release one inject reference on 'spa'.  The counter is modified with
  * spa_namespace_lock held, mirroring spa_inject_addref().
  */
 void
 spa_inject_delref(spa_t *spa)
 {
 	mutex_enter(&spa_namespace_lock);
 	spa->spa_inject_ref -= 1;
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Copy the pool's spare list into the vdev tree of 'config'.  Any spare
  * that spa_spare_exists() reports as belonging to a pool (non-zero pool
  * guid) has its stats in the copied entry rewritten to
  * VDEV_STATE_CANT_OPEN / VDEV_AUX_SPARED.  Nothing is added when the pool
  * has no spares.  The caller must hold SCL_CONFIG as reader.
  */
 static void
 spa_add_spares(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t **sparelist;
 	nvlist_t *vdev_tree;
 	uint_t idx, count;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (spa->spa_spares.sav_count == 0)
 		return;
 
 	VERIFY(nvlist_lookup_nvlist(config,
 	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &sparelist, &count) == 0);
 	if (count == 0)
 		return;
 
 	/*
 	 * Add the array to the output tree, then look it up again so that
 	 * 'sparelist' refers to the entries stored in 'config' rather than
 	 * to the ones in sav_config.
 	 */
 	VERIFY(nvlist_add_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_SPARES, sparelist, count) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_SPARES, &sparelist, &count) == 0);
 
 	for (idx = 0; idx < count; idx++) {
 		uint64_t spare_guid;
 		uint64_t owner;
 		vdev_stat_t *stats;
 		uint_t nstats;
 
 		VERIFY(nvlist_lookup_uint64(sparelist[idx],
 		    ZPOOL_CONFIG_GUID, &spare_guid) == 0);
 
 		/* Only spares currently in use by a pool are relabelled. */
 		if (!spa_spare_exists(spare_guid, &owner, NULL) ||
 		    owner == 0ULL)
 			continue;
 
 		VERIFY(nvlist_lookup_uint64_array(
 		    sparelist[idx], ZPOOL_CONFIG_VDEV_STATS,
 		    (uint64_t **)&stats, &nstats) == 0);
 		stats->vs_state = VDEV_STATE_CANT_OPEN;
 		stats->vs_aux = VDEV_AUX_SPARED;
 	}
 }
 
 /*
  * Copy the pool's l2cache device list into the vdev tree of 'config' and
  * refresh the stats of every copied entry from the in-core vdev with the
  * same guid.  Nothing is added when the pool has no l2cache devices.  The
  * caller must hold SCL_CONFIG as reader.
  */
 static void
 spa_add_l2cache(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t **cachedevs;
 	nvlist_t *vdev_tree;
 	uint_t n, k, ncache;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (spa->spa_l2cache.sav_count == 0)
 		return;
 
 	VERIFY(nvlist_lookup_nvlist(config,
 	    ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &cachedevs, &ncache) == 0);
 	if (ncache == 0)
 		return;
 
 	/*
 	 * Add the array to the output tree, then look it up again so that
 	 * 'cachedevs' refers to the entries stored in 'config'.
 	 */
 	VERIFY(nvlist_add_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_L2CACHE, cachedevs, ncache) == 0);
 	VERIFY(nvlist_lookup_nvlist_array(vdev_tree,
 	    ZPOOL_CONFIG_L2CACHE, &cachedevs, &ncache) == 0);
 
 	for (n = 0; n < ncache; n++) {
 		vdev_t *match = NULL;
 		vdev_stat_t *stats;
 		uint_t nstats;
 		uint64_t want;
 
 		VERIFY(nvlist_lookup_uint64(cachedevs[n],
 		    ZPOOL_CONFIG_GUID, &want) == 0);
 
 		/* Find the in-core l2cache vdev carrying this guid. */
 		for (k = 0; k < spa->spa_l2cache.sav_count && match == NULL;
 		    k++) {
 			if (spa->spa_l2cache.sav_vdevs[k]->vdev_guid == want)
 				match = spa->spa_l2cache.sav_vdevs[k];
 		}
 		ASSERT(match != NULL);
 
 		VERIFY(nvlist_lookup_uint64_array(cachedevs[n],
 		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&stats, &nstats)
 		    == 0);
 		vdev_get_stats(match, stats);
 	}
 }
 
 /*
  * Attach a ZPOOL_CONFIG_FEATURE_STATS nvlist to 'config'.  Each entry of
  * the features-for-read and features-for-write ZAP objects is added as a
  * name -> uint64 pair.  When the pool is suspended the ZAPs are not read and
  * an empty list is attached instead.  The caller must hold SCL_CONFIG as
  * reader.
  */
 static void
 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *features;
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 	VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 	/* We may be unable to read features if pool is suspended. */
 	if (!spa_suspended(spa)) {
 		if (spa->spa_feat_for_read_obj != 0) {
 			zap_cursor_init(&cursor, spa->spa_meta_objset,
 			    spa->spa_feat_for_read_obj);
 			while (zap_cursor_retrieve(&cursor, &attr) == 0) {
 				ASSERT(attr.za_integer_length ==
 				    sizeof (uint64_t) &&
 				    attr.za_num_integers == 1);
 				VERIFY3U(0, ==, nvlist_add_uint64(features,
 				    attr.za_name, attr.za_first_integer));
 				zap_cursor_advance(&cursor);
 			}
 			zap_cursor_fini(&cursor);
 		}
 
 		if (spa->spa_feat_for_write_obj != 0) {
 			zap_cursor_init(&cursor, spa->spa_meta_objset,
 			    spa->spa_feat_for_write_obj);
 			while (zap_cursor_retrieve(&cursor, &attr) == 0) {
 				ASSERT(attr.za_integer_length ==
 				    sizeof (uint64_t) &&
 				    attr.za_num_integers == 1);
 				VERIFY3U(0, ==, nvlist_add_uint64(features,
 				    attr.za_name, attr.za_first_integer));
 				zap_cursor_advance(&cursor);
 			}
 			zap_cursor_fini(&cursor);
 		}
 	}
 
 	/* 'config' gets its own copy, so the local list is freed here. */
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
 	    features) == 0);
 	nvlist_free(features);
 }
 
 /*
  * Open the pool 'name' and return its config in *config, augmented with the
  * load time, error-log size, suspension state, spares, l2cache devices and
  * feature stats.  When 'altroot' is non-NULL it is filled with the pool's
  * alternate root (up to 'buflen' bytes) even if the open failed, or set to
  * the empty string if the pool is not in the namespace at all.
  *
  * Returns whatever spa_open_common() returned.
  */
 int
 spa_get_stats(const char *name, nvlist_t **config,
     char *altroot, size_t buflen)
 {
 	spa_t *opened;
 	int rc;
 
 	*config = NULL;
 	rc = spa_open_common(name, &opened, FTAG, NULL, config);
 
 	if (opened != NULL) {
 		/*
 		 * This still leaves a window of inconsistency where the spares
 		 * or l2cache devices could change and the config would be
 		 * self-inconsistent.
 		 */
 		spa_config_enter(opened, SCL_CONFIG, FTAG, RW_READER);
 
 		if (*config != NULL) {
 			uint64_t loaded_at[2];
 
 			loaded_at[0] = opened->spa_loaded_ts.tv_sec;
 			loaded_at[1] = opened->spa_loaded_ts.tv_nsec;
 			VERIFY(nvlist_add_uint64_array(*config,
 			    ZPOOL_CONFIG_LOADED_TIME, loaded_at, 2) == 0);
 
 			VERIFY(nvlist_add_uint64(*config,
 			    ZPOOL_CONFIG_ERRCOUNT,
 			    spa_get_errlog_size(opened)) == 0);
 
 			if (spa_suspended(opened))
 				VERIFY(nvlist_add_uint64(*config,
 				    ZPOOL_CONFIG_SUSPENDED,
 				    opened->spa_failmode) == 0);
 
 			spa_add_spares(opened, *config);
 			spa_add_l2cache(opened, *config);
 			spa_add_feature_stats(opened, *config);
 		}
 	}
 
 	/*
 	 * We want to get the alternate root even for faulted pools, so we cheat
 	 * and call spa_lookup() directly.
 	 */
 	if (altroot) {
 		if (opened != NULL) {
 			spa_altroot(opened, altroot, buflen);
 		} else {
 			spa_t *unopened;
 
 			mutex_enter(&spa_namespace_lock);
 			unopened = spa_lookup(name);
 			if (unopened)
 				spa_altroot(unopened, altroot, buflen);
 			else
 				altroot[0] = '\0';
 			mutex_exit(&spa_namespace_lock);
 		}
 	}
 
 	if (opened != NULL) {
 		spa_config_exit(opened, SCL_CONFIG, FTAG);
 		spa_close(opened, FTAG);
 	}
 
 	return (rc);
 }
 
 /*
  * Validate the auxiliary device array stored in 'nvroot' under the key
  * 'config'.  Every entry must parse into a leaf vdev; in the kernel an
  * l2cache entry must additionally be a disk vdev.  Each vdev is opened and
  * labelled, and on success its guid is recorded in the entry.
  *
  * Open or label failures are fatal unless 'mode' is VDEV_ALLOC_SPARE or
  * VDEV_ALLOC_L2CACHE, in which case the failure is ignored and validation
  * continues.
  *
  * Returns 0 when the key is absent or every device validated, EINVAL for an
  * empty array or a non-leaf vdev, ENOTSUP when the pool version is below
  * 'version', ENOTBLK for a non-disk l2cache device in the kernel, or the
  * error from parsing/opening/labelling.
  */
 static int
 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
     spa_aux_vdev_t *sav, const char *config, uint64_t version,
     vdev_labeltype_t label)
 {
 	nvlist_t **devlist;
 	uint_t idx, count;
 	vdev_t *auxvd;
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * It's acceptable to have no devs specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, config, &devlist, &count) != 0)
 		return (0);
 
 	if (count == 0)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Make sure the pool is formatted with a version that supports this
 	 * device type.
 	 */
 	if (spa_version(spa) < version)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Set the pending device list so we correctly handle device in-use
 	 * checking.
 	 */
 	sav->sav_pending = devlist;
 	sav->sav_npending = count;
 
 	for (idx = 0; idx < count; idx++) {
 		error = spa_config_parse(spa, &auxvd, devlist[idx], NULL, 0,
 		    mode);
 		if (error != 0)
 			break;
 
 		if (!auxvd->vdev_ops->vdev_op_leaf) {
 			vdev_free(auxvd);
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/*
 		 * The L2ARC currently only supports disk devices in
 		 * kernel context.  For user-level testing, we allow it.
 		 */
 #ifdef _KERNEL
 		if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) &&
 		    strcmp(auxvd->vdev_ops->vdev_op_type,
 		    VDEV_TYPE_DISK) != 0) {
 			error = SET_ERROR(ENOTBLK);
 			vdev_free(auxvd);
 			break;
 		}
 #endif
 		auxvd->vdev_top = auxvd;
 
 		error = vdev_open(auxvd);
 		if (error == 0)
 			error = vdev_label_init(auxvd, crtxg, label);
 		if (error == 0) {
 			VERIFY(nvlist_add_uint64(devlist[idx],
 			    ZPOOL_CONFIG_GUID, auxvd->vdev_guid) == 0);
 		}
 
 		vdev_free(auxvd);
 
 		/* Spare and l2cache allocation modes tolerate bad devices. */
 		if (error != 0 && mode != VDEV_ALLOC_SPARE &&
 		    mode != VDEV_ALLOC_L2CACHE)
 			break;
 		error = 0;
 	}
 
 	sav->sav_pending = NULL;
 	sav->sav_npending = 0;
 	return (error);
 }
 
 /*
  * Validate the spare list and then the l2cache list found in 'nvroot'.
  * The l2cache list is only examined when the spare list validated cleanly;
  * the first non-zero error is returned.
  */
 static int
 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
 {
 	int rc;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	rc = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
 	    VDEV_LABEL_SPARE);
 	if (rc == 0) {
 		rc = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 		    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE,
 		    SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE);
 	}
 
 	return (rc);
 }
 
 /*
  * Record 'ndevs' device nvlists under the key 'config' in sav->sav_config.
  * If sav_config does not exist yet it is created holding just 'devs';
  * otherwise the existing array is replaced by the concatenation of the old
  * entries followed by the new ones.
  */
 static void
 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
     const char *config)
 {
 	nvlist_t **current;
 	nvlist_t **merged;
 	uint_t ncurrent;
 	int i;
 
 	if (sav->sav_config == NULL) {
 		/*
 		 * Generate a new dev list.
 		 */
 		VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(sav->sav_config, config,
 		    devs, ndevs) == 0);
 		return;
 	}
 
 	/*
 	 * Generate new dev list by concatenating with the
 	 * current dev list.
 	 */
 	VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config,
 	    &current, &ncurrent) == 0);
 
 	merged = kmem_alloc(sizeof (void *) *
 	    (ndevs + ncurrent), KM_SLEEP);
 	for (i = 0; i < ncurrent; i++)
 		VERIFY(nvlist_dup(current[i], &merged[i],
 		    KM_SLEEP) == 0);
 	for (i = 0; i < ndevs; i++)
 		VERIFY(nvlist_dup(devs[i], &merged[i + ncurrent],
 		    KM_SLEEP) == 0);
 
 	/* Swap the old array for the merged one. */
 	VERIFY(nvlist_remove(sav->sav_config, config,
 	    DATA_TYPE_NVLIST_ARRAY) == 0);
 	VERIFY(nvlist_add_nvlist_array(sav->sav_config,
 	    config, merged, ndevs + ncurrent) == 0);
 
 	/* The temporary duplicates and their array are released here. */
 	for (i = 0; i < ncurrent + ndevs; i++)
 		nvlist_free(merged[i]);
 	kmem_free(merged, (ncurrent + ndevs) * sizeof (void *));
 }
 
 /*
  * Walk the pool's l2cache vdevs and remove from the L2ARC each one that
  * spa_l2cache_exists() reports with a non-zero pool guid and that
  * l2arc_vdev_present() reports as present.
  */
 void
 spa_l2cache_drop(spa_t *spa)
 {
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 	int idx;
 
 	for (idx = 0; idx < sav->sav_count; idx++) {
 		vdev_t *cachevd = sav->sav_vdevs[idx];
 		uint64_t owner;
 
 		ASSERT(cachevd != NULL);
 
 		if (!spa_l2cache_exists(cachevd->vdev_guid, &owner))
 			continue;
 		if (owner == 0ULL || !l2arc_vdev_present(cachevd))
 			continue;
 
 		l2arc_remove_vdev(cachevd);
 	}
 }
 
 /*
  * Pool Creation
  *
  * Create a new pool named 'pool' from the vdev tree described by 'nvroot'.
  * 'props' (may be NULL) supplies pool properties, including the optional
  * altroot and version; 'zplprops' is handed to dsl_pool_create().
  *
  * Returns 0 on success, EEXIST if a pool with this name is already in the
  * namespace, or the error from property validation, vdev tree parsing,
  * vdev creation or auxiliary-device validation.  On any of those failures
  * the partially constructed spa_t is torn down and removed again.
  *
  * spa_namespace_lock is held for the whole operation.
  */
 int
 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops)
 {
 	spa_t *spa;
 	char *altroot = NULL;
 	vdev_t *rvd;
 	dsl_pool_t *dp;
 	dmu_tx_t *tx;
 	int error = 0;
 	uint64_t txg = TXG_INITIAL;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 	uint64_t version, obj;
 	boolean_t has_features;
 
 	/*
 	 * If this pool already exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Allocate a new spa_t structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	spa = spa_add(pool, NULL, altroot);
 	spa_activate(spa, spa_mode_global);
 
 	if (props && (error = spa_prop_validate(spa, props))) {
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/*
 	 * Note whether any feature property was requested; if so the pool
 	 * is created at SPA_VERSION regardless of an explicit version prop.
 	 */
 	has_features = B_FALSE;
 	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
 	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
 		if (zpool_prop_feature(nvpair_name(elem)))
 			has_features = B_TRUE;
 	}
 
 	if (has_features || nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
 		version = SPA_VERSION;
 	}
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 
 	/* Seed the in-core uberblock one txg behind the first txg. */
 	spa->spa_first_txg = txg;
 	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_uberblock.ub_version = version;
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	spa->spa_async_zio_root = zio_root(spa, NULL, NULL,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER);
 
 	/*
 	 * Create the root vdev.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
 
 	ASSERT(error != 0 || rvd != NULL);
 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
 
 	if (error == 0 && !zfs_allocatable_devs(nvroot))
 		error = SET_ERROR(EINVAL);
 
 	/*
 	 * Create the vdevs and validate spares/l2cache; only when all of
 	 * that succeeds is each top-level vdev sized and expanded.
 	 */
 	if (error == 0 &&
 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
 	    (error = spa_validate_aux(spa, nvroot, txg,
 	    VDEV_ALLOC_ADD)) == 0) {
 		for (int c = 0; c < rvd->vdev_children; c++) {
 			vdev_ashift_optimize(rvd->vdev_child[c]);
 			vdev_metaslab_set_size(rvd->vdev_child[c]);
 			vdev_expand(rvd->vdev_child[c], txg);
 		}
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	/*
 	 * Get the list of spares, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Get the list of level 2 cache devices, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/* spa_is_initializing is set only around dsl_pool_create(). */
 	spa->spa_is_initializing = B_TRUE;
 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg);
 	spa->spa_meta_objset = dp->dp_meta_objset;
 	spa->spa_is_initializing = B_FALSE;
 
 	/*
 	 * Create DDTs (dedup tables).
 	 */
 	ddt_create(spa);
 
 	spa_update_dspace(spa);
 
 	/* Everything below up to dmu_tx_commit() is written in 'txg'. */
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Create the pool config object.
 	 */
 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
 	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool config");
 	}
 
 	if (spa_version(spa) >= SPA_VERSION_FEATURES)
 		spa_feature_create_zap_objects(spa, tx);
 
 	/* Record the version the pool was created with. */
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
 	    sizeof (uint64_t), 1, &version, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool version");
 	}
 
 	/* Newly created pools with the right version are always deflated. */
 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		spa->spa_deflate = TRUE;
 		if (zap_add(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
 			cmn_err(CE_PANIC, "failed to add deflate");
 		}
 	}
 
 	/*
 	 * Create the deferred-free bpobj.  Turn off compression
 	 * because sync-to-convergence takes longer if the blocksize
 	 * keeps changing.
 	 */
 	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
 	dmu_object_set_compress(spa->spa_meta_objset, obj,
 	    ZIO_COMPRESS_OFF, tx);
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
 	    sizeof (uint64_t), 1, &obj, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add bpobj");
 	}
 	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
 	    spa->spa_meta_objset, obj));
 
 	/*
 	 * Create the pool's history object.
 	 */
 	if (version >= SPA_VERSION_ZPOOL_HISTORY)
 		spa_history_create_obj(spa, tx);
 
 	/*
 	 * Set pool properties.
 	 */
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
 
 	/* Caller-supplied properties are applied on top of the defaults. */
 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
 		spa_sync_props(props, tx);
 	}
 
 	dmu_tx_commit(tx);
 
 	spa->spa_sync_on = B_TRUE;
 	txg_sync_start(spa->spa_dsl_pool);
 
 	/*
 	 * We explicitly wait for the first transaction to complete so that our
 	 * bean counters are appropriately updated.
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, txg);
 
 	spa_config_sync(spa, B_FALSE, B_TRUE);
 
 	spa_history_log_version(spa, "create");
 
 	/* Remember the reference count the pool has at this point. */
 	spa->spa_minref = refcount_count(&spa->spa_refcount);
 
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 #ifdef _KERNEL
 #if defined(sun)
 /*
  * Get the root pool information from the root disk, then import the root pool
  * during the system boot up time.
  */
 extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **);
 
 /*
  * Read the label of the boot device identified by 'devpath'/'devid' and
  * turn it into a pool config: the label's vdev tree is wrapped as the only
  * child of a new root vdev, which then replaces the label's vdev tree.
  * The label's ZPOOL_CONFIG_GUID is returned through 'guid'.
  *
  * Returns the config nvlist, or NULL if the label could not be read.
  */
 static nvlist_t *
 spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid)
 {
 	nvlist_t *label;
 	nvlist_t *top, *root;
 	uint64_t pool_guid;
 
 	if (vdev_disk_read_rootlabel(devpath, devid, &label) != 0)
 		return (NULL);
 
 	/*
 	 * Pull out the pieces of the label needed below: the top-level vdev
 	 * that becomes the root's child, the pool guid and the device guid.
 	 */
 	VERIFY(nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE,
 	    &top) == 0);
 	VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
 	    &pool_guid) == 0);
 	VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, guid) == 0);
 
 	/*
 	 * Build a root vdev with the top-level vdev as its single child.
 	 */
 	VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_uint64(root, ZPOOL_CONFIG_ID, 0ULL) == 0);
 	VERIFY(nvlist_add_uint64(root, ZPOOL_CONFIG_GUID, pool_guid) == 0);
 	VERIFY(nvlist_add_nvlist_array(root, ZPOOL_CONFIG_CHILDREN,
 	    &top, 1) == 0);
 
 	/*
 	 * Replace the existing vdev_tree with the new root vdev in
 	 * this pool's configuration (remove the old, add the new).
 	 */
 	VERIFY(nvlist_add_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, root) == 0);
 	nvlist_free(root);
 
 	return (label);
 }
 
 /*
  * Recursively search the vdev tree rooted at 'vd' for a leaf whose on-disk
  * label carries a pool txg newer than *txg.  Whenever one is found, *txg is
  * raised to that txg and *avd is pointed at the leaf.  Leaves whose label
  * cannot be read are skipped.
  */
 static void
 spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg)
 {
 	nvlist_t *label;
 	uint64_t label_txg;
 	int child;
 
 	for (child = 0; child < vd->vdev_children; child++)
 		spa_alt_rootvdev(vd->vdev_child[child], avd, txg);
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid,
 	    &label) != 0)
 		return;
 
 	VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
 	    &label_txg) == 0);
 
 	/*
 	 * Do we have a better boot device?
 	 */
 	if (label_txg > *txg) {
 		*txg = label_txg;
 		*avd = vd;
 	}
 
 	nvlist_free(label);
 }
 
 /*
  * Import a root pool.
  *
  * For x86. devpath_list will consist of devid and/or physpath name of
  * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a").
  * The GRUB "findroot" command will return the vdev we should boot.
  *
  * For Sparc, devpath_list consists the physpath name of the booting device
  * no matter the rootpool is a single device pool or a mirrored pool.
  * e.g.
  *	"/pci@1f,0/ide@d/disk@0,0:a"
  *
  * Returns 0 on success, EIO if the boot device label cannot be read, the
  * error from spa_config_parse(), ENOENT if the boot vdev is not in the
  * parsed tree, or EINVAL if another device carries a newer label or the
  * boot device is a spared-out member of a spare vdev.
  *
  * The vdev tree built here is only used for these checks and is freed
  * before returning; the spa_t added to the namespace is left in place.
  */
 int
 spa_import_rootpool(char *devpath, char *devid)
 {
 	spa_t *spa;
 	vdev_t *rvd, *bvd, *avd = NULL;
 	nvlist_t *config, *nvtop;
 	uint64_t guid, txg;
 	char *pname;
 	int error;
 
 	/*
 	 * Read the label from the boot device and generate a configuration.
 	 */
 	config = spa_generate_rootconf(devpath, devid, &guid);
 #if defined(_OBP) && defined(_KERNEL)
 	if (config == NULL) {
 		if (strstr(devpath, "/iscsi/ssd") != NULL) {
 			/* iscsi boot */
 			get_iscsi_bootpath_phy(devpath);
 			config = spa_generate_rootconf(devpath, devid, &guid);
 		}
 	}
 #endif
 	if (config == NULL) {
 		cmn_err(CE_NOTE, "Cannot read the pool label from '%s'",
 		    devpath);
 		return (SET_ERROR(EIO));
 	}
 
 	VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 	    &pname) == 0);
 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0);
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pname)) != NULL) {
 		/*
 		 * Remove the existing root pool from the namespace so that we
 		 * can replace it with the correct config we just read in.
 		 */
 		spa_remove(spa);
 	}
 
 	spa = spa_add(pname, config, NULL);
 	spa->spa_is_root = B_TRUE;
 	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
 	/*
 	 * Build up a vdev tree based on the boot device's label config.
 	 */
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvtop) == 0);
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
 	    VDEV_ALLOC_ROOTPOOL);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error) {
 		/* No vdev tree was built, so there is nothing to free. */
 		mutex_exit(&spa_namespace_lock);
 		nvlist_free(config);
 		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
 		    pname);
 		return (error);
 	}
 
 	/*
 	 * Get the boot vdev.
 	 */
 	if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) {
 		cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu",
 		    (u_longlong_t)guid);
 		error = SET_ERROR(ENOENT);
 		goto out;
 	}
 
 	/*
 	 * Determine if there is a better boot device.
 	 */
 	/* avd only moves off bvd if some leaf has a label txg above 'txg'. */
 	avd = bvd;
 	spa_alt_rootvdev(rvd, &avd, &txg);
 	if (avd != bvd) {
 		cmn_err(CE_NOTE, "The boot device is 'degraded'. Please "
 		    "try booting from '%s'", avd->vdev_path);
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * If the boot device is part of a spare vdev then ensure that
 	 * we're booting off the active spare.
 	 */
 	if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops &&
 	    !bvd->vdev_isspare) {
 		/* The suggested device is the spare vdev's last child. */
 		cmn_err(CE_NOTE, "The boot device is currently spared. Please "
 		    "try booting from '%s'",
 		    bvd->vdev_parent->
 		    vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path);
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	error = 0;
 out:
 	/* Common exit: free the temporary vdev tree, then drop the lock. */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_free(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	nvlist_free(config);
 	return (error);
 }
 
 #else
 
 extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs,
     uint64_t *count);
 
 /*
  * Build a pool config for the root pool 'name' from the labels returned by
  * vdev_geom_read_pool_label().
  *
  * The label with the highest pool txg is used as the basis of the config.
  * Its vdev tree is replaced by a root vdev whose children are, per slot:
  * the vdev tree of the label found in that slot, a VDEV_TYPE_HOLE entry for
  * ids listed in the hole array, or a VDEV_TYPE_MISSING placeholder.
  *
  * Returns the new config (caller frees), or NULL when the labels cannot be
  * read or none of the returned slots holds a label.
  */
 static nvlist_t *
 spa_generate_rootconf(const char *name)
 {
 	nvlist_t **configs, **tops;
 	nvlist_t *config;
 	nvlist_t *best_cfg, *nvtop, *nvroot;
 	uint64_t *holes;
 	uint64_t best_txg;
 	uint64_t nchildren;
 	uint64_t pgid;
 	uint64_t count;
 	uint64_t i;
 	uint_t   nholes;
 
 	if (vdev_geom_read_pool_label(name, &configs, &count) != 0)
 		return (NULL);
 
 	ASSERT3U(count, !=, 0);
 
 	/*
 	 * Pick the label with the newest txg.  Slots may be empty (the loop
 	 * that fills 'tops' below already allows for that), so skip them
 	 * here as well, and accept the first label seen even if its txg is
 	 * zero so that best_cfg is never left unset while a label exists.
 	 */
 	best_cfg = NULL;
 	best_txg = 0;
 	for (i = 0; i < count; i++) {
 		uint64_t txg;
 
 		if (configs[i] == NULL)
 			continue;
 		VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG,
 		    &txg) == 0);
 		if (best_cfg == NULL || txg > best_txg) {
 			best_txg = txg;
 			best_cfg = configs[i];
 		}
 	}
 
 	/* No usable label at all: release what we were given and give up. */
 	if (best_cfg == NULL) {
 		for (i = 0; i < count; i++)
 			nvlist_free(configs[i]);
 		kmem_free(configs, count * sizeof(void *));
 		return (NULL);
 	}
 
 	/*
 	 * Multi-vdev root pool configuration discovery is not supported yet.
 	 */
 	nchildren = 1;
 	nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren);
 	holes = NULL;
 	nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY,
 	    &holes, &nholes);
 
 	/* Zero-filled, so a NULL slot means "not populated yet". */
 	tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP);
 	for (i = 0; i < nchildren; i++) {
 		if (i >= count)
 			break;
 		if (configs[i] == NULL)
 			continue;
 		VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE,
 		    &nvtop) == 0);
 		VERIFY(nvlist_dup(nvtop, &tops[i], KM_SLEEP) == 0);
 	}
 	for (i = 0; holes != NULL && i < nholes; i++) {
 		/*
 		 * holes[i] is the vdev id used to index 'tops', so it is
 		 * the value that has to be range-checked, not the loop
 		 * counter.
 		 */
 		if (holes[i] >= nchildren)
 			continue;
 		if (tops[holes[i]] != NULL)
 			continue;
 		VERIFY(nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME,
 		    KM_SLEEP) == 0);
 		VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE,
 		    VDEV_TYPE_HOLE) == 0);
 		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID,
 		    holes[i]) == 0);
 		VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID,
 		    0) == 0);
 	}
 	/* Anything still unpopulated becomes a "missing" placeholder. */
 	for (i = 0; i < nchildren; i++) {
 		if (tops[i] != NULL)
 			continue;
 		VERIFY(nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE,
 		    VDEV_TYPE_MISSING) == 0);
 		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID,
 		    i) == 0);
 		VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID,
 		    0) == 0);
 	}
 
 	/*
 	 * Create pool config based on the best vdev config.
 	 */
 	VERIFY(nvlist_dup(best_cfg, &config, KM_SLEEP) == 0);
 
 	/*
 	 * Put this pool's top-level vdevs into a root vdev.
 	 */
 	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    &pgid) == 0);
 	VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
 	    VDEV_TYPE_ROOT) == 0);
 	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0);
 	VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0);
 	VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
 	    tops, nchildren) == 0);
 
 	/*
 	 * Replace the existing vdev_tree with the new root vdev in
 	 * this pool's configuration (remove the old, add the new).
 	 */
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0);
 
 	/*
 	 * Drop vdev config elements that should not be present at pool level.
 	 */
 	nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64);
 	nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64);
 
 	/* 'config' holds its own copies; release all the temporaries. */
 	for (i = 0; i < count; i++)
 		nvlist_free(configs[i]);
 	kmem_free(configs, count * sizeof(void *));
 	for (i = 0; i < nchildren; i++)
 		nvlist_free(tops[i]);
 	kmem_free(tops, nchildren * sizeof(void *));
 	nvlist_free(nvroot);
 	return (config);
 }
 
 /*
  * Import the root pool called 'name'.
  *
  * A config is generated from the on-disk labels when possible; the pool is
  * then (re)added to the namespace with that config.  If no label can be
  * read, an spa_t already present in the namespace under 'name' is used with
  * a copy of its cached config.  In either case the pool is marked as the
  * root pool with ZFS_IMPORT_VERBATIM, and the config's vdev tree is parsed
  * once as a sanity check and freed again.
  *
  * Returns 0 on success, EIO when neither a label nor a namespace entry is
  * available, or the error from spa_config_parse().
  */
 int
 spa_import_rootpool(const char *name)
 {
 	spa_t *spa;
 	vdev_t *rvd;
 	nvlist_t *config, *nvtop;
 	uint64_t txg;
 	char *pname;
 	int error;
 
 	/*
 	 * Read the label from the boot device and generate a configuration.
 	 */
 	config = spa_generate_rootconf(name);
 
 	mutex_enter(&spa_namespace_lock);
 	if (config != NULL) {
 		VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
 		    &pname) == 0 && strcmp(name, pname) == 0);
 		VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg)
 		    == 0);
 
 		if ((spa = spa_lookup(pname)) != NULL) {
 			/*
 			 * Remove the existing root pool from the namespace so
 			 * that we can replace it with the correct config
 			 * we just read in.
 			 */
 			spa_remove(spa);
 		}
 		spa = spa_add(pname, config, NULL);
 
 		/*
 		 * Set spa_ubsync.ub_version as it can be used in vdev_alloc()
 		 * via spa_version().
 		 */
 		if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 		    &spa->spa_ubsync.ub_version) != 0)
 			spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 	} else if ((spa = spa_lookup(name)) == NULL) {
 		/* The namespace lock must not be left held on this exit. */
 		mutex_exit(&spa_namespace_lock);
 		cmn_err(CE_NOTE, "Cannot find the pool label for '%s'",
 		    name);
 		return (EIO);
 	} else {
 		VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0);
 	}
 	spa->spa_is_root = B_TRUE;
 	spa->spa_import_flags = ZFS_IMPORT_VERBATIM;
 
 	/*
 	 * Build up a vdev tree based on the boot device's label config.
 	 */
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvtop) == 0);
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = spa_config_parse(spa, &rvd, nvtop, NULL, 0,
 	    VDEV_ALLOC_ROOTPOOL);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	if (error) {
 		mutex_exit(&spa_namespace_lock);
 		nvlist_free(config);
 		/*
 		 * Report 'name' rather than 'pname': pname is only set when
 		 * the config came from a label, and in that case the two
 		 * strings were verified to be equal above.
 		 */
 		cmn_err(CE_NOTE, "Can not parse the config for pool '%s'",
 		    name);
 		return (error);
 	}
 
 	/* The parsed tree was only needed as a check; free it again. */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_free(rvd);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	nvlist_free(config);
 	return (0);
 }
 
 #endif	/* sun */
 #endif
 
 /*
  * Import a non-root pool into the system.
  */
 int
 spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 {
 	spa_t *spa;
 	char *altroot = NULL;
 	spa_load_state_t state = SPA_LOAD_IMPORT;
 	zpool_rewind_policy_t policy;
 	uint64_t mode = spa_mode_global;
 	uint64_t readonly = B_FALSE;
 	int error;
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	/*
 	 * If a pool with this name exists, return failure.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	if (spa_lookup(pool) != NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	(void) nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
 	if (readonly)
 		mode = FREAD;
 	spa = spa_add(pool, config, altroot);
 	spa->spa_import_flags = flags;
 
 	/*
 	 * Verbatim import - Take a pool and insert it into the namespace
 	 * as if it had been loaded at boot.
 	 */
 	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
 		if (props != NULL)
 			spa_configfile_set(spa, props, B_FALSE);
 
 		spa_config_sync(spa, B_FALSE, B_TRUE);
 
 		mutex_exit(&spa_namespace_lock);
 		return (0);
 	}
 
 	spa_activate(spa, mode);
 
 	/*
 	 * Don't start async tasks until we know everything is healthy.
 	 */
 	spa_async_suspend(spa);
 
 	zpool_get_rewind_policy(config, &policy);
 	if (policy.zrp_request & ZPOOL_DO_REWIND)
 		state = SPA_LOAD_RECOVER;
 
 	/*
 	 * Pass off the heavy lifting to spa_load().  Pass TRUE for mosconfig
 	 * because the user-supplied config is actually the one to trust when
 	 * doing an import.
 	 */
 	if (state != SPA_LOAD_RECOVER)
 		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 
 	error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg,
 	    policy.zrp_request);
 
 	/*
 	 * Propagate anything learned while loading the pool and pass it
 	 * back to caller (i.e. rewind info, missing devices, etc).
 	 */
 	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 	    spa->spa_load_info) == 0);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/*
 	 * Toss any existing sparelist, as it doesn't have any validity
 	 * anymore, and conflicts with spa_has_spare().
 	 */
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 		spa_load_spares(spa);
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 		spa_load_l2cache(spa);
 	}
 
 	VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
 	    &nvroot) == 0);
 	if (error == 0)
 		error = spa_validate_aux(spa, nvroot, -1ULL,
 		    VDEV_ALLOC_SPARE);
 	if (error == 0)
 		error = spa_validate_aux(spa, nvroot, -1ULL,
 		    VDEV_ALLOC_L2CACHE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (props != NULL)
 		spa_configfile_set(spa, props, B_FALSE);
 
 	if (error != 0 || (props && spa_writeable(spa) &&
 	    (error = spa_prop_set(spa, props)))) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		mutex_exit(&spa_namespace_lock);
 		return (error);
 	}
 
 	spa_async_resume(spa);
 
 	/*
 	 * Override any spares and level 2 cache devices as specified by
 	 * the user, as these may have correct device names/devids, etc.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		if (spa->spa_spares.sav_config)
 			VERIFY(nvlist_remove(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0);
 		else
 			VERIFY(nvlist_alloc(&spa->spa_spares.sav_config,
 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, spares, nspares) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		if (spa->spa_l2cache.sav_config)
 			VERIFY(nvlist_remove(spa->spa_l2cache.sav_config,
 			    ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0);
 		else
 			VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config,
 			    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Check for any removed devices.
 	 */
 	if (spa->spa_autoreplace) {
 		spa_aux_check_removed(&spa->spa_spares);
 		spa_aux_check_removed(&spa->spa_l2cache);
 	}
 
 	if (spa_writeable(spa)) {
 		/*
 		 * Update the config cache to include the newly-imported pool.
 		 */
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	}
 
 	/*
 	 * It's possible that the pool was expanded while it was exported.
 	 * We kick off an async task to handle this for us.
 	 */
 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 
 	mutex_exit(&spa_namespace_lock);
 	spa_history_log_version(spa, "import");
 
 #ifdef __FreeBSD__
 #ifdef _KERNEL
 	zvol_create_minors(pool);
 #endif
 #endif
 	return (0);
 }
 
 /*
  * Probe a candidate pool configuration without importing it.
  *
  * 'tryconfig' is loaded into a scratch spa named TRYIMPORT_NAME that is
  * activated with FREAD.  If a root vdev could be built from it, a newly
  * generated config is returned, annotated with the caller's pool name and
  * state, the uberblock timestamp, the load info, the bootfs dataset name
  * (when one is set and can be resolved) and the spare and l2cache lists.
  *
  * Returns NULL if 'tryconfig' lacks a pool name or a pool state, or if the
  * load did not produce a root vdev.  The scratch spa is unloaded, deactivated
  * and removed before returning in every case that created it.
  */
 nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
 	nvlist_t *newconfig = NULL;
 	char *poolname;
 	uint64_t state;
 	spa_t *spa;
 	int error;
 
 	/* Both the pool name and the pool state are mandatory. */
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME,
 	    &poolname) != 0 ||
 	    nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE,
 	    &state) != 0)
 		return (NULL);
 
 	/*
 	 * Create and initialize the scratch spa, then pass off the heavy
 	 * lifting to spa_load().  TRUE is passed for mosconfig because the
 	 * user-supplied config is actually the one to trust when doing an
 	 * import.
 	 */
 	mutex_enter(&spa_namespace_lock);
 	spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL);
 	spa_activate(spa, FREAD);
 	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE);
 
 	/* There is nothing to report unless 'tryconfig' was parsable. */
 	if (spa->spa_root_vdev == NULL)
 		goto out;
 
 	newconfig = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 	VERIFY(nvlist_add_string(newconfig, ZPOOL_CONFIG_POOL_NAME,
 	    poolname) == 0);
 	VERIFY(nvlist_add_uint64(newconfig, ZPOOL_CONFIG_POOL_STATE,
 	    state) == 0);
 	VERIFY(nvlist_add_uint64(newconfig, ZPOOL_CONFIG_TIMESTAMP,
 	    spa->spa_uberblock.ub_timestamp) == 0);
 	VERIFY(nvlist_add_nvlist(newconfig, ZPOOL_CONFIG_LOAD_INFO,
 	    spa->spa_load_info) == 0);
 
 	/*
 	 * Copy out the bootfs property, if the pool has one, so that
 	 * external consumers can tell which pools are bootable.
 	 */
 	if ((error == 0 || error == EEXIST) && spa->spa_bootfs != 0) {
 		char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 		/*
 		 * The pool was opened as TRYIMPORT_NAME, so the leading pool
 		 * component of the resolved name is replaced with the real
 		 * pool name whenever the name has a '/' in it.
 		 */
 		if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs,
 		    tmpname) == 0) {
 			char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 			char *slash = strchr(tmpname, '/');
 
 			if (slash != NULL) {
 				(void) snprintf(dsname, MAXPATHLEN, "%s/%s",
 				    poolname, slash + 1);
 			} else {
 				(void) strlcpy(dsname, tmpname, MAXPATHLEN);
 			}
 			VERIFY(nvlist_add_string(newconfig,
 			    ZPOOL_CONFIG_BOOTFS, dsname) == 0);
 			kmem_free(dsname, MAXPATHLEN);
 		}
 		kmem_free(tmpname, MAXPATHLEN);
 	}
 
 	/* Add the list of hot spares and level 2 cache devices. */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	spa_add_spares(spa, newconfig);
 	spa_add_l2cache(spa, newconfig);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 out:
 	/* Tear the scratch spa down again; only the config survives. */
 	spa_unload(spa);
 	spa_deactivate(spa);
 	spa_remove(spa);
 	mutex_exit(&spa_namespace_lock);
 
 	return (newconfig);
 }
 
 /*
  * Pool export/destroy
  *
  * The act of destroying or exporting a pool is very simple.  We make sure there
  * is no more pending I/O and any references to the pool are gone.  Then, we
  * update the pool state and sync all the labels to disk, removing the
  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
  * we don't sync the labels or remove the configuration cache.
  *
  * Arguments:
  *	pool		name of the pool to look up in the namespace
  *	new_state	POOL_STATE_DESTROYED, POOL_STATE_EXPORTED, or
  *			POOL_STATE_UNINITIALIZED (the last is used by
  *			spa_reset(): the spa is unloaded but neither its
  *			labels nor its namespace entry are touched)
  *	oldconfig	if non-NULL, set to NULL on entry and, when the spa
  *			has a config, to a duplicate of it on success
  *	force		permit exporting a pool with an active shared spare
  *	hardforce	skip dirtying the labels and syncing the config cache
  *
  * Returns 0 on success, EROFS if spa_mode_global lacks FWRITE, ENOENT if the
  * pool is not found, EBUSY if the pool still has references, or EXDEV if it
  * has an active shared spare and 'force' is not set.
  */
 static int
 spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
     boolean_t force, boolean_t hardforce)
 {
 	spa_t *spa;
 
 	if (oldconfig)
 		*oldconfig = NULL;
 
 	if (!(spa_mode_global & FWRITE))
 		return (SET_ERROR(EROFS));
 
 	mutex_enter(&spa_namespace_lock);
 	if ((spa = spa_lookup(pool)) == NULL) {
 		mutex_exit(&spa_namespace_lock);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/*
 	 * Put a hold on the pool, drop the namespace lock, stop async tasks,
 	 * reacquire the namespace lock, and see if we can export.
 	 */
 	spa_open_ref(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 	spa_async_suspend(spa);
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 
 	/*
 	 * The pool will be in core if it's openable,
 	 * in which case we can modify its state.
 	 */
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) {
 		/*
 		 * Objsets may be open only because they're dirty, so we
 		 * have to force it to sync before checking spa_refcnt.
 		 */
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 
 		/*
 		 * A pool cannot be exported or destroyed if there are active
 		 * references.  If we are resetting a pool, allow references by
 		 * fault injection handlers.
 		 */
 		if (!spa_refcount_zero(spa) ||
 		    (spa->spa_inject_ref != 0 &&
 		    new_state != POOL_STATE_UNINITIALIZED)) {
 			/* Undo the spa_async_suspend() above before bailing. */
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(EBUSY));
 		}
 
 		/*
 		 * A pool cannot be exported if it has an active shared spare.
 		 * This is to prevent other pools stealing the active spare
 		 * from an exported pool. At user's own will, such pool can
 		 * be forcedly exported.
 		 */
 		if (!force && new_state == POOL_STATE_EXPORTED &&
 		    spa_has_active_shared_spare(spa)) {
 			/* Undo the spa_async_suspend() above before bailing. */
 			spa_async_resume(spa);
 			mutex_exit(&spa_namespace_lock);
 			return (SET_ERROR(EXDEV));
 		}
 
 		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
 		 * final sync that pushes these changes out.
 		 */
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_state = new_state;
 			spa->spa_final_txg = spa_last_synced_txg(spa) +
 			    TXG_DEFER_SIZE + 1;
 			vdev_config_dirty(spa->spa_root_vdev);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	}
 
 	/*
 	 * Note that this event is posted on every successful path through
 	 * here, i.e. for export and reset as well as for destroy.
 	 */
 	spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY);
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 	}
 
 	/* Hand the caller a copy of the config before the spa goes away. */
 	if (oldconfig && spa->spa_config)
 		VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0);
 
 	/* A reset (POOL_STATE_UNINITIALIZED) keeps the spa in the namespace. */
 	if (new_state != POOL_STATE_UNINITIALIZED) {
 		if (!hardforce)
 			spa_config_sync(spa, B_TRUE, B_TRUE);
 		spa_remove(spa);
 	}
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Destroy a storage pool.
  *
  * Runs the common export path with POOL_STATE_DESTROYED as the target state,
  * without 'force' or 'hardforce', and without requesting the old config.
  * Returns whatever spa_export_common() returns.
  */
 int
 spa_destroy(char *pool)
 {
 	int rc;
 
 	rc = spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE,
 	    B_FALSE);
 
 	return (rc);
 }
 
 /*
  * Export a storage pool.
  *
  * Runs the common export path with POOL_STATE_EXPORTED as the target state,
  * forwarding 'oldconfig', 'force' and 'hardforce' unchanged.  Returns
  * whatever spa_export_common() returns.
  */
 int
 spa_export(char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce)
 {
 	int rc;
 
 	rc = spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force,
 	    hardforce);
 
 	return (rc);
 }
 
 /*
  * Similar to spa_export(), this unloads the spa_t without actually removing it
  * from the namespace in any way.
  *
  * Implemented as the common export path with POOL_STATE_UNINITIALIZED as the
  * target state, without 'force' or 'hardforce', and without requesting the
  * old config.  Returns whatever spa_export_common() returns.
  */
 int
 spa_reset(char *pool)
 {
 	int rc;
 
 	rc = spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE,
 	    B_FALSE);
 
 	return (rc);
 }
 
 /*
  * ==========================================================================
  * Device manipulation
  * ==========================================================================
  */
 
 /*
  * Add a device to a storage pool.
  *
  * 'nvroot' describes the new top-level vdevs and may additionally carry
  * ZPOOL_CONFIG_SPARES and/or ZPOOL_CONFIG_L2CACHE arrays.  At least one of
  * the three must be non-empty, otherwise the request is rejected with EINVAL.
  *
  * Failures from spa_config_parse(), vdev_create() and spa_validate_aux() are
  * passed to spa_vdev_exit() and its result is returned.  On the success path
  * the result of spa_vdev_exit() is ignored and 0 is returned.
  */
 int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot)
 {
 	uint64_t txg, id;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *tvd;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
 
 	/* A missing spares or l2cache array simply means "none requested". */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
 	    &nspares) != 0)
 		nspares = 0;
 
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
 	    &nl2cache) != 0)
 		nl2cache = 0;
 
 	/* Reject a request that adds nothing at all. */
 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
 	if (vd->vdev_children != 0 &&
 	    (error = vdev_create(vd, txg, B_FALSE)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
 	/*
 	 * We must validate the spares and l2cache devices after checking the
 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
 	 */
 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
 	/*
 	 * Transfer each new top-level vdev from vd to rvd.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 
 		/*
 		 * Set the vdev id to the first hole, if one exists.
 		 * If the scan finds no hole, id ends up equal to
 		 * rvd->vdev_children, i.e. the slot just past the last
 		 * existing child.
 		 */
 		for (id = 0; id < rvd->vdev_children; id++) {
 			if (rvd->vdev_child[id]->vdev_ishole) {
 				vdev_free(rvd->vdev_child[id]);
 				break;
 			}
 		}
 		tvd = vd->vdev_child[c];
 		vdev_remove_child(vd, tvd);
 		tvd->vdev_id = id;
 		vdev_add_child(rvd, tvd);
 		vdev_config_dirty(tvd);
 	}
 
 	/* Install the requested spares and flag them for syncing. */
 	if (nspares != 0) {
 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
 		    ZPOOL_CONFIG_SPARES);
 		spa_load_spares(spa);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/* Likewise for the requested level 2 cache devices. */
 	if (nl2cache != 0) {
 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
 		    ZPOOL_CONFIG_L2CACHE);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * We have to be careful when adding new vdevs to an existing pool.
 	 * If other threads start allocating from these vdevs before we
 	 * sync the config cache, and we lose power, then upon reboot we may
 	 * fail to open the pool because there are DVAs that the config cache
 	 * can't translate.  Therefore, we first add the vdevs without
 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
 	 * and then let spa_config_update() initialize the new metaslabs.
 	 *
 	 * spa_load() checks for added-but-not-initialized vdevs, so that
 	 * if we lose power at any point in this sequence, the remaining
 	 * steps will be completed the next time we load the pool.
 	 */
 	(void) spa_vdev_exit(spa, vd, txg, 0);
 
 	mutex_enter(&spa_namespace_lock);
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	mutex_exit(&spa_namespace_lock);
 
 	return (0);
 }
 
 /*
  * Attach a device to a mirror.  The arguments are the path to any device
  * in the mirror, and the nvroot for the new device.  If the path specifies
  * a device that is not mirrored, we automatically insert the mirror vdev.
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
  * mirror using the 'replacing' vdev, which is functionally identical to
  * the mirror vdev (it actually reuses all the same ops) but has a few
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
  * is automatically detached.
  *
  * Arguments:
  *	spa		pool to operate on; must be writeable
  *	guid		guid of the existing leaf vdev to attach to or replace
  *	nvroot		config whose root has exactly one leaf child: the
  *			new device
  *	replacing	non-zero to replace the existing device rather than
  *			mirror it
  *
  * Every failure is handed to spa_vdev_exit() and its result is returned.  The
  * error codes passed in are: ENODEV (guid not found), ENOTSUP (existing vdev
  * is not a leaf, a spare would replace a log, or the parent vdev type does
  * not permit the operation), EINVAL (nvroot could not be parsed, does not
  * have exactly one child, or that child is not a leaf), EOVERFLOW (new device
  * too small), EDOM (new device has a larger ashift than the top-level vdev),
  * or the error from vdev_create().  Returns 0 on success.
  */
 int
 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing)
 {
 	uint64_t txg, dtl_max_txg;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
 	vdev_ops_t *pvops;
 	char *oldvdpath, *newvdpath;
 	int newvd_isspare;
 	int error;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (oldvd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	if (!oldvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = oldvd->vdev_parent;
 
 	/* Any parse failure is reported to the caller as EINVAL. */
 	if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ATTACH)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	if (newrootvd->vdev_children != 1)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	newvd = newrootvd->vdev_child[0];
 
 	if (!newvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
 		return (spa_vdev_exit(spa, newrootvd, txg, error));
 
 	/*
 	 * Spares can't replace logs
 	 */
 	if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare)
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 	if (!replacing) {
 		/*
 		 * For attach, the only allowable parent is a mirror or the root
 		 * vdev.
 		 */
 		if (pvd->vdev_ops != &vdev_mirror_ops &&
 		    pvd->vdev_ops != &vdev_root_ops)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		pvops = &vdev_mirror_ops;
 	} else {
 		/*
 		 * Active hot spares can only be replaced by inactive hot
 		 * spares.
 		 */
 		if (pvd->vdev_ops == &vdev_spare_ops &&
 		    oldvd->vdev_isspare &&
 		    !spa_has_spare(spa, newvd->vdev_guid))
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		/*
 		 * If the source is a hot spare, and the parent isn't already a
 		 * spare, then we want to create a new hot spare.  Otherwise, we
 		 * want to create a replacing vdev.  The user is not allowed to
 		 * attach to a spared vdev child unless the 'isspare' state is
 		 * the same (spare replaces spare, non-spare replaces
 		 * non-spare).
 		 */
 		if (pvd->vdev_ops == &vdev_replacing_ops &&
 		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		} else if (pvd->vdev_ops == &vdev_spare_ops &&
 		    newvd->vdev_isspare != oldvd->vdev_isspare) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 
 		if (newvd->vdev_isspare)
 			pvops = &vdev_spare_ops;
 		else
 			pvops = &vdev_replacing_ops;
 	}
 
 	/*
 	 * Make sure the new device is big enough.
 	 */
 	if (newvd->vdev_asize < vdev_get_min_asize(oldvd))
 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 
 	/*
 	 * The new device cannot have a higher alignment requirement
 	 * than the top-level vdev.
 	 */
 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift)
 		return (spa_vdev_exit(spa, newrootvd, txg, EDOM));
 
 	/*
 	 * If this is an in-place replacement, update oldvd's path and devid
 	 * to make it distinguishable from newvd, and unopenable from now on.
 	 */
 	if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) {
 		/*
 		 * Room for the new path plus the "/old" suffix; sizeof
 		 * accounts for the terminating NUL.
 		 */
 		size_t oldpathlen = strlen(newvd->vdev_path) + sizeof ("/old");
 
 		spa_strfree(oldvd->vdev_path);
 		oldvd->vdev_path = kmem_alloc(oldpathlen, KM_SLEEP);
 		(void) snprintf(oldvd->vdev_path, oldpathlen, "%s/%s",
 		    newvd->vdev_path, "old");
 		if (oldvd->vdev_devid != NULL) {
 			spa_strfree(oldvd->vdev_devid);
 			oldvd->vdev_devid = NULL;
 		}
 	}
 
 	/* mark the device being resilvered */
 	newvd->vdev_resilver_txg = txg;
 
 	/*
 	 * If the parent is not a mirror, or if we're replacing, insert the new
 	 * mirror/replacing/spare vdev above oldvd.
 	 */
 	if (pvd->vdev_ops != pvops)
 		pvd = vdev_add_parent(oldvd, pvops);
 
 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
 	ASSERT(pvd->vdev_ops == pvops);
 	ASSERT(oldvd->vdev_parent == pvd);
 
 	/*
 	 * Extract the new device from its root and add it to pvd.
 	 */
 	vdev_remove_child(newrootvd, newvd);
 	newvd->vdev_id = pvd->vdev_children;
 	newvd->vdev_crtxg = oldvd->vdev_crtxg;
 	vdev_add_child(pvd, newvd);
 
 	tvd = newvd->vdev_top;
 	ASSERT(pvd->vdev_top == tvd);
 	ASSERT(tvd->vdev_parent == rvd);
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
 	 * for any dmu_sync-ed blocks.  It will propagate upward when
 	 * spa_vdev_exit() calls vdev_dtl_reassess().
 	 */
 	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
 	vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
 	    dtl_max_txg - TXG_INITIAL);
 
 	if (newvd->vdev_isspare) {
 		spa_spare_activate(newvd);
 		spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE);
 	}
 
 	/*
 	 * Snapshot what the history record below needs while the vdevs are
 	 * still being manipulated under spa_vdev_enter().
 	 */
 	oldvdpath = spa_strdup(oldvd->vdev_path);
 	newvdpath = spa_strdup(newvd->vdev_path);
 	newvd_isspare = newvd->vdev_isspare;
 
 	/*
 	 * Mark newvd's DTL dirty in this txg.
 	 */
 	vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
 	/*
 	 * Schedule the resilver to restart in the future. We do this to
 	 * ensure that dmu_sync-ed blocks have been stitched into the
 	 * respective datasets.
 	 */
 	dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg);
 
 	/*
 	 * Commit the config
 	 */
 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 
 	spa_history_log_internal(spa, "vdev attach", NULL,
 	    "%s vdev=%s %s vdev=%s",
 	    replacing && newvd_isspare ? "spare in" :
 	    replacing ? "replace" : "attach", newvdpath,
 	    replacing ? "for" : "to", oldvdpath);
 
 	spa_strfree(oldvdpath);
 	spa_strfree(newvdpath);
 
 	if (spa->spa_bootfs)
 		spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH);
 
 	return (0);
 }
 
 /*
  * Detach a device from a mirror or replacing vdev.
  *
  * If 'replace_done' is specified, only detach if the parent
  * is a replacing vdev.
  *
  * Arguments:
  *	spa		pool to operate on; must be writeable
  *	guid		guid of the leaf vdev to detach
  *	pguid		expected guid of that vdev's parent, or 0 to skip the
  *			parent check
  *	replace_done	non-zero to require a replacing or spare parent
  *
  * Early failures are handed to spa_vdev_exit() and its result is returned;
  * the error codes passed in are ENODEV (guid not found), ENOTSUP (not a leaf,
  * or the parent vdev type does not support the request) and EBUSY (parent
  * guid mismatch, or the device holds the only valid copy of some data).
  * Otherwise the result of the final spa_vdev_exit() call is returned.
  */
 int
 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
 	uint64_t txg;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
 	uint64_t unspare_guid = 0;
 	char *vdpath;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = vd->vdev_parent;
 
 	/*
 	 * If the parent/child relationship is not as expected, don't do it.
 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
 	 * vdev that's replacing B with C.  The user's intent in replacing
 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
 	 * the replace by detaching C, the expected behavior is to end up
 	 * M(A,B).  But suppose that right after deciding to detach C,
 	 * the replacement of B completes.  We would have M(A,C), and then
 	 * ask to detach C, which would leave us with just A -- not what
 	 * the user wanted.  To prevent this, we make sure that the
 	 * parent/child relationship hasn't changed -- in this example,
 	 * that C's parent is still the replacing vdev R.
 	 */
 	if (pvd->vdev_guid != pguid && pguid != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	/*
 	 * Only 'replacing' or 'spare' vdevs can be replaced.
 	 */
 	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
 	    spa_version(spa) >= SPA_VERSION_SPARES);
 
 	/*
 	 * Only mirror, replacing, and spare vdevs support detach.
 	 */
 	if (pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	/*
 	 * If this device has the only valid copy of some data,
 	 * we cannot safely detach it.
 	 */
 	if (vdev_dtl_required(vd))
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	ASSERT(pvd->vdev_children >= 2);
 
 	/*
 	 * If we are detaching the second disk from a replacing vdev, then
 	 * check to see if we changed the original vdev's path to have "/old"
 	 * at the end in spa_vdev_attach().  If so, undo that change now.
 	 */
 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
 	    vd->vdev_path != NULL) {
 		size_t len = strlen(vd->vdev_path);
 
 		for (int c = 0; c < pvd->vdev_children; c++) {
 			cvd = pvd->vdev_child[c];
 
 			if (cvd == vd || cvd->vdev_path == NULL)
 				continue;
 
 			/* Match a sibling whose path is exactly "<vd path>/old". */
 			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
 			    strcmp(cvd->vdev_path + len, "/old") == 0) {
 				spa_strfree(cvd->vdev_path);
 				cvd->vdev_path = spa_strdup(vd->vdev_path);
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If we are detaching the original disk from a spare, then it implies
 	 * that the spare should become a real disk, and be removed from the
 	 * active spare list for the pool.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops &&
 	    vd->vdev_id == 0 &&
 	    pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare)
 		unspare = B_TRUE;
 
 	/*
 	 * Erase the disk labels so the disk can be used for other things.
 	 * This must be done after all other error cases are handled,
 	 * but before we disembowel vd (so we can still do I/O to it).
 	 * But if we can't do it, don't treat the error as fatal --
 	 * it may be that the unwritability of the disk is the reason
 	 * it's being detached!
 	 *
 	 * The value stored in 'error' here is never examined; it is
 	 * overwritten by the spa_vdev_exit() call further down.
 	 */
 	error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/*
 	 * Remove vd from its parent and compact the parent's children.
 	 */
 	vdev_remove_child(pvd, vd);
 	vdev_compact_children(pvd);
 
 	/*
 	 * Remember one of the remaining children so we can get tvd below.
 	 */
 	cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
 	 * do it now, marking the vdev as no longer a spare in the process.
 	 * We must do this before vdev_remove_parent(), because that can
 	 * change the GUID if it creates a new toplevel GUID.  For a similar
 	 * reason, we must remove the spare now, in the same txg as the detach;
 	 * otherwise someone could attach a new sibling, change the GUID, and
 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
 	 */
 	if (unspare) {
 		ASSERT(cvd->vdev_isspare);
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 		cvd->vdev_unspare = B_TRUE;
 	}
 
 	/*
 	 * If the parent mirror/replacing vdev only has one child,
 	 * the parent is no longer needed.  Remove it from the tree.
 	 */
 	if (pvd->vdev_children == 1) {
 		if (pvd->vdev_ops == &vdev_spare_ops)
 			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
 	}
 
 
 	/*
 	 * We don't set tvd until now because the parent we just removed
 	 * may have been the previous top-level vdev.
 	 */
 	tvd = cvd->vdev_top;
 	ASSERT(tvd->vdev_parent == rvd);
 
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
 	vdev_propagate_state(cvd);
 
 	/*
 	 * If the 'autoexpand' property is set on the pool then automatically
 	 * try to expand the size of the pool. For example if the device we
 	 * just detached was smaller than the others, it may be possible to
 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
 	 * first so that we can obtain the updated sizes of the leaf vdevs.
 	 */
 	if (spa->spa_autoexpand) {
 		vdev_reopen(tvd);
 		vdev_expand(tvd, txg);
 	}
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
 	 * But first make sure we're not on any *other* txg's DTL list, to
 	 * prevent vd from being accessed after it's freed.
 	 *
 	 * The path is copied first because vd is handed to spa_vdev_exit()
 	 * below, while the history record is written after that call.
 	 */
 	vdpath = spa_strdup(vd->vdev_path);
 	for (int t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
 	spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE);
 
 	/* hang on to the spa before we release the lock */
 	spa_open_ref(spa, FTAG);
 
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	spa_history_log_internal(spa, "detach", NULL,
 	    "vdev=%s", vdpath);
 	spa_strfree(vdpath);
 
 	/*
 	 * If this was the removal of the original device in a hot spare vdev,
 	 * then we want to go through and remove the device from the hot spare
 	 * list of every other pool.
 	 */
 	if (unspare) {
 		spa_t *altspa = NULL;
 
 		mutex_enter(&spa_namespace_lock);
 		while ((altspa = spa_next(altspa)) != NULL) {
 			if (altspa->spa_state != POOL_STATE_ACTIVE ||
 			    altspa == spa)
 				continue;
 
 			/*
 			 * Hold altspa across the window in which the
 			 * namespace lock is dropped for spa_vdev_remove().
 			 */
 			spa_open_ref(altspa, FTAG);
 			mutex_exit(&spa_namespace_lock);
 			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
 			mutex_enter(&spa_namespace_lock);
 			spa_close(altspa, FTAG);
 		}
 		mutex_exit(&spa_namespace_lock);
 
 		/* search the rest of the vdevs for spares to remove */
 		spa_vdev_resilver_done(spa);
 	}
 
 	/* all done with the spa; OK to release */
 	mutex_enter(&spa_namespace_lock);
 	spa_close(spa, FTAG);
 	mutex_exit(&spa_namespace_lock);
 
 	return (error);
 }
 
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  */
 int
 spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp)
 {
 	int error = 0;
 	uint64_t txg, *glist;
 	spa_t *newspa;
 	uint_t c, children, lastlog;
 	nvlist_t **child, *nvl, *tmp;
 	dmu_tx_t *tx;
 	char *altroot = NULL;
 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
 	boolean_t activate_slog;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	/* clear the log and flush everything up to now */
 	activate_slog = spa_passivate_log(spa);
 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 	error = spa_offline_log(spa);
 	txg = spa_vdev_config_enter(spa);
 
 	if (activate_slog)
 		spa_activate_log(spa);
 
 	if (error != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	/* check new spa name before going any further */
 	if (spa_lookup(newname) != NULL)
 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
 
 	/*
 	 * scan through all the children to ensure they're all mirrors
 	 */
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* first, check to ensure we've got the right child count */
 	rvd = spa->spa_root_vdev;
 	lastlog = 0;
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		/* don't count the holes & logs as children */
 		if (vd->vdev_islog || vd->vdev_ishole) {
 			if (lastlog == 0)
 				lastlog = c;
 			continue;
 		}
 
 		lastlog = 0;
 	}
 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* next, ensure no spare or cache devices are part of the split */
 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
 
 	/* then, loop over each vdev and validate it */
 	for (c = 0; c < children; c++) {
 		uint64_t is_hole = 0;
 
 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
 		    &is_hole);
 
 		if (is_hole != 0) {
 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
 				continue;
 			} else {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 		}
 
 		/* which disk is going to be split? */
 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
 		    &glist[c]) != 0) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* look it up in the spa */
 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
 		if (vml[c] == NULL) {
 			error = SET_ERROR(ENODEV);
 			break;
 		}
 
 		/* make sure there's nothing stopping the split */
 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
 		    vml[c]->vdev_islog ||
 		    vml[c]->vdev_ishole ||
 		    vml[c]->vdev_isspare ||
 		    vml[c]->vdev_isl2cache ||
 		    !vdev_writeable(vml[c]) ||
 		    vml[c]->vdev_children != 0 ||
 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		if (vdev_dtl_required(vml[c])) {
 			error = SET_ERROR(EBUSY);
 			break;
 		}
 
 		/* we need certain info from the top level */
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
 		    vml[c]->vdev_top->vdev_ms_array) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
 		    vml[c]->vdev_top->vdev_ms_shift) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
 		    vml[c]->vdev_top->vdev_asize) == 0);
 		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
 		    vml[c]->vdev_top->vdev_ashift) == 0);
 	}
 
 	if (error != 0) {
 		kmem_free(vml, children * sizeof (vdev_t *));
 		kmem_free(glist, children * sizeof (uint64_t));
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/* stop writers from using the disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_TRUE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	/*
 	 * Temporarily record the splitting vdevs in the spa config.  This
 	 * will disappear once the config is regenerated.
 	 */
 	VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST,
 	    glist, children) == 0);
 	kmem_free(glist, children * sizeof (uint64_t));
 
 	mutex_enter(&spa->spa_props_lock);
 	VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT,
 	    nvl) == 0);
 	mutex_exit(&spa->spa_props_lock);
 	spa->spa_config_splitting = nvl;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	/* configure and create the new pool */
 	VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
 	    spa_version(spa)) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    spa->spa_config_txg) == 0);
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    spa_generate_guid(NULL)) == 0);
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 
 	/* add the new pool to the namespace */
 	newspa = spa_add(newname, config, altroot);
 	newspa->spa_config_txg = spa->spa_config_txg;
 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
 
 	/* release the spa config lock, retaining the namespace lock */
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 1);
 
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
 #ifndef sun
 	/* mark that we are creating new spa by splitting */
 	newspa->spa_splitting_newspa = B_TRUE;
 #endif
 	/* create the new pool from the disks of the original pool */
 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE);
 #ifndef sun
 	newspa->spa_splitting_newspa = B_FALSE;
 #endif
 	if (error)
 		goto out;
 
 	/* if that worked, generate a real config for the new pool */
 	if (newspa->spa_root_vdev != NULL) {
 		VERIFY(nvlist_alloc(&newspa->spa_config_splitting,
 		    NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_uint64(newspa->spa_config_splitting,
 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0);
 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
 		    B_TRUE));
 	}
 
 	/* set the props */
 	if (props != NULL) {
 		spa_configfile_set(newspa, props, B_FALSE);
 		error = spa_prop_set(newspa, props);
 		if (error)
 			goto out;
 	}
 
 	/* flush everything */
 	txg = spa_vdev_config_enter(newspa);
 	vdev_config_dirty(newspa->spa_root_vdev);
 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 2);
 
 	spa_async_resume(newspa);
 
 	/* finally, update the original pool's config */
 	txg = spa_vdev_config_enter(spa);
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error != 0)
 		dmu_tx_abort(tx);
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL) {
 			vdev_split(vml[c]);
 			if (error == 0)
 				spa_history_log_internal(spa, "detach", tx,
 				    "vdev=%s", vml[c]->vdev_path);
 			vdev_free(vml[c]);
 		}
 	}
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa->spa_config_splitting = NULL;
 	nvlist_free(nvl);
 	if (error == 0)
 		dmu_tx_commit(tx);
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 3);
 
 	/* split is complete; log a history record */
 	spa_history_log_internal(newspa, "split", NULL,
 	    "from pool %s", spa_name(spa));
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 
 	/* if we're not going to mount the filesystems in userland, export */
 	if (exp)
 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
 		    B_FALSE, B_FALSE);
 
 	return (error);
 
 out:
 	spa_unload(newspa);
 	spa_deactivate(newspa);
 	spa_remove(newspa);
 
 	txg = spa_vdev_config_enter(spa);
 
 	/* re-online all offlined disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_FALSE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	nvlist_free(spa->spa_config_splitting);
 	spa->spa_config_splitting = NULL;
 	(void) spa_vdev_exit(spa, NULL, txg, error);
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 	return (error);
 }
 
 /*
  * Scan the first 'count' entries of the nvlist array 'nvpp' and return the
  * one whose ZPOOL_CONFIG_GUID equals 'target_guid', or NULL when no entry
  * matches.  Every entry examined must carry a ZPOOL_CONFIG_GUID (VERIFY'd).
  */
 static nvlist_t *
 spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)
 {
 	int idx = 0;
 
 	while (idx < count) {
 		nvlist_t *candidate = nvpp[idx++];
 		uint64_t candidate_guid;
 
 		VERIFY(nvlist_lookup_uint64(candidate, ZPOOL_CONFIG_GUID,
 		    &candidate_guid) == 0);
 
 		if (candidate_guid == target_guid)
 			return (candidate);
 	}
 
 	return (NULL);
 }
 
 /*
  * Rebuild the nvlist array stored under 'name' in 'config' so that it holds
  * a duplicate of every entry of 'dev' except 'dev_to_remove'.  The new array
  * is always installed with count - 1 entries, so 'dev_to_remove' is expected
  * to be one of the 'count' entries of 'dev'.
  */
 static void
 spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
 	nvlist_t *dev_to_remove)
 {
 	nvlist_t **kept = NULL;
 	int nkept = 0;
 	int src, k;
 
 	/* A single-entry array shrinks to nothing; no scratch array needed. */
 	if (count > 1)
 		kept = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP);
 
 	for (src = 0; src < count; src++) {
 		if (dev[src] != dev_to_remove) {
 			VERIFY(nvlist_dup(dev[src], &kept[nkept],
 			    KM_SLEEP) == 0);
 			nkept++;
 		}
 	}
 
 	/* Swap the old array in the config for the shortened copy. */
 	VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0);
 	VERIFY(nvlist_add_nvlist_array(config, name, kept, count - 1) == 0);
 
 	/* The duplicates and the scratch array are released again here. */
 	for (k = 0; k < count - 1; k++)
 		nvlist_free(kept[k]);
 
 	if (count > 1)
 		kmem_free(kept, (count - 1) * sizeof (void *));
 }
 
 /*
  * Evacuate a top-level vdev prior to removal.
  *
  * Only log devices are handled: a log vdev with allocated space is emptied
  * via spa_offline_log(); any other vdev type yields ENOTSUP.  On success
  * the vdev is flagged as removing and its DTLs and config are dirtied
  * inside a spa_vdev_config_enter()/spa_vdev_config_exit() pair.
  *
  * Returns 0 on success or the error that prevented evacuation.
  */
 static int
 spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd)
 {
 	uint64_t cfg_txg;
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 	ASSERT(vd == vd->vdev_top);
 
 	/*
 	 * The config lock is deliberately not held as writer while
 	 * evacuating, since evacuation needs to do I/O; the
 	 * spa_namespace_lock stays held throughout.  When evacuation
 	 * completes the device should have no blocks allocated on it.
 	 */
 	if (!vd->vdev_islog)
 		return (SET_ERROR(ENOTSUP));
 
 	if (vd->vdev_stat.vs_alloc != 0) {
 		int err = spa_offline_log(spa);
 
 		if (err)
 			return (err);
 	}
 
 	/*
 	 * Evacuation succeeded.  Get rid of whatever MOS metadata is still
 	 * associated with this vdev, and wait for these changes to sync.
 	 */
 	ASSERT0(vd->vdev_stat.vs_alloc);
 	cfg_txg = spa_vdev_config_enter(spa);
 	vd->vdev_removing = B_TRUE;
 	vdev_dirty_leaves(vd, VDD_DTL, cfg_txg);
 	vdev_config_dirty(vd);
 	spa_vdev_config_exit(spa, NULL, cfg_txg, 0, FTAG);
 
 	return (0);
 }
 
 /*
  * Finish removing the top-level vdev 'vd' by taking it out of the root
  * vdev's child array.  Nothing is done if the vdev still has space
  * allocated.  The last child is simply compacted away; any other slot is
  * refilled with a hole vdev that takes over the removed vdev's id.
  */
 static void
 spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t slot = vd->vdev_id;
 	boolean_t was_last = (slot == (root->vdev_children - 1));
 
 	ASSERT(MUTEX_HELD(&spa_namespace_lock));
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 	ASSERT(vd == vd->vdev_top);
 
 	/* A device that still holds allocations is left alone. */
 	if (vd->vdev_stat.vs_alloc != 0)
 		return;
 
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/* Take the vdev off the dirty lists before it is freed. */
 	if (list_link_active(&vd->vdev_state_dirty_node))
 		vdev_state_clean(vd);
 	if (list_link_active(&vd->vdev_config_dirty_node))
 		vdev_config_clean(vd);
 
 	vdev_free(vd);
 
 	if (!was_last) {
 		vdev_t *hole = vdev_alloc_common(spa, slot, 0, &vdev_hole_ops);
 
 		vdev_add_child(root, hole);
 	} else {
 		vdev_compact_children(root);
 	}
 	vdev_config_dirty(root);
 
 	/* Re-evaluate the health of the root vdev. */
 	vdev_reopen(root);
 }
 
 /*
  * Remove a device from the pool -
  *
  * Removing a device from the vdev namespace requires several steps
  * and can take a significant amount of time.  As a result we use
  * the spa_vdev_config_[enter/exit] functions which allow us to
  * grab and release the spa_config_lock while still holding the namespace
  * lock.  During each step the configuration is synced out.
  *
  * Currently, this supports removing only hot spares, slogs, and level 2 ARC
  * devices.
  *
  * If the caller already holds spa_namespace_lock, this function does not
  * call spa_vdev_enter()/spa_vdev_exit() itself and returns the error
  * directly; otherwise it enters here and returns through spa_vdev_exit().
  *
  * Returns 0 on success, EBUSY for a hot spare that is in use in this pool
  * (unless 'unspare' is set), ENOTSUP for a normal (non-log) vdev, ENOENT
  * when no device has the given guid, or the error from evacuating a log
  * device.
  */
 int
 spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare)
 {
 	vdev_t *vd;
 	metaslab_group_t *mg;
 	nvlist_t **spares, **l2cache, *nv;
 	uint64_t txg = 0;
 	uint_t nspares, nl2cache;
 	int error = 0;
 	/* B_TRUE when the caller entered with the namespace lock held. */
 	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
 
 	ASSERT(spa_writeable(spa));
 
 	if (!locked)
 		txg = spa_vdev_enter(spa);
 
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	/*
 	 * The guid is matched against the spare list first, then the
 	 * l2cache list, and only then treated as an in-tree vdev.
 	 */
 	if (spa->spa_spares.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) {
 		/*
 		 * Only remove the hot spare if it's not currently in use
 		 * in this pool.
 		 */
 		if (vd == NULL || unspare) {
 			spa_vdev_remove_aux(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES, spares, nspares, nv);
 			spa_load_spares(spa);
 			spa->spa_spares.sav_sync = B_TRUE;
 		} else {
 			error = SET_ERROR(EBUSY);
 		}
 	} else if (spa->spa_l2cache.sav_vdevs != NULL &&
 	    nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 &&
 	    (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) {
 		/*
 		 * Cache devices can always be removed.
 		 */
 		spa_vdev_remove_aux(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	} else if (vd != NULL && vd->vdev_islog) {
 		/*
 		 * This path uses the txg obtained from spa_vdev_enter()
 		 * above and exits through spa_vdev_exit() on failure, so it
 		 * is only valid when we did the entering ourselves.
 		 */
 		ASSERT(!locked);
 		ASSERT(vd == vd->vdev_top);
 
 		mg = vd->vdev_mg;
 
 		/*
 		 * Stop allocating from this vdev.
 		 */
 		metaslab_group_passivate(mg);
 
 		/*
 		 * Wait for the youngest allocations and frees to sync,
 		 * and then wait for the deferral of those frees to finish.
 		 */
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 		/*
 		 * Attempt to evacuate the vdev.
 		 */
 		error = spa_vdev_remove_evacuate(spa, vd);
 
 		txg = spa_vdev_config_enter(spa);
 
 		/*
 		 * If we couldn't evacuate the vdev, unwind.
 		 */
 		if (error) {
 			metaslab_group_activate(mg);
 			return (spa_vdev_exit(spa, NULL, txg, error));
 		}
 
 		/*
 		 * Clean up the vdev namespace.
 		 */
 		spa_vdev_remove_from_namespace(spa, vd);
 
 	} else if (vd != NULL) {
 		/*
 		 * Normal vdevs cannot be removed (yet).
 		 */
 		error = SET_ERROR(ENOTSUP);
 	} else {
 		/*
 		 * There is no vdev of any kind with the specified guid.
 		 */
 		error = SET_ERROR(ENOENT);
 	}
 
 	/* Only undo the spa_vdev_enter() if this function performed it. */
 	if (!locked)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	return (error);
 }
 
 /*
  * Find any device that's done replacing, or a vdev marked 'unspare' that's
  * currently spared, so we can detach it.
  *
  * The subtree rooted at 'vd' is searched depth-first: children are examined
  * before 'vd' itself, and the first candidate found is returned.  Returns
  * the vdev to detach, or NULL if the subtree contains none.
  */
 static vdev_t *
 spa_vdev_resilver_done_hunt(vdev_t *vd)
 {
 	vdev_t *newvd, *oldvd;
 
 	/* Recurse first so candidates deeper in the tree take precedence. */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]);
 		if (oldvd != NULL)
 			return (oldvd);
 	}
 
 	/*
 	 * Check for a completed replacement.  We always consider the first
 	 * vdev in the list to be the oldest vdev, and the last one to be
 	 * the newest (see spa_vdev_attach() for how that works).  In
 	 * the case where the newest vdev is faulted, we will not automatically
 	 * remove it after a resilver completes.  This is OK as it will require
 	 * user intervention to determine which disk the admin wishes to keep.
 	 */
 	if (vd->vdev_ops == &vdev_replacing_ops) {
 		ASSERT(vd->vdev_children > 1);
 
 		newvd = vd->vdev_child[vd->vdev_children - 1];
 		oldvd = vd->vdev_child[0];
 
 		/*
 		 * The old vdev is a candidate once the new one has empty
 		 * MISSING and OUTAGE DTLs and the old one is not required.
 		 */
 		if (vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldvd))
 			return (oldvd);
 	}
 
 	/*
 	 * Check for a completed resilver with the 'unspare' flag set.
 	 */
 	if (vd->vdev_ops == &vdev_spare_ops) {
 		vdev_t *first = vd->vdev_child[0];
 		vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
 
 		/*
 		 * Whichever end carries vdev_unspare is the one to keep;
 		 * the opposite end becomes the detach candidate.
 		 */
 		if (last->vdev_unspare) {
 			oldvd = first;
 			newvd = last;
 		} else if (first->vdev_unspare) {
 			oldvd = last;
 			newvd = first;
 		} else {
 			oldvd = NULL;
 		}
 
 		if (oldvd != NULL &&
 		    vdev_dtl_empty(newvd, DTL_MISSING) &&
 		    vdev_dtl_empty(newvd, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldvd))
 			return (oldvd);
 
 		/*
 		 * If there are more than two spares attached to a disk,
 		 * and those spares are not required, then we want to
 		 * attempt to free them up now so that they can be used
 		 * by other pools.  Once we're back down to a single
 		 * disk+spare, we stop removing them.
 		 */
 		if (vd->vdev_children > 2) {
 			newvd = vd->vdev_child[1];
 
 			if (newvd->vdev_isspare && last->vdev_isspare &&
 			    vdev_dtl_empty(last, DTL_MISSING) &&
 			    vdev_dtl_empty(last, DTL_OUTAGE) &&
 			    !vdev_dtl_required(newvd))
 				return (newvd);
 		}
 	}
 
 	return (NULL);
 }
 
 /*
  * Detach every vdev that spa_vdev_resilver_done_hunt() reports as finished.
  *
  * The hunt runs with SCL_ALL held as writer.  The lock is dropped around
  * each spa_vdev_detach() call, so the guids needed for the detach are
  * captured beforehand and the hunt restarts from the root afterwards.
  * The function stops at the first detach that fails.
  */
 static void
 spa_vdev_resilver_done(spa_t *spa)
 {
 	vdev_t *vd, *pvd, *ppvd;
 	uint64_t guid, sguid, pguid, ppguid;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) {
 		pvd = vd->vdev_parent;
 		ppvd = pvd->vdev_parent;
 		guid = vd->vdev_guid;
 		pguid = pvd->vdev_guid;
 		ppguid = ppvd->vdev_guid;
 		/* sguid stays 0 unless a second detach is needed. */
 		sguid = 0;
 		/*
 		 * If we have just finished replacing a hot spared device, then
 		 * we need to detach the parent's first child (the original hot
 		 * spare) as well.
 		 */
 		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
 		    ppvd->vdev_children == 2) {
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
 			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
 		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
 
 		/*
 		 * The config lock is released before detaching, so the early
 		 * returns below leave with the lock already dropped.
 		 */
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 			return;
 		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
 			return;
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
  * Replace the stored path ('ispath' set) or FRU ('ispath' clear) string of
  * the leaf vdev identified by 'guid' with a copy of 'value'.
  *
  * Returns ENOENT if no vdev has that guid and ENOTSUP if the vdev is not a
  * leaf.  The vdev is handed to spa_vdev_state_exit() only when the stored
  * string actually changed.
  */
 int
 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
     boolean_t ispath)
 {
 	vdev_t *leaf;
 	boolean_t changed = B_FALSE;
 
 	ASSERT(spa_writeable(spa));
 
 	spa_vdev_state_enter(spa, SCL_ALL);
 
 	leaf = spa_lookup_by_guid(spa, guid, B_TRUE);
 	if (leaf == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
 
 	if (!leaf->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	if (!ispath) {
 		/* The FRU may be unset, in which case there is nothing to free. */
 		if (leaf->vdev_fru == NULL) {
 			changed = B_TRUE;
 		} else if (strcmp(value, leaf->vdev_fru) != 0) {
 			spa_strfree(leaf->vdev_fru);
 			changed = B_TRUE;
 		}
 		if (changed)
 			leaf->vdev_fru = spa_strdup(value);
 	} else if (strcmp(value, leaf->vdev_path) != 0) {
 		spa_strfree(leaf->vdev_path);
 		leaf->vdev_path = spa_strdup(value);
 		changed = B_TRUE;
 	}
 
 	if (changed)
 		return (spa_vdev_state_exit(spa, leaf, 0));
 
 	return (spa_vdev_state_exit(spa, NULL, 0));
 }
 
 /*
  * Set the stored device path of the vdev identified by 'guid'; a thin
  * wrapper that calls spa_vdev_set_common() with 'ispath' set.
  */
 int
 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
 {
 	int rc;
 
 	rc = spa_vdev_set_common(spa, guid, newpath, B_TRUE);
 	return (rc);
 }
 
 /*
  * Set the stored FRU of the vdev identified by 'guid'; a thin wrapper
  * that calls spa_vdev_set_common() with 'ispath' clear.
  */
 int
 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
 {
 	int rc;
 
 	rc = spa_vdev_set_common(spa, guid, newfru, B_FALSE);
 	return (rc);
 }
 
 /*
  * ==========================================================================
  * SPA Scanning
  * ==========================================================================
  */
 
 /*
  * Cancel the pool's scan through dsl_scan_cancel().  A resilver cannot be
  * stopped this way: EBUSY is returned while one is in progress.
  */
 int
 spa_scan_stop(spa_t *spa)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (!dsl_scan_resilvering(spa->spa_dsl_pool))
 		return (dsl_scan_cancel(spa->spa_dsl_pool));
 
 	return (SET_ERROR(EBUSY));
 }
 
 /*
  * Start a scan of type 'func' on the pool.
  *
  * Returns ENOTSUP for POOL_SCAN_NONE or an out-of-range function.  A
  * resilver request with nothing to resilver only queues
  * SPA_ASYNC_RESILVER_DONE and returns 0; everything else returns the
  * result of dsl_scan().
  */
 int
 spa_scan(spa_t *spa, pool_scan_func_t func)
 {
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0);
 
 	if (func == POOL_SCAN_NONE)
 		return (SET_ERROR(ENOTSUP));
 	if (func >= POOL_SCAN_FUNCS)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Anything other than a resilver, or a resilver for which some
 	 * writeable leaf device has a DTL, goes straight to dsl_scan().
 	 */
 	if (func != POOL_SCAN_RESILVER ||
 	    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL))
 		return (dsl_scan(spa->spa_dsl_pool, func));
 
 	/* A resilver was requested but there is nothing to do. */
 	spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 	return (0);
 }
 
 /*
  * ==========================================================================
  * SPA async task processing
  * ==========================================================================
  */
 
 /*
  * Walk the vdev tree rooted at 'vd' and move every vdev that has
  * vdev_remove_wanted set into VDEV_STATE_REMOVED, clearing its error
  * counters and dirtying the state of its top-level vdev.
  */
 static void
 spa_async_remove(spa_t *spa, vdev_t *vd)
 {
 	int child = 0;
 
 	if (vd->vdev_remove_wanted) {
 		vd->vdev_remove_wanted = B_FALSE;
 		vd->vdev_delayed_close = B_FALSE;
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
 
 		/*
 		 * Only the error counters are reset.  A full vdev_clear()
 		 * would also discard degraded/faulted state and try to
 		 * reopen the device, all of which is a waste here.
 		 */
 		vd->vdev_stat.vs_checksum_errors = 0;
 		vd->vdev_stat.vs_write_errors = 0;
 		vd->vdev_stat.vs_read_errors = 0;
 
 		vdev_state_dirty(vd->vdev_top);
 	}
 
 	while (child < vd->vdev_children) {
 		spa_async_remove(spa, vd->vdev_child[child]);
 		child++;
 	}
 }
 
 /*
  * Walk the vdev tree rooted at 'vd' and reopen every vdev that has
  * vdev_probe_wanted set, clearing the flag first.
  */
 static void
 spa_async_probe(spa_t *spa, vdev_t *vd)
 {
 	int child = 0;
 
 	if (vd->vdev_probe_wanted) {
 		vd->vdev_probe_wanted = B_FALSE;
 		/* The actual probe is performed by vdev_open(). */
 		vdev_reopen(vd);
 	}
 
 	while (child < vd->vdev_children) {
 		spa_async_probe(spa, vd->vdev_child[child]);
 		child++;
 	}
 }
 
 /*
  * When the pool has autoexpand enabled, walk the vdev tree rooted at 'vd'
  * and log an ESC_ZFS_VDEV_AUTOEXPAND sysevent for every leaf vdev that has
  * a physical path.  The event carries "/devices" followed by that path.
  */
 static void
 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
 {
 	sysevent_id_t event_id;
 	nvlist_t *payload;
 	char *devpath;
 	int child;
 
 	if (!spa->spa_autoexpand)
 		return;
 
 	/* Children are visited before the vdev itself. */
 	for (child = 0; child < vd->vdev_children; child++)
 		spa_async_autoexpand(spa, vd->vdev_child[child]);
 
 	if (vd->vdev_ops->vdev_op_leaf && vd->vdev_physpath != NULL) {
 		devpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 		(void) snprintf(devpath, MAXPATHLEN, "/devices%s",
 		    vd->vdev_physpath);
 
 		VERIFY(nvlist_alloc(&payload, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 		VERIFY(nvlist_add_string(payload, DEV_PHYS_PATH,
 		    devpath) == 0);
 
 		(void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS,
 		    ESC_ZFS_VDEV_AUTOEXPAND, payload, &event_id, DDI_SLEEP);
 
 		nvlist_free(payload);
 		kmem_free(devpath, MAXPATHLEN);
 	}
 }
 
 /*
  * Body of the general async worker thread ('arg' is the spa_t).
  *
  * It snapshots the pending task mask under spa_async_lock and then handles,
  * in order: config update, autoexpand, probe, resilver-done and resilver.
  * SPA_ASYNC_REMOVE is not handled here; spa_async_thread_vd() acts on it.
  * Before exiting, the thread clears spa_async_thread and broadcasts
  * spa_async_cv.
  */
 static void
 spa_async_thread(void *arg)
 {
 	spa_t *spa = arg;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	/*
 	 * Take a snapshot of the pending tasks and clear every bit except
 	 * SPA_ASYNC_REMOVE, which this thread does not process.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 	spa->spa_async_tasks &= SPA_ASYNC_REMOVE;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
 	 * See if the config needs to be updated.
 	 */
 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
 		uint64_t old_space, new_space;
 
 		mutex_enter(&spa_namespace_lock);
 		old_space = metaslab_class_get_space(spa_normal_class(spa));
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 		new_space = metaslab_class_get_space(spa_normal_class(spa));
 		mutex_exit(&spa_namespace_lock);
 
 		/*
 		 * If the pool grew as a result of the config update,
 		 * then log an internal history event.
 		 */
 		if (new_space != old_space) {
 			spa_history_log_internal(spa, "vdev online", NULL,
 			    "pool '%s' size: %llu(+%llu)",
 			    spa_name(spa), new_space, new_space - old_space);
 		}
 	}
 
 	/* Autoexpand is skipped while the pool is suspended. */
 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_async_autoexpand(spa, spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	/*
 	 * See if any devices need to be probed.
 	 */
 	if (tasks & SPA_ASYNC_PROBE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_probe(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/*
 	 * If any devices are done replacing, detach them.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER_DONE)
 		spa_vdev_resilver_done(spa);
 
 	/*
 	 * Kick off a resilver.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER)
 		dsl_resilver_restart(spa->spa_dsl_pool, 0);
 
 	/*
 	 * Let the world know that we're done.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_thread = NULL;
 	cv_broadcast(&spa->spa_async_cv);
 	mutex_exit(&spa->spa_async_lock);
 	thread_exit();
 }
 
 /*
  * Body of the async worker thread dedicated to SPA_ASYNC_REMOVE ('arg' is
  * the spa_t).
  *
  * It marks removal-wanted vdevs in the root tree, the l2cache devices and
  * the spares as REMOVED, and repeats for as long as new SPA_ASYNC_REMOVE
  * requests arrive meanwhile.  Before exiting, the thread clears
  * spa_async_thread_vd and broadcasts spa_async_cv.
  */
 static void
 spa_async_thread_vd(void *arg)
 {
 	spa_t *spa = arg;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 	/*
 	 * 'retry' is reached with spa_async_lock held, both on first entry
 	 * and from the goto below.  Only the REMOVE bit is consumed here.
 	 */
 retry:
 	spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
 	 * See if any devices need to be marked REMOVED.
 	 */
 	if (tasks & SPA_ASYNC_REMOVE) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_remove(spa, spa->spa_root_vdev);
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]);
 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/*
 	 * Let the world know that we're done.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 	/* A REMOVE request that arrived while we worked gets another pass. */
 	if ((tasks & SPA_ASYNC_REMOVE) != 0)
 		goto retry;
 	spa->spa_async_thread_vd = NULL;
 	cv_broadcast(&spa->spa_async_cv);
 	mutex_exit(&spa->spa_async_lock);
 	thread_exit();
 }
 
 /*
  * Suspend async task processing for the pool.
  *
  * The suspend count is raised, which stops spa_async_dispatch() and
  * spa_async_dispatch_vd() from starting new threads, and the caller then
  * blocks until neither async thread is running.  Both spa_async_thread()
  * and spa_async_thread_vd() clear their own pointer and broadcast
  * spa_async_cv when they exit, so the wait must continue while EITHER
  * pointer is still set.  Waiting only while both are set would return with
  * one of the threads still running.
  *
  * Each call is balanced by a spa_async_resume().
  */
 void
 spa_async_suspend(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended++;
 	while (spa->spa_async_thread != NULL ||
 	    spa->spa_async_thread_vd != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Undo one spa_async_suspend() by lowering the suspend count.  The count
  * must be non-zero on entry (ASSERTed).
  */
 void
 spa_async_resume(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 
 	ASSERT(spa->spa_async_suspended != 0);
 	spa->spa_async_suspended -= 1;
 
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Report whether the general async thread has anything to do.
  *
  * SPA_ASYNC_REMOVE never counts.  SPA_ASYNC_CONFIG_UPDATE counts only when
  * it is not being held back: after spa_ccw_fail_time has been set, the
  * config task stays suspended until zfs_ccw_retry_interval seconds have
  * elapsed since that time.  Any other pending task always counts.
  */
 static boolean_t
 spa_async_tasks_pending(spa_t *spa)
 {
 	uint_t other_tasks;
 	uint_t cfg_task;
 	boolean_t cfg_held_back = B_FALSE;
 
 	cfg_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE;
 	other_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE |
 	    SPA_ASYNC_REMOVE);
 
 	if (spa->spa_ccw_fail_time != 0) {
 		cfg_held_back =
 		    (gethrtime() - spa->spa_ccw_fail_time) <
 		    (zfs_ccw_retry_interval * NANOSEC);
 	}
 
 	return (other_tasks || (cfg_task && !cfg_held_back));
 }
 
 /*
  * Start the general async thread if there is work pending for it, async
  * processing is not suspended, no such thread already exists, and rootdir
  * has been set.
  */
 static void
 spa_async_dispatch(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	if (spa_async_tasks_pending(spa)) {
 		if (!spa->spa_async_suspended &&
 		    spa->spa_async_thread == NULL &&
 		    rootdir != NULL) {
 			spa->spa_async_thread = thread_create(NULL, 0,
 			    spa_async_thread, spa, 0, &p0, TS_RUN,
 			    maxclsyspri);
 		}
 	}
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Start the SPA_ASYNC_REMOVE worker thread if a remove request is pending,
  * async processing is not suspended, no such thread already exists, and
  * rootdir has been set.
  */
 static void
 spa_async_dispatch_vd(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0) {
 		if (!spa->spa_async_suspended &&
 		    spa->spa_async_thread_vd == NULL &&
 		    rootdir != NULL) {
 			spa->spa_async_thread_vd = thread_create(NULL, 0,
 			    spa_async_thread_vd, spa, 0, &p0, TS_RUN,
 			    maxclsyspri);
 		}
 	}
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Record 'task' in the pool's pending async task mask and give the
  * SPA_ASYNC_REMOVE worker a chance to start right away.
  */
 void
 spa_async_request(spa_t *spa, int task)
 {
 	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_tasks = spa->spa_async_tasks | task;
 	mutex_exit(&spa->spa_async_lock);
 
 	spa_async_dispatch_vd(spa);
 }
 
 /*
  * ==========================================================================
  * SPA syncing routines
  * ==========================================================================
  */
 
 /*
  * Iteration callback: append 'bp' to the bpobj passed as 'arg'.
  * Always returns 0.
  */
 static int
 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	bpobj_enqueue((bpobj_t *)arg, bp, tx);
 
 	return (0);
 }
 
 /*
  * Iteration callback: issue a free of 'bp' in tx's txg as a no-wait child
  * of the zio passed as 'arg'.  Always returns 0.
  */
 static int
 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	zio_t *parent = arg;
 	zio_t *free_zio;
 
 	free_zio = zio_free_sync(parent, parent->io_spa, dmu_tx_get_txg(tx),
 	    bp, BP_GET_PSIZE(bp), parent->io_flags);
 	zio_nowait(free_zio);
 
 	return (0);
 }
 
 /*
  * Free every block on 'bpl' under a single root zio and wait for all of the
  * frees to complete.
  *
  * Note: kept as a separate, non-inlined function so that the time spent
  * syncing frees is easy to measure with dtrace.
  */
 static void
 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
 {
 	zio_t *root_zio;
 
 	root_zio = zio_root(spa, NULL, NULL, 0);
 	bplist_iterate(bpl, spa_free_sync_cb, root_zio, tx);
 	VERIFY(zio_wait(root_zio) == 0);
 }
 
 /*
  * Free every block recorded in the pool's deferred bpobj under a single
  * root zio and wait for all of the frees to complete.
  *
  * Note: kept as a separate, non-inlined function so that the time spent
  * syncing deferred frees is easy to measure with dtrace.
  */
 static void
 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
 {
 	zio_t *root_zio;
 
 	root_zio = zio_root(spa, NULL, NULL, 0);
 	VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
 	    spa_free_sync_cb, root_zio, tx), ==, 0);
 	VERIFY0(zio_wait(root_zio));
 }
 
 
 /*
  * Write 'nv' in XDR-packed form to MOS object 'obj' and store the packed
  * size as a uint64_t in the object's bonus buffer.
  */
 static void
 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 {
 	dmu_buf_t *bonus;
 	size_t packed_len = 0;
 	size_t buf_len;
 	char *buf = NULL;
 
 	VERIFY(nvlist_size(nv, &packed_len, NV_ENCODE_XDR) == 0);
 
 	/*
 	 * The data is written out in whole SPA_CONFIG_BLOCKSIZE blocks.
 	 * Doing so avoids the dmu_buf_will_dirty() path and saves a
 	 * pre-read of data we don't actually care about.
 	 */
 	buf_len = P2ROUNDUP((uint64_t)packed_len, SPA_CONFIG_BLOCKSIZE);
 	buf = kmem_alloc(buf_len, KM_SLEEP);
 
 	VERIFY(nvlist_pack(nv, &buf, &packed_len, NV_ENCODE_XDR,
 	    KM_SLEEP) == 0);
 	/* Zero the slack between the packed data and the block boundary. */
 	bzero(buf + packed_len, buf_len - packed_len);
 
 	dmu_write(spa->spa_meta_objset, obj, 0, buf_len, buf, tx);
 
 	kmem_free(buf, buf_len);
 
 	/* Record the real (unpadded) size in the bonus buffer. */
 	VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &bonus));
 	dmu_buf_will_dirty(bonus, tx);
 	*(uint64_t *)bonus->db_data = packed_len;
 	dmu_buf_rele(bonus, FTAG);
 }
 
 /*
  * Write the device list of the aux vdev set 'sav' to its MOS object when
  * sav_sync is set, creating the object (linked from the pool directory
  * under 'entry') on first use.  The list is stored as an nvlist array named
  * 'config'.  sav_sync is cleared afterwards.
  */
 static void
 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
     const char *config, const char *entry)
 {
 	nvlist_t *root_nv;
 	nvlist_t **devs;
 	int d;
 
 	if (!sav->sav_sync)
 		return;
 
 	/*
 	 * Refresh the MOS nvlist that describes the available devices.
 	 * By this point spa_validate_aux() will have checked that the
 	 * nvlist is valid and the vdevs are labeled appropriately.
 	 */
 	if (sav->sav_object == 0) {
 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
 		    sizeof (uint64_t), tx);
 		VERIFY(zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
 		    &sav->sav_object, tx) == 0);
 	}
 
 	VERIFY(nvlist_alloc(&root_nv, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 	if (sav->sav_count != 0) {
 		devs = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP);
 		for (d = 0; d < sav->sav_count; d++) {
 			devs[d] = vdev_config_generate(spa, sav->sav_vdevs[d],
 			    B_FALSE, VDEV_CONFIG_L2CACHE);
 		}
 		VERIFY(nvlist_add_nvlist_array(root_nv, config, devs,
 		    sav->sav_count) == 0);
 		for (d = 0; d < sav->sav_count; d++)
 			nvlist_free(devs[d]);
 		kmem_free(devs, sav->sav_count * sizeof (void *));
 	} else {
 		/* No devices: store an empty array under 'config'. */
 		VERIFY(nvlist_add_nvlist_array(root_nv, config, NULL, 0) == 0);
 	}
 
 	spa_sync_nvlist(spa, sav->sav_object, root_nv, tx);
 	nvlist_free(root_nv);
 
 	sav->sav_sync = B_FALSE;
 }
 
 /*
  * If any vdev config is dirty, generate a fresh pool config for tx's txg,
  * remember it in spa_config_syncing (replacing any previous one) and write
  * it to the pool's config object.
  */
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
 	nvlist_t *new_config;
 
 	if (list_is_empty(&spa->spa_config_dirty_list))
 		return;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 	new_config = spa_config_generate(spa, spa->spa_root_vdev,
 	    dmu_tx_get_txg(tx), B_FALSE);
 
 	/*
 	 * During a spa version upgrade the config object must be written
 	 * with the new version rather than the one last synced.
 	 */
 	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) {
 		fnvlist_add_uint64(new_config, ZPOOL_CONFIG_VERSION,
 		    spa->spa_uberblock.ub_version);
 	}
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (spa->spa_config_syncing)
 		nvlist_free(spa->spa_config_syncing);
 	spa->spa_config_syncing = new_config;
 
 	spa_sync_nvlist(spa, spa->spa_config_object, new_config, tx);
 }
 
 /*
  * Sync-task callback that sets the pool version.  'arg' points at the new
  * version (uint64_t).  The version is stored in the in-core uberblock, the
  * root vdev's config is dirtied, and the change is logged to pool history.
  */
 static void
 spa_sync_version(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	uint64_t new_version = *(uint64_t *)arg;
 
 	/*
 	 * Pool creation sets the version through a special-cased path,
 	 * so this must never run in the initial txg.
 	 */
 	ASSERT(tx->tx_txg != TXG_INITIAL);
 
 	ASSERT(SPA_VERSION_IS_SUPPORTED(new_version));
 	ASSERT(new_version >= spa_version(spa));
 
 	spa->spa_uberblock.ub_version = new_version;
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa_history_log_internal(spa, "set", tx, "version=%lld", new_version);
 }
 
 /*
  * Set zpool properties.
  *
  * Sync-task callback: 'arg' is an nvlist of property name/value pairs.
  * Each pair is applied in turn while spa_props_lock is held.  Feature
  * properties enable the named feature; version, altroot, readonly and
  * cachefile are not stored here; comment is kept in the spa_t; all other
  * properties are written to the pool props ZAP object, and a few of them
  * are also cached in the spa_t.
  */
 static void
 spa_sync_props(void *arg, dmu_tx_t *tx)
 {
 	nvlist_t *nvp = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	nvpair_t *elem = NULL;
 
 	mutex_enter(&spa->spa_props_lock);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
 		uint64_t intval;
 		char *strval, *fname;
 		zpool_prop_t prop;
 		const char *propname;
 		zprop_type_t proptype;
 		spa_feature_t fid;
 
 		switch (prop = zpool_name_to_prop(nvpair_name(elem))) {
 		case ZPROP_INVAL:
 			/*
 			 * We checked this earlier in spa_prop_validate().
 			 */
 			ASSERT(zpool_prop_feature(nvpair_name(elem)));
 
 			/* The feature name is the part after the '@'. */
 			fname = strchr(nvpair_name(elem), '@') + 1;
 			VERIFY0(zfeature_lookup_name(fname, &fid));
 
 			spa_feature_enable(spa, fid, tx);
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=enabled", nvpair_name(elem));
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			intval = fnvpair_value_uint64(elem);
 			/*
 			 * The version is synced separately before other
 			 * properties and should be correct by now.
 			 */
 			ASSERT3U(spa_version(spa), >=, intval);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
 			/*
 			 * 'altroot' is a non-persistent property. It should
 			 * have been set temporarily at creation or import time.
 			 */
 			ASSERT(spa->spa_root != NULL);
 			break;
 
 		case ZPOOL_PROP_READONLY:
 		case ZPOOL_PROP_CACHEFILE:
 			/*
 			 * 'readonly' and 'cachefile' are also non-persistent
 			 * properties.
 			 */
 			break;
 		case ZPOOL_PROP_COMMENT:
 			strval = fnvpair_value_string(elem);
 			if (spa->spa_comment != NULL)
 				spa_strfree(spa->spa_comment);
 			spa->spa_comment = spa_strdup(strval);
 			/*
 			 * We need to dirty the configuration on all the vdevs
 			 * so that their labels get updated.  It's unnecessary
 			 * to do this for pool creation since the vdev's
 			 * configuration has already been dirtied.
 			 */
 			if (tx->tx_txg != TXG_INITIAL)
 				vdev_config_dirty(spa->spa_root_vdev);
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=%s", nvpair_name(elem), strval);
 			break;
 		default:
 			/*
 			 * Set pool property values in the poolprops mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
 				spa->spa_pool_props_object =
 				    zap_create_link(mos, DMU_OT_POOL_PROPS,
 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
 				    tx);
 			}
 
 			/* normalize the property name */
 			propname = zpool_prop_to_name(prop);
 			proptype = zpool_prop_get_type(prop);
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
 				strval = fnvpair_value_string(elem);
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    1, strlen(strval) + 1, strval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%s", nvpair_name(elem), strval);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				intval = fnvpair_value_uint64(elem);
 
 				/* Index values must map to a known string. */
 				if (proptype == PROP_TYPE_INDEX) {
 					const char *unused;
 					VERIFY0(zpool_prop_index_to_string(
 					    prop, intval, &unused));
 				}
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    8, 1, &intval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%lld", nvpair_name(elem), intval);
 			} else {
 				ASSERT(0); /* not allowed */
 			}
 
 			/*
 			 * Mirror selected properties into the in-core spa_t.
 			 * NOTE(review): intval is only assigned in the
 			 * DATA_TYPE_UINT64 branch above; the properties
 			 * listed here are presumably all uint64-valued --
 			 * confirm.
 			 */
 			switch (prop) {
 			case ZPOOL_PROP_DELEGATION:
 				spa->spa_delegation = intval;
 				break;
 			case ZPOOL_PROP_BOOTFS:
 				spa->spa_bootfs = intval;
 				break;
 			case ZPOOL_PROP_FAILUREMODE:
 				spa->spa_failmode = intval;
 				break;
 			case ZPOOL_PROP_AUTOEXPAND:
 				spa->spa_autoexpand = intval;
 				if (tx->tx_txg != TXG_INITIAL)
 					spa_async_request(spa,
 					    SPA_ASYNC_AUTOEXPAND);
 				break;
 			case ZPOOL_PROP_DEDUPDITTO:
 				spa->spa_dedup_ditto = intval;
 				break;
 			default:
 				break;
 			}
 		}
 
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Apply the one-time on-disk changes needed when the pool version crosses
  * an upgrade boundary in this txg.  spa_version() does not yet reflect the
  * new version during this txg, so nothing the upgrade code depends on may
  * change in this txg once it has run.  For that reason this must be
  * called after dsl_pool_sync() has executed the sync tasks.
  *
  * Each step compares the version in the last synced uberblock
  * (spa_ubsync) with the version in the uberblock being written
  * (spa_uberblock) and runs only when the boundary is crossed.
  */
 static void
 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	uint64_t synced_version;
 	uint64_t pending_version;
 
 	ASSERT(spa->spa_sync_pass == 1);
 
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
 	synced_version = spa->spa_ubsync.ub_version;
 	pending_version = spa->spa_uberblock.ub_version;
 
 	if (synced_version < SPA_VERSION_ORIGIN &&
 	    pending_version >= SPA_VERSION_ORIGIN) {
 		dsl_pool_create_origin(dp, tx);
 
 		/* The origin stays open, which raises spa_minref. */
 		spa->spa_minref += 3;
 	}
 
 	if (synced_version < SPA_VERSION_NEXT_CLONES &&
 	    pending_version >= SPA_VERSION_NEXT_CLONES) {
 		dsl_pool_upgrade_clones(dp, tx);
 	}
 
 	if (synced_version < SPA_VERSION_DIR_CLONES &&
 	    pending_version >= SPA_VERSION_DIR_CLONES) {
 		dsl_pool_upgrade_dir_clones(dp, tx);
 
 		/* The freedir stays open, which raises spa_minref. */
 		spa->spa_minref += 3;
 	}
 
 	if (synced_version < SPA_VERSION_FEATURES &&
 	    pending_version >= SPA_VERSION_FEATURES) {
 		spa_feature_create_zap_objects(spa, tx);
 	}
 
 	/*
 	 * The LZ4_COMPRESS feature became activate_on_enable when lz4
 	 * compression of metadata was introduced.  Older pools that have
 	 * the feature enabled but not yet active are upgraded here so
 	 * that the feature is active.
 	 */
 	if (pending_version >= SPA_VERSION_FEATURES) {
 		boolean_t enabled = spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 		boolean_t active = spa_feature_is_active(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 
 		if (enabled && !active)
 			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
 	}
 
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 }
 
 /*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
  *
  *	spa	pool to sync; must be writeable (enforced with VERIFY)
  *	txg	transaction group number being synced
  *
  * SCL_CONFIG is held as reader for almost the whole function; SCL_STATE
  * is taken around the vdev state conversion and around each attempt to
  * write the vdev labels.
  */
 void
 spa_sync(spa_t *spa, uint64_t txg)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
 	int error;
 
 	VERIFY(spa_writeable(spa));
 
 	/*
 	 * Lock out configuration changes.
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
 	/*
 	 * If there are any pending vdev state changes, convert them
 	 * into config changes that go out with this transaction group.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	while (list_head(&spa->spa_state_dirty_list) != NULL) {
 		/*
 		 * We need the write lock here because, for aux vdevs,
 		 * calling vdev_config_dirty() modifies sav_config.
 		 * This is ugly and will become unnecessary when we
 		 * eliminate the aux vdev wart by integrating all vdevs
 		 * into the root vdev tree.
 		 */
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
 		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
 			vdev_state_clean(vd);
 			vdev_config_dirty(vd);
 		}
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Record when this sync started and (re)arm the deadman timer
 	 * relative to it.
 	 */
 	spa->spa_sync_starttime = gethrtime();
 #ifdef illumos
 	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid,
 	    spa->spa_sync_starttime + spa->spa_deadman_synctime));
 #else	/* FreeBSD */
 #ifdef _KERNEL
 	callout_reset(&spa->spa_deadman_cycid,
 	    hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa);
 #endif
 #endif
 
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
 	 * set spa_deflate if we have no raid-z vdevs.
 	 */
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		int i;
 
 		for (i = 0; i < rvd->vdev_children; i++) {
 			vd = rvd->vdev_child[i];
 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
 				break;
 		}
 		/* The loop ran to completion: no child broke out early. */
 		if (i == rvd->vdev_children) {
 			spa->spa_deflate = TRUE;
 			VERIFY(0 == zap_add(spa->spa_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
 		}
 	}
 
 	/*
 	 * If anything has changed in this txg, or if someone is waiting
 	 * for this txg to sync (eg, spa_vdev_remove()), push the
 	 * deferred frees from the previous txg.  If not, leave them
 	 * alone so that we don't generate work on an otherwise idle
 	 * system.
 	 */
 	if (!txg_list_empty(&dp->dp_dirty_datasets, txg) ||
 	    !txg_list_empty(&dp->dp_dirty_dirs, txg) ||
 	    !txg_list_empty(&dp->dp_sync_tasks, txg) ||
 	    ((dsl_scan_active(dp->dp_scan) ||
 	    txg_sync_waiting(dp)) && !spa_shutting_down(spa))) {
 		spa_sync_deferred_frees(spa, tx);
 	}
 
 	/*
 	 * Iterate to convergence.
 	 */
 	do {
 		int pass = ++spa->spa_sync_pass;
 
 		spa_sync_config_object(spa, tx);
 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
 		spa_errlog_sync(spa, txg);
 		dsl_pool_sync(dp, txg);
 
 		/*
 		 * In early passes the frees are processed directly; from
 		 * pass zfs_sync_pass_deferred_free onward they are queued
 		 * on spa_deferred_bpobj instead.
 		 */
 		if (pass < zfs_sync_pass_deferred_free) {
 			spa_sync_frees(spa, free_bpl, tx);
 		} else {
 			bplist_iterate(free_bpl, bpobj_enqueue_cb,
 			    &spa->spa_deferred_bpobj, tx);
 		}
 
 		ddt_sync(spa, txg);
 		dsl_scan_sync(dp, tx);
 
 		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) !=
 		    NULL)
 			vdev_sync(vd, txg);
 
 		if (pass == 1)
 			spa_sync_upgrades(spa, tx);
 
 	} while (dmu_objset_is_dirty(mos, txg));
 
 	/*
 	 * Rewrite the vdev configuration (which includes the uberblock)
 	 * to commit the transaction group.
 	 *
 	 * If there are no dirty vdevs, we sync the uberblock to a few
 	 * random top-level vdevs that are known to be visible in the
 	 * config cache (see spa_vdev_add() for a complete description).
 	 * If there *are* dirty vdevs, sync the uberblock to all vdevs.
 	 */
 	for (;;) {
 		/*
 		 * We hold SCL_STATE to prevent vdev open/close/etc.
 		 * while we're attempting to write the vdev labels.
 		 */
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 		if (list_is_empty(&spa->spa_config_dirty_list)) {
 			vdev_t *svd[SPA_DVAS_PER_BP];
 			int svdcount = 0;
 			int children = rvd->vdev_children;
 			int c0 = spa_get_random(children);
 
 			/*
 			 * Starting at a random child, collect up to
 			 * SPA_DVAS_PER_BP top-level vdevs, skipping log
 			 * devices and those with vdev_ms_array == 0.
 			 */
 			for (int c = 0; c < children; c++) {
 				vd = rvd->vdev_child[(c0 + c) % children];
 				if (vd->vdev_ms_array == 0 || vd->vdev_islog)
 					continue;
 				svd[svdcount++] = vd;
 				if (svdcount == SPA_DVAS_PER_BP)
 					break;
 			}
 			/* On failure, retry once with the last flag B_TRUE. */
 			error = vdev_config_sync(svd, svdcount, txg, B_FALSE);
 			if (error != 0)
 				error = vdev_config_sync(svd, svdcount, txg,
 				    B_TRUE);
 		} else {
 			error = vdev_config_sync(rvd->vdev_child,
 			    rvd->vdev_children, txg, B_FALSE);
 			if (error != 0)
 				error = vdev_config_sync(rvd->vdev_child,
 				    rvd->vdev_children, txg, B_TRUE);
 		}
 
 		if (error == 0)
 			spa->spa_last_synced_guid = rvd->vdev_guid;
 
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		if (error == 0)
 			break;
 		/* Both attempts failed: suspend, wait for resume, retry. */
 		zio_suspend(spa, NULL);
 		zio_resume_wait(spa);
 	}
 	dmu_tx_commit(tx);
 
 	/* The sync has committed; disarm the deadman timer. */
 #ifdef illumos
 	VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY));
 #else	/* FreeBSD */
 #ifdef _KERNEL
 	callout_drain(&spa->spa_deadman_cycid);
 #endif
 #endif
 
 	/*
 	 * Clear the dirty config list.
 	 */
 	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
 		vdev_config_clean(vd);
 
 	/*
 	 * Now that the new config has synced transactionally,
 	 * let it become visible to the config cache.
 	 */
 	if (spa->spa_config_syncing != NULL) {
 		spa_config_set(spa, spa->spa_config_syncing);
 		spa->spa_config_txg = txg;
 		spa->spa_config_syncing = NULL;
 	}
 
 	/* The uberblock just written becomes the last synced one. */
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	dsl_pool_sync_done(dp, txg);
 
 	/*
 	 * Update usable space statistics.
 	 */
 	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list,
 	    TXG_CLEAN(txg))) != NULL)
 		vdev_sync_done(vd, txg);
 
 	spa_update_dspace(spa);
 
 	/*
 	 * It had better be the case that we didn't dirty anything
 	 * since vdev_config_sync().
 	 */
 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
 
 	spa->spa_sync_pass = 0;
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_handle_ignored_writes(spa);
 
 	/*
 	 * If any async tasks have been requested, kick them off.
 	 */
 	spa_async_dispatch(spa);
 	spa_async_dispatch_vd(spa);
 }
 
 /*
  * Sync all pools.  Only pools that are active, writeable and not suspended
  * are synced.  The namespace lock is not held across the sync itself:
  * a reference is taken on the spa_t, the lock is dropped while waiting for
  * the sync, and the lock is retaken before the reference is released.
  */
 void
 spa_sync_allpools(void)
 {
 	spa_t *spa;
 
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
 		if (spa_state(spa) == POOL_STATE_ACTIVE &&
 		    spa_writeable(spa) && !spa_suspended(spa)) {
 			spa_open_ref(spa, FTAG);
 			mutex_exit(&spa_namespace_lock);
 			txg_wait_synced(spa_get_dsl(spa), 0);
 			mutex_enter(&spa_namespace_lock);
 			spa_close(spa, FTAG);
 		}
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * ==========================================================================
  * Miscellaneous routines
  * ==========================================================================
  */
 
 /*
  * Remove all pools in the system.
  *
  * Every pool is expected to be closed by now, so each spa in the AVL tree
  * should be unreferenced.  The loop always restarts from the head of the
  * namespace because each iteration removes the pool it just handled.
  */
 void
 spa_evict_all(void)
 {
 	spa_t *spa;
 
 	mutex_enter(&spa_namespace_lock);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(NULL)) {
 		/*
 		 * Async tasks must be stopped first.  The async thread may
 		 * have to detach a replaced device, and that requires
 		 * spa_namespace_lock, so the lock is dropped around the
 		 * suspend while a reference keeps the spa in place.
 		 */
 		spa_open_ref(spa, FTAG);
 		mutex_exit(&spa_namespace_lock);
 		spa_async_suspend(spa);
 		mutex_enter(&spa_namespace_lock);
 		spa_close(spa, FTAG);
 
 		/* Only pools that were initialized need to be torn down. */
 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 			spa_unload(spa);
 			spa_deactivate(spa);
 		}
 		spa_remove(spa);
 	}
 	mutex_exit(&spa_namespace_lock);
 }
 
 /*
  * Find the vdev with the given guid.  The root vdev tree is searched
  * first; when 'aux' is set, the l2cache devices and then the spares are
  * searched as well.  Returns NULL when no vdev matches.
  */
 vdev_t *
 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
 {
 	vdev_t *vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid);
 	int idx;
 
 	/* Done on a tree hit, or when aux devices are not wanted. */
 	if (vd != NULL || !aux)
 		return (vd);
 
 	for (idx = 0; idx < spa->spa_l2cache.sav_count; idx++) {
 		vd = spa->spa_l2cache.sav_vdevs[idx];
 		if (vd->vdev_guid == guid)
 			return (vd);
 	}
 
 	for (idx = 0; idx < spa->spa_spares.sav_count; idx++) {
 		vd = spa->spa_spares.sav_vdevs[idx];
 		if (vd->vdev_guid == guid)
 			return (vd);
 	}
 
 	return (NULL);
 }
 
 /*
  * Raise the pool's on-disk version to 'version'.  The new version is
  * stored in the in-core uberblock under SCL_ALL (writer), the root vdev's
  * config is marked dirty, and the function then waits in
  * txg_wait_synced() before returning.  The pool must be writeable and
  * 'version' must not be lower than the current version (both asserted).
  */
 void
 spa_upgrade(spa_t *spa, uint64_t version)
 {
 	ASSERT(spa_writeable(spa));
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * This should only be called for a non-faulted pool, and since a
 	 * future version would result in an unopenable pool, this shouldn't be
 	 * possible.
 	 */
 	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
 	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/* Wait for the change to be synced before returning. */
 	txg_wait_synced(spa_get_dsl(spa), 0);
 }
 
 /*
  * Report whether 'guid' names one of this pool's spares.  Both the
  * spare vdevs (sav_vdevs) and the pending spare nvlists (sav_pending)
  * are checked.
  */
 boolean_t
 spa_has_spare(spa_t *spa, uint64_t guid)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	uint64_t pending_guid;
 	int n;
 
 	for (n = 0; n < spares->sav_count; n++) {
 		if (spares->sav_vdevs[n]->vdev_guid == guid)
 			return (B_TRUE);
 	}
 
 	for (n = 0; n < spares->sav_npending; n++) {
 		/* Entries without a readable guid cannot match. */
 		if (nvlist_lookup_uint64(spares->sav_pending[n],
 		    ZPOOL_CONFIG_GUID, &pending_guid) != 0)
 			continue;
 		if (pending_guid == guid)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Check if a pool has an active shared spare device.
  * Note: an active spare carries a reference count of 2 (one as a spare,
  * one as a replacement), so only a count above 2 on a spare owned by
  * this pool is reported here.
  */
 static boolean_t
 spa_has_active_shared_spare(spa_t *spa)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	uint64_t pool;
 	int refcnt;
 	int idx;
 
 	for (idx = 0; idx < spares->sav_count; idx++) {
 		if (!spa_spare_exists(spares->sav_vdevs[idx]->vdev_guid,
 		    &pool, &refcnt))
 			continue;
 
 		if (pool != 0ULL && pool == spa_guid(spa) && refcnt > 2)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Post a sysevent corresponding to the given event.  The 'name' must be one of
  * the event definitions in sys/sysevent/eventdefs.h.  The payload will be
  * filled in from the spa and (optionally) the vdev.  This doesn't do anything
  * in the userland libzpool, as we don't want consumers to misinterpret ztest
  * or zdb as real changes.
  *
  *	spa	pool whose name and guid are attached to the event
  *	vd	optional vdev; when non-NULL its guid and, if set, its path
  *		are attached as well
  *	name	event name passed to sysevent_alloc()
  *
  * Any failure while building the attribute list abandons the event: the
  * partially built list and the event are freed and nothing is logged.
  */
 void
 spa_event_notify(spa_t *spa, vdev_t *vd, const char *name)
 {
 #ifdef _KERNEL
 	sysevent_t		*ev;
 	sysevent_attr_list_t	*attr = NULL;
 	sysevent_value_t	value;
 	sysevent_id_t		eid;
 
 	ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs",
 	    SE_SLEEP);
 
 	/* Pool name. */
 	value.value_type = SE_DATA_TYPE_STRING;
 	value.value.sv_string = spa_name(spa);
 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0)
 		goto done;
 
 	/* Pool guid. */
 	value.value_type = SE_DATA_TYPE_UINT64;
 	value.value.sv_uint64 = spa_guid(spa);
 	if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0)
 		goto done;
 
 	if (vd) {
 		/* Vdev guid, and the vdev path when one is set. */
 		value.value_type = SE_DATA_TYPE_UINT64;
 		value.value.sv_uint64 = vd->vdev_guid;
 		if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value,
 		    SE_SLEEP) != 0)
 			goto done;
 
 		if (vd->vdev_path) {
 			value.value_type = SE_DATA_TYPE_STRING;
 			value.value.sv_string = vd->vdev_path;
 			if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH,
 			    &value, SE_SLEEP) != 0)
 				goto done;
 		}
 	}
 
 	if (sysevent_attach_attributes(ev, attr) != 0)
 		goto done;
 	/*
 	 * Clear the local pointer so the cleanup below does not free the
 	 * list; presumably the event owns it after a successful attach.
 	 */
 	attr = NULL;
 
 	(void) log_sysevent(ev, SE_SLEEP, &eid);
 
 done:
 	/* attr is non-NULL here only if the list was never attached. */
 	if (attr)
 		sysevent_free_attr(attr);
 	sysevent_free(ev);
 #endif
 }
Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/refcount.h	(revision 271001)
@@ -1,113 +1,113 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012 by Delphix. All rights reserved.
  */
 
 #ifndef	_SYS_REFCOUNT_H
 #define	_SYS_REFCOUNT_H
 
 #include <sys/cdefs.h>
 #include <sys/types.h>
 #include_next <sys/refcount.h>
 #include <sys/list.h>
 #include <sys/zfs_context.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * If the reference is held only by the calling function and not any
  * particular object, use FTAG (which is a string) for the holder_tag.
  * Otherwise, use the object that holds the reference.
  */
 #define	FTAG ((char *)__func__)
 
 #ifdef	ZFS_DEBUG
 /*
  * Per-reference record used only when ZFS_DEBUG is defined.
  * NOTE(review): the field descriptions are inferred from the names and
  * from the refcount_*() prototypes in this header; confirm in refcount.c.
  */
 typedef struct reference {
 	list_node_t ref_link;	/* linkage for a list_t */
 	void *ref_holder;	/* presumably the holder_tag of the reference */
 	uint64_t ref_number;	/* presumably the count held by this holder */
 	uint8_t *ref_removed;	/* NOTE(review): purpose not visible here */
 } reference_t;
 
 /*
  * Reference count used when ZFS_DEBUG is defined; the non-debug variant
  * further down holds only rc_count.
  * NOTE(review): the field descriptions are inferred from the names;
  * confirm in refcount.c.
  */
 typedef struct refcount {
 	kmutex_t rc_mtx;	/* presumably protects the fields below */
 	boolean_t rc_tracked;	/* see refcount_create[_untracked]() */
 	list_t rc_list;		/* presumably the live reference_t records */
 	list_t rc_removed;	/* presumably recently removed records */
 	uint64_t rc_count;	/* current reference count */
 	uint64_t rc_removed_count;	/* presumably length of rc_removed */
 } refcount_t;
 
 /* Note: refcount_t must be initialized with refcount_create[_untracked]() */
 
 void refcount_create(refcount_t *rc);
 void refcount_create_untracked(refcount_t *rc);
 void refcount_destroy(refcount_t *rc);
 void refcount_destroy_many(refcount_t *rc, uint64_t number);
 int refcount_is_zero(refcount_t *rc);
 int64_t refcount_count(refcount_t *rc);
 int64_t refcount_add(refcount_t *rc, void *holder_tag);
 int64_t refcount_remove(refcount_t *rc, void *holder_tag);
 int64_t refcount_add_many(refcount_t *rc, uint64_t number, void *holder_tag);
 int64_t refcount_remove_many(refcount_t *rc, uint64_t number, void *holder_tag);
 void refcount_transfer(refcount_t *dst, refcount_t *src);
 
 void refcount_sysinit(void);
 void refcount_fini(void);
 
 #else	/* ZFS_DEBUG */
 
 typedef struct refcount {
 	uint64_t rc_count;
 } refcount_t;
 
 #define	refcount_create(rc) ((rc)->rc_count = 0)
 #define	refcount_create_untracked(rc) ((rc)->rc_count = 0)
 #define	refcount_destroy(rc) ((rc)->rc_count = 0)
 #define	refcount_destroy_many(rc, number) ((rc)->rc_count = 0)
 #define	refcount_is_zero(rc) ((rc)->rc_count == 0)
 #define	refcount_count(rc) ((rc)->rc_count)
-#define	refcount_add(rc, holder) atomic_add_64_nv(&(rc)->rc_count, 1)
-#define	refcount_remove(rc, holder) atomic_add_64_nv(&(rc)->rc_count, -1)
+#define	refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count)
+#define	refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count)
 #define	refcount_add_many(rc, number, holder) \
 	atomic_add_64_nv(&(rc)->rc_count, number)
 /*
  * Drop 'number' references and return the new count.  'holder' is ignored
  * in this non-debug variant.  The 'number' argument is parenthesized before
  * it is negated so that an expression argument such as (a - b) is negated
  * as a whole rather than only its first operand.
  */
 #define	refcount_remove_many(rc, number, holder) \
 	atomic_add_64_nv(&(rc)->rc_count, -(number))
 /*
  * Move the whole count of 'src' over to 'dst'.  The count of 'src' is
  * sampled once, subtracted from 'src' and added to 'dst'.  The body is
  * wrapped in do { } while (0) so that the macro behaves as one statement
  * and can be used in an unbraced if/else, like the ZFS_DEBUG function of
  * the same name.
  */
 #define	refcount_transfer(dst, src) do { \
 	uint64_t __tmp = (src)->rc_count; \
 	atomic_add_64(&(src)->rc_count, -__tmp); \
 	atomic_add_64(&(dst)->rc_count, __tmp); \
 } while (0)
 
 #define	refcount_sysinit()
 #define	refcount_fini()
 
 #endif	/* ZFS_DEBUG */
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_REFCOUNT_H */
Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_cache.c	(revision 271001)
@@ -1,438 +1,438 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
  * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/kstat.h>
 
 /*
  * Virtual device read-ahead caching.
  *
  * This file implements a simple LRU read-ahead cache.  When the DMU reads
  * a given block, it will often want other, nearby blocks soon thereafter.
  * We take advantage of this by reading a larger disk region and caching
  * the result.  In the best case, this can turn 128 back-to-back 512-byte
  * reads into a single 64k read followed by 127 cache hits; this reduces
  * latency dramatically.  In the worst case, it can turn an isolated 512-byte
  * read into a 64k read, which doesn't affect latency all that much but is
  * terribly wasteful of bandwidth.  A more intelligent version of the cache
  * could keep track of access patterns and not do read-ahead unless it sees
  * at least two temporally close I/Os to the same region.  Currently, only
  * metadata I/O is inflated.  A further enhancement could take advantage of
  * more semantic information about the I/O.  And it could use something
  * faster than an AVL tree; that was chosen solely for convenience.
  *
  * There are five cache operations: allocate, fill, read, write, evict.
  *
  * (1) Allocate.  This reserves a cache entry for the specified region.
  *     We separate the allocate and fill operations so that multiple threads
  *     don't generate I/O for the same cache miss.
  *
  * (2) Fill.  When the I/O for a cache miss completes, the fill routine
  *     places the data in the previously allocated cache entry.
  *
  * (3) Read.  Read data from the cache.
  *
  * (4) Write.  Update cache contents after write completion.
  *
  * (5) Evict.  When allocating a new entry, we evict the oldest (LRU) entry
  *     if the total cache size exceeds zfs_vdev_cache_size.
  */
 
 /*
  * These tunables are for performance analysis.
  */
 /*
  * All i/os smaller than zfs_vdev_cache_max will be turned into
  * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software
  * track buffer).  At most zfs_vdev_cache_size bytes will be kept in each
  * vdev's vdev_cache.
  *
  * TODO: Note that with the current ZFS code, it turns out that the
  * vdev cache is not helpful, and in some cases actually harmful.  It
  * is better if we disable this.  Once some time has passed, we should
  * actually remove this to simplify the code.  For now we just disable
  * it by setting the zfs_vdev_cache_size to zero.  Note that Solaris 11
  * has made these same changes.
  */
 int zfs_vdev_cache_max = 1<<14;			/* 16KB */
 int zfs_vdev_cache_size = 0;
 int zfs_vdev_cache_bshift = 16;
 
 #define	VCBS (1 << zfs_vdev_cache_bshift)	/* 64KB */
 
 SYSCTL_DECL(_vfs_zfs_vdev);
 SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache");
 TUNABLE_INT("vfs.zfs.vdev.cache.max", &zfs_vdev_cache_max);
 SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, max, CTLFLAG_RDTUN,
     &zfs_vdev_cache_max, 0, "Maximum I/O request size that increase read size");
 TUNABLE_INT("vfs.zfs.vdev.cache.size", &zfs_vdev_cache_size);
 SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, size, CTLFLAG_RDTUN,
     &zfs_vdev_cache_size, 0, "Size of VDEV cache");
 TUNABLE_INT("vfs.zfs.vdev.cache.bshift", &zfs_vdev_cache_bshift);
 SYSCTL_INT(_vfs_zfs_vdev_cache, OID_AUTO, bshift, CTLFLAG_RDTUN,
     &zfs_vdev_cache_bshift, 0, "Turn too small requests into 1 << this value");
 
 kstat_t	*vdc_ksp = NULL;
 
 typedef struct vdc_stats {
 	kstat_named_t vdc_stat_delegations;
 	kstat_named_t vdc_stat_hits;
 	kstat_named_t vdc_stat_misses;
 } vdc_stats_t;
 
 static vdc_stats_t vdc_stats = {
 	{ "delegations",	KSTAT_DATA_UINT64 },
 	{ "hits",		KSTAT_DATA_UINT64 },
 	{ "misses",		KSTAT_DATA_UINT64 }
 };
 
-#define	VDCSTAT_BUMP(stat)	atomic_add_64(&vdc_stats.stat.value.ui64, 1);
+#define	VDCSTAT_BUMP(stat)	atomic_inc_64(&vdc_stats.stat.value.ui64);
 
 /*
  * Comparator ordering cache entries by ascending ve_offset.
  * Returns -1, 0 or 1.
  */
 static int
 vdev_cache_offset_compare(const void *a1, const void *a2)
 {
 	const vdev_cache_entry_t *lhs = a1;
 	const vdev_cache_entry_t *rhs = a2;
 
 	if (lhs->ve_offset == rhs->ve_offset)
 		return (0);
 
 	return (lhs->ve_offset < rhs->ve_offset ? -1 : 1);
 }
 
 /*
  * Comparator ordering cache entries by ascending ve_lastused.  Entries
  * with the same ve_lastused are ordered by offset so that distinct
  * entries never compare equal.
  */
 static int
 vdev_cache_lastused_compare(const void *a1, const void *a2)
 {
 	const vdev_cache_entry_t *lhs = a1;
 	const vdev_cache_entry_t *rhs = a2;
 
 	if (lhs->ve_lastused != rhs->ve_lastused)
 		return (lhs->ve_lastused < rhs->ve_lastused ? -1 : 1);
 
 	return (vdev_cache_offset_compare(a1, a2));
 }
 
 /*
  * Evict the specified entry from the cache.
  *
  * The caller must hold vc->vc_lock, and the entry must have a data buffer
  * and no fill I/O attached (all three asserted).  The entry is removed
  * from both AVL trees, then its VCBS-byte data buffer and the entry itself
  * are freed; 've' must not be used afterwards.
  */
 static void
 vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve)
 {
 	ASSERT(MUTEX_HELD(&vc->vc_lock));
 	ASSERT(ve->ve_fill_io == NULL);
 	ASSERT(ve->ve_data != NULL);
 
 	avl_remove(&vc->vc_lastused_tree, ve);
 	avl_remove(&vc->vc_offset_tree, ve);
 	zio_buf_free(ve->ve_data, VCBS);
 	kmem_free(ve, sizeof (vdev_cache_entry_t));
 }
 
 /*
  * Allocate an entry in the cache.  At the point we don't have the data,
  * we're just creating a placeholder so that multiple threads don't all
  * go off and read the same blocks.
  *
  * The caller must hold vc_lock (asserted).  The new entry covers the
  * VCBS-aligned region containing zio->io_offset and is inserted into both
  * AVL trees.  Returns NULL when the cache is disabled
  * (zfs_vdev_cache_size == 0) or when room is needed but the oldest entry
  * still has a fill I/O attached.
  */
 static vdev_cache_entry_t *
 vdev_cache_allocate(zio_t *zio)
 {
 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
 	uint64_t offset = P2ALIGN(zio->io_offset, VCBS);
 	vdev_cache_entry_t *ve;
 
 	ASSERT(MUTEX_HELD(&vc->vc_lock));
 
 	if (zfs_vdev_cache_size == 0)
 		return (NULL);
 
 	/*
 	 * If adding a new entry would exceed the cache size,
 	 * evict the oldest entry (LRU).
 	 */
 	if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) >
 	    zfs_vdev_cache_size) {
 		ve = avl_first(&vc->vc_lastused_tree);
 		/* The oldest entry is still being filled; give up. */
 		if (ve->ve_fill_io != NULL)
 			return (NULL);
 		ASSERT(ve->ve_hits != 0);
 		vdev_cache_evict(vc, ve);
 	}
 
 	ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP);
 	ve->ve_offset = offset;
 	ve->ve_lastused = ddi_get_lbolt();
 	ve->ve_data = zio_buf_alloc(VCBS);
 
 	avl_add(&vc->vc_offset_tree, ve);
 	avl_add(&vc->vc_lastused_tree, ve);
 
 	return (ve);
 }
 
 /*
  * Satisfy 'zio' from the cache entry 've': refresh the entry's position in
  * the last-used tree, count the hit and copy zio->io_size bytes from the
  * entry's buffer into zio->io_data.  The caller must hold vc_lock and the
  * entry must have no fill I/O attached (both asserted).
  */
 static void
 vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio)
 {
 	/* Offset of the request within the entry's VCBS-sized line. */
 	uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS);
 
 	ASSERT(MUTEX_HELD(&vc->vc_lock));
 	ASSERT(ve->ve_fill_io == NULL);
 
 	/*
 	 * ve_lastused is a key of vc_lastused_tree, so the entry is
 	 * removed before the key changes and re-added afterwards.
 	 */
 	if (ve->ve_lastused != ddi_get_lbolt()) {
 		avl_remove(&vc->vc_lastused_tree, ve);
 		ve->ve_lastused = ddi_get_lbolt();
 		avl_add(&vc->vc_lastused_tree, ve);
 	}
 
 	ve->ve_hits++;
 	bcopy(ve->ve_data + cache_phase, zio->io_data, zio->io_size);
 }
 
 /*
  * Completion routine for a fill I/O: mark the previously allocated cache
  * entry (passed in fio->io_private) as filled, satisfy every parent zio
  * of the fill from it, and evict the entry if the fill failed or a write
  * update was missed while the fill was outstanding.
  */
 static void
 vdev_cache_fill(zio_t *fio)
 {
 	vdev_cache_t *vc = &fio->io_vd->vdev_cache;
 	vdev_cache_entry_t *ve = fio->io_private;
 	zio_t *pio;
 
 	ASSERT(fio->io_size == VCBS);
 
 	/* All changes to the cache are made under vc_lock. */
 	mutex_enter(&vc->vc_lock);
 
 	ASSERT(ve->ve_fill_io == fio);
 	ASSERT(ve->ve_offset == fio->io_offset);
 	ASSERT(ve->ve_data == fio->io_data);
 
 	ve->ve_fill_io = NULL;
 
 	/*
 	 * Reads queued before a missed write update are still valid even
 	 * if the line has since been invalidated, so they are served from
 	 * this line first and only then is the line evicted.
 	 */
 	for (pio = zio_walk_parents(fio); pio != NULL;
 	    pio = zio_walk_parents(fio))
 		vdev_cache_hit(vc, ve, pio);
 
 	if (fio->io_error != 0 || ve->ve_missed_update != 0)
 		vdev_cache_evict(vc, ve);
 
 	mutex_exit(&vc->vc_lock);
 }
 
 /*
  * Read data from the cache.
  *
  * Returns B_TRUE when the zio is handled here: it was satisfied from a
  * cached line, attached as a parent to a fill I/O that is already in
  * flight, or attached to a fill I/O newly issued for the miss.  Returns
  * B_FALSE when the cache does not handle the zio: caching is disallowed
  * by ZIO_FLAG_DONT_CACHE, the request is larger than zfs_vdev_cache_max,
  * it straddles cache lines, the line has a missed update, or no entry
  * could be allocated.
  */
 boolean_t
 vdev_cache_read(zio_t *zio)
 {
 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
 	vdev_cache_entry_t *ve, ve_search;
 	uint64_t line_offset = P2ALIGN(zio->io_offset, VCBS);
 	uint64_t line_phase = P2PHASE(zio->io_offset, VCBS);
 	zio_t *fio;
 
 	ASSERT(zio->io_type == ZIO_TYPE_READ);
 
 	if ((zio->io_flags & ZIO_FLAG_DONT_CACHE) ||
 	    zio->io_size > zfs_vdev_cache_max)
 		return (B_FALSE);
 
 	/*
 	 * An I/O that spans more than one cache block is not cached.
 	 */
 	if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS))
 		return (B_FALSE);
 
 	ASSERT(line_phase + zio->io_size <= VCBS);
 
 	mutex_enter(&vc->vc_lock);
 
 	ve_search.ve_offset = line_offset;
 	ve = avl_find(&vc->vc_offset_tree, &ve_search, NULL);
 
 	if (ve == NULL) {
 		/*
 		 * Miss: reserve an entry and issue a fill I/O for the
 		 * whole line, with this zio as its parent.
 		 */
 		ve = vdev_cache_allocate(zio);
 		if (ve == NULL) {
 			mutex_exit(&vc->vc_lock);
 			return (B_FALSE);
 		}
 
 		fio = zio_vdev_delegated_io(zio->io_vd, line_offset,
 		    ve->ve_data, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW,
 		    ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve);
 
 		ve->ve_fill_io = fio;
 		zio_vdev_io_bypass(zio);
 		zio_add_child(zio, fio);
 
 		mutex_exit(&vc->vc_lock);
 		zio_nowait(fio);
 		VDCSTAT_BUMP(vdc_stat_misses);
 
 		return (B_TRUE);
 	}
 
 	if (ve->ve_missed_update) {
 		mutex_exit(&vc->vc_lock);
 		return (B_FALSE);
 	}
 
 	fio = ve->ve_fill_io;
 	if (fio != NULL) {
 		/* A fill is already in flight; wait on it as a parent. */
 		zio_vdev_io_bypass(zio);
 		zio_add_child(zio, fio);
 		mutex_exit(&vc->vc_lock);
 		VDCSTAT_BUMP(vdc_stat_delegations);
 		return (B_TRUE);
 	}
 
 	/* The line is present and filled: copy straight out of it. */
 	vdev_cache_hit(vc, ve, zio);
 	zio_vdev_io_bypass(zio);
 
 	mutex_exit(&vc->vc_lock);
 	VDCSTAT_BUMP(vdc_stat_hits);
 	return (B_TRUE);
 }
 
 /*
  * Update cache contents upon write completion.
  *
  * Every cache entry whose line overlaps the written range is visited in
  * offset order.  An entry that is still being filled is flagged with
  * ve_missed_update; any other entry has the overlapping bytes copied in
  * from the write's data buffer.
  */
 void
 vdev_cache_write(zio_t *zio)
 {
 	vdev_cache_t *vc = &zio->io_vd->vdev_cache;
 	vdev_cache_entry_t *ve, ve_search;
 	uint64_t io_start = zio->io_offset;
 	uint64_t io_end = io_start + zio->io_size;
 	uint64_t min_offset = P2ALIGN(io_start, VCBS);
 	uint64_t max_offset = P2ROUNDUP(io_end, VCBS);
 	avl_index_t where;
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 
 	mutex_enter(&vc->vc_lock);
 
 	/* Start at the first entry at or after the aligned start. */
 	ve_search.ve_offset = min_offset;
 	ve = avl_find(&vc->vc_offset_tree, &ve_search, &where);
 	if (ve == NULL)
 		ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER);
 
 	for (; ve != NULL && ve->ve_offset < max_offset;
 	    ve = AVL_NEXT(&vc->vc_offset_tree, ve)) {
 		uint64_t start, end;
 
 		if (ve->ve_fill_io != NULL) {
 			ve->ve_missed_update = 1;
 			continue;
 		}
 
 		/* Intersection of the written range with this line. */
 		start = MAX(ve->ve_offset, io_start);
 		end = MIN(ve->ve_offset + VCBS, io_end);
 		bcopy((char *)zio->io_data + start - io_start,
 		    ve->ve_data + start - ve->ve_offset, end - start);
 	}
 
 	mutex_exit(&vc->vc_lock);
 }
 
 /*
  * Evict every entry from the given vdev's cache, under vc_lock.
  */
 void
 vdev_cache_purge(vdev_t *vd)
 {
 	vdev_cache_t *vc = &vd->vdev_cache;
 	vdev_cache_entry_t *ve;
 
 	mutex_enter(&vc->vc_lock);
 	for (ve = avl_first(&vc->vc_offset_tree); ve != NULL;
 	    ve = avl_first(&vc->vc_offset_tree))
 		vdev_cache_evict(vc, ve);
 	mutex_exit(&vc->vc_lock);
 }
 
 /*
  * Initialize the cache embedded in 'vd': its lock and the two AVL trees
  * that index the entries, one ordered by offset and one ordered by
  * last-used time.
  */
 void
 vdev_cache_init(vdev_t *vd)
 {
 	vdev_cache_t *vc = &vd->vdev_cache;
 
 	mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare,
 	    sizeof (vdev_cache_entry_t),
 	    offsetof(struct vdev_cache_entry, ve_offset_node));
 
 	avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare,
 	    sizeof (vdev_cache_entry_t),
 	    offsetof(struct vdev_cache_entry, ve_lastused_node));
 }
 
 /*
  * Tear down the cache embedded in 'vd': evict all entries, then destroy
  * both AVL trees and the lock.
  */
 void
 vdev_cache_fini(vdev_t *vd)
 {
 	vdev_cache_t *vc = &vd->vdev_cache;
 
 	/* Empty the trees before they are destroyed. */
 	vdev_cache_purge(vd);
 
 	avl_destroy(&vc->vc_offset_tree);
 	avl_destroy(&vc->vc_lastused_tree);
 
 	mutex_destroy(&vc->vc_lock);
 }
 
 /*
  * Create the "vdev_cache_stats" kstat and, if creation succeeded, point
  * it at vdc_stats and install it.
  */
 void
 vdev_cache_stat_init(void)
 {
 	vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 
 	if (vdc_ksp == NULL)
 		return;
 
 	vdc_ksp->ks_data = &vdc_stats;
 	kstat_install(vdc_ksp);
 }
 
 /*
  * Delete the vdev cache kstat if one was created and clear the global
  * pointer to it.
  */
 void
 vdev_cache_stat_fini(void)
 {
 	if (vdc_ksp == NULL)
 		return;
 
 	kstat_delete(vdc_ksp);
 	vdc_ksp = NULL;
 }
Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c	(revision 271001)
@@ -1,1292 +1,1292 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 /*
  * Virtual Device Labels
  * ---------------------
  *
  * The vdev label serves several distinct purposes:
  *
  *	1. Uniquely identify this device as part of a ZFS pool and confirm its
  *	   identity within the pool.
  *
  * 	2. Verify that all the devices given in a configuration are present
  *         within the pool.
  *
  * 	3. Determine the uberblock for the pool.
  *
  * 	4. In case of an import operation, determine the configuration of the
  *         toplevel vdev of which it is a part.
  *
  * 	5. If an import operation cannot find all the devices in the pool,
  *         provide enough information to the administrator to determine which
  *         devices are missing.
  *
  * It is important to note that while the kernel is responsible for writing the
  * label, it only consumes the information in the first three cases.  The
  * latter information is only consumed in userland when determining the
  * configuration to import a pool.
  *
  *
  * Label Organization
  * ------------------
  *
  * Before describing the contents of the label, it's important to understand how
  * the labels are written and updated with respect to the uberblock.
  *
  * When the pool configuration is altered, either because it was newly created
  * or a device was added, we want to update all the labels such that we can deal
  * with fatal failure at any point.  To this end, each disk has two labels which
  * are updated before and after the uberblock is synced.  Assuming we have
  * labels and an uberblock with the following transaction groups:
  *
  *              L1          UB          L2
  *           +------+    +------+    +------+
  *           |      |    |      |    |      |
  *           | t10  |    | t10  |    | t10  |
  *           |      |    |      |    |      |
  *           +------+    +------+    +------+
  *
  * In this stable state, the labels and the uberblock were all updated within
  * the same transaction group (10).  Each label is mirrored and checksummed, so
  * that we can detect when we fail partway through writing the label.
  *
  * In order to identify which labels are valid, the labels are written in the
  * following manner:
  *
  * 	1. For each vdev, update 'L1' to the new label
  * 	2. Update the uberblock
  * 	3. For each vdev, update 'L2' to the new label
  *
  * Given arbitrary failure, we can determine the correct label to use based on
  * the transaction group.  If we fail after updating L1 but before updating the
  * UB, we will notice that L1's transaction group is greater than the uberblock,
  * so L2 must be valid.  If we fail after writing the uberblock but before
  * writing L2, we will notice that L2's transaction group is less than L1, and
  * therefore L1 is valid.
  *
  * Another added complexity is that not every label is updated when the config
  * is synced.  If we add a single device, we do not want to have to re-write
  * every label for every device in the pool.  This means that both L1 and L2 may
  * be older than the pool uberblock, because the necessary information is stored
  * on another vdev.
  *
  *
  * On-disk Format
  * --------------
  *
  * The vdev label consists of two distinct parts, and is wrapped within the
  * vdev_label_t structure.  The label includes 8k of padding to permit legacy
  * VTOC disk labels, but is otherwise ignored.
  *
  * The first half of the label is a packed nvlist which contains pool wide
  * properties, per-vdev properties, and configuration information.  It is
  * described in more detail below.
  *
  * The latter half of the label consists of a redundant array of uberblocks.
  * These uberblocks are updated whenever a transaction group is committed,
  * or when the configuration is updated.  When a pool is loaded, we scan each
  * vdev for the 'best' uberblock.
  *
  *
  * Configuration Information
  * -------------------------
  *
  * The nvlist describing the pool and vdev contains the following elements:
  *
  * 	version		ZFS on-disk version
  * 	name		Pool name
  * 	state		Pool state
  * 	txg		Transaction group in which this label was written
  * 	pool_guid	Unique identifier for this pool
  * 	vdev_tree	An nvlist describing vdev tree.
  *	features_for_read
  *			An nvlist of the features necessary for reading the MOS.
  *
  * Each leaf device label also contains the following:
  *
  * 	top_guid	Unique ID for top-level vdev in which this is contained
  * 	guid		Unique ID for the leaf vdev
  *
  * The 'vs' configuration follows the format described in 'spa_config.c'.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu.h>
 #include <sys/zap.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/uberblock_impl.h>
 #include <sys/metaslab.h>
 #include <sys/zio.h>
 #include <sys/dsl_scan.h>
 #include <sys/trim_map.h>
 #include <sys/fs/zfs.h>
 
 static boolean_t vdev_trim_on_init = B_TRUE;
 SYSCTL_DECL(_vfs_zfs_vdev);
 SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW,
     &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation");
 
 /*
  * Basic routines to read and write from a vdev label.
  * Used throughout the rest of this file.
  */
 uint64_t
 vdev_label_offset(uint64_t psize, int l, uint64_t offset)
 {
 	ASSERT(offset < sizeof (vdev_label_t));
 	ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
 
 	return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
 	    0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
 }
 
 /*
  * Returns back the vdev label associated with the passed in offset.
  */
 int
 vdev_label_number(uint64_t psize, uint64_t offset)
 {
 	int l;
 
 	if (offset >= psize - VDEV_LABEL_END_SIZE) {
 		offset -= psize - VDEV_LABEL_END_SIZE;
 		offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t);
 	}
 	l = offset / sizeof (vdev_label_t);
 	return (l < VDEV_LABELS ? l : -1);
 }
 
 static void
 vdev_label_read(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
 	uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
 	ASSERT(spa_config_held(zio->io_spa, SCL_STATE_ALL, RW_WRITER) ==
 	    SCL_STATE_ALL);
 	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
 
 	zio_nowait(zio_read_phys(zio, vd,
 	    vdev_label_offset(vd->vdev_psize, l, offset),
 	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
 	    ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
 }
 
 static void
 vdev_label_write(zio_t *zio, vdev_t *vd, int l, void *buf, uint64_t offset,
 	uint64_t size, zio_done_func_t *done, void *private, int flags)
 {
 	ASSERT(spa_config_held(zio->io_spa, SCL_ALL, RW_WRITER) == SCL_ALL ||
 	    (spa_config_held(zio->io_spa, SCL_CONFIG | SCL_STATE, RW_READER) ==
 	    (SCL_CONFIG | SCL_STATE) &&
 	    dsl_pool_sync_context(spa_get_dsl(zio->io_spa))));
 	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
 
 	zio_nowait(zio_write_phys(zio, vd,
 	    vdev_label_offset(vd->vdev_psize, l, offset),
 	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
 	    ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
 }
 
 /*
  * Generate the nvlist representing this vdev's config.
  */
 nvlist_t *
 vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats,
     vdev_config_flag_t flags)
 {
 	nvlist_t *nv = NULL;
 
 	nv = fnvlist_alloc();
 
 	fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type);
 	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)))
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id);
 	fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid);
 
 	if (vd->vdev_path != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path);
 
 	if (vd->vdev_devid != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid);
 
 	if (vd->vdev_physpath != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH,
 		    vd->vdev_physpath);
 
 	if (vd->vdev_fru != NULL)
 		fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru);
 
 	if (vd->vdev_nparity != 0) {
 		ASSERT(strcmp(vd->vdev_ops->vdev_op_type,
 		    VDEV_TYPE_RAIDZ) == 0);
 
 		/*
 		 * Make sure someone hasn't managed to sneak a fancy new vdev
 		 * into a crufty old storage pool.
 		 */
 		ASSERT(vd->vdev_nparity == 1 ||
 		    (vd->vdev_nparity <= 2 &&
 		    spa_version(spa) >= SPA_VERSION_RAIDZ2) ||
 		    (vd->vdev_nparity <= 3 &&
 		    spa_version(spa) >= SPA_VERSION_RAIDZ3));
 
 		/*
 		 * Note that we'll add the nparity tag even on storage pools
 		 * that only support a single parity device -- older software
 		 * will just ignore it.
 		 */
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity);
 	}
 
 	if (vd->vdev_wholedisk != -1ULL)
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK,
 		    vd->vdev_wholedisk);
 
 	if (vd->vdev_not_present)
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1);
 
 	if (vd->vdev_isspare)
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1);
 
 	if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) &&
 	    vd == vd->vdev_top) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY,
 		    vd->vdev_ms_array);
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT,
 		    vd->vdev_ms_shift);
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift);
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE,
 		    vd->vdev_asize);
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog);
 		if (vd->vdev_removing)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING,
 			    vd->vdev_removing);
 	}
 
 	if (vd->vdev_dtl_sm != NULL) {
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL,
 		    space_map_object(vd->vdev_dtl_sm));
 	}
 
 	if (vd->vdev_crtxg)
 		fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, vd->vdev_crtxg);
 
 	if (getstats) {
 		vdev_stat_t vs;
 		pool_scan_stat_t ps;
 
 		vdev_get_stats(vd, &vs);
 		fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
 		    (uint64_t *)&vs, sizeof (vs) / sizeof (uint64_t));
 
 		/* provide either current or previous scan information */
 		if (spa_scan_get_stats(spa, &ps) == 0) {
 			fnvlist_add_uint64_array(nv,
 			    ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps,
 			    sizeof (pool_scan_stat_t) / sizeof (uint64_t));
 		}
 	}
 
 	if (!vd->vdev_ops->vdev_op_leaf) {
 		nvlist_t **child;
 		int c, idx;
 
 		ASSERT(!vd->vdev_ishole);
 
 		child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *),
 		    KM_SLEEP);
 
 		for (c = 0, idx = 0; c < vd->vdev_children; c++) {
 			vdev_t *cvd = vd->vdev_child[c];
 
 			/*
 			 * If we're generating an nvlist of removing
 			 * vdevs then skip over any device which is
 			 * not being removed.
 			 */
 			if ((flags & VDEV_CONFIG_REMOVING) &&
 			    !cvd->vdev_removing)
 				continue;
 
 			child[idx++] = vdev_config_generate(spa, cvd,
 			    getstats, flags);
 		}
 
 		if (idx) {
 			fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 			    child, idx);
 		}
 
 		for (c = 0; c < idx; c++)
 			nvlist_free(child[c]);
 
 		kmem_free(child, vd->vdev_children * sizeof (nvlist_t *));
 
 	} else {
 		const char *aux = NULL;
 
 		if (vd->vdev_offline && !vd->vdev_tmpoffline)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE);
 		if (vd->vdev_resilver_txg != 0)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG,
 			    vd->vdev_resilver_txg);
 		if (vd->vdev_faulted)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE);
 		if (vd->vdev_degraded)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE);
 		if (vd->vdev_removed)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE);
 		if (vd->vdev_unspare)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE);
 		if (vd->vdev_ishole)
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE);
 
 		switch (vd->vdev_stat.vs_aux) {
 		case VDEV_AUX_ERR_EXCEEDED:
 			aux = "err_exceeded";
 			break;
 
 		case VDEV_AUX_EXTERNAL:
 			aux = "external";
 			break;
 		}
 
 		if (aux != NULL)
 			fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux);
 
 		if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) {
 			fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID,
 			    vd->vdev_orig_guid);
 		}
 	}
 
 	return (nv);
 }
 
 /*
  * Generate a view of the top-level vdevs.  If we currently have holes
  * in the namespace, then generate an array which contains a list of holey
  * vdevs.  Additionally, add the number of top-level children that currently
  * exist.
  */
 void
 vdev_top_config_generate(spa_t *spa, nvlist_t *config)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t *array;
 	uint_t c, idx;
 
 	array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP);
 
 	for (c = 0, idx = 0; c < rvd->vdev_children; c++) {
 		vdev_t *tvd = rvd->vdev_child[c];
 
 		if (tvd->vdev_ishole)
 			array[idx++] = c;
 	}
 
 	if (idx) {
 		VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
 		    array, idx) == 0);
 	}
 
 	VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
 	    rvd->vdev_children) == 0);
 
 	kmem_free(array, rvd->vdev_children * sizeof (uint64_t));
 }
 
 /*
  * Returns the configuration from the label of the given vdev. For vdevs
  * which don't have a txg value stored on their label (i.e. spares/cache)
  * or have not been completely initialized (txg = 0) just return
  * the configuration from the first valid label we find. Otherwise,
  * find the most up-to-date label that does not exceed the specified
  * 'txg' value.
  */
 nvlist_t *
 vdev_label_read_config(vdev_t *vd, uint64_t txg)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *config = NULL;
 	vdev_phys_t *vp;
 	zio_t *zio;
 	uint64_t best_txg = 0;
 	int error = 0;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SPECULATIVE;
 
 	ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL);
 
 	if (!vdev_readable(vd))
 		return (NULL);
 
 	vp = zio_buf_alloc(sizeof (vdev_phys_t));
 
 retry:
 	for (int l = 0; l < VDEV_LABELS; l++) {
 		nvlist_t *label = NULL;
 
 		zio = zio_root(spa, NULL, NULL, flags);
 
 		vdev_label_read(zio, vd, l, vp,
 		    offsetof(vdev_label_t, vl_vdev_phys),
 		    sizeof (vdev_phys_t), NULL, NULL, flags);
 
 		if (zio_wait(zio) == 0 &&
 		    nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist),
 		    &label, 0) == 0) {
 			uint64_t label_txg = 0;
 
 			/*
 			 * Auxiliary vdevs won't have txg values in their
 			 * labels and newly added vdevs may not have been
 			 * completely initialized so just return the
 			 * configuration from the first valid label we
 			 * encounter.
 			 */
 			error = nvlist_lookup_uint64(label,
 			    ZPOOL_CONFIG_POOL_TXG, &label_txg);
 			if ((error || label_txg == 0) && !config) {
 				config = label;
 				break;
 			} else if (label_txg <= txg && label_txg > best_txg) {
 				best_txg = label_txg;
 				nvlist_free(config);
 				config = fnvlist_dup(label);
 			}
 		}
 
 		if (label != NULL) {
 			nvlist_free(label);
 			label = NULL;
 		}
 	}
 
 	if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) {
 		flags |= ZIO_FLAG_TRYHARD;
 		goto retry;
 	}
 
 	zio_buf_free(vp, sizeof (vdev_phys_t));
 
 	return (config);
 }
 
 /*
  * Determine if a device is in use.  The 'spare_guid' parameter will be filled
  * in with the device guid if this spare is active elsewhere on the system.
  */
 static boolean_t
 vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason,
     uint64_t *spare_guid, uint64_t *l2cache_guid)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t state, pool_guid, device_guid, txg, spare_pool;
 	uint64_t vdtxg = 0;
 	nvlist_t *label;
 
 	if (spare_guid)
 		*spare_guid = 0ULL;
 	if (l2cache_guid)
 		*l2cache_guid = 0ULL;
 
 	/*
 	 * Read the label, if any, and perform some basic sanity checks.
 	 */
 	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL)
 		return (B_FALSE);
 
 	(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
 	    &vdtxg);
 
 	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 	    &state) != 0 ||
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID,
 	    &device_guid) != 0) {
 		nvlist_free(label);
 		return (B_FALSE);
 	}
 
 	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
 	    (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID,
 	    &pool_guid) != 0 ||
 	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG,
 	    &txg) != 0)) {
 		nvlist_free(label);
 		return (B_FALSE);
 	}
 
 	nvlist_free(label);
 
 	/*
 	 * Check to see if this device indeed belongs to the pool it claims to
 	 * be a part of.  The only way this is allowed is if the device is a hot
 	 * spare (which we check for later on).
 	 */
 	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
 	    !spa_guid_exists(pool_guid, device_guid) &&
 	    !spa_spare_exists(device_guid, NULL, NULL) &&
 	    !spa_l2cache_exists(device_guid, NULL))
 		return (B_FALSE);
 
 	/*
 	 * If the transaction group is zero, then this an initialized (but
 	 * unused) label.  This is only an error if the create transaction
 	 * on-disk is the same as the one we're using now, in which case the
 	 * user has attempted to add the same vdev multiple times in the same
 	 * transaction.
 	 */
 	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
 	    txg == 0 && vdtxg == crtxg)
 		return (B_TRUE);
 
 	/*
 	 * Check to see if this is a spare device.  We do an explicit check for
 	 * spa_has_spare() here because it may be on our pending list of spares
 	 * to add.  We also check if it is an l2cache device.
 	 */
 	if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
 	    spa_has_spare(spa, device_guid)) {
 		if (spare_guid)
 			*spare_guid = device_guid;
 
 		switch (reason) {
 		case VDEV_LABEL_CREATE:
 		case VDEV_LABEL_L2CACHE:
 			return (B_TRUE);
 
 		case VDEV_LABEL_REPLACE:
 			return (!spa_has_spare(spa, device_guid) ||
 			    spare_pool != 0ULL);
 
 		case VDEV_LABEL_SPARE:
 			return (spa_has_spare(spa, device_guid));
 		}
 	}
 
 	/*
 	 * Check to see if this is an l2cache device.
 	 */
 	if (spa_l2cache_exists(device_guid, NULL))
 		return (B_TRUE);
 
 	/*
 	 * We can't rely on a pool's state if it's been imported
 	 * read-only.  Instead we look to see if the pools is marked
 	 * read-only in the namespace and set the state to active.
 	 */
 	if ((spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
 	    spa_mode(spa) == FREAD)
 		state = POOL_STATE_ACTIVE;
 
 	/*
 	 * If the device is marked ACTIVE, then this device is in use by another
 	 * pool on the system.
 	 */
 	return (state == POOL_STATE_ACTIVE);
 }
 
 /*
  * Initialize a vdev label.  We check to make sure each leaf device is not in
  * use, and writable.  We put down an initial label which we will later
  * overwrite with a complete label.  Note that it's important to do this
  * sequentially, not in parallel, so that we catch cases of multiple use of the
  * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
  * itself.
  */
 int
 vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
 {
 	spa_t *spa = vd->vdev_spa;
 	nvlist_t *label;
 	vdev_phys_t *vp;
 	char *pad2;
 	uberblock_t *ub;
 	zio_t *zio;
 	char *buf;
 	size_t buflen;
 	int error;
 	uint64_t spare_guid, l2cache_guid;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		if ((error = vdev_label_init(vd->vdev_child[c],
 		    crtxg, reason)) != 0)
 			return (error);
 
 	/* Track the creation time for this vdev */
 	vd->vdev_crtxg = crtxg;
 
 	if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
 		return (0);
 
 	/*
 	 * Dead vdevs cannot be initialized.
 	 */
 	if (vdev_is_dead(vd))
 		return (SET_ERROR(EIO));
 
 	/*
 	 * Determine if the vdev is in use.
 	 */
 	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT &&
 	    vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid))
 		return (SET_ERROR(EBUSY));
 
 	/*
 	 * If this is a request to add or replace a spare or l2cache device
 	 * that is in use elsewhere on the system, then we must update the
 	 * guid (which was initialized to a random value) to reflect the
 	 * actual GUID (which is shared between multiple pools).
 	 */
 	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE &&
 	    spare_guid != 0ULL) {
 		uint64_t guid_delta = spare_guid - vd->vdev_guid;
 
 		vd->vdev_guid += guid_delta;
 
 		for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
 			pvd->vdev_guid_sum += guid_delta;
 
 		/*
 		 * If this is a replacement, then we want to fallthrough to the
 		 * rest of the code.  If we're adding a spare, then it's already
 		 * labeled appropriately and we can just return.
 		 */
 		if (reason == VDEV_LABEL_SPARE)
 			return (0);
 		ASSERT(reason == VDEV_LABEL_REPLACE ||
 		    reason == VDEV_LABEL_SPLIT);
 	}
 
 	if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE &&
 	    l2cache_guid != 0ULL) {
 		uint64_t guid_delta = l2cache_guid - vd->vdev_guid;
 
 		vd->vdev_guid += guid_delta;
 
 		for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent)
 			pvd->vdev_guid_sum += guid_delta;
 
 		/*
 		 * If this is a replacement, then we want to fallthrough to the
 		 * rest of the code.  If we're adding an l2cache, then it's
 		 * already labeled appropriately and we can just return.
 		 */
 		if (reason == VDEV_LABEL_L2CACHE)
 			return (0);
 		ASSERT(reason == VDEV_LABEL_REPLACE);
 	}
 
 	/*
 	 * TRIM the whole thing so that we start with a clean slate.
 	 * It's just an optimization, so we don't care if it fails.
 	 * Don't TRIM if removing so that we don't interfere with zpool
 	 * disaster recovery.
 	 */
 	if (zfs_trim_enabled && vdev_trim_on_init && (reason == VDEV_LABEL_CREATE ||
 	    reason == VDEV_LABEL_SPARE || reason == VDEV_LABEL_L2CACHE))
 		zio_wait(zio_trim(NULL, spa, vd, 0, vd->vdev_psize));
 
 	/*
 	 * Initialize its label.
 	 */
 	vp = zio_buf_alloc(sizeof (vdev_phys_t));
 	bzero(vp, sizeof (vdev_phys_t));
 
 	/*
 	 * Generate a label describing the pool and our top-level vdev.
 	 * We mark it as being from txg 0 to indicate that it's not
 	 * really part of an active pool just yet.  The labels will
 	 * be written again with a meaningful txg by spa_sync().
 	 */
 	if (reason == VDEV_LABEL_SPARE ||
 	    (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) {
 		/*
 		 * For inactive hot spares, we generate a special label that
 		 * identifies as a mutually shared hot spare.  We write the
 		 * label if we are adding a hot spare, or if we are removing an
 		 * active hot spare (in which case we want to revert the
 		 * labels).
 		 */
 		VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
 		    spa_version(spa)) == 0);
 		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 		    POOL_STATE_SPARE) == 0);
 		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
 		    vd->vdev_guid) == 0);
 	} else if (reason == VDEV_LABEL_L2CACHE ||
 	    (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) {
 		/*
 		 * For level 2 ARC devices, add a special label.
 		 */
 		VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0);
 
 		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION,
 		    spa_version(spa)) == 0);
 		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE,
 		    POOL_STATE_L2CACHE) == 0);
 		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID,
 		    vd->vdev_guid) == 0);
 	} else {
 		uint64_t txg = 0ULL;
 
 		if (reason == VDEV_LABEL_SPLIT)
 			txg = spa->spa_uberblock.ub_txg;
 		label = spa_config_generate(spa, vd, txg, B_FALSE);
 
 		/*
 		 * Add our creation time.  This allows us to detect multiple
 		 * vdev uses as described above, and automatically expires if we
 		 * fail.
 		 */
 		VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG,
 		    crtxg) == 0);
 	}
 
 	buf = vp->vp_nvlist;
 	buflen = sizeof (vp->vp_nvlist);
 
 	error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP);
 	if (error != 0) {
 		nvlist_free(label);
 		zio_buf_free(vp, sizeof (vdev_phys_t));
 		/* EFAULT means nvlist_pack ran out of room */
 		return (error == EFAULT ? ENAMETOOLONG : EINVAL);
 	}
 
 	/*
 	 * Initialize uberblock template.
 	 */
 	ub = zio_buf_alloc(VDEV_UBERBLOCK_RING);
 	bzero(ub, VDEV_UBERBLOCK_RING);
 	*ub = spa->spa_uberblock;
 	ub->ub_txg = 0;
 
 	/* Initialize the 2nd padding area. */
 	pad2 = zio_buf_alloc(VDEV_PAD_SIZE);
 	bzero(pad2, VDEV_PAD_SIZE);
 
 	/*
 	 * Write everything in parallel.
 	 */
 retry:
 	zio = zio_root(spa, NULL, NULL, flags);
 
 	for (int l = 0; l < VDEV_LABELS; l++) {
 
 		vdev_label_write(zio, vd, l, vp,
 		    offsetof(vdev_label_t, vl_vdev_phys),
 		    sizeof (vdev_phys_t), NULL, NULL, flags);
 
 		/*
 		 * Skip the 1st padding area.
 		 * Zero out the 2nd padding area where it might have
 		 * left over data from previous filesystem format.
 		 */
 		vdev_label_write(zio, vd, l, pad2,
 		    offsetof(vdev_label_t, vl_pad2),
 		    VDEV_PAD_SIZE, NULL, NULL, flags);
 
 		vdev_label_write(zio, vd, l, ub,
 		    offsetof(vdev_label_t, vl_uberblock),
 		    VDEV_UBERBLOCK_RING, NULL, NULL, flags);
 	}
 
 	error = zio_wait(zio);
 
 	if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) {
 		flags |= ZIO_FLAG_TRYHARD;
 		goto retry;
 	}
 
 	nvlist_free(label);
 	zio_buf_free(pad2, VDEV_PAD_SIZE);
 	zio_buf_free(ub, VDEV_UBERBLOCK_RING);
 	zio_buf_free(vp, sizeof (vdev_phys_t));
 
 	/*
 	 * If this vdev hasn't been previously identified as a spare, then we
 	 * mark it as such only if a) we are labeling it as a spare, or b) it
 	 * exists as a spare elsewhere in the system.  Do the same for
 	 * level 2 ARC devices.
 	 */
 	if (error == 0 && !vd->vdev_isspare &&
 	    (reason == VDEV_LABEL_SPARE ||
 	    spa_spare_exists(vd->vdev_guid, NULL, NULL)))
 		spa_spare_add(vd);
 
 	if (error == 0 && !vd->vdev_isl2cache &&
 	    (reason == VDEV_LABEL_L2CACHE ||
 	    spa_l2cache_exists(vd->vdev_guid, NULL)))
 		spa_l2cache_add(vd);
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * uberblock load/sync
  * ==========================================================================
  */
 
 /*
  * Consider the following situation: txg is safely synced to disk.  We've
  * written the first uberblock for txg + 1, and then we lose power.  When we
  * come back up, we fail to see the uberblock for txg + 1 because, say,
  * it was on a mirrored device and the replica to which we wrote txg + 1
  * is now offline.  If we then make some changes and sync txg + 1, and then
  * the missing replica comes back, then for a few seconds we'll have two
  * conflicting uberblocks on disk with the same txg.  The solution is simple:
  * among uberblocks with equal txg, choose the one with the latest timestamp.
  */
 static int
 vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2)
 {
 	if (ub1->ub_txg < ub2->ub_txg)
 		return (-1);
 	if (ub1->ub_txg > ub2->ub_txg)
 		return (1);
 
 	if (ub1->ub_timestamp < ub2->ub_timestamp)
 		return (-1);
 	if (ub1->ub_timestamp > ub2->ub_timestamp)
 		return (1);
 
 	return (0);
 }
 
 struct ubl_cbdata {
 	uberblock_t	*ubl_ubbest;	/* Best uberblock */
 	vdev_t		*ubl_vd;	/* vdev associated with the above */
 };
 
 static void
 vdev_uberblock_load_done(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	spa_t *spa = zio->io_spa;
 	zio_t *rio = zio->io_private;
 	uberblock_t *ub = zio->io_data;
 	struct ubl_cbdata *cbp = rio->io_private;
 
 	ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd));
 
 	if (zio->io_error == 0 && uberblock_verify(ub) == 0) {
 		mutex_enter(&rio->io_lock);
 		if (ub->ub_txg <= spa->spa_load_max_txg &&
 		    vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) {
 			/*
 			 * Keep track of the vdev in which this uberblock
 			 * was found. We will use this information later
 			 * to obtain the config nvlist associated with
 			 * this uberblock.
 			 */
 			*cbp->ubl_ubbest = *ub;
 			cbp->ubl_vd = vd;
 		}
 		mutex_exit(&rio->io_lock);
 	}
 
 	zio_buf_free(zio->io_data, zio->io_size);
 }
 
 static void
 vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags,
     struct ubl_cbdata *cbp)
 {
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp);
 
 	if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) {
 		for (int l = 0; l < VDEV_LABELS; l++) {
 			for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) {
 				vdev_label_read(zio, vd, l,
 				    zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)),
 				    VDEV_UBERBLOCK_OFFSET(vd, n),
 				    VDEV_UBERBLOCK_SIZE(vd),
 				    vdev_uberblock_load_done, zio, flags);
 			}
 		}
 	}
 }
 
 /*
  * Reads the 'best' uberblock from disk along with its associated
  * configuration. First, we read the uberblock array of each label of each
  * vdev, keeping track of the uberblock with the highest txg in each array.
  * Then, we read the configuration from the same vdev as the best uberblock.
  */
 void
 vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config)
 {
 	zio_t *zio;
 	spa_t *spa = rvd->vdev_spa;
 	struct ubl_cbdata cb;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD;
 
 	ASSERT(ub);
 	ASSERT(config);
 
 	bzero(ub, sizeof (uberblock_t));
 	*config = NULL;
 
 	cb.ubl_ubbest = ub;
 	cb.ubl_vd = NULL;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	zio = zio_root(spa, NULL, &cb, flags);
 	vdev_uberblock_load_impl(zio, rvd, flags, &cb);
 	(void) zio_wait(zio);
 
 	/*
 	 * It's possible that the best uberblock was discovered on a label
 	 * that has a configuration which was written in a future txg.
 	 * Search all labels on this vdev to find the configuration that
 	 * matches the txg for our uberblock.
 	 */
 	if (cb.ubl_vd != NULL)
 		*config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 }
 
 /*
  * On success, increment root zio's count of good writes.
  * We only get credit for writes to known-visible vdevs; see spa_vdev_add().
  */
 static void
 vdev_uberblock_sync_done(zio_t *zio)
 {
 	uint64_t *good_writes = zio->io_private;
 
 	if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0)
-		atomic_add_64(good_writes, 1);
+		atomic_inc_64(good_writes);
 }
 
 /*
  * Write the uberblock to all labels of all leaves of the specified vdev.
  */
 static void
 vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
 {
 	uberblock_t *ubbuf;
 	int n;
 
 	for (int c = 0; c < vd->vdev_children; c++)
 		vdev_uberblock_sync(zio, ub, vd->vdev_child[c], flags);
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return;
 
 	if (!vdev_writeable(vd))
 		return;
 
 	n = ub->ub_txg & (VDEV_UBERBLOCK_COUNT(vd) - 1);
 
 	ubbuf = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd));
 	bzero(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
 	*ubbuf = *ub;
 
 	for (int l = 0; l < VDEV_LABELS; l++)
 		vdev_label_write(zio, vd, l, ubbuf,
 		    VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd),
 		    vdev_uberblock_sync_done, zio->io_private,
 		    flags | ZIO_FLAG_DONT_PROPAGATE);
 
 	zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
 }
 
 /*
  * Sync the uberblocks to all vdevs in svd[], then flush those vdevs.
  * Returns 0 if at least one uberblock write was counted as good,
  * EIO otherwise.
  */
 int
 vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
 {
 	spa_t *spa = svd[0]->vdev_spa;
 	uint64_t good_writes = 0;
 	zio_t *root;
 	int i;
 
 	/* Write pass: completion callbacks bump good_writes. */
 	root = zio_root(spa, NULL, &good_writes, flags);
 	for (i = 0; i < svdcount; i++)
 		vdev_uberblock_sync(root, ub, svd[i], flags);
 	(void) zio_wait(root);
 
 	/*
 	 * Flush the uberblocks to disk.  This ensures that the odd labels
 	 * are no longer needed (because the new uberblocks and the even
 	 * labels are safely on disk), so it is safe to overwrite them.
 	 */
 	root = zio_root(spa, NULL, NULL, flags);
 	for (i = 0; i < svdcount; i++)
 		zio_flush(root, svd[i]);
 	(void) zio_wait(root);
 
 	if (good_writes >= 1)
 		return (0);
 
 	return (EIO);
 }
 
 /*
  * Completion callback for one label write issued by vdev_label_sync().
  *
  * On success, increment the count of good writes for our top-level vdev.
  * zio->io_private points at the uint64_t counter being incremented; a
  * write that finished with an error leaves the counter untouched.
  */
 static void
 vdev_label_sync_done(zio_t *zio)
 {
 	uint64_t *good_writes = zio->io_private;
 
 	if (zio->io_error == 0)
-		atomic_add_64(good_writes, 1);
+		atomic_inc_64(good_writes);
 }
 
 /*
  * Done callback for a per-vdev label-sync zio: when no label write was
  * counted as good, fail this zio with EIO so the parent sees the error.
  * The counter hanging off io_private is released here in either case.
  */
 static void
 vdev_label_sync_top_done(zio_t *zio)
 {
 	uint64_t *counter = zio->io_private;
 
 	if (*counter == 0)
 		zio->io_error = SET_ERROR(EIO);
 
 	kmem_free(counter, sizeof (*counter));
 }
 
 /*
  * Done callback used for log and cache devices: write errors are ignored,
  * so all that is left to do is release the good-writes counter.
  */
 static void
 vdev_label_sync_ignore_done(zio_t *zio)
 {
 	uint64_t *counter = zio->io_private;
 
 	kmem_free(counter, sizeof (*counter));
 }
 
 /*
  * Write all even or odd labels to all leaves of the specified vdev.
  *
  * Starting at label 'l', every second label below VDEV_LABELS on each
  * writeable leaf receives a freshly packed configuration for 'txg'.
  * If the configuration cannot be packed, no label is written for that leaf.
  */
 static void
 vdev_label_sync(zio_t *zio, vdev_t *vd, int l, uint64_t txg, int flags)
 {
 	int child;
 
 	/* Handle the subtree first. */
 	for (child = 0; child < vd->vdev_children; child++)
 		vdev_label_sync(zio, vd->vdev_child[child], l, txg, flags);
 
 	/* Nothing to write unless vd is a writeable leaf. */
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_writeable(vd))
 		return;
 
 	/*
 	 * Generate a label describing the top-level config to which we belong.
 	 */
 	nvlist_t *config = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE);
 
 	/* Zeroed vdev_phys_t whose vp_nvlist area receives the packed nvlist. */
 	vdev_phys_t *phys = zio_buf_alloc(sizeof (vdev_phys_t));
 	bzero(phys, sizeof (vdev_phys_t));
 
 	char *packed = phys->vp_nvlist;
 	size_t packedlen = sizeof (phys->vp_nvlist);
 
 	if (nvlist_pack(config, &packed, &packedlen, NV_ENCODE_XDR,
 	    KM_SLEEP) == 0) {
 		int label = l;
 
 		while (label < VDEV_LABELS) {
 			vdev_label_write(zio, vd, label, phys,
 			    offsetof(vdev_label_t, vl_vdev_phys),
 			    sizeof (vdev_phys_t),
 			    vdev_label_sync_done, zio->io_private,
 			    flags | ZIO_FLAG_DONT_PROPAGATE);
 			label += 2;
 		}
 	}
 
 	zio_buf_free(phys, sizeof (vdev_phys_t));
 	nvlist_free(config);
 }
 
 /*
  * Write label set 'l' for every vdev on the spa's config dirty list and
  * then flush those vdevs.  Returns the zio_wait() result of the write
  * pass; the result of the flush pass is discarded.
  */
 int
 vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags)
 {
 	list_t *dirty = &spa->spa_config_dirty_list;
 	vdev_t *vd;
 	zio_t *root;
 	int error;
 
 	/*
 	 * Write the new labels to disk.
 	 */
 	root = zio_root(spa, NULL, NULL, flags);
 
 	vd = list_head(dirty);
 	while (vd != NULL) {
 		uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t),
 		    KM_SLEEP);
 
 		ASSERT(!vd->vdev_ishole);
 
 		/*
 		 * Each dirty vdev gets its own parent zio; its done callback
 		 * owns (and frees) good_writes.  Log and aux devices use the
 		 * callback that ignores write failures.
 		 */
 		zio_t *vio;
 		if (vd->vdev_islog || vd->vdev_aux != NULL) {
 			vio = zio_null(root, spa, NULL,
 			    vdev_label_sync_ignore_done, good_writes, flags);
 		} else {
 			vio = zio_null(root, spa, NULL,
 			    vdev_label_sync_top_done, good_writes, flags);
 		}
 		vdev_label_sync(vio, vd, l, txg, flags);
 		zio_nowait(vio);
 
 		vd = list_next(dirty, vd);
 	}
 
 	error = zio_wait(root);
 
 	/*
 	 * Flush the new labels to disk.
 	 */
 	root = zio_root(spa, NULL, NULL, flags);
 
 	vd = list_head(dirty);
 	while (vd != NULL) {
 		zio_flush(root, vd);
 		vd = list_next(dirty, vd);
 	}
 
 	(void) zio_wait(root);
 
 	return (error);
 }
 
 /*
  * Sync the uberblock and any changes to the vdev configuration.
  *
  * The order of operations is carefully crafted to ensure that
  * if the system panics or loses power at any time, the state on disk
  * is still transactionally consistent.  The in-line comments below
  * describe the failure semantics at each stage.
  *
  * Moreover, vdev_config_sync() is designed to be idempotent: if it fails
  * at any time, you can just call it again, and it will resume its work.
  *
  * svd, svdcount: vdevs whose uberblocks are written; svd[0] supplies the spa.
  * txg:           transaction group being synced; must be >= the current
  *                ub_txg of the spa's uberblock.
  * tryhard:       when set, all I/O here is issued with ZIO_FLAG_TRYHARD.
  *
  * Returns 0 on success or when there is nothing to do; otherwise the error
  * from the first label or uberblock sync stage that failed.
  */
 int
 vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard)
 {
 	spa_t *spa = svd[0]->vdev_spa;
 	uberblock_t *ub = &spa->spa_uberblock;
 	vdev_t *vd;
 	zio_t *zio;
 	int error;
 	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
 
 	/*
 	 * Normally, we don't want to try too hard to write every label and
 	 * uberblock.  If there is a flaky disk, we don't want the rest of the
 	 * sync process to block while we retry.  But if we can't write a
 	 * single label out, we should retry with ZIO_FLAG_TRYHARD before
 	 * bailing out and declaring the pool faulted.
 	 */
 	if (tryhard)
 		flags |= ZIO_FLAG_TRYHARD;
 
 	ASSERT(ub->ub_txg <= txg);
 
 	/*
 	 * If this isn't a resync due to I/O errors,
 	 * and nothing changed in this transaction group,
 	 * and the vdev configuration hasn't changed,
 	 * then there's nothing to do.
 	 */
 	if (ub->ub_txg < txg &&
 	    uberblock_update(ub, spa->spa_root_vdev, txg) == B_FALSE &&
 	    list_is_empty(&spa->spa_config_dirty_list))
 		return (0);
 
 	/*
 	 * Beyond the spa's freeze txg nothing is written out; report success
 	 * without touching any disk.
 	 */
 	if (txg > spa_freeze_txg(spa))
 		return (0);
 
 	ASSERT(txg <= spa->spa_final_txg);
 
 	/*
 	 * Flush the write cache of every disk that's been written to
 	 * in this transaction group.  This ensures that all blocks
 	 * written in this txg will be committed to stable storage
 	 * before any uberblock that references them.
 	 */
 	zio = zio_root(spa, NULL, NULL, flags);
 
 	for (vd = txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd;
 	    vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)))
 		zio_flush(zio, vd);
 
 	/* Flush errors are deliberately ignored here. */
 	(void) zio_wait(zio);
 
 	/*
 	 * Sync out the even labels (L0, L2) for every dirty vdev.  If the
 	 * system dies in the middle of this process, that's OK: all of the
 	 * even labels that made it to disk will be newer than any uberblock,
 	 * and will therefore be considered invalid.  The odd labels (L1, L3),
 	 * which have not yet been touched, will still be valid.  We flush
 	 * the new labels to disk to ensure that all even-label updates
 	 * are committed to stable storage before the uberblock update.
 	 */
 	if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0)
 		return (error);
 
 	/*
 	 * Sync the uberblocks to all vdevs in svd[].
 	 * If the system dies in the middle of this step, there are two cases
 	 * to consider, and the on-disk state is consistent either way:
 	 *
 	 * (1)	If none of the new uberblocks made it to disk, then the
 	 *	previous uberblock will be the newest, and the odd labels
 	 *	(which had not yet been touched) will be valid with respect
 	 *	to that uberblock.
 	 *
 	 * (2)	If one or more new uberblocks made it to disk, then they
 	 *	will be the newest, and the even labels (which had all
 	 *	been successfully committed) will be valid with respect
 	 *	to the new uberblocks.
 	 */
 	if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0)
 		return (error);
 
 	/*
 	 * Sync out odd labels for every dirty vdev.  If the system dies
 	 * in the middle of this process, the even labels and the new
 	 * uberblocks will suffice to open the pool.  The next time
 	 * the pool is opened, the first thing we'll do -- before any
 	 * user data is modified -- is mark every vdev dirty so that
 	 * all labels will be brought up to date.  We flush the new labels
 	 * to disk to ensure that all odd-label updates are committed to
 	 * stable storage before the next transaction group begins.
 	 */
 	if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0)
 		return (error);
 
 	/*
 	 * NOTE(review): presumably wakes the TRIM thread now that this txg's
 	 * configuration is on disk -- confirm against trim_thread_wakeup().
 	 */
 	trim_thread_wakeup(spa);
 
 	return (0);
 }
Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vfsops.c	(revision 271001)
@@ -1,2543 +1,2543 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011 Pawel Jakub Dawidek <pawel@dawidek.net>.
  * All rights reserved.
  * Copyright (c) 2013 by Delphix. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/kernel.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
 #include <sys/acl.h>
 #include <sys/vnode.h>
 #include <sys/vfs.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <sys/cmn_err.h>
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/varargs.h>
 #include <sys/policy.h>
 #include <sys/atomic.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/sunddi.h>
 #include <sys/dnlc.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa_boot.h>
 #include <sys/jail.h>
 #include "zfs_comutil.h"
 
 /* Mutex named "zfs_debug", set up at boot through MTX_SYSINIT. */
 struct mtx zfs_debug_mtx;
 MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF);
 
 /* Root of the vfs.zfs sysctl tree; the knobs below hang off it. */
 SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system");
 
 /* vfs.zfs.super_owner: read/write integer knob. */
 int zfs_super_owner;
 SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0,
     "File system owner can perform privileged operation on his file systems");
 
 /* vfs.zfs.debug: exposed both as a tunable and as a read/write sysctl. */
 int zfs_debug_level;
 TUNABLE_INT("vfs.zfs.debug", &zfs_debug_level);
 SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RW, &zfs_debug_level, 0,
     "Debug level");
 
 /* vfs.zfs.version.*: read-only copies of the compiled-in version numbers. */
 SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions");
 static int zfs_version_acl = ZFS_ACL_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0,
     "ZFS_ACL_VERSION");
 static int zfs_version_spa = SPA_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0,
     "SPA_VERSION");
 static int zfs_version_zpl = ZPL_VERSION;
 SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0,
     "ZPL_VERSION");
 
 /* Forward declarations of file-local functions. */
 static int zfs_mount(vfs_t *vfsp);
 static int zfs_umount(vfs_t *vfsp, int fflag);
 static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp);
 static int zfs_statfs(vfs_t *vfsp, struct statfs *statp);
 static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp);
 static int zfs_sync(vfs_t *vfsp, int waitfor);
 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
     struct ucred **credanonp, int *numsecflavors, int **secflavors);
 static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp);
 static void zfs_objset_close(zfsvfs_t *zfsvfs);
 static void zfs_freevfs(vfs_t *vfsp);
 
 /* VFS operations vector registered for the "zfs" file system type. */
 static struct vfsops zfs_vfsops = {
 	.vfs_mount =		zfs_mount,
 	.vfs_unmount =		zfs_umount,
 	.vfs_root =		zfs_root,
 	.vfs_statfs =		zfs_statfs,
 	.vfs_vget =		zfs_vget,
 	.vfs_sync =		zfs_sync,
 	.vfs_checkexp =		zfs_checkexp,
 	.vfs_fhtovp =		zfs_fhtovp,
 };
 
 VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN);
 
 /*
  * We need to keep a count of active fs's.
  * This is necessary to prevent our module
  * from being unloaded after a umount -f
  */
 static uint32_t	zfs_active_fs_count = 0;
 
 /*
  * VFS sync entry point.  With a NULL vfsp every pool is synced; otherwise
  * only the given file system is synced and its intent log committed.
  * Returns 0, or the error reported by vfs_stdsync().
  */
 /*ARGSUSED*/
 static int
 zfs_sync(vfs_t *vfsp, int waitfor)
 {
 	zfsvfs_t *zfsvfs;
 	dsl_pool_t *pool;
 	int error;
 
 	/*
 	 * Data integrity is job one.  We don't want a compromised kernel
 	 * writing to the storage pool, so we never sync during panic.
 	 */
 	if (panicstr)
 		return (0);
 
 	if (vfsp == NULL) {
 		/*
 		 * Sync all ZFS filesystems.  This is what happens when you
 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
 		 * request by waiting for all pools to commit all dirty data.
 		 */
 		spa_sync_allpools();
 		return (0);
 	}
 
 	/*
 	 * Sync a specific filesystem.
 	 */
 	zfsvfs = vfsp->vfs_data;
 
 	error = vfs_stdsync(vfsp, waitfor);
 	if (error != 0)
 		return (error);
 
 	ZFS_ENTER(zfsvfs);
 	pool = dmu_objset_pool(zfsvfs->z_os);
 
 	/*
 	 * If the system is shutting down, then skip any
 	 * filesystems which may exist on a suspended pool.
 	 */
 	if ((!sys_shutdown || !spa_suspended(pool->dp_spa)) &&
 	    zfsvfs->z_log != NULL)
 		zil_commit(zfsvfs->z_log, 0);
 
 	ZFS_EXIT(zfsvfs);
 
 	return (0);
 }
 
 #ifndef __FreeBSD_kernel__
 /*
  * Pick a device number that is not currently mounted and store it in *dev.
  *
  * Minor numbers under the current zfs_major are tried in turn, wrapping at
  * MAXMIN32.  If a full cycle finds every minor in use, a fresh major is
  * obtained with getudev() and the search restarts from minor 0.
  *
  * Returns 0 on success, or -1 if getudev() cannot supply a new major.
  * Only compiled when __FreeBSD_kernel__ is not defined.
  *
  * NOTE(review): zfs_minor is read outside zfs_dev_mtx for 'start' and in
  * both loop conditions -- confirm this is safe against concurrent callers.
  */
 static int
 zfs_create_unique_device(dev_t *dev)
 {
 	major_t new_major;
 
 	do {
 		ASSERT3U(zfs_minor, <=, MAXMIN32);
 		/* Remember where this sweep began so a full wrap is detectable. */
 		minor_t start = zfs_minor;
 		do {
 			mutex_enter(&zfs_dev_mtx);
 			if (zfs_minor >= MAXMIN32) {
 				/*
 				 * If we're still using the real major
 				 * keep out of /dev/zfs and /dev/zvol minor
 				 * number space.  If we're using a getudev()'ed
 				 * major number, we can use all of its minors.
 				 */
 				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
 					zfs_minor = ZFS_MIN_MINOR;
 				else
 					zfs_minor = 0;
 			} else {
 				zfs_minor++;
 			}
 			*dev = makedevice(zfs_major, zfs_minor);
 			mutex_exit(&zfs_dev_mtx);
 		} while (vfs_devismounted(*dev) && zfs_minor != start);
 		if (zfs_minor == start) {
 			/*
 			 * We are using all ~262,000 minor numbers for the
 			 * current major number.  Create a new major number.
 			 */
 			if ((new_major = getudev()) == (major_t)-1) {
 				cmn_err(CE_WARN,
 				    "zfs_mount: Can't get unique major "
 				    "device number.");
 				return (-1);
 			}
 			mutex_enter(&zfs_dev_mtx);
 			zfs_major = new_major;
 			zfs_minor = 0;
 
 			mutex_exit(&zfs_dev_mtx);
 		} else {
 			/* Found an unmounted device number before wrapping. */
 			break;
 		}
 		/* CONSTANTCONDITION */
 	} while (1);
 
 	return (0);
 }
 #endif	/* !__FreeBSD_kernel__ */
 
 /*
  * Property callback for "atime": mirror the new value into z_atime, the
  * MNT_NOATIME mount flag and the atime/noatime mount options.
  * 'arg' is the zfsvfs_t the callback was registered with.
  */
 static void
 atime_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	vfs_t *vfsp = zfsvfs->z_vfs;
 
 	if (newval != TRUE) {
 		/* atime off */
 		zfsvfs->z_atime = FALSE;
 		vfsp->vfs_flag |= MNT_NOATIME;
 		vfs_clearmntopt(vfsp, MNTOPT_ATIME);
 		vfs_setmntopt(vfsp, MNTOPT_NOATIME, NULL, 0);
 		return;
 	}
 
 	/* atime on */
 	zfsvfs->z_atime = TRUE;
 	vfsp->vfs_flag &= ~MNT_NOATIME;
 	vfs_clearmntopt(vfsp, MNTOPT_NOATIME);
 	vfs_setmntopt(vfsp, MNTOPT_ATIME, NULL, 0);
 }
 
 /*
  * Property callback for "xattr": swap the xattr/noxattr mount options to
  * match the new value.  The VFS_XATTR flag updates stay compiled out
  * behind TODO.  'arg' is the zfsvfs_t the callback was registered with.
  */
 static void
 xattr_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	vfs_t *vfsp = zfsvfs->z_vfs;
 
 	if (newval != TRUE) {
 		/* XXX locking on vfs_flag? */
 #ifdef TODO
 		vfsp->vfs_flag &= ~VFS_XATTR;
 #endif
 		vfs_clearmntopt(vfsp, MNTOPT_XATTR);
 		vfs_setmntopt(vfsp, MNTOPT_NOXATTR, NULL, 0);
 		return;
 	}
 
 	/* XXX locking on vfs_flag? */
 #ifdef TODO
 	vfsp->vfs_flag |= VFS_XATTR;
 #endif
 	vfs_clearmntopt(vfsp, MNTOPT_NOXATTR);
 	vfs_setmntopt(vfsp, MNTOPT_XATTR, NULL, 0);
 }
 
 /*
  * Property callback for "recordsize": store the new maximum block size and
  * advertise it as the mount's f_iosize.  A value outside
  * [SPA_MINBLOCKSIZE, SPA_MAXBLOCKSIZE], or one that is not a power of two,
  * is replaced by SPA_MAXBLOCKSIZE.
  */
 static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval >= SPA_MINBLOCKSIZE &&
 	    newval <= SPA_MAXBLOCKSIZE && ISP2(newval)) {
 		/* Acceptable as given. */
 	} else {
 		newval = SPA_MAXBLOCKSIZE;
 	}
 
 	zfsvfs->z_max_blksz = newval;
 	zfsvfs->z_vfs->mnt_stat.f_iosize = newval;
 }
 
 /*
  * Property callback for "readonly": set or clear VFS_RDONLY and swap the
  * ro/rw mount options.  Any non-zero newval means read-only.
  */
 static void
 readonly_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	vfs_t *vfsp = zfsvfs->z_vfs;
 
 	if (!newval) {
 		/* XXX locking on vfs_flag? */
 		vfsp->vfs_flag &= ~VFS_RDONLY;
 		vfs_clearmntopt(vfsp, MNTOPT_RO);
 		vfs_setmntopt(vfsp, MNTOPT_RW, NULL, 0);
 		return;
 	}
 
 	/* XXX locking on vfs_flag? */
 	vfsp->vfs_flag |= VFS_RDONLY;
 	vfs_clearmntopt(vfsp, MNTOPT_RW);
 	vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 }
 
 /*
  * Property callback for "setuid": a value of FALSE sets VFS_NOSETUID and
  * the nosetuid option; any other value clears the flag and sets setuid.
  */
 static void
 setuid_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	vfs_t *vfsp = zfsvfs->z_vfs;
 
 	if (newval != FALSE) {
 		vfsp->vfs_flag &= ~VFS_NOSETUID;
 		vfs_clearmntopt(vfsp, MNTOPT_NOSETUID);
 		vfs_setmntopt(vfsp, MNTOPT_SETUID, NULL, 0);
 		return;
 	}
 
 	vfsp->vfs_flag |= VFS_NOSETUID;
 	vfs_clearmntopt(vfsp, MNTOPT_SETUID);
 	vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0);
 }
 
 /*
  * Property callback for "exec": a value of FALSE sets VFS_NOEXEC and the
  * noexec option; any other value clears the flag and sets exec.
  */
 static void
 exec_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	vfs_t *vfsp = zfsvfs->z_vfs;
 
 	if (newval != FALSE) {
 		vfsp->vfs_flag &= ~VFS_NOEXEC;
 		vfs_clearmntopt(vfsp, MNTOPT_NOEXEC);
 		vfs_setmntopt(vfsp, MNTOPT_EXEC, NULL, 0);
 		return;
 	}
 
 	vfsp->vfs_flag |= VFS_NOEXEC;
 	vfs_clearmntopt(vfsp, MNTOPT_EXEC);
 	vfs_setmntopt(vfsp, MNTOPT_NOEXEC, NULL, 0);
 }
 
 /*
  * The nbmand mount option can be changed at mount time.
  * We can't allow it to be toggled on live file systems or incorrect
  * behavior may be seen from cifs clients
  *
  * This property isn't registered via dsl_prop_register(), but this callback
  * will be called when a file system is first mounted
  *
  * A value of FALSE selects the nonbmand option; anything else selects nbmand.
  */
 static void
 nbmand_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	vfs_t *vfsp = zfsvfs->z_vfs;
 
 	if (newval != FALSE) {
 		vfs_clearmntopt(vfsp, MNTOPT_NONBMAND);
 		vfs_setmntopt(vfsp, MNTOPT_NBMAND, NULL, 0);
 		return;
 	}
 
 	vfs_clearmntopt(vfsp, MNTOPT_NBMAND);
 	vfs_setmntopt(vfsp, MNTOPT_NONBMAND, NULL, 0);
 }
 
 /* Property callback for "snapdir": record the new value in z_show_ctldir. */
 static void
 snapdir_changed_cb(void *arg, uint64_t newval)
 {
 	((zfsvfs_t *)arg)->z_show_ctldir = newval;
 }
 
 /* Property callback for "vscan": record the new value in z_vscan. */
 static void
 vscan_changed_cb(void *arg, uint64_t newval)
 {
 	((zfsvfs_t *)arg)->z_vscan = newval;
 }
 
 /* Property callback for "aclmode": record the new value in z_acl_mode. */
 static void
 acl_mode_changed_cb(void *arg, uint64_t newval)
 {
 	((zfsvfs_t *)arg)->z_acl_mode = newval;
 }
 
 /* Property callback for "aclinherit": record the new value in z_acl_inherit. */
 static void
 acl_inherit_changed_cb(void *arg, uint64_t newval)
 {
 	((zfsvfs_t *)arg)->z_acl_inherit = newval;
 }
 
 /*
  * Register the dataset property callbacks for a mounted file system and
  * re-apply any temporary mount-option overrides.
  *
  * The mount options present on vfsp are sampled first, the callbacks are
  * registered under the pool config lock, and the sampled options are then
  * pushed back through the same callbacks so that they win over the stored
  * property values.
  *
  * Returns 0 on success, EOPNOTSUPP for a snapshot objset, or the error
  * from the nbmand lookup or from the first dsl_prop_register() that fails
  * (in which case every callback is unregistered again).
  */
 static int
 zfs_register_callbacks(vfs_t *vfsp)
 {
 	struct dsl_dataset *ds = NULL;
 	objset_t *os = NULL;
 	zfsvfs_t *zfsvfs = NULL;
 	uint64_t nbmand;
 	boolean_t readonly = B_FALSE;
 	boolean_t do_readonly = B_FALSE;
 	boolean_t setuid = B_FALSE;
 	boolean_t do_setuid = B_FALSE;
 	boolean_t exec = B_FALSE;
 	boolean_t do_exec = B_FALSE;
 #ifdef illumos
 	boolean_t devices = B_FALSE;
 	boolean_t do_devices = B_FALSE;
 #endif
 	boolean_t xattr = B_FALSE;
 	boolean_t do_xattr = B_FALSE;
 	boolean_t atime = B_FALSE;
 	boolean_t do_atime = B_FALSE;
 	int error = 0;
 
 	ASSERT(vfsp);
 	zfsvfs = vfsp->vfs_data;
 	ASSERT(zfsvfs);
 	os = zfsvfs->z_os;
 
 	/*
 	 * This function can be called for a snapshot when we update snapshot's
 	 * mount point, which isn't really supported.
 	 */
 	if (dmu_objset_is_snapshot(os))
 		return (EOPNOTSUPP);
 
 	/*
 	 * The act of registering our callbacks will destroy any mount
 	 * options we may have.  In order to enable temporary overrides
 	 * of mount options, we stash away the current values and
 	 * restore them after we register the callbacks.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 	    !spa_writeable(dmu_objset_spa(os))) {
 		readonly = B_TRUE;
 		do_readonly = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 		readonly = B_FALSE;
 		do_readonly = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 		setuid = B_FALSE;
 		do_setuid = B_TRUE;
 	} else {
 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 			setuid = B_FALSE;
 			do_setuid = B_TRUE;
 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 			setuid = B_TRUE;
 			do_setuid = B_TRUE;
 		}
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 		exec = B_FALSE;
 		do_exec = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 		exec = B_TRUE;
 		do_exec = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 		xattr = B_FALSE;
 		do_xattr = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 		xattr = B_TRUE;
 		do_xattr = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 		atime = B_FALSE;
 		do_atime = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 		atime = B_TRUE;
 		do_atime = B_TRUE;
 	}
 
 	/*
 	 * nbmand is a special property.  It can only be changed at
 	 * mount time.
 	 *
 	 * This is weird, but it is documented to only be changeable
 	 * at mount time.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 		nbmand = B_FALSE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 		nbmand = B_TRUE;
 	} else {
 		char osname[MAXNAMELEN];
 
 		/* No explicit option: fall back to the dataset's property. */
 		dmu_objset_name(os, osname);
 		error = dsl_prop_get_integer(osname, "nbmand", &nbmand, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Register property callbacks.
 	 *
 	 * It would probably be fine to just check for i/o error from
 	 * the first prop_register(), but I guess I like to go
 	 * overboard...
 	 */
 	ds = dmu_objset_ds(os);
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	/* Each registration is skipped once an earlier one has failed. */
 	error = dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
 #ifdef illumos
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
 #endif
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
 	    zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 	if (error)
 		goto unregister;
 
 	/*
 	 * Invoke our callbacks to restore temporary mount options.
 	 */
 	if (do_readonly)
 		readonly_changed_cb(zfsvfs, readonly);
 	if (do_setuid)
 		setuid_changed_cb(zfsvfs, setuid);
 	if (do_exec)
 		exec_changed_cb(zfsvfs, exec);
 	if (do_xattr)
 		xattr_changed_cb(zfsvfs, xattr);
 	if (do_atime)
 		atime_changed_cb(zfsvfs, atime);
 
 	/* nbmand is never registered above, so it is always applied by hand. */
 	nbmand_changed_cb(zfsvfs, nbmand);
 
 	return (0);
 
 unregister:
 	/*
 	 * We may attempt to unregister some callbacks that are not
 	 * registered, but this is OK; it will simply return ENOMSG,
 	 * which we will ignore.
 	 */
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME),
 	    atime_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR),
 	    xattr_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 	    blksz_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY),
 	    readonly_changed_cb, zfsvfs);
 #ifdef illumos
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES),
 	    devices_changed_cb, zfsvfs);
 #endif
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID),
 	    setuid_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC),
 	    exec_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR),
 	    snapdir_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE),
 	    acl_mode_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT),
 	    acl_inherit_changed_cb, zfsvfs);
 	(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN),
 	    vscan_changed_cb, zfsvfs);
 	return (error);
 }
 
 /*
  * Extract the owning user and group ids from an object's bonus data.
  *
  * Returns ENOENT for bonus types other than DMU_OT_ZNODE / DMU_OT_SA,
  * EEXIST when 'data' is NULL (meaning: keep using the same ids), and 0
  * with *userp / *groupp filled in otherwise.
  */
 static int
 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
     uint64_t *userp, uint64_t *groupp)
 {
 	/*
 	 * Is it a valid type of object to track?
 	 */
 	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
 		return (SET_ERROR(ENOENT));
 
 	/*
 	 * If we have a NULL data pointer
 	 * then assume the id's aren't changing and
 	 * return EEXIST to the dmu to let it know to
 	 * use the same ids
 	 */
 	if (data == NULL)
 		return (SET_ERROR(EEXIST));
 
 	/* Legacy znode bonus buffer: the ids are plain fields. */
 	if (bonustype == DMU_OT_ZNODE) {
 		znode_phys_t *znp = data;
 
 		*userp = znp->zp_uid;
 		*groupp = znp->zp_gid;
 		return (0);
 	}
 
 	/* Work on a private copy of the header so it can be byte-swapped. */
 	sa_hdr_phys_t hdr = *(sa_hdr_phys_t *)data;
 	boolean_t swapped = B_FALSE;
 
 	ASSERT(bonustype == DMU_OT_SA);
 
 	if (hdr.sa_magic == 0) {
 		/*
 		 * This should only happen for newly created
 		 * files that haven't had the znode data filled
 		 * in yet.
 		 */
 		*userp = 0;
 		*groupp = 0;
 		return (0);
 	}
 
 	if (hdr.sa_magic == BSWAP_32(SA_MAGIC)) {
 		hdr.sa_magic = SA_MAGIC;
 		hdr.sa_layout_info = BSWAP_16(hdr.sa_layout_info);
 		swapped = B_TRUE;
 	} else {
 		VERIFY3U(hdr.sa_magic, ==, SA_MAGIC);
 	}
 
 	int hdrsize = sa_hdrsize(&hdr);
 	VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
 
 	/* The ids sit at fixed offsets just past the header. */
 	uintptr_t attrs = (uintptr_t)data + hdrsize;
 	*userp = *((uint64_t *)(attrs + SA_UID_OFFSET));
 	*groupp = *((uint64_t *)(attrs + SA_GID_OFFSET));
 
 	if (swapped) {
 		*userp = BSWAP_64(*userp);
 		*groupp = BSWAP_64(*groupp);
 	}
 
 	return (0);
 }
 
 /*
  * Convert a FUID string into its domain name and rid.  The domain is
  * copied into domainbuf (at most buflen bytes); when the FUID's index has
  * no known domain, domainbuf is set to the empty string.
  */
 static void
 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
     char *domainbuf, int buflen, uid_t *ridp)
 {
 	uint64_t fuid = strtonum(fuidstr, NULL);
 	const char *domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
 
 	if (domain == NULL)
 		domainbuf[0] = '\0';
 	else
 		(void) strlcpy(domainbuf, domain, buflen);
 
 	*ridp = FUID_RID(fuid);
 }
 
 /*
  * Map a user/group quota property to the object holding its entries.
  * Returns 0 for any property not handled here.
  */
 static uint64_t
 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
 {
 	if (type == ZFS_PROP_USERUSED)
 		return (DMU_USERUSED_OBJECT);
 	if (type == ZFS_PROP_GROUPUSED)
 		return (DMU_GROUPUSED_OBJECT);
 	if (type == ZFS_PROP_USERQUOTA)
 		return (zfsvfs->z_userquota_obj);
 	if (type == ZFS_PROP_GROUPQUOTA)
 		return (zfsvfs->z_groupquota_obj);
 
 	return (0);
 }
 
 /*
  * Copy as many zfs_useracct_t records as fit into vbuf, resuming the walk
  * at *cookiep.
  *
  * On return *bufsizep holds the number of bytes written and *cookiep the
  * serialized cursor position to resume from.  Returns ENOTSUP when the
  * objset has no userspace accounting, 0 on success (including reaching the
  * end of the entries), or the error from zap_cursor_retrieve().
  */
 int
 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
 {
 	zfs_useracct_t *slot = vbuf;
 	zap_attribute_t za;
 	zap_cursor_t zc;
 	uint64_t obj;
 	int error;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 		return (SET_ERROR(ENOTSUP));
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0) {
 		*bufsizep = 0;
 		return (0);
 	}
 
 	zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
 	while ((error = zap_cursor_retrieve(&zc, &za)) == 0) {
 		/* Stop before the next record would overrun the buffer. */
 		if ((uintptr_t)slot - (uintptr_t)vbuf +
 		    sizeof (zfs_useracct_t) > *bufsizep)
 			break;
 
 		fuidstr_to_sid(zfsvfs, za.za_name,
 		    slot->zu_domain, sizeof (slot->zu_domain), &slot->zu_rid);
 		slot->zu_space = za.za_first_integer;
 		slot++;
 
 		zap_cursor_advance(&zc);
 	}
 
 	/* Running off the end of the entries is not an error. */
 	if (error == ENOENT)
 		error = 0;
 
 	ASSERT3U((uintptr_t)slot - (uintptr_t)vbuf, <=, *bufsizep);
 	*bufsizep = (uintptr_t)slot - (uintptr_t)vbuf;
 	*cookiep = zap_cursor_serialize(&zc);
 	zap_cursor_fini(&zc);
 
 	return (error);
 }
 
 /*
  * Format the FUID for (domain, rid) as a hex string in buf.
  * buf must be big enough (eg, 32 bytes)
  *
  * A NULL or empty domain uses domain index 0.  Returns ENOENT when the
  * domain lookup yields -1, 0 otherwise.
  */
 static int
 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
     char *buf, boolean_t addok)
 {
 	int domainid = 0;
 	uint64_t fuid;
 
 	if (domain != NULL && domain[0] != '\0') {
 		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
 		if (domainid == -1)
 			return (SET_ERROR(ENOENT));
 	}
 
 	fuid = FUID_ENCODE(domainid, rid);
 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 
 	return (0);
 }
 
 /*
  * Look up a single user/group accounting or quota value.
  *
  * *valp is zeroed up front and stays 0 when the property has no backing
  * object or the id has no entry.  Returns ENOTSUP when the objset has no
  * userspace accounting, otherwise 0 or the lookup error.
  */
 int
 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t *valp)
 {
 	char fuidstr[32];
 	uint64_t obj;
 	int err;
 
 	*valp = 0;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 		return (SET_ERROR(ENOTSUP));
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0)
 		return (0);
 
 	err = id_to_fuidstr(zfsvfs, domain, rid, fuidstr, B_FALSE);
 	if (err != 0)
 		return (err);
 
 	/* A missing entry is reported as success with *valp left at 0. */
 	err = zap_lookup(zfsvfs->z_os, obj, fuidstr, 8, 1, valp);
 
 	return (err == ENOENT ? 0 : err);
 }
 
 /*
  * Set, or with quota == 0 remove, the user or group quota for (domain, rid).
  *
  * type must be ZFS_PROP_USERQUOTA or ZFS_PROP_GROUPQUOTA (else EINVAL), and
  * the file system version must be at least ZPL_VERSION_USERSPACE (else
  * ENOTSUP).  The quota ZAP object is created on first use and recorded in
  * the master node.  Returns 0 on success or the error from the FUID
  * conversion, the tx assignment, or the ZAP update.
  */
 int
 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t quota)
 {
 	char buf[32];
 	int err;
 	dmu_tx_t *tx;
 	uint64_t *objp;
 	boolean_t fuid_dirtied;
 
 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
 		return (SET_ERROR(EINVAL));
 
 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
 		return (SET_ERROR(ENOTSUP));
 
 	/* objp addresses the cached object number of the chosen quota ZAP. */
 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 	    &zfsvfs->z_groupquota_obj;
 
 	/* addok is B_TRUE, so the domain lookup is allowed to add the domain. */
 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 	if (err)
 		return (err);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 
 	/* Declare everything this tx may touch before assigning it. */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
 	if (*objp == 0) {
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 		    zfs_userquota_prop_prefixes[type]);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	/* Create the quota ZAP on first use; *objp is re-checked under z_lock. */
 	mutex_enter(&zfsvfs->z_lock);
 	if (*objp == 0) {
 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 		    DMU_OT_NONE, 0, tx);
 		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 	}
 	mutex_exit(&zfsvfs->z_lock);
 
 	if (quota == 0) {
 		/* A quota of 0 means "no quota"; a missing entry is fine. */
 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 		if (err == ENOENT)
 			err = 0;
 	} else {
 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 	}
 	ASSERT(err == 0);
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 	/* The tx is committed even if the ZAP update reported an error. */
 	dmu_tx_commit(tx);
 	return (err);
 }
 
 /*
  * Report whether the user or group identified by 'fuid' has used at least
  * as much space as its quota allows.
  *
  * Returns B_FALSE when no quota object exists, during replay, or when
  * either the quota or the usage entry cannot be looked up.
  */
 boolean_t
 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 {
 	char buf[32];
 	uint64_t used, quota, usedobj, quotaobj;
 	int err;
 
 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
 	/*
 	 * Entries are keyed by the FUID in hex.  The write is bounded by the
 	 * buffer size and %llx is given the unsigned type it expects.
 	 */
 	(void) snprintf(buf, sizeof (buf), "%llx", (u_longlong_t)fuid);
 
 	/* No quota entry for this id means it cannot be over quota. */
 	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
 	if (err != 0)
 		return (B_FALSE);
 
 	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
 	if (err != 0)
 		return (B_FALSE);
 	return (used >= quota);
 }
 
 /*
  * Report whether the owning user (or, with isgroup set, the owning group)
  * of znode 'zp' is over quota.  Returns B_FALSE without any lookup when no
  * quota object exists or during replay.
  */
 boolean_t
 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 {
 	uint64_t quotaobj;
 	uint64_t fuid;
 
 	if (isgroup) {
 		quotaobj = zfsvfs->z_groupquota_obj;
 		fuid = zp->z_gid;
 	} else {
 		quotaobj = zfsvfs->z_userquota_obj;
 		fuid = zp->z_uid;
 	}
 
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
 	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
 }
 
 /*
  * Take ownership of the objset named by osname and build the in-core
  * zfsvfs for it.
  *
  * Reads the ZPL version, normalization, utf8only and case properties, sets
  * up the system-attribute layer, and looks up the well-known objects in the
  * master node (root, unlinked set, user/group quota, FUID table, shares
  * directory) before initializing the zfsvfs locks and znode list.
  *
  * Returns 0 and stores the new zfsvfs in *zfvp on success.  On failure
  * returns an errno value (ENOTSUP when the file system version is newer
  * than the pool version allows), releases the objset ownership if it was
  * obtained, frees the zfsvfs, and sets *zfvp to NULL.
  */
 int
 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 {
 	objset_t *os;
 	zfsvfs_t *zfsvfs;
 	uint64_t zval;
 	int i, error;
 	uint64_t sa_obj;
 
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 
 	/*
 	 * We claim to always be readonly so we can open snapshots;
 	 * other ZPL code will prevent us from writing to snapshots.
 	 */
 	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 	if (error) {
 		/* Nothing is owned yet, so only the allocation is undone. */
 		*zfvp = NULL;
 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 		return (error);
 	}
 
 	/*
 	 * Initialize the zfs-specific filesystem structure.
 	 * Should probably make this a kmem cache, shuffle fields,
 	 * and just bzero up to z_hold_mtx[].
 	 */
 	zfsvfs->z_vfs = NULL;
 	zfsvfs->z_parent = zfsvfs;
 	zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE;
 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 	zfsvfs->z_os = os;
 
 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 	if (error) {
 		goto out;
 	} else if (zfsvfs->z_version >
 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 		(void) printf("Can't mount a version %lld file system "
 		    "on a version %lld pool\n. Pool must be upgraded to mount "
 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
 		error = SET_ERROR(ENOTSUP);
 		goto out;
 	}
 	if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0)
 		goto out;
 	zfsvfs->z_norm = (int)zval;
 
 	if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0)
 		goto out;
 	zfsvfs->z_utf8 = (zval != 0);
 
 	if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0)
 		goto out;
 	zfsvfs->z_case = (uint_t)zval;
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
 	 * insensitive.
 	 */
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 	    zfsvfs->z_case == ZFS_CASE_MIXED)
 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 
 	if (zfsvfs->z_use_sa) {
 		/* should either have both of these objects or none */
 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 		    &sa_obj);
 		/*
 		 * Leave through the common cleanup path: returning directly
 		 * here would leak both the objset ownership and the zfsvfs.
 		 */
 		if (error)
 			goto out;
 	} else {
 		/*
 		 * Pre SA versions file systems should never touch
 		 * either the attribute registration or layout objects.
 		 */
 		sa_obj = 0;
 	}
 
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 	    &zfsvfs->z_attr_table);
 	if (error)
 		goto out;
 
 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
 		sa_register_update_callback(os, zfs_sa_upgrade);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 	    &zfsvfs->z_root);
 	if (error)
 		goto out;
 	ASSERT(zfsvfs->z_root != 0);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 	    &zfsvfs->z_unlinkedobj);
 	if (error)
 		goto out;
 
 	/*
 	 * The remaining master-node entries are optional: ENOENT is
 	 * tolerated and leaves the corresponding field at zero.
 	 */
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 	    8, 1, &zfsvfs->z_userquota_obj);
 	if (error && error != ENOENT)
 		goto out;
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 	    8, 1, &zfsvfs->z_groupquota_obj);
 	if (error && error != ENOENT)
 		goto out;
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 	    &zfsvfs->z_fuid_obj);
 	if (error && error != ENOENT)
 		goto out;
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 	    &zfsvfs->z_shares_dir);
 	if (error && error != ENOENT)
 		goto out;
 
 	/*
 	 * Locks are created only after every fallible step above, so the
 	 * "out" path never has to destroy them.
 	 */
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
 	*zfvp = zfsvfs;
 	return (0);
 
 out:
 	dmu_objset_disown(os, zfsvfs);
 	*zfvp = NULL;
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 	return (error);
 }
 
 /*
  * Finish bringing a zfsvfs online: register the property callbacks, point
  * the objset's user pointer at the zfsvfs, and open the intent log.  When
  * 'mounting' is set, also drain the unlinked set (writable mounts only) and
  * replay or destroy the intent log.
  *
  * Returns the error from zfs_register_callbacks(), or 0.
  */
 static int
 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 {
 	boolean_t readonly;
 	int error;
 
 	if ((error = zfs_register_callbacks(zfsvfs->z_vfs)) != 0)
 		return (error);
 
 	/* Let the objset find its zfsvfs through the user pointer. */
 	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 
 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 
 	/*
 	 * When not mounting (ie: online recv) there is no log to replay:
 	 * all operations have been blocked since the ZIL was closed.
 	 */
 	if (!mounting)
 		return (0);
 
 	/*
 	 * The read-only flag is lifted for the duration of the replay so
 	 * that replayed operations can succeed; it is put back below.
 	 * Only mounts that were not read-only drain the unlinked set.
 	 */
 	readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
 	if (readonly == 0)
 		zfs_unlinked_drain(zfsvfs);
 	else
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 
 	/*
 	 * Parse and replay the intent log.  Because of ziltest this has to
 	 * come after zfs_unlinked_drain() (ziltest does not use read-only
 	 * mounts, the one case where the drain is skipped).
 	 *
 	 * ziltest makes spa_sync() believe it has committed when it has
 	 * not, so the log can hold many txgs' worth of changes.  Suppose
 	 * object N sits in the unlinked set as of the last txg that really
 	 * synced.  It may be freed in a later txg and reallocated in a
 	 * still later one, which logs a "create object N" record.  Normally
 	 * spa_sync() would have recorded that N is free before that record
 	 * could be written.  In ziltest mode the open txg advances without
 	 * the changes reaching disk, so here N would appear both allocated
 	 * and in the unlinked set while the log asks to allocate it.
 	 */
 	if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
 		if (!zil_replay_disable) {
 			zfsvfs->z_replay = B_TRUE;
 			zil_replay(zfsvfs->z_os, zfsvfs,
 			    zfs_replay_vector);
 			zfsvfs->z_replay = B_FALSE;
 		} else {
 			zil_destroy(zfsvfs->z_log, B_FALSE);
 		}
 	}
 
 	/* restore readonly bit */
 	zfsvfs->z_vfs->vfs_flag |= readonly;
 
 	return (0);
 }
 
 extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
 
 /*
  * Destroy the locks and znode list embedded in a zfsvfs, release its FUID
  * state, and free the structure itself.
  */
 void
 zfsvfs_free(zfsvfs_t *zfsvfs)
 {
 	int idx;
 
 	/*
 	 * Taking and immediately dropping zfsvfs_lock acts as a barrier:
 	 * it keeps the file system from going away underneath
 	 * zfs_znode_move() until it is safe.  The file system is treated
 	 * as valid before this point and invalid after it.
 	 */
 	rw_enter(&zfsvfs_lock, RW_READER);
 	rw_exit(&zfsvfs_lock);
 
 	zfs_fuid_destroy(zfsvfs);
 
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	mutex_destroy(&zfsvfs->z_lock);
 	list_destroy(&zfsvfs->z_all_znodes);
 	rrm_destroy(&zfsvfs->z_teardown_lock);
 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 	rw_destroy(&zfsvfs->z_fuid_lock);
 
 	idx = 0;
 	while (idx != ZFS_OBJ_MTX_SZ) {
 		mutex_destroy(&zfsvfs->z_hold_mtx[idx]);
 		idx++;
 	}
 
 	kmem_free(zfsvfs, sizeof (*zfsvfs));
 }
 
 /*
  * Recompute z_use_fuids and z_use_sa from the file system version and, when
  * a vfs is attached, set or clear the FUID-dependent VFS features to match
  * z_use_fuids.
  */
 static void
 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
 {
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 
 	if (zfsvfs->z_vfs != NULL && zfsvfs->z_use_fuids) {
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
 		vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
 	} else if (zfsvfs->z_vfs != NULL) {
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
 		vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
 	}
 
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
 /*
  * Mount the dataset named by osname on vfsp.
  *
  * Creates the zfsvfs with zfsvfs_create(), fills in the vfs fields (block
  * size, mount flags, fsid, features), then either configures a snapshot
  * mount inline or calls zfsvfs_setup() for a regular file system.  At the
  * "out" label a non-zero error disowns the objset and frees the zfsvfs;
  * otherwise zfs_active_fs_count is incremented.
  *
  * Returns 0 on success or an errno value.
  */
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
 	uint64_t recordsize, fsid_guid;
 	int error = 0;
 	zfsvfs_t *zfsvfs;
 	vnode_t *vp;
 
 	ASSERT(vfsp);
 	ASSERT(osname);
 
 	error = zfsvfs_create(osname, &zfsvfs);
 	if (error)
 		return (error);
 	zfsvfs->z_vfs = vfsp;
 
 #ifdef illumos
 	/* Initialize the generic filesystem structure. */
 	vfsp->vfs_bcount = 0;
 	vfsp->vfs_data = NULL;
 
 	if (zfs_create_unique_device(&mount_dev) == -1) {
 		error = SET_ERROR(ENODEV);
 		goto out;
 	}
 	ASSERT(vfs_devismounted(mount_dev) == 0);
 #endif
 
 	/* The dataset's recordsize becomes the advertised I/O size. */
 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 	    NULL))
 		goto out;
 	zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE;
 	zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize;
 
 	vfsp->vfs_data = zfsvfs;
 	vfsp->mnt_flag |= MNT_LOCAL;
 	vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED;
 	vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES;
 	vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED;
 
 	/*
 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
 	 * separates our fsid from any other filesystem types, and a
 	 * 56-bit objset unique ID.  The objset unique ID is unique to
 	 * all objsets open on this system, provided by unique_create().
 	 * The 8-bit fs type must be put in the low bits of fsid[1]
 	 * because that's where other Solaris filesystems put it.
 	 */
 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
 	vfsp->vfs_fsid.val[0] = fsid_guid;
 	/* '&' binds tighter than '|': only the type number is masked. */
 	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
 	    vfsp->mnt_vfc->vfc_typenum & 0xFF;
 
 	/*
 	 * Set features for file system.
 	 */
 	zfs_set_fuid_feature(zfsvfs);
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 	}
 	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
 
 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 		uint64_t pval;
 
 		/*
 		 * Snapshots skip zfsvfs_setup(): atime is turned off, the
 		 * mount is made read-only, and only the xattr property is
 		 * read before the objset user pointer is set.
 		 */
 		atime_changed_cb(zfsvfs, B_FALSE);
 		readonly_changed_cb(zfsvfs, B_TRUE);
 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
 			goto out;
 		xattr_changed_cb(zfsvfs, pval);
 		zfsvfs->z_issnap = B_TRUE;
 		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
 
 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 	} else {
 		error = zfsvfs_setup(zfsvfs, B_TRUE);
 	}
 
 	/*
 	 * NOTE(review): a zfsvfs_setup() failure is not checked before the
 	 * calls below; it is only acted on at "out".  Confirm that running
 	 * VFS_ROOT() and zfsctl_create() in that state is intended.
 	 */
 	vfs_mountedfrom(vfsp, osname);
 	/* Grab extra reference. */
 	VERIFY(VFS_ROOT(vfsp, LK_EXCLUSIVE, &vp) == 0);
 	VOP_UNLOCK(vp, 0);
 
 	if (!zfsvfs->z_issnap)
 		zfsctl_create(zfsvfs);
 out:
 	/*
 	 * NOTE(review): on the failure path vfsp->vfs_data may still hold
 	 * the zfsvfs pointer assigned above -- confirm callers do not use it
 	 * after an error return.
 	 */
 	if (error) {
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
 	} else {
-		atomic_add_32(&zfs_active_fs_count, 1);
+		atomic_inc_32(&zfs_active_fs_count);
 	}
 
 	return (error);
 }
 
 /*
  * Unregister every dataset property callback tied to this zfsvfs.  Nothing
  * is unregistered for snapshots.  Each unregistration is VERIFY'd to
  * succeed.
  */
 void
 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 {
 	struct dsl_dataset *ds;
 
 	if (dmu_objset_is_snapshot(zfsvfs->z_os))
 		return;
 
 	ds = dmu_objset_ds(zfsvfs->z_os);
 
 	VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
 	    zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "aclinherit",
 	    acl_inherit_changed_cb, zfsvfs) == 0);
 	VERIFY(dsl_prop_unregister(ds, "vscan",
 	    vscan_changed_cb, zfsvfs) == 0);
 }
 
 #ifdef SECLABEL
 /*
  * Convert a decimal digit string to a uint64_t integer.
  *
  * Returns 0 and stores the value in *objnum on success.  Returns EINVAL,
  * leaving *objnum untouched, if the string contains a non-digit character
  * or if the value does not fit in 64 bits.  An empty string converts to 0.
  */
 static int
 str_to_uint64(char *str, uint64_t *objnum)
 {
 	const uint64_t max = ~(uint64_t)0;
 	uint64_t num = 0;
 
 	while (*str) {
 		uint64_t digit;
 
 		if (*str < '0' || *str > '9')
 			return (SET_ERROR(EINVAL));
 		digit = *str++ - '0';
 
 		/*
 		 * Refuse a value that would wrap around; silently wrapping
 		 * would hand the caller an unrelated object number.
 		 */
 		if (num > (max - digit) / 10)
 			return (SET_ERROR(EINVAL));
 
 		num = num * 10 + digit;
 	}
 
 	*objnum = num;
 	return (0);
 }
 
 /*
  * The boot loader passes the boot path as
  * "rootpool-name/root-filesystem-object-number".  Translate it into a
  * dataset name of the form "rootpool-name/root-filesystem-name".
  *
  * bpath is copied to outpath first.  If bpath has no '/', or what follows
  * the first '/' is not accepted by str_to_uint64(), that copy is the
  * result and 0 is returned.  Otherwise the result (and return value) come
  * from dsl_dsobj_to_dsname().  An empty bpath or one starting with '/'
  * yields EINVAL.  bpath is modified temporarily and restored before
  * returning.
  */
 static int
 zfs_parse_bootfs(char *bpath, char *outpath)
 {
 	uint64_t objnum;
 	char *sep;
 	int error;
 
 	if (*bpath == 0 || *bpath == '/')
 		return (SET_ERROR(EINVAL));
 
 	(void) strcpy(outpath, bpath);
 
 	/* Pool name only, or a component that is not an object number. */
 	sep = strchr(bpath, '/');
 	if (sep == NULL || str_to_uint64(sep + 1, &objnum) != 0)
 		return (0);
 
 	/* Cut bpath down to the pool name just for the lookup. */
 	*sep = '\0';
 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
 	*sep = '/';
 
 	return (error);
 }
 
 /*
  * Decide whether a dataset carrying the hex label string hexsl may be
  * mounted into the global zone proper.
  *
  * The default label and admin_high are always accepted.  admin_low is
  * accepted only when the dataset's readonly property is set.  Any other
  * label, or a failure to read the readonly property, yields EACCES.
  */
 int
 zfs_check_global_label(const char *dsname, const char *hexsl)
 {
 	uint64_t rdonly;
 
 	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0 ||
 	    strcasecmp(hexsl, ADMIN_HIGH) == 0)
 		return (0);
 
 	if (strcasecmp(hexsl, ADMIN_LOW) != 0)
 		return (SET_ERROR(EACCES));
 
 	/* admin_low: must be readonly */
 	if (dsl_prop_get_integer(dsname,
 	    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL) != 0)
 		return (SET_ERROR(EACCES));
 
 	if (rdonly)
 		return (0);
 	return (EACCES);
 }
 
 /*
  * MAC check for a mount: compare, where appropriate, the dataset's label
  * with the label of the zone being mounted into.  A dataset that still has
  * the default label is given the zone's label.
  *
  * Returns 0 if the mount is allowed, otherwise an error (e.g. EACCES).
  */
 static int
 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
 {
 	char		ds_hexsl[MAXNAMELEN];
 	bslabel_t	ds_sl;
 	bslabel_t	*mnt_sl;
 	ts_label_t	*mnt_tsl;
 	zone_t		*mntzone;
 	int		retv = EACCES;		/* assume the worst */
 
 	/* Fetch the dataset's label property. */
 	if (dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL) != 0)
 		return (SET_ERROR(EACCES));
 
 	/*
 	 * Without labeling enabled, only datasets that still carry the
 	 * default label may be mounted; nothing else needs checking.
 	 */
 	if (!is_system_labeled()) {
 		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0)
 			return (SET_ERROR(EACCES));
 		return (0);
 	}
 
 	/*
 	 * Find the zone owning the mountpoint.  For the global zone (the
 	 * mountpoint is not inside an active zone) with the zoned property
 	 * off, the label must be default or admin_low/admin_high and no
 	 * further checks apply.
 	 */
 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
 	if (mntzone->zone_id == GLOBAL_ZONEID) {
 		uint64_t zoned;
 
 		zone_rele(mntzone);
 
 		if (dsl_prop_get_integer(osname,
 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL) != 0)
 			return (SET_ERROR(EACCES));
 
 		/*
 		 * A zoned dataset showing up here is a zone dataset being
 		 * mounted initially, before the zone is fully created;
 		 * that mount into the global zone is allowed.
 		 */
 		if (zoned)
 			return (0);
 		return (zfs_check_global_label(osname, ds_hexsl));
 	}
 
 	mnt_tsl = mntzone->zone_slabel;
 	ASSERT(mnt_tsl != NULL);
 	label_hold(mnt_tsl);
 	mnt_sl = label2bslabel(mnt_tsl);
 
 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
 		/* No real label on the dataset yet: store the zone's. */
 		char *str = NULL;
 
 		if (l_to_str_internal(mnt_sl, &str) == 0 &&
 		    dsl_prop_set_string(osname,
 		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 		    ZPROP_SRC_LOCAL, str) == 0)
 			retv = 0;
 		if (str != NULL)
 			kmem_free(str, strlen(str) + 1);
 	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
 		/*
 		 * Equal labels allow access.  A mountpoint label that
 		 * dominates the dataset label allows read-only access.
 		 * Anything else leaves retv at EACCES.
 		 */
 		if (blequal(mnt_sl, &ds_sl)) {
 			retv = 0;
 		} else if (bldominates(mnt_sl, &ds_sl)) {
 			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 			retv = 0;
 		}
 	}
 
 	label_rele(mnt_tsl);
 	zone_rele(mntzone);
 	return (retv);
 }
 #endif	/* SECLABEL */
 
 #ifdef OPENSOLARIS_MOUNTROOT
 /*
  * Root file system entry point, dispatching on 'why':
  *
  *   ROOT_INIT     import the root pool, resolve the boot dataset name and
  *                 mount it as root (allowed once; later calls get EBUSY).
  *   ROOT_REMOUNT  make the mount writable and re-register the property
  *                 callbacks.
  *   ROOT_UNMOUNT  unregister the property callbacks and sync.
  *
  * Any other value returns ENOTSUP.  Returns 0 on success or an errno.
  */
 static int
 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
 {
 	int error = 0;
 	static int zfsrootdone = 0;	/* non-zero after the first ROOT_INIT */
 	zfsvfs_t *zfsvfs = NULL;
 	znode_t *zp = NULL;
 	vnode_t *vp = NULL;
 	char *zfs_bootfs;
 	char *zfs_devid;
 
 	ASSERT(vfsp);
 
 	/*
 	 * The filesystem that we mount as root is defined in the
 	 * boot property "zfs-bootfs" with a format of
 	 * "poolname/root-dataset-objnum".
 	 */
 	if (why == ROOT_INIT) {
 		if (zfsrootdone++)
 			return (SET_ERROR(EBUSY));
 		/*
 		 * the process of doing a spa_load will require the
 		 * clock to be set before we could (for example) do
 		 * something better by looking at the timestamp on
 		 * an uberblock, so just set it to -1.
 		 */
 		clkset(-1);
 
 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
 			    "bootfs name");
 			return (SET_ERROR(EINVAL));
 		}
 		/* The device id is optional; it is freed only if present. */
 		zfs_devid = spa_get_bootprop("diskdevid");
 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
 		if (zfs_devid)
 			spa_free_bootprop(zfs_devid);
 		if (error) {
 			spa_free_bootprop(zfs_bootfs);
 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
 			    error);
 			return (error);
 		}
 		/* Resolve the boot property into rootfs.bo_name. */
 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
 			spa_free_bootprop(zfs_bootfs);
 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
 			    error);
 			return (error);
 		}
 
 		spa_free_bootprop(zfs_bootfs);
 
 		/* From here on every exit goes through "out" to unlock. */
 		if (error = vfs_lock(vfsp))
 			return (error);
 
 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
 			goto out;
 		}
 
 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
 		ASSERT(zfsvfs);
 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
 			goto out;
 		}
 
 		/* Mark the root znode's vnode as VROOT and publish it. */
 		vp = ZTOV(zp);
 		mutex_enter(&vp->v_lock);
 		vp->v_flag |= VROOT;
 		mutex_exit(&vp->v_lock);
 		rootvp = vp;
 
 		/*
 		 * Leave rootvp held.  The root file system is never unmounted.
 		 */
 
 		vfs_add((struct vnode *)0, vfsp,
 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
 out:
 		vfs_unlock(vfsp);
 		return (error);
 	} else if (why == ROOT_REMOUNT) {
 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
 		vfsp->vfs_flag |= VFS_REMOUNT;
 
 		/* refresh mount options */
 		zfs_unregister_callbacks(vfsp->vfs_data);
 		return (zfs_register_callbacks(vfsp));
 
 	} else if (why == ROOT_UNMOUNT) {
 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
 		(void) zfs_sync(vfsp, 0, 0);
 		return (0);
 	}
 
 	/*
 	 * if "why" is equal to anything else other than ROOT_INIT,
 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
 	 */
 	return (SET_ERROR(ENOTSUP));
 }
 #endif	/* OPENSOLARIS_MOUNTROOT */
 
 /*
  * Copy the pool component of a dataset name -- everything before the first
  * '/', or the whole name when there is none -- into poolname as a
  * NUL-terminated string.
  *
  * Returns 0 on success, or ENAMETOOLONG (writing nothing) when the pool
  * component is MAXNAMELEN characters or longer.
  */
 static int
 getpoolname(const char *osname, char *poolname)
 {
 	const char *slash = strchr(osname, '/');
 	size_t len;
 
 	len = (slash != NULL) ? (size_t)(slash - osname) : strlen(osname);
 	if (len >= MAXNAMELEN)
 		return (ENAMETOOLONG);
 
 	/* strncpy() does not terminate here, so do it by hand. */
 	(void) strncpy(poolname, osname, len);
 	poolname[len] = '\0';
 	return (0);
 }
 
 /*
  * VFS mount entry point for ZFS.
  *
  * Takes the dataset name from the "from" mount option, applies the mount
  * privilege / delegation checks, then either refreshes the property
  * callbacks (MS_REMOUNT) or performs the mount through zfs_domount(),
  * importing the root pool first for an initial root mount.
  *
  * Returns 0 on success or an errno value.
  */
 /*ARGSUSED*/
 static int
 zfs_mount(vfs_t *vfsp)
 {
 	kthread_t	*td = curthread;
 	vnode_t		*mvp = vfsp->mnt_vnodecovered;
 	cred_t		*cr = td->td_ucred;
 	char		*osname;
 	int		error = 0;
 	int		canwrite;
 
 	/*
 	 * NOTE(review): the illumos branch below ends on an `if' with no
 	 * statement of its own before the #else; as written it would govern
 	 * the first statement after the #endif.  Confirm whether a return
 	 * was lost when this was ported.
 	 */
 #ifdef illumos
 	if (mvp->v_type != VDIR)
 		return (SET_ERROR(ENOTDIR));
 
 	mutex_enter(&mvp->v_lock);
 	if ((uap->flags & MS_REMOUNT) == 0 &&
 	    (uap->flags & MS_OVERLAY) == 0 &&
 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 		mutex_exit(&mvp->v_lock);
 		return (SET_ERROR(EBUSY));
 	}
 	mutex_exit(&mvp->v_lock);
 
 	/*
 	 * ZFS does not support passing unparsed data in via MS_DATA.
 	 * Users should use the MS_OPTIONSTR interface; this means
 	 * that all option parsing is already done and the options struct
 	 * can be interrogated.
 	 */
 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
 #else
 	if (!prison_allow(td->td_ucred, PR_ALLOW_MOUNT_ZFS))
 		return (SET_ERROR(EPERM));
 
 	/* The dataset to mount is named by the "from" option. */
 	if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL))
 		return (SET_ERROR(EINVAL));
 #endif	/* ! illumos */
 
 	/*
 	 * If full-owner-access is enabled and delegated administration is
 	 * turned on, we must set nosuid.
 	 */
 	if (zfs_super_owner &&
 	    dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) {
 		secpolicy_fs_mount_clearopts(cr, vfsp);
 	}
 
 	/*
 	 * Check for mount privilege?
 	 *
 	 * If we don't have privilege then see if
 	 * we have local permission to allow it
 	 */
 	error = secpolicy_fs_mount(cr, mvp, vfsp);
 	if (error) {
 		/*
 		 * Every "goto out" inside this block returns the error
 		 * from secpolicy_fs_mount() above; it is not overwritten.
 		 */
 		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0)
 			goto out;
 
 		if (!(vfsp->vfs_flag & MS_REMOUNT)) {
 			vattr_t		vattr;
 
 			/*
 			 * Make sure user is the owner of the mount point
 			 * or has sufficient privileges.
 			 */
 
 			vattr.va_mask = AT_UID;
 
 			vn_lock(mvp, LK_SHARED | LK_RETRY);
 			if (VOP_GETATTR(mvp, &vattr, cr)) {
 				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 
 			if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 &&
 			    VOP_ACCESS(mvp, VWRITE, cr, td) != 0) {
 				VOP_UNLOCK(mvp, 0);
 				goto out;
 			}
 			VOP_UNLOCK(mvp, 0);
 		}
 
 		secpolicy_fs_mount_clearopts(cr, vfsp);
 	}
 
 	/*
 	 * Refuse to mount a filesystem if we are in a local zone and the
 	 * dataset is not visible.
 	 */
 	if (!INGLOBALZONE(curthread) &&
 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 #ifdef SECLABEL
 	error = zfs_mount_label_policy(vfsp, osname);
 	if (error)
 		goto out;
 #endif
 
 	vfsp->vfs_flag |= MNT_NFS4ACLS;
 
 	/*
 	 * When doing a remount, we simply refresh our temporary properties
 	 * according to those options set in the current VFS options.
 	 */
 	if (vfsp->vfs_flag & MS_REMOUNT) {
 		/* refresh mount options */
 		zfs_unregister_callbacks(vfsp->vfs_data);
 		error = zfs_register_callbacks(vfsp);
 		goto out;
 	}
 
 	/* Initial root mount: try hard to import the requested root pool. */
 	if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 &&
 	    (vfsp->vfs_flag & MNT_UPDATE) == 0) {
 		char pname[MAXNAMELEN];
 
 		error = getpoolname(osname, pname);
 		if (error == 0)
 			error = spa_import_rootpool(pname);
 		if (error)
 			goto out;
 	}
 	/* zfs_domount() runs between DROP_GIANT() and PICKUP_GIANT(). */
 	DROP_GIANT();
 	error = zfs_domount(vfsp, osname);
 	PICKUP_GIANT();
 
 #ifdef sun
 	/*
 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
 	 * disappear due to a forced unmount.
 	 */
 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
 		VFS_HOLD(mvp->v_vfsp);
 #endif	/* sun */
 
 out:
 	return (error);
 }
 
 /*
  * Fill in *statp with the space and object statistics of the mounted file
  * system, along with its type name and mount names.  Returns 0 once the
  * structure has been filled in.
  */
 static int
 zfs_statfs(vfs_t *vfsp, struct statfs *statp)
 {
 	uint64_t bytes_refd, bytes_avail, objs_used, objs_avail;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
 	statp->f_version = STATFS_VERSION;
 
 	ZFS_ENTER(zfsvfs);
 
 	dmu_objset_space(zfsvfs->z_os,
 	    &bytes_refd, &bytes_avail, &objs_used, &objs_avail);
 
 	/*
 	 * The pool uses several block sizes.  f_bsize reports the smallest
 	 * supported block size; f_iosize reports the I/O size recorded in
 	 * the mount's own statistics.
 	 */
 	statp->f_bsize = SPA_MINBLOCKSIZE;
 	statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize;
 
 	/*
 	 * Block totals, expressed in minimum-size blocks.  There is no
 	 * root reservation, so available equals free.
 	 */
 	statp->f_blocks = (bytes_refd + bytes_avail) >> SPA_MINBLOCKSHIFT;
 	statp->f_bfree = bytes_avail / statp->f_bsize;
 	statp->f_bavail = statp->f_bfree;
 
 	/*
 	 * ZFS does not preallocate files, so there is no fixed file count.
 	 * Free files is the smaller of the available objects and the free
 	 * blocks (each object takes at least one block); total files is
 	 * that plus the objects already in use.
 	 */
 	statp->f_ffree = MIN(objs_avail, statp->f_bfree);
 	statp->f_files = statp->f_ffree + objs_used;
 
 	/* Identify the file system type and where it is mounted. */
 	(void) strlcpy(statp->f_fstypename, "zfs",
 	    sizeof(statp->f_fstypename));
 	(void) strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname,
 	    sizeof(statp->f_mntfromname));
 	(void) strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname,
 	    sizeof(statp->f_mntonname));
 
 	statp->f_namemax = ZFS_MAXNAMELEN;
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Return the root vnode of the file system in *vpp, locked according to
  * 'flags' and marked VV_ROOT.  On any failure *vpp is set to NULL and the
  * error is returned.
  */
 static int
 zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	znode_t *rootzp;
 	int error;
 
 	ZFS_ENTER_NOERROR(zfsvfs);
 
 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 	if (error == 0)
 		*vpp = ZTOV(rootzp);
 
 	ZFS_EXIT(zfsvfs);
 
 	if (error != 0) {
 		*vpp = NULL;
 		return (error);
 	}
 
 	error = vn_lock(*vpp, flags);
 	if (error != 0) {
 		*vpp = NULL;
 		return (error);
 	}
 
 	(*vpp)->v_vflag |= VV_ROOT;
 	return (0);
 }
 
 /*
  * Teardown the zfsvfs::z_os.
  *
  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
  * and 'z_teardown_inactive_lock' held -- except on the EIO bail-out path
  * below, which releases both before returning.
  *
  * Returns EIO when not unmounting and the file system was already
  * unmounted or has no objset; otherwise returns 0.
  */
 static int
 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 {
 	znode_t	*zp;
 
 	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 
 	if (!unmounting) {
 		/*
 		 * We purge the parent filesystem's vfsp as the parent
 		 * filesystem and all of its snapshots have their vnode's
 		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
 		 * 'z_parent' is self referential for non-snapshots.
 		 */
 		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 #ifdef FREEBSD_NAMECACHE
 		cache_purgevfs(zfsvfs->z_parent->z_vfs);
 #endif
 	}
 
 	/*
 	 * Close the zil. NB: Can't close the zil while zfs_inactive
 	 * threads are blocked as zil_close can call zfs_inactive.
 	 */
 	if (zfsvfs->z_log) {
 		zil_close(zfsvfs->z_log);
 		zfsvfs->z_log = NULL;
 	}
 
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
 
 	/*
 	 * If we are not unmounting (ie: online recv) and someone already
 	 * unmounted this file system while we were doing the switcheroo,
 	 * or a reopen of z_os failed then just bail out now.
 	 */
 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * At this point there are no vops active, and any new vops will
 	 * fail with EIO since we have z_teardown_lock for writer (only
 	 * relevant for forced unmount).
 	 *
 	 * Release all holds on dbufs.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
 	    zp = list_next(&zfsvfs->z_all_znodes, zp))
 		/* Only znodes that still have an SA handle are torn down. */
 		if (zp->z_sa_hdl) {
 			ASSERT(ZTOV(zp)->v_count >= 0);
 			zfs_znode_dmu_fini(zp);
 		}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/*
 	 * If we are unmounting, set the unmounted flag and let new vops
 	 * unblock.  zfs_inactive will have the unmounted behavior, and all
 	 * other vops will fail with EIO.
 	 */
 	if (unmounting) {
 		zfsvfs->z_unmounted = B_TRUE;
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	}
 
 	/*
 	 * z_os will be NULL if there was an error in attempting to reopen
 	 * zfsvfs, so just return as the properties had already been
 	 * unregistered and cached data had been evicted before.
 	 */
 	if (zfsvfs->z_os == NULL)
 		return (0);
 
 	/*
 	 * Unregister properties.
 	 */
 	zfs_unregister_callbacks(zfsvfs);
 
 	/*
 	 * Evict cached data.  A dirty dataset on a mount that is not
 	 * read-only is first waited on until its changes are synced.
 	 */
 	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
 	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 	dmu_objset_evict_dbufs(zfsvfs->z_os);
 
 	return (0);
 }
 
 /*
  * VFS unmount entry point for ZFS.
  *
  * After the privilege / delegation check, unmounts any snapshots under
  * .zfs, destroys the .zfs control directory, flushes the vnodes, tears the
  * zfsvfs down, releases the objset and frees the vfs data.  MS_FORCE in
  * fflag marks the file system unmounted up front and makes the flush
  * forcible.
  *
  * Returns 0 on success, or an errno value (EBUSY when .zfs is still
  * referenced on a non-forced unmount).
  */
 /*ARGSUSED*/
 static int
 zfs_umount(vfs_t *vfsp, int fflag)
 {
 	kthread_t *td = curthread;
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	objset_t *os;
 	cred_t *cr = td->td_ucred;
 	int ret;
 
 	/*
 	 * Without unmount privilege, fall back to the delegated "mount"
 	 * permission on the dataset; fail with the policy error otherwise.
 	 */
 	ret = secpolicy_fs_unmount(cr, vfsp);
 	if (ret) {
 		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
 		    ZFS_DELEG_PERM_MOUNT, cr))
 			return (ret);
 	}
 
 	/*
 	 * We purge the parent filesystem's vfsp as the parent filesystem
 	 * and all of its snapshots have their vnode's v_vfsp set to the
 	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
 	 * referential for non-snapshots.
 	 */
 	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 
 	/*
 	 * Unmount any snapshots mounted under .zfs before unmounting the
 	 * dataset itself.
 	 */
 	if (zfsvfs->z_ctldir != NULL) {
 		if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0)
 			return (ret);
 		/* This non-forced flush is expected to report EBUSY. */
 		ret = vflush(vfsp, 0, 0, td);
 		ASSERT(ret == EBUSY);
 		if (!(fflag & MS_FORCE)) {
 			if (zfsvfs->z_ctldir->v_count > 1)
 				return (EBUSY);
 			ASSERT(zfsvfs->z_ctldir->v_count == 1);
 		}
 		zfsctl_destroy(zfsvfs);
 		ASSERT(zfsvfs->z_ctldir == NULL);
 	}
 
 	if (fflag & MS_FORCE) {
 		/*
 		 * Mark file system as unmounted before calling
 		 * vflush(FORCECLOSE). This way we ensure no future vnops
 		 * will be called and risk operating on DOOMED vnodes.
 		 */
 		rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 		zfsvfs->z_unmounted = B_TRUE;
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 	}
 
 	/*
 	 * Flush all the files.
 	 */
 	ret = vflush(vfsp, 1, (fflag & MS_FORCE) ? FORCECLOSE : 0, td);
 	if (ret != 0) {
 		/* The unmount is abandoned: put .zfs back for non-snapshots. */
 		if (!zfsvfs->z_issnap) {
 			zfsctl_create(zfsvfs);
 			ASSERT(zfsvfs->z_ctldir != NULL);
 		}
 		return (ret);
 	}
 
 #ifdef sun
 	if (!(fflag & MS_FORCE)) {
 		/*
 		 * Check the number of active vnodes in the file system.
 		 * Our count is maintained in the vfs structure, but the
 		 * number is off by 1 to indicate a hold on the vfs
 		 * structure itself.
 		 *
 		 * The '.zfs' directory maintains a reference of its
 		 * own, and any active references underneath are
 		 * reflected in the vnode count.
 		 */
 		if (zfsvfs->z_ctldir == NULL) {
 			if (vfsp->vfs_count > 1)
 				return (SET_ERROR(EBUSY));
 		} else {
 			if (vfsp->vfs_count > 2 ||
 			    zfsvfs->z_ctldir->v_count > 1)
 				return (SET_ERROR(EBUSY));
 		}
 	}
 #endif
 
 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
 	os = zfsvfs->z_os;
 
 	/*
 	 * z_os will be NULL if there was an error in
 	 * attempting to reopen zfsvfs.
 	 */
 	if (os != NULL) {
 		/*
 		 * Unset the objset user_ptr.
 		 */
 		mutex_enter(&os->os_user_ptr_lock);
 		dmu_objset_set_user(os, NULL);
 		mutex_exit(&os->os_user_ptr_lock);
 
 		/*
 		 * Finally release the objset
 		 */
 		dmu_objset_disown(os, zfsvfs);
 	}
 
 	/*
 	 * We can now safely destroy the '.zfs' directory node.
 	 */
 	if (zfsvfs->z_ctldir != NULL)
 		zfsctl_destroy(zfsvfs);
 	if (zfsvfs->z_issnap) {
 		vnode_t *svp = vfsp->mnt_vnodecovered;
 
 		/*
 		 * NOTE(review): presumably drops an extra hold taken on the
 		 * covered vnode when the snapshot was mounted -- confirm
 		 * against the snapshot mount path.
 		 */
 		if (svp->v_count >= 2)
 			VN_RELE(svp);
 	}
 	zfs_freevfs(vfsp);
 
 	return (0);
 }
 
 /*
  * Look up the vnode for inode number 'ino' and return it in *vpp, locked
  * according to 'flags'.
  *
  * Returns EOPNOTSUPP for the .zfs control entries and the shares directory,
  * EINVAL for a znode that has been unlinked, or the error from zfs_zget()
  * or vn_lock().  *vpp is set to NULL on every failure that occurs after
  * the EOPNOTSUPP check.
  */
 static int
 zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	int 		err;
 
 	/*
 	 * zfs_zget() cannot handle virtual entries such as .zfs/ or
 	 * .zfs/snapshot/, hence EOPNOTSUPP.  That makes NFS fall back to
 	 * LOOKUP instead of using VGET.
 	 */
 	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
 	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
 		return (EOPNOTSUPP);
 
 	ZFS_ENTER(zfsvfs);
 	err = zfs_zget(zfsvfs, ino, &zp);
 	if (err == 0) {
 		if (zp->z_unlinked) {
 			/* Drop the hold taken by zfs_zget(). */
 			VN_RELE(ZTOV(zp));
 			err = EINVAL;
 		} else {
 			*vpp = ZTOV(zp);
 		}
 	}
 	ZFS_EXIT(zfsvfs);
 
 	if (err == 0)
 		err = vn_lock(*vpp, flags);
 	if (err != 0) {
 		*vpp = NULL;
 		return (err);
 	}
 	return (0);
 }
 
 /*
  * Export check for NFS: hand the request to vfs_stdcheckexp() using the
  * mount of the parent file system.
  */
 static int
 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
     struct ucred **credanonp, int *numsecflavors, int **secflavors)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	vfs_t *exportvfs;
 
 	/*
 	 * For a regular file system, vfsp is the same as
 	 * zfsvfs->z_parent->z_vfs.  For a snapshot,
 	 * zfsvfs->z_parent->z_vfs is the parent file system, and that is
 	 * the one we must use because only it has mnt_export configured.
 	 */
 	exportvfs = zfsvfs->z_parent->z_vfs;
 
 	return (vfs_stdcheckexp(exportvfs, nam, extflagsp, credanonp,
 	    numsecflavors, secflavors));
 }
 
 /*
  * Compile-time checks that both ZFS file-ID lengths fit inside the generic
  * struct fid.
  */
 CTASSERT(SHORT_FID_LEN <= sizeof(struct fid));
 CTASSERT(LONG_FID_LEN <= sizeof(struct fid));
 
 /*
  * Convert a file handle (fid) into a referenced, locked vnode.
  *
  * A short fid carries an object number and generation.  A long fid
  * additionally carries an objset id and objset generation; when one is
  * received on a mount that is its own parent, the objset is resolved with
  * zfsctl_lookup_objset() and the lookup continues against that zfsvfs.
  *
  * Returns 0 with *vpp locked according to 'flags', or an errno value with
  * *vpp set to NULL.
  */
 static int
 zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	uint64_t	object = 0;
 	uint64_t	fid_gen = 0;
 	uint64_t	gen_mask;
 	uint64_t	zp_gen;
 	int		i, err;
 
 	*vpp = NULL;
 
 	ZFS_ENTER(zfsvfs);
 
 	/*
 	 * On FreeBSD we can get snapshot's mount point or its parent file
 	 * system mount point depending if snapshot is already mounted or not.
 	 */
 	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	objsetid = 0;
 		uint64_t	setgen = 0;
 
 		/* Both fields are stored as little-endian byte arrays. */
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
 
 		ZFS_EXIT(zfsvfs);
 
 		/* From here on zfsvfs refers to the looked-up objset. */
 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
 		if (err)
 			return (SET_ERROR(EINVAL));
 		ZFS_ENTER(zfsvfs);
 	}
 
 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
 
 		for (i = 0; i < sizeof (zfid->zf_object); i++)
 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
 	} else {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
 	 * directory tree. If the object == zfsvfs->z_shares_dir, then
 	 * we are in the .zfs/shares directory tree.
 	 */
 	if ((fid_gen == 0 &&
 	     (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) ||
 	    (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) {
 		*vpp = zfsvfs->z_ctldir;
 		ASSERT(*vpp != NULL);
 		if (object == ZFSCTL_INO_SNAPDIR) {
 			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
 		} else if (object == zfsvfs->z_shares_dir) {
 			VERIFY(zfsctl_root_lookup(*vpp, "shares", vpp, NULL,
 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
 		} else {
 			VN_HOLD(*vpp);
 		}
 		ZFS_EXIT(zfsvfs);
 		err = vn_lock(*vpp, flags);
 		if (err != 0) {
 			/*
 			 * Release the hold taken above so a failed lock
 			 * does not leak a vnode reference.
 			 * NOTE(review): assumes zfsctl_root_lookup() returns
 			 * a held, unlocked vnode -- confirm.
 			 */
 			VN_RELE(*vpp);
 			*vpp = NULL;
 		}
 		return (err);
 	}
 
 	/*
 	 * 'i' still equals sizeof (zf_gen) from the last loop above, so this
 	 * builds a mask as wide as the generation stored in the fid.
 	 */
 	gen_mask = -1ULL >> (64 - 8 * i);
 
 	dprintf("getting %llu [%llu mask %llx]\n", (unsigned long long)object,
 	    (unsigned long long)fid_gen, (unsigned long long)gen_mask);
 	if ((err = zfs_zget(zfsvfs, object, &zp)) != 0) {
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
 	    sizeof (uint64_t));
 	zp_gen = zp_gen & gen_mask;
 	/* A masked generation of zero is compared as 1. */
 	if (zp_gen == 0)
 		zp_gen = 1;
 	if (zp->z_unlinked || zp_gen != fid_gen) {
 		dprintf("znode gen (%llu) != fid gen (%llu)\n",
 		    (unsigned long long)zp_gen, (unsigned long long)fid_gen);
 		VN_RELE(ZTOV(zp));
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	*vpp = ZTOV(zp);
 	ZFS_EXIT(zfsvfs);
 	err = vn_lock(*vpp, flags | LK_RETRY);
 	if (err == 0) {
 		vnode_create_vobject(*vpp, zp->z_size, curthread);
 	} else {
 		/* Drop the zfs_zget() hold; the caller gets no vnode. */
 		VN_RELE(*vpp);
 		*vpp = NULL;
 	}
 	return (err);
 }
 
 /*
  * Quiesce the file system: block out VOPs and close zfsvfs_t::z_os.
  *
  * On success we return with both 'z_teardown_lock' and
  * 'z_teardown_inactive_lock' write held.  Ownership of the underlying
  * dataset and objset is left intact so that they can be handed off
  * atomically during a subsequent rollback or recv operation and the
  * resume that follows it.
  */
 int
 zfs_suspend_fs(zfsvfs_t *zfsvfs)
 {
 	/* The teardown result (0 on success) is the caller's result. */
 	return (zfsvfs_teardown(zfsvfs, B_FALSE));
 }
 
 /*
  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
  * is an invariant across any of the operations that can be performed while the
  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
  * are the same: the relevant objset and associated dataset are owned by
  * zfsvfs, held, and long held on entry.
  */
 int
 zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
 {
 	int err;
 	znode_t *zp;
 	uint64_t sa_obj = 0;
 
 	/* The caller must still hold both teardown locks for writing. */
 	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
 	/*
 	 * We already own this, so just hold and rele it to update the
 	 * objset_t, as the one we had before may have been evicted.
 	 */
 	VERIFY0(dmu_objset_hold(osname, zfsvfs, &zfsvfs->z_os));
 	VERIFY3P(zfsvfs->z_os->os_dsl_dataset->ds_owner, ==, zfsvfs);
 	VERIFY(dsl_dataset_long_held(zfsvfs->z_os->os_dsl_dataset));
 	dmu_objset_rele(zfsvfs->z_os, zfsvfs);
 
 	/*
 	 * Make sure version hasn't changed
 	 */
 
 	err = zfs_get_zplprop(zfsvfs->z_os, ZFS_PROP_VERSION,
 	    &zfsvfs->z_version);
 
 	if (err)
 		goto bail;
 
 	/*
 	 * sa_obj keeps its initial value of 0 when this lookup fails; the
 	 * failure is only treated as fatal for SA-capable ZPL versions.
 	 */
 	err = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ,
 	    ZFS_SA_ATTRS, 8, 1, &sa_obj);
 
 	if (err && zfsvfs->z_version >= ZPL_VERSION_SA)
 		goto bail;
 
 	if ((err = sa_setup(zfsvfs->z_os, sa_obj,
 	    zfs_attr_table,  ZPL_END, &zfsvfs->z_attr_table)) != 0)
 		goto bail;
 
 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
 		sa_register_update_callback(zfsvfs->z_os,
 		    zfs_sa_upgrade);
 
 	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
 
 	zfs_set_fuid_feature(zfsvfs);
 
 	/*
 	 * Attempt to re-establish all the active znodes with
 	 * their dbufs.  If a zfs_rezget() fails, then we'll let
 	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
 	 * when they try to use their znode.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
 	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
 		(void) zfs_rezget(zp);
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/* Success and every error path above both arrive here. */
 bail:
 	/* release the VOPs */
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 
 	if (err) {
 		/*
 		 * Since we couldn't setup the sa framework, try to force
 		 * unmount this file system.
 		 *
 		 * NOTE(review): err can also come from zfs_get_zplprop() or
 		 * zap_lookup() above, not only from sa_setup().
 		 */
 		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread);
 	}
 	return (err);
 }
 
 /*
  * Free the zfsvfs attached to this mount and decrement the count of
  * active ZFS file systems.
  */
 static void
 zfs_freevfs(vfs_t *vfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
 #ifdef sun
 	/*
 	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
 	 * from zfs_mount().  Release it here.  If we came through
 	 * zfs_mountroot() instead, we didn't grab an extra hold, so
 	 * skip the VFS_RELE for rootvfs.
 	 */
 	if (zfsvfs->z_issnap && (vfsp != rootvfs))
 		VFS_RELE(zfsvfs->z_parent->z_vfs);
 #endif	/* sun */
 
 	zfsvfs_free(zfsvfs);
 
-	atomic_add_32(&zfs_active_fs_count, -1);
+	atomic_dec_32(&zfs_active_fs_count);
 }
 
 #ifdef __i386__
 /* desiredvnodes as it was before zfs_vnodes_adjust() changed it. */
 static int desiredvnodes_backup;
 #endif
 
 /*
  * On i386 only: remember the current desiredvnodes and, if it still has
  * the value recomputed below, reduce it to three quarters of that value.
  * Does nothing on other architectures.
  */
 static void
 zfs_vnodes_adjust(void)
 {
 #ifdef __i386__
 	int newdesiredvnodes;
 
 	desiredvnodes_backup = desiredvnodes;
 
 	/*
 	 * We calculate newdesiredvnodes the same way it is done in
 	 * vntblinit(). If it is equal to desiredvnodes, it means that
 	 * it wasn't tuned by the administrator and we can tune it down.
 	 */
 	newdesiredvnodes = min(maxproc + cnt.v_page_count / 4, 2 *
 	    vm_kmem_size / (5 * (sizeof(struct vm_object) +
 	    sizeof(struct vnode))));
 	if (newdesiredvnodes == desiredvnodes)
 		desiredvnodes = (3 * newdesiredvnodes) / 4;
 #endif
 }
 
 /*
  * On i386 only: restore desiredvnodes from the value saved in
  * desiredvnodes_backup.  Does nothing on other architectures.
  */
 static void
 zfs_vnodes_adjust_back(void)
 {
 
 #ifdef __i386__
 	desiredvnodes = desiredvnodes_backup;
 #endif
 }
 
 /*
  * One-time ZFS file system layer initialization: prints the ZPL version,
  * sets up the .zfs control directory and znode subsystems, adjusts the
  * vnode limit and registers the space-delta callback for DMU_OST_ZFS.
  */
 void
 zfs_init(void)
 {
 
 	printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n");
 
 	/*
 	 * Initialize .zfs directory structures
 	 */
 	zfsctl_init();
 
 	/*
 	 * Initialize znode cache, vnode ops, etc...
 	 */
 	zfs_znode_init();
 
 	/*
 	 * Reduce number of vnodes. Originally number of vnodes is calculated
 	 * with UFS inode in mind. We reduce it here, because it's too big for
 	 * ZFS/i386.
 	 */
 	zfs_vnodes_adjust();
 
 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
 }
 
 /*
  * Tear down what zfs_init() set up: the .zfs control directory and znode
  * subsystems, then restore the vnode limit.
  */
 void
 zfs_fini(void)
 {
 	zfsctl_fini();
 	zfs_znode_fini();
 	zfs_vnodes_adjust_back();
 }
 
 /*
  * Report whether any ZFS file system is currently counted as active.
  * Returns 1 when zfs_active_fs_count is non-zero, 0 otherwise.
  */
 int
 zfs_busy(void)
 {
 	if (zfs_active_fs_count == 0)
 		return (0);
 	return (1);
 }
 
 /*
  * Upgrade the ZPL version recorded in the master node to 'newvers'.
  *
  * Returns EINVAL if newvers is outside [ZPL_VERSION_INITIAL, ZPL_VERSION]
  * or lower than the current version, ENOTSUP if the pool's SPA version is
  * below what newvers maps to, the error from dmu_tx_assign() or
  * zap_update() if either fails, and 0 on success.
  */
 int
 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 {
 	int error;
 	objset_t *os = zfsvfs->z_os;
 	dmu_tx_t *tx;
 
 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
 		return (SET_ERROR(EINVAL));
 
 	/* Downgrades are not allowed. */
 	if (newvers < zfsvfs->z_version)
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_spa_version_map(newvers) >
 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
 		return (SET_ERROR(ENOTSUP));
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
 	/* Moving to SA for the first time also creates the SA master node. */
 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 		    ZFS_SA_ATTRS);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	}
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 	    8, 1, &newvers, tx);
 
 	if (error) {
 		/* The tx is already assigned, so it is committed, not aborted. */
 		dmu_tx_commit(tx);
 		return (error);
 	}
 
 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
 		uint64_t sa_obj;
 
 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
 		    SPA_VERSION_SA);
 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
 		    DMU_OT_NONE, 0, tx);
 
 		/*
 		 * NOTE(review): the zap_add() result is only checked by
 		 * ASSERT0; presumably unchecked in non-debug builds -- confirm.
 		 */
 		error = zap_add(os, MASTER_NODE_OBJ,
 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
 		ASSERT0(error);
 
 		VERIFY(0 == sa_set_sa_object(os, sa_obj));
 		sa_register_update_callback(os, zfs_sa_upgrade);
 	}
 
 	/* Logged before z_version is updated, so "from" is the old version. */
 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
 	    "from %llu to %llu", zfsvfs->z_version, newvers);
 
 	dmu_tx_commit(tx);
 
 	zfsvfs->z_version = newvers;
 
 	zfs_set_fuid_feature(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Fetch a property stored within the master node of 'os', substituting a
  * built-in default when no value is stored.
  *
  * Returns 0 with *value filled in, ENOENT when nothing is stored and the
  * property has no default here, or any other error from zap_lookup().
  */
 int
 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 {
 	const char *pname;
 	int error;
 
 	/*
 	 * The version property is stored under a slightly different
 	 * string than its property name.
 	 */
 	pname = (prop == ZFS_PROP_VERSION) ?
 	    ZPL_VERSION_STR : zfs_prop_to_name(prop);
 
 	/* With no objset there is nothing to look up: treat as unset. */
 	if (os == NULL)
 		error = ENOENT;
 	else
 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
 
 	if (error != ENOENT)
 		return (error);
 
 	/* No value set, use the default value */
 	switch (prop) {
 	case ZFS_PROP_VERSION:
 		*value = ZPL_VERSION;
 		break;
 	case ZFS_PROP_NORMALIZE:
 	case ZFS_PROP_UTF8ONLY:
 		*value = 0;
 		break;
 	case ZFS_PROP_CASE:
 		*value = ZFS_CASE_SENSITIVE;
 		break;
 	default:
 		return (error);
 	}
 	return (0);
 }
 
 #ifdef _KERNEL
 /*
  * Walk the mount list and replace 'oldname' with 'newname' in every
  * f_mntfromname that either equals oldname or starts with oldname followed
  * by '/' or '@'.  The walk is done with mountlist_mtx held.
  */
 void
 zfsvfs_update_fromname(const char *oldname, const char *newname)
 {
 	char renamed[MAXPATHLEN];
 	struct mount *mp;
 	char *from;
 	size_t prefixlen;
 
 	prefixlen = strlen(oldname);
 
 	mtx_lock(&mountlist_mtx);
 	TAILQ_FOREACH(mp, &mountlist, mnt_list) {
 		from = mp->mnt_stat.f_mntfromname;
 
 		if (strcmp(from, oldname) == 0) {
 			/* Exact match: the whole name is replaced. */
 			(void)strlcpy(from, newname,
 			    sizeof(mp->mnt_stat.f_mntfromname));
 		} else if (strncmp(from, oldname, prefixlen) == 0 &&
 		    (from[prefixlen] == '/' || from[prefixlen] == '@')) {
 			/*
 			 * Prefix match: keep the suffix.  The new name is
 			 * built in a scratch buffer because the suffix lives
 			 * in the destination itself.
 			 */
 			(void)snprintf(renamed, sizeof(renamed), "%s%s",
 			    newname, from + prefixlen);
 			(void)strlcpy(from, renamed,
 			    sizeof(mp->mnt_stat.f_mntfromname));
 		}
 	}
 	mtx_unlock(&mountlist_mtx);
 }
 #endif
 #endif
Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_inject.c	(revision 271001)
@@ -1,524 +1,524 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
  */
 
 /*
  * ZFS fault injection
  *
  * To handle fault injection, we keep track of a series of zinject_record_t
  * structures which describe which logical block(s) should be injected with a
  * fault.  These are kept in a global list.  Each record corresponds to a given
  * spa_t and maintains a special hold on the spa_t so that it cannot be deleted
  * or exported while the injection record exists.
  *
  * Device level injection is done using the 'zi_guid' field.  If this is set, it
  * means that the error is destined for a particular device, not a piece of
  * data.
  *
  * This is a rather poor data structure and algorithm, but we don't expect more
  * than a few faults at any one time, so it should be sufficient for our needs.
  */
 
 #include <sys/arc.h>
 #include <sys/zio_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/vdev_impl.h>
 #include <sys/dmu_objset.h>
 #include <sys/fs/zfs.h>
 
 /* Incremented per registered handler, decremented when one is cleared. */
 uint32_t zio_injection_enabled;
 
 /* One registered injection record, linked on inject_handlers. */
 typedef struct inject_handler {
 	int			zi_id;		/* id from inject_next_id */
 	spa_t			*zi_spa;	/* pool the record applies to */
 	zinject_record_t	zi_record;	/* copy of the caller's record */
 	list_node_t		zi_link;	/* linkage on inject_handlers */
 } inject_handler_t;
 
 /* List of all handlers; protected by inject_lock. */
 static list_t inject_handlers;
 static krwlock_t inject_lock;
 /* Next handler id to hand out; advanced under inject_lock as writer. */
 static int inject_next_id = 1;
 
 /*
  * Returns true if the given record matches the I/O in progress.
  *
  * A matching record with a non-zero zi_freq is additionally gated on
  * spa_get_random(100) < zi_freq; a zi_freq of zero always matches.
  */
 static boolean_t
 zio_match_handler(zbookmark_phys_t *zb, uint64_t type,
     zinject_record_t *record, int error)
 {
 	/*
 	 * MOS records are matched on object type only; 'error' is not
 	 * consulted on this path.
 	 */
 	if (zb->zb_objset == DMU_META_OBJSET &&
 	    record->zi_objset == DMU_META_OBJSET &&
 	    record->zi_object == DMU_META_DNODE_OBJECT) {
 		if (record->zi_type != DMU_OT_NONE &&
 		    type != record->zi_type)
 			return (B_FALSE);
 
 		return (record->zi_freq == 0 ||
 		    spa_get_random(100) < record->zi_freq);
 	}
 
 	/*
 	 * Otherwise require an exact match: same objset, object and level,
 	 * a block id inside [zi_start, zi_end], and the same error.
 	 */
 	if (zb->zb_objset != record->zi_objset ||
 	    zb->zb_object != record->zi_object ||
 	    zb->zb_level != record->zi_level ||
 	    zb->zb_blkid < record->zi_start ||
 	    zb->zb_blkid > record->zi_end ||
 	    error != record->zi_error)
 		return (B_FALSE);
 
 	return (record->zi_freq == 0 ||
 	    spa_get_random(100) < record->zi_freq);
 }
 
 /*
  * Panic the system if a handler registered for this pool names the
  * function 'tag' and carries the given type.
  */
 void
 zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type)
 {
 	inject_handler_t *handler;
 
 	rw_enter(&inject_lock, RW_READER);
 
 	handler = list_head(&inject_handlers);
 	while (handler != NULL) {
 		if (spa == handler->zi_spa &&
 		    handler->zi_record.zi_type == type &&
 		    strcmp(tag, handler->zi_record.zi_func) == 0)
 			panic("Panic requested in function %s\n", tag);
 
 		handler = list_next(&inject_handlers, handler);
 	}
 
 	rw_exit(&inject_lock);
 }
 
 /*
  * Determine if the I/O in question should return failure.  Returns
  * 'error' when a data-fault handler for this pool matches the I/O, and 0
  * otherwise.
  */
 int
 zio_handle_fault_injection(zio_t *zio, int error)
 {
 	inject_handler_t *handler;
 	int ret = 0;
 
 	/*
 	 * I/O not associated with any logical data is ignored, and only
 	 * reads are currently supported for fault injection.
 	 */
 	if (zio->io_logical == NULL || zio->io_type != ZIO_TYPE_READ)
 		return (0);
 
 	rw_enter(&inject_lock, RW_READER);
 
 	handler = list_head(&inject_handlers);
 	while (handler != NULL) {
 		/* The first matching handler decides the outcome. */
 		if (zio->io_spa == handler->zi_spa &&
 		    handler->zi_record.zi_cmd == ZINJECT_DATA_FAULT &&
 		    zio_match_handler(&zio->io_logical->io_bookmark,
 		    zio->io_bp ? BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
 		    &handler->zi_record, error)) {
 			ret = error;
 			break;
 		}
 
 		handler = list_next(&inject_handlers, handler);
 	}
 
 	rw_exit(&inject_lock);
 
 	return (ret);
 }
 
 /*
  * Determine if the zio is part of a label update and has an injection
  * handler associated with that portion of the label. Currently, we
  * allow error injection in either the nvlist or the uberblock region
  * of the vdev label.
  *
  * Returns 'error' when a label-fault handler for this vdev covers the
  * zio's offset, and 0 otherwise.
  */
 int
 zio_handle_label_injection(zio_t *zio, int error)
 {
 	inject_handler_t *handler;
 	vdev_t *vd = zio->io_vd;
 	uint64_t offset = zio->io_offset;
 	int label;
 	int ret = 0;
 
 	/* Offsets between the front and back label areas are not labels. */
 	if (offset >= VDEV_LABEL_START_SIZE &&
 	    offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
 		return (0);
 
 	rw_enter(&inject_lock, RW_READER);
 
 	handler = list_head(&inject_handlers);
 	while (handler != NULL) {
 		uint64_t start, end;
 
 		if (handler->zi_record.zi_cmd == ZINJECT_LABEL_FAULT) {
 			/*
 			 * The injection region is given as offsets relative
 			 * to a vdev label.  Work out which label is being
 			 * updated and translate the region accordingly.
 			 */
 			label = vdev_label_number(vd->vdev_psize, offset);
 			start = vdev_label_offset(vd->vdev_psize, label,
 			    handler->zi_record.zi_start);
 			end = vdev_label_offset(vd->vdev_psize, label,
 			    handler->zi_record.zi_end);
 
 			if (zio->io_vd->vdev_guid ==
 			    handler->zi_record.zi_guid &&
 			    offset >= start && offset <= end) {
 				ret = error;
 				break;
 			}
 		}
 
 		handler = list_next(&inject_handlers, handler);
 	}
 
 	rw_exit(&inject_lock);
 	return (ret);
 }
 
 
 /*
  * Decide whether a device-level fault should be injected for vdev 'vd'.
  * 'zio' may be NULL.
  *
  * Returns 'error' when a ZINJECT_DEVICE_FAULT handler registered for this
  * vdev's guid carries the same zi_error, SET_ERROR(EIO) when such a
  * handler carries ENXIO but a different error was asked about, and 0
  * otherwise.
  */
 int
 zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error)
 {
 	inject_handler_t *handler;
 	int ret = 0;
 
 	/*
 	 * We skip over faults in the labels unless it's during
 	 * device open (i.e. zio == NULL).
 	 */
 	if (zio != NULL) {
 		uint64_t offset = zio->io_offset;
 
 		if (offset < VDEV_LABEL_START_SIZE ||
 		    offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
 			return (0);
 	}
 
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
 		if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
 			continue;
 
 		if (vd->vdev_guid == handler->zi_record.zi_guid) {
 			/*
 			 * Failfast handlers are skipped when there is no zio
 			 * and for zios flagged as retry or try-hard.
 			 */
 			if (handler->zi_record.zi_failfast &&
 			    (zio == NULL || (zio->io_flags &
 			    (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
 				continue;
 			}
 
 			/* Handle type specific I/O failures */
 			if (zio != NULL &&
 			    handler->zi_record.zi_iotype != ZIO_TYPES &&
 			    handler->zi_record.zi_iotype != zio->io_type)
 				continue;
 
 			if (handler->zi_record.zi_error == error) {
 				/*
 				 * For a failed open, pretend like the device
 				 * has gone away.
 				 */
 				if (error == ENXIO)
 					vd->vdev_stat.vs_aux =
 					    VDEV_AUX_OPEN_FAILED;
 
 				/*
 				 * Treat these errors as if they had been
 				 * retried so that all the appropriate stats
 				 * and FMA events are generated.
 				 */
 				if (!handler->zi_record.zi_failfast &&
 				    zio != NULL)
 					zio->io_flags |= ZIO_FLAG_IO_RETRY;
 
 				ret = error;
 				break;
 			}
 			/*
 			 * An ENXIO handler answers a query about any other
 			 * error with EIO.
 			 */
 			if (handler->zi_record.zi_error == ENXIO) {
 				ret = SET_ERROR(EIO);
 				break;
 			}
 		}
 	}
 
 	rw_exit(&inject_lock);
 
 	return (ret);
 }
 
 /*
  * Simulate hardware that ignores cache flushes.  For the requested
  * duration, nix the actual writing to disk.
  */
 void
 zio_handle_ignored_writes(zio_t *zio)
 {
 	inject_handler_t *handler;
 
 	rw_enter(&inject_lock, RW_READER);
 
 	/* Find the first ignored-writes handler destined for this pool. */
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 		if (zio->io_spa == handler->zi_spa &&
 		    handler->zi_record.zi_cmd == ZINJECT_IGNORED_WRITES)
 			break;
 	}
 
 	if (handler != NULL) {
 		/*
 		 * Start the timer on first use.  A positive duration is a
 		 * number of seconds, a negative one a number of txgs.
 		 */
 		if (handler->zi_record.zi_timer == 0) {
 			if (handler->zi_record.zi_duration > 0)
 				handler->zi_record.zi_timer = ddi_get_lbolt64();
 			else
 				handler->zi_record.zi_timer = zio->io_txg;
 		}
 
 		/* Have a "problem" writing 60% of the time */
 		if (spa_get_random(100) < 60)
 			zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES;
 	}
 
 	rw_exit(&inject_lock);
 }
 
 /*
  * For every ignored-writes handler registered for 'spa', VERIFY that its
  * timer is either unset or has not yet run past the configured duration.
  * Does nothing while no injection handler is registered.
  */
 void
 spa_handle_ignored_writes(spa_t *spa)
 {
 	inject_handler_t *handler;
 
 	if (zio_injection_enabled == 0)
 		return;
 
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler)) {
 
 		if (spa != handler->zi_spa ||
 		    handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES)
 			continue;
 
 		if (handler->zi_record.zi_duration > 0) {
 			/* Positive duration: seconds, compared in lbolt ticks. */
 			VERIFY(handler->zi_record.zi_timer == 0 ||
 			    handler->zi_record.zi_timer +
 			    handler->zi_record.zi_duration * hz >
 			    ddi_get_lbolt64());
 		} else {
 			/* duration is negative so the subtraction here adds */
 			VERIFY(handler->zi_record.zi_timer == 0 ||
 			    handler->zi_record.zi_timer -
 			    handler->zi_record.zi_duration >=
 			    spa_syncing_txg(spa));
 		}
 	}
 
 	rw_exit(&inject_lock);
 }
 
 /*
  * Return the zi_timer value of the first delay-I/O handler registered for
  * the zio's vdev, or 0 when there is none or injection is disabled.
  */
 uint64_t
 zio_handle_io_delay(zio_t *zio)
 {
 	vdev_t *vd = zio->io_vd;
 	inject_handler_t *handler;
 	uint64_t seconds = 0;
 
 	if (zio_injection_enabled == 0)
 		return (0);
 
 	rw_enter(&inject_lock, RW_READER);
 
 	handler = list_head(&inject_handlers);
 	while (handler != NULL) {
 		if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO &&
 		    vd->vdev_guid == handler->zi_record.zi_guid) {
 			seconds = handler->zi_record.zi_timer;
 			break;
 		}
 
 		handler = list_next(&inject_handlers, handler);
 	}
 
 	rw_exit(&inject_lock);
 	return (seconds);
 }
 
 /*
  * Create a new handler for the given record.  We add it to the list, adding
  * a reference to the spa_t in the process.  We increment zio_injection_enabled,
  * which is the switch to trigger all fault injection.
  */
 int
 zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record)
 {
 	inject_handler_t *handler;
 	int error;
 	spa_t *spa;
 
 	/*
 	 * If this is pool-wide metadata, make sure we unload the corresponding
 	 * spa_t, so that the next attempt to load it will trigger the fault.
 	 * We call spa_reset() to unload the pool appropriately.
 	 */
 	if (flags & ZINJECT_UNLOAD_SPA)
 		if ((error = spa_reset(name)) != 0)
 			return (error);
 
 	/* With ZINJECT_NULL no handler is registered and *id is not set. */
 	if (!(flags & ZINJECT_NULL)) {
 		/*
 		 * spa_inject_addref() will add an injection reference, which
 		 * will prevent the pool from being removed from the namespace
 		 * while still allowing it to be unloaded.
 		 */
 		if ((spa = spa_inject_addref(name)) == NULL)
 			return (SET_ERROR(ENOENT));
 
 		handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP);
 
 		rw_enter(&inject_lock, RW_WRITER);
 
 		/* The record is copied, so the caller keeps ownership of it. */
 		*id = handler->zi_id = inject_next_id++;
 		handler->zi_spa = spa;
 		handler->zi_record = *record;
 		list_insert_tail(&inject_handlers, handler);
-		atomic_add_32(&zio_injection_enabled, 1);
+		atomic_inc_32(&zio_injection_enabled);
 
 		rw_exit(&inject_lock);
 	}
 
 	/*
 	 * Flush the ARC, so that any attempts to read this data will end up
 	 * going to the ZIO layer.  Note that this is a little overkill, but
 	 * we don't have the necessary ARC interfaces to do anything else, and
 	 * fault injection isn't a performance critical path.
 	 */
 	if (flags & ZINJECT_FLUSH_ARC)
 		arc_flush(NULL);
 
 	return (0);
 }
 
 /*
  * Returns the next record with an ID greater than that supplied to the
  * function.  Used to iterate over all handlers in the system.
  *
  * On success, *id and *record are updated, the pool name is copied into
  * 'name' (truncated to fit 'buflen' and always NUL-terminated when buflen
  * is non-zero), and 0 is returned.  Returns ENOENT when no handler has a
  * greater ID.
  */
 int
 zio_inject_list_next(int *id, char *name, size_t buflen,
     zinject_record_t *record)
 {
 	inject_handler_t *handler;
 	int ret;
 
 	mutex_enter(&spa_namespace_lock);
 	rw_enter(&inject_lock, RW_READER);
 
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler))
 		if (handler->zi_id > *id)
 			break;
 
 	if (handler) {
 		*record = handler->zi_record;
 		*id = handler->zi_id;
 		/*
 		 * strlcpy() rather than strncpy(): the latter leaves 'name'
 		 * unterminated when the pool name fills the whole buffer.
 		 */
 		(void) strlcpy(name, spa_name(handler->zi_spa), buflen);
 		ret = 0;
 	} else {
 		ret = SET_ERROR(ENOENT);
 	}
 
 	rw_exit(&inject_lock);
 	mutex_exit(&spa_namespace_lock);
 
 	return (ret);
 }
 
 /*
  * Clear the fault handler with the given identifier, or return ENOENT if none
  * exists.
  */
 int
 zio_clear_fault(int id)
 {
 	inject_handler_t *handler;
 
 	rw_enter(&inject_lock, RW_WRITER);
 
 	/* Locate the handler carrying the requested id. */
 	for (handler = list_head(&inject_handlers); handler != NULL;
 	    handler = list_next(&inject_handlers, handler))
 		if (handler->zi_id == id)
 			break;
 
 	if (handler == NULL) {
 		rw_exit(&inject_lock);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/* Unlink under the lock; the rest is done after dropping it. */
 	list_remove(&inject_handlers, handler);
 	rw_exit(&inject_lock);
 
 	spa_inject_delref(handler->zi_spa);
 	kmem_free(handler, sizeof (inject_handler_t));
-	atomic_add_32(&zio_injection_enabled, -1);
+	atomic_dec_32(&zio_injection_enabled);
 
 	return (0);
 }
 
 /*
  * Initialize the injection lock and the (initially empty) handler list.
  */
 void
 zio_inject_init(void)
 {
 	rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
 	list_create(&inject_handlers, sizeof (inject_handler_t),
 	    offsetof(inject_handler_t, zi_link));
 }
 
 /*
  * Destroy the handler list and the injection lock set up by
  * zio_inject_init().
  */
 void
 zio_inject_fini(void)
 {
 	list_destroy(&inject_handlers);
 	rw_destroy(&inject_lock);
 }
Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/os/fm.c
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/common/os/fm.c	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/common/os/fm.c	(revision 271001)
@@ -1,1402 +1,1399 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
  */
 
 /*
  * Fault Management Architecture (FMA) Resource and Protocol Support
  *
  * The routines contained herein provide services to support kernel subsystems
  * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089).
  *
  * Name-Value Pair Lists
  *
  * The embodiment of an FMA protocol element (event, fmri or authority) is a
  * name-value pair list (nvlist_t).  FMA-specific nvlist constructor and
  * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used
  * to create an nvpair list using custom allocators.  Callers may choose to
  * allocate either from the kernel memory allocator, or from a preallocated
  * buffer, useful in constrained contexts like high-level interrupt routines.
  *
  * Protocol Event and FMRI Construction
  *
  * Convenience routines are provided to construct nvlist events according to
  * the FMA Event Protocol and Naming Schema specification for ereports and
  * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes.
  *
  * ENA Manipulation
  *
  * Routines to generate ENA formats 0, 1 and 2 are available as well as
  * routines to increment formats 1 and 2.  Individual fields within the
  * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(),
  * fm_ena_format_get() and fm_ena_gen_get().
  */
 
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/sysevent.h>
 #include <sys/nvpair.h>
 #include <sys/cmn_err.h>
 #include <sys/cpuvar.h>
 #include <sys/sysmacros.h>
 #include <sys/systm.h>
 #include <sys/compress.h>
 #include <sys/cpuvar.h>
 #include <sys/kobj.h>
 #include <sys/kstat.h>
 #include <sys/processor.h>
 #include <sys/pcpu.h>
 #include <sys/sunddi.h>
 #include <sys/systeminfo.h>
 #include <sys/sysevent/eventdefs.h>
 #include <sys/fm/util.h>
 #include <sys/fm/protocol.h>
 
 /*
  * URL and SUNW-MSG-ID value to display for fm_panic(), defined below.  These
  * values must be kept in sync with the FMA source code in usr/src/cmd/fm.
  */
 static const char *fm_url = "http://www.sun.com/msg";
 static const char *fm_msgid = "SUNOS-8000-0G";
 /* Format string of the first fm_panic() call; NULL until then. */
 static char *volatile fm_panicstr = NULL;
 
 #ifdef sun
 /* Error queue created in fm_init() and drained through fm_drain(). */
 errorq_t *ereport_errorq;
 #endif
 /* Buffer of ereport_dumplen bytes, allocated in fm_init(). */
 void *ereport_dumpbuf;
 size_t ereport_dumplen;
 
 /* Channel length passed to sysevent_evc_control() in fm_init(). */
 static uint_t ereport_chanlen = ERPT_EVCH_MAX;
 static evchan_t *ereport_chan = NULL;
 /* Zero means "not set": fm_init() then computes a default. */
 static ulong_t ereport_qlen = 0;
 static size_t ereport_size = 0;
 /* Column width used by fm_nvprint() when wrapping console output. */
 static int ereport_cols = 80;
 
 /* Called by fm_panic() on x86 builds only. */
 extern void fastreboot_disable_highpil(void);
 
 /*
  * Common fault management kstats to record ereport generation
  * failures
  *
  * fm_init() publishes erpt_kstat_data as a virtual named kstat
  * (module "unix", instance 0, name "fm", class "misc").  The counters are
  * bumped atomically by the ereport/fmri/payload setters in this file.
  */
 
 struct erpt_kstat {
 	kstat_named_t	erpt_dropped;		/* num erpts dropped on post */
 	kstat_named_t	erpt_set_failed;	/* num erpt set failures */
 	kstat_named_t	fmri_set_failed;	/* num fmri set failures */
 	kstat_named_t	payload_set_failed;	/* num payload set failures */
 };
 
 /* Initializer order must match the member order of struct erpt_kstat. */
 static struct erpt_kstat erpt_kstat_data = {
 	{ "erpt-dropped", KSTAT_DATA_UINT64 },
 	{ "erpt-set-failed", KSTAT_DATA_UINT64 },
 	{ "fmri-set-failed", KSTAT_DATA_UINT64 },
 	{ "payload-set-failed", KSTAT_DATA_UINT64 }
 };
 
 #ifdef sun
 /*
  * Drain callback for the ereport error queue (registered in fm_init()).
  * While the system is running normally the queued ereport is posted; once
  * a panic is in progress it is printed on the console instead.
  */
 /*ARGSUSED*/
 static void
 fm_drain(void *private, void *data, errorq_elem_t *eep)
 {
 	nvlist_t *ereport = errorq_elem_nvl(ereport_errorq, eep);
 
 	if (panicstr) {
 		fm_nvprint(ereport);
 		return;
 	}
 
 	(void) fm_ereport_post(ereport, EVCH_TRYHARD);
 }
 #endif
 
 /*
  * One-time FMA setup: bind and size the ereport channel and create the
  * ereport error queue (Solaris builds only), apply defaults to the queue
  * length and ereport size tunables, allocate the dump buffer and publish
  * the ereport failure kstats.
  */
 void
 fm_init(void)
 {
 	kstat_t *erpt_ksp;
 
 #ifdef sun
 	(void) sysevent_evc_bind(FM_ERROR_CHAN,
 	    &ereport_chan, EVCH_CREAT | EVCH_HOLD_PEND);
 
 	(void) sysevent_evc_control(ereport_chan,
 	    EVCH_SET_CHAN_LEN, &ereport_chanlen);
 #endif
 
 	/* A tunable still at zero has not been set: pick its default. */
 	if (ereport_qlen == 0)
 		ereport_qlen = ERPT_MAX_ERRS * MAX(max_ncpus, 4);
 	if (ereport_size == 0)
 		ereport_size = ERPT_DATA_SZ;
 
 #ifdef sun
 	ereport_errorq = errorq_nvcreate("fm_ereport_queue",
 	    (errorq_func_t)fm_drain, NULL, ereport_qlen, ereport_size,
 	    FM_ERR_PIL, ERRORQ_VITAL);
 	if (ereport_errorq == NULL)
 		panic("failed to create required ereport error queue");
 #endif
 
 	ereport_dumpbuf = kmem_alloc(ereport_size, KM_SLEEP);
 	ereport_dumplen = ereport_size;
 
 	/* Initialize ereport allocation and generation kstats */
 	erpt_ksp = kstat_create("unix", 0, "fm", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (struct erpt_kstat) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (erpt_ksp == NULL) {
 		cmn_err(CE_NOTE, "failed to create fm/misc kstat\n");
 		return;
 	}
 
 	erpt_ksp->ks_data = &erpt_kstat_data;
 	kstat_install(erpt_ksp);
 }
 
 #ifdef sun
 /*
  * Console formatting helper for fm_nvprintr().  The formatted chunk is
  * measured first; if it would reach or pass column 'cols' when appended at
  * column 'c', a new console line is started (indented by one space for
  * nested output that does not itself begin with a space).  Returns the
  * column the cursor ends up in.
  */
 /*PRINTFLIKE4*/
 static int
 fm_printf(int depth, int c, int cols, const char *format, ...)
 {
 	va_list args;
 	char probe;
 	int len;
 
 	/*
 	 * Measuring pass: format into a one-byte buffer and keep only the
 	 * length reported by vsnprintf().
 	 */
 	va_start(args, format);
 	len = vsnprintf(&probe, sizeof (probe), format, args);
 	va_end(args);
 
 	if (c + len >= cols) {
 		console_printf("\n\r");
 		c = 0;
 		if (depth > 0 && format[0] != ' ') {
 			console_printf(" ");
 			c = 1;
 		}
 	}
 
 	/* Output pass: the argument list has to be restarted. */
 	va_start(args, format);
 	console_vprintf(format, args);
 	va_end(args);
 
 	return ((c + len) % cols);
 }
 
 /*
  * Recursively print a nvlist in the specified column width and return the
  * column we end up in.  This function is called recursively by fm_nvprint(),
  * below.  We generically format the entire nvpair using hexadecimal
  * integers and strings, and elide any integer arrays.  Arrays are basically
  * used for cache dumps right now, so we suppress them so as not to overwhelm
  * the amount of console output we produce at panic time.  This can be further
  * enhanced as FMA technology grows based upon the needs of consumers.  All
  * FMA telemetry is logged using the dump device transport, so the console
  * output serves only as a fallback in case this procedure is unsuccessful.
  *
  * Arguments:
  *	nvl	list to print
  *	d	nesting depth, passed through to fm_printf()
  *	c	column the cursor is currently in
  *	cols	console width used for wrapping
  *
  * The switch has no default case, so a pair whose type is not listed is
  * printed as its name only.
  */
 static int
 fm_nvprintr(nvlist_t *nvl, int d, int c, int cols)
 {
 	nvpair_t *nvp;
 
 	for (nvp = nvlist_next_nvpair(nvl, NULL);
 	    nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) {
 
 		data_type_t type = nvpair_type(nvp);
 		const char *name = nvpair_name(nvp);
 
 		/* Scratch storage for the value of the current pair. */
 		boolean_t b;
 		uint8_t i8;
 		uint16_t i16;
 		uint32_t i32;
 		uint64_t i64;
 		char *str;
 		nvlist_t *cnv;
 
 		if (strcmp(name, FM_CLASS) == 0)
 			continue; /* already printed by caller */
 
 		c = fm_printf(d, c, cols, " %s=", name);
 
 		/*
 		 * Signed values are fetched into the unsigned temporaries
 		 * (via the void * casts) and printed in hexadecimal.
 		 */
 		switch (type) {
 		case DATA_TYPE_BOOLEAN:
 			c = fm_printf(d + 1, c, cols, " 1");
 			break;
 
 		case DATA_TYPE_BOOLEAN_VALUE:
 			(void) nvpair_value_boolean_value(nvp, &b);
 			c = fm_printf(d + 1, c, cols, b ? "1" : "0");
 			break;
 
 		case DATA_TYPE_BYTE:
 			(void) nvpair_value_byte(nvp, &i8);
 			c = fm_printf(d + 1, c, cols, "%x", i8);
 			break;
 
 		case DATA_TYPE_INT8:
 			(void) nvpair_value_int8(nvp, (void *)&i8);
 			c = fm_printf(d + 1, c, cols, "%x", i8);
 			break;
 
 		case DATA_TYPE_UINT8:
 			(void) nvpair_value_uint8(nvp, &i8);
 			c = fm_printf(d + 1, c, cols, "%x", i8);
 			break;
 
 		case DATA_TYPE_INT16:
 			(void) nvpair_value_int16(nvp, (void *)&i16);
 			c = fm_printf(d + 1, c, cols, "%x", i16);
 			break;
 
 		case DATA_TYPE_UINT16:
 			(void) nvpair_value_uint16(nvp, &i16);
 			c = fm_printf(d + 1, c, cols, "%x", i16);
 			break;
 
 		case DATA_TYPE_INT32:
 			(void) nvpair_value_int32(nvp, (void *)&i32);
 			c = fm_printf(d + 1, c, cols, "%x", i32);
 			break;
 
 		case DATA_TYPE_UINT32:
 			(void) nvpair_value_uint32(nvp, &i32);
 			c = fm_printf(d + 1, c, cols, "%x", i32);
 			break;
 
 		case DATA_TYPE_INT64:
 			(void) nvpair_value_int64(nvp, (void *)&i64);
 			c = fm_printf(d + 1, c, cols, "%llx",
 			    (u_longlong_t)i64);
 			break;
 
 		case DATA_TYPE_UINT64:
 			(void) nvpair_value_uint64(nvp, &i64);
 			c = fm_printf(d + 1, c, cols, "%llx",
 			    (u_longlong_t)i64);
 			break;
 
 		case DATA_TYPE_HRTIME:
 			(void) nvpair_value_hrtime(nvp, (void *)&i64);
 			c = fm_printf(d + 1, c, cols, "%llx",
 			    (u_longlong_t)i64);
 			break;
 
 		case DATA_TYPE_STRING:
 			(void) nvpair_value_string(nvp, &str);
 			c = fm_printf(d + 1, c, cols, "\"%s\"",
 			    str ? str : "<NULL>");
 			break;
 
 		case DATA_TYPE_NVLIST:
 			/* Nested list: recurse one level deeper in brackets. */
 			c = fm_printf(d + 1, c, cols, "[");
 			(void) nvpair_value_nvlist(nvp, &cnv);
 			c = fm_nvprintr(cnv, d + 1, c, cols);
 			c = fm_printf(d + 1, c, cols, " ]");
 			break;
 
 		case DATA_TYPE_NVLIST_ARRAY: {
 			nvlist_t **val;
 			uint_t i, nelem;
 
 			/* All elements share a single pair of brackets. */
 			c = fm_printf(d + 1, c, cols, "[");
 			(void) nvpair_value_nvlist_array(nvp, &val, &nelem);
 			for (i = 0; i < nelem; i++) {
 				c = fm_nvprintr(val[i], d + 1, c, cols);
 			}
 			c = fm_printf(d + 1, c, cols, " ]");
 			}
 			break;
 
 		/* Scalar and string arrays are elided, see above. */
 		case DATA_TYPE_BOOLEAN_ARRAY:
 		case DATA_TYPE_BYTE_ARRAY:
 		case DATA_TYPE_INT8_ARRAY:
 		case DATA_TYPE_UINT8_ARRAY:
 		case DATA_TYPE_INT16_ARRAY:
 		case DATA_TYPE_UINT16_ARRAY:
 		case DATA_TYPE_INT32_ARRAY:
 		case DATA_TYPE_UINT32_ARRAY:
 		case DATA_TYPE_INT64_ARRAY:
 		case DATA_TYPE_UINT64_ARRAY:
 		case DATA_TYPE_STRING_ARRAY:
 			c = fm_printf(d + 1, c, cols, "[...]");
 			break;
 		case DATA_TYPE_UNKNOWN:
 			c = fm_printf(d + 1, c, cols, "<unknown>");
 			break;
 		}
 	}
 
 	return (c);
 }
 
 /*
  * Print an nvlist on the console.  The FM_CLASS string, when present, is
  * printed first; the remaining members follow via fm_nvprintr(), wrapped at
  * ereport_cols columns.  An extra newline is emitted when the output did
  * not end in column 0.
  */
 void
 fm_nvprint(nvlist_t *nvl)
 {
 	char *class;
 	int col = 0;
 
 	console_printf("\r");
 
 	if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0)
 		col = fm_printf(0, col, ereport_cols, "%s", class);
 
 	col = fm_nvprintr(nvl, 0, col, ereport_cols);
 	if (col != 0)
 		console_printf("\n");
 
 	console_printf("\n");
 }
 
 /*
  * panic() wrapper for subsystems that have detected a fatal error, have
  * already enqueued the matching ereports and now want to bring the system
  * down.  The format string is recorded in fm_panicstr so that fm_banner()
  * can print an FMA-style message for admins.  Normally such messages come
  * from fmd(1M)'s syslog-msgs agent; this is the one exception to that rule
  * and the only error that gets messaged from here.
  */
 /*PRINTFLIKE1*/
 void
 fm_panic(const char *format, ...)
 {
 	va_list args;
 
 	/* Compare-and-swap against NULL: only the first caller is recorded. */
 	(void) atomic_cas_ptr((void *)&fm_panicstr, NULL, (void *)format);
 #if defined(__i386) || defined(__amd64)
 	fastreboot_disable_highpil();
 #endif /* __i386 || __amd64 */
 
 	va_start(args, format);
 	vpanic(format, args);
 	va_end(args);
 }
 
 /*
  * Simply tell the caller if fm_panicstr is set, ie. an fma event has
  * caused the panic. If so, something other than the default panic
  * diagnosis method will diagnose the cause of the panic.
  */
 int
 is_fm_panic()
 {
 	if (fm_panicstr)
 		return (1);
 	else
 		return (0);
 }
 
 /*
  * Print any appropriate FMA banner message before the panic message.  This
  * function is called by panicsys() and prints the message for fm_panic().
  * We print the message here so that it comes after the system is quiesced.
  * A one-line summary is recorded in the log only (cmn_err(9F) with "!" prefix).
  * The rest of the message is for the console only and not needed in the log,
  * so it is printed using console_printf().  We break it up into multiple
  * chunks so as to avoid overflowing any small legacy prom_printf() buffers.
  */
 void
 fm_banner(void)
 {
 	timespec_t tod;
 	hrtime_t now;
 
 	if (!fm_panicstr)
 		return; /* panic was not initiated by fm_panic(); do nothing */
 
 	/*
 	 * Use the timestamps saved at panic time when a panic is in
 	 * progress; otherwise read the clocks now.
 	 */
 	if (panicstr) {
 		tod = panic_hrestime;
 		now = panic_hrtime;
 	} else {
 		gethrestime(&tod);
 		now = gethrtime_waitfree();
 	}
 
 	/* Log-only summary line (leading "!" in the format). */
 	cmn_err(CE_NOTE, "!SUNW-MSG-ID: %s, "
 	    "TYPE: Error, VER: 1, SEVERITY: Major\n", fm_msgid);
 
 	/* Console-only banner, emitted in several small chunks. */
 	console_printf(
 "\n\rSUNW-MSG-ID: %s, TYPE: Error, VER: 1, SEVERITY: Major\n"
 "EVENT-TIME: 0x%lx.0x%lx (0x%llx)\n",
 	    fm_msgid, tod.tv_sec, tod.tv_nsec, (u_longlong_t)now);
 
 	console_printf(
 "PLATFORM: %s, CSN: -, HOSTNAME: %s\n"
 "SOURCE: %s, REV: %s %s\n",
 	    platform, utsname.nodename, utsname.sysname,
 	    utsname.release, utsname.version);
 
 	console_printf(
 "DESC: Errors have been detected that require a reboot to ensure system\n"
 "integrity.  See %s/%s for more information.\n",
 	    fm_url, fm_msgid);
 
 	console_printf(
 "AUTO-RESPONSE: Solaris will attempt to save and diagnose the error telemetry\n"
 "IMPACT: The system will sync files, save a crash dump if needed, and reboot\n"
 "REC-ACTION: Save the error summary below in case telemetry cannot be saved\n");
 
 	console_printf("\n");
 }
 
 /*
  * Utility function to write all of the pending ereports to the dump device.
  * This function is called at either normal reboot or panic time, and simply
  * iterates over the in-transit messages in the ereport sysevent channel.
  *
  * Each event is written as an erpt_dump_t header followed by the event's
  * attribute buffer.
  */
 void
 fm_ereport_dump(void)
 {
 	evchanq_t *chq;
 	sysevent_t *sep;
 	erpt_dump_t ed;
 
 	timespec_t tod;
 	hrtime_t now;
 	char *buf;
 	size_t len;
 
 	/*
 	 * At panic time use the saved panic timestamps.  Otherwise drain the
 	 * error queue first (if it exists) and read the clocks now.
 	 */
 	if (panicstr) {
 		tod = panic_hrestime;
 		now = panic_hrtime;
 	} else {
 		if (ereport_errorq != NULL)
 			errorq_drain(ereport_errorq);
 		gethrestime(&tod);
 		now = gethrtime_waitfree();
 	}
 
 	/*
 	 * In the panic case, sysevent_evc_walk_init() will return NULL.
 	 */
 	if ((chq = sysevent_evc_walk_init(ereport_chan, NULL)) == NULL &&
 	    !panicstr)
 		return; /* event channel isn't initialized yet */
 
 	while ((sep = sysevent_evc_walk_step(chq)) != NULL) {
 		/* Stop at the first event without an attribute buffer. */
 		if ((buf = sysevent_evc_event_attr(sep, &len)) == NULL)
 			break;
 
 		/* Header describing the payload that follows it. */
 		ed.ed_magic = ERPT_MAGIC;
 		ed.ed_chksum = checksum32(buf, len);
 		ed.ed_size = (uint32_t)len;
 		ed.ed_pad = 0;
 		ed.ed_hrt_nsec = SE_TIME(sep);
 		ed.ed_hrt_base = now;
 		ed.ed_tod_base.sec = tod.tv_sec;
 		ed.ed_tod_base.nsec = tod.tv_nsec;
 
 		dumpvp_write(&ed, sizeof (ed));
 		dumpvp_write(buf, len);
 	}
 
 	sysevent_evc_walk_fini(chq);
 }
 #endif
 
 /*
  * Post an error report (ereport) to the sysevent error channel.  The error
  * channel must be established with a prior call to sysevent_evc_create()
  * before publication may occur.
  *
  * An ereport whose native-encoded size is zero or exceeds ERPT_DATA_SZ is
  * dropped and counted in the erpt_dropped kstat.  On Solaris builds, a
  * failure to bind or publish is counted the same way.  Other builds hand
  * the ereport to ddi_log_sysevent() and ignore its result.
  */
 void
 fm_ereport_post(nvlist_t *ereport, int evc_flag)
 {
 	size_t nvl_size = 0;
 	evchan_t *error_chan;
 	sysevent_id_t eid;
 
 	/* nvl_size stays 0 if the size query fails, which is rejected below. */
 	(void) nvlist_size(ereport, &nvl_size, NV_ENCODE_NATIVE);
 	if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) {
-		atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
 		return;
 	}
 
 #ifdef sun
 	if (sysevent_evc_bind(FM_ERROR_CHAN, &error_chan,
 	    EVCH_CREAT|EVCH_HOLD_PEND) != 0) {
-		atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
 		return;
 	}
 
 	/* The channel is unbound again whether or not publishing worked. */
 	if (sysevent_evc_publish(error_chan, EC_FM, ESC_FM_ERROR,
 	    SUNW_VENDOR, FM_PUB, ereport, evc_flag) != 0) {
-		atomic_add_64(&erpt_kstat_data.erpt_dropped.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64);
 		(void) sysevent_evc_unbind(error_chan);
 		return;
 	}
 	(void) sysevent_evc_unbind(error_chan);
 #else
 	(void) ddi_log_sysevent(NULL, SUNW_VENDOR, EC_DEV_STATUS,
 	    ESC_DEV_DLE, ereport, &eid, DDI_SLEEP);
 #endif
 }
 
 /*
  * Wrappers for FM nvlist allocators
  */
 
 /*
  * Allocation hook installed in fm_mem_alloc_ops: returns 'size' bytes of
  * zero-filled kernel memory requested with KM_SLEEP.  'nva' is unused.
  */
 /* ARGSUSED */
 static void *
 i_fm_alloc(nv_alloc_t *nva, size_t size)
 {
 	void *buf = kmem_zalloc(size, KM_SLEEP);
 
 	return (buf);
 }
 
 /*
  * Free hook installed in fm_mem_alloc_ops: releases a buffer of 'size'
  * bytes back to the kernel memory allocator.  'nva' is unused.
  */
 /* ARGSUSED */
 static void
 i_fm_free(nv_alloc_t *nva, void *buf, size_t size)
 {
 	kmem_free(buf, size);
 }
 
 /*
  * Allocator operations that fm_nvlist_create() installs when the caller
  * does not supply an nv_alloc_t.  Only the alloc and free hooks are
  * provided.
  * NOTE(review): the NULL slots are presumably the optional init, fini and
  * reset hooks of nv_alloc_ops_t -- confirm against its declaration.
  */
 const nv_alloc_ops_t fm_mem_alloc_ops = {
 	NULL,
 	NULL,
 	i_fm_alloc,
 	i_fm_free,
 	NULL
 };
 
 /*
  * Allocate an nv_alloc_t and initialize it with the fixed-buffer
  * operations for 'buf', which is 'bufsz' bytes long.  Returns the new
  * handle, or NULL when bufsz is zero or nv_alloc_init() fails.
  */
 nv_alloc_t *
 fm_nva_xcreate(char *buf, size_t bufsz)
 {
 	nv_alloc_t *hdl;
 
 	hdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
 
 	if (bufsz != 0 && nv_alloc_init(hdl, nv_fixed_ops, buf, bufsz) == 0)
 		return (hdl);
 
 	/* Unusable buffer or failed init: give the handle back. */
 	kmem_free(hdl, sizeof (nv_alloc_t));
 	return (NULL);
 }
 
 /*
  * Destroy a previously allocated nv_alloc structure.  The fixed buffer
  * associated with nva must be freed by the caller.
  *
  * The handle is finalized with nv_alloc_fini() and then freed itself.
  */
 void
 fm_nva_xdestroy(nv_alloc_t *nva)
 {
 	nv_alloc_fini(nva);
 	kmem_free(nva, sizeof (nv_alloc_t));
 }
 
 /*
  * Create a new nv list and return it, or return NULL if it could not be
  * created.  The list is created and managed by the operations installed in
  * nva.  If nva is NULL, a private nv_alloc_t using the default FMA
  * operations (fm_mem_alloc_ops) is allocated and used instead.
  *
  * When called from the kernel with nva == NULL, this function must be
  * called from passive kernel context with no locks held that can prevent a
  * sleeping memory allocation from occurring.  Otherwise it may be called
  * from other kernel contexts as long as a valid nva created via
  * fm_nva_xcreate() is supplied.
  */
 nvlist_t *
 fm_nvlist_create(nv_alloc_t *nva)
 {
 	nv_alloc_t *hdl = nva;
 	nvlist_t *list;
 	int own_hdl = 0;
 
 	if (hdl == NULL) {
 		hdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP);
 
 		if (nv_alloc_init(hdl, &fm_mem_alloc_ops, NULL, 0) != 0) {
 			kmem_free(hdl, sizeof (nv_alloc_t));
 			return (NULL);
 		}
 		own_hdl = 1;
 	}
 
 	if (nvlist_xalloc(&list, NV_UNIQUE_NAME, hdl) == 0)
 		return (list);
 
 	/* Only a handle allocated above is ours to tear down. */
 	if (own_hdl) {
 		nv_alloc_fini(hdl);
 		kmem_free(hdl, sizeof (nv_alloc_t));
 	}
 	return (NULL);
 }
 
 /*
  * Free an nvlist.  'flag' selects what happens to the nv_alloc_t behind it:
  * FM_NVA_FREE destroys it as well, while FM_NVA_RETAIN keeps it so that it
  * can be re-used for future nvlist creation operations.
  */
 void
 fm_nvlist_destroy(nvlist_t *nvl, int flag)
 {
 	/* Must be looked up before the list is freed. */
 	nv_alloc_t *hdl = nvlist_lookup_nv_alloc(nvl);
 
 	nvlist_free(nvl);
 
 	if (hdl != NULL && flag == FM_NVA_FREE)
 		fm_nva_xdestroy(hdl);
 }
 
 /*
  * Add a sequence of (name, type, value) members from a va_list to
  * 'payload'.  'name' is the first member name; each following name is read
  * from 'ap' after the previous value.  Processing stops at a NULL name or
  * at the first failure.
  *
  * For array types the element count (an int) precedes the array pointer.
  * 8- and 16-bit scalars are read as int/uint_t, the types such values are
  * promoted to when passed through "...".
  *
  * Returns 0 on success, the failing nvlist_add_*() result, or EINVAL for a
  * type that is not handled here.
  */
 int
 i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap)
 {
 	int nelem, ret = 0;
 	data_type_t type;
 
 	while (ret == 0 && name != NULL) {
 		type = va_arg(ap, data_type_t);
 		switch (type) {
 		case DATA_TYPE_BYTE:
 			ret = nvlist_add_byte(payload, name,
 			    va_arg(ap, uint_t));
 			break;
 		case DATA_TYPE_BYTE_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_byte_array(payload, name,
 			    va_arg(ap, uchar_t *), nelem);
 			break;
 		case DATA_TYPE_BOOLEAN_VALUE:
 			ret = nvlist_add_boolean_value(payload, name,
 			    va_arg(ap, boolean_t));
 			break;
 		case DATA_TYPE_BOOLEAN_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_boolean_array(payload, name,
 			    va_arg(ap, boolean_t *), nelem);
 			break;
 		case DATA_TYPE_INT8:
 			ret = nvlist_add_int8(payload, name,
 			    va_arg(ap, int));
 			break;
 		case DATA_TYPE_INT8_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_int8_array(payload, name,
 			    va_arg(ap, int8_t *), nelem);
 			break;
 		case DATA_TYPE_UINT8:
 			ret = nvlist_add_uint8(payload, name,
 			    va_arg(ap, uint_t));
 			break;
 		case DATA_TYPE_UINT8_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_uint8_array(payload, name,
 			    va_arg(ap, uint8_t *), nelem);
 			break;
 		case DATA_TYPE_INT16:
 			ret = nvlist_add_int16(payload, name,
 			    va_arg(ap, int));
 			break;
 		case DATA_TYPE_INT16_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_int16_array(payload, name,
 			    va_arg(ap, int16_t *), nelem);
 			break;
 		case DATA_TYPE_UINT16:
 			ret = nvlist_add_uint16(payload, name,
 			    va_arg(ap, uint_t));
 			break;
 		case DATA_TYPE_UINT16_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_uint16_array(payload, name,
 			    va_arg(ap, uint16_t *), nelem);
 			break;
 		case DATA_TYPE_INT32:
 			ret = nvlist_add_int32(payload, name,
 			    va_arg(ap, int32_t));
 			break;
 		case DATA_TYPE_INT32_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_int32_array(payload, name,
 			    va_arg(ap, int32_t *), nelem);
 			break;
 		case DATA_TYPE_UINT32:
 			ret = nvlist_add_uint32(payload, name,
 			    va_arg(ap, uint32_t));
 			break;
 		case DATA_TYPE_UINT32_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_uint32_array(payload, name,
 			    va_arg(ap, uint32_t *), nelem);
 			break;
 		case DATA_TYPE_INT64:
 			ret = nvlist_add_int64(payload, name,
 			    va_arg(ap, int64_t));
 			break;
 		case DATA_TYPE_INT64_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_int64_array(payload, name,
 			    va_arg(ap, int64_t *), nelem);
 			break;
 		case DATA_TYPE_UINT64:
 			ret = nvlist_add_uint64(payload, name,
 			    va_arg(ap, uint64_t));
 			break;
 		case DATA_TYPE_UINT64_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_uint64_array(payload, name,
 			    va_arg(ap, uint64_t *), nelem);
 			break;
 		case DATA_TYPE_STRING:
 			ret = nvlist_add_string(payload, name,
 			    va_arg(ap, char *));
 			break;
 		case DATA_TYPE_STRING_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_string_array(payload, name,
 			    va_arg(ap, char **), nelem);
 			break;
 		case DATA_TYPE_NVLIST:
 			ret = nvlist_add_nvlist(payload, name,
 			    va_arg(ap, nvlist_t *));
 			break;
 		case DATA_TYPE_NVLIST_ARRAY:
 			nelem = va_arg(ap, int);
 			ret = nvlist_add_nvlist_array(payload, name,
 			    va_arg(ap, nvlist_t **), nelem);
 			break;
 		default:
 			/* No value is consumed for an unhandled type. */
 			ret = EINVAL;
 		}
 
 		/*
 		 * The next name is fetched even when the member above
 		 * failed; the loop condition then ends the iteration.
 		 */
 		name = va_arg(ap, char *);
 	}
 	return (ret);
 }
 
 /*
  * Variadic front end to i_fm_payload_set(): the arguments after 'payload'
  * are (name, type, value) groups ending with a NULL name.  A failure is
  * not returned to the caller; it is counted in the payload_set_failed
  * kstat.
  */
 void
 fm_payload_set(nvlist_t *payload, ...)
 {
 	int ret;
 	const char *name;
 	va_list ap;
 
 	va_start(ap, payload);
 	name = va_arg(ap, char *);
 	ret = i_fm_payload_set(payload, name, ap);
 	va_end(ap);
 
 	if (ret)
-		atomic_add_64(
-		    &erpt_kstat_data.payload_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64);
 }
 
 /*
  * Set-up and validate the members of an ereport event according to:
  *
  *	Member name		Type		Value
  *	====================================================
  *	class			string		ereport
  *	version			uint8_t		0
  *	ena			uint64_t	<ena>
  *	detector		nvlist_t	<detector>
  *	ereport-payload		nvlist_t	<var args>
  *
  * We don't actually add a 'version' member to the payload.  Really,
  * the version quoted to us by our caller is that of the category 1
  * "ereport" event class (and we require FM_EREPORT_VERS0) but
  * the payload version of the actual leaf class event under construction
  * may be something else.  Callers should supply a version in the varargs,
  * or (better) we could take two version arguments - one for the
  * ereport category 1 classification (expect FM_EREPORT_VERS0) and one
  * for the leaf class.
  *
  * Failures are not returned; each one is counted in the erpt_set_failed
  * kstat.  A bad version or a failure to add the class stops processing;
  * failures to add the ena or the detector are counted and processing
  * continues with the remaining members.
  */
 void
 fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class,
     uint64_t ena, const nvlist_t *detector, ...)
 {
 	char ereport_class[FM_MAX_CLASS];
 	const char *name;
 	va_list ap;
 	int ret;
 
 	if (version != FM_EREPORT_VERS0) {
-		atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
 		return;
 	}
 
 	/* The full class name is "<FM_EREPORT_CLASS>.<erpt_class>". */
 	(void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s",
 	    FM_EREPORT_CLASS, erpt_class);
 	if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) {
-		atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) {
-		atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
 	}
 
 	if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR,
 	    (nvlist_t *)detector) != 0) {
-		atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
 	}
 
 	/* Remaining arguments: NULL-terminated (name, type, value) groups. */
 	va_start(ap, detector);
 	name = va_arg(ap, const char *);
 	ret = i_fm_payload_set(ereport, name, ap);
 	va_end(ap);
 
 	if (ret)
-		atomic_add_64(&erpt_kstat_data.erpt_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64);
 }
 
 /*
  * Set-up and validate the members of an hc fmri according to:
  *
  *	Member name		Type		Value
  *	===================================================
  *	version			uint8_t		0
  *	auth			nvlist_t	<auth>
  *	hc-name			string		<name>
  *	hc-id			string		<id>
  *
  * Note that auth and hc-id are optional members.
  */
 
 /* Upper bound on the number of hc name/id pairs fm_fmri_hc_set() accepts. */
 #define	HC_MAXPAIRS	20
 #define	HC_MAXNAMELEN	50
 
 /*
  * Add the members shared by hc fmris: version, scheme and, when 'auth' is
  * not NULL, the authority.  Returns 1 on success.  On a version mismatch or
  * any failed add, the fmri_set_failed kstat is bumped and 0 is returned.
  */
 static int
 fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth)
 {
 	if (version != FM_HC_SCHEME_VERSION) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return (0);
 	}
 
 	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 ||
 	    nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return (0);
 	}
 
 	if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
 	    (nvlist_t *)auth) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return (0);
 	}
 
 	return (1);
 }
 
 /*
  * Build an hc fmri.  After the common members, the variable arguments
  * supply 'npairs' (const char *name, uint32_t id) pairs, of which at most
  * HC_MAXPAIRS are used.  Each pair becomes a small nvlist holding the name
  * and the id formatted as a decimal string; the pairs are then added to
  * 'fmri' as one nvlist array.  'snvl', when not NULL, is added as the
  * hc-specific member.  Failures are counted in the fmri_set_failed kstat.
  */
 void
 fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth,
     nvlist_t *snvl, int npairs, ...)
 {
 	/* Pair lists are created from the same allocator as 'fmri'. */
 	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
 	nvlist_t *pairs[HC_MAXPAIRS];
 	va_list ap;
 	int i;
 
 	if (!fm_fmri_hc_set_common(fmri, version, auth))
 		return;
 
 	npairs = MIN(npairs, HC_MAXPAIRS);
 
 	va_start(ap, npairs);
 	for (i = 0; i < npairs; i++) {
 		const char *name = va_arg(ap, const char *);
 		uint32_t id = va_arg(ap, uint32_t);
 		/* Up to 10 decimal digits of a uint32_t plus the NUL. */
 		char idstr[11];
 
 		(void) snprintf(idstr, sizeof (idstr), "%u", id);
 
 		/*
 		 * NOTE(review): fm_nvlist_create() can return NULL, and
 		 * pairs[i] is used below without a check -- confirm that
 		 * the nvlist routines tolerate a NULL list.
 		 */
 		pairs[i] = fm_nvlist_create(nva);
 		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
 		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
 	}
 	va_end(ap);
 
 	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0)
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 
 	/* Free the temporary pair lists but keep the shared allocator. */
 	for (i = 0; i < npairs; i++)
 		fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
 
 	if (snvl != NULL) {
 		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
 	}
 }
 
 /*
  * Set-up and validate the members of a dev fmri according to:
  *
  *	Member name		Type		Value
  *	====================================================
  *	version			uint8_t		0
  *	auth			nvlist_t	<auth>
  *	devpath			string		<devpath>
  *	[devid]			string		<devid>
  *	[target-port-l0id]	string		<target-port-lun0-id>
  *
  * Note that auth, devid and target-port-l0id are optional members: each is
  * skipped when the corresponding argument is NULL.
  *
  * All adds are attempted; if any of them fails, the fmri_set_failed kstat
  * is bumped once.
  */
 void
 fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth,
     const char *devpath, const char *devid, const char *tpl0)
 {
 	/* Accumulates the results of the adds; non-zero means failure. */
 	int err = 0;
 
 	if (version != DEV_SCHEME_VERSION0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version);
 	err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV);
 
 	if (auth != NULL) {
 		err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY,
 		    (nvlist_t *)auth);
 	}
 
 	err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath);
 
 	if (devid != NULL)
 		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid);
 
 	if (tpl0 != NULL)
 		err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0);
 
 	if (err)
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 
 }
 
 /*
  * Set-up and validate the members of a cpu fmri according to:
  *
  *	Member name		Type		Value
  *	====================================================
  *	version			uint8_t		0
  *	auth			nvlist_t	<auth>
  *	cpuid			uint32_t	<cpu_id>
  *	cpumask			uint8_t		<cpu_mask>
  *	serial			uint64_t	<serial_id>
  *
  * Note that auth, cpumask, serial are optional members.
  *
  * NOTE(review): the code below adds the serial as a string, not a
  * uint64_t, and counts a NULL serial_idp as a failure, which does not
  * match "optional" above -- confirm which is intended.
  *
  * Failures are counted in the fmri_set_failed kstat.  A bad version or a
  * failure to add the version or scheme stops processing; later failures
  * are counted and processing continues.
  */
 void
 fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth,
     uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp)
 {
 	uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64;
 
 	/* Any version from CPU_SCHEME_VERSION1 upwards is accepted. */
 	if (version < CPU_SCHEME_VERSION1) {
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 		return;
 	}
 
 	if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) {
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 		return;
 	}
 
 	if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME,
 	    FM_FMRI_SCHEME_CPU) != 0) {
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 		return;
 	}
 
 	if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY,
 	    (nvlist_t *)auth) != 0)
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 
 	if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0)
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 
 	if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK,
 	    *cpu_maskp) != 0)
-		atomic_add_64(failedp, 1);
+		atomic_inc_64(failedp);
 
 	/* Unlike auth and cpu_maskp, a NULL serial_idp is counted here. */
 	if (serial_idp == NULL || nvlist_add_string(fmri_cpu,
 	    FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0)
-			atomic_add_64(failedp, 1);
+			atomic_inc_64(failedp);
 }
 
 /*
  * Set-up and validate the members of a mem according to:
  *
  *	Member name		Type		Value
  *	====================================================
  *	version			uint8_t		0
  *	auth			nvlist_t	<auth>		[optional]
  *	unum			string		<unum>
  *	serial			string		<serial>	[optional*]
  *	offset			uint64_t	<offset>	[optional]
  *
  *	* serial is required if offset is present
  *
  * An offset of (uint64_t)-1 means "no offset".  The serial is stored as a
  * one-element string array.  Failures are counted in the fmri_set_failed
  * kstat.
  */
 void
 fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth,
     const char *unum, const char *serial, uint64_t offset)
 {
 	if (version != MEM_SCHEME_VERSION0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	/* An offset without a serial is rejected before anything is added. */
 	if (!serial && (offset != (uint64_t)-1)) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	/* From here on failures are counted but do not stop processing. */
 	if (auth != NULL) {
 		if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY,
 		    (nvlist_t *)auth) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
 	}
 
 	if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 	}
 
 	if (serial != NULL) {
 		if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID,
 		    (char **)&serial, 1) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
-		if (offset != (uint64_t)-1) {
-			if (nvlist_add_uint64(fmri, FM_FMRI_MEM_OFFSET,
-			    offset) != 0) {
-				atomic_add_64(&erpt_kstat_data.
-				    fmri_set_failed.value.ui64, 1);
-			}
+		if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri,
+		    FM_FMRI_MEM_OFFSET, offset) != 0) {
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
 	}
 }
 
 void
 fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid,
     uint64_t vdev_guid)
 {
 	if (version != ZFS_SCHEME_VERSION0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 	}
 
 	if (vdev_guid != 0) {
 		if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 		}
 	}
 }
 
 uint64_t
 fm_ena_increment(uint64_t ena)
 {
 	uint64_t new_ena;
 
 	switch (ENA_FORMAT(ena)) {
 	case FM_ENA_FMT1:
 		new_ena = ena + (1 << ENA_FMT1_GEN_SHFT);
 		break;
 	case FM_ENA_FMT2:
 		new_ena = ena + (1 << ENA_FMT2_GEN_SHFT);
 		break;
 	default:
 		new_ena = 0;
 	}
 
 	return (new_ena);
 }
 
 uint64_t
 fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format)
 {
 	uint64_t ena = 0;
 
 	switch (format) {
 	case FM_ENA_FMT1:
 		if (timestamp) {
 			ena = (uint64_t)((format & ENA_FORMAT_MASK) |
 			    ((cpuid << ENA_FMT1_CPUID_SHFT) &
 			    ENA_FMT1_CPUID_MASK) |
 			    ((timestamp << ENA_FMT1_TIME_SHFT) &
 			    ENA_FMT1_TIME_MASK));
 		} else {
 			ena = (uint64_t)((format & ENA_FORMAT_MASK) |
 			    ((cpuid << ENA_FMT1_CPUID_SHFT) &
 			    ENA_FMT1_CPUID_MASK) |
 			    ((gethrtime_waitfree() << ENA_FMT1_TIME_SHFT) &
 			    ENA_FMT1_TIME_MASK));
 		}
 		break;
 	case FM_ENA_FMT2:
 		ena = (uint64_t)((format & ENA_FORMAT_MASK) |
 		    ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK));
 		break;
 	default:
 		break;
 	}
 
 	return (ena);
 }
 
 uint64_t
 fm_ena_generate(uint64_t timestamp, uchar_t format)
 {
 	return (fm_ena_generate_cpu(timestamp, PCPU_GET(cpuid), format));
 }
 
 uint64_t
 fm_ena_generation_get(uint64_t ena)
 {
 	uint64_t gen;
 
 	switch (ENA_FORMAT(ena)) {
 	case FM_ENA_FMT1:
 		gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT;
 		break;
 	case FM_ENA_FMT2:
 		gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT;
 		break;
 	default:
 		gen = 0;
 		break;
 	}
 
 	return (gen);
 }
 
 uchar_t
 fm_ena_format_get(uint64_t ena)
 {
 
 	return (ENA_FORMAT(ena));
 }
 
 uint64_t
 fm_ena_id_get(uint64_t ena)
 {
 	uint64_t id;
 
 	switch (ENA_FORMAT(ena)) {
 	case FM_ENA_FMT1:
 		id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT;
 		break;
 	case FM_ENA_FMT2:
 		id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT;
 		break;
 	default:
 		id = 0;
 	}
 
 	return (id);
 }
 
 uint64_t
 fm_ena_time_get(uint64_t ena)
 {
 	uint64_t time;
 
 	switch (ENA_FORMAT(ena)) {
 	case FM_ENA_FMT1:
 		time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT;
 		break;
 	case FM_ENA_FMT2:
 		time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT;
 		break;
 	default:
 		time = 0;
 	}
 
 	return (time);
 }
 
 #ifdef sun
 /*
  * Convert a getpcstack() trace to symbolic name+offset, and add the resulting
  * string array to a Fault Management ereport as FM_EREPORT_PAYLOAD_NAME_STACK.
  */
 void
 fm_payload_stack_add(nvlist_t *payload, const pc_t *stack, int depth)
 {
 	int i;
 	char *sym;
 	ulong_t off;
 	char *stkpp[FM_STK_DEPTH];
 	char buf[FM_STK_DEPTH * FM_SYM_SZ];
 	char *stkp = buf;
 
 	for (i = 0; i < depth && i != FM_STK_DEPTH; i++, stkp += FM_SYM_SZ) {
 		if ((sym = kobj_getsymname(stack[i], &off)) != NULL)
 			(void) snprintf(stkp, FM_SYM_SZ, "%s+%lx", sym, off);
 		else
 			(void) snprintf(stkp, FM_SYM_SZ, "%lx", (long)stack[i]);
 		stkpp[i] = stkp;
 	}
 
 	fm_payload_set(payload, FM_EREPORT_PAYLOAD_NAME_STACK,
 	    DATA_TYPE_STRING_ARRAY, depth, stkpp, NULL);
 }
 #endif
 
 #ifdef sun
 void
 print_msg_hwerr(ctid_t ct_id, proc_t *p)
 {
 	uprintf("Killed process %d (%s) in contract id %d "
 	    "due to hardware error\n", p->p_pid, p->p_user.u_comm, ct_id);
 }
 #endif
 
 void
 fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth,
     nvlist_t *snvl, nvlist_t *bboard, int npairs, ...)
 {
 	nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri);
 	nvlist_t *pairs[HC_MAXPAIRS];
 	nvlist_t **hcl;
 	uint_t n;
 	int i, j;
 	va_list ap;
 	char *hcname, *hcid;
 
 	if (!fm_fmri_hc_set_common(fmri, version, auth))
 		return;
 
 	/*
 	 * copy the bboard nvpairs to the pairs array
 	 */
 	if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n)
 	    != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	for (i = 0; i < n; i++) {
 		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME,
 		    &hcname) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 			return;
 		}
 		if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 			return;
 		}
 
 		pairs[i] = fm_nvlist_create(nva);
 		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 ||
 		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) {
 			for (j = 0; j <= i; j++) {
 				if (pairs[j] != NULL)
 					fm_nvlist_destroy(pairs[j],
 					    FM_NVA_RETAIN);
 			}
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 			return;
 		}
 	}
 
 	/*
 	 * create the pairs from passed in pairs
 	 */
 	npairs = MIN(npairs, HC_MAXPAIRS);
 
 	va_start(ap, npairs);
 	for (i = n; i < npairs + n; i++) {
 		const char *name = va_arg(ap, const char *);
 		uint32_t id = va_arg(ap, uint32_t);
 		char idstr[11];
 		(void) snprintf(idstr, sizeof (idstr), "%u", id);
 		pairs[i] = fm_nvlist_create(nva);
 		if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 ||
 		    nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) {
 			for (j = 0; j <= i; j++) {
 				if (pairs[j] != NULL)
 					fm_nvlist_destroy(pairs[j],
 					    FM_NVA_RETAIN);
 			}
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 			return;
 		}
 	}
 	va_end(ap);
 
 	/*
 	 * Create the fmri hc list
 	 */
 	if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs,
 	    npairs + n) != 0) {
-		atomic_add_64(&erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+		atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64);
 		return;
 	}
 
 	for (i = 0; i < npairs + n; i++) {
 			fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN);
 	}
 
 	if (snvl != NULL) {
 		if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) {
-			atomic_add_64(
-			    &erpt_kstat_data.fmri_set_failed.value.ui64, 1);
+			atomic_inc_64(
+			    &erpt_kstat_data.fmri_set_failed.value.ui64);
 			return;
 		}
 	}
 }
Index: stable/10/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c
===================================================================
--- stable/10/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c	(revision 271000)
+++ stable/10/sys/cddl/contrib/opensolaris/uts/sparc/dtrace/fasttrap_isa.c	(revision 271001)
@@ -1,1597 +1,1595 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <sys/fasttrap_isa.h>
 #include <sys/fasttrap_impl.h>
 #include <sys/dtrace.h>
 #include <sys/dtrace_impl.h>
 #include <sys/cmn_err.h>
 #include <sys/frame.h>
 #include <sys/stack.h>
 #include <sys/sysmacros.h>
 #include <sys/trap.h>
 
 #include <v9/sys/machpcb.h>
 #include <v9/sys/privregs.h>
 
 /*
  * Lossless User-Land Tracing on SPARC
  * -----------------------------------
  *
  * The Basic Idea
  *
  * The most important design constraint is, of course, correct execution of
  * the user thread above all else. The next most important goal is rapid
  * execution. We combine execution of instructions in user-land with
  * emulation of certain instructions in the kernel to aim for complete
  * correctness and maximal performance.
  *
  * We take advantage of the split PC/NPC architecture to speed up logical
  * single-stepping; when we copy an instruction out to the scratch space in
  * the ulwp_t structure (held in the %g7 register on SPARC), we can
  * effectively single step by setting the PC to our scratch space and leaving
  * the NPC alone. This executes the replaced instruction and then continues
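- * on without having to reenter the kernel as with single- stepping. The
- * obvious caveat is for instructions whose execution is PC dependant --
+ * on without having to reenter the kernel as with single-stepping. The
+ * obvious caveat is for instructions whose execution is PC dependent --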
  * branches, call and link instructions (call and jmpl), and the rdpc
  * instruction. These instructions cannot be executed in the manner described
  * so they must be emulated in the kernel.
  *
- * Emulation for this small set of instructions if fairly simple; the most
+ * Emulation for this small set of instructions is fairly simple; the most
  * difficult part being emulating branch conditions.
  *
  *
  * A Cache Heavy Portfolio
  *
  * It's important to note at this time that copying an instruction out to the
  * ulwp_t scratch space in user-land is rather complicated. SPARC has
  * separate data and instruction caches so any writes to the D$ (using a
  * store instruction for example) aren't necessarily reflected in the I$.
  * The flush instruction can be used to synchronize the two and must be used
  * for any self-modifying code, but the flush instruction only applies to the
  * primary address space (the absence of a flusha analogue to the flush
  * instruction that accepts an ASI argument is an obvious omission from SPARC
  * v9 where the notion of the alternate address space was introduced on
  * SPARC). To correctly copy out the instruction we must use a block store
  * that doesn't allocate in the D$ and ensures synchronization with the I$;
  * see dtrace_blksuword32() for the implementation  (this function uses
  * ASI_BLK_COMMIT_S to write a block through the secondary ASI in the manner
  * described). Refer to the UltraSPARC I/II manual for details on the
  * ASI_BLK_COMMIT_S ASI.
  *
  *
  * Return Subtleties
  *
  * When we're firing a return probe we need to expose the value returned by
  * the function being traced. Since the function can set the return value
  * in its last instruction, we need to fire the return probe only _after_
  * the effects of the instruction are apparent. For instructions that we
  * emulate, we can call dtrace_probe() after we've performed the emulation;
  * for instructions that we execute after we return to user-land, we set
  * %pc to the instruction we copied out (as described above) and set %npc
  * to a trap instruction stashed in the ulwp_t structure. After the traced
  * instruction is executed, the trap instruction returns control to the
  * kernel where we can fire the return probe.
  *
  * This need for a second trap in cases where we execute the traced
  * instruction makes it all the more important to emulate the most common
  * instructions to avoid the second trip in and out of the kernel.
  *
  *
  * Making it Fast
  *
  * Since copying out an instruction is neither simple nor inexpensive for the
  * CPU, we should attempt to avoid doing it in as many cases as possible.
  * Since function entry and return are usually the most interesting probe
  * sites, we attempt to tune the performance of the fasttrap provider around
  * instructions typically in those places.
  *
  * Looking at a bunch of functions in libraries and executables reveals that
  * most functions begin with either a save or a sethi (to setup a larger
  * argument to the save) and end with a restore or an or (in the case of leaf
  * functions). To try to improve performance, we emulate all of these
  * instructions in the kernel.
  *
  * The save and restore instructions are a little tricky since they perform
- * register window maniplulation. Rather than trying to tinker with the
+ * register window manipulation. Rather than trying to tinker with the
  * register windows from the kernel, we emulate the implicit add that takes
  * place as part of those instructions and set the %pc to point to a simple
  * save or restore we've hidden in the ulwp_t structure. If we're in a return
  * probe so want to make it seem as though the tracepoint has been completely
  * executed we need to remember that we've pulled this trick with restore and
  * pull registers from the previous window (the one that we'll switch to once
- * the simple store instruction is executed) rather than the current one. This
+ * the simple restore instruction is executed) rather than the current one. This
  * is why in the case of emulating a restore we set the DTrace CPU flag
  * CPU_DTRACE_FAKERESTORE before calling dtrace_probe() for the return probes
  * (see fasttrap_return_common()).
  */
 
 #define	OP(x)		((x) >> 30)
 #define	OP2(x)		(((x) >> 22) & 0x07)
 #define	OP3(x)		(((x) >> 19) & 0x3f)
 #define	RCOND(x)	(((x) >> 25) & 0x07)
 #define	COND(x)		(((x) >> 25) & 0x0f)
 #define	A(x)		(((x) >> 29) & 0x01)
 #define	I(x)		(((x) >> 13) & 0x01)
 #define	RD(x)		(((x) >> 25) & 0x1f)
 #define	RS1(x)		(((x) >> 14) & 0x1f)
 #define	RS2(x)		(((x) >> 0) & 0x1f)
 #define	CC(x)		(((x) >> 20) & 0x03)
 #define	DISP16(x)	((((x) >> 6) & 0xc000) | ((x) & 0x3fff))
 #define	DISP22(x)	((x) & 0x3fffff)
 #define	DISP19(x)	((x) & 0x7ffff)
 #define	DISP30(x)	((x) & 0x3fffffff)
 #define	SW_TRAP(x)	((x) & 0x7f)
 
 #define	OP3_OR		0x02
 #define	OP3_RD		0x28
 #define	OP3_JMPL	0x38
 #define	OP3_RETURN	0x39
 #define	OP3_TCC		0x3a
 #define	OP3_SAVE	0x3c
 #define	OP3_RESTORE	0x3d
 
 #define	OP3_PREFETCH	0x2d
 #define	OP3_CASA	0x3c
 #define	OP3_PREFETCHA	0x3d
 #define	OP3_CASXA	0x3e
 
 #define	OP2_ILLTRAP	0x0
 #define	OP2_BPcc	0x1
 #define	OP2_Bicc	0x2
 #define	OP2_BPr		0x3
 #define	OP2_SETHI	0x4
 #define	OP2_FBPfcc	0x5
 #define	OP2_FBfcc	0x6
 
 #define	R_G0		0
 #define	R_O0		8
 #define	R_SP		14
 #define	R_I0		24
 #define	R_I1		25
 #define	R_I2		26
 #define	R_I3		27
 #define	R_I4		28
 
 /*
  * Check the comment in fasttrap.h when changing these offsets or adding
  * new instructions.
  */
 #define	FASTTRAP_OFF_SAVE	64
 #define	FASTTRAP_OFF_RESTORE	68
 #define	FASTTRAP_OFF_FTRET	72
 #define	FASTTRAP_OFF_RETURN	76
 
 #define	BREAKPOINT_INSTR	0x91d02001	/* ta 1 */
 
 /*
  * Tunable to let users turn off the fancy save instruction optimization.
  * If a program is non-ABI compliant, there's a possibility that the save
  * instruction optimization could cause an error.
  */
 int fasttrap_optimize_save = 1;
 
 static uint64_t
 fasttrap_anarg(struct regs *rp, int argno)
 {
 	uint64_t value;
 
 	if (argno < 6)
 		return ((&rp->r_o0)[argno]);
 
 	if (curproc->p_model == DATAMODEL_NATIVE) {
 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
 
 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 		value = dtrace_fulword(&fr->fr_argd[argno]);
 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR |
 		    CPU_DTRACE_BADALIGN);
 	} else {
 		struct frame32 *fr = (struct frame32 *)rp->r_sp;
 
 		DTRACE_CPUFLAG_SET(CPU_DTRACE_NOFAULT);
 		value = dtrace_fuword32(&fr->fr_argd[argno]);
 		DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_NOFAULT | CPU_DTRACE_BADADDR |
 		    CPU_DTRACE_BADALIGN);
 	}
 
 	return (value);
 }
 
 static ulong_t fasttrap_getreg(struct regs *, uint_t);
 static void fasttrap_putreg(struct regs *, uint_t, ulong_t);
 
 static void
 fasttrap_usdt_args(fasttrap_probe_t *probe, struct regs *rp,
     uint_t fake_restore, int argc, uintptr_t *argv)
 {
 	int i, x, cap = MIN(argc, probe->ftp_nargs);
 	int inc = (fake_restore ? 16 : 0);
 
 	/*
 	 * The only way we'll hit the fake_restore case is if a USDT probe is
 	 * invoked as a tail-call. While it wouldn't be incorrect, we can
 	 * avoid a call to fasttrap_getreg(), and safely use rp->r_sp
 	 * directly since a tail-call can't be made if the invoked function
 	 * would use the argument dump space (i.e. if there were more than
 	 * 6 arguments). We take this shortcut because unconditionally rooting
 	 * around for R_FP (R_SP + 16) would be unnecessarily painful.
 	 */
 
 	if (curproc->p_model == DATAMODEL_NATIVE) {
 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
 		uintptr_t v;
 
 		for (i = 0; i < cap; i++) {
 			x = probe->ftp_argmap[i];
 
 			if (x < 6)
 				argv[i] = fasttrap_getreg(rp, R_O0 + x + inc);
 			else if (fasttrap_fulword(&fr->fr_argd[x], &v) != 0)
 				argv[i] = 0;
 		}
 
 	} else {
 		struct frame32 *fr = (struct frame32 *)rp->r_sp;
 		uint32_t v;
 
 		for (i = 0; i < cap; i++) {
 			x = probe->ftp_argmap[i];
 
 			if (x < 6)
 				argv[i] = fasttrap_getreg(rp, R_O0 + x + inc);
 			else if (fasttrap_fuword32(&fr->fr_argd[x], &v) != 0)
 				argv[i] = 0;
 		}
 	}
 
 	for (; i < argc; i++) {
 		argv[i] = 0;
 	}
 }
 
 static void
 fasttrap_return_common(struct regs *rp, uintptr_t pc, pid_t pid,
     uint_t fake_restore)
 {
 	fasttrap_tracepoint_t *tp;
 	fasttrap_bucket_t *bucket;
 	fasttrap_id_t *id;
 	kmutex_t *pid_mtx;
 	dtrace_icookie_t cookie;
 
 	pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
 	mutex_enter(pid_mtx);
 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
 
 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
 		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
 		    tp->ftt_proc->ftpc_acount != 0)
 			break;
 	}
 
 	/*
 	 * Don't sweat it if we can't find the tracepoint again; unlike
 	 * when we're in fasttrap_pid_probe(), finding the tracepoint here
 	 * is not essential to the correct execution of the process.
 	 */
 	if (tp == NULL || tp->ftt_retids == NULL) {
 		mutex_exit(pid_mtx);
 		return;
 	}
 
 	for (id = tp->ftt_retids; id != NULL; id = id->fti_next) {
 		fasttrap_probe_t *probe = id->fti_probe;
 
 		if (id->fti_ptype == DTFTP_POST_OFFSETS) {
 			if (probe->ftp_argmap != NULL && fake_restore) {
 				uintptr_t t[5];
 
 				fasttrap_usdt_args(probe, rp, fake_restore,
 				    sizeof (t) / sizeof (t[0]), t);
 
 				cookie = dtrace_interrupt_disable();
 				DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
 				dtrace_probe(probe->ftp_id, t[0], t[1],
 				    t[2], t[3], t[4]);
 				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
 				dtrace_interrupt_enable(cookie);
 
 			} else if (probe->ftp_argmap != NULL) {
 				uintptr_t t[5];
 
 				fasttrap_usdt_args(probe, rp, fake_restore,
 				    sizeof (t) / sizeof (t[0]), t);
 
 				dtrace_probe(probe->ftp_id, t[0], t[1],
 				    t[2], t[3], t[4]);
 
 			} else if (fake_restore) {
 				uintptr_t arg0 = fasttrap_getreg(rp, R_I0);
 				uintptr_t arg1 = fasttrap_getreg(rp, R_I1);
 				uintptr_t arg2 = fasttrap_getreg(rp, R_I2);
 				uintptr_t arg3 = fasttrap_getreg(rp, R_I3);
 				uintptr_t arg4 = fasttrap_getreg(rp, R_I4);
 
 				cookie = dtrace_interrupt_disable();
 				DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
 				dtrace_probe(probe->ftp_id, arg0, arg1,
 				    arg2, arg3, arg4);
 				DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
 				dtrace_interrupt_enable(cookie);
 
 			} else {
 				dtrace_probe(probe->ftp_id, rp->r_o0, rp->r_o1,
 				    rp->r_o2, rp->r_o3, rp->r_o4);
 			}
 
 			continue;
 		}
 
 		/*
 		 * If this is only a possible return point, we must
 		 * be looking at a potential tail call in leaf context.
 		 * If the %npc is still within this function, then we
 		 * must have misidentified a jmpl as a tail-call when it
 		 * is, in fact, part of a jump table. It would be nice to
 		 * remove this tracepoint, but this is neither the time
 		 * nor the place.
 		 */
 		if ((tp->ftt_flags & FASTTRAP_F_RETMAYBE) &&
 		    rp->r_npc - probe->ftp_faddr < probe->ftp_fsize)
 			continue;
 
 		/*
 		 * It's possible for a function to branch to the delay slot
 		 * of an instruction that we've identified as a return site.
-		 * We can dectect this spurious return probe activation by
+		 * We can detect this spurious return probe activation by
 		 * observing that in this case %npc will be %pc + 4 and %npc
 		 * will be inside the current function (unless the user is
 		 * doing _crazy_ instruction picking in which case there's
 		 * very little we can do). The second check is important
 		 * in case the last instructions of a function make a tail-
 		 * call to the function located immediately subsequent.
 		 */
 		if (rp->r_npc == rp->r_pc + 4 &&
 		    rp->r_npc - probe->ftp_faddr < probe->ftp_fsize)
 			continue;
 
 		/*
 		 * The first argument is the offset of return tracepoint
 		 * in the function; the remaining arguments are the return
 		 * values.
 		 *
 		 * If fake_restore is set, we need to pull the return values
 		 * out of the %i's rather than the %o's -- a little trickier.
 		 */
 		if (!fake_restore) {
 			dtrace_probe(probe->ftp_id, pc - probe->ftp_faddr,
 			    rp->r_o0, rp->r_o1, rp->r_o2, rp->r_o3);
 		} else {
 			uintptr_t arg0 = fasttrap_getreg(rp, R_I0);
 			uintptr_t arg1 = fasttrap_getreg(rp, R_I1);
 			uintptr_t arg2 = fasttrap_getreg(rp, R_I2);
 			uintptr_t arg3 = fasttrap_getreg(rp, R_I3);
 
 			cookie = dtrace_interrupt_disable();
 			DTRACE_CPUFLAG_SET(CPU_DTRACE_FAKERESTORE);
 			dtrace_probe(probe->ftp_id, pc - probe->ftp_faddr,
 			    arg0, arg1, arg2, arg3);
 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_FAKERESTORE);
 			dtrace_interrupt_enable(cookie);
 		}
 	}
 
 	mutex_exit(pid_mtx);
 }
 
 int
 fasttrap_pid_probe(struct regs *rp)
 {
 	proc_t *p = curproc;
 	fasttrap_tracepoint_t *tp, tp_local;
 	fasttrap_id_t *id;
 	pid_t pid;
 	uintptr_t pc = rp->r_pc;
 	uintptr_t npc = rp->r_npc;
 	uintptr_t orig_pc = pc;
 	fasttrap_bucket_t *bucket;
 	kmutex_t *pid_mtx;
 	uint_t fake_restore = 0, is_enabled = 0;
 	dtrace_icookie_t cookie;
 
 	/*
 	 * It's possible that a user (in a veritable orgy of bad planning)
 	 * could redirect this thread's flow of control before it reached the
 	 * return probe fasttrap. In this case we need to kill the process
-	 * since it's in a unrecoverable state.
+	 * since it's in an unrecoverable state.
 	 */
 	if (curthread->t_dtrace_step) {
 		ASSERT(curthread->t_dtrace_on);
 		fasttrap_sigtrap(p, curthread, pc);
 		return (0);
 	}
 
 	/*
 	 * Clear all user tracing flags.
 	 */
 	curthread->t_dtrace_ft = 0;
 	curthread->t_dtrace_pc = 0;
 	curthread->t_dtrace_npc = 0;
 	curthread->t_dtrace_scrpc = 0;
 	curthread->t_dtrace_astpc = 0;
 
 	/*
 	 * Treat a child created by a call to vfork(2) as if it were its
 	 * parent. We know that there's only one thread of control in such a
 	 * process: this one.
 	 */
 	while (p->p_flag & SVFORK) {
 		p = p->p_parent;
 	}
 
 	pid = p->p_pid;
 	pid_mtx = &cpu_core[CPU->cpu_id].cpuc_pid_lock;
 	mutex_enter(pid_mtx);
 	bucket = &fasttrap_tpoints.fth_table[FASTTRAP_TPOINTS_INDEX(pid, pc)];
 
 	/*
 	 * Lookup the tracepoint that the process just hit.
 	 */
 	for (tp = bucket->ftb_data; tp != NULL; tp = tp->ftt_next) {
 		if (pid == tp->ftt_pid && pc == tp->ftt_pc &&
 		    tp->ftt_proc->ftpc_acount != 0)
 			break;
 	}
 
 	/*
 	 * If we couldn't find a matching tracepoint, either a tracepoint has
 	 * been inserted without using the pid<pid> ioctl interface (see
 	 * fasttrap_ioctl), or somehow we have mislaid this tracepoint.
 	 */
 	if (tp == NULL) {
 		mutex_exit(pid_mtx);
 		return (-1);
 	}
 
 	for (id = tp->ftt_ids; id != NULL; id = id->fti_next) {
 		fasttrap_probe_t *probe = id->fti_probe;
 		int isentry = (id->fti_ptype == DTFTP_ENTRY);
 
 		if (id->fti_ptype == DTFTP_IS_ENABLED) {
 			is_enabled = 1;
 			continue;
 		}
 
 		/*
 		 * We note that this was an entry probe to help ustack() find
 		 * the first caller.
 		 */
 		if (isentry) {
 			cookie = dtrace_interrupt_disable();
 			DTRACE_CPUFLAG_SET(CPU_DTRACE_ENTRY);
 		}
 		dtrace_probe(probe->ftp_id, rp->r_o0, rp->r_o1, rp->r_o2,
 		    rp->r_o3, rp->r_o4);
 		if (isentry) {
 			DTRACE_CPUFLAG_CLEAR(CPU_DTRACE_ENTRY);
 			dtrace_interrupt_enable(cookie);
 		}
 	}
 
 	/*
 	 * We're about to do a bunch of work so we cache a local copy of
 	 * the tracepoint to emulate the instruction, and then find the
 	 * tracepoint again later if we need to light up any return probes.
 	 */
 	tp_local = *tp;
 	mutex_exit(pid_mtx);
 	tp = &tp_local;
 
 	/*
-	 * If there's an is-enabled probe conntected to this tracepoint it
+	 * If there's an is-enabled probe connected to this tracepoint it
 	 * means that there was a 'mov %g0, %o0' instruction that was placed
 	 * there by DTrace when the binary was linked. As this probe is, in
 	 * fact, enabled, we need to stuff 1 into %o0. Accordingly, we can
 	 * bypass all the instruction emulation logic since we know the
 	 * inevitable result. It's possible that a user could construct a
 	 * scenario where the 'is-enabled' probe was on some other
 	 * instruction, but that would be a rather exotic way to shoot oneself
 	 * in the foot.
 	 */
 	if (is_enabled) {
 		rp->r_o0 = 1;
 		pc = rp->r_npc;
 		npc = pc + 4;
 		goto done;
 	}
 
 	/*
 	 * We emulate certain types of instructions to ensure correctness
 	 * (in the case of position dependent instructions) or optimize
 	 * common cases. The rest we have the thread execute back in user-
 	 * land.
 	 */
 	switch (tp->ftt_type) {
 	case FASTTRAP_T_SAVE:
 	{
 		int32_t imm;
 
 		/*
-		 * This an optimization to let us handle function entry
+		 * This is an optimization to let us handle function entry
 		 * probes more efficiently. Many functions begin with a save
 		 * instruction that follows the pattern:
 		 *	save	%sp, <imm>, %sp
 		 *
 		 * Meanwhile, we've stashed the instruction:
 		 *	save	%g1, %g0, %sp
 		 *
 		 * off of %g7, so all we have to do is stick the right value
 		 * into %g1 and reset %pc to point to the instruction we've
 		 * cleverly hidden (%npc should not be touched).
 		 */
 
 		imm = tp->ftt_instr << 19;
 		imm >>= 19;
 		rp->r_g1 = rp->r_sp + imm;
 		pc = rp->r_g7 + FASTTRAP_OFF_SAVE;
 		break;
 	}
 
 	case FASTTRAP_T_RESTORE:
 	{
 		ulong_t value;
 		uint_t rd;
 
 		/*
 		 * This is an optimization to let us handle function
 		 * return probes more efficiently. Most non-leaf functions
 		 * end with the sequence:
 		 *	ret
 		 *	restore	<reg>, <reg_or_imm>, %oX
 		 *
 		 * We've stashed the instruction:
 		 *	restore	%g0, %g0, %g0
 		 *
 		 * off of %g7 so we just need to place the correct value
 		 * in the right %i register (since after our fake-o
 		 * restore, the %i's will become the %o's) and set the %pc
 		 * to point to our hidden restore. We also set fake_restore to
 		 * let fasttrap_return_common() know that it will find the
 		 * return values in the %i's rather than the %o's.
 		 */
 
 		if (I(tp->ftt_instr)) {
 			int32_t imm;
 
 			imm = tp->ftt_instr << 19;
 			imm >>= 19;
 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) + imm;
 		} else {
 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) +
 			    fasttrap_getreg(rp, RS2(tp->ftt_instr));
 		}
 
 		/*
 		 * Convert %o's to %i's; leave %g's as they are.
 		 */
 		rd = RD(tp->ftt_instr);
 		fasttrap_putreg(rp, ((rd & 0x18) == 0x8) ? rd + 16 : rd, value);
 
 		pc = rp->r_g7 + FASTTRAP_OFF_RESTORE;
 		fake_restore = 1;
 		break;
 	}
 
 	case FASTTRAP_T_RETURN:
 	{
 		uintptr_t target;
 
 		/*
 		 * A return instruction is like a jmpl (without the link
 		 * part) that executes an implicit restore. We've stashed
 		 * the instruction:
 		 *	return %o0
 		 *
 		 * off of %g7 so we just need to place the target in %o0
 		 * and set the %pc to point to the stashed return instruction.
 		 * We use %o0 since that register disappears after the return
 		 * executes, erasing any evidence of this tampering.
 		 */
 		if (I(tp->ftt_instr)) {
 			int32_t imm;
 
 			imm = tp->ftt_instr << 19;
 			imm >>= 19;
 			target = fasttrap_getreg(rp, RS1(tp->ftt_instr)) + imm;
 		} else {
 			target = fasttrap_getreg(rp, RS1(tp->ftt_instr)) +
 			    fasttrap_getreg(rp, RS2(tp->ftt_instr));
 		}
 
 		fasttrap_putreg(rp, R_O0, target);
 
 		pc = rp->r_g7 + FASTTRAP_OFF_RETURN;
 		fake_restore = 1;
 		break;
 	}
 
 	case FASTTRAP_T_OR:
 	{
 		ulong_t value;
 
 		if (I(tp->ftt_instr)) {
 			int32_t imm;
 
 			imm = tp->ftt_instr << 19;
 			imm >>= 19;
 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) | imm;
 		} else {
 			value = fasttrap_getreg(rp, RS1(tp->ftt_instr)) |
 			    fasttrap_getreg(rp, RS2(tp->ftt_instr));
 		}
 
 		fasttrap_putreg(rp, RD(tp->ftt_instr), value);
 		pc = rp->r_npc;
 		npc = pc + 4;
 		break;
 	}
 
 	case FASTTRAP_T_SETHI:
 		if (RD(tp->ftt_instr) != R_G0) {
 			uint32_t imm32 = tp->ftt_instr << 10;
 			fasttrap_putreg(rp, RD(tp->ftt_instr), (ulong_t)imm32);
 		}
 		pc = rp->r_npc;
 		npc = pc + 4;
 		break;
 
 	case FASTTRAP_T_CCR:
 	{
 		uint_t c, v, z, n, taken;
 		uint_t ccr = rp->r_tstate >> TSTATE_CCR_SHIFT;
 
 		if (tp->ftt_cc != 0)
 			ccr >>= 4;
 
 		c = (ccr >> 0) & 1;
 		v = (ccr >> 1) & 1;
 		z = (ccr >> 2) & 1;
 		n = (ccr >> 3) & 1;
 
 		switch (tp->ftt_code) {
 		case 0x0:	/* BN */
 			taken = 0;		break;
 		case 0x1:	/* BE */
 			taken = z;		break;
 		case 0x2:	/* BLE */
 			taken = z | (n ^ v);	break;
 		case 0x3:	/* BL */
 			taken = n ^ v;		break;
 		case 0x4:	/* BLEU */
 			taken = c | z;		break;
 		case 0x5:	/* BCS (BLU) */
 			taken = c;		break;
 		case 0x6:	/* BNEG */
 			taken = n;		break;
 		case 0x7:	/* BVS */
 			taken = v;		break;
 		case 0x8:	/* BA */
 			/*
 			 * We handle the BA case differently since the annul
 			 * bit means something slightly different.
 			 */
 			panic("fasttrap: mishandled a branch");
 			taken = 1;		break;
 		case 0x9:	/* BNE */
 			taken = ~z;		break;
 		case 0xa:	/* BG */
 			taken = ~(z | (n ^ v));	break;
 		case 0xb:	/* BGE */
 			taken = ~(n ^ v);	break;
 		case 0xc:	/* BGU */
 			taken = ~(c | z);	break;
 		case 0xd:	/* BCC (BGEU) */
 			taken = ~c;		break;
 		case 0xe:	/* BPOS */
 			taken = ~n;		break;
 		case 0xf:	/* BVC */
 			taken = ~v;		break;
 		}
 
 		if (taken & 1) {
 			pc = rp->r_npc;
 			npc = tp->ftt_dest;
 		} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
 			/*
 			 * Untaken annulled branches don't execute the
 			 * instruction in the delay slot.
 			 */
 			pc = rp->r_npc + 4;
 			npc = pc + 4;
 		} else {
 			pc = rp->r_npc;
 			npc = pc + 4;
 		}
 		break;
 	}
 
 	case FASTTRAP_T_FCC:
 	{
 		uint_t fcc;
 		uint_t taken;
 		uint64_t fsr;
 
 		dtrace_getfsr(&fsr);
 
 		if (tp->ftt_cc == 0) {
 			fcc = (fsr >> 10) & 0x3;
 		} else {
 			uint_t shift;
 			ASSERT(tp->ftt_cc <= 3);
 			shift = 30 + tp->ftt_cc * 2;
 			fcc = (fsr >> shift) & 0x3;
 		}
 
 		switch (tp->ftt_code) {
 		case 0x0:	/* FBN */
 			taken = (1 << fcc) & (0|0|0|0);	break;
 		case 0x1:	/* FBNE */
 			taken = (1 << fcc) & (8|4|2|0);	break;
 		case 0x2:	/* FBLG */
 			taken = (1 << fcc) & (0|4|2|0);	break;
 		case 0x3:	/* FBUL */
 			taken = (1 << fcc) & (8|0|2|0);	break;
 		case 0x4:	/* FBL */
 			taken = (1 << fcc) & (0|0|2|0);	break;
 		case 0x5:	/* FBUG */
 			taken = (1 << fcc) & (8|4|0|0);	break;
 		case 0x6:	/* FBG */
 			taken = (1 << fcc) & (0|4|0|0);	break;
 		case 0x7:	/* FBU */
 			taken = (1 << fcc) & (8|0|0|0);	break;
 		case 0x8:	/* FBA */
 			/*
 			 * We handle the FBA case differently since the annul
 			 * bit means something slightly different.
 			 */
 			panic("fasttrap: mishandled a branch");
 			taken = (1 << fcc) & (8|4|2|1);	break;
 		case 0x9:	/* FBE */
 			taken = (1 << fcc) & (0|0|0|1);	break;
 		case 0xa:	/* FBUE */
 			taken = (1 << fcc) & (8|0|0|1);	break;
 		case 0xb:	/* FBGE */
 			taken = (1 << fcc) & (0|4|0|1);	break;
 		case 0xc:	/* FBUGE */
 			taken = (1 << fcc) & (8|4|0|1);	break;
 		case 0xd:	/* FBLE */
 			taken = (1 << fcc) & (0|0|2|1);	break;
 		case 0xe:	/* FBULE */
 			taken = (1 << fcc) & (8|0|2|1);	break;
 		case 0xf:	/* FBO */
 			taken = (1 << fcc) & (0|4|2|1);	break;
 		}
 
 		if (taken) {
 			pc = rp->r_npc;
 			npc = tp->ftt_dest;
 		} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
 			/*
 			 * Untaken annulled branches don't execute the
 			 * instruction in the delay slot.
 			 */
 			pc = rp->r_npc + 4;
 			npc = pc + 4;
 		} else {
 			pc = rp->r_npc;
 			npc = pc + 4;
 		}
 		break;
 	}
 
 	case FASTTRAP_T_REG:
 	{
 		int64_t value;
 		uint_t taken;
 		uint_t reg = RS1(tp->ftt_instr);
 
 		/*
 		 * An ILP32 process shouldn't be using a branch predicated on
 		 * an %i or an %l since it would violate the ABI. It's a
 		 * violation of the ABI because we can't ensure deterministic
 		 * behavior. We should have identified this case when we
 		 * enabled the probe.
 		 */
 		ASSERT(p->p_model == DATAMODEL_LP64 || reg < 16);
 
 		value = (int64_t)fasttrap_getreg(rp, reg);
 
 		switch (tp->ftt_code) {
 		case 0x1:	/* BRZ */
 			taken = (value == 0);	break;
 		case 0x2:	/* BRLEZ */
 			taken = (value <= 0);	break;
 		case 0x3:	/* BRLZ */
 			taken = (value < 0);	break;
 		case 0x5:	/* BRNZ */
 			taken = (value != 0);	break;
 		case 0x6:	/* BRGZ */
 			taken = (value > 0);	break;
 		case 0x7:	/* BRGEZ */
 			taken = (value >= 0);	break;
 		default:
 		case 0x0:
 		case 0x4:
 			panic("fasttrap: mishandled a branch");
 		}
 
 		if (taken) {
 			pc = rp->r_npc;
 			npc = tp->ftt_dest;
 		} else if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
 			/*
 			 * Untaken annulled branches don't execute the
 			 * instruction in the delay slot.
 			 */
 			pc = rp->r_npc + 4;
 			npc = pc + 4;
 		} else {
 			pc = rp->r_npc;
 			npc = pc + 4;
 		}
 		break;
 	}
 
 	case FASTTRAP_T_ALWAYS:
 		/*
 		 * BAs, BA,As...
 		 */
 
 		if (tp->ftt_flags & FASTTRAP_F_ANNUL) {
 			/*
 			 * Annulled branch always instructions never execute
 			 * the instruction in the delay slot.
 			 */
 			pc = tp->ftt_dest;
 			npc = tp->ftt_dest + 4;
 		} else {
 			pc = rp->r_npc;
 			npc = tp->ftt_dest;
 		}
 		break;
 
 	case FASTTRAP_T_RDPC:
 		fasttrap_putreg(rp, RD(tp->ftt_instr), rp->r_pc);
 		pc = rp->r_npc;
 		npc = pc + 4;
 		break;
 
 	case FASTTRAP_T_CALL:
 		/*
 		 * It's a call _and_ link remember...
 		 */
 		rp->r_o7 = rp->r_pc;
 		pc = rp->r_npc;
 		npc = tp->ftt_dest;
 		break;
 
 	case FASTTRAP_T_JMPL:
 		pc = rp->r_npc;
 
 		if (I(tp->ftt_instr)) {
 			uint_t rs1 = RS1(tp->ftt_instr);
 			int32_t imm;
 
 			imm = tp->ftt_instr << 19;
 			imm >>= 19;
 			npc = fasttrap_getreg(rp, rs1) + imm;
 		} else {
 			uint_t rs1 = RS1(tp->ftt_instr);
 			uint_t rs2 = RS2(tp->ftt_instr);
 
 			npc = fasttrap_getreg(rp, rs1) +
 			    fasttrap_getreg(rp, rs2);
 		}
 
 		/*
 		 * Do the link part of the jump-and-link instruction.
 		 */
 		fasttrap_putreg(rp, RD(tp->ftt_instr), rp->r_pc);
 
 		break;
 
 	case FASTTRAP_T_COMMON:
 	{
 		curthread->t_dtrace_scrpc = rp->r_g7;
 		curthread->t_dtrace_astpc = rp->r_g7 + FASTTRAP_OFF_FTRET;
 
 		/*
 		 * Copy the instruction to a reserved location in the
 		 * user-land thread structure, then set the PC to that
 		 * location and leave the NPC alone. We take pains to ensure
 		 * consistency in the instruction stream (See SPARC
 		 * Architecture Manual Version 9, sections 8.4.7, A.20, and
 		 * H.1.6; UltraSPARC I/II User's Manual, sections 3.1.1.1,
 		 * and 13.6.4) by using the ASI ASI_BLK_COMMIT_S to copy the
 		 * instruction into the user's address space without
 		 * bypassing the I$. There's no AS_USER version of this ASI
 		 * (as exist for other ASIs) so we use the lofault
 		 * mechanism to catch faults.
 		 */
 		if (dtrace_blksuword32(rp->r_g7, &tp->ftt_instr, 1) == -1) {
 			/*
 			 * If the copyout fails, then the process's state
 			 * is not consistent (the effects of the traced
 			 * instruction will never be seen). This process
 			 * cannot be allowed to continue execution.
 			 */
 			fasttrap_sigtrap(curproc, curthread, pc);
 			return (0);
 		}
 
 		curthread->t_dtrace_pc = pc;
 		curthread->t_dtrace_npc = npc;
 		curthread->t_dtrace_on = 1;
 
 		pc = curthread->t_dtrace_scrpc;
 
 		if (tp->ftt_retids != NULL) {
 			curthread->t_dtrace_step = 1;
 			curthread->t_dtrace_ret = 1;
 			npc = curthread->t_dtrace_astpc;
 		}
 		break;
 	}
 
 	default:
 		panic("fasttrap: mishandled an instruction");
 	}
 
 	/*
 	 * This bit me in the ass a couple of times, so lets toss this
 	 * in as a cursory sanity check.
 	 */
 	ASSERT(pc != rp->r_g7 + 4);
 	ASSERT(pc != rp->r_g7 + 8);
 
 done:
 	/*
 	 * If there were no return probes when we first found the tracepoint,
 	 * we should feel no obligation to honor any return probes that were
 	 * subsequently enabled -- they'll just have to wait until the next
 	 * time around.
 	 */
 	if (tp->ftt_retids != NULL) {
 		/*
 		 * We need to wait until the results of the instruction are
 		 * apparent before invoking any return probes. If this
 		 * instruction was emulated we can just call
 		 * fasttrap_return_common(); if it needs to be executed, we
 		 * need to wait until we return to the kernel.
 		 */
 		if (tp->ftt_type != FASTTRAP_T_COMMON) {
 			fasttrap_return_common(rp, orig_pc, pid, fake_restore);
 		} else {
 			ASSERT(curthread->t_dtrace_ret != 0);
 			ASSERT(curthread->t_dtrace_pc == orig_pc);
 			ASSERT(curthread->t_dtrace_scrpc == rp->r_g7);
 			ASSERT(npc == curthread->t_dtrace_astpc);
 		}
 	}
 
 	ASSERT(pc != 0);
 	rp->r_pc = pc;
 	rp->r_npc = npc;
 
 	return (0);
 }
 
 /*
  * Fire the return probes for the traced instruction that the current
  * thread has just finished executing. The original %pc/%npc that were
  * stashed in the thread are consumed (and cleared) here, written back
  * into rp, and handed to fasttrap_return_common(). Always returns 0.
  */
 int
 fasttrap_return_probe(struct regs *rp)
 {
 	uintptr_t traced_pc = curthread->t_dtrace_pc;
 	uintptr_t traced_npc = curthread->t_dtrace_npc;
 	proc_t *owner;
 
 	/*
 	 * The saved state is single-use; wipe it now that we hold copies.
 	 */
 	curthread->t_dtrace_pc = 0;
 	curthread->t_dtrace_npc = 0;
 	curthread->t_dtrace_scrpc = 0;
 	curthread->t_dtrace_astpc = 0;
 
 	/*
 	 * A vfork(2) child is treated as though it were its parent; such a
 	 * process has exactly one thread of control (this one), so walk up
 	 * until we reach a process that is not a vfork child.
 	 */
 	for (owner = ttoproc(curthread); owner->p_flag & SVFORK;
 	    owner = owner->p_parent)
 		continue;
 
 	/*
 	 * Make it look to dtrace_probe() as if we are still sitting on the
 	 * traced instruction. Our caller, dtrace_return_probe(), puts the
 	 * real %pc and %npc back once we return.
 	 */
 	rp->r_pc = traced_pc;
 	rp->r_npc = traced_npc;
 
 	fasttrap_return_common(rp, traced_pc, owner->p_pid, 0);
 
 	return (0);
 }
 
 /*
  * Write the fasttrap trap instruction over the instruction at tp->ftt_pc
  * in process p. Returns 0 when the write succeeds and -1 when it fails.
  */
 int
 fasttrap_tracepoint_install(proc_t *p, fasttrap_tracepoint_t *tp)
 {
 	fasttrap_instr_t trap = FASTTRAP_INSTR;
 	int failed;
 
 	failed = (uwrite(p, &trap, 4, tp->ftt_pc) != 0);
 
 	return (failed ? -1 : 0);
 }
 
 /*
  * Put the original instruction (tp->ftt_instr) back at tp->ftt_pc in
  * process p. Returns -1 only if writing the original instruction fails;
  * every other outcome returns 0.
  */
 int
 fasttrap_tracepoint_remove(proc_t *p, fasttrap_tracepoint_t *tp)
 {
 	fasttrap_instr_t current;
 
 	/*
 	 * If the text can't be read, or it no longer holds our trap (or a
 	 * breakpoint), there is nothing for us to restore. Neither case is
 	 * reported as a failure; only a failed write below is.
 	 */
 	if (uread(p, &current, 4, tp->ftt_pc) != 0 ||
 	    (current != FASTTRAP_INSTR && current != BREAKPOINT_INSTR))
 		return (0);
 
 	return (uwrite(p, &tp->ftt_instr, 4, tp->ftt_pc) != 0 ? -1 : 0);
 }
 
 /*
  * Decode the instruction at pc in process p and fill in the tracepoint tp
  * accordingly: ftt_type selects how the instruction will be handled when
  * the probe fires, ftt_dest/ftt_cc/ftt_code/ftt_flags carry the decoded
  * branch details, and ftt_instr records the original instruction word.
  *
  * Returns 0 on success. Returns -1 if the instruction cannot be read, or
  * if it is one we refuse to instrument (illtrap, illegal encodings,
  * breakpoint traps, and register-predicated branches on %l/%i registers
  * in ILP32 processes).
  */
 int
 fasttrap_tracepoint_init(proc_t *p, fasttrap_tracepoint_t *tp, uintptr_t pc,
     fasttrap_probe_type_t type)
 {
 	uint32_t instr;
 	int32_t disp;
 
 	/*
 	 * Read the instruction at the given address out of the process's
 	 * address space. We don't have to worry about a debugger
 	 * changing this instruction before we overwrite it with our trap
 	 * instruction since P_PR_LOCK is set.
 	 */
 	if (uread(p, &instr, 4, pc) != 0)
 		return (-1);
 
 	/*
 	 * Decode the instruction to fill in the probe flags. We can have
 	 * the process execute most instructions on its own using a pc/npc
 	 * trick, but pc-relative control transfers present a problem since
 	 * we're relocating the instruction. We emulate these instructions
 	 * in the kernel. We assume a default type and overwrite that as
 	 * needed.
 	 *
 	 * pc-relative instructions must be emulated for correctness;
 	 * other instructions (which represent a large set of commonly traced
 	 * instructions) are emulated or otherwise optimized for performance.
 	 */
 	tp->ftt_type = FASTTRAP_T_COMMON;
 	if (OP(instr) == 1) {
 		/*
 		 * Call instructions.
 		 */
 		tp->ftt_type = FASTTRAP_T_CALL;
 		/* The displacement is in words; scale it to bytes. */
 		disp = DISP30(instr) << 2;
 		tp->ftt_dest = pc + (intptr_t)disp;
 
 	} else if (OP(instr) == 0) {
 		/*
 		 * Branch instructions.
 		 *
 		 * Unconditional branches need careful attention when they're
 		 * annulled: annulled unconditional branches never execute
 		 * the instruction in the delay slot.
 		 */
 		switch (OP2(instr)) {
 		case OP2_ILLTRAP:
 		case 0x7:
 			/*
 			 * The compiler may place an illtrap after a call to
 			 * a function that returns a structure. In the case of
 			 * a returned structure, the compiler places an illtrap
 			 * whose const22 field is the size of the returned
 			 * structure immediately following the delay slot of
 			 * the call. To stay out of the way, we refuse to
 			 * place tracepoints on top of illtrap instructions.
 			 *
 			 * This is one of the dumbest architectural decisions
 			 * I've ever had to work around.
 			 *
 			 * We also identify the only illegal op2 value (See
 			 * SPARC Architecture Manual Version 9, E.2 table 31).
 			 */
 			return (-1);
 
 		case OP2_BPcc:
 			/* Condition code 8 is "branch always". */
 			if (COND(instr) == 8) {
 				tp->ftt_type = FASTTRAP_T_ALWAYS;
 			} else {
 				/*
 				 * Check for an illegal instruction.
 				 */
 				if (CC(instr) & 1)
 					return (-1);
 				tp->ftt_type = FASTTRAP_T_CCR;
 				tp->ftt_cc = CC(instr);
 				tp->ftt_code = COND(instr);
 			}
 
 			if (A(instr) != 0)
 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
 
 			/*
 			 * Sign-extend the displacement field and scale it
 			 * from words to bytes: the left shift by 13 moves
 			 * bit 18 up to bit 31, and the right shift by 11
 			 * brings it back down leaving a net multiply by 4.
 			 * NOTE(review): assumes DISP19() yields a 19-bit
 			 * field and that >> on a negative int32_t is an
 			 * arithmetic shift.
 			 */
 			disp = DISP19(instr);
 			disp <<= 13;
 			disp >>= 11;
 			tp->ftt_dest = pc + (intptr_t)disp;
 			break;
 
 		case OP2_Bicc:
 			if (COND(instr) == 8) {
 				tp->ftt_type = FASTTRAP_T_ALWAYS;
 			} else {
 				tp->ftt_type = FASTTRAP_T_CCR;
 				tp->ftt_cc = 0;
 				tp->ftt_code = COND(instr);
 			}
 
 			if (A(instr) != 0)
 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
 
 			/*
 			 * Same sign-extend-and-scale trick as above, for a
 			 * 22-bit field: up by 10, back down by 8.
 			 */
 			disp = DISP22(instr);
 			disp <<= 10;
 			disp >>= 8;
 			tp->ftt_dest = pc + (intptr_t)disp;
 			break;
 
 		case OP2_BPr:
 			/*
 			 * Check for an illegal instruction.
 			 */
 			if ((RCOND(instr) & 3) == 0)
 				return (-1);
 
 			/*
 			 * It's a violation of the v8plus ABI to use a
 			 * register-predicated branch in a 32-bit app if
 			 * the register used is an %l or an %i (%gs and %os
 			 * are legit because they're not saved to the stack
 			 * in 32-bit words when we take a trap).
 			 */
 			if (p->p_model == DATAMODEL_ILP32 && RS1(instr) >= 16)
 				return (-1);
 
 			tp->ftt_type = FASTTRAP_T_REG;
 			if (A(instr) != 0)
 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
 			/*
 			 * Sign-extend and scale a 16-bit field: up by 16,
 			 * back down by 14.
 			 */
 			disp = DISP16(instr);
 			disp <<= 16;
 			disp >>= 14;
 			tp->ftt_dest = pc + (intptr_t)disp;
 			tp->ftt_code = RCOND(instr);
 			break;
 
 		case OP2_SETHI:
 			tp->ftt_type = FASTTRAP_T_SETHI;
 			break;
 
 		case OP2_FBPfcc:
 			if (COND(instr) == 8) {
 				tp->ftt_type = FASTTRAP_T_ALWAYS;
 			} else {
 				tp->ftt_type = FASTTRAP_T_FCC;
 				tp->ftt_cc = CC(instr);
 				tp->ftt_code = COND(instr);
 			}
 
 			if (A(instr) != 0)
 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
 
 			/* Sign-extend and scale, as for OP2_BPcc. */
 			disp = DISP19(instr);
 			disp <<= 13;
 			disp >>= 11;
 			tp->ftt_dest = pc + (intptr_t)disp;
 			break;
 
 		case OP2_FBfcc:
 			if (COND(instr) == 8) {
 				tp->ftt_type = FASTTRAP_T_ALWAYS;
 			} else {
 				tp->ftt_type = FASTTRAP_T_FCC;
 				tp->ftt_cc = 0;
 				tp->ftt_code = COND(instr);
 			}
 
 			if (A(instr) != 0)
 				tp->ftt_flags |= FASTTRAP_F_ANNUL;
 
 			/* Sign-extend and scale, as for OP2_Bicc. */
 			disp = DISP22(instr);
 			disp <<= 10;
 			disp >>= 8;
 			tp->ftt_dest = pc + (intptr_t)disp;
 			break;
 		}
 
 	} else if (OP(instr) == 2) {
 		/*
 		 * Any op3 value not listed below leaves the tracepoint as
 		 * FASTTRAP_T_COMMON.
 		 */
 		switch (OP3(instr)) {
 		case OP3_RETURN:
 			tp->ftt_type = FASTTRAP_T_RETURN;
 			break;
 
 		case OP3_JMPL:
 			tp->ftt_type = FASTTRAP_T_JMPL;
 			break;
 
 		case OP3_RD:
 			/*
 			 * Only the rs1 == 5 form gets the RDPC type; other
 			 * rd variants keep the default type.
 			 */
 			if (RS1(instr) == 5)
 				tp->ftt_type = FASTTRAP_T_RDPC;
 			break;
 
 		case OP3_SAVE:
 			/*
 			 * We optimize for save instructions at function
 			 * entry; see the comment in fasttrap_pid_probe()
 			 * (near FASTTRAP_T_SAVE) for details.
 			 */
 			if (fasttrap_optimize_save != 0 &&
 			    type == DTFTP_ENTRY &&
 			    I(instr) == 1 && RD(instr) == R_SP)
 				tp->ftt_type = FASTTRAP_T_SAVE;
 			break;
 
 		case OP3_RESTORE:
 			/*
 			 * We optimize restore instructions at function
 			 * return; see the comment in fasttrap_pid_probe()
 			 * (near FASTTRAP_T_RESTORE) for details.
 			 *
 			 * rd must be an %o or %g register.
 			 */
 			if ((RD(instr) & 0x10) == 0)
 				tp->ftt_type = FASTTRAP_T_RESTORE;
 			break;
 
 		case OP3_OR:
 			/*
 			 * A large proportion of instructions in the delay
 			 * slot of retl instructions are or's so we emulate
 			 * these downstairs as an optimization.
 			 */
 			tp->ftt_type = FASTTRAP_T_OR;
 			break;
 
 		case OP3_TCC:
 			/*
 			 * Breakpoint instructions are effectively position-
 			 * dependent since the debugger uses the %pc value
 			 * to lookup which breakpoint was executed. As a
 			 * result, we can't actually instrument breakpoints.
 			 */
 			if (SW_TRAP(instr) == ST_BREAKPOINT)
 				return (-1);
 			break;
 
 		case 0x19:
 		case 0x1d:
 		case 0x29:
 		case 0x33:
 		case 0x3f:
 			/*
 			 * Identify illegal instructions (See SPARC
 			 * Architecture Manual Version 9, E.2 table 32).
 			 */
 			return (-1);
 		}
 	} else if (OP(instr) == 3) {
 		uint32_t op3 = OP3(instr);
 
 		/*
 		 * Identify illegal instructions (See SPARC Architecture
 		 * Manual Version 9, E.2 table 33).
 		 */
 		if ((op3 & 0x28) == 0x28) {
 			if (op3 != OP3_PREFETCH && op3 != OP3_CASA &&
 			    op3 != OP3_PREFETCHA && op3 != OP3_CASXA)
 				return (-1);
 		} else {
 			if ((op3 & 0x0f) == 0x0c || (op3 & 0x3b) == 0x31)
 				return (-1);
 		}
 	}
 
 	/*
 	 * Remember the original instruction word. Note that instr is
 	 * reused below as scratch space for the preceding instruction.
 	 */
 	tp->ftt_instr = instr;
 
 	/*
 	 * We don't know how this tracepoint is going to be used, but in case
 	 * it's used as part of a function return probe, we need to indicate
 	 * whether it's always a return site or only potentially a return
 	 * site. If it's part of a return probe, it's always going to be a
 	 * return from that function if it's a restore instruction or if
 	 * the previous instruction was a return. If we could reliably
 	 * distinguish jump tables from return sites, this wouldn't be
 	 * necessary.
 	 */
 	if (tp->ftt_type != FASTTRAP_T_RESTORE &&
 	    (uread(p, &instr, 4, pc - sizeof (instr)) != 0 ||
 	    !(OP(instr) == 2 && OP3(instr) == OP3_RETURN)))
 		tp->ftt_flags |= FASTTRAP_F_RETMAYBE;
 
 	return (0);
 }
 
 /*
  * Fetch argument number argno for a pid-provider probe from the register
  * state of the current thread's lwp. The other parameters are not used.
  */
 /*ARGSUSED*/
 uint64_t
 fasttrap_pid_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
     int aframes)
 {
 	uint64_t argval;
 
 	argval = fasttrap_anarg(ttolwp(curthread)->lwp_regs, argno);
 
 	return (argval);
 }
 
 /*
  * Fetch argument number argno for a USDT probe from the register state of
  * the current thread's lwp. The other parameters are not used.
  */
 /*ARGSUSED*/
 uint64_t
 fasttrap_usdt_getarg(void *arg, dtrace_id_t id, void *parg, int argno,
     int aframes)
 {
 	uint64_t argval;
 
 	argval = fasttrap_anarg(ttolwp(curthread)->lwp_regs, argno);
 
 	return (argval);
 }
 
 /*
  * Counters recording which path fasttrap_getreg() took to find a %l/%i
  * register: directly from a register window (fast), from a window
  * buffered in the machpcb (mpcb), or from the user's stack (slow).
  */
 static uint64_t fasttrap_getreg_fast_cnt;
 static uint64_t fasttrap_getreg_mpcb_cnt;
 static uint64_t fasttrap_getreg_slow_cnt;
 
 /*
  * Return the value of user register number reg for the thread whose saved
  * register state is rp. Register 0 always reads as 0 and registers 1-15
  * come straight out of rp; registers 16 and up are looked for, in order,
  * in the register windows, in the windows buffered in the machpcb, and
  * finally in the frame on the user's stack. If that last read fails, the
  * process is sent SIGILL and 0 is returned.
  */
 static ulong_t
 fasttrap_getreg(struct regs *rp, uint_t reg)
 {
 	ulong_t value;
 	dtrace_icookie_t cookie;
 	struct machpcb *mpcb;
 	extern ulong_t dtrace_getreg_win(uint_t, uint_t);
 
 	/*
 	 * We have the %os and %gs in our struct regs, but if we need to
 	 * snag a %l or %i we need to go scrounging around in the process's
 	 * address space.
 	 */
 	if (reg == 0)
 		return (0);
 
 	/* Registers 1-15 are indexed as an array starting at r_g1. */
 	if (reg < 16)
 		return ((&rp->r_g1)[reg - 1]);
 
 	/*
 	 * Before we look at the user's stack, we'll check the register
 	 * windows to see if the information we want is in there.
 	 */
 	cookie = dtrace_interrupt_disable();
 	if (dtrace_getotherwin() > 0) {
 		value = dtrace_getreg_win(reg, 1);
 		dtrace_interrupt_enable(cookie);
 
-		atomic_add_64(&fasttrap_getreg_fast_cnt, 1);
+		atomic_inc_64(&fasttrap_getreg_fast_cnt);
 
 		return (value);
 	}
 	dtrace_interrupt_enable(cookie);
 
 	/*
 	 * First check the machpcb structure to see if we've already read
 	 * in the register window we're looking for; if we haven't, (and
 	 * we probably haven't) try to copy in the value of the register.
 	 */
 	/* LINTED - alignment */
 	mpcb = (struct machpcb *)((caddr_t)rp - REGOFF);
 
 	if (get_udatamodel() == DATAMODEL_NATIVE) {
 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
 
 		if (mpcb->mpcb_wbcnt > 0) {
 			struct rwindow *rwin = (void *)mpcb->mpcb_wbuf;
 			int i = mpcb->mpcb_wbcnt;
 			/*
 			 * Scan the buffered windows from the last entry
 			 * down to the first, looking for the one saved
 			 * for the current stack pointer.
 			 */
 			do {
 				i--;
 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
 					continue;
 
-				atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
+				atomic_inc_64(&fasttrap_getreg_mpcb_cnt);
 				/*
 				 * NOTE(review): for %i registers reg - 16
 				 * is 8-15; this presumably relies on the
 				 * ins immediately following rw_local in
 				 * the window layout -- confirm.
 				 */
 				return (rwin[i].rw_local[reg - 16]);
 			} while (i > 0);
 		}
 
 		if (fasttrap_fulword(&fr->fr_local[reg - 16], &value) != 0)
 			goto err;
 	} else {
 		struct frame32 *fr =
 		    (struct frame32 *)(uintptr_t)(caddr32_t)rp->r_sp;
 		uint32_t *v32 = (uint32_t *)&value;
 
 		if (mpcb->mpcb_wbcnt > 0) {
 			struct rwindow32 *rwin = (void *)mpcb->mpcb_wbuf;
 			int i = mpcb->mpcb_wbcnt;
 			do {
 				i--;
 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
 					continue;
 
-				atomic_add_64(&fasttrap_getreg_mpcb_cnt, 1);
+				atomic_inc_64(&fasttrap_getreg_mpcb_cnt);
 				return (rwin[i].rw_local[reg - 16]);
 			} while (i > 0);
 		}
 
 		/*
 		 * Read the 32-bit register into the second word of value
 		 * and clear the first.
 		 * NOTE(review): this yields a zero-extended result only
 		 * on a big-endian layout -- presumably intentional here.
 		 */
 		if (fasttrap_fuword32(&fr->fr_local[reg - 16], &v32[1]) != 0)
 			goto err;
 
 		v32[0] = 0;
 	}
 
-	atomic_add_64(&fasttrap_getreg_slow_cnt, 1);
+	atomic_inc_64(&fasttrap_getreg_slow_cnt);
 	return (value);
 
 err:
 	/*
 	 * If the copy in failed, the process will be in an irrecoverable
 	 * state, and we have no choice but to kill it.
 	 */
 	kern_psignal(ttoproc(curthread), SIGILL);
 	return (0);
 }
 
 /*
  * Counters recording which path fasttrap_putreg() took to store a %l/%i
  * register: directly into a register window (fast), into a window
  * buffered in the machpcb (mpcb), or onto the user's stack (slow).
  */
 static uint64_t fasttrap_putreg_fast_cnt;
 static uint64_t fasttrap_putreg_mpcb_cnt;
 static uint64_t fasttrap_putreg_slow_cnt;
 
 /*
  * Store value into user register number reg for the thread whose saved
  * register state is rp. Writes to register 0 are discarded and registers
  * 1-15 are stored directly in rp; registers 16 and up are written, in
  * order of preference, to the register windows, to a matching window
  * buffered in the machpcb, to the frame on the user's stack, or to a
  * newly buffered machpcb window. If none of those works, the process is
  * sent SIGILL.
  */
 static void
 fasttrap_putreg(struct regs *rp, uint_t reg, ulong_t value)
 {
 	dtrace_icookie_t cookie;
 	struct machpcb *mpcb;
 	extern void dtrace_putreg_win(uint_t, ulong_t);
 
 	if (reg == 0)
 		return;
 
 	/* Registers 1-15 are indexed as an array starting at r_g1. */
 	if (reg < 16) {
 		(&rp->r_g1)[reg - 1] = value;
 		return;
 	}
 
 	/*
 	 * If the user process is still using some register windows, we
 	 * can just place the value in the correct window.
 	 */
 	cookie = dtrace_interrupt_disable();
 	if (dtrace_getotherwin() > 0) {
 		dtrace_putreg_win(reg, value);
 		dtrace_interrupt_enable(cookie);
-		atomic_add_64(&fasttrap_putreg_fast_cnt, 1);
+		atomic_inc_64(&fasttrap_putreg_fast_cnt);
 		return;
 	}
 	dtrace_interrupt_enable(cookie);
 
 	/*
 	 * First see if there's a copy of the register window in the
 	 * machpcb structure that we can modify; if there isn't try to
 	 * copy out the value. If that fails, we try to create a new
 	 * register window in the machpcb structure. While this isn't
 	 * _precisely_ the intended use of the machpcb structure, it
 	 * can't cause any problems since we know at this point in the
 	 * code that all of the user's data have been flushed out of the
 	 * register file (since %otherwin is 0).
 	 */
 	/* LINTED - alignment */
 	mpcb = (struct machpcb *)((caddr_t)rp - REGOFF);
 
 	if (get_udatamodel() == DATAMODEL_NATIVE) {
 		struct frame *fr = (struct frame *)(rp->r_sp + STACK_BIAS);
 		/* LINTED - alignment */
 		struct rwindow *rwin = (struct rwindow *)mpcb->mpcb_wbuf;
 
 		if (mpcb->mpcb_wbcnt > 0) {
 			int i = mpcb->mpcb_wbcnt;
 			/*
 			 * Scan the buffered windows from the last entry
 			 * down to the first, looking for the one saved
 			 * for the current stack pointer.
 			 */
 			do {
 				i--;
 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
 					continue;
 
 				rwin[i].rw_local[reg - 16] = value;
-				atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+				atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
 				return;
 			} while (i > 0);
 		}
 
 		if (fasttrap_sulword(&fr->fr_local[reg - 16], value) != 0) {
 			/*
 			 * The store to the user's stack failed; fall back
 			 * to buffering a fresh copy of the frame in the
 			 * next free machpcb slot, provided one is left
 			 * and the frame can be copied in.
 			 */
 			if (mpcb->mpcb_wbcnt >= MAXWIN || copyin(fr,
 			    &rwin[mpcb->mpcb_wbcnt], sizeof (*rwin)) != 0)
 				goto err;
 
 			rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = value;
 			mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
 			mpcb->mpcb_wbcnt++;
-			atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+			atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
 			return;
 		}
 	} else {
 		struct frame32 *fr =
 		    (struct frame32 *)(uintptr_t)(caddr32_t)rp->r_sp;
 		/* LINTED - alignment */
 		struct rwindow32 *rwin = (struct rwindow32 *)mpcb->mpcb_wbuf;
 		/* Only the low 32 bits are stored for a 32-bit process. */
 		uint32_t v32 = (uint32_t)value;
 
 		if (mpcb->mpcb_wbcnt > 0) {
 			int i = mpcb->mpcb_wbcnt;
 			do {
 				i--;
 				if ((long)mpcb->mpcb_spbuf[i] != rp->r_sp)
 					continue;
 
 				rwin[i].rw_local[reg - 16] = v32;
-				atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+				atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
 				return;
 			} while (i > 0);
 		}
 
 		if (fasttrap_suword32(&fr->fr_local[reg - 16], v32) != 0) {
 			/*
 			 * Same fallback as the native case: buffer a copy
 			 * of the frame in the machpcb and update it there.
 			 */
 			if (mpcb->mpcb_wbcnt >= MAXWIN || copyin(fr,
 			    &rwin[mpcb->mpcb_wbcnt], sizeof (*rwin)) != 0)
 				goto err;
 
 			rwin[mpcb->mpcb_wbcnt].rw_local[reg - 16] = v32;
 			mpcb->mpcb_spbuf[mpcb->mpcb_wbcnt] = (caddr_t)rp->r_sp;
 			mpcb->mpcb_wbcnt++;
-			atomic_add_64(&fasttrap_putreg_mpcb_cnt, 1);
+			atomic_inc_64(&fasttrap_putreg_mpcb_cnt);
 			return;
 		}
 	}
 
 	/* The value was written directly to the user's stack frame. */
-	atomic_add_64(&fasttrap_putreg_slow_cnt, 1);
+	atomic_inc_64(&fasttrap_putreg_slow_cnt);
 	return;
 
 err:
 	/*
 	 * If we couldn't record this register's value, the process is in an
 	 * irrecoverable state and we have no choice but to euthanize it.
 	 */
 	kern_psignal(ttoproc(curthread), SIGILL);
 }
Index: stable/10
===================================================================
--- stable/10	(revision 271000)
+++ stable/10	(revision 271001)

Property changes on: stable/10
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /head:r270247