diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h
index 40f1a8ec2c57..949ea4dfaba1 100644
--- a/include/os/freebsd/spl/sys/taskq.h
+++ b/include/os/freebsd/spl/sys/taskq.h
@@ -1,129 +1,129 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_TASKQ_H
 #define	_SYS_TASKQ_H
 
 #ifdef _KERNEL
 
 #include <sys/types.h>
 #include <sys/proc.h>
 #include <sys/queue.h>
 #include <sys/taskqueue.h>
 #include <sys/thread.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 #define	TASKQ_NAMELEN	31
 
 typedef struct taskq {
 	struct taskqueue	*tq_queue;
 	int			tq_nthreads;
 } taskq_t;
 
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
 typedef struct taskq_ent {
 	union {
 		struct task	 tqent_task;
 		struct timeout_task tqent_timeout_task;
 	};
 	task_func_t	*tqent_func;
 	void		*tqent_arg;
 	taskqid_t	 tqent_id;
 	LIST_ENTRY(taskq_ent) tqent_hash;
 	uint_t		 tqent_type;
 	volatile uint_t	 tqent_rc;
 } taskq_ent_t;
 
 /*
  * Public flags for taskq_create(): bit range 0-15
  */
 #define	TASKQ_PREPOPULATE	0x0001	/* Prepopulate with threads and data */
 #define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
 #define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
 #define	TASKQ_THREADS_CPU_PCT	0x0008	/* number of threads as % of ncpu */
 #define	TASKQ_DC_BATCH		0x0010	/* Taskq uses SDC in batch mode */
 
 /*
  * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as
  * KM_SLEEP/KM_NOSLEEP.
  */
 #define	TQ_SLEEP	0x00	/* Can block for memory */
 #define	TQ_NOSLEEP	0x01	/* cannot block for memory; may fail */
 #define	TQ_NOQUEUE	0x02	/* Do not enqueue if can't dispatch */
 #define	TQ_NOALLOC	0x04	/* cannot allocate memory; may fail */
 #define	TQ_FRONT	0x08	/* Put task at the front of the queue */
 
 #define	TASKQID_INVALID		((taskqid_t)0)
 
 extern taskq_t *system_taskq;
 /* Global dynamic task queue for long delay */
 extern taskq_t *system_delay_taskq;
 
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *,
     uint_t, clock_t);
 extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
     taskq_ent_t *);
 extern int taskq_empty_ent(taskq_ent_t *);
 extern void taskq_init_ent(taskq_ent_t *);
 taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
 taskq_t	*taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
     kthread_t ***);
 taskq_t	*taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t);
 taskq_t	*taskq_create_proc(const char *, int, pri_t, int, int,
     struct proc *, uint_t);
 taskq_t	*taskq_create_sysdc(const char *, int, int, int,
     struct proc *, uint_t, uint_t);
 void	nulltask(void *);
 extern void taskq_destroy(taskq_t *);
 extern void taskq_wait_id(taskq_t *, taskqid_t);
 extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
 extern void taskq_wait(taskq_t *);
-extern int taskq_cancel_id(taskq_t *, taskqid_t);
+extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
 extern int taskq_member(taskq_t *, kthread_t *);
 extern taskq_t *taskq_of_curthread(void);
 void	taskq_suspend(taskq_t *);
 int	taskq_suspended(taskq_t *);
 void	taskq_resume(taskq_t *);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _KERNEL */
 
 #ifdef _STANDALONE
 typedef void taskq_t;
 typedef int taskq_ent_t;
 #define	taskq_init_ent(x)
 #endif /* _STANDALONE */
 
 #endif	/* _SYS_TASKQ_H */
diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h
index c9b2bc994c8c..108b4fbeec8d 100644
--- a/include/os/linux/spl/sys/taskq.h
+++ b/include/os/linux/spl/sys/taskq.h
@@ -1,214 +1,214 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  */
 /*
  * Copyright (c) 2024, Klara Inc.
  * Copyright (c) 2024, Syneto
  */
 
 #ifndef _SPL_TASKQ_H
 #define	_SPL_TASKQ_H
 
 #include <linux/module.h>
 #include <linux/gfp.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/kthread.h>
 #include <sys/types.h>
 #include <sys/thread.h>
 #include <sys/rwlock.h>
 #include <sys/wait.h>
 #include <sys/wmsum.h>
 #include <sys/kstat.h>
 
 #define	TASKQ_NAMELEN		31
 
 #define	TASKQ_PREPOPULATE	0x00000001
 #define	TASKQ_CPR_SAFE		0x00000002
 #define	TASKQ_DYNAMIC		0x00000004
 #define	TASKQ_THREADS_CPU_PCT	0x00000008
 #define	TASKQ_DC_BATCH		0x00000010
 #define	TASKQ_ACTIVE		0x80000000
 
 /*
  * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as
  * KM_SLEEP/KM_NOSLEEP.  TQ_NOQUEUE/TQ_NOALLOC are set particularly
  * large so as not to conflict with already used GFP_* defines.
  */
 #define	TQ_SLEEP		0x00000000
 #define	TQ_NOSLEEP		0x00000001
 #define	TQ_PUSHPAGE		0x00000002
 #define	TQ_NOQUEUE		0x01000000
 #define	TQ_NOALLOC		0x02000000
 #define	TQ_NEW			0x04000000
 #define	TQ_FRONT		0x08000000
 
 /*
  * Reserved taskqid values.
  */
 #define	TASKQID_INVALID		((taskqid_t)0)
 #define	TASKQID_INITIAL		((taskqid_t)1)
 
 /*
  * spin_lock(lock) and spin_lock_nested(lock,0) are equivalent,
  * so TQ_LOCK_DYNAMIC must not evaluate to 0
  */
 typedef enum tq_lock_role {
 	TQ_LOCK_GENERAL =	0,
 	TQ_LOCK_DYNAMIC =	1,
 } tq_lock_role_t;
 
 typedef unsigned long taskqid_t;
 typedef void (task_func_t)(void *);
 
 typedef struct taskq_sums {
 	/* gauges (inc/dec counters, current value) */
 	wmsum_t tqs_threads_active;		/* threads running a task */
 	wmsum_t tqs_threads_idle;		/* threads waiting for work */
 	wmsum_t tqs_threads_total;		/* total threads */
 	wmsum_t tqs_tasks_pending;		/* tasks waiting to execute */
 	wmsum_t tqs_tasks_priority;		/* hi-pri tasks waiting */
 	wmsum_t tqs_tasks_total;		/* total waiting tasks */
 	wmsum_t tqs_tasks_delayed;		/* tasks deferred to future */
 	wmsum_t tqs_entries_free;		/* task entries on free list */
 
 	/* counters (inc only, since taskq creation) */
 	wmsum_t tqs_threads_created;		/* threads created */
 	wmsum_t tqs_threads_destroyed;		/* threads destroyed */
 	wmsum_t tqs_tasks_dispatched;		/* tasks dispatched */
 	wmsum_t tqs_tasks_dispatched_delayed;	/* tasks delayed to future */
 	wmsum_t tqs_tasks_executed_normal;	/* normal pri tasks executed */
 	wmsum_t tqs_tasks_executed_priority;	/* high pri tasks executed */
 	wmsum_t tqs_tasks_executed;		/* total tasks executed */
 	wmsum_t tqs_tasks_delayed_requeued;	/* delayed tasks requeued */
 	wmsum_t tqs_tasks_cancelled;		/* tasks cancelled before run */
 	wmsum_t tqs_thread_wakeups;		/* total thread wakeups */
 	wmsum_t tqs_thread_wakeups_nowork;	/* thread woken but no tasks */
 	wmsum_t tqs_thread_sleeps;		/* total thread sleeps */
 } taskq_sums_t;
 
 typedef struct taskq {
 	spinlock_t		tq_lock;	/* protects taskq_t */
 	char			*tq_name;	/* taskq name */
 	int			tq_instance;	/* instance of tq_name */
 	struct list_head	tq_thread_list;	/* list of all threads */
 	struct list_head	tq_active_list;	/* list of active threads */
 	int			tq_nactive;	/* # of active threads */
 	int			tq_nthreads;	/* # of existing threads */
 	int			tq_nspawn;	/* # of threads being spawned */
 	int			tq_maxthreads;	/* # of threads maximum */
 	/* If PERCPU flag is set, percent of NCPUs to have as threads */
 	int			tq_cpu_pct;
 	int			tq_pri;		/* priority */
 	int			tq_minalloc;	/* min taskq_ent_t pool size */
 	int			tq_maxalloc;	/* max taskq_ent_t pool size */
 	int			tq_nalloc;	/* cur taskq_ent_t pool size */
 	uint_t			tq_flags;	/* flags */
 	taskqid_t		tq_next_id;	/* next pend/work id */
 	taskqid_t		tq_lowest_id;	/* lowest pend/work id */
 	struct list_head	tq_free_list;	/* free taskq_ent_t's */
 	struct list_head	tq_pend_list;	/* pending taskq_ent_t's */
 	struct list_head	tq_prio_list;	/* priority taskq_ent_t's */
 	struct list_head	tq_delay_list;	/* delayed taskq_ent_t's */
 	struct list_head	tq_taskqs;	/* all taskq_t's */
 	wait_queue_head_t	tq_work_waitq;	/* new work waitq */
 	wait_queue_head_t	tq_wait_waitq;	/* wait waitq */
 	tq_lock_role_t		tq_lock_class;	/* class when taking tq_lock */
 	/* list node for the cpu hotplug callback */
 	struct hlist_node	tq_hp_cb_node;
 	boolean_t		tq_hp_support;
 	unsigned long		lastspawnstop;	/* when to purge dynamic */
 	taskq_sums_t		tq_sums;
 	kstat_t			*tq_ksp;
 } taskq_t;
 
 typedef struct taskq_ent {
 	spinlock_t		tqent_lock;
 	wait_queue_head_t	tqent_waitq;
 	struct timer_list	tqent_timer;
 	struct list_head	tqent_list;
 	taskqid_t		tqent_id;
 	task_func_t		*tqent_func;
 	void			*tqent_arg;
 	taskq_t			*tqent_taskq;
 	uintptr_t		tqent_flags;
 	unsigned long		tqent_birth;
 } taskq_ent_t;
 
 #define	TQENT_FLAG_PREALLOC	0x1
 #define	TQENT_FLAG_CANCEL	0x2
 
 /* bits 2-3 are which list tqent is on */
 #define	TQENT_LIST_NONE		0x0
 #define	TQENT_LIST_PENDING	0x4
 #define	TQENT_LIST_PRIORITY	0x8
 #define	TQENT_LIST_DELAY	0xc
 #define	TQENT_LIST_MASK		0xc
 
 typedef struct taskq_thread {
 	struct list_head	tqt_thread_list;
 	struct list_head	tqt_active_list;
 	struct task_struct	*tqt_thread;
 	taskq_t			*tqt_tq;
 	taskqid_t		tqt_id;
 	taskq_ent_t		*tqt_task;
 	uintptr_t		tqt_flags;
 } taskq_thread_t;
 
 /* Global system-wide dynamic task queue available for all consumers */
 extern taskq_t *system_taskq;
 /* Global dynamic task queue for long delay */
 extern taskq_t *system_delay_taskq;
 
 /* List of all taskqs */
 extern struct list_head tq_list;
 extern struct rw_semaphore tq_list_sem;
 
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *,
     uint_t, clock_t);
 extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
     taskq_ent_t *);
 extern int taskq_empty_ent(taskq_ent_t *);
 extern void taskq_init_ent(taskq_ent_t *);
 extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t);
 extern taskq_t *taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
     kthread_t ***);
 extern void taskq_destroy(taskq_t *);
 extern void taskq_wait_id(taskq_t *, taskqid_t);
 extern void taskq_wait_outstanding(taskq_t *, taskqid_t);
 extern void taskq_wait(taskq_t *);
-extern int taskq_cancel_id(taskq_t *, taskqid_t);
+extern int taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
 extern int taskq_member(taskq_t *, kthread_t *);
 extern taskq_t *taskq_of_curthread(void);
 
 #define	taskq_create_proc(name, nthreads, pri, min, max, proc, flags) \
     taskq_create(name, nthreads, pri, min, max, flags)
 #define	taskq_create_sysdc(name, nthreads, min, max, proc, dc, flags) \
 	((void) sizeof (dc), \
 	    taskq_create(name, nthreads, maxclsyspri, min, max, flags))
 
 int spl_taskq_init(void);
 void spl_taskq_fini(void);
 
 #endif  /* _SPL_TASKQ_H */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 7112d3ef5c99..26e7b5cf52a4 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -1,789 +1,789 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_CONTEXT_H
 #define	_SYS_ZFS_CONTEXT_H
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
  * This code compiles in three different contexts. When __KERNEL__ is defined,
  * the code uses "unix-like" kernel interfaces. When _STANDALONE is defined, the
  * code is running in a reduced capacity environment of the boot loader which is
  * generally a subset of both POSIX and kernel interfaces (with a few unique
  * interfaces too). When neither are defined, it's in a userland POSIX or
  * similar environment.
  */
 #if defined(__KERNEL__) || defined(_STANDALONE)
 #include <sys/types.h>
 #include <sys/atomic.h>
 #include <sys/sysmacros.h>
 #include <sys/vmsystm.h>
 #include <sys/condvar.h>
 #include <sys/cmn_err.h>
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
 #include <sys/vmem.h>
 #include <sys/misc.h>
 #include <sys/taskq.h>
 #include <sys/param.h>
 #include <sys/disp.h>
 #include <sys/debug.h>
 #include <sys/random.h>
 #include <sys/string.h>
 #include <sys/byteorder.h>
 #include <sys/list.h>
 #include <sys/time.h>
 #include <sys/zone.h>
 #include <sys/kstat.h>
 #include <sys/zfs_debug.h>
 #include <sys/sysevent.h>
 #include <sys/sysevent/eventdefs.h>
 #include <sys/zfs_delay.h>
 #include <sys/sunddi.h>
 #include <sys/ctype.h>
 #include <sys/disp.h>
 #include <sys/trace.h>
 #include <sys/procfs_list.h>
 #include <sys/mod.h>
 #include <sys/uio_impl.h>
 #include <sys/zfs_context_os.h>
 #else /* _KERNEL || _STANDALONE */
 
 #define	_SYS_MUTEX_H
 #define	_SYS_RWLOCK_H
 #define	_SYS_CONDVAR_H
 #define	_SYS_VNODE_H
 #define	_SYS_VFS_H
 #define	_SYS_SUNDDI_H
 #define	_SYS_CALLB_H
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
 #include <stdarg.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <errno.h>
 #include <string.h>
 #include <pthread.h>
 #include <setjmp.h>
 #include <assert.h>
 #include <umem.h>
 #include <limits.h>
 #include <atomic.h>
 #include <dirent.h>
 #include <time.h>
 #include <ctype.h>
 #include <signal.h>
 #include <sys/mman.h>
 #include <sys/types.h>
 #include <sys/cred.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/byteorder.h>
 #include <sys/list.h>
 #include <sys/mod.h>
 #include <sys/uio.h>
 #include <sys/zfs_debug.h>
 #include <sys/kstat.h>
 #include <sys/u8_textprep.h>
 #include <sys/sysevent.h>
 #include <sys/sysevent/eventdefs.h>
 #include <sys/sunddi.h>
 #include <sys/debug.h>
 #include <sys/utsname.h>
 #include <sys/trace_zfs.h>
 
 #include <sys/zfs_context_os.h>
 
 /*
  * Stack
  */
 
 #define	noinline	__attribute__((noinline))
 #define	likely(x)	__builtin_expect((x), 1)
 #define	unlikely(x)	__builtin_expect((x), 0)
 
 /*
  * Debugging
  */
 
 /*
  * Note that we are not using the debugging levels.
  */
 
 #define	CE_CONT		0	/* continuation		*/
 #define	CE_NOTE		1	/* notice		*/
 #define	CE_WARN		2	/* warning		*/
 #define	CE_PANIC	3	/* panic		*/
 #define	CE_IGNORE	4	/* print nothing	*/
 
 /*
  * ZFS debugging
  */
 
 extern void dprintf_setup(int *argc, char **argv);
 
 extern void cmn_err(int, const char *, ...)
     __attribute__((format(printf, 2, 3)));
 extern void vcmn_err(int, const char *, va_list)
     __attribute__((format(printf, 2, 0)));
 extern void panic(const char *, ...)
     __attribute__((format(printf, 1, 2), noreturn));
 extern void vpanic(const char *, va_list)
     __attribute__((format(printf, 1, 0), noreturn));
 
 #define	fm_panic	panic
 
 /*
  * DTrace SDT probes have different signatures in userland than they do in
  * the kernel.  If they're being used in kernel code, re-define them out of
  * existence for their counterparts in libzpool.
  *
  * Here's an example of how to use the set-error probes in userland:
  * zfs$target:::set-error /arg0 == EBUSY/ {stack();}
  *
  * Here's an example of how to use DTRACE_PROBE probes in userland:
  * If there is a probe declared as follows:
  * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn);
  * Then you can use it as follows:
  * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/
  *     {printf("%u %p\n", arg1, arg2);}
  */
 
 #ifdef DTRACE_PROBE
 #undef	DTRACE_PROBE
 #endif	/* DTRACE_PROBE */
 #define	DTRACE_PROBE(a)
 
 #ifdef DTRACE_PROBE1
 #undef	DTRACE_PROBE1
 #endif	/* DTRACE_PROBE1 */
 #define	DTRACE_PROBE1(a, b, c)
 
 #ifdef DTRACE_PROBE2
 #undef	DTRACE_PROBE2
 #endif	/* DTRACE_PROBE2 */
 #define	DTRACE_PROBE2(a, b, c, d, e)
 
 #ifdef DTRACE_PROBE3
 #undef	DTRACE_PROBE3
 #endif	/* DTRACE_PROBE3 */
 #define	DTRACE_PROBE3(a, b, c, d, e, f, g)
 
 #ifdef DTRACE_PROBE4
 #undef	DTRACE_PROBE4
 #endif	/* DTRACE_PROBE4 */
 #define	DTRACE_PROBE4(a, b, c, d, e, f, g, h, i)
 
 /*
  * Threads.
  */
 typedef pthread_t	kthread_t;
 
 #define	TS_RUN		0x00000002
 #define	TS_JOINABLE	0x00000004
 
 #define	curthread	((void *)(uintptr_t)pthread_self())
 #define	getcomm()	"unknown"
 
 #define	thread_create_named(name, stk, stksize, func, arg, len, \
     pp, state, pri)	\
 	zk_thread_create(name, func, arg, stksize, state)
 #define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
 	zk_thread_create(#func, func, arg, stksize, state)
 #define	thread_exit()	pthread_exit(NULL)
 #define	thread_join(t)	pthread_join((pthread_t)(t), NULL)
 
 #define	newproc(f, a, cid, pri, ctp, pid)	(ENOSYS)
 /*
  * Check if the current thread is a memory reclaim thread.
  * Always returns false in userspace (no memory reclaim thread).
  */
 #define	current_is_reclaim_thread()	(0)
 
 /* in libzpool, p0 exists only to have its address taken */
 typedef struct proc {
 	uintptr_t	this_is_never_used_dont_dereference_it;
 } proc_t;
 
 extern struct proc p0;
 #define	curproc		(&p0)
 
 #define	PS_NONE		-1
 
 extern kthread_t *zk_thread_create(const char *name, void (*func)(void *),
     void *arg, size_t stksize, int state);
 
 #define	issig()		(FALSE)
 
 #define	KPREEMPT_SYNC		(-1)
 
 #define	kpreempt(x)		sched_yield()
 #define	kpreempt_disable()	((void)0)
 #define	kpreempt_enable()	((void)0)
 
 /*
  * Mutexes
  */
 typedef struct kmutex {
 	pthread_mutex_t		m_lock;
 	pthread_t		m_owner;
 } kmutex_t;
 
 #define	MUTEX_DEFAULT		0
 #define	MUTEX_NOLOCKDEP		MUTEX_DEFAULT
 #define	MUTEX_HELD(mp)		pthread_equal((mp)->m_owner, pthread_self())
 #define	MUTEX_NOT_HELD(mp)	!MUTEX_HELD(mp)
 
 extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie);
 extern void mutex_destroy(kmutex_t *mp);
 extern void mutex_enter(kmutex_t *mp);
 extern int mutex_enter_check_return(kmutex_t *mp);
 extern void mutex_exit(kmutex_t *mp);
 extern int mutex_tryenter(kmutex_t *mp);
 
 #define	NESTED_SINGLE 1
 #define	mutex_enter_nested(mp, class) mutex_enter(mp)
 #define	mutex_enter_interruptible(mp) mutex_enter_check_return(mp)
 /*
  * RW locks
  */
 typedef struct krwlock {
 	pthread_rwlock_t	rw_lock;
 	pthread_t		rw_owner;
 	uint_t			rw_readers;
 } krwlock_t;
 
 typedef int krw_t;
 
 #define	RW_READER		0
 #define	RW_WRITER		1
 #define	RW_DEFAULT		RW_READER
 #define	RW_NOLOCKDEP		RW_READER
 
 #define	RW_READ_HELD(rw)	((rw)->rw_readers > 0)
 #define	RW_WRITE_HELD(rw)	pthread_equal((rw)->rw_owner, pthread_self())
 #define	RW_LOCK_HELD(rw)	(RW_READ_HELD(rw) || RW_WRITE_HELD(rw))
 
 extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg);
 extern void rw_destroy(krwlock_t *rwlp);
 extern void rw_enter(krwlock_t *rwlp, krw_t rw);
 extern int rw_tryenter(krwlock_t *rwlp, krw_t rw);
 extern int rw_tryupgrade(krwlock_t *rwlp);
 extern void rw_exit(krwlock_t *rwlp);
 #define	rw_downgrade(rwlp) do { } while (0)
 
 /*
  * Credentials
  */
 extern uid_t crgetuid(cred_t *cr);
 extern uid_t crgetruid(cred_t *cr);
 extern gid_t crgetgid(cred_t *cr);
 extern int crgetngroups(cred_t *cr);
 extern gid_t *crgetgroups(cred_t *cr);
 
 /*
  * Condition variables
  */
 typedef pthread_cond_t		kcondvar_t;
 
 #define	CV_DEFAULT		0
 #define	CALLOUT_FLAG_ABSOLUTE	0x2
 
 extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
 extern void cv_destroy(kcondvar_t *cv);
 extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
 extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp);
 extern int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
 extern int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
     hrtime_t res, int flag);
 extern void cv_signal(kcondvar_t *cv);
 extern void cv_broadcast(kcondvar_t *cv);
 
 #define	cv_timedwait_io(cv, mp, at)		cv_timedwait(cv, mp, at)
 #define	cv_timedwait_idle(cv, mp, at)		cv_timedwait(cv, mp, at)
 #define	cv_timedwait_sig(cv, mp, at)		cv_timedwait(cv, mp, at)
 #define	cv_wait_io(cv, mp)			cv_wait(cv, mp)
 #define	cv_wait_idle(cv, mp)			cv_wait(cv, mp)
 #define	cv_wait_io_sig(cv, mp)			cv_wait_sig(cv, mp)
 #define	cv_timedwait_sig_hires(cv, mp, t, r, f) \
 	cv_timedwait_hires(cv, mp, t, r, f)
 #define	cv_timedwait_idle_hires(cv, mp, t, r, f) \
 	cv_timedwait_hires(cv, mp, t, r, f)
 
 /*
  * Thread-specific data
  */
 #define	tsd_get(k) pthread_getspecific(k)
 #define	tsd_set(k, v) pthread_setspecific(k, v)
 #define	tsd_create(kp, d) pthread_key_create((pthread_key_t *)kp, d)
 #define	tsd_destroy(kp) /* nothing */
 #ifdef __FreeBSD__
 typedef off_t loff_t;
 #endif
 
 /*
  * kstat creation, installation and deletion
  */
 extern kstat_t *kstat_create(const char *, int,
     const char *, const char *, uchar_t, ulong_t, uchar_t);
 extern void kstat_install(kstat_t *);
 extern void kstat_delete(kstat_t *);
 extern void kstat_set_raw_ops(kstat_t *ksp,
     int (*headers)(char *buf, size_t size),
     int (*data)(char *buf, size_t size, void *data),
     void *(*addr)(kstat_t *ksp, loff_t index));
 
 /*
  * procfs list manipulation
  */
 
 typedef struct procfs_list {
 	void		*pl_private;
 	kmutex_t	pl_lock;
 	list_t		pl_list;
 	uint64_t	pl_next_id;
 	size_t		pl_node_offset;
 } procfs_list_t;
 
 #ifndef __cplusplus
 struct seq_file { };
 void seq_printf(struct seq_file *m, const char *fmt, ...);
 
 typedef struct procfs_list_node {
 	list_node_t	pln_link;
 	uint64_t	pln_id;
 } procfs_list_node_t;
 
 void procfs_list_install(const char *module,
     const char *submodule,
     const char *name,
     mode_t mode,
     procfs_list_t *procfs_list,
     int (*show)(struct seq_file *f, void *p),
     int (*show_header)(struct seq_file *f),
     int (*clear)(procfs_list_t *procfs_list),
     size_t procfs_list_node_off);
 void procfs_list_uninstall(procfs_list_t *procfs_list);
 void procfs_list_destroy(procfs_list_t *procfs_list);
 void procfs_list_add(procfs_list_t *procfs_list, void *p);
 #endif
 
 /*
  * Kernel memory
  */
 #define	KM_SLEEP		UMEM_NOFAIL
 #define	KM_PUSHPAGE		KM_SLEEP
 #define	KM_NOSLEEP		UMEM_DEFAULT
 #define	KM_NORMALPRI		0	/* not needed with UMEM_DEFAULT */
 #define	KMC_NODEBUG		UMC_NODEBUG
 #define	KMC_KVMEM		0x0
 #define	KMC_RECLAIMABLE		0x0
 #define	kmem_alloc(_s, _f)	umem_alloc(_s, _f)
 #define	kmem_zalloc(_s, _f)	umem_zalloc(_s, _f)
 #define	kmem_free(_b, _s)	umem_free(_b, _s)
 #define	vmem_alloc(_s, _f)	kmem_alloc(_s, _f)
 #define	vmem_zalloc(_s, _f)	kmem_zalloc(_s, _f)
 #define	vmem_free(_b, _s)	kmem_free(_b, _s)
 #define	kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \
 	umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i)
 #define	kmem_cache_destroy(_c)	umem_cache_destroy(_c)
 #define	kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
 #define	kmem_cache_free(_c, _b)	umem_cache_free(_c, _b)
 #define	kmem_debugging()	0
 #define	kmem_cache_reap_now(_c)	umem_cache_reap_now(_c);
 #define	kmem_cache_set_move(_c, _cb)	/* nothing */
 #define	POINTER_INVALIDATE(_pp)		/* nothing */
 #define	POINTER_IS_VALID(_p)	0
 
 typedef umem_cache_t kmem_cache_t;
 
 typedef enum kmem_cbrc {
 	KMEM_CBRC_YES,
 	KMEM_CBRC_NO,
 	KMEM_CBRC_LATER,
 	KMEM_CBRC_DONT_NEED,
 	KMEM_CBRC_DONT_KNOW
 } kmem_cbrc_t;
 
 /*
  * Task queues
  */
 
 #define	TASKQ_NAMELEN	31
 
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
 typedef struct taskq_ent {
 	struct taskq_ent	*tqent_next;
 	struct taskq_ent	*tqent_prev;
 	task_func_t		*tqent_func;
 	void			*tqent_arg;
 	uintptr_t		tqent_flags;
 } taskq_ent_t;
 
 typedef struct taskq {
 	char		tq_name[TASKQ_NAMELEN + 1];
 	kmutex_t	tq_lock;
 	krwlock_t	tq_threadlock;
 	kcondvar_t	tq_dispatch_cv;
 	kcondvar_t	tq_wait_cv;
 	kthread_t	**tq_threadlist;
 	int		tq_flags;
 	int		tq_active;
 	int		tq_nthreads;
 	int		tq_nalloc;
 	int		tq_minalloc;
 	int		tq_maxalloc;
 	kcondvar_t	tq_maxalloc_cv;
 	int		tq_maxalloc_wait;
 	taskq_ent_t	*tq_freelist;
 	taskq_ent_t	tq_task;
 } taskq_t;
 
 #define	TQENT_FLAG_PREALLOC	0x1	/* taskq_dispatch_ent used */
 
 #define	TASKQ_PREPOPULATE	0x0001
 #define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
 #define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
 #define	TASKQ_THREADS_CPU_PCT	0x0008	/* Scale # threads by # cpus */
 #define	TASKQ_DC_BATCH		0x0010	/* Mark threads as batch */
 
 #define	TQ_SLEEP	KM_SLEEP	/* Can block for memory */
 #define	TQ_NOSLEEP	KM_NOSLEEP	/* cannot block for memory; may fail */
 #define	TQ_NOQUEUE	0x02		/* Do not enqueue if can't dispatch */
 #define	TQ_FRONT	0x08		/* Queue in front */
 
 #define	TASKQID_INVALID		((taskqid_t)0)
 
 extern taskq_t *system_taskq;
 extern taskq_t *system_delay_taskq;
 
 extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
 extern taskq_t	*taskq_create_synced(const char *, int, pri_t, int, int, uint_t,
     kthread_t ***);
 #define	taskq_create_proc(a, b, c, d, e, p, f) \
 	    (taskq_create(a, b, c, d, e, f))
 #define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
 	    ((void) sizeof (dc), taskq_create(a, b, maxclsyspri, d, e, f))
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t,
     clock_t);
 extern void	taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
     taskq_ent_t *);
 extern int	taskq_empty_ent(taskq_ent_t *);
 extern void	taskq_init_ent(taskq_ent_t *);
 extern void	taskq_destroy(taskq_t *);
 extern void	taskq_wait(taskq_t *);
 extern void	taskq_wait_id(taskq_t *, taskqid_t);
 extern void	taskq_wait_outstanding(taskq_t *, taskqid_t);
 extern int	taskq_member(taskq_t *, kthread_t *);
 extern taskq_t	*taskq_of_curthread(void);
-extern int	taskq_cancel_id(taskq_t *, taskqid_t);
+extern int	taskq_cancel_id(taskq_t *, taskqid_t, boolean_t);
 extern void	system_taskq_init(void);
 extern void	system_taskq_fini(void);
 
 #define	XVA_MAPSIZE	3
 #define	XVA_MAGIC	0x78766174
 
 extern char *vn_dumpdir;
 #define	AV_SCANSTAMP_SZ	32		/* length of anti-virus scanstamp */
 
 typedef struct xoptattr {
 	inode_timespec_t xoa_createtime;	/* Create time of file */
 	uint8_t		xoa_archive;
 	uint8_t		xoa_system;
 	uint8_t		xoa_readonly;
 	uint8_t		xoa_hidden;
 	uint8_t		xoa_nounlink;
 	uint8_t		xoa_immutable;
 	uint8_t		xoa_appendonly;
 	uint8_t		xoa_nodump;
 	uint8_t		xoa_settable;
 	uint8_t		xoa_opaque;
 	uint8_t		xoa_av_quarantined;
 	uint8_t		xoa_av_modified;
 	uint8_t		xoa_av_scanstamp[AV_SCANSTAMP_SZ];
 	uint8_t		xoa_reparse;
 	uint8_t		xoa_offline;
 	uint8_t		xoa_sparse;
 } xoptattr_t;
 
 typedef struct vattr {
 	uint_t		va_mask;	/* bit-mask of attributes */
 	u_offset_t	va_size;	/* file size in bytes */
 } vattr_t;
 
 
 typedef struct xvattr {
 	vattr_t		xva_vattr;	/* Embedded vattr structure */
 	uint32_t	xva_magic;	/* Magic Number */
 	uint32_t	xva_mapsize;	/* Size of attr bitmap (32-bit words) */
 	uint32_t	*xva_rtnattrmapp;	/* Ptr to xva_rtnattrmap[] */
 	uint32_t	xva_reqattrmap[XVA_MAPSIZE];	/* Requested attrs */
 	uint32_t	xva_rtnattrmap[XVA_MAPSIZE];	/* Returned attrs */
 	xoptattr_t	xva_xoptattrs;	/* Optional attributes */
 } xvattr_t;
 
 typedef struct vsecattr {
 	uint_t		vsa_mask;	/* See below */
 	int		vsa_aclcnt;	/* ACL entry count */
 	void		*vsa_aclentp;	/* pointer to ACL entries */
 	int		vsa_dfaclcnt;	/* default ACL entry count */
 	void		*vsa_dfaclentp;	/* pointer to default ACL entries */
 	size_t		vsa_aclentsz;	/* ACE size in bytes of vsa_aclentp */
 } vsecattr_t;
 
 #define	AT_MODE		0x00002
 #define	AT_UID		0x00004
 #define	AT_GID		0x00008
 #define	AT_FSID		0x00010
 #define	AT_NODEID	0x00020
 #define	AT_NLINK	0x00040
 #define	AT_SIZE		0x00080
 #define	AT_ATIME	0x00100
 #define	AT_MTIME	0x00200
 #define	AT_CTIME	0x00400
 #define	AT_RDEV		0x00800
 #define	AT_BLKSIZE	0x01000
 #define	AT_NBLOCKS	0x02000
 #define	AT_SEQ		0x08000
 #define	AT_XVATTR	0x10000
 
 #define	CRCREAT		0
 
 #define	F_FREESP	11
 #define	FIGNORECASE	0x80000 /* request case-insensitive lookups */
 
 /*
  * Random stuff
  */
 #define	ddi_get_lbolt()		(gethrtime() >> 23)
 #define	ddi_get_lbolt64()	(gethrtime() >> 23)
 #define	hz	119	/* frequency when using gethrtime() >> 23 for lbolt */
 
 #define	ddi_time_before(a, b)		(a < b)
 #define	ddi_time_after(a, b)		ddi_time_before(b, a)
 #define	ddi_time_before_eq(a, b)	(!ddi_time_after(a, b))
 #define	ddi_time_after_eq(a, b)		ddi_time_before_eq(b, a)
 
 #define	ddi_time_before64(a, b)		(a < b)
 #define	ddi_time_after64(a, b)		ddi_time_before64(b, a)
 #define	ddi_time_before_eq64(a, b)	(!ddi_time_after64(a, b))
 #define	ddi_time_after_eq64(a, b)	ddi_time_before_eq64(b, a)
 
 extern void delay(clock_t ticks);
 
 #define	SEC_TO_TICK(sec)	((sec) * hz)
 #define	MSEC_TO_TICK(msec)	(howmany((hrtime_t)(msec) * hz, MILLISEC))
 #define	USEC_TO_TICK(usec)	(howmany((hrtime_t)(usec) * hz, MICROSEC))
 #define	NSEC_TO_TICK(nsec)	(howmany((hrtime_t)(nsec) * hz, NANOSEC))
 
 #define	max_ncpus	64
 #define	boot_ncpus	(sysconf(_SC_NPROCESSORS_ONLN))
 
 /*
  * Process priorities as defined by setpriority(2) and getpriority(2).
  */
 #define	minclsyspri	19
 #define	defclsyspri	0
 /* Write issue taskq priority. */
 #define	wtqclsyspri	-19
 #define	maxclsyspri	-20
 
 #define	CPU_SEQID	((uintptr_t)pthread_self() & (max_ncpus - 1))
 #define	CPU_SEQID_UNSTABLE	CPU_SEQID
 
 #define	kcred		NULL
 #define	CRED()		NULL
 
 #define	crhold(cr)	((void)cr)
 #define	crfree(cr)	((void)cr)
 
 #define	ptob(x)		((x) * PAGESIZE)
 
 #define	NN_DIVISOR_1000	(1U << 0)
 #define	NN_NUMBUF_SZ	(6)
 
 extern uint64_t physmem;
 extern const char *random_path;
 extern const char *urandom_path;
 
 extern int highbit64(uint64_t i);
 extern int lowbit64(uint64_t i);
 extern int random_get_bytes(uint8_t *ptr, size_t len);
 extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
 
 static __inline__ uint32_t
 random_in_range(uint32_t range)
 {
 	uint32_t r;
 
 	ASSERT(range != 0);
 
 	if (range == 1)
 		return (0);
 
 	(void) random_get_pseudo_bytes((uint8_t *)&r, sizeof (r));
 
 	return (r % range);
 }
 
 extern void kernel_init(int mode);
 extern void kernel_fini(void);
 extern void random_init(void);
 extern void random_fini(void);
 
 struct spa;
 extern void show_pool_stats(struct spa *);
 extern int handle_tunable_option(const char *, boolean_t);
 
 typedef struct callb_cpr {
 	kmutex_t	*cc_lockp;
 } callb_cpr_t;
 
 #define	CALLB_CPR_INIT(cp, lockp, func, name)	{		\
 	(cp)->cc_lockp = lockp;					\
 }
 
 #define	CALLB_CPR_SAFE_BEGIN(cp) {				\
 	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
 }
 
 #define	CALLB_CPR_SAFE_END(cp, lockp) {				\
 	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
 }
 
 #define	CALLB_CPR_EXIT(cp) {					\
 	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
 	mutex_exit((cp)->cc_lockp);				\
 }
 
 #define	zone_dataset_visible(x, y)	(1)
 #define	INGLOBALZONE(z)			(1)
 extern uint32_t zone_get_hostid(void *zonep);
 
 extern char *kmem_vasprintf(const char *fmt, va_list adx);
 extern char *kmem_asprintf(const char *fmt, ...);
 #define	kmem_strfree(str) kmem_free((str), strlen(str) + 1)
 #define	kmem_strdup(s)  strdup(s)
 
 #ifndef __cplusplus
 extern int kmem_scnprintf(char *restrict str, size_t size,
     const char *restrict fmt, ...);
 #endif
 
 /*
  * Hostname information
  */
 extern int ddi_strtoull(const char *str, char **nptr, int base,
     u_longlong_t *result);
 
 typedef struct utsname	utsname_t;
 extern utsname_t *utsname(void);
 
 /* ZFS Boot Related stuff. */
 
 struct _buf {
 	intptr_t	_fd;
 };
 
 struct bootstat {
 	uint64_t st_size;
 };
 
 typedef struct ace_object {
 	uid_t		a_who;
 	uint32_t	a_access_mask;
 	uint16_t	a_flags;
 	uint16_t	a_type;
 	uint8_t		a_obj_type[16];
 	uint8_t		a_inherit_obj_type[16];
 } ace_object_t;
 
 
 #define	ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE	0x05
 #define	ACE_ACCESS_DENIED_OBJECT_ACE_TYPE	0x06
 #define	ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE	0x07
 #define	ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE	0x08
 
 extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr);
 extern int zfs_secpolicy_rename_perms(const char *from, const char *to,
     cred_t *cr);
 extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
 extern int secpolicy_zfs(const cred_t *cr);
 extern zoneid_t getzoneid(void);
 
 /* SID stuff */
 typedef struct ksiddomain {
 	uint_t	kd_ref;
 	uint_t	kd_len;
 	char	*kd_name;
 } ksiddomain_t;
 
 ksiddomain_t *ksid_lookupdomain(const char *);
 void ksiddomain_rele(ksiddomain_t *);
 
 #define	DDI_SLEEP	KM_SLEEP
 #define	ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \
 	sysevent_post_event(_c, _d, _b, "libzpool", _e, _f)
 
 #define	zfs_sleep_until(wakeup)						\
 	do {								\
-		hrtime_t delta = wakeup - gethrtime();			\
+		hrtime_t delta = (wakeup) - gethrtime();		\
 		struct timespec ts;					\
 		ts.tv_sec = delta / NANOSEC;				\
 		ts.tv_nsec = delta % NANOSEC;				\
 		(void) nanosleep(&ts, NULL);				\
 	} while (0)
 
 typedef int fstrans_cookie_t;
 
 extern fstrans_cookie_t spl_fstrans_mark(void);
 extern void spl_fstrans_unmark(fstrans_cookie_t);
 extern int kmem_cache_reap_active(void);
 
 
 /*
  * Kernel modules
  */
 #define	__init
 #define	__exit
 
 #endif  /* _KERNEL || _STANDALONE */
 
 #ifdef __cplusplus
 };
 #endif
 
 #endif	/* _SYS_ZFS_CONTEXT_H */
diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c
index 0457de21fa18..ff9bf61538ae 100644
--- a/lib/libzpool/taskq.c
+++ b/lib/libzpool/taskq.c
@@ -1,417 +1,417 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 /*
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright 2012 Garrett D'Amore <garrett@damore.org>.  All rights reserved.
  * Copyright (c) 2014 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 
 int taskq_now;
 taskq_t *system_taskq;
 taskq_t *system_delay_taskq;
 
 static pthread_key_t taskq_tsd;
 
 #define	TASKQ_ACTIVE	0x00010000
 
 static taskq_ent_t *
 task_alloc(taskq_t *tq, int tqflags)
 {
 	taskq_ent_t *t;
 	int rv;
 
 again:	if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) {
 		ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
 		tq->tq_freelist = t->tqent_next;
 	} else {
 		if (tq->tq_nalloc >= tq->tq_maxalloc) {
 			if (!(tqflags & KM_SLEEP))
 				return (NULL);
 
 			/*
 			 * We don't want to exceed tq_maxalloc, but we can't
 			 * wait for other tasks to complete (and thus free up
 			 * task structures) without risking deadlock with
 			 * the caller.  So, we just delay for one second
 			 * to throttle the allocation rate. If we have tasks
 			 * complete before one second timeout expires then
 			 * taskq_ent_free will signal us and we will
 			 * immediately retry the allocation.
 			 */
 			tq->tq_maxalloc_wait++;
 			rv = cv_timedwait(&tq->tq_maxalloc_cv,
 			    &tq->tq_lock, ddi_get_lbolt() + hz);
 			tq->tq_maxalloc_wait--;
 			if (rv > 0)
 				goto again;		/* signaled */
 		}
 		mutex_exit(&tq->tq_lock);
 
 		t = kmem_alloc(sizeof (taskq_ent_t), tqflags);
 
 		mutex_enter(&tq->tq_lock);
 		if (t != NULL) {
 			/* Make sure we start without any flags */
 			t->tqent_flags = 0;
 			tq->tq_nalloc++;
 		}
 	}
 	return (t);
 }
 
 static void
 task_free(taskq_t *tq, taskq_ent_t *t)
 {
 	if (tq->tq_nalloc <= tq->tq_minalloc) {
 		t->tqent_next = tq->tq_freelist;
 		tq->tq_freelist = t;
 	} else {
 		tq->tq_nalloc--;
 		mutex_exit(&tq->tq_lock);
 		kmem_free(t, sizeof (taskq_ent_t));
 		mutex_enter(&tq->tq_lock);
 	}
 
 	if (tq->tq_maxalloc_wait)
 		cv_signal(&tq->tq_maxalloc_cv);
 }
 
 taskqid_t
 taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags)
 {
 	taskq_ent_t *t;
 
 	if (taskq_now) {
 		func(arg);
 		return (1);
 	}
 
 	mutex_enter(&tq->tq_lock);
 	ASSERT(tq->tq_flags & TASKQ_ACTIVE);
 	if ((t = task_alloc(tq, tqflags)) == NULL) {
 		mutex_exit(&tq->tq_lock);
 		return (0);
 	}
 	if (tqflags & TQ_FRONT) {
 		t->tqent_next = tq->tq_task.tqent_next;
 		t->tqent_prev = &tq->tq_task;
 	} else {
 		t->tqent_next = &tq->tq_task;
 		t->tqent_prev = tq->tq_task.tqent_prev;
 	}
 	t->tqent_next->tqent_prev = t;
 	t->tqent_prev->tqent_next = t;
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	t->tqent_flags = 0;
 	cv_signal(&tq->tq_dispatch_cv);
 	mutex_exit(&tq->tq_lock);
 	return (1);
 }
 
 taskqid_t
 taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags,
     clock_t expire_time)
 {
 	(void) tq, (void) func, (void) arg, (void) tqflags, (void) expire_time;
 	return (0);
 }
 
 int
 taskq_empty_ent(taskq_ent_t *t)
 {
 	return (t->tqent_next == NULL);
 }
 
 void
 taskq_init_ent(taskq_ent_t *t)
 {
 	t->tqent_next = NULL;
 	t->tqent_prev = NULL;
 	t->tqent_func = NULL;
 	t->tqent_arg = NULL;
 	t->tqent_flags = 0;
 }
 
 void
 taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
     taskq_ent_t *t)
 {
 	ASSERT(func != NULL);
 
 	/*
 	 * Mark it as a prealloc'd task.  This is important
 	 * to ensure that we don't free it later.
 	 */
 	t->tqent_flags |= TQENT_FLAG_PREALLOC;
 	/*
 	 * Enqueue the task to the underlying queue.
 	 */
 	mutex_enter(&tq->tq_lock);
 
 	if (flags & TQ_FRONT) {
 		t->tqent_next = tq->tq_task.tqent_next;
 		t->tqent_prev = &tq->tq_task;
 	} else {
 		t->tqent_next = &tq->tq_task;
 		t->tqent_prev = tq->tq_task.tqent_prev;
 	}
 	t->tqent_next->tqent_prev = t;
 	t->tqent_prev->tqent_next = t;
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	cv_signal(&tq->tq_dispatch_cv);
 	mutex_exit(&tq->tq_lock);
 }
 
 void
 taskq_wait(taskq_t *tq)
 {
 	mutex_enter(&tq->tq_lock);
 	while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0)
 		cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
 	mutex_exit(&tq->tq_lock);
 }
 
 void
 taskq_wait_id(taskq_t *tq, taskqid_t id)
 {
 	(void) id;
 	taskq_wait(tq);
 }
 
 void
 taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
 {
 	(void) id;
 	taskq_wait(tq);
 }
 
 static __attribute__((noreturn)) void
 taskq_thread(void *arg)
 {
 	taskq_t *tq = arg;
 	taskq_ent_t *t;
 	boolean_t prealloc;
 
 	VERIFY0(pthread_setspecific(taskq_tsd, tq));
 
 	mutex_enter(&tq->tq_lock);
 	while (tq->tq_flags & TASKQ_ACTIVE) {
 		if ((t = tq->tq_task.tqent_next) == &tq->tq_task) {
 			if (--tq->tq_active == 0)
 				cv_broadcast(&tq->tq_wait_cv);
 			cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock);
 			tq->tq_active++;
 			continue;
 		}
 		t->tqent_prev->tqent_next = t->tqent_next;
 		t->tqent_next->tqent_prev = t->tqent_prev;
 		t->tqent_next = NULL;
 		t->tqent_prev = NULL;
 		prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC;
 		mutex_exit(&tq->tq_lock);
 
 		rw_enter(&tq->tq_threadlock, RW_READER);
 		t->tqent_func(t->tqent_arg);
 		rw_exit(&tq->tq_threadlock);
 
 		mutex_enter(&tq->tq_lock);
 		if (!prealloc)
 			task_free(tq, t);
 	}
 	tq->tq_nthreads--;
 	cv_broadcast(&tq->tq_wait_cv);
 	mutex_exit(&tq->tq_lock);
 	thread_exit();
 }
 
 taskq_t *
 taskq_create(const char *name, int nthreads, pri_t pri,
     int minalloc, int maxalloc, uint_t flags)
 {
 	(void) pri;
 	taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP);
 	int t;
 
 	if (flags & TASKQ_THREADS_CPU_PCT) {
 		int pct;
 		ASSERT3S(nthreads, >=, 0);
 		ASSERT3S(nthreads, <=, 100);
 		pct = MIN(nthreads, 100);
 		pct = MAX(pct, 0);
 
 		nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100;
 		nthreads = MAX(nthreads, 1);	/* need at least 1 thread */
 	} else {
 		ASSERT3S(nthreads, >=, 1);
 	}
 
 	rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL);
 	mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL);
 	cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL);
 	(void) strlcpy(tq->tq_name, name, sizeof (tq->tq_name));
 	tq->tq_flags = flags | TASKQ_ACTIVE;
 	tq->tq_active = nthreads;
 	tq->tq_nthreads = nthreads;
 	tq->tq_minalloc = minalloc;
 	tq->tq_maxalloc = maxalloc;
 	tq->tq_task.tqent_next = &tq->tq_task;
 	tq->tq_task.tqent_prev = &tq->tq_task;
 	tq->tq_threadlist = kmem_alloc(nthreads * sizeof (kthread_t *),
 	    KM_SLEEP);
 
 	if (flags & TASKQ_PREPOPULATE) {
 		mutex_enter(&tq->tq_lock);
 		while (minalloc-- > 0)
 			task_free(tq, task_alloc(tq, KM_SLEEP));
 		mutex_exit(&tq->tq_lock);
 	}
 
 	for (t = 0; t < nthreads; t++)
 		VERIFY((tq->tq_threadlist[t] = thread_create_named(tq->tq_name,
 		    NULL, 0, taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL);
 
 	return (tq);
 }
 
 void
 taskq_destroy(taskq_t *tq)
 {
 	int nthreads = tq->tq_nthreads;
 
 	taskq_wait(tq);
 
 	mutex_enter(&tq->tq_lock);
 
 	tq->tq_flags &= ~TASKQ_ACTIVE;
 	cv_broadcast(&tq->tq_dispatch_cv);
 
 	while (tq->tq_nthreads != 0)
 		cv_wait(&tq->tq_wait_cv, &tq->tq_lock);
 
 	tq->tq_minalloc = 0;
 	while (tq->tq_nalloc != 0) {
 		ASSERT(tq->tq_freelist != NULL);
 		taskq_ent_t *tqent_nexttq = tq->tq_freelist->tqent_next;
 		task_free(tq, tq->tq_freelist);
 		tq->tq_freelist = tqent_nexttq;
 	}
 
 	mutex_exit(&tq->tq_lock);
 
 	kmem_free(tq->tq_threadlist, nthreads * sizeof (kthread_t *));
 
 	rw_destroy(&tq->tq_threadlock);
 	mutex_destroy(&tq->tq_lock);
 	cv_destroy(&tq->tq_dispatch_cv);
 	cv_destroy(&tq->tq_wait_cv);
 	cv_destroy(&tq->tq_maxalloc_cv);
 
 	kmem_free(tq, sizeof (taskq_t));
 }
 
 /*
  * Create a taskq with a specified number of pool threads. Allocate
  * and return an array of nthreads kthread_t pointers, one for each
  * thread in the pool. The array is not ordered and must be freed
  * by the caller.
  */
 taskq_t *
 taskq_create_synced(const char *name, int nthreads, pri_t pri,
     int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
 {
 	taskq_t *tq;
 	kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
 	    KM_SLEEP);
 
 	(void) pri; (void) minalloc; (void) maxalloc;
 
 	flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
 
 	tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
 	    flags | TASKQ_PREPOPULATE);
 	VERIFY(tq != NULL);
 	VERIFY(tq->tq_nthreads == nthreads);
 
 	for (int i = 0; i < nthreads; i++) {
 		kthreads[i] = tq->tq_threadlist[i];
 	}
 	*ktpp = kthreads;
 	return (tq);
 }
 
 int
 taskq_member(taskq_t *tq, kthread_t *t)
 {
 	int i;
 
 	if (taskq_now)
 		return (1);
 
 	for (i = 0; i < tq->tq_nthreads; i++)
 		if (tq->tq_threadlist[i] == t)
 			return (1);
 
 	return (0);
 }
 
 taskq_t *
 taskq_of_curthread(void)
 {
 	return (pthread_getspecific(taskq_tsd));
 }
 
 int
-taskq_cancel_id(taskq_t *tq, taskqid_t id)
+taskq_cancel_id(taskq_t *tq, taskqid_t id, boolean_t wait)
 {
-	(void) tq, (void) id;
+	(void) tq, (void) id, (void) wait;
 	return (ENOENT);
 }
 
 void
 system_taskq_init(void)
 {
 	VERIFY0(pthread_key_create(&taskq_tsd, NULL));
 	system_taskq = taskq_create("system_taskq", 64, maxclsyspri, 4, 512,
 	    TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
 	system_delay_taskq = taskq_create("delay_taskq", 4, maxclsyspri, 4,
 	    512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE);
 }
 
 void
 system_taskq_fini(void)
 {
 	taskq_destroy(system_taskq);
 	system_taskq = NULL; /* defensive */
 	taskq_destroy(system_delay_taskq);
 	system_delay_taskq = NULL;
 	VERIFY0(pthread_key_delete(taskq_tsd));
 }
diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c
index 78d9c8257208..3caa4c506183 100644
--- a/module/os/freebsd/spl/spl_taskq.c
+++ b/module/os/freebsd/spl/spl_taskq.c
@@ -1,528 +1,535 @@
 // SPDX-License-Identifier: BSD-2-Clause
 /*
  * Copyright (c) 2009 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Copyright (c) 2012 Spectra Logic Corporation.  All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  */
 
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/kmem.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/queue.h>
 #include <sys/taskq.h>
 #include <sys/taskqueue.h>
 #include <sys/zfs_context.h>
 
 #if defined(__i386__) || defined(__amd64__) || defined(__aarch64__)
 #include <machine/pcb.h>
 #endif
 
 #include <vm/uma.h>
 
 static uint_t taskq_tsd;
 static uma_zone_t taskq_zone;
 
 /*
  * Global system-wide dynamic task queue available for all consumers. This
  * taskq is not intended for long-running tasks; instead, a dedicated taskq
  * should be created.
  */
 taskq_t *system_taskq = NULL;
 taskq_t *system_delay_taskq = NULL;
 taskq_t *dynamic_taskq = NULL;
 
 proc_t *system_proc;
 
 static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures");
 
 static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;
 static unsigned long tqenthash;
 static unsigned long tqenthashlock;
 static struct sx *tqenthashtbl_lock;
 
 static taskqid_t tqidnext;
 
 #define	TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash])
 #define	TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)])
 
 #define	NORMAL_TASK 0
 #define	TIMEOUT_TASK 1
 
 static void
 system_taskq_init(void *arg)
 {
 	int i;
 
 	tsd_create(&taskq_tsd, NULL);
 	tqenthashtbl = hashinit(mp_ncpus * 8, M_TASKQ, &tqenthash);
 	tqenthashlock = (tqenthash + 1) / 8;
 	if (tqenthashlock > 0)
 		tqenthashlock--;
 	tqenthashtbl_lock =
 	    malloc(sizeof (*tqenthashtbl_lock) * (tqenthashlock + 1),
 	    M_TASKQ, M_WAITOK | M_ZERO);
 	for (i = 0; i < tqenthashlock + 1; i++)
 		sx_init_flags(&tqenthashtbl_lock[i], "tqenthash", SX_DUPOK);
 	taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t),
 	    NULL, NULL, NULL, NULL,
 	    UMA_ALIGN_CACHE, 0);
 	system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri,
 	    0, 0, 0);
 	system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus,
 	    minclsyspri, 0, 0, 0);
 }
 SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init,
     NULL);
 
 static void
 system_taskq_fini(void *arg)
 {
 	int i;
 
 	taskq_destroy(system_delay_taskq);
 	taskq_destroy(system_taskq);
 	uma_zdestroy(taskq_zone);
 	tsd_destroy(&taskq_tsd);
 	for (i = 0; i < tqenthashlock + 1; i++)
 		sx_destroy(&tqenthashtbl_lock[i]);
 	for (i = 0; i < tqenthash + 1; i++)
 		VERIFY(LIST_EMPTY(&tqenthashtbl[i]));
 	free(tqenthashtbl_lock, M_TASKQ);
 	free(tqenthashtbl, M_TASKQ);
 }
 SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini,
     NULL);
 
 #ifdef __LP64__
 static taskqid_t
 __taskq_genid(void)
 {
 	taskqid_t tqid;
 
 	/*
 	 * Assume a 64-bit counter will not wrap in practice.
 	 */
 	tqid = atomic_add_64_nv(&tqidnext, 1);
 	VERIFY(tqid);
 	return (tqid);
 }
 #else
 static taskqid_t
 __taskq_genid(void)
 {
 	taskqid_t tqid;
 
 	for (;;) {
 		tqid = atomic_add_32_nv(&tqidnext, 1);
 		if (__predict_true(tqid != 0))
 			break;
 	}
 	VERIFY(tqid);
 	return (tqid);
 }
 #endif
 
 static taskq_ent_t *
 taskq_lookup(taskqid_t tqid)
 {
 	taskq_ent_t *ent = NULL;
 
 	if (tqid == 0)
 		return (NULL);
 	sx_slock(TQIDHASHLOCK(tqid));
 	LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) {
 		if (ent->tqent_id == tqid)
 			break;
 	}
 	if (ent != NULL)
 		refcount_acquire(&ent->tqent_rc);
 	sx_sunlock(TQIDHASHLOCK(tqid));
 	return (ent);
 }
 
 static taskqid_t
 taskq_insert(taskq_ent_t *ent)
 {
 	taskqid_t tqid = __taskq_genid();
 
 	ent->tqent_id = tqid;
 	sx_xlock(TQIDHASHLOCK(tqid));
 	LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash);
 	sx_xunlock(TQIDHASHLOCK(tqid));
 	return (tqid);
 }
 
 static void
 taskq_remove(taskq_ent_t *ent)
 {
 	taskqid_t tqid = ent->tqent_id;
 
 	if (tqid == 0)
 		return;
 	sx_xlock(TQIDHASHLOCK(tqid));
 	if (ent->tqent_id != 0) {
 		LIST_REMOVE(ent, tqent_hash);
 		ent->tqent_id = 0;
 	}
 	sx_xunlock(TQIDHASHLOCK(tqid));
 }
 
 static void
 taskq_tsd_set(void *context)
 {
 	taskq_t *tq = context;
 
 #if defined(__amd64__) || defined(__i386__) || defined(__aarch64__)
 	if (context != NULL && tsd_get(taskq_tsd) == NULL)
 		fpu_kern_thread(FPU_KERN_NORMAL);
 #endif
 	tsd_set(taskq_tsd, tq);
 }
 
 static taskq_t *
 taskq_create_impl(const char *name, int nthreads, pri_t pri,
     proc_t *proc __maybe_unused, uint_t flags)
 {
 	taskq_t *tq;
 
 	if ((flags & TASKQ_THREADS_CPU_PCT) != 0)
 		nthreads = MAX((mp_ncpus * nthreads) / 100, 1);
 
 	tq = kmem_alloc(sizeof (*tq), KM_SLEEP);
 	tq->tq_nthreads = nthreads;
 	tq->tq_queue = taskqueue_create(name, M_WAITOK,
 	    taskqueue_thread_enqueue, &tq->tq_queue);
 	taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT,
 	    taskq_tsd_set, tq);
 	taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN,
 	    taskq_tsd_set, NULL);
 	(void) taskqueue_start_threads_in_proc(&tq->tq_queue, nthreads, pri,
 	    proc, "%s", name);
 
 	return ((taskq_t *)tq);
 }
 
 taskq_t *
 taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused,
     int maxalloc __unused, uint_t flags)
 {
 	return (taskq_create_impl(name, nthreads, pri, system_proc, flags));
 }
 
 taskq_t *
 taskq_create_proc(const char *name, int nthreads, pri_t pri,
     int minalloc __unused, int maxalloc __unused, proc_t *proc, uint_t flags)
 {
 	return (taskq_create_impl(name, nthreads, pri, proc, flags));
 }
 
 void
 taskq_destroy(taskq_t *tq)
 {
 
 	taskqueue_free(tq->tq_queue);
 	kmem_free(tq, sizeof (*tq));
 }
 
 static void taskq_sync_assign(void *arg);
 
 typedef struct taskq_sync_arg {
 	kthread_t	*tqa_thread;
 	kcondvar_t	tqa_cv;
 	kmutex_t 	tqa_lock;
 	int		tqa_ready;
 } taskq_sync_arg_t;
 
 static void
 taskq_sync_assign(void *arg)
 {
 	taskq_sync_arg_t *tqa = arg;
 
 	mutex_enter(&tqa->tqa_lock);
 	tqa->tqa_thread = curthread;
 	tqa->tqa_ready = 1;
 	cv_signal(&tqa->tqa_cv);
 	while (tqa->tqa_ready == 1)
 		cv_wait(&tqa->tqa_cv, &tqa->tqa_lock);
 	mutex_exit(&tqa->tqa_lock);
 }
 
 /*
  * Create a taskq with a specified number of pool threads. Allocate
  * and return an array of nthreads kthread_t pointers, one for each
  * thread in the pool. The array is not ordered and must be freed
  * by the caller.
  */
 taskq_t *
 taskq_create_synced(const char *name, int nthreads, pri_t pri,
     int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
 {
 	taskq_t *tq;
 	taskq_sync_arg_t *tqs = kmem_zalloc(sizeof (*tqs) * nthreads, KM_SLEEP);
 	kthread_t **kthreads = kmem_zalloc(sizeof (*kthreads) * nthreads,
 	    KM_SLEEP);
 
 	flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
 
 	tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
 	    flags | TASKQ_PREPOPULATE);
 	VERIFY(tq != NULL);
 	VERIFY(tq->tq_nthreads == nthreads);
 
 	/* spawn all syncthreads */
 	for (int i = 0; i < nthreads; i++) {
 		cv_init(&tqs[i].tqa_cv, NULL, CV_DEFAULT, NULL);
 		mutex_init(&tqs[i].tqa_lock, NULL, MUTEX_DEFAULT, NULL);
 		(void) taskq_dispatch(tq, taskq_sync_assign,
 		    &tqs[i], TQ_FRONT);
 	}
 
 	/* wait on all syncthreads to start */
 	for (int i = 0; i < nthreads; i++) {
 		mutex_enter(&tqs[i].tqa_lock);
 		while (tqs[i].tqa_ready == 0)
 			cv_wait(&tqs[i].tqa_cv, &tqs[i].tqa_lock);
 		mutex_exit(&tqs[i].tqa_lock);
 	}
 
 	/* let all syncthreads resume, finish */
 	for (int i = 0; i < nthreads; i++) {
 		mutex_enter(&tqs[i].tqa_lock);
 		tqs[i].tqa_ready = 2;
 		cv_broadcast(&tqs[i].tqa_cv);
 		mutex_exit(&tqs[i].tqa_lock);
 	}
 	taskq_wait(tq);
 
 	for (int i = 0; i < nthreads; i++) {
 		kthreads[i] = tqs[i].tqa_thread;
 		mutex_destroy(&tqs[i].tqa_lock);
 		cv_destroy(&tqs[i].tqa_cv);
 	}
 	kmem_free(tqs, sizeof (*tqs) * nthreads);
 
 	*ktpp = kthreads;
 	return (tq);
 }
 
 int
 taskq_member(taskq_t *tq, kthread_t *thread)
 {
 
 	return (taskqueue_member(tq->tq_queue, thread));
 }
 
 taskq_t *
 taskq_of_curthread(void)
 {
 	return (tsd_get(taskq_tsd));
 }
 
 static void
 taskq_free(taskq_ent_t *task)
 {
 	taskq_remove(task);
 	if (refcount_release(&task->tqent_rc))
 		uma_zfree(taskq_zone, task);
 }
 
 int
-taskq_cancel_id(taskq_t *tq, taskqid_t tid)
+taskq_cancel_id(taskq_t *tq, taskqid_t tid, boolean_t wait)
 {
 	uint32_t pend;
 	int rc;
 	taskq_ent_t *ent;
 
 	if ((ent = taskq_lookup(tid)) == NULL)
 		return (ENOENT);
 
 	if (ent->tqent_type == NORMAL_TASK) {
 		rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
-		if (rc == EBUSY)
+		if (rc == EBUSY && wait)
 			taskqueue_drain(tq->tq_queue, &ent->tqent_task);
 	} else {
 		rc = taskqueue_cancel_timeout(tq->tq_queue,
 		    &ent->tqent_timeout_task, &pend);
-		if (rc == EBUSY) {
+		if (rc == EBUSY && wait) {
 			taskqueue_drain_timeout(tq->tq_queue,
 			    &ent->tqent_timeout_task);
 		}
 	}
 	if (pend) {
 		/*
 		 * Tasks normally free themselves when run, but here the task
 		 * was cancelled so it did not free itself.
 		 */
 		taskq_free(ent);
 	}
 	/* Free the extra reference we added with taskq_lookup. */
 	taskq_free(ent);
+
+	/*
+	 * If the task was running and we did not wait, return EBUSY.
+	 * Otherwise return 0 if cancelled or ENOENT if it was not pending.
+	 */
+	if (rc == EBUSY && !wait)
+		return (EBUSY);
 	return (pend ? 0 : ENOENT);
 }
 
 static void
 taskq_run(void *arg, int pending)
 {
 	taskq_ent_t *task = arg;
 
 	if (pending == 0)
 		return;
 	task->tqent_func(task->tqent_arg);
 	taskq_free(task);
 }
 
 taskqid_t
 taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
     uint_t flags, clock_t expire_time)
 {
 	taskq_ent_t *task;
 	taskqid_t tqid;
 	clock_t timo;
 	int mflag;
 
 	timo = expire_time - ddi_get_lbolt();
 	if (timo <= 0)
 		return (taskq_dispatch(tq, func, arg, flags));
 
 	if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
 		mflag = M_WAITOK;
 	else
 		mflag = M_NOWAIT;
 
 	task = uma_zalloc(taskq_zone, mflag);
 	if (task == NULL)
 		return (0);
 	task->tqent_func = func;
 	task->tqent_arg = arg;
 	task->tqent_type = TIMEOUT_TASK;
 	refcount_init(&task->tqent_rc, 1);
 	tqid = taskq_insert(task);
 	TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0,
 	    taskq_run, task);
 
 	taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task,
 	    timo);
 	return (tqid);
 }
 
 taskqid_t
 taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 {
 	taskq_ent_t *task;
 	int mflag, prio;
 	taskqid_t tqid;
 
 	if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP)
 		mflag = M_WAITOK;
 	else
 		mflag = M_NOWAIT;
 	/*
 	 * If TQ_FRONT is given, we want higher priority for this task, so it
 	 * can go at the front of the queue.
 	 */
 	prio = !!(flags & TQ_FRONT);
 
 	task = uma_zalloc(taskq_zone, mflag);
 	if (task == NULL)
 		return (0);
 	refcount_init(&task->tqent_rc, 1);
 	task->tqent_func = func;
 	task->tqent_arg = arg;
 	task->tqent_type = NORMAL_TASK;
 	tqid = taskq_insert(task);
 	TASK_INIT(&task->tqent_task, prio, taskq_run, task);
 	taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
 	return (tqid);
 }
 
 static void
 taskq_run_ent(void *arg, int pending)
 {
 	taskq_ent_t *task = arg;
 
 	if (pending == 0)
 		return;
 	task->tqent_func(task->tqent_arg);
 }
 
 void
 taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
     taskq_ent_t *task)
 {
 	/*
 	 * If TQ_FRONT is given, we want higher priority for this task, so it
 	 * can go at the front of the queue.
 	 */
 	task->tqent_task.ta_priority = !!(flags & TQ_FRONT);
 	task->tqent_func = func;
 	task->tqent_arg = arg;
 	taskqueue_enqueue(tq->tq_queue, &task->tqent_task);
 }
 
 void
 taskq_init_ent(taskq_ent_t *task)
 {
 	TASK_INIT(&task->tqent_task, 0, taskq_run_ent, task);
 	task->tqent_func = NULL;
 	task->tqent_arg = NULL;
 	task->tqent_id = 0;
 	task->tqent_type = NORMAL_TASK;
 	task->tqent_rc = 0;
 }
 
 int
 taskq_empty_ent(taskq_ent_t *task)
 {
 	return (task->tqent_task.ta_pending == 0);
 }
 
 void
 taskq_wait(taskq_t *tq)
 {
 	taskqueue_quiesce(tq->tq_queue);
 }
 
 void
 taskq_wait_id(taskq_t *tq, taskqid_t tid)
 {
 	taskq_ent_t *ent;
 
 	if ((ent = taskq_lookup(tid)) == NULL)
 		return;
 
 	if (ent->tqent_type == NORMAL_TASK)
 		taskqueue_drain(tq->tq_queue, &ent->tqent_task);
 	else
 		taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task);
 	taskq_free(ent);
 }
 
 void
 taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused)
 {
 	taskqueue_drain_all(tq->tq_queue);
 }
diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c
index 22e4ed169d03..5594b2f80c02 100644
--- a/module/os/linux/spl/spl-kmem-cache.c
+++ b/module/os/linux/spl/spl-kmem-cache.c
@@ -1,1446 +1,1446 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #define	SPL_KMEM_CACHE_IMPLEMENTING
 
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
 #include <sys/taskq.h>
 #include <sys/timer.h>
 #include <sys/vmem.h>
 #include <sys/wait.h>
 #include <sys/string.h>
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/prefetch.h>
 
 /*
  * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}()
  * with smp_mb__{before,after}_atomic() because they were redundant. This is
  * only used inside our SLAB allocator, so we implement an internal wrapper
  * here to give us smp_mb__{before,after}_atomic() on older kernels.
  *
  * Each fallback is defined only when the kernel headers have not already
  * provided a macro of that name.
  */
 #ifndef smp_mb__before_atomic
 #define	smp_mb__before_atomic(x) smp_mb__before_clear_bit(x)
 #endif
 
 #ifndef smp_mb__after_atomic
 #define	smp_mb__after_atomic(x) smp_mb__after_clear_bit(x)
 #endif
 
 /*
  * Cache magazines are an optimization designed to minimize the cost of
  * allocating memory.  They do this by keeping a per-cpu cache of recently
  * freed objects, which can then be reallocated without taking a lock. This
  * can improve performance on highly contended caches.  However, because
  * objects in magazines will prevent otherwise empty slabs from being
  * immediately released this may not be ideal for low memory machines.
  *
  * For this reason spl_kmem_cache_magazine_size can be used to set a maximum
  * magazine size.  When this value is set to 0 the magazine size will be
  * automatically determined based on the object size.  Otherwise magazines
  * will be limited to 2-256 objects per magazine (i.e per cpu).  Magazines
  * may never be entirely disabled in this implementation.
  */
 static unsigned int spl_kmem_cache_magazine_size = 0;
 module_param(spl_kmem_cache_magazine_size, uint, 0444);
 MODULE_PARM_DESC(spl_kmem_cache_magazine_size,
 	"Default magazine size (2-256), set automatically (0)");
 
 /*
  * Target number of objects per slab.  spl_slab_size() multiplies this by
  * the aligned object size to compute the preferred slab size.
  */
 static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB;
 module_param(spl_kmem_cache_obj_per_slab, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab");
 
 /*
  * Upper bound on the size of a single slab, in MB.  spl_slab_size() scales
  * this by 1024 * 1024 and shrinks the slab to fit when the preferred size
  * would exceed it.
  */
 static unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE;
 module_param(spl_kmem_cache_max_size, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB");
 
 /*
  * For small objects the Linux slab allocator should be used to make the most
  * efficient use of the memory.  However, large objects are not supported by
  * the Linux slab and therefore the SPL implementation is preferred.  A cutoff
  * of 16K was determined to be optimal for architectures using 4K pages and
  * to also work well on architectures using larger 64K page sizes.
  */
 static unsigned int spl_kmem_cache_slab_limit =
     SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE;
 module_param(spl_kmem_cache_slab_limit, uint, 0644);
 MODULE_PARM_DESC(spl_kmem_cache_slab_limit,
 	"Objects less than N bytes use the Linux slab");
 
 /*
  * The number of threads available to allocate new slabs for caches.  This
  * should not need to be tuned but it is available for performance analysis.
  */
 static unsigned int spl_kmem_cache_kmem_threads = 4;
 module_param(spl_kmem_cache_kmem_threads, uint, 0444);
 MODULE_PARM_DESC(spl_kmem_cache_kmem_threads,
 	"Number of spl_kmem_cache threads");
 
 /*
  * Slab allocation interfaces
  *
  * While the Linux slab implementation was inspired by the Solaris
  * implementation I cannot use it to emulate the Solaris APIs.  I
  * require two features which are not provided by the Linux slab.
  *
  * 1) Constructors AND destructors.  Recent versions of the Linux
  *    kernel have removed support for destructors.  This is a deal
  *    breaker for the SPL which contains particularly expensive
  *    initializers for mutexes, condition variables, etc.  We also
  *    require a minimal level of cleanup for these data types unlike
  *    many Linux data types which do not need to be explicitly destroyed.
  *
  * 2) Virtual address space backed slab.  Callers of the Solaris slab
  *    expect it to work well for both small and very large allocations.
  *    Because of memory fragmentation the Linux slab which is backed
  *    by kmalloc'ed memory performs very badly when confronted with
  *    large numbers of large allocations.  Basing the slab on the
  *    virtual address space removes the need for contiguous pages
  *    and greatly improves performance for large allocations.
  *
  * For these reasons, the SPL has its own slab implementation with
  * the needed features.  It is not as highly optimized as either the
  * Solaris or Linux slabs, but it should get me most of what is
  * needed until it can be optimized or obsoleted by another approach.
  *
  * One serious concern I do have about this method is the relatively
  * small virtual address space on 32bit arches.  This will seriously
  * constrain the size of the slab caches and their performance.
  */
 
 struct list_head spl_kmem_cache_list;   /* List of caches */
 struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */
 /* spl_cache_grow() also dispatches its slab-growth work on this taskq. */
 static taskq_t *spl_kmem_cache_taskq;   /* Task queue for aging / reclaim */
 
 /* Declared ahead of its first use in spl_cache_flush(). */
 static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj);
 
 /*
  * Allocate 'size' bytes of slab backing store with spl_vmalloc().
  *
  * The KM_* 'flags' are converted to gfp flags, __GFP_HIGHMEM is always
  * added, and __GFP_RECLAIMABLE is added for KMC_RECLAIMABLE caches.
  * Returns the allocation, or whatever spl_vmalloc() returns on failure.
  */
 static void *
 kv_alloc(spl_kmem_cache_t *skc, int size, int flags)
 {
 	gfp_t gfp = kmem_flags_convert(flags) | __GFP_HIGHMEM;
 	void *buf;
 
 	if (skc->skc_flags & KMC_RECLAIMABLE)
 		gfp |= __GFP_RECLAIMABLE;
 
 	buf = spl_vmalloc(size, gfp);
 
 	/* The slab layout relies on a page-aligned base address. */
 	ASSERT(IS_P2ALIGNED(buf, PAGE_SIZE));
 
 	return (buf);
 }
 
 /*
  * Release slab backing store obtained from kv_alloc().
  *
  * When the current task has a reclaim_state, the number of pages being
  * freed is added to its reclaimed counter first.  The Linux direct reclaim
  * path uses that out-of-band value to judge forward progress; the Linux
  * slab implementations normally maintain it, but since none of that
  * infrastructure is used here the update is done by hand.  The field name
  * depends on HAVE_RECLAIM_STATE_RECLAIMED.
  */
 static void
 kv_free(spl_kmem_cache_t *skc, void *ptr, int size)
 {
 	int pages = size >> PAGE_SHIFT;
 
 	ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE));
 
 	if (current->reclaim_state) {
 #ifdef	HAVE_RECLAIM_STATE_RECLAIMED
 		current->reclaim_state->reclaimed += pages;
 #else
 		current->reclaim_state->reclaimed_slab += pages;
 #endif
 	}
 
 	vfree(ptr);
 }
 
 /*
  * Bytes reserved at the start of a slab for its spl_kmem_slab_t header,
  * rounded up to the cache's object alignment.
  */
 static inline uint32_t
 spl_sks_size(spl_kmem_cache_t *skc)
 {
 	uint32_t hdr_bytes;
 
 	hdr_bytes = P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t),
 	    skc->skc_obj_align, uint32_t);
 
 	return (hdr_bytes);
 }
 
 /*
  * Bytes consumed in a slab by one object: the object payload plus its
  * spl_kmem_obj_t tracking structure, each rounded up separately to the
  * cache's object alignment.
  */
 static inline uint32_t
 spl_obj_size(spl_kmem_cache_t *skc)
 {
 	uint32_t align = skc->skc_obj_align;
 	uint32_t payload, tracking;
 
 	payload = P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t);
 	tracking = P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t);
 
 	return (payload + tracking);
 }
 
 /*
  * Report the cache's skc_obj_total counter.
  *
  * NOTE(review): despite the name, the value returned is the total object
  * count rather than skc_obj_alloc; confirm that this is what callers want.
  */
 uint64_t
 spl_kmem_cache_inuse(kmem_cache_t *cache)
 {
 	uint64_t total = cache->skc_obj_total;
 
 	return (total);
 }
 EXPORT_SYMBOL(spl_kmem_cache_inuse);
 
 /*
  * Report the object size the cache was created with (skc_obj_size),
  * without any alignment padding or tracking overhead.
  */
 uint64_t
 spl_kmem_cache_entry_size(kmem_cache_t *cache)
 {
 	uint64_t obj_bytes = cache->skc_obj_size;
 
 	return (obj_bytes);
 }
 EXPORT_SYMBOL(spl_kmem_cache_entry_size);
 
 /*
  * Map an object address to its spl_kmem_obj_t.  The tracking structure is
  * stored directly after the object payload, at the payload size rounded up
  * to the cache's object alignment.
  */
 static inline spl_kmem_obj_t *
 spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj)
 {
 	uint32_t payload;
 
 	payload = P2ROUNDUP_TYPED(skc->skc_obj_size,
 	    skc->skc_obj_align, uint32_t);
 
 	return (obj + payload);
 }
 
 /*
  * Allocate and initialise one slab for the cache.
  *
  * The slab header, the objects and their spl_kmem_obj_t tracking
  * structures are packed into a single allocation so the allocator is
  * called as rarely as possible; a few large allocations that we subdivide
  * ourselves are far cheaper than many small ones.
  *
  * On the choice of allocator: kmem_alloc() is cheap while only a small
  * number of pages (ideally one) is requested, but gets increasingly
  * expensive for multi-page requests because the pages must be contiguous.
  * vmem_alloc() removes that requirement, but carries significant locking
  * overhead in __get_vm_area_node(), which takes a single global lock while
  * finding a free virtual address range and so serializes every
  * vmem_alloc() across all slab caches.  Using different allocation
  * functions for small and large objects gives the best of both.
  *
  * Layout of one slab:
  *
  * +------------------------+
  * | spl_kmem_slab_t --+-+  |
  * | skc_obj_size    <-+ |  |
  * | spl_kmem_obj_t      |  |
  * | skc_obj_size    <---+  |
  * | spl_kmem_obj_t      |  |
  * | ...                 v  |
  * +------------------------+
  *
  * Returns the new slab with every object on its free list, or NULL when
  * the backing allocation fails.  The slab is not linked into the cache.
  */
 static spl_kmem_slab_t *
 spl_slab_alloc(spl_kmem_cache_t *skc, int flags)
 {
 	void *base = kv_alloc(skc, skc->skc_slab_size, flags);
 	spl_kmem_slab_t *sks = (spl_kmem_slab_t *)base;
 	uint32_t stride;
 
 	if (base == NULL)
 		return (NULL);
 
 	/* The slab header occupies the very start of the allocation. */
 	sks->sks_magic = SKS_MAGIC;
 	sks->sks_objs = skc->skc_slab_objs;
 	sks->sks_age = jiffies;
 	sks->sks_cache = skc;
 	sks->sks_ref = 0;
 	INIT_LIST_HEAD(&sks->sks_list);
 	INIT_LIST_HEAD(&sks->sks_free_list);
 
 	/* Carve the remainder into objects and queue each one as free. */
 	stride = spl_obj_size(skc);
 	for (int i = 0; i < sks->sks_objs; i++) {
 		void *obj = base + spl_sks_size(skc) + (i * stride);
 		spl_kmem_obj_t *sko;
 
 		ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
 		sko = spl_sko_from_obj(skc, obj);
 		sko->sko_magic = SKO_MAGIC;
 		sko->sko_addr = obj;
 		sko->sko_slab = sks;
 		INIT_LIST_HEAD(&sko->sko_list);
 		list_add_tail(&sko->sko_list, &sks->sks_free_list);
 	}
 
 	return (sks);
 }
 
 /*
  * Detach an unreferenced slab from its cache list and queue it for
  * freeing.  Must be called with 'skc->skc_lock' held; the memory itself
  * has to be released later, outside the lock, to prevent deadlocking on
  * vmem addresses.
  *
  * sks       slab to detach; its reference count must be zero
  * sks_list  private list that receives the slab
  * sko_list  private list that receives the slab's free objects
  */
 static void
 spl_slab_free(spl_kmem_slab_t *sks,
     struct list_head *sks_list, struct list_head *sko_list)
 {
 	spl_kmem_cache_t *cache;
 
 	ASSERT(sks->sks_magic == SKS_MAGIC);
 	ASSERT0(sks->sks_ref);
 
 	cache = sks->sks_cache;
 	ASSERT(cache->skc_magic == SKC_MAGIC);
 
 	/* The cache no longer owns this slab or any of its objects. */
 	cache->skc_slab_total--;
 	cache->skc_obj_total -= sks->sks_objs;
 
 	/* Move the slab, then its free objects, onto the private lists. */
 	list_del(&sks->sks_list);
 	list_add(&sks->sks_list, sks_list);
 	list_splice_init(&sks->sks_free_list, sko_list);
 }
 
 /*
  * Reclaim empty slabs at the end of the partial list.
  *
  * Unreferenced slabs are detached under skc->skc_lock and collected on
  * private lists, then their backing memory is released with kv_free()
  * after the lock has been dropped.
  */
 static void
 spl_slab_reclaim(spl_kmem_cache_t *skc)
 {
 	spl_kmem_slab_t *sks = NULL, *m = NULL;
 	spl_kmem_obj_t *sko = NULL, *n = NULL;
 	LIST_HEAD(sks_list);
 	LIST_HEAD(sko_list);
 
 	/*
 	 * Empty slabs and objects must be moved to a private list so they
 	 * can be safely freed outside the spin lock.  All empty slabs are
 	 * at the end of skc->skc_partial_list, therefore once a non-empty
 	 * slab is found we can stop scanning.
 	 */
 	spin_lock(&skc->skc_lock);
 	list_for_each_entry_safe_reverse(sks, m,
 	    &skc->skc_partial_list, sks_list) {
 
 		/* First slab still in use: everything before it is too. */
 		if (sks->sks_ref > 0)
 			break;
 
 		spl_slab_free(sks, &sks_list, &sko_list);
 	}
 	spin_unlock(&skc->skc_lock);
 
 	/*
 	 * The following two loops ensure all the object destructors are run,
 	 * and the slabs themselves are freed.  This is all done outside the
 	 * skc->skc_lock since this allows the destructor to sleep, and
 	 * allows us to perform a conditional reschedule when freeing a
 	 * large number of objects and slabs back to the system.
 	 *
 	 * NOTE(review): as written, the object loop below only validates
 	 * the magic value; no destructor call or reschedule is visible here.
 	 */
 
 	list_for_each_entry_safe(sko, n, &sko_list, sko_list) {
 		ASSERT(sko->sko_magic == SKO_MAGIC);
 	}
 
 	/* Objects live inside their slab, so freeing the slab frees both. */
 	list_for_each_entry_safe(sks, m, &sks_list, sks_list) {
 		ASSERT(sks->sks_magic == SKS_MAGIC);
 		kv_free(skc, sks, skc->skc_slab_size);
 	}
 }
 
 /*
  * Find the emergency-object record whose ske_obj equals the address 'obj'
  * in the red black tree rooted at 'root'.  Returns the record, or NULL
  * when the address is not tracked.
  */
 static spl_kmem_emergency_t *
 spl_emergency_search(struct rb_root *root, void *obj)
 {
 	unsigned long key = (unsigned long)obj;
 	struct rb_node *cur = root->rb_node;
 
 	while (cur != NULL) {
 		spl_kmem_emergency_t *rec =
 		    container_of(cur, spl_kmem_emergency_t, ske_node);
 
 		if (key == rec->ske_obj)
 			return (rec);
 
 		/* The tree is ordered by object address. */
 		cur = (key < rec->ske_obj) ? cur->rb_left : cur->rb_right;
 	}
 
 	return (NULL);
 }
 
 /*
  * Insert the record 'ske' into the red black tree rooted at 'root', keyed
  * by its ske_obj address.  Returns 1 when the record was linked in, or 0
  * when a record with the same address already exists (the tree is then
  * left untouched).
  */
 static int
 spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske)
 {
 	struct rb_node **link = &(root->rb_node);
 	struct rb_node *parent = NULL;
 	unsigned long key = ske->ske_obj;
 
 	/* Descend to the empty slot where the new node belongs. */
 	while (*link != NULL) {
 		spl_kmem_emergency_t *cur =
 		    container_of(*link, spl_kmem_emergency_t, ske_node);
 
 		if (key == cur->ske_obj)
 			return (0);
 
 		parent = *link;
 		if (key < cur->ske_obj)
 			link = &(parent->rb_left);
 		else
 			link = &(parent->rb_right);
 	}
 
 	rb_link_node(&ske->ske_node, parent, link);
 	rb_insert_color(&ske->ske_node, root);
 
 	return (1);
 }
 
 /*
  * Allocate a single emergency object and track it in a red black tree.
  *
  * The object is backed by whole pages from __get_free_pages() rather than
  * by a slab, and is recorded in skc->skc_emergency_tree so that
  * spl_emergency_free() can find it again.
  *
  * Returns 0 with *obj set on success, -EEXIST when a partial slab is
  * available instead (no allocation is made), -ENOMEM when either
  * allocation fails, or -EINVAL when the new address is already tracked.
  */
 static int
 spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj)
 {
 	gfp_t lflags = kmem_flags_convert(flags);
 	spl_kmem_emergency_t *ske;
 	int order = get_order(skc->skc_obj_size);
 	int empty;
 
 	/* Last chance use a partial slab if one now exists */
 	spin_lock(&skc->skc_lock);
 	empty = list_empty(&skc->skc_partial_list);
 	spin_unlock(&skc->skc_lock);
 	if (!empty)
 		return (-EEXIST);
 
 	if (skc->skc_flags & KMC_RECLAIMABLE)
 		lflags |= __GFP_RECLAIMABLE;
 	/* Tracking record first, then the pages backing the object. */
 	ske = kmalloc(sizeof (*ske), lflags);
 	if (ske == NULL)
 		return (-ENOMEM);
 
 	ske->ske_obj = __get_free_pages(lflags, order);
 	if (ske->ske_obj == 0) {
 		kfree(ske);
 		return (-ENOMEM);
 	}
 
 	/* 'empty' is reused here to hold the insert result (1 = inserted). */
 	spin_lock(&skc->skc_lock);
 	empty = spl_emergency_insert(&skc->skc_emergency_tree, ske);
 	if (likely(empty)) {
 		skc->skc_obj_total++;
 		skc->skc_obj_emergency++;
 		if (skc->skc_obj_emergency > skc->skc_obj_emergency_max)
 			skc->skc_obj_emergency_max = skc->skc_obj_emergency;
 	}
 	spin_unlock(&skc->skc_lock);
 
 	/* Duplicate address: undo both allocations outside the lock. */
 	if (unlikely(!empty)) {
 		free_pages(ske->ske_obj, order);
 		kfree(ske);
 		return (-EINVAL);
 	}
 
 	*obj = (void *)ske->ske_obj;
 
 	return (0);
 }
 
 /*
  * Locate the passed object in the red black tree and free it.
  *
  * Returns 0 when the object was an emergency object and has been freed,
  * or -ENOENT when 'obj' is not tracked in skc->skc_emergency_tree (in
  * which case nothing is freed).
  */
 static int
 spl_emergency_free(spl_kmem_cache_t *skc, void *obj)
 {
 	spl_kmem_emergency_t *ske;
 	int order = get_order(skc->skc_obj_size);
 
 	/* Unlink the record and fix the counters under the cache lock. */
 	spin_lock(&skc->skc_lock);
 	ske = spl_emergency_search(&skc->skc_emergency_tree, obj);
 	if (ske) {
 		rb_erase(&ske->ske_node, &skc->skc_emergency_tree);
 		skc->skc_obj_emergency--;
 		skc->skc_obj_total--;
 	}
 	spin_unlock(&skc->skc_lock);
 
 	if (ske == NULL)
 		return (-ENOENT);
 
 	/* The pages and the record are released outside the lock. */
 	free_pages(ske->ske_obj, order);
 	kfree(ske);
 
 	return (0);
 }
 
 /*
  * Release objects from the per-cpu magazine back to their slab.  The flush
  * argument contains the max number of entries to remove from the magazine.
  *
  * Objects are taken from the front of skm_objs[] and passed one by one to
  * spl_cache_shrink(); the whole operation runs under skc->skc_lock.
  */
 static void
 spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush)
 {
 	spin_lock(&skc->skc_lock);
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 
 	/* Never flush more entries than the magazine currently holds. */
 	int count = MIN(flush, skm->skm_avail);
 	for (int i = 0; i < count; i++)
 		spl_cache_shrink(skc, skm->skm_objs[i]);
 
 	/* Slide the surviving entries down to the start of the array. */
 	skm->skm_avail -= count;
 	memmove(skm->skm_objs, &(skm->skm_objs[count]),
 	    sizeof (void *) * skm->skm_avail);
 
 	spin_unlock(&skc->skc_lock);
 }
 
 /*
  * Size a slab based on the size of each aligned object plus spl_kmem_obj_t.
  * When on-slab we want to target spl_kmem_cache_obj_per_slab.  However,
  * for very small objects we may end up with more than this so as not
  * to waste space in the minimal allocation of a single page.
  *
  * The preferred size is the slab header plus spl_kmem_cache_obj_per_slab
  * aligned objects.  When that exceeds spl_kmem_cache_max_size (in MB) the
  * slab is shrunk to the largest whole number of objects that still fits.
  *
  * skc   cache being sized (object size and alignment are read from it)
  * objs  out: number of objects per slab
  * size  out: slab size in bytes
  *
  * Returns 0 on success, or -ENOSPC when not even one object fits within
  * the limit; 'objs' and 'size' are left untouched in that case.
  */
 static int
 spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size)
 {
 	uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs;
 
 	sks_size = spl_sks_size(skc);
 	obj_size = spl_obj_size(skc);
 	max_size = (spl_kmem_cache_max_size * 1024 * 1024);
 	tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size);
 
 	if (tgt_size <= max_size) {
 		tgt_objs = (tgt_size - sks_size) / obj_size;
 	} else if (max_size > sks_size) {
 		tgt_objs = (max_size - sks_size) / obj_size;
 		tgt_size = (tgt_objs * obj_size) + sks_size;
 	} else {
 		/*
 		 * The limit does not even cover the slab header (e.g. the
 		 * tunable was set to 0).  Subtracting here would wrap the
 		 * unsigned value and yield a bogus, enormous object count.
 		 */
 		return (-ENOSPC);
 	}
 
 	if (tgt_objs == 0)
 		return (-ENOSPC);
 
 	*objs = tgt_objs;
 	*size = tgt_size;
 
 	return (0);
 }
 
 /*
  * Pick the number of objects each per-cpu magazine may hold.
  *
  * A non-zero spl_kmem_cache_magazine_size tunable wins, clamped to the
  * range 2-256.  Otherwise the size is a guess derived from the aligned
  * object size: the larger the object, the fewer are cached per cpu.  Long
  * term this should really adapt based on an observed usage heuristic.
  */
 static int
 spl_magazine_size(spl_kmem_cache_t *skc)
 {
 	uint32_t obj_size;
 
 	if (spl_kmem_cache_magazine_size > 0)
 		return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2));
 
 	obj_size = spl_obj_size(skc);
 
 	/* The per-magazine figures quoted below assume 4KiB pages. */
 	if (obj_size > (PAGE_SIZE * 256))
 		return (4);	/* at least 4MiB per magazine */
 
 	if (obj_size > (PAGE_SIZE * 32))
 		return (16);	/* at least 2MiB per magazine */
 
 	if (obj_size > (PAGE_SIZE))
 		return (64);	/* at least 256KiB per magazine */
 
 	if (obj_size > (PAGE_SIZE / 4))
 		return (128);	/* at least 128KiB per magazine */
 
 	return (256);
 }
 
 /*
  * Allocate an empty magazine for 'cpu' on that cpu's memory node.
  *
  * The allocation holds the magazine header followed by skc_mag_size
  * object pointers.  Returns the initialised magazine, or NULL when the
  * allocation fails.
  */
 static spl_kmem_magazine_t *
 spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu)
 {
 	int bytes = sizeof (spl_kmem_magazine_t) +
 	    sizeof (void *) * skc->skc_mag_size;
 	spl_kmem_magazine_t *mag;
 
 	mag = kmalloc_node(bytes, GFP_KERNEL, cpu_to_node(cpu));
 	if (!mag)
 		return (mag);
 
 	mag->skm_magic = SKM_MAGIC;
 	mag->skm_cache = skc;
 	mag->skm_cpu = cpu;
 	mag->skm_size = skc->skc_mag_size;
 	mag->skm_refill = skc->skc_mag_refill;
 	mag->skm_avail = 0;
 
 	return (mag);
 }
 
 /*
  * Free a per-cpu magazine associated with a specific core.
  *
  * The magazine must already be empty (skm_avail == 0); callers flush it
  * first.  Only the magazine structure itself is released here.
  */
 static void
 spl_magazine_free(spl_kmem_magazine_t *skm)
 {
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 	ASSERT0(skm->skm_avail);
 	kfree(skm);
 }
 
 /*
  * Create all per-cpu magazines of reasonable sizes.
  *
  * Allocates the skc->skc_mag pointer array, derives the magazine size and
  * refill count (half the size, rounded up), and then allocates one
  * magazine for every possible cpu.
  *
  * Returns 0 on success or -ENOMEM on any allocation failure, in which
  * case everything allocated here has been released again and skc->skc_mag
  * is NULL.
  *
  * NOTE(review): the array has num_possible_cpus() slots but is indexed by
  * cpu id; this presumes possible cpu ids are dense - confirm.
  */
 static int
 spl_magazine_create(spl_kmem_cache_t *skc)
 {
 	int i = 0;
 
 	ASSERT0((skc->skc_flags & KMC_SLAB));
 
 	skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) *
 	    num_possible_cpus(), kmem_flags_convert(KM_SLEEP));
 	/* Without this check the loop below would write through NULL. */
 	if (skc->skc_mag == NULL)
 		return (-ENOMEM);
 
 	skc->skc_mag_size = spl_magazine_size(skc);
 	skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2;
 
 	for_each_possible_cpu(i) {
 		skc->skc_mag[i] = spl_magazine_alloc(skc, i);
 		if (!skc->skc_mag[i]) {
 			/*
 			 * Unwind the magazines created so far.  Slots that
 			 * were never filled are still NULL from kzalloc()
 			 * and must not be handed to spl_magazine_free().
 			 */
 			for (i--; i >= 0; i--) {
 				if (skc->skc_mag[i] != NULL)
 					spl_magazine_free(skc->skc_mag[i]);
 			}
 
 			kfree(skc->skc_mag);
 			skc->skc_mag = NULL;
 			return (-ENOMEM);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Tear down every per-cpu magazine of the cache: each one is completely
  * flushed back to its slabs and freed, and finally the pointer array
  * itself is released.
  */
 static void
 spl_magazine_destroy(spl_kmem_cache_t *skc)
 {
 	int cpu = 0;
 
 	ASSERT0((skc->skc_flags & KMC_SLAB));
 
 	for_each_possible_cpu(cpu) {
 		spl_kmem_magazine_t *mag = skc->skc_mag[cpu];
 
 		/* Return every cached object before freeing the magazine. */
 		spl_cache_flush(skc, mag, mag->skm_avail);
 		spl_magazine_free(mag);
 	}
 
 	kfree(skc->skc_mag);
 }
 
 /*
  * Create a object cache based on the following arguments:
  * name		cache name
  * size		cache object size
  * align	cache object alignment
  * ctor		cache object constructor
  * dtor		cache object destructor
  * reclaim	cache object reclaim (unsupported, must be NULL)
  * priv		cache private data for ctor/dtor/reclaim
  * vmp		unused must be NULL
  * flags
  *	KMC_KVMEM       Force kvmem backed SPL cache
  *	KMC_SLAB        Force Linux slab backed cache
  *	KMC_NODEBUG	Disable debugging (unsupported)
  *	KMC_RECLAIMABLE	Memory can be freed under pressure
  *
  * Returns the new cache, already linked onto spl_kmem_cache_list, or NULL
  * when any allocation or setup step fails.  May sleep.
  */
 spl_kmem_cache_t *
 spl_kmem_cache_create(const char *name, size_t size, size_t align,
     spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim,
     void *priv, void *vmp, int flags)
 {
 	gfp_t lflags = kmem_flags_convert(KM_SLEEP);
 	spl_kmem_cache_t *skc;
 	int rc;
 
 	/*
 	 * Unsupported flags
 	 */
 	ASSERT0P(vmp);
 	ASSERT0P(reclaim);
 
 	might_sleep();
 
 	skc = kzalloc(sizeof (*skc), lflags);
 	if (skc == NULL)
 		return (NULL);
 
 	/* The cache keeps its own copy of the name. */
 	skc->skc_magic = SKC_MAGIC;
 	skc->skc_name_size = strlen(name) + 1;
 	skc->skc_name = kmalloc(skc->skc_name_size, lflags);
 	if (skc->skc_name == NULL) {
 		kfree(skc);
 		return (NULL);
 	}
 	strlcpy(skc->skc_name, name, skc->skc_name_size);
 
 	skc->skc_ctor = ctor;
 	skc->skc_dtor = dtor;
 	skc->skc_private = priv;
 	skc->skc_vmp = vmp;
 	skc->skc_linux_cache = NULL;
 	skc->skc_flags = flags;
 	skc->skc_obj_size = size;
 	/* Default alignment; overridden below when 'align' is non-zero. */
 	skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN;
 	atomic_set(&skc->skc_ref, 0);
 
 	INIT_LIST_HEAD(&skc->skc_list);
 	INIT_LIST_HEAD(&skc->skc_complete_list);
 	INIT_LIST_HEAD(&skc->skc_partial_list);
 	skc->skc_emergency_tree = RB_ROOT;
 	spin_lock_init(&skc->skc_lock);
 	init_waitqueue_head(&skc->skc_waitq);
 	skc->skc_slab_fail = 0;
 	skc->skc_slab_create = 0;
 	skc->skc_slab_destroy = 0;
 	skc->skc_slab_total = 0;
 	skc->skc_slab_alloc = 0;
 	skc->skc_slab_max = 0;
 	skc->skc_obj_total = 0;
 	skc->skc_obj_alloc = 0;
 	skc->skc_obj_max = 0;
 	skc->skc_obj_deadlock = 0;
 	skc->skc_obj_emergency = 0;
 	skc->skc_obj_emergency_max = 0;
 
 	/* From here on every failure must also destroy this counter. */
 	rc = percpu_counter_init(&skc->skc_linux_alloc, 0, GFP_KERNEL);
 	if (rc != 0) {
 		kfree(skc->skc_name);
 		kfree(skc);
 		return (NULL);
 	}
 
 	/*
 	 * Verify the requested alignment restriction is sane.
 	 */
 	if (align) {
 		VERIFY(ISP2(align));
 		VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN);
 		VERIFY3U(align, <=, PAGE_SIZE);
 		skc->skc_obj_align = align;
 	}
 
 	/*
 	 * When no specific type of slab is requested (kmem, vmem, or
 	 * linuxslab) then select a cache type based on the object size
 	 * and default tunables.
 	 */
 	if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) {
 		if (spl_kmem_cache_slab_limit &&
 		    size <= (size_t)spl_kmem_cache_slab_limit) {
 			/*
 			 * Objects smaller than spl_kmem_cache_slab_limit can
 			 * use the Linux slab for better space-efficiency.
 			 */
 			skc->skc_flags |= KMC_SLAB;
 		} else {
 			/*
 			 * All other objects are considered large and are
 			 * placed on kvmem backed slabs.
 			 */
 			skc->skc_flags |= KMC_KVMEM;
 		}
 	}
 
 	/*
 	 * Given the type of slab allocate the required resources.
 	 */
 	if (skc->skc_flags & KMC_KVMEM) {
 		rc = spl_slab_size(skc,
 		    &skc->skc_slab_objs, &skc->skc_slab_size);
 		if (rc)
 			goto out;
 
 		rc = spl_magazine_create(skc);
 		if (rc)
 			goto out;
 	} else {
 		unsigned long slabflags = 0;
 
 		/* An explicitly requested KMC_SLAB cache may be too large. */
 		if (size > spl_kmem_cache_slab_limit)
 			goto out;
 
 		if (skc->skc_flags & KMC_RECLAIMABLE)
 			slabflags |= SLAB_RECLAIM_ACCOUNT;
 
 		skc->skc_linux_cache = kmem_cache_create_usercopy(
 		    skc->skc_name, size, align, slabflags, 0, size, NULL);
 		if (skc->skc_linux_cache == NULL)
 			goto out;
 	}
 
 	/* Publish the fully constructed cache on the global list. */
 	down_write(&spl_kmem_cache_sem);
 	list_add_tail(&skc->skc_list, &spl_kmem_cache_list);
 	up_write(&spl_kmem_cache_sem);
 
 	return (skc);
 out:
 	/* Common unwind for failures after the percpu counter exists. */
 	kfree(skc->skc_name);
 	percpu_counter_destroy(&skc->skc_linux_alloc);
 	kfree(skc);
 	return (NULL);
 }
 EXPORT_SYMBOL(spl_kmem_cache_create);
 
 /*
  * Register a move callback for cache defragmentation.
  * XXX: Unimplemented but harmless to stub out for now.
  *
  * The callback is only checked to be non-NULL (in builds where ASSERT is
  * active); it is neither stored nor ever invoked by this function.
  */
 void
 spl_kmem_cache_set_move(spl_kmem_cache_t *skc,
     kmem_cbrc_t (move)(void *, void *, size_t, void *))
 {
 	ASSERT(move != NULL);
 }
 EXPORT_SYMBOL(spl_kmem_cache_set_move);
 
 /*
  * Destroy a cache and all objects associated with the cache.
  *
  * The cache is unlinked from the global list, flagged as being destroyed,
  * its outstanding taskq work is cancelled, and once no references remain
  * its magazines and slabs (or its Linux slab cache) are released before
  * the cache structure itself is freed.
  */
 void
 spl_kmem_cache_destroy(spl_kmem_cache_t *skc)
 {
 	DECLARE_WAIT_QUEUE_HEAD(wq);
 	taskqid_t id;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB));
 
 	/* Unlink first so the cache can no longer be found via the list. */
 	down_write(&spl_kmem_cache_sem);
 	list_del_init(&skc->skc_list);
 	up_write(&spl_kmem_cache_sem);
 
 	/* Cancel any and wait for any pending delayed tasks */
 	VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
 	/* Snapshot the task id under the lock, then cancel outside it. */
 	spin_lock(&skc->skc_lock);
 	id = skc->skc_taskqid;
 	spin_unlock(&skc->skc_lock);
 
-	taskq_cancel_id(spl_kmem_cache_taskq, id);
+	taskq_cancel_id(spl_kmem_cache_taskq, id, B_TRUE);
 
 	/*
 	 * Wait until all current callers complete, this is mainly
 	 * to catch the case where a low memory situation triggers a
 	 * cache reaping action which races with this destroy.
 	 *
 	 * NOTE(review): 'wq' is local to this function and nothing visible
 	 * here wakes it; confirm how a waiter is expected to make progress
 	 * if skc_ref is non-zero on entry.
 	 */
 	wait_event(wq, atomic_read(&skc->skc_ref) == 0);
 
 	if (skc->skc_flags & KMC_KVMEM) {
 		spl_magazine_destroy(skc);
 		spl_slab_reclaim(skc);
 	} else {
 		ASSERT(skc->skc_flags & KMC_SLAB);
 		kmem_cache_destroy(skc->skc_linux_cache);
 	}
 
 	spin_lock(&skc->skc_lock);
 
 	/*
 	 * Validate there are no objects in use and free all the
 	 * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers.
 	 */
 	ASSERT0(skc->skc_slab_alloc);
 	ASSERT0(skc->skc_obj_alloc);
 	ASSERT0(skc->skc_slab_total);
 	ASSERT0(skc->skc_obj_total);
 	ASSERT0(skc->skc_obj_emergency);
 	ASSERT(list_empty(&skc->skc_complete_list));
 
 	ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0);
 	percpu_counter_destroy(&skc->skc_linux_alloc);
 
 	spin_unlock(&skc->skc_lock);
 
 	kfree(skc->skc_name);
 	kfree(skc);
 }
 EXPORT_SYMBOL(spl_kmem_cache_destroy);
 
 /*
  * Take one free object from slab 'sks' of cache 'skc' and return its
  * address.  This is used to repopulate the per-cpu magazine caches in
  * batches when they run low.
  *
  * The slab's age and reference count and the cache's allocation counters
  * and high-water marks are updated along the way.
  */
 static void *
 spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks)
 {
 	spl_kmem_obj_t *ent;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(sks->sks_magic == SKS_MAGIC);
 
 	/* Detach the object at the head of the slab's free list. */
 	ent = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list);
 	ASSERT(ent->sko_magic == SKO_MAGIC);
 	ASSERT(ent->sko_addr != NULL);
 	list_del_init(&ent->sko_list);
 
 	sks->sks_age = jiffies;
 
 	/* Object accounting, including the high-water mark. */
 	skc->skc_obj_alloc++;
 	if (skc->skc_obj_max < skc->skc_obj_alloc)
 		skc->skc_obj_max = skc->skc_obj_alloc;
 
 	/* The first object taken from a slab makes that slab "in use". */
 	if (++sks->sks_ref == 1) {
 		skc->skc_slab_alloc++;
 
 		if (skc->skc_slab_max < skc->skc_slab_alloc)
 			skc->skc_slab_max = skc->skc_slab_alloc;
 	}
 
 	return (ent->sko_addr);
 }
 
 /*
  * Generic slab allocation function to run by the global work queues.
  * It is responsible for allocating a new slab, linking it in to the list
  * of partial slabs, and then waking any waiters.
  *
  * Returns 0 when a slab was added to the cache, or -ENOMEM when the slab
  * allocation failed.  The wake-up itself is issued by the caller,
  * spl_cache_grow_work().
  */
 static int
 __spl_cache_grow(spl_kmem_cache_t *skc, int flags)
 {
 	spl_kmem_slab_t *sks;
 
 	/*
 	 * The allocation is bracketed by spl_fstrans_mark()/unmark();
 	 * presumably to keep it from recursing into the filesystem -
 	 * TODO confirm.
 	 */
 	fstrans_cookie_t cookie = spl_fstrans_mark();
 	sks = spl_slab_alloc(skc, flags);
 	spl_fstrans_unmark(cookie);
 
 	spin_lock(&skc->skc_lock);
 	if (sks) {
 		skc->skc_slab_total++;
 		skc->skc_obj_total += sks->sks_objs;
 		list_add_tail(&sks->sks_list, &skc->skc_partial_list);
 
 		/* A slab arrived, so the cache is no longer deadlocked. */
 		smp_mb__before_atomic();
 		clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
 		smp_mb__after_atomic();
 	}
 	spin_unlock(&skc->skc_lock);
 
 	return (sks == NULL ? -ENOMEM : 0);
 }
 
 /*
  * Taskq callback dispatched by spl_cache_grow().  'data' is the
  * spl_kmem_alloc_t request, which names the cache and the allocation
  * flags.  Grows the cache by one slab, drops the cache reference taken at
  * dispatch time, clears KMC_BIT_GROWING, wakes the waiters on success and
  * frees the request.
  */
 static void
 spl_cache_grow_work(void *data)
 {
 	spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data;
 	spl_kmem_cache_t *skc = ska->ska_cache;
 
 	int error = __spl_cache_grow(skc, ska->ska_flags);
 
 	atomic_dec(&skc->skc_ref);
 	smp_mb__before_atomic();
 	clear_bit(KMC_BIT_GROWING, &skc->skc_flags);
 	smp_mb__after_atomic();
 	/* Waiters are only woken when a new slab is actually available. */
 	if (error == 0)
 		wake_up_all(&skc->skc_waitq);
 
 	kfree(ska);
 }
 
 /*
  * Wait condition used by spl_cache_grow(): returns non-zero once the
  * KMC_BIT_GROWING flag is clear, i.e. when a new slab should be available.
  */
 static int
 spl_cache_grow_wait(spl_kmem_cache_t *skc)
 {
 	return (test_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0);
 }
 
 /*
  * No available objects on any slabs, create a new slab.  Note that this
  * functionality is disabled for KMC_SLAB caches which are backed by the
  * Linux slab.
  *
  * *obj is set to NULL on entry and only becomes non-NULL when an emergency
  * object was allocated for the caller.
  *
  * Returns 0 only together with an emergency object in *obj.  Otherwise a
  * negative value is returned: -EAGAIN after waiting out a reap, -ENOMEM
  * after dispatching/waiting for the asynchronous slab allocation (the
  * caller is expected to re-check for free objects), or the error from
  * spl_emergency_alloc() / wait_on_bit().
  */
 static int
 spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj)
 {
 	int remaining, rc = 0;
 
 	ASSERT0(flags & ~KM_PUBLIC_MASK);
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT0((skc->skc_flags & KMC_SLAB));
 
 	*obj = NULL;
 
 	/*
 	 * Since we can't sleep attempt an emergency allocation to satisfy
 	 * the request.  The only alternative is to fail the allocation but
 	 * it's preferable to try.  The use of KM_NOSLEEP is expected to be
 	 * rare.
 	 */
 	if (flags & KM_NOSLEEP)
 		return (spl_emergency_alloc(skc, flags, obj));
 
 	might_sleep();
 
 	/*
 	 * Before allocating a new slab wait for any reaping to complete and
 	 * then return so the local magazine can be rechecked for new objects.
 	 */
 	if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) {
 		rc = wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING,
 		    TASK_UNINTERRUPTIBLE);
 		return (rc ? rc : -EAGAIN);
 	}
 
 	/*
 	 * Note: It would be nice to reduce the overhead of context switch
 	 * and improve NUMA locality, by trying to allocate a new slab in the
 	 * current process context with KM_NOSLEEP flag.
 	 *
 	 * However, this can't be applied to vmem/kvmem due to a bug that
 	 * spl_vmalloc() doesn't honor gfp flags in page table allocation.
 	 */
 
 	/*
 	 * This is handled by dispatching a work request to the global work
 	 * queue.  This allows us to asynchronously allocate a new slab while
 	 * retaining the ability to safely fall back to a smaller synchronous
 	 * allocations to ensure forward progress is always maintained.
 	 *
 	 * KMC_BIT_GROWING ensures only one grow request is in flight per
 	 * cache; whoever sets the bit dispatches the work.
 	 */
 	if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) {
 		spl_kmem_alloc_t *ska;
 
 		ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags));
 		if (ska == NULL) {
 			/* Give up the grow slot and let other waiters retry. */
 			clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags);
 			smp_mb__after_atomic();
 			wake_up_all(&skc->skc_waitq);
 			return (-ENOMEM);
 		}
 
 		/* Reference is dropped again by spl_cache_grow_work(). */
 		atomic_inc(&skc->skc_ref);
 		ska->ska_cache = skc;
 		ska->ska_flags = flags;
 		taskq_init_ent(&ska->ska_tqe);
 		taskq_dispatch_ent(spl_kmem_cache_taskq,
 		    spl_cache_grow_work, ska, 0, &ska->ska_tqe);
 	}
 
 	/*
 	 * The goal here is to only detect the rare case where a virtual slab
 	 * allocation has deadlocked.  We must be careful to minimize the use
 	 * of emergency objects which are more expensive to track.  Therefore,
 	 * we set a very long timeout for the asynchronous allocation and if
 	 * the timeout is reached the cache is flagged as deadlocked.  From
 	 * this point only new emergency objects will be allocated until the
 	 * asynchronous allocation completes and clears the deadlocked flag.
 	 *
 	 * NOTE(review): the timeout passed below is HZ / 10; confirm that
 	 * this matches the "very long timeout" described above.
 	 */
 	if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) {
 		rc = spl_emergency_alloc(skc, flags, obj);
 	} else {
 		remaining = wait_event_timeout(skc->skc_waitq,
 		    spl_cache_grow_wait(skc), HZ / 10);
 
 		/* Timed out with the grow still pending: flag a deadlock. */
 		if (!remaining) {
 			spin_lock(&skc->skc_lock);
 			if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) {
 				set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags);
 				skc->skc_obj_deadlock++;
 			}
 			spin_unlock(&skc->skc_lock);
 		}
 
 		rc = -ENOMEM;
 	}
 
 	return (rc);
 }
 
 /*
  * Refill a per-cpu magazine with objects from the slabs for this cache.
  * Ideally the magazine can be repopulated using existing objects which have
  * been released, however if we are unable to locate enough free objects new
  * slabs of objects will be created.  On success NULL is returned, otherwise
  * the address of a single emergency object is returned for use by the caller.
  */
 static void *
 spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags)
 {
 	spl_kmem_slab_t *sks;
 	int count = 0, rc, refill;
 	void *obj = NULL;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 
 	/* Never ask for more objects than the magazine has room for */
 	refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail);
 	spin_lock(&skc->skc_lock);
 
 	while (refill > 0) {
 		/* No slabs available we may need to grow the cache */
 		if (list_empty(&skc->skc_partial_list)) {
 			/*
 			 * Drop skc_lock and re-enable interrupts around the
 			 * call to spl_cache_grow(); interrupts are disabled
 			 * again before any result is examined.
 			 */
 			spin_unlock(&skc->skc_lock);
 
 			local_irq_enable();
 			rc = spl_cache_grow(skc, flags, &obj);
 			local_irq_disable();
 
 			/* Emergency object for immediate use by caller */
 			if (rc == 0 && obj != NULL)
 				return (obj);
 
 			if (rc)
 				goto out;
 
 			/* Rescheduled to different CPU skm is not local */
 			if (skm != skc->skc_mag[smp_processor_id()])
 				goto out;
 
 			/*
 			 * Potentially rescheduled to the same CPU but
 			 * allocations may have occurred from this CPU while
 			 * we were sleeping so recalculate max refill.
 			 */
 			refill = MIN(refill, skm->skm_size - skm->skm_avail);
 
 			spin_lock(&skc->skc_lock);
 			continue;
 		}
 
 		/* Grab the next available slab */
 		sks = list_entry((&skc->skc_partial_list)->next,
 		    spl_kmem_slab_t, sks_list);
 		ASSERT(sks->sks_magic == SKS_MAGIC);
 		ASSERT(sks->sks_ref < sks->sks_objs);
 		ASSERT(!list_empty(&sks->sks_free_list));
 
 		/*
 		 * Consume as many objects as needed to refill the requested
 		 * cache.  We must also be careful not to overfill it.
 		 * The ++count term only tallies the objects moved (for the
 		 * ASSERT below); it is pre-incremented from zero, so it
 		 * does not end the loop.
 		 */
 		while (sks->sks_ref < sks->sks_objs && refill-- > 0 &&
 		    ++count) {
 			ASSERT(skm->skm_avail < skm->skm_size);
 			ASSERT(count < skm->skm_size);
 			skm->skm_objs[skm->skm_avail++] =
 			    spl_cache_obj(skc, sks);
 		}
 
 		/* Move slab to skc_complete_list when full */
 		if (sks->sks_ref == sks->sks_objs) {
 			list_del(&sks->sks_list);
 			list_add(&sks->sks_list, &skc->skc_complete_list);
 		}
 	}
 
 	spin_unlock(&skc->skc_lock);
 	/* Every jump to out is taken after skc_lock was already released */
 out:
 	return (NULL);
 }
 
 /*
  * Hand a single object back to the slab it belongs to and reposition
  * that slab on the cache's lists according to how full it now is.
  */
 static void
 spl_cache_shrink(spl_kmem_cache_t *skc, void *obj)
 {
 	spl_kmem_obj_t *hdr;
 	spl_kmem_slab_t *slab;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 
 	hdr = spl_sko_from_obj(skc, obj);
 	ASSERT(hdr->sko_magic == SKO_MAGIC);
 
 	slab = hdr->sko_slab;
 	ASSERT(slab->sks_magic == SKS_MAGIC);
 	ASSERT(slab->sks_cache == skc);
 
 	/* Return the object to the slab's free list and fix up the counts */
 	list_add(&hdr->sko_list, &slab->sks_free_list);
 	slab->sks_age = jiffies;
 	slab->sks_ref--;
 	skc->skc_obj_alloc--;
 
 	/*
 	 * A slab that was full until this release goes to the head of
 	 * skc_partial_list, which keeps that list in quasi-full sorted
 	 * order: fuller slabs at the head, emptier slabs at the tail.
 	 */
 	if (slab->sks_ref == (slab->sks_objs - 1)) {
 		list_del(&slab->sks_list);
 		list_add(&slab->sks_list, &skc->skc_partial_list);
 	}
 
 	/*
 	 * A slab with no objects left in use is parked at the tail of
 	 * the partial list so reclamation can find and free it easily.
 	 */
 	if (slab->sks_ref == 0) {
 		list_del(&slab->sks_list);
 		list_add_tail(&slab->sks_list, &skc->skc_partial_list);
 		skc->skc_slab_alloc--;
 	}
 }
 
 /*
  * Allocate an object from the per-cpu magazine, or if the magazine
  * is empty directly allocate from a slab and repopulate the magazine.
  *
  * skc   - cache to allocate from; must not be flagged KMC_BIT_DESTROY.
  * flags - KM_* allocation flags, restricted to KM_PUBLIC_MASK.
  *
  * Returns the object, or NULL.  Unless KM_NOSLEEP is set both
  * allocation paths keep retrying while the result is NULL.  When the
  * cache has a constructor it is run on the object before it is returned.
  */
 void *
 spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags)
 {
 	spl_kmem_magazine_t *skm;
 	void *obj = NULL;
 
 	ASSERT0(flags & ~KM_PUBLIC_MASK);
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
 	/*
 	 * Allocate directly from a Linux slab.  All optimizations are left
 	 * to the underlying cache we only need to guarantee that KM_SLEEP
 	 * callers will never fail.
 	 */
 	if (skc->skc_flags & KMC_SLAB) {
 		struct kmem_cache *slc = skc->skc_linux_cache;
 		do {
 			obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
 		} while ((obj == NULL) && !(flags & KM_NOSLEEP));
 
 		if (obj != NULL) {
 			/*
 			 * Even though we leave everything up to the
 			 * underlying cache we still keep track of
 			 * how many objects we've allocated in it for
 			 * better debuggability.
 			 */
 			percpu_counter_inc(&skc->skc_linux_alloc);
 		}
 		goto ret;
 	}
 
 	local_irq_disable();
 
 restart:
 	/*
 	 * Safe to update per-cpu structure without lock, but
 	 * in the restart case we must be careful to reacquire
 	 * the local magazine since this may have changed
 	 * when we need to grow the cache.
 	 */
 	skm = skc->skc_mag[smp_processor_id()];
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 
 	if (likely(skm->skm_avail)) {
 		/* Object available in CPU cache, use it */
 		obj = skm->skm_objs[--skm->skm_avail];
 	} else {
 		/*
 		 * A non-NULL result is an emergency object for immediate
 		 * use.  NULL means try the magazine again, unless the
 		 * caller asked not to sleep.
 		 */
 		obj = spl_cache_refill(skc, skm, flags);
 		if ((obj == NULL) && !(flags & KM_NOSLEEP))
 			goto restart;
 
 		local_irq_enable();
 		goto ret;
 	}
 
 	local_irq_enable();
 	ASSERT(obj);
 	ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
 
 ret:
 	if (obj != NULL) {
 		if (skc->skc_ctor != NULL) {
 			/* Construct the object before handing it out */
 			skc->skc_ctor(obj, skc->skc_private, flags);
 		} else {
 			/* Pre-emptively migrate object to CPU L1 cache */
 			prefetchw(obj);
 		}
 	}
 
 	return (obj);
 }
 EXPORT_SYMBOL(spl_kmem_cache_alloc);
 
 /*
  * Free an object back to the local per-cpu magazine; there is no
  * guarantee that this is the same magazine the object was originally
  * allocated from.  We may need to flush entries from the magazine
  * back to the slabs to make space.
  */
 void
 spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
 {
 	spl_kmem_magazine_t *skm;
 	unsigned long flags;
 	int do_reclaim = 0;
 	int do_emergency = 0;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
 	/*
 	 * Run the destructor
 	 */
 	if (skc->skc_dtor)
 		skc->skc_dtor(obj, skc->skc_private);
 
 	/*
 	 * Free the object back to the underlying Linux slab.
 	 */
 	if (skc->skc_flags & KMC_SLAB) {
 		kmem_cache_free(skc->skc_linux_cache, obj);
 		percpu_counter_dec(&skc->skc_linux_alloc);
 		return;
 	}
 
 	/*
 	 * While a cache has outstanding emergency objects all freed objects
 	 * must be checked.  However, since emergency objects will never use
 	 * a virtual address these objects can be safely excluded as an
 	 * optimization.
 	 */
 	if (!is_vmalloc_addr(obj)) {
 		/* Sample the emergency object count under skc_lock */
 		spin_lock(&skc->skc_lock);
 		do_emergency = (skc->skc_obj_emergency > 0);
 		spin_unlock(&skc->skc_lock);
 
 		/* A zero return means obj was an emergency object; done */
 		if (do_emergency && (spl_emergency_free(skc, obj) == 0))
 			return;
 	}
 
 	local_irq_save(flags);
 
 	/*
 	 * Safe to update per-cpu structure without lock, but
 	 * no remote memory allocation tracking is being performed
 	 * it is entirely possible to allocate an object from one
 	 * CPU cache and return it to another.
 	 */
 	skm = skc->skc_mag[smp_processor_id()];
 	ASSERT(skm->skm_magic == SKM_MAGIC);
 
 	/*
 	 * Per-CPU cache full, flush it to make space for this object,
 	 * this may result in an empty slab which can be reclaimed once
 	 * interrupts are re-enabled.
 	 */
 	if (unlikely(skm->skm_avail >= skm->skm_size)) {
 		spl_cache_flush(skc, skm, skm->skm_refill);
 		do_reclaim = 1;
 	}
 
 	/* Available space in cache, use it */
 	skm->skm_objs[skm->skm_avail++] = obj;
 
 	local_irq_restore(flags);
 
 	/* Reclaim is deferred until interrupts have been restored */
 	if (do_reclaim)
 		spl_slab_reclaim(skc);
 }
 EXPORT_SYMBOL(spl_kmem_cache_free);
 
 /*
  * Best-effort reap of a single cache.  The current CPU's magazine is
  * flushed and the now-empty slabs are reclaimed.  Released objects may
  * simply end up back in a magazine or slab that must age out first; that
  * is acceptable, since thrashing slab creation and destruction is worse.
  *
  * Caches backed by a Linux slab (KMC_SLAB) are left alone.  If another
  * reap already holds KMC_BIT_REAPING this call does nothing.
  */
 void
 spl_kmem_cache_reap_now(spl_kmem_cache_t *skc)
 {
 	spl_kmem_magazine_t *skm;
 	unsigned long irq_flags;
 
 	ASSERT(skc->skc_magic == SKC_MAGIC);
 	ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
 
 	if (skc->skc_flags & KMC_SLAB)
 		return;
 
 	/* Hold a reference on the cache for the duration of the reap */
 	atomic_inc(&skc->skc_ref);
 
 	/* Only the caller that wins the REAPING bit does the work */
 	if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags) == 0) {
 		/* Empty this CPU's magazine with interrupts disabled */
 		local_irq_save(irq_flags);
 		skm = skc->skc_mag[smp_processor_id()];
 		spl_cache_flush(skc, skm, skm->skm_avail);
 		local_irq_restore(irq_flags);
 
 		/* Free every slab that is now empty */
 		spl_slab_reclaim(skc);
 
 		clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags);
 		smp_mb__after_atomic();
 		wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING);
 	}
 
 	atomic_dec(&skc->skc_ref);
 }
 EXPORT_SYMBOL(spl_kmem_cache_reap_now);
 
 /*
  * This is stubbed out for code consistency with other platforms.  There
  * is existing logic to prevent concurrent reaping so while this is ugly
  * it should do no harm.
  */
 int
 spl_kmem_cache_reap_active(void)
 {
 	/* Stub: unconditionally reports zero */
 	return (0);
 }
 EXPORT_SYMBOL(spl_kmem_cache_reap_active);
 
 /*
  * Reap all free slabs from all registered caches.
  */
 void
 spl_kmem_reap(void)
 {
 	spl_kmem_cache_t *skc = NULL;
 
 	down_read(&spl_kmem_cache_sem);
 	list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) {
 		spl_kmem_cache_reap_now(skc);
 	}
 	up_read(&spl_kmem_cache_sem);
 }
 EXPORT_SYMBOL(spl_kmem_reap);
 
 /*
  * Initialise the global cache list and its semaphore, then create the
  * taskq used by the kmem cache code.  Returns 0 on success or -ENOMEM
  * when the taskq could not be created.
  */
 int
 spl_kmem_cache_init(void)
 {
 	init_rwsem(&spl_kmem_cache_sem);
 	INIT_LIST_HEAD(&spl_kmem_cache_list);
 
 	spl_kmem_cache_taskq = taskq_create("spl_kmem_cache",
 	    spl_kmem_cache_kmem_threads, maxclsyspri,
 	    spl_kmem_cache_kmem_threads * 8, INT_MAX,
 	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
 
 	return (spl_kmem_cache_taskq == NULL ? -ENOMEM : 0);
 }
 
 /*
  * Tear down the taskq created by spl_kmem_cache_init().
  */
 void
 spl_kmem_cache_fini(void)
 {
 	taskq_destroy(spl_kmem_cache_taskq);
 }
diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
index 0eb16ae340bc..d583b92e45ca 100644
--- a/module/os/linux/spl/spl-taskq.c
+++ b/module/os/linux/spl/spl-taskq.c
@@ -1,1847 +1,1860 @@
 // SPDX-License-Identifier: GPL-2.0-or-later
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  *
  *  Solaris Porting Layer (SPL) Task Queue Implementation.
  */
 /*
  * Copyright (c) 2024, Klara Inc.
  * Copyright (c) 2024, Syneto
  */
 
 #include <sys/timer.h>
 #include <sys/taskq.h>
 #include <sys/kmem.h>
 #include <sys/tsd.h>
 #include <sys/trace_spl.h>
 #include <sys/time.h>
 #include <sys/atomic.h>
 #include <sys/kstat.h>
 #include <linux/cpuhotplug.h>
 #include <linux/mod_compat.h>
 
 /*
  * Linux 6.2 renamed del_timer_sync() to timer_delete_sync(); on kernels
  * without the new name, map it onto the old one.
  */
 #ifndef HAVE_TIMER_DELETE_SYNC
 #define	timer_delete_sync(t)	del_timer_sync(t)
 #endif
 
 /*
  * Named kstat fields describing a taskq, grouped into static values,
  * gauges and monotonic counters.  taskq_kstats_template is initialised
  * positionally, so its entries must stay in this field order.
  */
 typedef struct taskq_kstats {
 	/* static values, for completeness */
 	kstat_named_t tqks_threads_max;
 	kstat_named_t tqks_entry_pool_min;
 	kstat_named_t tqks_entry_pool_max;
 
 	/* gauges (inc/dec counters, current value) */
 	kstat_named_t tqks_threads_active;
 	kstat_named_t tqks_threads_idle;
 	kstat_named_t tqks_threads_total;
 	kstat_named_t tqks_tasks_pending;
 	kstat_named_t tqks_tasks_priority;
 	kstat_named_t tqks_tasks_total;
 	kstat_named_t tqks_tasks_delayed;
 	kstat_named_t tqks_entries_free;
 
 	/* counters (inc only, since taskq creation) */
 	kstat_named_t tqks_threads_created;
 	kstat_named_t tqks_threads_destroyed;
 	kstat_named_t tqks_tasks_dispatched;
 	kstat_named_t tqks_tasks_dispatched_delayed;
 	kstat_named_t tqks_tasks_executed_normal;
 	kstat_named_t tqks_tasks_executed_priority;
 	kstat_named_t tqks_tasks_executed;
 	kstat_named_t tqks_tasks_delayed_requeued;
 	kstat_named_t tqks_tasks_cancelled;
 	kstat_named_t tqks_thread_wakeups;
 	kstat_named_t tqks_thread_wakeups_nowork;
 	kstat_named_t tqks_thread_sleeps;
 } taskq_kstats_t;
 
 /*
  * Name and data type for every field of taskq_kstats_t.  The initialisers
  * are positional: each row lines up with the struct member in the same
  * position, so the two must be kept in the same order.
  */
 static taskq_kstats_t taskq_kstats_template = {
 	/* static values */
 	{ "threads_max",		KSTAT_DATA_UINT64 },
 	{ "entry_pool_min",		KSTAT_DATA_UINT64 },
 	{ "entry_pool_max",		KSTAT_DATA_UINT64 },
 	/* gauges */
 	{ "threads_active",		KSTAT_DATA_UINT64 },
 	{ "threads_idle",		KSTAT_DATA_UINT64 },
 	{ "threads_total",		KSTAT_DATA_UINT64 },
 	{ "tasks_pending",		KSTAT_DATA_UINT64 },
 	{ "tasks_priority",		KSTAT_DATA_UINT64 },
 	{ "tasks_total",		KSTAT_DATA_UINT64 },
 	{ "tasks_delayed",		KSTAT_DATA_UINT64 },
 	{ "entries_free",		KSTAT_DATA_UINT64 },
 
 	/* counters */
 	{ "threads_created",		KSTAT_DATA_UINT64 },
 	{ "threads_destroyed",		KSTAT_DATA_UINT64 },
 	{ "tasks_dispatched",		KSTAT_DATA_UINT64 },
 	{ "tasks_dispatched_delayed",	KSTAT_DATA_UINT64 },
 	{ "tasks_executed_normal",	KSTAT_DATA_UINT64 },
 	{ "tasks_executed_priority",	KSTAT_DATA_UINT64 },
 	{ "tasks_executed",		KSTAT_DATA_UINT64 },
 	{ "tasks_delayed_requeued",	KSTAT_DATA_UINT64 },
 	{ "tasks_cancelled",		KSTAT_DATA_UINT64 },
 	{ "thread_wakeups",		KSTAT_DATA_UINT64 },
 	{ "thread_wakeups_nowork",	KSTAT_DATA_UINT64 },
 	{ "thread_sleeps",		KSTAT_DATA_UINT64 },
 };
 
 /*
  * Statistic helpers.  TQSTAT_INC/TQSTAT_DEC add +1/-1 to the wmsum named
  * tqs_<stat> in tq->tq_sums.  Macro arguments are parenthesised so that
  * any pointer expression may be passed for tq or t.
  */
 #define	TQSTAT_INC(tq, stat)	wmsum_add(&(tq)->tq_sums.tqs_##stat, 1)
 #define	TQSTAT_DEC(tq, stat)	wmsum_add(&(tq)->tq_sums.tqs_##stat, -1)
 
 /*
  * Apply mod (TQSTAT_INC or TQSTAT_DEC) to the gauge matching the list
  * recorded in t->tqent_flags.  An entry flagged TQENT_LIST_NONE must not
  * be linked on any list.
  */
 #define	_TQSTAT_MOD_LIST(mod, tq, t) do { \
 	switch ((t)->tqent_flags & TQENT_LIST_MASK) {			\
 	case TQENT_LIST_NONE: ASSERT(list_empty(&(t)->tqent_list)); break;\
 	case TQENT_LIST_PENDING: mod(tq, tasks_pending); break;		\
 	case TQENT_LIST_PRIORITY: mod(tq, tasks_priority); break;	\
 	case TQENT_LIST_DELAY: mod(tq, tasks_delayed); break;		\
 	}								\
 } while (0)
 #define	TQSTAT_INC_LIST(tq, t)	_TQSTAT_MOD_LIST(TQSTAT_INC, tq, t)
 #define	TQSTAT_DEC_LIST(tq, t)	_TQSTAT_MOD_LIST(TQSTAT_DEC, tq, t)
 
 /*
  * Replace the list bits of t->tqent_flags with l, leaving all other flag
  * bits untouched.  The trailing semicolon is kept on purpose so that every
  * existing use of the macro keeps compiling unchanged.
  */
 #define	TQENT_SET_LIST(t, l)	\
 	(t)->tqent_flags = ((t)->tqent_flags & ~TQENT_LIST_MASK) | (l);
 
 /*
  * Module parameters.  Each is registered with module_param() using the
  * permission bits shown; spl_taskq_thread_dynamic is the only one
  * registered 0444, the rest are 0644.
  */
 static int spl_taskq_thread_bind = 0;
 module_param(spl_taskq_thread_bind, int, 0644);
 MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default");
 
 /* Defaults to 5000 (milliseconds, per the parameter name) */
 static uint_t spl_taskq_thread_timeout_ms = 5000;
 module_param(spl_taskq_thread_timeout_ms, uint, 0644);
 MODULE_PARM_DESC(spl_taskq_thread_timeout_ms,
 	"Minimum idle threads exit interval for dynamic taskqs");
 
 static int spl_taskq_thread_dynamic = 1;
 module_param(spl_taskq_thread_dynamic, int, 0444);
 MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads");
 
 static int spl_taskq_thread_priority = 1;
 module_param(spl_taskq_thread_priority, int, 0644);
 MODULE_PARM_DESC(spl_taskq_thread_priority,
 	"Allow non-default priority for taskq threads");
 
 static uint_t spl_taskq_thread_sequential = 4;
 module_param(spl_taskq_thread_sequential, uint, 0644);
 MODULE_PARM_DESC(spl_taskq_thread_sequential,
 	"Create new taskq threads after N sequential tasks");
 
 /*
  * Global system-wide dynamic task queue available for all consumers. This
  * taskq is not intended for long-running tasks; instead, a dedicated taskq
  * should be created.
  */
 taskq_t *system_taskq;
 EXPORT_SYMBOL(system_taskq);
 /* Global dynamic task queue for long delay */
 taskq_t *system_delay_taskq;
 EXPORT_SYMBOL(system_delay_taskq);
 
 /* Private dedicated taskq for creating new taskq threads on demand. */
 static taskq_t *dynamic_taskq;
 static taskq_thread_t *taskq_thread_create(taskq_t *);
 
 /* Multi-callback id for cpu hotplugging. */
 static int spl_taskq_cpuhp_state;
 
 /* List of all taskqs */
 LIST_HEAD(tq_list);
 /* NOTE(review): presumably guards tq_list -- confirm against its users */
 struct rw_semaphore tq_list_sem;
 /* TSD key; taskq_of_curthread() reads the calling thread's taskq via it */
 static uint_t taskq_tsd;
 
 /*
  * Translate taskq dispatch flags into the matching kmem allocation flag.
  * TQ_NOSLEEP takes precedence over TQ_PUSHPAGE; with neither set the
  * result is KM_SLEEP.
  */
 static int
 task_km_flags(uint_t flags)
 {
 	int kmflags = KM_SLEEP;
 
 	if (flags & TQ_NOSLEEP)
 		kmflags = KM_NOSLEEP;
 	else if (flags & TQ_PUSHPAGE)
 		kmflags = KM_PUSHPAGE;
 
 	return (kmflags);
 }
 
 /*
  * taskq_find_by_name - Find the largest instance number of a named taskq.
  */
 static int
 taskq_find_by_name(const char *name)
 {
 	struct list_head *tql = NULL;
 	taskq_t *tq;
 
 	list_for_each_prev(tql, &tq_list) {
 		tq = list_entry(tql, taskq_t, tq_taskqs);
 		if (strcmp(name, tq->tq_name) == 0)
 			return (tq->tq_instance);
 	}
 	return (-1);
 }
 
 /*
  * NOTE: Must be called with tq->tq_lock held, returns a taskq_ent_t which
  * is not attached to the free, work, or pending taskq lists, or NULL when
  * no entry could be obtained.  tq->tq_lock may be dropped and reacquired
  * while this function runs; *irqflags holds the saved interrupt state.
  */
 static taskq_ent_t *
 task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags)
 {
 	taskq_ent_t *t;
 	int count = 0;
 
 	ASSERT(tq);
 retry:
 	/* Acquire taskq_ent_t's from free list if available */
 	if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) {
 		t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
 
 		ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
 		ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL));
 		ASSERT(!timer_pending(&t->tqent_timer));
 
 		list_del_init(&t->tqent_list);
 		TQSTAT_DEC(tq, entries_free);
 		return (t);
 	}
 
 	/* Free list is empty and memory allocations are prohibited */
 	if (flags & TQ_NOALLOC)
 		return (NULL);
 
 	/* Hit maximum taskq_ent_t pool size */
 	if (tq->tq_nalloc >= tq->tq_maxalloc) {
 		if (flags & TQ_NOSLEEP)
 			return (NULL);
 
 		/*
 		 * Sleep periodically polling the free list for an available
 		 * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed
 		 * but we cannot block forever waiting for a taskq_ent_t to
 		 * show up in the free list, otherwise a deadlock can happen.
 		 *
 		 * Therefore, we need to allocate a new task even if the number
 		 * of allocated tasks is above tq->tq_maxalloc, but we still
 		 * end up delaying the task allocation by one second, thereby
 		 * throttling the task dispatch rate.
 		 *
 		 * The one second comes from up to 100 retries, each sleeping
 		 * HZ / 100 ticks with tq_lock released.
 		 */
 		spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
 		schedule_timeout_interruptible(HZ / 100);
 		spin_lock_irqsave_nested(&tq->tq_lock, *irqflags,
 		    tq->tq_lock_class);
 		if (count < 100) {
 			count++;
 			goto retry;
 		}
 	}
 
 	/* Allocate with tq_lock released, then retake it */
 	spin_unlock_irqrestore(&tq->tq_lock, *irqflags);
 	t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags));
 	spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class);
 
 	if (t) {
 		taskq_init_ent(t);
 		tq->tq_nalloc++;
 	}
 
 	return (t);
 }
 
 /*
  * Release the memory of a taskq_ent_t and drop the taskq's allocation
  * count.
  *
  * NOTE: tq->tq_lock must be held by the caller, and the entry must
  * already be unlinked from the free, work, and pending taskq lists.
  */
 static void
 task_free(taskq_t *tq, taskq_ent_t *t)
 {
 	ASSERT(tq);
 	ASSERT(t);
 	ASSERT(list_empty(&t->tqent_list));
 	ASSERT(!timer_pending(&t->tqent_timer));
 
 	tq->tq_nalloc--;
 	kmem_free(t, sizeof (taskq_ent_t));
 }
 
 /*
  * Retire a finished taskq_ent_t: free it when the taskq holds more than
  * tq_minalloc entries, otherwise scrub it and park it on the free list
  * for reuse.
  *
  * NOTE: tq->tq_lock must be held by the caller.
  */
 static void
 task_done(taskq_t *tq, taskq_ent_t *t)
 {
 	ASSERT(tq);
 	ASSERT(t);
 	ASSERT(list_empty(&t->tqent_list));
 
 	/* Wake tasks blocked in taskq_wait_id() */
 	wake_up_all(&t->tqent_waitq);
 
 	/* Above the minimum pool size: give the memory back */
 	if (tq->tq_nalloc > tq->tq_minalloc) {
 		task_free(tq, t);
 		return;
 	}
 
 	/* Otherwise reset the entry and keep it for a later dispatch */
 	t->tqent_id = TASKQID_INVALID;
 	t->tqent_func = NULL;
 	t->tqent_arg = NULL;
 	t->tqent_flags = 0;
 
 	list_add_tail(&t->tqent_list, &tq->tq_free_list);
 	TQSTAT_INC(tq, entries_free);
 }
 
 /*
  * When a delayed task timer expires remove it from the delay list and
  * add it to the priority list in order for immediate processing.
  */
 static void
 task_expire_impl(taskq_ent_t *t)
 {
 	taskq_ent_t *w;
 	taskq_t *tq = t->tqent_taskq;
 	struct list_head *l = NULL;
 	unsigned long flags;
 
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 
 	/*
 	 * taskq_cancel_id() unlinks the entry and sets TQENT_FLAG_CANCEL
 	 * under tq_lock, so a cancelled entry must not be requeued here.
 	 */
 	if (t->tqent_flags & TQENT_FLAG_CANCEL) {
 		ASSERT(list_empty(&t->tqent_list));
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 		return;
 	}
 
 	t->tqent_birth = jiffies;
 	DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
 
 	/*
 	 * The priority list must be maintained in strict task id order
 	 * from lowest to highest for lowest_id to be easily calculable.
 	 * Walk backwards from the tail and insert right after the first
 	 * entry found with a smaller id.
 	 */
 	list_del(&t->tqent_list);
 	list_for_each_prev(l, &tq->tq_prio_list) {
 		w = list_entry(l, taskq_ent_t, tqent_list);
 		if (w->tqent_id < t->tqent_id) {
 			list_add(&t->tqent_list, l);
 			break;
 		}
 	}
 	/* No entry with a smaller id (or empty list): insert at the head */
 	if (l == &tq->tq_prio_list)
 		list_add(&t->tqent_list, &tq->tq_prio_list);
 
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	wake_up(&tq->tq_work_waitq);
 
 	TQSTAT_INC(tq, tasks_delayed_requeued);
 }
 
 /*
  * Timer callback: recover the taskq_ent_t that embeds the expired timer
  * and hand it to task_expire_impl().
  */
 static void
 task_expire(struct timer_list *tl)
 {
 	taskq_ent_t *t = from_timer(t, tl, tqent_timer);
 
 	task_expire_impl(t);
 }
 
 /*
  * Compute the lowest taskqid_t that has not fully completed.  Candidates
  * are the head entries of the pending, priority and delay lists and the
  * id held by the first thread on the active list; when all of them are
  * empty the result is tq_next_id.
  */
 static taskqid_t
 taskq_lowest_id(taskq_t *tq)
 {
 	taskqid_t lowest = tq->tq_next_id;
 	taskq_thread_t *worker;
 	taskq_ent_t *head;
 
 	if (!list_empty(&tq->tq_pend_list)) {
 		head = list_entry(tq->tq_pend_list.next, taskq_ent_t,
 		    tqent_list);
 		if (head->tqent_id < lowest)
 			lowest = head->tqent_id;
 	}
 
 	if (!list_empty(&tq->tq_prio_list)) {
 		head = list_entry(tq->tq_prio_list.next, taskq_ent_t,
 		    tqent_list);
 		if (head->tqent_id < lowest)
 			lowest = head->tqent_id;
 	}
 
 	if (!list_empty(&tq->tq_delay_list)) {
 		head = list_entry(tq->tq_delay_list.next, taskq_ent_t,
 		    tqent_list);
 		if (head->tqent_id < lowest)
 			lowest = head->tqent_id;
 	}
 
 	if (!list_empty(&tq->tq_active_list)) {
 		worker = list_entry(tq->tq_active_list.next, taskq_thread_t,
 		    tqt_active_list);
 		ASSERT(worker->tqt_id != TASKQID_INVALID);
 		if (worker->tqt_id < lowest)
 			lowest = worker->tqt_id;
 	}
 
 	return (lowest);
 }
 
 /*
  * Link a worker thread into tq_active_list so that the list stays sorted
  * by increasing tqt_id.
  */
 static void
 taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt)
 {
 	struct list_head *head;
 	struct list_head *pos;
 	taskq_thread_t *cur;
 
 	ASSERT(tq);
 	ASSERT(tqt);
 
 	/*
 	 * Scan from the tail for the first thread with a smaller id.  If
 	 * none exists pos ends up at the list head, which makes the new
 	 * thread the first element.
 	 */
 	head = &tq->tq_active_list;
 	for (pos = head->prev; pos != head; pos = pos->prev) {
 		cur = list_entry(pos, taskq_thread_t, tqt_active_list);
 		if (cur->tqt_id < tqt->tqt_id)
 			break;
 	}
 
 	list_add(&tqt->tqt_active_list, pos);
 }
 
 /*
  * Look up the task with the given id on one list.  The list has to be
  * sorted from lowest to highest task id, which lets the scan stop at the
  * first entry whose id is not below the one requested.  Returns the
  * entry, or NULL if it is not on the list.
  */
 static taskq_ent_t *
 taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id)
 {
 	struct list_head *pos;
 	taskq_ent_t *ent;
 
 	list_for_each(pos, lh) {
 		ent = list_entry(pos, taskq_ent_t, tqent_list);
 
 		if (ent->tqent_id < id)
 			continue;
 
 		/* Either an exact hit or already past where it would be */
 		return (ent->tqent_id == id ? ent : NULL);
 	}
 
 	return (NULL);
 }
 
 /*
  * Locate a dispatched task by id, whatever state it is in:
  *   - still queued (delay, priority or pending list): the entry itself
  *   - currently being run by a worker thread: ERR_PTR(-EBUSY)
  *   - anything else (e.g. already run): NULL
  */
 static taskq_ent_t *
 taskq_find(taskq_t *tq, taskqid_t id)
 {
 	struct list_head *queued[3];
 	struct list_head *pos;
 	taskq_thread_t *worker;
 	taskq_ent_t *ent;
 	int i;
 
 	/* Same search order as always: delay, then priority, then pending */
 	queued[0] = &tq->tq_delay_list;
 	queued[1] = &tq->tq_prio_list;
 	queued[2] = &tq->tq_pend_list;
 
 	for (i = 0; i < 3; i++) {
 		ent = taskq_find_list(tq, queued[i], id);
 		if (ent != NULL)
 			return (ent);
 	}
 
 	list_for_each(pos, &tq->tq_active_list) {
 		worker = list_entry(pos, taskq_thread_t, tqt_active_list);
 		if (worker->tqt_id != id)
 			continue;
 
 		/*
 		 * tqt_task only has two valid fields, so rather than
 		 * exposing it a non-NULL error pointer is returned to
 		 * prevent misuse.
 		 */
 		return (ERR_PTR(-EBUSY));
 	}
 
 	return (NULL);
 }
 
 /*
  * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and
  * taskq_wait() functions below.
  *
  * Taskq waiting is accomplished by tracking the lowest outstanding task
  * id and the next available task id.  As tasks are dispatched they are
  * added to the tail of the pending, priority, or delay lists.  As worker
  * threads become available the tasks are removed from the heads of these
  * lists and linked to the worker threads.  This ensures the lists are
  * kept sorted by lowest to highest task id.
  *
  * Therefore the lowest outstanding task id can be quickly determined by
  * checking the head item from all of these lists.  This value is stored
  * with the taskq as the lowest id.  It only needs to be recalculated when
  * either the task with the current lowest id completes or is canceled.
  *
  * By blocking until the lowest task id exceeds the passed task id the
  * taskq_wait_outstanding() function can be easily implemented.  Similarly,
  * by blocking until the lowest task id matches the next task id taskq_wait()
  * can be implemented.
  *
  * Callers should be aware that when there are multiple worker threads it
  * is possible for larger task ids to complete before smaller ones.  Also
  * when the taskq contains delay tasks with small task ids callers may
  * block for a considerable length of time waiting for them to expire and
  * execute.
  */
 /*
  * Wait condition for taskq_wait_id(): non-zero once taskq_find() no
  * longer locates the task.  The lookup is done under tq_lock.
  */
 static int
 taskq_wait_id_check(taskq_t *tq, taskqid_t id)
 {
 	unsigned long irqflags;
 	int gone;
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 	gone = (taskq_find(tq, id) == NULL);
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 
 	return (gone);
 }
 
 /*
  * The taskq_wait_id() function blocks until the passed task id completes.
  * This does not guarantee that all lower task ids have completed.
  */
 void
 taskq_wait_id(taskq_t *tq, taskqid_t id)
 {
 	/* Sleep on tq_wait_waitq until taskq_find() stops finding the id */
 	wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
 }
 EXPORT_SYMBOL(taskq_wait_id);
 
 /*
  * Wait condition for taskq_wait_outstanding(): non-zero once the lowest
  * outstanding task id has moved past id.  Evaluated under tq_lock.
  */
 static int
 taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
 {
 	unsigned long irqflags;
 	int passed;
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 	passed = (id < tq->tq_lowest_id);
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 
 	return (passed);
 }
 
 /*
  * Block until the taskq's lowest outstanding task id has moved past
  * 'id' (see taskq_wait_outstanding_check()).  Task ids are handed out
  * monotonically at dispatch time.  Passing zero selects the most
  * recently assigned id (tq_next_id - 1), i.e. everything dispatched up
  * to this point but nothing dispatched afterwards.
  */
 void
 taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
 {
 	if (id == 0)
 		id = tq->tq_next_id - 1;
 
 	wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
 }
 EXPORT_SYMBOL(taskq_wait_outstanding);
 
 /*
  * Wait condition for taskq_wait(): non-zero when the lowest outstanding
  * id equals the next id to be assigned.  Evaluated under tq_lock.
  */
 static int
 taskq_wait_check(taskq_t *tq)
 {
 	unsigned long irqflags;
 	int drained;
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 	drained = (tq->tq_lowest_id == tq->tq_next_id);
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 
 	return (drained);
 }
 
 /*
  * The taskq_wait() function will block until the taskq is empty.
  * This means that if a taskq re-dispatches work to itself taskq_wait()
  * callers will block indefinitely.
  */
 void
 taskq_wait(taskq_t *tq)
 {
 	/* Sleep on tq_wait_waitq until taskq_wait_check() reports true */
 	wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
 }
 EXPORT_SYMBOL(taskq_wait);
 
 /*
  * Return non-zero when the taskq stored under the taskq_tsd key for
  * thread t is tq.
  */
 int
 taskq_member(taskq_t *tq, kthread_t *t)
 {
 	taskq_t *owner = (taskq_t *)tsd_get_by_thread(taskq_tsd, t);
 
 	return (owner == tq);
 }
 EXPORT_SYMBOL(taskq_member);
 
 /*
  * Return the value stored under the taskq_tsd key for the calling thread.
  */
 taskq_t *
 taskq_of_curthread(void)
 {
 	return (tsd_get(taskq_tsd));
 }
 EXPORT_SYMBOL(taskq_of_curthread);
 
 /*
- * Cancel an already dispatched task given the task id.  Still pending tasks
- * will be immediately canceled, and if the task is active the function will
- * block until it completes.  Preallocated tasks which are canceled must be
- * freed by the caller.
+ * Cancel a dispatched task. Pending tasks are cancelled immediately.
+ * If the task is running, behavior depends on wait parameter:
+ *   - wait=B_TRUE: Block until task completes
+ *   - wait=B_FALSE: Return EBUSY immediately
+ *
+ * Return values:
+ *   0      - Cancelled before execution. Caller must release resources.
+ *   EBUSY  - Task running (wait=B_FALSE only). Will self-cleanup.
+ *   ENOENT - Not found, or completed after waiting. Already cleaned up.
+ *
+ * Note: wait=B_TRUE returns ENOENT (not EBUSY) after waiting because
+ * the task no longer exists. This distinguishes "cancelled before run"
+ * from "completed naturally" for proper resource management.
  */
 int
-taskq_cancel_id(taskq_t *tq, taskqid_t id)
+taskq_cancel_id(taskq_t *tq, taskqid_t id, boolean_t wait)
 {
 	taskq_ent_t *t;
 	int rc = ENOENT;
 	unsigned long flags;
 
 	ASSERT(tq);
 
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 	/*
 	 * taskq_find() yields the entry while it is still queued,
 	 * ERR_PTR(-EBUSY) while a worker thread is running it, and NULL
 	 * otherwise.  Only the still-queued case is cancelled here.
 	 */
 	t = taskq_find(tq, id);
 	if (t && t != ERR_PTR(-EBUSY)) {
 		list_del_init(&t->tqent_list);
 		TQSTAT_DEC_LIST(tq, t);
 		TQSTAT_DEC(tq, tasks_total);
 
 		t->tqent_flags |= TQENT_FLAG_CANCEL;
 		TQSTAT_INC(tq, tasks_cancelled);
 
 		/*
 		 * When canceling the lowest outstanding task id we
 		 * must recalculate the new lowest outstanding id.
 		 */
 		if (tq->tq_lowest_id == t->tqent_id) {
 			tq->tq_lowest_id = taskq_lowest_id(tq);
 			ASSERT3S(tq->tq_lowest_id, >, t->tqent_id);
 		}
 
 		/*
 		 * The task_expire() function takes the tq->tq_lock so drop
 		 * the lock before synchronously cancelling the timer.
 		 *
 		 * Always call timer_delete_sync() unconditionally. A
 		 * timer_pending() check would be insufficient and unsafe.
 		 * When a timer expires, it is immediately dequeued from the
 		 * timer wheel (timer_pending() returns FALSE), but the
 		 * callback (task_expire) may not run until later.
 		 *
 		 * The race window:
 		 * 1) Timer expires and is dequeued - timer_pending() now
 		 *    returns FALSE
 		 * 2) task_done() is called below, freeing the task, sets
 		 *    tqent_func = NULL and clears flags including CANCEL
 		 * 3) Timer callback finally runs, sees no CANCEL flag,
 		 *    queues task to prio_list
 		 * 4) Worker thread attempts to execute NULL tqent_func
 		 *    and panics
 		 *
 		 * timer_delete_sync() prevents this by ensuring the timer
 		 * callback completes before the task is freed.
 		 */
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 		timer_delete_sync(&t->tqent_timer);
 		spin_lock_irqsave_nested(&tq->tq_lock, flags,
 		    tq->tq_lock_class);
 
 		/* Preallocated entries are not recycled or freed here */
 		if (!(t->tqent_flags & TQENT_FLAG_PREALLOC))
 			task_done(tq, t);
 
 		rc = 0;
 	}
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	/* The running-task case is handled with tq_lock released */
 	if (t == ERR_PTR(-EBUSY)) {
-		taskq_wait_id(tq, id);
-		rc = EBUSY;
+		if (wait) {
+			taskq_wait_id(tq, id);
+			rc = ENOENT;  /* Completed, no longer exists */
+		} else {
+			rc = EBUSY;   /* Still running */
+		}
 	}
 
 	return (rc);
 }
 EXPORT_SYMBOL(taskq_cancel_id);
 
 static int taskq_thread_spawn(taskq_t *tq);
 
 /*
  * Queue func(arg) on tq for execution by a worker thread.
  *
  * flags select the allocation behaviour (passed on to task_alloc()) and
  * the queueing position: TQ_NOQUEUE puts the task at the head of the
  * priority list, TQ_FRONT at its tail, anything else at the tail of the
  * pending list.
  *
  * Returns the id assigned to the task, or TASKQID_INVALID when the taskq
  * is not TASKQ_ACTIVE, no entry could be allocated, or the TQ_NOQUEUE
  * check below gives up.
  */
 taskqid_t
 taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
 {
 	taskq_ent_t *t;
 	taskqid_t rc = TASKQID_INVALID;
 	unsigned long irqflags;
 
 	ASSERT(tq);
 	ASSERT(func);
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 
 	/* Taskq being destroyed and all tasks drained */
 	if (!(tq->tq_flags & TASKQ_ACTIVE))
 		goto out;
 
 	/* Do not queue the task unless there is idle thread for it */
 	ASSERT(tq->tq_nactive <= tq->tq_nthreads);
 	if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
 		/*
 		 * Dynamic taskq may be able to spawn another thread.
 		 * NOTE(review): a zero return presumably means no thread
 		 * was spawned -- confirm against taskq_thread_spawn().
 		 */
 		if (taskq_thread_spawn(tq) == 0)
 			goto out;
 	}
 
 	/* May drop and retake tq_lock; irqflags is updated to match */
 	if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
 		goto out;
 
 	spin_lock(&t->tqent_lock);
 
 	/* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
 	if (flags & TQ_NOQUEUE) {
 		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
 		list_add(&t->tqent_list, &tq->tq_prio_list);
 	/* Queue to the priority list instead of the pending list */
 	} else if (flags & TQ_FRONT) {
 		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
 		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
 	} else {
 		TQENT_SET_LIST(t, TQENT_LIST_PENDING);
 		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
 	}
 	TQSTAT_INC_LIST(tq, t);
 	TQSTAT_INC(tq, tasks_total);
 
 	/* Assign the next id; it is also the value returned to the caller */
 	t->tqent_id = rc = tq->tq_next_id;
 	tq->tq_next_id++;
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	t->tqent_taskq = tq;
 	/* Not a delayed task, so no timer callback or expiry is set */
 	t->tqent_timer.function = NULL;
 	t->tqent_timer.expires = 0;
 
 	t->tqent_birth = jiffies;
 	DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
 
 	ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
 
 	spin_unlock(&t->tqent_lock);
 
 	wake_up(&tq->tq_work_waitq);
 
 	TQSTAT_INC(tq, tasks_dispatched);
 
 	/* Spawn additional taskq threads if required. */
 	if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads)
 		(void) taskq_thread_spawn(tq);
 out:
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 	return (rc);
 }
 EXPORT_SYMBOL(taskq_dispatch);
 
 /*
  * Dispatch func(arg) on tq once a timer armed for expire_time fires.
  *
  * The entry is placed on tq_delay_list and its timer is armed with
  * task_expire() as the callback; expire_time is stored unmodified in
  * tqent_timer.expires.
  *
  * Returns the id assigned to the task, or TASKQID_INVALID when the taskq
  * is no longer TASKQ_ACTIVE or task_alloc() returned NULL.
  */
 taskqid_t
 taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
     uint_t flags, clock_t expire_time)
 {
 	taskqid_t rc = TASKQID_INVALID;
 	taskq_ent_t *t;
 	unsigned long irqflags;
 
 	ASSERT(tq);
 	ASSERT(func);
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 
 	/* Taskq being destroyed and all tasks drained */
 	if (!(tq->tq_flags & TASKQ_ACTIVE))
 		goto out;
 
 	if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
 		goto out;
 
 	/* tqent_lock nests inside tq_lock while the entry is set up. */
 	spin_lock(&t->tqent_lock);
 
 	/* Queue to the delay list for subsequent execution */
 	list_add_tail(&t->tqent_list, &tq->tq_delay_list);
 	TQENT_SET_LIST(t, TQENT_LIST_DELAY);
 	TQSTAT_INC_LIST(tq, t);
 	TQSTAT_INC(tq, tasks_total);
 
 	t->tqent_id = rc = tq->tq_next_id;
 	tq->tq_next_id++;
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	t->tqent_taskq = tq;
 	/* Arm the timer only after every field the callback may read is set. */
 	t->tqent_timer.function = task_expire;
 	t->tqent_timer.expires = (unsigned long)expire_time;
 	add_timer(&t->tqent_timer);
 
 	ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
 
 	spin_unlock(&t->tqent_lock);
 
 	TQSTAT_INC(tq, tasks_dispatched_delayed);
 
 	/* Spawn additional taskq threads if required. */
 	if (tq->tq_nactive == tq->tq_nthreads)
 		(void) taskq_thread_spawn(tq);
 out:
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 	return (rc);
 }
 EXPORT_SYMBOL(taskq_dispatch_delay);
 
 /*
  * Dispatch func(arg) on tq using the caller-provided entry t instead of
  * allocating one.  The entry is flagged TQENT_FLAG_PREALLOC so the taskq
  * does not free it.
  *
  * Nothing is returned.  When the taskq is no longer TASKQ_ACTIVE,
  * t->tqent_id is set to TASKQID_INVALID.  When TQ_NOQUEUE is given, all
  * threads are active and taskq_thread_spawn() returns 0, the function
  * returns without queuing and without touching t.
  */
 void
 taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags,
     taskq_ent_t *t)
 {
 	unsigned long irqflags;
 	ASSERT(tq);
 	ASSERT(func);
 
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
 	    tq->tq_lock_class);
 
 	/* Taskq being destroyed and all tasks drained */
 	if (!(tq->tq_flags & TASKQ_ACTIVE)) {
 		t->tqent_id = TASKQID_INVALID;
 		goto out;
 	}
 
 	if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
 		/* Dynamic taskq may be able to spawn another thread */
 		if (taskq_thread_spawn(tq) == 0)
 			goto out;
 		/* A thread is on its way: queue on the priority list. */
 		flags |= TQ_FRONT;
 	}
 
 	spin_lock(&t->tqent_lock);
 
 	/*
 	 * Make sure the entry is not on some other taskq; it is important to
 	 * ASSERT() under lock
 	 */
 	ASSERT(taskq_empty_ent(t));
 
 	/*
 	 * Mark it as a prealloc'd task.  This is important
 	 * to ensure that we don't free it later.
 	 */
 	t->tqent_flags |= TQENT_FLAG_PREALLOC;
 
 	/* Queue to the priority list instead of the pending list */
 	if (flags & TQ_FRONT) {
 		TQENT_SET_LIST(t, TQENT_LIST_PRIORITY);
 		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
 	} else {
 		TQENT_SET_LIST(t, TQENT_LIST_PENDING);
 		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
 	}
 	TQSTAT_INC_LIST(tq, t);
 	TQSTAT_INC(tq, tasks_total);
 
 	t->tqent_id = tq->tq_next_id;
 	tq->tq_next_id++;
 	t->tqent_func = func;
 	t->tqent_arg = arg;
 	t->tqent_taskq = tq;
 
 	t->tqent_birth = jiffies;
 	DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
 
 	spin_unlock(&t->tqent_lock);
 
 	wake_up(&tq->tq_work_waitq);
 
 	TQSTAT_INC(tq, tasks_dispatched);
 
 	/* Spawn additional taskq threads if required. */
 	if (tq->tq_nactive == tq->tq_nthreads)
 		(void) taskq_thread_spawn(tq);
 out:
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 }
 EXPORT_SYMBOL(taskq_dispatch_ent);
 
 /*
  * Report whether entry t is currently unlinked, i.e. its tqent_list node
  * is not on any list.  Returns non-zero when unlinked, zero otherwise.
  */
 int
 taskq_empty_ent(taskq_ent_t *t)
 {
 	const struct list_head *link = &t->tqent_list;
 
 	return (list_empty(link));
 }
 EXPORT_SYMBOL(taskq_empty_ent);
 
 /*
  * Put a caller-owned entry into its pristine state: no function, argument,
  * flags, id or owning taskq, an unlinked list node, and freshly
  * initialised lock, wait queue and timer (no callback).
  */
 void
 taskq_init_ent(taskq_ent_t *t)
 {
 	/* Plain fields first ... */
 	t->tqent_taskq = NULL;
 	t->tqent_func = NULL;
 	t->tqent_arg = NULL;
 	t->tqent_flags = 0;
 	t->tqent_id = 0;
 
 	/* ... then the embedded kernel objects. */
 	INIT_LIST_HEAD(&t->tqent_list);
 	spin_lock_init(&t->tqent_lock);
 	init_waitqueue_head(&t->tqent_waitq);
 	timer_setup(&t->tqent_timer, NULL, 0);
 }
 EXPORT_SYMBOL(taskq_init_ent);
 
 /*
  * Peek at the entry that should run next without unlinking it.  The
  * priority list (TQ_FRONT dispatches) is consulted before the pending
  * list; NULL is returned when both are empty.
  */
 static taskq_ent_t *
 taskq_next_ent(taskq_t *tq)
 {
 	struct list_head *head = &tq->tq_prio_list;
 
 	if (list_empty(head)) {
 		head = &tq->tq_pend_list;
 		if (list_empty(head))
 			return (NULL);
 	}
 
 	return (list_first_entry(head, taskq_ent_t, tqent_list));
 }
 
 /*
  * Task body run on dynamic_taskq: create one more worker thread for the
  * taskq passed in arg.
  */
 static void
 taskq_thread_spawn_task(void *arg)
 {
 	taskq_t *tq = arg;
 	unsigned long irqflags;
 
 	if (taskq_thread_create(tq) != NULL)
 		return;
 
 	/*
 	 * No thread was created, so nobody else will drop the spawn count
 	 * that was raised for this request; undo it here.
 	 */
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 	tq->tq_nspawn--;
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 }
 
 /*
  * Spawn an additional thread for a dynamic taskq (TASKQ_DYNAMIC) when the
  * current number of threads is insufficient to handle the pending tasks.
  * These new threads must be created by the dedicated dynamic_taskq to
  * avoid deadlocks between thread creation and memory reclaim.  The
  * system_taskq, which is also a dynamic taskq, cannot be safely used for
  * this.
  *
  * The callers visible in this file invoke this with tq->tq_lock held.
  *
  * Returns the new tq_nspawn count (non-zero) when a spawn request was
  * queued, and 0 when the taskq is not dynamic, is already at its thread
  * limit, is no longer active, or the request could not be dispatched.
  */
 static int
 taskq_thread_spawn(taskq_t *tq)
 {
 	int spawning;
 
 	if (!(tq->tq_flags & TASKQ_DYNAMIC))
 		return (0);
 
 	tq->lastspawnstop = jiffies;
 	if ((tq->tq_nthreads + tq->tq_nspawn >= tq->tq_maxthreads) ||
 	    !(tq->tq_flags & TASKQ_ACTIVE))
 		return (0);
 
 	spawning = (++tq->tq_nspawn);
 	if (taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
 	    tq, TQ_NOSLEEP) == TASKQID_INVALID) {
 		/*
 		 * The request never made it onto dynamic_taskq, so no
 		 * thread will ever decrement tq_nspawn for it.  Roll the
 		 * count back; otherwise taskq_destroy() would wait forever
 		 * for tq_nspawn to reach zero and idle threads could never
 		 * exit.
 		 */
 		tq->tq_nspawn--;
 		return (0);
 	}
 
 	return (spawning);
 }
 
 /*
  * Decide whether an idle worker of a dynamic taskq should exit.
  *
  * To keep threads from being created and destroyed too often, at most one
  * thread may exit per spl_taskq_thread_timeout_ms.
  *
  * The first thread in the thread list is treated as the primary thread.
  * There is nothing special about it, but to avoid all the taskq pids
  * changing it is kept long running.
  *
  * Returns 1 when the calling thread should exit, 0 when it should stay.
  */
 static int
 taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt)
 {
 	taskq_thread_t *primary;
 	unsigned long earliest_exit;
 
 	ASSERT(!taskq_next_ent(tq));
 
 	/* Only dynamic taskqs shed threads, and only when enabled. */
 	if (!spl_taskq_thread_dynamic || !(tq->tq_flags & TASKQ_DYNAMIC))
 		return (0);
 
 	/* Taskq is being torn down: everybody leaves. */
 	if (!(tq->tq_flags & TASKQ_ACTIVE))
 		return (1);
 
 	primary = list_first_entry(&(tq->tq_thread_list), taskq_thread_t,
 	    tqt_thread_list);
 	if (primary == tqt)
 		return (0);
 
 	ASSERT3U(tq->tq_nthreads, >, 1);
 
 	/* Do not exit while more threads are still being spawned. */
 	if (tq->tq_nspawn != 0)
 		return (0);
 
 	/* Rate limit exits relative to the last spawn or stop. */
 	earliest_exit = tq->lastspawnstop +
 	    msecs_to_jiffies(spl_taskq_thread_timeout_ms);
 	if (time_before(jiffies, earliest_exit))
 		return (0);
 
 	tq->lastspawnstop = jiffies;
 	return (1);
 }
 
 /*
  * Main function of a taskq worker thread.
  *
  * args is the taskq_thread_t allocated by taskq_thread_create(); this
  * function frees it before exiting.  After registering itself on
  * tq_thread_list the thread loops until kthread_should_stop() or
  * taskq_thread_should_stop() says otherwise: it sleeps on tq_work_waitq
  * while both the priority and pending lists are empty, and otherwise takes
  * the next entry, runs its function with tq_lock dropped, and then does
  * the completion bookkeeping.
  *
  * If the taskq already has tq_maxthreads threads the thread exits
  * immediately without registering.
  */
 static int
 taskq_thread(void *args)
 {
 	DECLARE_WAITQUEUE(wait, current);
 	sigset_t blocked;
 	taskq_thread_t *tqt = args;
 	taskq_t *tq;
 	taskq_ent_t *t;
 	int seq_tasks = 0;
 	unsigned long flags;
 	taskq_ent_t dup_task = {};
 
 	ASSERT(tqt);
 	ASSERT(tqt->tqt_tq);
 	tq = tqt->tqt_tq;
 	current->flags |= PF_NOFREEZE;
 
 	(void) spl_fstrans_mark();
 
 	/* Block every signal and discard any that are already pending. */
 	sigfillset(&blocked);
 	sigprocmask(SIG_BLOCK, &blocked, NULL);
 	flush_signals(current);
 
 	tsd_set(taskq_tsd, tq);
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 	/*
 	 * If we are dynamically spawned, decrease spawning count. Note that
 	 * we could be created during taskq_create, in which case we shouldn't
 	 * do the decrement. But it's fine because taskq_create will reset
 	 * tq_nspawn later.
 	 */
 	if (tq->tq_flags & TASKQ_DYNAMIC)
 		tq->tq_nspawn--;
 
 	/* Immediately exit if more threads than allowed were created. */
 	if (tq->tq_nthreads >= tq->tq_maxthreads)
 		goto error;
 
 	tq->tq_nthreads++;
 	list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list);
 	/* taskq_create() waits on tq_wait_waitq for tq_nthreads to rise. */
 	wake_up(&tq->tq_wait_waitq);
 	set_current_state(TASK_INTERRUPTIBLE);
 
 	TQSTAT_INC(tq, threads_total);
 
 	while (!kthread_should_stop()) {
 
 		if (list_empty(&tq->tq_pend_list) &&
 		    list_empty(&tq->tq_prio_list)) {
 
 			if (taskq_thread_should_stop(tq, tqt))
 				break;
 
 			/* Nothing to do: sleep until a dispatch wakes us. */
 			add_wait_queue_exclusive(&tq->tq_work_waitq, &wait);
 			spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 			TQSTAT_INC(tq, thread_sleeps);
 			TQSTAT_INC(tq, threads_idle);
 
 			schedule();
 			/* Having slept, the run of back-to-back tasks ended. */
 			seq_tasks = 0;
 
 			TQSTAT_DEC(tq, threads_idle);
 			TQSTAT_INC(tq, thread_wakeups);
 
 			spin_lock_irqsave_nested(&tq->tq_lock, flags,
 			    tq->tq_lock_class);
 			remove_wait_queue(&tq->tq_work_waitq, &wait);
 		} else {
 			__set_current_state(TASK_RUNNING);
 		}
 
 		if ((t = taskq_next_ent(tq)) != NULL) {
 			list_del_init(&t->tqent_list);
 			TQSTAT_DEC_LIST(tq, t);
 			TQSTAT_DEC(tq, tasks_total);
 
 			/*
 			 * A TQENT_FLAG_PREALLOC task may be reused or freed
 			 * during the task function call. Store tqent_id and
 			 * tqent_flags here.
 			 *
 			 * Also use an on stack taskq_ent_t for tqt_task
 			 * assignment in this case; we want to make sure
 			 * to duplicate all fields, so the values are
 			 * correct when it's accessed via DTRACE_PROBE*.
 			 */
 			tqt->tqt_id = t->tqent_id;
 			tqt->tqt_flags = t->tqent_flags;
 
 			if (t->tqent_flags & TQENT_FLAG_PREALLOC) {
 				dup_task = *t;
 				t = &dup_task;
 			}
 			tqt->tqt_task = t;
 
 			taskq_insert_in_order(tq, tqt);
 			tq->tq_nactive++;
 			/* The task function runs without tq_lock held. */
 			spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 			TQSTAT_INC(tq, threads_active);
 			DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t);
 
 			/* Perform the requested task */
 			t->tqent_func(t->tqent_arg);
 
 			DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t);
 
 			TQSTAT_DEC(tq, threads_active);
 			if ((t->tqent_flags & TQENT_LIST_MASK) ==
 			    TQENT_LIST_PENDING)
 				TQSTAT_INC(tq, tasks_executed_normal);
 			else
 				TQSTAT_INC(tq, tasks_executed_priority);
 			TQSTAT_INC(tq, tasks_executed);
 
 			spin_lock_irqsave_nested(&tq->tq_lock, flags,
 			    tq->tq_lock_class);
 
 			tq->tq_nactive--;
 			list_del_init(&tqt->tqt_active_list);
 			tqt->tqt_task = NULL;
 
 			/* For prealloc'd tasks, we don't free anything. */
 			if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC))
 				task_done(tq, t);
 
 			/*
 			 * When the current lowest outstanding taskqid is
 			 * done calculate the new lowest outstanding id
 			 */
 			if (tq->tq_lowest_id == tqt->tqt_id) {
 				tq->tq_lowest_id = taskq_lowest_id(tq);
 				ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id);
 			}
 
 			/* Spawn additional taskq threads if required. */
 			if ((++seq_tasks) > spl_taskq_thread_sequential &&
 			    taskq_thread_spawn(tq))
 				seq_tasks = 0;
 
 			tqt->tqt_id = TASKQID_INVALID;
 			tqt->tqt_flags = 0;
 			/* Let anyone waiting on tq_wait_waitq re-check. */
 			wake_up_all(&tq->tq_wait_waitq);
 		} else
 			TQSTAT_INC(tq, thread_wakeups_nowork);
 
 		set_current_state(TASK_INTERRUPTIBLE);
 
 	}
 
 	/* Leaving the loop: unregister from the taskq (tq_lock is held). */
 	__set_current_state(TASK_RUNNING);
 	tq->tq_nthreads--;
 	list_del_init(&tqt->tqt_thread_list);
 
 	TQSTAT_DEC(tq, threads_total);
 	TQSTAT_INC(tq, threads_destroyed);
 
 error:
 	/* Both exit paths arrive here with tq_lock held. */
 	kmem_free(tqt, sizeof (taskq_thread_t));
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	tsd_set(taskq_tsd, NULL);
 	thread_exit();
 
 	/* Presumably not reached if thread_exit() never returns. */
 	return (0);
 }
 
 /*
  * Allocate a taskq_thread_t for tq and start a kernel thread running
  * taskq_thread() on it, named after the taskq.  Depending on the module
  * tunables, the thread is bound to the next CPU in round-robin order and
  * given a nice value derived from the taskq priority before it is woken.
  *
  * Returns the new descriptor, or NULL when the thread could not be
  * created (the descriptor is freed in that case).
  */
 static taskq_thread_t *
 taskq_thread_create(taskq_t *tq)
 {
 	static int last_used_cpu = 0;
 	taskq_thread_t *worker;
 
 	worker = kmem_alloc(sizeof (taskq_thread_t), KM_PUSHPAGE);
 	worker->tqt_tq = tq;
 	worker->tqt_id = TASKQID_INVALID;
 	INIT_LIST_HEAD(&worker->tqt_thread_list);
 	INIT_LIST_HEAD(&worker->tqt_active_list);
 
 	worker->tqt_thread = spl_kthread_create(taskq_thread, worker,
 	    "%s", tq->tq_name);
 	if (worker->tqt_thread == NULL) {
 		kmem_free(worker, sizeof (taskq_thread_t));
 		return (NULL);
 	}
 
 	/* Optional CPU affinity, advanced round-robin across online CPUs. */
 	if (spl_taskq_thread_bind) {
 		last_used_cpu = (last_used_cpu + 1) % num_online_cpus();
 		kthread_bind(worker->tqt_thread, last_used_cpu);
 	}
 
 	/* Optional scheduling priority taken from the taskq. */
 	if (spl_taskq_thread_priority)
 		set_user_nice(worker->tqt_thread, PRIO_TO_NICE(tq->tq_pri));
 
 	/* The thread was created stopped; let it run now. */
 	wake_up_process(worker->tqt_thread);
 
 	TQSTAT_INC(tq, threads_created);
 
 	return (worker);
 }
 
 /*
  * Initialise every wmsum counter in tq->tq_sums to zero.
  */
 static void
 taskq_stats_init(taskq_t *tq)
 {
 	taskq_sums_t *sums = &tq->tq_sums;
 
 #define	TQS_INIT(stat)	wmsum_init(&sums->tqs_##stat, 0)
 	TQS_INIT(threads_active);
 	TQS_INIT(threads_idle);
 	TQS_INIT(threads_total);
 	TQS_INIT(tasks_pending);
 	TQS_INIT(tasks_priority);
 	TQS_INIT(tasks_total);
 	TQS_INIT(tasks_delayed);
 	TQS_INIT(entries_free);
 	TQS_INIT(threads_created);
 	TQS_INIT(threads_destroyed);
 	TQS_INIT(tasks_dispatched);
 	TQS_INIT(tasks_dispatched_delayed);
 	TQS_INIT(tasks_executed_normal);
 	TQS_INIT(tasks_executed_priority);
 	TQS_INIT(tasks_executed);
 	TQS_INIT(tasks_delayed_requeued);
 	TQS_INIT(tasks_cancelled);
 	TQS_INIT(thread_wakeups);
 	TQS_INIT(thread_wakeups_nowork);
 	TQS_INIT(thread_sleeps);
 #undef	TQS_INIT
 }
 
 /*
  * Tear down every wmsum counter in tq->tq_sums; the counterpart of
  * taskq_stats_init().
  */
 static void
 taskq_stats_fini(taskq_t *tq)
 {
 	taskq_sums_t *sums = &tq->tq_sums;
 
 #define	TQS_FINI(stat)	wmsum_fini(&sums->tqs_##stat)
 	TQS_FINI(threads_active);
 	TQS_FINI(threads_idle);
 	TQS_FINI(threads_total);
 	TQS_FINI(tasks_pending);
 	TQS_FINI(tasks_priority);
 	TQS_FINI(tasks_total);
 	TQS_FINI(tasks_delayed);
 	TQS_FINI(entries_free);
 	TQS_FINI(threads_created);
 	TQS_FINI(threads_destroyed);
 	TQS_FINI(tasks_dispatched);
 	TQS_FINI(tasks_dispatched_delayed);
 	TQS_FINI(tasks_executed_normal);
 	TQS_FINI(tasks_executed_priority);
 	TQS_FINI(tasks_executed);
 	TQS_FINI(tasks_delayed_requeued);
 	TQS_FINI(tasks_cancelled);
 	TQS_FINI(thread_wakeups);
 	TQS_FINI(thread_wakeups_nowork);
 	TQS_FINI(thread_sleeps);
 #undef	TQS_FINI
 }
 
 /*
  * kstat update callback for a per-taskq kstat: refresh the named values
  * in ksp->ks_data from the taskq stored in ksp->ks_private.
  *
  * Returns EACCES for write requests and 0 otherwise.
  */
 static int
 taskq_kstats_update(kstat_t *ksp, int rw)
 {
 	taskq_kstats_t *out;
 	taskq_sums_t *sums;
 	taskq_t *tq;
 
 	/* These statistics are read-only. */
 	if (rw == KSTAT_WRITE)
 		return (EACCES);
 
 	tq = ksp->ks_private;
 	out = ksp->ks_data;
 	sums = &tq->tq_sums;
 
 	/* Configuration limits come straight from the taskq. */
 	out->tqks_threads_max.value.ui64 = tq->tq_maxthreads;
 	out->tqks_entry_pool_min.value.ui64 = tq->tq_minalloc;
 	out->tqks_entry_pool_max.value.ui64 = tq->tq_maxalloc;
 
 	/* Everything else is the current value of the matching wmsum. */
 #define	TQKS_SUM(stat)	\
 	(out->tqks_##stat.value.ui64 = wmsum_value(&sums->tqs_##stat))
 	TQKS_SUM(threads_active);
 	TQKS_SUM(threads_idle);
 	TQKS_SUM(threads_total);
 	TQKS_SUM(tasks_pending);
 	TQKS_SUM(tasks_priority);
 	TQKS_SUM(tasks_total);
 	TQKS_SUM(tasks_delayed);
 	TQKS_SUM(entries_free);
 	TQKS_SUM(threads_created);
 	TQKS_SUM(threads_destroyed);
 	TQKS_SUM(tasks_dispatched);
 	TQKS_SUM(tasks_dispatched_delayed);
 	TQKS_SUM(tasks_executed_normal);
 	TQKS_SUM(tasks_executed_priority);
 	TQKS_SUM(tasks_executed);
 	TQKS_SUM(tasks_delayed_requeued);
 	TQKS_SUM(tasks_cancelled);
 	TQKS_SUM(thread_wakeups);
 	TQKS_SUM(thread_wakeups_nowork);
 	TQKS_SUM(thread_sleeps);
 #undef	TQKS_SUM
 
 	return (0);
 }
 
 /*
  * Create and install the per-taskq named kstat "<tq_name>.<tq_instance>"
  * in module "taskq", class "misc".  The kstat data is a private copy of
  * taskq_kstats_template that taskq_kstats_update() fills in on demand.
  *
  * On return tq->tq_ksp is either the installed kstat or NULL when
  * kstat_create() failed.
  */
 static void
 taskq_kstats_init(taskq_t *tq)
 {
 	char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
 	kstat_t *ksp;
 
 	/*
 	 * taskq_kstats_fini() decides what to do based on tq_ksp, so make
 	 * sure it holds a defined value even when kstat_create() fails.
 	 */
 	tq->tq_ksp = NULL;
 
 	snprintf(name, sizeof (name), "%s.%d", tq->tq_name, tq->tq_instance);
 
 	ksp = kstat_create("taskq", 0, name, "misc",
 	    KSTAT_TYPE_NAMED, sizeof (taskq_kstats_t) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 
 	if (ksp == NULL)
 		return;
 
 	ksp->ks_private = tq;
 	ksp->ks_update = taskq_kstats_update;
 	/* KSTAT_FLAG_VIRTUAL: the data buffer is supplied (and freed) by us. */
 	ksp->ks_data = kmem_alloc(sizeof (taskq_kstats_t), KM_SLEEP);
 	memcpy(ksp->ks_data, &taskq_kstats_template, sizeof (taskq_kstats_t));
 	kstat_install(ksp);
 
 	tq->tq_ksp = ksp;
 }
 
 /*
  * Release the per-taskq kstat, if there is one: free its data buffer,
  * delete the kstat and clear tq->tq_ksp.
  */
 static void
 taskq_kstats_fini(taskq_t *tq)
 {
 	kstat_t *ksp = tq->tq_ksp;
 
 	if (ksp != NULL) {
 		kmem_free(ksp->ks_data, sizeof (taskq_kstats_t));
 		kstat_delete(ksp);
 		tq->tq_ksp = NULL;
 	}
 }
 
 /*
  * Create a taskq.
  *
  *   name         copied with kmem_strdup(); also used as the thread name
  *   threads_arg  number of threads, or a percentage of the online CPUs
  *                when TASKQ_THREADS_CPU_PCT is set
  *   pri          thread priority (applied by taskq_thread_create())
  *   minalloc     entries to pre-allocate with TASKQ_PREPOPULATE
  *   maxalloc     stored in tq_maxalloc
  *   flags        TASKQ_* creation flags (TASKQ_CPR_SAFE is unsupported)
  *
  * For a dynamic taskq (TASKQ_DYNAMIC with spl_taskq_thread_dynamic set)
  * only one thread is started here; tq_maxthreads still holds the limit.
  *
  * Returns the new taskq, or NULL when allocation, CPU hotplug
  * registration or the creation of any initial thread failed.
  */
 taskq_t *
 taskq_create(const char *name, int threads_arg, pri_t pri,
     int minalloc, int maxalloc, uint_t flags)
 {
 	taskq_t *tq;
 	taskq_thread_t *tqt;
 	int count = 0, rc = 0, i;
 	unsigned long irqflags;
 	int nthreads = threads_arg;
 
 	ASSERT(name != NULL);
 	ASSERT(minalloc >= 0);
 	ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */
 
 	/* Scale the number of threads using nthreads as a percentage */
 	if (flags & TASKQ_THREADS_CPU_PCT) {
 		ASSERT(nthreads <= 100);
 		ASSERT(nthreads >= 0);
 		nthreads = MIN(threads_arg, 100);
 		nthreads = MAX(nthreads, 0);
 		nthreads = MAX((num_online_cpus() * nthreads) /100, 1);
 	}
 
 	tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE);
 	if (tq == NULL)
 		return (NULL);
 
 	/*
 	 * The allocation is not zeroed.  If thread creation fails below,
 	 * taskq_destroy() runs before the kstat and instance number have
 	 * been set up and taskq_kstats_fini() tests tq_ksp, so give both
 	 * fields defined values now.
 	 */
 	tq->tq_ksp = NULL;
 	tq->tq_instance = 0;
 
 	tq->tq_hp_support = B_FALSE;
 
 	if (flags & TASKQ_THREADS_CPU_PCT) {
 		tq->tq_hp_support = B_TRUE;
 		if (cpuhp_state_add_instance_nocalls(spl_taskq_cpuhp_state,
 		    &tq->tq_hp_cb_node) != 0) {
 			kmem_free(tq, sizeof (*tq));
 			return (NULL);
 		}
 	}
 
 	spin_lock_init(&tq->tq_lock);
 	INIT_LIST_HEAD(&tq->tq_thread_list);
 	INIT_LIST_HEAD(&tq->tq_active_list);
 	tq->tq_name = kmem_strdup(name);
 	tq->tq_nactive = 0;
 	tq->tq_nthreads = 0;
 	tq->tq_nspawn = 0;
 	tq->tq_maxthreads = nthreads;
 	tq->tq_cpu_pct = threads_arg;
 	tq->tq_pri = pri;
 	tq->tq_minalloc = minalloc;
 	tq->tq_maxalloc = maxalloc;
 	tq->tq_nalloc = 0;
 	tq->tq_flags = (flags | TASKQ_ACTIVE);
 	tq->tq_next_id = TASKQID_INITIAL;
 	tq->tq_lowest_id = TASKQID_INITIAL;
 	tq->lastspawnstop = jiffies;
 	INIT_LIST_HEAD(&tq->tq_free_list);
 	INIT_LIST_HEAD(&tq->tq_pend_list);
 	INIT_LIST_HEAD(&tq->tq_prio_list);
 	INIT_LIST_HEAD(&tq->tq_delay_list);
 	init_waitqueue_head(&tq->tq_work_waitq);
 	init_waitqueue_head(&tq->tq_wait_waitq);
 	tq->tq_lock_class = TQ_LOCK_GENERAL;
 	INIT_LIST_HEAD(&tq->tq_taskqs);
 	taskq_stats_init(tq);
 
 	if (flags & TASKQ_PREPOPULATE) {
 		spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
 		    tq->tq_lock_class);
 
 		/* Allocate minalloc entries and hand each to task_done(). */
 		for (i = 0; i < minalloc; i++)
 			task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW,
 			    &irqflags));
 
 		spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 	}
 
 	if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic)
 		nthreads = 1;
 
 	for (i = 0; i < nthreads; i++) {
 		tqt = taskq_thread_create(tq);
 		if (tqt == NULL)
 			rc = 1;
 		else
 			count++;
 	}
 
 	/* Wait for all threads to be started before potential destroy */
 	wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count);
 	/*
 	 * taskq_thread might have touched nspawn, but we don't want them to
 	 * because they're not dynamically spawned. So we reset it to 0
 	 */
 	tq->tq_nspawn = 0;
 
 	if (rc) {
 		taskq_destroy(tq);
 		return (NULL);
 	}
 
 	down_write(&tq_list_sem);
 	tq->tq_instance = taskq_find_by_name(name) + 1;
 	list_add_tail(&tq->tq_taskqs, &tq_list);
 	up_write(&tq_list_sem);
 
 	/* Install kstats late, because the name includes tq_instance */
 	taskq_kstats_init(tq);
 
 	return (tq);
 }
 EXPORT_SYMBOL(taskq_create);
 
 /*
  * Destroy a taskq and free everything it owns.
  *
  * In order: TASKQ_ACTIVE is cleared so no new work or threads are
  * accepted, the CPU hotplug instance is removed if one was registered,
  * outstanding work on dynamic_taskq and on tq itself is waited for, the
  * kstat is torn down, the taskq is unlinked from tq_list, in-flight thread
  * spawns are waited for, every worker thread is stopped, cached entries
  * on the free list are released, and finally the taskq itself is freed.
  */
 void
 taskq_destroy(taskq_t *tq)
 {
 	struct task_struct *thread;
 	taskq_thread_t *tqt;
 	taskq_ent_t *t;
 	unsigned long flags;
 
 	ASSERT(tq);
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 	tq->tq_flags &= ~TASKQ_ACTIVE;
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	if (tq->tq_hp_support) {
 		VERIFY0(cpuhp_state_remove_instance_nocalls(
 		    spl_taskq_cpuhp_state, &tq->tq_hp_cb_node));
 	}
 
 	/*
 	 * When TASKQ_ACTIVE is clear new tasks may not be added nor may
 	 * new worker threads be spawned for dynamic taskq.
 	 */
 	if (dynamic_taskq != NULL)
 		taskq_wait_outstanding(dynamic_taskq, 0);
 
 	taskq_wait(tq);
 
 	/*
 	 * NOTE(review): tq_ksp is cleared here while tq is still linked on
 	 * tq_list; anything that walks tq_list and uses tq->tq_ksp has to
 	 * cope with that window -- confirm.
 	 */
 	taskq_kstats_fini(tq);
 
 	/* remove taskq from global list used by the kstats */
 	down_write(&tq_list_sem);
 	list_del(&tq->tq_taskqs);
 	up_write(&tq_list_sem);
 
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 	/* wait for spawning threads to insert themselves to the list */
 	while (tq->tq_nspawn) {
 		/* Poll: drop the lock and sleep one tick between checks. */
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 		schedule_timeout_interruptible(1);
 		spin_lock_irqsave_nested(&tq->tq_lock, flags,
 		    tq->tq_lock_class);
 	}
 
 	/*
 	 * Signal each thread to exit and block until it does.  Each thread
 	 * is responsible for removing itself from the list and freeing its
 	 * taskq_thread_t.  This allows for idle threads to opt to remove
 	 * themselves from the taskq.  They can be recreated as needed.
 	 */
 	while (!list_empty(&tq->tq_thread_list)) {
 		tqt = list_entry(tq->tq_thread_list.next,
 		    taskq_thread_t, tqt_thread_list);
 		thread = tqt->tqt_thread;
 		/* kthread_stop() is called with tq_lock dropped. */
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 		kthread_stop(thread);
 
 		spin_lock_irqsave_nested(&tq->tq_lock, flags,
 		    tq->tq_lock_class);
 	}
 
 	/* Release the cached entries sitting on the free list. */
 	while (!list_empty(&tq->tq_free_list)) {
 		t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list);
 
 		ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
 
 		list_del_init(&t->tqent_list);
 		task_free(tq, t);
 	}
 
 	/* Nothing may be left behind at this point. */
 	ASSERT0(tq->tq_nthreads);
 	ASSERT0(tq->tq_nalloc);
 	ASSERT0(tq->tq_nspawn);
 	ASSERT(list_empty(&tq->tq_thread_list));
 	ASSERT(list_empty(&tq->tq_active_list));
 	ASSERT(list_empty(&tq->tq_free_list));
 	ASSERT(list_empty(&tq->tq_pend_list));
 	ASSERT(list_empty(&tq->tq_prio_list));
 	ASSERT(list_empty(&tq->tq_delay_list));
 
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 	taskq_stats_fini(tq);
 	kmem_strfree(tq->tq_name);
 	kmem_free(tq, sizeof (taskq_t));
 }
 EXPORT_SYMBOL(taskq_destroy);
 
 /*
  * Create a taskq with a fixed pool of nthreads threads and hand back the
  * thread handles.
  *
  * An array of nthreads kthread_t pointers, one per pool thread, is
  * allocated and returned through ktpp.  The array is not ordered and must
  * be freed by the caller.  TASKQ_DYNAMIC, TASKQ_THREADS_CPU_PCT and
  * TASKQ_DC_BATCH are stripped from flags and TASKQ_PREPOPULATE is added.
  *
  * Note that pri, minalloc and maxalloc are not forwarded: the taskq is
  * created with minclsyspri, nthreads and INT_MAX respectively.
  */
 taskq_t *
 taskq_create_synced(const char *name, int nthreads, pri_t pri,
     int minalloc, int maxalloc, uint_t flags, kthread_t ***ktpp)
 {
 	kthread_t **threads;
 	taskq_thread_t *tqt;
 	taskq_t *tq;
 	int filled = 0;
 
 	threads = kmem_zalloc(nthreads * sizeof (kthread_t *), KM_SLEEP);
 
 	flags &= ~(TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT | TASKQ_DC_BATCH);
 	flags |= TASKQ_PREPOPULATE;
 
 	/* taskq_create spawns all the threads before returning */
 	tq = taskq_create(name, nthreads, minclsyspri, nthreads, INT_MAX,
 	    flags);
 	VERIFY(tq != NULL);
 	VERIFY(tq->tq_nthreads == nthreads);
 
 	/* Collect the thread handle of every worker on the thread list. */
 	list_for_each_entry(tqt, &tq->tq_thread_list, tqt_thread_list)
 		threads[filled++] = tqt->tqt_thread;
 
 	ASSERT3S(filled, ==, nthreads);
 	*ktpp = threads;
 
 	return (tq);
 }
 EXPORT_SYMBOL(taskq_create_synced);
 
 static kstat_t *taskq_summary_ksp = NULL;
 
 /*
  * Raw kstat header callback for the taskq summary: write the three-line
  * column header into buf.
  *
  * Returns 0 on success, or ENOMEM when buf (size bytes) is too small to
  * hold the whole header.
  */
 static int
 spl_taskq_kstat_headers(char *buf, size_t size)
 {
 #define	TQ_SUMMARY_ROW	"%-20s | %-17s | %-23s\n"
 	size_t needed;
 
 	needed = snprintf(buf, size,
 	    TQ_SUMMARY_ROW TQ_SUMMARY_ROW TQ_SUMMARY_ROW,
 	    /* group titles */
 	    "", "threads", "tasks on queue",
 	    /* column titles */
 	    "taskq name", "tot [act idl] max", " pend [ norm  high] dly",
 	    /* separator */
 	    "--------------------", "-----------------",
 	    "-----------------------");
 #undef	TQ_SUMMARY_ROW
 
 	if (needed >= size)
 		return (ENOMEM);
 
 	return (0);
 }
 
 /*
  * Raw kstat data callback for the taskq summary: write one line per
  * registered taskq (walking tq_list from tail to head) into buf, using
  * the values of that taskq's own kstat.
  *
  * Taskqs without a kstat are skipped: tq_ksp is NULL when kstat creation
  * failed, and taskq_destroy() clears it before unlinking the taskq from
  * tq_list.
  *
  * Returns 0 on success, or ENOMEM when buf (size bytes) filled up before
  * every taskq was written.
  */
 static int
 spl_taskq_kstat_data(char *buf, size_t size, void *data)
 {
 	struct list_head *tql = NULL;
 	taskq_t *tq;
 	char name[TASKQ_NAMELEN+5]; /* 5 for dot, 3x instance digits, null */
 	char threads[25];
 	char tasks[30];
 	size_t n;
 	int err = 0;
 
 	down_read(&tq_list_sem);
 	list_for_each_prev(tql, &tq_list) {
 		tq = list_entry(tql, taskq_t, tq_taskqs);
 
 		/* No kstat, nothing to report for this taskq. */
 		if (tq->tq_ksp == NULL)
 			continue;
 
 		/* Refresh and read the values under the kstat's own lock. */
 		mutex_enter(tq->tq_ksp->ks_lock);
 		taskq_kstats_update(tq->tq_ksp, KSTAT_READ);
 		taskq_kstats_t *tqks = tq->tq_ksp->ks_data;
 
 		snprintf(name, sizeof (name), "%s.%d", tq->tq_name,
 		    tq->tq_instance);
 		snprintf(threads, sizeof (threads), "%3llu [%3llu %3llu] %3llu",
 		    tqks->tqks_threads_total.value.ui64,
 		    tqks->tqks_threads_active.value.ui64,
 		    tqks->tqks_threads_idle.value.ui64,
 		    tqks->tqks_threads_max.value.ui64);
 		snprintf(tasks, sizeof (tasks), "%5llu [%5llu %5llu] %3llu",
 		    tqks->tqks_tasks_total.value.ui64,
 		    tqks->tqks_tasks_pending.value.ui64,
 		    tqks->tqks_tasks_priority.value.ui64,
 		    tqks->tqks_tasks_delayed.value.ui64);
 
 		mutex_exit(tq->tq_ksp->ks_lock);
 
 		n = snprintf(buf, size, "%-20s | %-17s | %-23s\n",
 		    name, threads, tasks);
 		if (n >= size) {
 			err = ENOMEM;
 			break;
 		}
 
 		/* Advance past the line just written. */
 		buf = &buf[n];
 		size -= n;
 	}
 
 	up_read(&tq_list_sem);
 
 	return (err);
 }
 
 /*
  * Create and install the raw "summary" kstat (module "taskq", class
  * "misc") whose contents are produced by spl_taskq_kstat_headers() and
  * spl_taskq_kstat_data().  If the kstat cannot be created the summary is
  * simply not available.
  */
 static void
 spl_taskq_kstat_init(void)
 {
 	kstat_t *summary;
 
 	summary = kstat_create("taskq", 0, "summary", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 	if (summary != NULL) {
 		/* A placeholder data pointer and a single data record. */
 		summary->ks_data = (void *)(uintptr_t)1;
 		summary->ks_ndata = 1;
 		kstat_set_raw_ops(summary, spl_taskq_kstat_headers,
 		    spl_taskq_kstat_data, NULL);
 		kstat_install(summary);
 
 		taskq_summary_ksp = summary;
 	}
 }
 
 /*
  * Delete the summary kstat if spl_taskq_kstat_init() installed one.
  */
 static void
 spl_taskq_kstat_fini(void)
 {
 	if (taskq_summary_ksp != NULL) {
 		kstat_delete(taskq_summary_ksp);
 		taskq_summary_ksp = NULL;
 	}
 }
 
 static unsigned int spl_taskq_kick = 0;
 
 /*
  * Setter for the spl_taskq_kick module parameter.
  *
  * Writing a non-zero value asks every registered taskq whose next
  * runnable task is more than five seconds old to spawn another thread.
  * The parameter is reset to zero afterwards, so it acts as a trigger.
  *
  * Returns the result of param_set_uint().
  */
 static int
 param_set_taskq_kick(const char *val, zfs_kernel_param_t *kp)
 {
 	taskq_t *tq = NULL;
 	unsigned long irqflags;
 	int rc;
 
 	rc = param_set_uint(val, kp);
 	if (rc < 0)
 		return (rc);
 	if (!spl_taskq_kick)
 		return (rc);
 
 	/* reset value */
 	spl_taskq_kick = 0;
 
 	down_read(&tq_list_sem);
 	list_for_each_entry(tq, &tq_list, tq_taskqs) {
 		taskq_ent_t *oldest;
 
 		spin_lock_irqsave_nested(&tq->tq_lock, irqflags,
 		    tq->tq_lock_class);
 
 		/* Check if the first pending is older than 5 seconds */
 		oldest = taskq_next_ent(tq);
 		if (oldest != NULL &&
 		    time_after(jiffies, oldest->tqent_birth + 5*HZ)) {
 			(void) taskq_thread_spawn(tq);
 			printk(KERN_INFO "spl: Kicked taskq %s/%d\n",
 			    tq->tq_name, tq->tq_instance);
 		}
 
 		spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 	}
 	up_read(&tq_list_sem);
 
 	return (rc);
 }
 
 module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint,
 	&spl_taskq_kick, 0644);
 MODULE_PARM_DESC(spl_taskq_kick,
 	"Write nonzero to kick stuck taskqs to spawn more threads");
 
 /*
  * CPU hotplug "online" callback, invoked for each taskq instance that was
  * registered with the hotplug state (taskqs created with
  * TASKQ_THREADS_CPU_PCT).  The percentage calculation is redone on every
  * call to determine the new thread limit and whether a thread must be
  * added.
  *
  * The limit is computed from num_online_cpus() + 1; presumably the CPU
  * coming online is not yet counted at this point -- TODO confirm.
  *
  * Returns 0 on success and -1 when a needed thread could not be created.
  */
 static int
 spl_taskq_expand(unsigned int cpu, struct hlist_node *node)
 {
 	taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
 	unsigned long irqflags;
 	int grow = 0;
 
 	ASSERT(tq);
 	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
 
 	if (tq->tq_flags & TASKQ_ACTIVE) {
 		int limit;
 
 		ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
 		limit = MIN(tq->tq_cpu_pct, 100);
 		limit = MAX(((num_online_cpus() + 1) * limit) / 100, 1);
 		tq->tq_maxthreads = limit;
 
 		/*
 		 * Dynamic taskqs grow on demand; only a static pool needs
 		 * its extra thread created right away.
 		 */
 		if (!((tq->tq_flags & TASKQ_DYNAMIC) &&
 		    spl_taskq_thread_dynamic) &&
 		    tq->tq_maxthreads > tq->tq_nthreads)
 			grow = 1;
 	}
 
 	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
 
 	/* Thread creation happens with tq_lock dropped. */
 	if (grow && taskq_thread_create(tq) == NULL)
 		return (-1);
 
 	return (0);
 }
 
 /*
  * While we don't support offlining CPUs, it is possible that CPUs will fail
  * to online successfully. We do need to be able to handle this case
  * gracefully.
  */
 static int
 spl_taskq_prepare_down(unsigned int cpu, struct hlist_node *node)
 {
 	taskq_t *tq = list_entry(node, taskq_t, tq_hp_cb_node);
 	unsigned long flags;
 
 	ASSERT(tq);
 	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
 
 	if (!(tq->tq_flags & TASKQ_ACTIVE))
 		goto out;
 
 	ASSERT(tq->tq_flags & TASKQ_THREADS_CPU_PCT);
 	int nthreads = MIN(tq->tq_cpu_pct, 100);
 	nthreads = MAX(((num_online_cpus()) * nthreads) / 100, 1);
 	tq->tq_maxthreads = nthreads;
 
 	if (!((tq->tq_flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) &&
 	    tq->tq_maxthreads < tq->tq_nthreads) {
 		ASSERT3U(tq->tq_maxthreads, ==, tq->tq_nthreads - 1);
 		taskq_thread_t *tqt = list_entry(tq->tq_thread_list.next,
 		    taskq_thread_t, tqt_thread_list);
 		struct task_struct *thread = tqt->tqt_thread;
 		spin_unlock_irqrestore(&tq->tq_lock, flags);
 
 		kthread_stop(thread);
 
 		return (0);
 	}
 
 out:
 	spin_unlock_irqrestore(&tq->tq_lock, flags);
 	return (0);
 }
 
 /*
  * Module-level taskq initialisation: set up the taskq list lock and TSD
  * key, register the CPU hotplug state, and create the three global
  * taskqs (system_taskq, system_delay_taskq and dynamic_taskq) followed by
  * the summary kstat.
  *
  * Returns 0 on success or -ENOMEM when any taskq could not be created.
  * On failure every taskq created so far is destroyed, its global pointer
  * is cleared, and the CPU hotplug state is removed.
  */
 int
 spl_taskq_init(void)
 {
 	init_rwsem(&tq_list_sem);
 	tsd_create(&taskq_tsd, NULL);
 
 	spl_taskq_cpuhp_state = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
 	    "fs/spl_taskq:online", spl_taskq_expand, spl_taskq_prepare_down);
 
 	system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64),
 	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
 	if (system_taskq == NULL)
 		goto fail_cpuhp;
 
 	system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4),
 	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC);
 	if (system_delay_taskq == NULL)
 		goto fail_system;
 
 	dynamic_taskq = taskq_create("spl_dynamic_taskq", 1,
 	    maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE);
 	if (dynamic_taskq == NULL)
 		goto fail_delay;
 
 	/*
 	 * This is used to annotate tq_lock, so
 	 *   taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch
 	 * does not trigger a lockdep warning re: possible recursive locking
 	 */
 	dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC;
 
 	spl_taskq_kstat_init();
 
 	return (0);
 
 	/* Unwind in reverse order of creation. */
 fail_delay:
 	taskq_destroy(system_delay_taskq);
 	system_delay_taskq = NULL;
 fail_system:
 	taskq_destroy(system_taskq);
 	system_taskq = NULL;
 fail_cpuhp:
 	/* Previously leaked when the very first taskq_create() failed. */
 	cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
 	spl_taskq_cpuhp_state = 0;
 	return (-ENOMEM);
 }
 
 void
 spl_taskq_fini(void)
 {
 	spl_taskq_kstat_fini();
 
 	taskq_destroy(dynamic_taskq);
 	dynamic_taskq = NULL;
 
 	taskq_destroy(system_delay_taskq);
 	system_delay_taskq = NULL;
 
 	taskq_destroy(system_taskq);
 	system_taskq = NULL;
 
 	tsd_destroy(&taskq_tsd);
 
 	cpuhp_remove_multi_state(spl_taskq_cpuhp_state);
 	spl_taskq_cpuhp_state = 0;
 }
diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c
index fb4de50480a3..1ac60119fdcf 100644
--- a/module/os/linux/zfs/zfs_ctldir.c
+++ b/module/os/linux/zfs/zfs_ctldir.c
@@ -1,1424 +1,1422 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  *
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
  * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  * LLNL-CODE-403049.
  * Rewritten for Linux by:
  *   Rohan Puri <rohan.puri15@gmail.com>
  *   Brian Behlendorf <behlendorf1@llnl.gov>
  * Copyright (c) 2013 by Delphix. All rights reserved.
  * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
  * Copyright (c) 2018 George Melikov. All Rights Reserved.
  * Copyright (c) 2019 Datto, Inc. All rights reserved.
  * Copyright (c) 2020 The MathWorks, Inc. All rights reserved.
  */
 
 /*
  * ZFS control directory (a.k.a. ".zfs")
  *
  * This directory provides a common location for all ZFS meta-objects.
  * Currently, this is only the 'snapshot' and 'shares' directory, but this may
  * expand in the future.  The elements are built dynamically, as the hierarchy
  * does not actually exist on disk.
  *
  * For 'snapshot', we don't want to have all snapshots always mounted, because
  * this would take up a huge amount of space in /etc/mnttab.  We have three
  * types of objects:
  *
  *	ctldir ------> snapshotdir -------> snapshot
  *                                             |
  *                                             |
  *                                             V
  *                                         mounted fs
  *
  * The 'snapshot' node contains just enough information to lookup '..' and act
  * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
  * perform an automount of the underlying filesystem and return the
  * corresponding inode.
  *
  * All mounts are handled automatically by an user mode helper which invokes
  * the mount procedure.  Unmounts are handled by allowing the mount
  * point to expire so the kernel may automatically unmount it.
  *
  * The '.zfs', '.zfs/snapshot', and all directories created under
  * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
  * zfsvfs_t as the head filesystem (what '.zfs' lives under).
  *
  * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
  * (ie: snapshots) are complete ZFS filesystems and have their own unique
  * zfsvfs_t.  However, the fsid reported by these mounts will be the same
  * as that used by the parent zfsvfs_t to make NFS happy.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/sysmacros.h>
 #include <sys/pathname.h>
 #include <sys/vfs.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zfs_vnops.h>
 #include <sys/stat.h>
 #include <sys/dmu.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_destroy.h>
 #include <sys/dsl_deleg.h>
 #include <sys/zpl.h>
 #include <sys/mntent.h>
 #include "zfs_namecheck.h"
 
 /*
  * Two AVL trees are maintained which contain all currently automounted
  * snapshots.  Every automounted snapshots maps to a single zfs_snapentry_t
  * entry which MUST:
  *
  *   - be attached to both trees, and
  *   - be unique, no duplicate entries are allowed.
  *
  * The zfs_snapshots_by_name tree is indexed by the full dataset name
  * while the zfs_snapshots_by_objsetid tree is indexed by the unique
  * objsetid.  This allows for fast lookups either by name or objsetid.
  */
 static avl_tree_t zfs_snapshots_by_name;
 static avl_tree_t zfs_snapshots_by_objsetid;
 static krwlock_t zfs_snapshot_lock;
 
 /*
  * Control Directory Tunables (.zfs)
  */
 int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
 static int zfs_admin_snapshot = 0;
 static int zfs_snapshot_no_setuid = 0;
 
 typedef struct {
 	char		*se_name;	/* full snapshot name */
 	char		*se_path;	/* full mount path */
 	spa_t		*se_spa;	/* pool spa */
 	uint64_t	se_objsetid;	/* snapshot objset id */
 	struct dentry   *se_root_dentry; /* snapshot root dentry */
-	krwlock_t	se_taskqid_lock;  /* scheduled unmount taskqid lock */
 	taskqid_t	se_taskqid;	/* scheduled unmount taskqid */
 	avl_node_t	se_node_name;	/* zfs_snapshots_by_name link */
 	avl_node_t	se_node_objsetid; /* zfs_snapshots_by_objsetid link */
 	zfs_refcount_t	se_refcount;	/* reference count */
 } zfs_snapentry_t;
 
 static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
 
 /*
  * Allocate a new zfs_snapentry_t being careful to make a copy of the
- * the snapshot name and provided mount point.  No reference is taken.
+ * snapshot name and provided mount point.  No reference is taken.
  */
 static zfs_snapentry_t *
 zfsctl_snapshot_alloc(const char *full_name, const char *full_path, spa_t *spa,
     uint64_t objsetid, struct dentry *root_dentry)
 {
 	zfs_snapentry_t *se;
 
 	se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
 
 	se->se_name = kmem_strdup(full_name);
 	se->se_path = kmem_strdup(full_path);
 	se->se_spa = spa;
 	se->se_objsetid = objsetid;
 	se->se_root_dentry = root_dentry;
 	se->se_taskqid = TASKQID_INVALID;
-	rw_init(&se->se_taskqid_lock, NULL, RW_DEFAULT, NULL);
 
 	zfs_refcount_create(&se->se_refcount);
 
 	return (se);
 }
 
 /*
- * Free a zfs_snapentry_t the caller must ensure there are no active
+ * Free a zfs_snapentry_t; the caller must ensure there are no active
  * references.
  */
 static void
 zfsctl_snapshot_free(zfs_snapentry_t *se)
 {
 	zfs_refcount_destroy(&se->se_refcount);
 	kmem_strfree(se->se_name);
 	kmem_strfree(se->se_path);
-	rw_destroy(&se->se_taskqid_lock);
 
 	kmem_free(se, sizeof (zfs_snapentry_t));
 }
 
 /*
  * Hold a reference on the zfs_snapentry_t.
  */
 static void
 zfsctl_snapshot_hold(zfs_snapentry_t *se)
 {
 	zfs_refcount_add(&se->se_refcount, NULL);
 }
 
 /*
  * Release a reference on the zfs_snapentry_t.  When the number of
  * references drops to zero the structure will be freed.
  */
 static void
 zfsctl_snapshot_rele(zfs_snapentry_t *se)
 {
 	if (zfs_refcount_remove(&se->se_refcount, NULL) == 0)
 		zfsctl_snapshot_free(se);
 }
 
 /*
  * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
  * zfs_snapshots_by_objsetid trees.  While the zfs_snapentry_t is part
  * of the trees a reference is held.
  */
 static void
 zfsctl_snapshot_add(zfs_snapentry_t *se)
 {
 	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
 	zfsctl_snapshot_hold(se);
 	avl_add(&zfs_snapshots_by_name, se);
 	avl_add(&zfs_snapshots_by_objsetid, se);
 }
 
 /*
  * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
  * zfs_snapshots_by_objsetid trees.  Upon removal a reference is dropped,
  * this can result in the structure being freed if that was the last
  * remaining reference.
  */
 static void
 zfsctl_snapshot_remove(zfs_snapentry_t *se)
 {
 	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
 	avl_remove(&zfs_snapshots_by_name, se);
 	avl_remove(&zfs_snapshots_by_objsetid, se);
 	zfsctl_snapshot_rele(se);
 }
 
 /*
  * Snapshot name comparison function for the zfs_snapshots_by_name.
  */
 static int
 snapentry_compare_by_name(const void *a, const void *b)
 {
 	const zfs_snapentry_t *se_a = a;
 	const zfs_snapentry_t *se_b = b;
 	int ret;
 
 	ret = strcmp(se_a->se_name, se_b->se_name);
 
 	if (ret < 0)
 		return (-1);
 	else if (ret > 0)
 		return (1);
 	else
 		return (0);
 }
 
 /*
  * Snapshot name comparison function for the zfs_snapshots_by_objsetid.
  */
 static int
 snapentry_compare_by_objsetid(const void *a, const void *b)
 {
 	const zfs_snapentry_t *se_a = a;
 	const zfs_snapentry_t *se_b = b;
 
 	if (se_a->se_spa != se_b->se_spa)
 		return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1);
 
 	if (se_a->se_objsetid < se_b->se_objsetid)
 		return (-1);
 	else if (se_a->se_objsetid > se_b->se_objsetid)
 		return (1);
 	else
 		return (0);
 }
 
 /*
  * Find a zfs_snapentry_t in zfs_snapshots_by_name.  If the snapname
  * is found a pointer to the zfs_snapentry_t is returned and a reference
  * taken on the structure.  The caller is responsible for dropping the
  * reference with zfsctl_snapshot_rele().  If the snapname is not found
  * NULL will be returned.
  */
 static zfs_snapentry_t *
 zfsctl_snapshot_find_by_name(const char *snapname)
 {
 	zfs_snapentry_t *se, search;
 
 	ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
 
 	search.se_name = (char *)snapname;
 	se = avl_find(&zfs_snapshots_by_name, &search, NULL);
 	if (se)
 		zfsctl_snapshot_hold(se);
 
 	return (se);
 }
 
 /*
  * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
  * rather than the snapname.  In all other respects it behaves the same
  * as zfsctl_snapshot_find_by_name().
  */
 static zfs_snapentry_t *
 zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid)
 {
 	zfs_snapentry_t *se, search;
 
 	ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
 
 	search.se_spa = spa;
 	search.se_objsetid = objsetid;
 	se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
 	if (se)
 		zfsctl_snapshot_hold(se);
 
 	return (se);
 }
 
 /*
  * Rename a zfs_snapentry_t in the zfs_snapshots_by_name.  The structure is
  * removed, renamed, and added back to the new correct location in the tree.
  */
 static int
 zfsctl_snapshot_rename(const char *old_snapname, const char *new_snapname)
 {
 	zfs_snapentry_t *se;
 
 	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
 
 	se = zfsctl_snapshot_find_by_name(old_snapname);
 	if (se == NULL)
 		return (SET_ERROR(ENOENT));
 
 	zfsctl_snapshot_remove(se);
 	kmem_strfree(se->se_name);
 	se->se_name = kmem_strdup(new_snapname);
 	zfsctl_snapshot_add(se);
 	zfsctl_snapshot_rele(se);
 
 	return (0);
 }
 
 /*
  * Delayed task responsible for unmounting an expired automounted snapshot.
  */
 static void
 snapentry_expire(void *data)
 {
 	zfs_snapentry_t *se = (zfs_snapentry_t *)data;
 	spa_t *spa = se->se_spa;
 	uint64_t objsetid = se->se_objsetid;
 
-	if (zfs_expire_snapshot <= 0) {
-		zfsctl_snapshot_rele(se);
-		return;
-	}
-
-	rw_enter(&se->se_taskqid_lock, RW_WRITER);
-	se->se_taskqid = TASKQID_INVALID;
-	rw_exit(&se->se_taskqid_lock);
-	(void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
-	zfsctl_snapshot_rele(se);
+	/*
+	 * If expiry has been disabled skip the unmount, but still fall through
+	 * to clear the taskqid; a stale id would block every later dispatch.
+	 */
+	if (zfs_expire_snapshot > 0)
+		(void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
 
 	/*
-	 * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
+	 * Clear taskqid and reschedule if the snapshot wasn't removed.
 	 * This can occur when the snapshot is busy.
 	 */
-	rw_enter(&zfs_snapshot_lock, RW_READER);
+	rw_enter(&zfs_snapshot_lock, RW_WRITER);
+	se->se_taskqid = TASKQID_INVALID;
+	zfsctl_snapshot_rele(se);
 	if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
 		zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
 		zfsctl_snapshot_rele(se);
 	}
 	rw_exit(&zfs_snapshot_lock);
 }
 
 /*
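- * Cancel an automatic unmount of a snapname.  This callback is responsible
- * for dropping the reference on the zfs_snapentry_t which was taken when
- * during dispatch.
+ * Cancel a pending automatic unmount of a snapshot.  If the task is
+ * cancelled before it runs, this drops the reference on the
+ * zfs_snapentry_t which was taken at dispatch time.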
  */
 static void
 zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
 {
 	int err = 0;
-	rw_enter(&se->se_taskqid_lock, RW_WRITER);
-	err = taskq_cancel_id(system_delay_taskq, se->se_taskqid);
+
+	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+
+	err = taskq_cancel_id(system_delay_taskq, se->se_taskqid, B_FALSE);
 	/*
-	 * if we get ENOENT, the taskq couldn't be found to be
-	 * canceled, so we can just mark it as invalid because
-	 * it's already gone. If we got EBUSY, then we already
-	 * blocked until it was gone _anyway_, so we don't care.
+	 * Clear taskqid only if we successfully cancelled before execution.
+	 * For ENOENT, task already cleared it. For EBUSY, task will clear
+	 * it when done.
 	 */
-	se->se_taskqid = TASKQID_INVALID;
-	rw_exit(&se->se_taskqid_lock);
 	if (err == 0) {
+		se->se_taskqid = TASKQID_INVALID;
 		zfsctl_snapshot_rele(se);
 	}
 }
 
 /*
  * Dispatch the unmount task for delayed handling with a hold protecting it.
  */
 static void
 zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
 {
+	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
 
 	if (delay <= 0)
 		return;
 
-	zfsctl_snapshot_hold(se);
-	rw_enter(&se->se_taskqid_lock, RW_WRITER);
 	/*
 	 * If this condition happens, we managed to:
 	 * - dispatch once
 	 * - want to dispatch _again_ before it returned
 	 *
 	 * So let's just return - if that task fails at unmounting,
 	 * we'll eventually dispatch again, and if it succeeds,
 	 * no problem.
 	 */
 	if (se->se_taskqid != TASKQID_INVALID) {
-		rw_exit(&se->se_taskqid_lock);
-		zfsctl_snapshot_rele(se);
 		return;
 	}
+
+	zfsctl_snapshot_hold(se);
 	se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
 	    snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
-	rw_exit(&se->se_taskqid_lock);
 }
 
 /*
  * Schedule an automatic unmount of objset id to occur in delay seconds from
  * now.  Any previous delayed unmount will be cancelled in favor of the
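- * updated deadline.  A reference is taken by zfsctl_snapshot_find_by_name()
- * and held until the outstanding task is handled or cancelled.
+ * updated deadline.  A reference is taken on the zfs_snapentry_t when the
+ * task is dispatched and held until the task is handled or cancelled.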
  */
 int
 zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay)
 {
 	zfs_snapentry_t *se;
 	int error = ENOENT;
 
-	rw_enter(&zfs_snapshot_lock, RW_READER);
+	rw_enter(&zfs_snapshot_lock, RW_WRITER);
 	if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
 		zfsctl_snapshot_unmount_cancel(se);
 		zfsctl_snapshot_unmount_delay_impl(se, delay);
 		zfsctl_snapshot_rele(se);
 		error = 0;
 	}
 	rw_exit(&zfs_snapshot_lock);
 
 	return (error);
 }
 
 /*
  * Check if snapname is currently mounted.  Returned non-zero when mounted
  * and zero when unmounted.
  */
 static boolean_t
 zfsctl_snapshot_ismounted(const char *snapname)
 {
 	zfs_snapentry_t *se;
 	boolean_t ismounted = B_FALSE;
 
 	rw_enter(&zfs_snapshot_lock, RW_READER);
 	if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
 		zfsctl_snapshot_rele(se);
 		ismounted = B_TRUE;
 	}
 	rw_exit(&zfs_snapshot_lock);
 
 	return (ismounted);
 }
 
 /*
  * Check if the given inode is a part of the virtual .zfs directory.
  */
 boolean_t
 zfsctl_is_node(struct inode *ip)
 {
 	return (ITOZ(ip)->z_is_ctldir);
 }
 
 /*
  * Check if the given inode is a .zfs/snapshots/snapname directory.
  */
 boolean_t
 zfsctl_is_snapdir(struct inode *ip)
 {
 	return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS));
 }
 
 /*
  * Allocate a new inode with the passed id and ops.
  */
 static struct inode *
 zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id,
     const struct file_operations *fops, const struct inode_operations *ops,
     uint64_t creation)
 {
 	struct inode *ip;
 	znode_t *zp;
 	inode_timespec_t now = {.tv_sec = creation};
 
 	ip = new_inode(zfsvfs->z_sb);
 	if (ip == NULL)
 		return (NULL);
 
 	if (!creation)
 		now = current_time(ip);
 	zp = ITOZ(ip);
 	ASSERT0P(zp->z_dirlocks);
 	ASSERT0P(zp->z_acl_cached);
 	ASSERT0P(zp->z_xattr_cached);
 	zp->z_id = id;
 	zp->z_unlinked = B_FALSE;
 	zp->z_atime_dirty = B_FALSE;
 	zp->z_zn_prefetch = B_FALSE;
 	zp->z_is_sa = B_FALSE;
 	zp->z_is_ctldir = B_TRUE;
 	zp->z_sa_hdl = NULL;
 	zp->z_blksz = 0;
 	zp->z_seq = 0;
 	zp->z_mapcnt = 0;
 	zp->z_size = 0;
 	zp->z_pflags = 0;
 	zp->z_mode = 0;
 	zp->z_sync_cnt = 0;
 	ip->i_generation = 0;
 	ip->i_ino = id;
 	ip->i_mode = (S_IFDIR | S_IRWXUGO);
 	ip->i_uid = SUID_TO_KUID(0);
 	ip->i_gid = SGID_TO_KGID(0);
 	ip->i_blkbits = SPA_MINBLOCKSHIFT;
 	zpl_inode_set_atime_to_ts(ip, now);
 	zpl_inode_set_mtime_to_ts(ip, now);
 	zpl_inode_set_ctime_to_ts(ip, now);
 	ip->i_fop = fops;
 	ip->i_op = ops;
 #if defined(IOP_XATTR)
 	ip->i_opflags &= ~IOP_XATTR;
 #endif
 
 	if (insert_inode_locked(ip)) {
 		unlock_new_inode(ip);
 		iput(ip);
 		return (NULL);
 	}
 
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	list_insert_tail(&zfsvfs->z_all_znodes, zp);
 	membar_producer();
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	unlock_new_inode(ip);
 
 	return (ip);
 }
 
 /*
  * Lookup the inode with given id, it will be allocated if needed.
  */
 static struct inode *
 zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id,
     const struct file_operations *fops, const struct inode_operations *ops)
 {
 	struct inode *ip = NULL;
 	uint64_t creation = 0;
 	dsl_dataset_t *snap_ds;
 	dsl_pool_t *pool;
 
 	while (ip == NULL) {
 		ip = ilookup(zfsvfs->z_sb, (unsigned long)id);
 		if (ip)
 			break;
 
 		if (id <= ZFSCTL_INO_SNAPDIRS && !creation) {
 			pool = dmu_objset_pool(zfsvfs->z_os);
 			dsl_pool_config_enter(pool, FTAG);
 			if (!dsl_dataset_hold_obj(pool,
 			    ZFSCTL_INO_SNAPDIRS - id, FTAG, &snap_ds)) {
 				creation = dsl_get_creation(snap_ds);
 				dsl_dataset_rele(snap_ds, FTAG);
 			}
 			dsl_pool_config_exit(pool, FTAG);
 		}
 
 		/* May fail due to concurrent zfsctl_inode_alloc() */
 		ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops, creation);
 	}
 
 	return (ip);
 }
 
 /*
  * Create the '.zfs' directory.  This directory is cached as part of the VFS
  * structure.  This results in a hold on the zfsvfs_t.  The code in zfs_umount()
  * therefore checks against a vfs_count of 2 instead of 1.  This reference
  * is removed when the ctldir is destroyed in the unmount.  All other entities
  * under the '.zfs' directory are created dynamically as needed.
  *
  * Because the dynamically created '.zfs' directory entries assume the use
  * of 64-bit inode numbers this support must be disabled on 32-bit systems.
  */
 int
 zfsctl_create(zfsvfs_t *zfsvfs)
 {
 	ASSERT0P(zfsvfs->z_ctldir);
 
 	zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT,
 	    &zpl_fops_root, &zpl_ops_root, 0);
 	if (zfsvfs->z_ctldir == NULL)
 		return (SET_ERROR(ENOENT));
 
 	return (0);
 }
 
 /*
  * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name.
  * Only called when the filesystem is unmounted.
  */
 void
 zfsctl_destroy(zfsvfs_t *zfsvfs)
 {
 	if (zfsvfs->z_issnap) {
 		zfs_snapentry_t *se;
 		spa_t *spa = zfsvfs->z_os->os_spa;
 		uint64_t objsetid = dmu_objset_id(zfsvfs->z_os);
 
 		rw_enter(&zfs_snapshot_lock, RW_WRITER);
 		se = zfsctl_snapshot_find_by_objsetid(spa, objsetid);
-		if (se != NULL)
-			zfsctl_snapshot_remove(se);
-		rw_exit(&zfs_snapshot_lock);
 		if (se != NULL) {
+			zfsctl_snapshot_remove(se);
+			/*
+			 * Don't wait if snapentry_expire task is calling
+			 * umount, which may have resulted in this destroy
+			 * call. Waiting would deadlock: snapentry_expire
+			 * waits for umount while umount waits for task.
+			 */
 			zfsctl_snapshot_unmount_cancel(se);
 			zfsctl_snapshot_rele(se);
 		}
+		rw_exit(&zfs_snapshot_lock);
 	} else if (zfsvfs->z_ctldir) {
 		iput(zfsvfs->z_ctldir);
 		zfsvfs->z_ctldir = NULL;
 	}
 }
 
 /*
  * Given a root znode, retrieve the associated .zfs directory.
  * Add a hold to the vnode and return it.
  */
 struct inode *
 zfsctl_root(znode_t *zp)
 {
 	ASSERT(zfs_has_ctldir(zp));
 	/* Must have an existing ref, so igrab() cannot return NULL */
 	VERIFY3P(igrab(ZTOZSB(zp)->z_ctldir), !=, NULL);
 	return (ZTOZSB(zp)->z_ctldir);
 }
 
 /*
  * Generate a long fid to indicate a snapdir. We encode whether snapdir is
  * already mounted in gen field. We do this because nfsd lookup will not
  * trigger automount. Next time the nfsd does fh_to_dentry, we will notice
  * this and do automount and return ESTALE to force nfsd revalidate and follow
  * mount.
  */
 static int
 zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp)
 {
 	zfid_short_t *zfid = (zfid_short_t *)fidp;
 	zfid_long_t *zlfid = (zfid_long_t *)fidp;
 	uint32_t gen = 0;
 	uint64_t object;
 	uint64_t objsetid;
 	int i;
 	struct dentry *dentry;
 
 	if (fidp->fid_len < LONG_FID_LEN) {
 		fidp->fid_len = LONG_FID_LEN;
 		return (SET_ERROR(ENOSPC));
 	}
 
 	object = ip->i_ino;
 	objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino;
 	zfid->zf_len = LONG_FID_LEN;
 
 	dentry = d_obtain_alias(igrab(ip));
 	if (!IS_ERR(dentry)) {
 		gen = !!d_mountpoint(dentry);
 		dput(dentry);
 	}
 
 	for (i = 0; i < sizeof (zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
 		zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i));
 
 	for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 		zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i));
 
 	for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 		zlfid->zf_setgen[i] = 0;
 
 	return (0);
 }
 
 /*
  * Generate an appropriate fid for an entry in the .zfs directory.
  */
 int
 zfsctl_fid(struct inode *ip, fid_t *fidp)
 {
 	znode_t		*zp = ITOZ(ip);
 	zfsvfs_t	*zfsvfs = ITOZSB(ip);
 	uint64_t	object = zp->z_id;
 	zfid_short_t	*zfid;
 	int		i;
 	int		error;
 
 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
 		return (error);
 
 	if (zfsctl_is_snapdir(ip)) {
 		zfs_exit(zfsvfs, FTAG);
 		return (zfsctl_snapdir_fid(ip, fidp));
 	}
 
 	if (fidp->fid_len < SHORT_FID_LEN) {
 		fidp->fid_len = SHORT_FID_LEN;
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(ENOSPC));
 	}
 
 	zfid = (zfid_short_t *)fidp;
 
 	zfid->zf_len = SHORT_FID_LEN;
 
 	for (i = 0; i < sizeof (zfid->zf_object); i++)
 		zfid->zf_object[i] = (uint8_t)(object >> (8 * i));
 
 	/* .zfs znodes always have a generation number of 0 */
 	for (i = 0; i < sizeof (zfid->zf_gen); i++)
 		zfid->zf_gen[i] = 0;
 
 	zfs_exit(zfsvfs, FTAG);
 	return (0);
 }
 
 /*
  * Construct a full dataset name in full_name: "pool/dataset@snap_name"
  */
 static int
 zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len,
     char *full_name)
 {
 	objset_t *os = zfsvfs->z_os;
 
 	if (zfs_component_namecheck(snap_name, NULL, NULL) != 0)
 		return (SET_ERROR(EILSEQ));
 
 	dmu_objset_name(os, full_name);
 	if ((strlen(full_name) + 1 + strlen(snap_name)) >= len)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	(void) strcat(full_name, "@");
 	(void) strcat(full_name, snap_name);
 
 	return (0);
 }
 
 /*
  * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/"
  */
 static int
 zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid,
     int path_len, char *full_path)
 {
 	objset_t *os = zfsvfs->z_os;
 	fstrans_cookie_t cookie;
 	char *snapname;
 	boolean_t case_conflict;
 	uint64_t id, pos = 0;
 	int error = 0;
 
 	cookie = spl_fstrans_mark();
 	snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 
 	while (error == 0) {
 		dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 		error = dmu_snapshot_list_next(zfsvfs->z_os,
 		    ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos,
 		    &case_conflict);
 		dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 		if (error)
 			goto out;
 
 		if (id == objsetid)
 			break;
 	}
 
 	mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock);
 	if (zfsvfs->z_vfs->vfs_mntpoint != NULL) {
 		snprintf(full_path, path_len, "%s/.zfs/snapshot/%s",
 		    zfsvfs->z_vfs->vfs_mntpoint, snapname);
 	} else
 		error = SET_ERROR(ENOENT);
 	mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock);
 
 out:
 	kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
 	spl_fstrans_unmark(cookie);
 
 	return (error);
 }
 
 /*
  * Special case the handling of "..".
  */
 int
 zfsctl_root_lookup(struct inode *dip, const char *name, struct inode **ipp,
     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(dip);
 	int error = 0;
 
 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
 		return (error);
 
 	if (zfsvfs->z_show_ctldir == ZFS_SNAPDIR_DISABLED) {
 		*ipp = NULL;
 	} else if (strcmp(name, "..") == 0) {
 		*ipp = dip->i_sb->s_root->d_inode;
 	} else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) {
 		*ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR,
 		    &zpl_fops_snapdir, &zpl_ops_snapdir);
 	} else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) {
 		*ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES,
 		    &zpl_fops_shares, &zpl_ops_shares);
 	} else {
 		*ipp = NULL;
 	}
 
 	if (*ipp == NULL)
 		error = SET_ERROR(ENOENT);
 
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Lookup entry point for the 'snapshot' directory.  Try to open the
  * snapshot if it exist, creating the pseudo filesystem inode as necessary.
  */
 int
 zfsctl_snapdir_lookup(struct inode *dip, const char *name, struct inode **ipp,
     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(dip);
 	uint64_t id;
 	int error;
 
 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
 		return (error);
 
 	error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id);
 	if (error) {
 		zfs_exit(zfsvfs, FTAG);
 		return (error);
 	}
 
 	*ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id,
 	    &simple_dir_operations, &simple_dir_inode_operations);
 	if (*ipp == NULL)
 		error = SET_ERROR(ENOENT);
 
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Renaming a directory under '.zfs/snapshot' will automatically trigger
  * a rename of the snapshot to the new given name.  The rename is confined
  * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere.
  */
 int
 zfsctl_snapdir_rename(struct inode *sdip, const char *snm,
     struct inode *tdip, const char *tnm, cred_t *cr, int flags)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(sdip);
 	char *to, *from, *real, *fsname;
 	int error;
 
 	if (!zfs_admin_snapshot)
 		return (SET_ERROR(EACCES));
 
 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
 		return (error);
 
 	to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 		error = dmu_snapshot_realname(zfsvfs->z_os, snm, real,
 		    ZFS_MAX_DATASET_NAME_LEN, NULL);
 		if (error == 0) {
 			snm = real;
 		} else if (error != ENOTSUP) {
 			goto out;
 		}
 	}
 
 	dmu_objset_name(zfsvfs->z_os, fsname);
 
 	error = zfsctl_snapshot_name(ITOZSB(sdip), snm,
 	    ZFS_MAX_DATASET_NAME_LEN, from);
 	if (error == 0)
 		error = zfsctl_snapshot_name(ITOZSB(tdip), tnm,
 		    ZFS_MAX_DATASET_NAME_LEN, to);
 	if (error == 0)
 		error = zfs_secpolicy_rename_perms(from, to, cr);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Cannot move snapshots out of the snapdir.
 	 */
 	if (sdip != tdip) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	}
 
 	/*
 	 * No-op when names are identical.
 	 */
 	if (strcmp(snm, tnm) == 0) {
 		error = 0;
 		goto out;
 	}
 
 	rw_enter(&zfs_snapshot_lock, RW_WRITER);
 
 	error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE);
 	if (error == 0)
 		(void) zfsctl_snapshot_rename(snm, tnm);
 
 	rw_exit(&zfs_snapshot_lock);
 out:
 	kmem_free(from, ZFS_MAX_DATASET_NAME_LEN);
 	kmem_free(to, ZFS_MAX_DATASET_NAME_LEN);
 	kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
 	kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN);
 
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Removing a directory under '.zfs/snapshot' will automatically trigger
  * the removal of the snapshot with the given name.
  */
 int
 zfsctl_snapdir_remove(struct inode *dip, const char *name, cred_t *cr,
     int flags)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(dip);
 	char *snapname, *real;
 	int error;
 
 	if (!zfs_admin_snapshot)
 		return (SET_ERROR(EACCES));
 
 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
 		return (error);
 
 	snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 		error = dmu_snapshot_realname(zfsvfs->z_os, name, real,
 		    ZFS_MAX_DATASET_NAME_LEN, NULL);
 		if (error == 0) {
 			name = real;
 		} else if (error != ENOTSUP) {
 			goto out;
 		}
 	}
 
 	error = zfsctl_snapshot_name(ITOZSB(dip), name,
 	    ZFS_MAX_DATASET_NAME_LEN, snapname);
 	if (error == 0)
 		error = zfs_secpolicy_destroy_perms(snapname, cr);
 	if (error != 0)
 		goto out;
 
 	error = zfsctl_snapshot_unmount(snapname, MNT_FORCE);
 	if ((error == 0) || (error == ENOENT))
 		error = dsl_destroy_snapshot(snapname, B_FALSE);
 out:
 	kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN);
 	kmem_free(real, ZFS_MAX_DATASET_NAME_LEN);
 
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Creating a directory under '.zfs/snapshot' will automatically trigger
  * the creation of a new snapshot with the given name.
  */
 int
 zfsctl_snapdir_mkdir(struct inode *dip, const char *dirname, vattr_t *vap,
     struct inode **ipp, cred_t *cr, int flags)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(dip);
 	char *dsname;
 	int error;
 
 	if (!zfs_admin_snapshot)
 		return (SET_ERROR(EACCES));
 
 	dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 
 	if (zfs_component_namecheck(dirname, NULL, NULL) != 0) {
 		error = SET_ERROR(EILSEQ);
 		goto out;
 	}
 
 	dmu_objset_name(zfsvfs->z_os, dsname);
 
 	error = zfs_secpolicy_snapshot_perms(dsname, cr);
 	if (error != 0)
 		goto out;
 
 	if (error == 0) {
 		error = dmu_objset_snapshot_one(dsname, dirname);
 		if (error != 0)
 			goto out;
 
 		error = zfsctl_snapdir_lookup(dip, dirname, ipp,
 		    0, cr, NULL, NULL);
 	}
 out:
 	kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN);
 
 	return (error);
 }
 
 /*
  * Flush everything out of the kernel's export table and such.
  * This is needed as once the snapshot is used over NFS, its
  * entries in svc_export and svc_expkey caches hold reference
  * to the snapshot mount point. There is no known way of flushing
  * only the entries related to the snapshot.
  */
 static void
 exportfs_flush(void)
 {
 	char *argv[] = { "/usr/sbin/exportfs", "-f", NULL };
 	char *envp[] = { NULL };
 
 	(void) call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 }
 
 /*
  * Returns the path in char format for given struct path. Uses
  * d_path exported by kernel to convert struct path to char
  * format. Returns the correct path for mountpoints and chroot
  * environments.
  *
  * If chroot environment has directories that are mounted with
  * --bind or --rbind flag, d_path returns the complete path inside
  * chroot environment but does not return the absolute path, i.e.
  * the path to chroot environment is missing.
  */
 static int
 get_root_path(struct path *path, char *buff, int len)
 {
 	char *path_buffer, *path_ptr;
 	int error = 0;
 
 	path_get(path);
 	path_buffer = kmem_zalloc(len, KM_SLEEP);
 	path_ptr = d_path(path, path_buffer, len);
 	if (IS_ERR(path_ptr))
 		error = SET_ERROR(-PTR_ERR(path_ptr));
 	else
 		strcpy(buff, path_ptr);
 
 	kmem_free(path_buffer, len);
 	path_put(path);
 	return (error);
 }
 
 /*
  * Returns if the current process root is chrooted or not. Linux
  * kernel exposes the task_struct for current process and init.
  * Since init process root points to actual root filesystem when
  * Linux runtime is reached, we can compare the current process
  * root with init process root to determine if root of the current
  * process is different from init, which can reliably determine if
  * current process is in chroot context or not.
  */
 static int
 is_current_chrooted(void)
 {
 	struct task_struct *curr = current, *global = &init_task;
 	struct path cr_root, gl_root;
 
 	task_lock(curr);
 	get_fs_root(curr->fs, &cr_root);
 	task_unlock(curr);
 
 	task_lock(global);
 	get_fs_root(global->fs, &gl_root);
 	task_unlock(global);
 
 	int chrooted = !path_equal(&cr_root, &gl_root);
 	path_put(&gl_root);
 	path_put(&cr_root);
 
 	return (chrooted);
 }
 
 /*
  * Attempt to unmount a snapshot by making a call to user space.
  * There is no assurance that this can or will succeed, is just a
  * best effort.  In the case where it does fail, perhaps because
  * it's in use, the unmount will fail harmlessly.
  */
 int
 zfsctl_snapshot_unmount(const char *snapname, int flags)
 {
 	char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL,
 	    NULL };
 	char *envp[] = { NULL };
 	zfs_snapentry_t *entry;
 	int rc;
 
 	/* Look the snapshot up by name; the lock only covers the lookup. */
 	rw_enter(&zfs_snapshot_lock, RW_READER);
 	entry = zfsctl_snapshot_find_by_name(snapname);
 	rw_exit(&zfs_snapshot_lock);
 	if (entry == NULL)
 		return (SET_ERROR(ENOENT));
 
 	exportfs_flush();
 
 	/* A forced unmount swaps the "-n" argument for "-fn". */
 	argv[4] = (flags & MNT_FORCE) ? "-fn" : "-n";
 	argv[5] = entry->se_path;
 	dprintf("unmount; path=%s\n", entry->se_path);
 	rc = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 
 	/* Done with the entry found above. */
 	zfsctl_snapshot_rele(entry);
 
 	/*
 	 * The umount system utility will return 256 on error.  We must
 	 * assume this error is because the file system is busy so it is
 	 * converted to the more sensible EBUSY.
 	 */
 	return (rc ? SET_ERROR(EBUSY) : 0);
 }
 
 int
 zfsctl_snapshot_mount(struct path *path, int flags)
 {
 	struct dentry *dentry = path->dentry;
 	struct inode *ip = dentry->d_inode;
 	zfsvfs_t *zfsvfs;
 	zfsvfs_t *snap_zfsvfs;
 	zfs_snapentry_t *se;
 	char *full_name, *full_path, *options;
 	char *argv[] = { "/usr/bin/env", "mount", "-i", "-t", "zfs", "-n",
 	    "-o", NULL, NULL, NULL, NULL };
 	char *envp[] = { NULL };
 	int error;
 	struct path spath;
 
 	if (ip == NULL)
 		return (SET_ERROR(EISDIR));
 
 	zfsvfs = ITOZSB(ip);
 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
 		return (error);
 
 	full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
 	full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 	options = kmem_zalloc(7, KM_SLEEP);
 
 	error = zfsctl_snapshot_name(zfsvfs, dname(dentry),
 	    ZFS_MAX_DATASET_NAME_LEN, full_name);
 	if (error)
 		goto error;
 
 	if (is_current_chrooted() == 0) {
 		/*
 		 * Current process is not in chroot context
 		 */
 
 		char *m = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
 		struct path mnt_path;
 		mnt_path.mnt = path->mnt;
 		mnt_path.dentry = path->mnt->mnt_root;
 
 		/*
 		 * Get path to current mountpoint
 		 */
 		error = get_root_path(&mnt_path, m, MAXPATHLEN);
 		if (error != 0) {
 			kmem_free(m, MAXPATHLEN);
 			goto error;
 		}
 		mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock);
 		if (zfsvfs->z_vfs->vfs_mntpoint != NULL) {
 			/*
 			 * If current mnountpoint and vfs_mntpoint are not same,
 			 * store current mountpoint in vfs_mntpoint.
 			 */
 			if (strcmp(zfsvfs->z_vfs->vfs_mntpoint, m) != 0) {
 				kmem_strfree(zfsvfs->z_vfs->vfs_mntpoint);
 				zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m);
 			}
 		} else
 			zfsvfs->z_vfs->vfs_mntpoint = kmem_strdup(m);
 		mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock);
 		kmem_free(m, MAXPATHLEN);
 	}
 
 	/*
 	 * Construct a mount point path from sb of the ctldir inode and dirent
 	 * name, instead of from d_path(), so that chroot'd process doesn't fail
 	 * on mount.zfs(8).
 	 */
 	mutex_enter(&zfsvfs->z_vfs->vfs_mntpt_lock);
 	snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s",
 	    zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "",
 	    dname(dentry));
 	mutex_exit(&zfsvfs->z_vfs->vfs_mntpt_lock);
 
 	snprintf(options, 7, "%s",
 	    zfs_snapshot_no_setuid ? "nosuid" : "suid");
 
 	/*
 	 * Multiple concurrent automounts of a snapshot are never allowed.
 	 * The snapshot may be manually mounted as many times as desired.
 	 */
 	if (zfsctl_snapshot_ismounted(full_name)) {
 		error = 0;
 		goto error;
 	}
 
 	/*
 	 * Attempt to mount the snapshot from user space.  Normally this
 	 * would be done using the vfs_kern_mount() function, however that
 	 * function is marked GPL-only and cannot be used.  On error we
 	 * careful to log the real error to the console and return EISDIR
 	 * to safely abort the automount.  This should be very rare.
 	 *
 	 * If the user mode helper happens to return EBUSY, a concurrent
 	 * mount is already in progress in which case the error is ignored.
 	 * Take note that if the program was executed successfully the return
 	 * value from call_usermodehelper() will be (exitcode << 8 + signal).
 	 */
 	dprintf("mount; name=%s path=%s\n", full_name, full_path);
 	argv[7] = options;
 	argv[8] = full_name;
 	argv[9] = full_path;
 	error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC);
 	if (error) {
 		if (!(error & MOUNT_BUSY << 8)) {
 			zfs_dbgmsg("Unable to automount %s error=%d",
 			    full_path, error);
 			error = SET_ERROR(EISDIR);
 		} else {
 			/*
 			 * EBUSY, this could mean a concurrent mount, or the
 			 * snapshot has already been mounted at completely
 			 * different place. We return 0 so VFS will retry. For
 			 * the latter case the VFS will retry several times
 			 * and return ELOOP, which is probably not a very good
 			 * behavior.
 			 */
 			error = 0;
 		}
 		goto error;
 	}
 
 	/*
 	 * Follow down in to the mounted snapshot and set MNT_SHRINKABLE
 	 * to identify this as an automounted filesystem.
 	 */
 	spath = *path;
 	path_get(&spath);
 	if (follow_down_one(&spath)) {
 		snap_zfsvfs = ITOZSB(spath.dentry->d_inode);
 		snap_zfsvfs->z_parent = zfsvfs;
 		dentry = spath.dentry;
 		spath.mnt->mnt_flags |= MNT_SHRINKABLE;
 
 		rw_enter(&zfs_snapshot_lock, RW_WRITER);
 		se = zfsctl_snapshot_alloc(full_name, full_path,
 		    snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os),
 		    dentry);
 		zfsctl_snapshot_add(se);
 		zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
 		rw_exit(&zfs_snapshot_lock);
 	}
 	path_put(&spath);
 error:
 	kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN);
 	kmem_free(full_path, MAXPATHLEN);
 
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Get the snapdir inode from fid.
  *
  * Resolves the snapshot's mount path for 'objsetid', walks it with
  * kern_path() to trigger the automount, then looks up the snapdir
  * inode in 'sb'.  The inode is returned in *ipp only when 'gen'
  * matches whether its dentry is currently a mount point; otherwise
  * *ipp is set to NULL and ENOENT is returned.
  */
 int
 zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen,
     struct inode **ipp)
 {
 	struct path walked;
 	struct dentry *alias;
 	struct inode *snapdir;
 	char *mntpath;
 	int is_mounted;
 	int error;
 
 	mntpath = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 	error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid,
 	    MAXPATHLEN, mntpath);
 	if (error)
 		goto out;
 
 	/* Trigger automount */
 	error = -kern_path(mntpath, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &walked);
 	if (error)
 		goto out;
 
 	/*
 	 * The walked path is not needed beyond the side effect above: it
 	 * contains the root of the snapshot rather than the snapdir, so
 	 * drop it and look the snapdir inode up directly.
 	 */
 	path_put(&walked);
 
 	snapdir = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid);
 	*ipp = snapdir;
 	if (snapdir == NULL) {
 		error = SET_ERROR(ENOENT);
 		goto out;
 	}
 
 	/* check gen, see zfsctl_snapdir_fid */
 	alias = d_obtain_alias(igrab(snapdir));
 	is_mounted = (!IS_ERR(alias) && d_mountpoint(alias));
 	if (gen != is_mounted) {
 		iput(snapdir);
 		*ipp = NULL;
 		error = SET_ERROR(ENOENT);
 	}
 	if (!IS_ERR(alias))
 		dput(alias);
 out:
 	kmem_free(mntpath, MAXPATHLEN);
 	return (error);
 }
 
 /*
  * Look up 'name' in the dataset's shares directory.
  *
  *	dip	- inode used to locate the owning zfsvfs
  *	name	- entry name to look up
  *	ipp	- on success, receives the inode of the entry found
  *	cr	- credentials passed through to zfs_lookup()
  *	flags, direntflags, realpnp - not used by this function
  *
  * Returns 0 on success, ENOTSUP when the dataset has no shares
  * directory, or the error from zfs_enter(), zfs_zget() or zfs_lookup().
  * *ipp is written only on success.
  */
 int
 zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp,
     int flags, cred_t *cr, int *direntflags, pathname_t *realpnp)
 {
 	zfsvfs_t *zfsvfs = ITOZSB(dip);
 	znode_t *zp = NULL;
 	znode_t *dzp;
 	int error;
 
 	if ((error = zfs_enter(zfsvfs, FTAG)) != 0)
 		return (error);
 
 	if (zfsvfs->z_shares_dir == 0) {
 		zfs_exit(zfsvfs, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) {
 		error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL);
 		zrele(dzp);
 	}
 
 	/*
 	 * Hand the znode found by zfs_lookup() back to the caller as an
 	 * inode.  Without this the result of a successful lookup is lost
 	 * and *ipp is never initialized.
 	 */
 	if (error == 0)
 		*ipp = ZTOI(zp);
 
 	zfs_exit(zfsvfs, FTAG);
 
 	return (error);
 }
 
 /*
  * Set up the global state used to track mounted snapshots: the lock
  * and the two AVL trees that index zfs_snapentry_t records by name
  * and by objset id.
  */
 void
 zfsctl_init(void)
 {
 	const size_t entry_size = sizeof (zfs_snapentry_t);
 
 	rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL);
 
 	avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name,
 	    entry_size, offsetof(zfs_snapentry_t, se_node_name));
 	avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid,
 	    entry_size, offsetof(zfs_snapentry_t, se_node_objsetid));
 }
 
 /*
  * Tear down the global snapshot-tracking state: the lock and both
  * AVL trees.
  */
 void
 zfsctl_fini(void)
 {
 	rw_destroy(&zfs_snapshot_lock);
 	avl_destroy(&zfs_snapshots_by_objsetid);
 	avl_destroy(&zfs_snapshots_by_name);
 }
 
 /*
  * Module parameters for the .zfs control directory.  All three are
  * integers registered with mode 0644.
  */
 module_param(zfs_admin_snapshot, int, 0644);
 MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot");
 
 /* Passed to zfsctl_snapshot_unmount_delay_impl() after an automount. */
 module_param(zfs_expire_snapshot, int, 0644);
 MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot");
 
 /* Selects "nosuid" rather than "suid" as the automount option. */
 module_param(zfs_snapshot_no_setuid, int, 0644);
 MODULE_PARM_DESC(zfs_snapshot_no_setuid,
 	"Disable setuid/setgid for automounts in .zfs/snapshot");
diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c
index e8de536606e2..7edea05f94e6 100644
--- a/module/os/linux/zfs/zfs_dir.c
+++ b/module/os/linux/zfs/zfs_dir.c
@@ -1,1291 +1,1292 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/time.h>
 #include <sys/sysmacros.h>
 #include <sys/vfs.h>
 #include <sys/vnode.h>
 #include <sys/file.h>
 #include <sys/kmem.h>
 #include <sys/uio.h>
 #include <sys/pathname.h>
 #include <sys/cmn_err.h>
 #include <sys/errno.h>
 #include <sys/stat.h>
 #include <sys/sunddi.h>
 #include <sys/random.h>
 #include <sys/policy.h>
 #include <sys/zfs_dir.h>
 #include <sys/zfs_acl.h>
 #include <sys/zfs_vnops.h>
 #include <sys/fs/zfs.h>
 #include <sys/zap.h>
 #include <sys/dmu.h>
 #include <sys/atomic.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/sa.h>
 #include <sys/zfs_sa.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 
 /*
  * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups
  * of names after deciding which is the appropriate lookup interface.
  */
 static int
 zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name,
     matchtype_t mt, boolean_t update, int *deflags, pathname_t *rpnp,
     uint64_t *zoid)
 {
 	boolean_t case_conflict = B_FALSE;
 	int rc;
 
 	/* Note: 'update' is accepted but not consulted here. */
 	if (!zfsvfs->z_norm) {
 		rc = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid);
 	} else {
 		/* Optional buffer that receives the name as stored. */
 		char *realbuf = (rpnp != NULL) ? rpnp->pn_buf : NULL;
 		size_t realbufsz = (rpnp != NULL) ? rpnp->pn_bufsize : 0;
 
 		/*
 		 * In the non-mixed case we only expect there would ever
 		 * be one match, but we need to use the normalizing lookup.
 		 */
 		rc = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1,
 		    zoid, mt, realbuf, realbufsz, &case_conflict);
 	}
 
 	/*
 	 * Allow multiple entries provided the first entry is
 	 * the object id.  Non-zpl consumers may safely make
 	 * use of the additional space.
 	 *
 	 * XXX: This should be a feature flag for compatibility
 	 */
 	if (rc == EOVERFLOW)
 		rc = 0;
 
 	/* Report a case conflict only for successful normalized lookups. */
 	if (rc == 0 && zfsvfs->z_norm && deflags != NULL)
 		*deflags = case_conflict ? ED_CASE_CONFLICT : 0;
 
 	/* Reduce the stored dirent value to the object number. */
 	*zoid = ZFS_DIRENT_OBJ(*zoid);
 
 	return (rc);
 }
 
 /*
  * Lock a directory entry.  A dirlock on <dzp, name> protects that name
  * in dzp's directory zap object.  As long as you hold a dirlock, you can
  * assume two things: (1) dzp cannot be reaped, and (2) no other thread
  * can change the zap entry for (i.e. link or unlink) this name.
  *
  * Input arguments:
  *	dzp	- znode for directory
  *	name	- name of entry to lock
  *	flag	- ZNEW: if the entry already exists, fail with EEXIST.
  *		  ZEXISTS: if the entry does not exist, fail with ENOENT.
  *		  ZSHARED: allow concurrent access with other ZSHARED callers.
  *		  ZXATTR: we want dzp's xattr directory
  *		  ZCILOOK: On a mixed sensitivity file system,
  *			   this lookup should be case-insensitive.
  *		  ZCIEXACT: On a purely case-insensitive file system,
  *			    this lookup should be case-sensitive.
  *		  ZRENAMING: we are locking for renaming, force narrow locks
  *		  ZHAVELOCK: Don't grab the z_name_lock for this call. The
  *			     current thread already holds it.
  *
  * Output arguments:
  *	zpp	- pointer to the znode for the entry (NULL if there isn't one)
  *	dlpp	- pointer to the dirlock for this entry (NULL on error)
  *      direntflags - (case-insensitive lookup only)
  *		flags if multiple case-sensitive matches exist in directory
  *      realpnp     - (case-insensitive lookup only)
  *		actual name matched within the directory
  *
  * Return value: 0 on success or errno on failure.
  *
  * NOTE: Always checks for, and rejects, '.' and '..'.
  * NOTE: For case-insensitive file systems we take wide locks (see below),
  *	 but return znode pointers to a single match.
  */
 int
 zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name,
     znode_t **zpp, int flag, int *direntflags, pathname_t *realpnp)
 {
 	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
 	zfs_dirlock_t	*dl;
 	boolean_t	update;
 	matchtype_t	mt = 0;
 	uint64_t	zoid;
 	int		error = 0;
 	int		cmpflags;
 
 	/* Outputs stay NULL on every error return below. */
 	*zpp = NULL;
 	*dlpp = NULL;
 
 	/*
 	 * Verify that we are not trying to lock '.', '..', or '.zfs'
 	 */
 	if ((name[0] == '.' &&
 	    (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) ||
 	    (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))
 		return (SET_ERROR(EEXIST));
 
 	/*
 	 * Case sensitivity and normalization preferences are set when
 	 * the file system is created.  These are stored in the
 	 * zfsvfs->z_case and zfsvfs->z_norm fields.  These choices
 	 * affect what vnodes can be cached in the DNLC, how we
 	 * perform zap lookups, and the "width" of our dirlocks.
 	 *
 	 * A normal dirlock locks a single name.  Note that with
 	 * normalization a name can be composed multiple ways, but
 	 * when normalized, these names all compare equal.  A wide
 	 * dirlock locks multiple names.  We need these when the file
 	 * system is supporting mixed-mode access.  It is sometimes
 	 * necessary to lock all case permutations of file name at
 	 * once so that simultaneous case-insensitive/case-sensitive
 	 * behaves as rationally as possible.
 	 */
 
 	/*
 	 * When matching we may need to normalize & change case according to
 	 * FS settings.
 	 *
 	 * Note that a normalized match is necessary for a case insensitive
 	 * filesystem when the lookup request is not exact because normalization
 	 * can fold case independent of normalizing code point sequences.
 	 *
 	 * See the table above zfs_dropname().
 	 */
 	if (zfsvfs->z_norm != 0) {
 		mt = MT_NORMALIZE;
 
 		/*
 		 * Determine if the match needs to honor the case specified in
 		 * lookup, and if so keep track of that so that during
 		 * normalization we don't fold case.
 		 */
 		if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE &&
 		    (flag & ZCIEXACT)) ||
 		    (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) {
 			mt |= MT_MATCH_CASE;
 		}
 	}
 
 	/*
 	 * Only look in or update the DNLC if we are looking for the
 	 * name on a file system that does not require normalization
 	 * or case folding.  We can also look there if we happen to be
 	 * on a non-normalizing, mixed sensitivity file system IF we
 	 * are looking for the exact name.
 	 *
 	 * Maybe can add TO-UPPERed version of name to dnlc in ci-only
 	 * case for performance improvement?
 	 */
 	update = !zfsvfs->z_norm ||
 	    (zfsvfs->z_case == ZFS_CASE_MIXED &&
 	    !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK));
 
 	/*
 	 * ZRENAMING indicates we are in a situation where we should
 	 * take narrow locks regardless of the file system's
 	 * preferences for normalizing and case folding.  This will
 	 * prevent us deadlocking trying to grab the same wide lock
 	 * twice if the two names happen to be case-insensitive
 	 * matches.
 	 */
 	if (flag & ZRENAMING)
 		cmpflags = 0;
 	else
 		cmpflags = zfsvfs->z_norm;
 
 	/*
 	 * Wait until there are no locks on this name.
 	 *
 	 * Don't grab the lock if it is already held. However, cannot
 	 * have both ZSHARED and ZHAVELOCK together.
 	 */
 	ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK));
 	if (!(flag & ZHAVELOCK))
 		rw_enter(&dzp->z_name_lock, RW_READER);
 
 	/*
 	 * Each pass rescans dzp's dirlock list under z_lock.  The loop
 	 * exits with 'dl' pointing at either a freshly inserted dirlock
 	 * or an existing shared one this caller may join; otherwise it
 	 * sleeps on the conflicting dirlock's cv and starts over.
 	 */
 	mutex_enter(&dzp->z_lock);
 	for (;;) {
 		if (dzp->z_unlinked && !(flag & ZXATTR)) {
 			mutex_exit(&dzp->z_lock);
 			if (!(flag & ZHAVELOCK))
 				rw_exit(&dzp->z_name_lock);
 			return (SET_ERROR(ENOENT));
 		}
 		/* Stop at the first matching dirlock or comparison error. */
 		for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) {
 			if ((u8_strcmp(name, dl->dl_name, 0, cmpflags,
 			    U8_UNICODE_LATEST, &error) == 0) || error != 0)
 				break;
 		}
 		/* A name that cannot be compared is reported as ENOENT. */
 		if (error != 0) {
 			mutex_exit(&dzp->z_lock);
 			if (!(flag & ZHAVELOCK))
 				rw_exit(&dzp->z_name_lock);
 			return (SET_ERROR(ENOENT));
 		}
 		if (dl == NULL)	{
 			/*
 			 * Allocate a new dirlock and add it to the list.
 			 */
 			dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP);
 			cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL);
 			dl->dl_name = name;
 			dl->dl_sharecnt = 0;
 			dl->dl_namelock = 0;
 			dl->dl_namesize = 0;
 			dl->dl_dzp = dzp;
 			dl->dl_next = dzp->z_dirlocks;
 			dzp->z_dirlocks = dl;
 			break;
 		}
 		/* An existing shared lock can be joined by a ZSHARED caller. */
 		if ((flag & ZSHARED) && dl->dl_sharecnt != 0)
 			break;
 		cv_wait(&dl->dl_cv, &dzp->z_lock);
 	}
 
 	/*
 	 * If the caller already held z_name_lock (ZHAVELOCK), record that
 	 * in the dirlock so that zfs_dirent_unlock() does not drop a lock
 	 * this function never took.
 	 */
 	if (flag & ZHAVELOCK)
 		dl->dl_namelock = 1;
 
 	if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) {
 		/*
 		 * We're the second shared reference to dl.  Make a copy of
 		 * dl_name in case the first thread goes away before we do.
 		 * Note that we initialize the new name before storing its
 		 * pointer into dl_name, because the first thread may load
 		 * dl->dl_name at any time.  It'll either see the old value,
 		 * which belongs to it, or the new shared copy; either is OK.
 		 */
 		dl->dl_namesize = strlen(dl->dl_name) + 1;
 		name = kmem_alloc(dl->dl_namesize, KM_SLEEP);
 		memcpy(name, dl->dl_name, dl->dl_namesize);
 		dl->dl_name = name;
 	}
 
 	mutex_exit(&dzp->z_lock);
 
 	/*
 	 * We have a dirlock on the name.  (Note that it is the dirlock,
 	 * not the dzp's z_lock, that protects the name in the zap object.)
 	 * See if there's an object by this name; if so, put a hold on it.
 	 */
 	if (flag & ZXATTR) {
 		error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid,
 		    sizeof (zoid));
 		if (error == 0)
 			error = (zoid == 0 ? SET_ERROR(ENOENT) : 0);
 	} else {
 		error = zfs_match_find(zfsvfs, dzp, name, mt,
 		    update, direntflags, realpnp, &zoid);
 	}
 	if (error) {
 		/*
 		 * ENOENT is tolerated unless the caller demanded ZEXISTS;
 		 * in that case the lock is returned with *zpp left NULL.
 		 */
 		if (error != ENOENT || (flag & ZEXISTS)) {
 			zfs_dirent_unlock(dl);
 			return (error);
 		}
 	} else {
 		if (flag & ZNEW) {
 			zfs_dirent_unlock(dl);
 			return (SET_ERROR(EEXIST));
 		}
 		error = zfs_zget(zfsvfs, zoid, zpp);
 		if (error) {
 			zfs_dirent_unlock(dl);
 			return (error);
 		}
 	}
 
 	*dlpp = dl;
 
 	return (0);
 }
 
 /*
  * Unlock this directory entry and wake anyone who was waiting for it.
  */
 void
 zfs_dirent_unlock(zfs_dirlock_t *dl)
 {
 	znode_t *dir = dl->dl_dzp;
 	zfs_dirlock_t **linkp;
 
 	mutex_enter(&dir->z_lock);
 
 	/* Drop z_name_lock unless dl_namelock marks it as not ours. */
 	if (!dl->dl_namelock)
 		rw_exit(&dir->z_name_lock);
 
 	/* Other sharers remain: just give up our reference. */
 	if (dl->dl_sharecnt > 1) {
 		dl->dl_sharecnt--;
 		mutex_exit(&dir->z_lock);
 		return;
 	}
 
 	/* Find the link that points at dl and splice dl out of the list. */
 	for (linkp = &dir->z_dirlocks; *linkp != dl;
 	    linkp = &(*linkp)->dl_next)
 		continue;
 	*linkp = dl->dl_next;
 
 	/* Wake every thread sleeping on this dirlock. */
 	cv_broadcast(&dl->dl_cv);
 	mutex_exit(&dir->z_lock);
 
 	/* A non-zero dl_namesize means dl_name is a private copy. */
 	if (dl->dl_namesize != 0)
 		kmem_free(dl->dl_name, dl->dl_namesize);
 	cv_destroy(&dl->dl_cv);
 	kmem_free(dl, sizeof (*dl));
 }
 
 /*
  * Look up an entry in a directory.
  *
  * NOTE: '.' and '..' are handled as special cases because
  *	no directory entries are actually stored for them.  If this is
  *	the root of a filesystem, then '.zfs' is also treated as a
  *	special pseudo-directory.
  */
 /*
  *	dzp	- directory znode to search
  *	name	- entry name; "" and "." both resolve to dzp itself
  *	zpp	- on success, receives the held znode for the entry
  *	flags	- FIGNORECASE requests a case-insensitive lookup
  *	deflg	- passed to zfs_dirent_lock() as its direntflags output
  *	rpnp	- optional buffer for the real name (FIGNORECASE only)
  *
  * Returns 0 on success or an errno on failure.  *zpp is written only
  * on success.
  */
 int
 zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags,
     int *deflg, pathname_t *rpnp)
 {
 	zfs_dirlock_t *dl;
 	znode_t *zp;
 	struct inode *ip;
 	int error = 0;
 	uint64_t parent;
 
 	if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) {
 		*zpp = dzp;
 		zhold(*zpp);
 	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
 		zfsvfs_t *zfsvfs = ZTOZSB(dzp);
 
 		/*
 		 * If we are a snapshot mounted under .zfs, return
 		 * the inode pointer for the snapshot directory.
 		 */
 		if ((error = sa_lookup(dzp->z_sa_hdl,
 		    SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0)
 			return (error);
 
 		if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) {
 			error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir,
 			    "snapshot", &ip, 0, kcred, NULL, NULL);
 			/*
 			 * 'ip' is only meaningful when the lookup succeeded;
 			 * do not derive a znode pointer from it otherwise.
 			 */
 			if (error == 0)
 				*zpp = ITOZ(ip);
 			return (error);
 		}
 		rw_enter(&dzp->z_parent_lock, RW_READER);
 		error = zfs_zget(zfsvfs, parent, &zp);
 		if (error == 0)
 			*zpp = zp;
 		rw_exit(&dzp->z_parent_lock);
 	} else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) {
 		if (ZTOZSB(dzp)->z_show_ctldir == ZFS_SNAPDIR_DISABLED) {
 			return (SET_ERROR(ENOENT));
 		}
 		ip = zfsctl_root(dzp);
 		*zpp = ITOZ(ip);
 	} else {
 		int zf;
 
 		zf = ZEXISTS | ZSHARED;
 		if (flags & FIGNORECASE)
 			zf |= ZCILOOK;
 
 		error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp);
 		if (error == 0) {
 			*zpp = zp;
 			zfs_dirent_unlock(dl);
 			dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */
 		}
 		/*
 		 * rpnp was handed to zfs_dirent_lock() above; clear it so
 		 * the copy below does not overwrite the buffer with 'name'.
 		 */
 		rpnp = NULL;
 	}
 
 	/* For the special-cased names the real name is the name given. */
 	if ((flags & FIGNORECASE) && rpnp && !error)
 		(void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize);
 
 	return (error);
 }
 
 /*
  * unlinked Set (formerly known as the "delete queue") Error Handling
  *
  * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we
  * don't specify the name of the entry that we will be manipulating.  We
  * also fib and say that we won't be adding any new entries to the
  * unlinked set, even though we might (this is to lower the minimum file
  * size that can be deleted in a full filesystem).  So on the small
  * chance that the nlink list is using a fat zap (ie. has more than
  * 2000 entries), we *may* not pre-read a block that's needed.
  * Therefore it is remotely possible for some of the assertions
  * regarding the unlinked set below to fail due to i/o error.  On a
  * nondebug system, this will result in the space being leaked.
  */
 void
 zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx)
 {
 	zfsvfs_t *zsb = ZTOZSB(zp);
 	objset_t *os = zsb->z_os;
 	uint64_t unlinked_obj = zsb->z_unlinkedobj;
 
 	/* Only znodes already marked unlinked with no links belong here. */
 	ASSERT(zp->z_unlinked);
 	ASSERT0(ZTOI(zp)->i_nlink);
 
 	/* Record the object number in the unlinked set as part of tx. */
 	VERIFY3U(0, ==, zap_add_int(os, unlinked_obj, zp->z_id, tx));
 
 	dataset_kstats_update_nunlinks_kstat(&zsb->z_kstat, 1);
 }
 
 /*
  * Clean up any znodes that had no links when we either crashed or
  * (force) umounted the file system.
  */
 static void
 zfs_unlinked_drain_task(void *arg)
 {
 	zfsvfs_t *zfsvfs = arg;
 	zap_attribute_t *attr = zap_attribute_alloc();
 	zap_cursor_t cursor;
 	dmu_object_info_t info;
 	znode_t *zp;
 
 	ASSERT3B(zfsvfs->z_draining, ==, B_TRUE);
 
 	/*
 	 * Walk every entry of the unlinked set, stopping early once
 	 * z_drain_cancel has been raised.
 	 */
 	for (zap_cursor_init(&cursor, zfsvfs->z_os, zfsvfs->z_unlinkedobj);
 	    zap_cursor_retrieve(&cursor, attr) == 0 &&
 	    !zfsvfs->z_drain_cancel;
 	    zap_cursor_advance(&cursor)) {
 
 		/*
 		 * Find out what kind of object this entry refers to;
 		 * entries whose info cannot be read are skipped.
 		 */
 		if (dmu_object_info(zfsvfs->z_os,
 		    attr->za_first_integer, &info) != 0)
 			continue;
 
 		ASSERT((info.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) ||
 		    (info.doi_type == DMU_OT_DIRECTORY_CONTENTS));
 
 		/*
 		 * We need to re-mark these list entries for deletion,
 		 * so we pull them back into core and set zp->z_unlinked.
 		 *
 		 * We may pick up znodes that are already marked for deletion.
 		 * This could happen during the purge of an extended attribute
 		 * directory.  All we need to do is skip over them, since they
 		 * are already in the system marked z_unlinked.
 		 */
 		if (zfs_zget(zfsvfs, attr->za_first_integer, &zp) != 0)
 			continue;
 
 		zp->z_unlinked = B_TRUE;
 
 		/*
 		 * zrele() decrements the znode's ref count and may cause
 		 * it to be synchronously freed. We interrupt freeing
 		 * of this znode by checking the return value of
 		 * dmu_objset_zfs_unmounting() in dmu_free_long_range()
 		 * when an unmount is requested.
 		 */
 		zrele(zp);
 		ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
 	}
 	zap_cursor_fini(&cursor);
 
 	/* Mark the drain as finished before releasing the scratch entry. */
 	zfsvfs->z_draining = B_FALSE;
 	zfsvfs->z_drain_task = TASKQID_INVALID;
 	zap_attribute_free(attr);
 }
 
 /*
  * Sets z_draining then tries to dispatch async unlinked drain.
  * If that fails executes synchronous unlinked drain.
  */
 void
 zfs_unlinked_drain(zfsvfs_t *zfsvfs)
 {
 	taskq_t *drain_tq;
 
 	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
 	ASSERT3B(zfsvfs->z_draining, ==, B_FALSE);
 
 	/* Publish the drain state before the task can start running. */
 	zfsvfs->z_draining = B_TRUE;
 	zfsvfs->z_drain_cancel = B_FALSE;
 
 	drain_tq = dsl_pool_unlinked_drain_taskq(
 	    dmu_objset_pool(zfsvfs->z_os));
 	zfsvfs->z_drain_task = taskq_dispatch(drain_tq,
 	    zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP);
 	if (zfsvfs->z_drain_task != TASKQID_INVALID)
 		return;
 
 	/* Dispatch failed: run the drain in the calling thread instead. */
 	zfs_dbgmsg("async zfs_unlinked_drain dispatch failed");
 	zfs_unlinked_drain_task(zfsvfs);
 }
 
 /*
  * Wait for the unlinked drain taskq task to stop. This will interrupt the
  * unlinked set processing if it is in progress.
  */
 void
 zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs)
 {
 	ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE);
 
 	if (zfsvfs->z_draining) {
 		/*
 		 * Raise the cancel flag first: zfs_unlinked_drain_task()
 		 * checks z_drain_cancel on each pass of its loop.  Then
 		 * cancel the dispatched task by id.
 		 * NOTE(review): the trailing B_TRUE presumably asks
 		 * taskq_cancel_id() to wait for a running task - confirm
 		 * against its prototype.
 		 */
 		zfsvfs->z_drain_cancel = B_TRUE;
 		taskq_cancel_id(dsl_pool_unlinked_drain_taskq(
-		    dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task);
+		    dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task,
+		    B_TRUE);
 		/* Reset the drain state so a later drain can be started. */
 		zfsvfs->z_drain_task = TASKQID_INVALID;
 		zfsvfs->z_draining = B_FALSE;
 	}
 }
 
 /*
  * Delete the entire contents of a directory.  Return a count
  * of the number of entries that could not be deleted. If we encounter
  * an error, return a count of at least one so that the directory stays
  * in the unlinked set.
  *
  * NOTE: this function assumes that the directory is inactive,
  *	so there is no need to lock its entries before deletion.
  *	Also, it assumes the directory contents is *only* regular
  *	files.
  */
 static int
 zfs_purgedir(znode_t *dzp)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
 	zap_attribute_t *attr = zap_attribute_alloc();
 	zap_cursor_t cursor;
 	zfs_dirlock_t dl;
 	znode_t *child;
 	dmu_tx_t *tx;
 	int nfailed = 0;
 	int error;
 
 	/*
 	 * 'error' tracks only the cursor result, so that after the loop
 	 * it tells us why the walk stopped.
 	 */
 	for (zap_cursor_init(&cursor, zfsvfs->z_os, dzp->z_id);
 	    (error = zap_cursor_retrieve(&cursor, attr)) == 0;
 	    zap_cursor_advance(&cursor)) {
 		if (zfs_zget(zfsvfs,
 		    ZFS_DIRENT_OBJ(attr->za_first_integer), &child) != 0) {
 			nfailed++;
 			continue;
 		}
 
 		ASSERT(S_ISREG(ZTOI(child)->i_mode) ||
 		    S_ISLNK(ZTOI(child)->i_mode));
 
 		/* One transaction per entry being removed. */
 		tx = dmu_tx_create(zfsvfs->z_os);
 		dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_zap(tx, dzp->z_id, FALSE, attr->za_name);
 		dmu_tx_hold_sa(tx, child->z_sa_hdl, B_FALSE);
 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 		/* Is this really needed ? */
 		zfs_sa_upgrade_txholds(tx, child);
 		dmu_tx_mark_netfree(tx);
 		if (dmu_tx_assign(tx, DMU_TX_WAIT) != 0) {
 			dmu_tx_abort(tx);
 			zfs_zrele_async(child);
 			nfailed++;
 			continue;
 		}
 
 		/*
 		 * Build a throwaway dirlock on the stack naming this
 		 * entry; no real lock is taken.
 		 */
 		memset(&dl, 0, sizeof (dl));
 		dl.dl_dzp = dzp;
 		dl.dl_name = attr->za_name;
 
 		if (zfs_link_destroy(&dl, child, tx, 0, NULL) != 0)
 			nfailed++;
 		dmu_tx_commit(tx);
 
 		zfs_zrele_async(child);
 	}
 	zap_cursor_fini(&cursor);
 	zap_attribute_free(attr);
 
 	/* Anything other than running off the end counts as a failure. */
 	if (error != ENOENT)
 		nfailed++;
 
 	return (nfailed);
 }
 
 /*
  * Remove a znode that has no links and no inode references.
  *
  * An xattr directory has its contents purged first and a regular file
  * has its data freed first.  Then, in a single transaction, any xattr
  * directory of the znode is itself marked unlinked and queued, the
  * znode is removed from the unlinked set, and the object is deleted.
  *
  * If any step fails, only the znode's DMU state is torn down and the
  * function returns without deleting the object.
  */
 void
 zfs_rmnode(znode_t *zp)
 {
 	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
 	objset_t	*os = zfsvfs->z_os;
 	znode_t		*xzp = NULL;
 	dmu_tx_t	*tx;
 	znode_hold_t	*zh;
 	uint64_t	z_id = zp->z_id;
 	uint64_t	acl_obj;
 	uint64_t	xattr_obj;
 	uint64_t	links;
 	int		error;
 
 	ASSERT0(ZTOI(zp)->i_nlink);
 	ASSERT0(atomic_read(&ZTOI(zp)->i_count));
 
 	/*
 	 * If this is an attribute directory, purge its contents.
 	 */
 	if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) {
 		if (zfs_purgedir(zp) != 0) {
 			/*
 			 * Not enough space to delete some xattrs.
 			 * Leave it in the unlinked set.
 			 */
 			zh = zfs_znode_hold_enter(zfsvfs, z_id);
 			zfs_znode_dmu_fini(zp);
 			zfs_znode_hold_exit(zfsvfs, zh);
 			return;
 		}
 	}
 
 	/*
 	 * Free up all the data in the file.  We don't do this for directories
 	 * because we need truncate and remove to be in the same tx, like in
 	 * zfs_znode_delete(). Otherwise, if we crash here we'll end up with
 	 * an inconsistent truncated zap object in the delete queue.  Note a
 	 * truncated file is harmless since it only contains user data.
 	 */
 	if (S_ISREG(ZTOI(zp)->i_mode)) {
 		error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END);
 		if (error) {
 			/*
 			 * Not enough space or we were interrupted by unmount.
 			 * Leave the file in the unlinked set.
 			 */
 			zh = zfs_znode_hold_enter(zfsvfs, z_id);
 			zfs_znode_dmu_fini(zp);
 			zfs_znode_hold_exit(zfsvfs, zh);
 			return;
 		}
 	}
 
 	/*
 	 * If the file has extended attributes, we're going to unlink
 	 * the xattr dir.
 	 */
 	error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
 	    &xattr_obj, sizeof (xattr_obj));
 	if (error == 0 && xattr_obj) {
 		error = zfs_zget(zfsvfs, xattr_obj, &xzp);
 		ASSERT0(error);
 	}
 
 	acl_obj = zfs_external_acl(zp);
 
 	/*
 	 * Set up the final transaction.
 	 */
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END);
 	dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
 	if (xzp) {
 		/* The xattr dir will be added to the unlinked set below. */
 		dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL);
 		dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE);
 	}
 	if (acl_obj)
 		dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END);
 
 	zfs_sa_upgrade_txholds(tx, zp);
 	error = dmu_tx_assign(tx, DMU_TX_WAIT);
 	if (error) {
 		/*
 		 * Not enough space to delete the file.  Leave it in the
 		 * unlinked set, leaking it until the fs is remounted (at
 		 * which point we'll call zfs_unlinked_drain() to process it).
 		 */
 		dmu_tx_abort(tx);
 		zh = zfs_znode_hold_enter(zfsvfs, z_id);
 		zfs_znode_dmu_fini(zp);
 		zfs_znode_hold_exit(zfsvfs, zh);
 		goto out;
 	}
 
 	if (xzp) {
 		ASSERT0(error);
 		mutex_enter(&xzp->z_lock);
 		xzp->z_unlinked = B_TRUE;	/* mark xzp for deletion */
 		clear_nlink(ZTOI(xzp));		/* no more links to it */
 		links = 0;
 		VERIFY0(sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs),
 		    &links, sizeof (links), tx));
 		mutex_exit(&xzp->z_lock);
 		zfs_unlinked_add(xzp, tx);
 	}
 
 	mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
 
 	/*
 	 * Remove this znode from the unlinked set.  If a rollback has
 	 * occurred while a file is open and unlinked, then when the file
 	 * is closed post rollback it will not exist in the rolled back
 	 * version of the unlinked object; that is why ENOENT is accepted
 	 * below.
 	 */
 	error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj,
 	    zp->z_id, tx);
 	VERIFY(error == 0 || error == ENOENT);
 
 	/* Wake dd_activity_cv waiters once the unlinked set is empty. */
 	uint64_t count;
 	if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) {
 		cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv);
 	}
 
 	mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock);
 
 	dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1);
 
 	zfs_znode_delete(zp, tx);
 
 	dmu_tx_commit(tx);
 out:
 	/* Drop the hold taken on the xattr directory by zfs_zget(). */
 	if (xzp)
 		zfs_zrele_async(xzp);
 }
 
 /*
  * Build the 64-bit value stored in a directory ZAP entry for zp: the
  * object number, with the directory-entry type derived from 'mode'
  * placed in the top four bits when the ZPL version supports it.
  */
 static uint64_t
 zfs_dirent(znode_t *zp, uint64_t mode)
 {
 	/* Older ZPL versions store the bare object number. */
 	if (ZTOZSB(zp)->z_version < ZPL_VERSION_DIRENT_TYPE)
 		return (zp->z_id);
 
 	return (zp->z_id | (IFTODT(mode) << 60));
 }
 
 /*
  * Link zp into dl.  Can fail in the following cases :
  * - if zp has been unlinked.
  * - if the number of entries with the same hash (aka. colliding entries)
  *    exceed the capacity of a leaf-block of fatzap and splitting of the
  *    leaf-block does not help.
  *
  *	IN:	dl	- directory lock; supplies the entry name (dl_name)
  *			  and the parent directory znode (dl_dzp)
  *		zp	- znode being linked
  *		tx	- transaction the ZAP and SA updates are made in
  *		flag	- ZRENAMING skips the unlinked check and the link
  *			  count bump; ZNEW skips the link count bump and
  *			  the ctime update on zp
  *
  *	RETURN:	0 on success
  *		ENOENT if zp is already unlinked (only when !ZRENAMING)
  *		the zap_add() error if the entry could not be added
  */
 int
 zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag)
 {
 	znode_t *dzp = dl->dl_dzp;
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	uint64_t value;
 	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
 	sa_bulk_attr_t bulk[5];
 	uint64_t mtime[2], ctime[2];
 	uint64_t links;
 	int count = 0;
 	int error;
 
 	mutex_enter(&zp->z_lock);
 
 	if (!(flag & ZRENAMING)) {
 		if (zp->z_unlinked) {	/* no new links to unlinked zp */
 			ASSERT(!(flag & (ZNEW | ZEXISTS)));
 			mutex_exit(&zp->z_lock);
 			return (SET_ERROR(ENOENT));
 		}
 		if (!(flag & ZNEW)) {
 			/*
 			 * ZNEW nodes come from zfs_mknode() where the link
 			 * count has already been initialised
 			 */
 			inc_nlink(ZTOI(zp));
 			links = ZTOI(zp)->i_nlink;
 			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
 			    NULL, &links, sizeof (links));
 		}
 	}
 
 	/* Add the name to the parent directory's ZAP object. */
 	value = zfs_dirent(zp, zp->z_mode);
 	error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1,
 	    &value, tx);
 
 	/*
 	 * zap_add could fail to add the entry if it exceeds the capacity of the
 	 * leaf-block and zap_leaf_split() failed to help.
 	 * The caller of this routine is responsible for failing the transaction
 	 * which will rollback the SA updates done above.
 	 */
 	if (error != 0) {
 		/* Undo the inc_nlink() above; same condition as that path. */
 		if (!(flag & ZRENAMING) && !(flag & ZNEW))
 			drop_nlink(ZTOI(zp));
 		mutex_exit(&zp->z_lock);
 		return (error);
 	}
 
 	/*
 	 * If we added a longname activate the SPA_FEATURE_LONGNAME.
 	 */
 	if (strlen(dl->dl_name) >= ZAP_MAXNAMELEN) {
 		dsl_dataset_t *ds = dmu_objset_ds(zfsvfs->z_os);
 		ds->ds_feature_activation[SPA_FEATURE_LONGNAME] =
 		    (void *)B_TRUE;
 	}
 
 	/* Record the new parent and flags on zp (plus links, if added). */
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL,
 	    &dzp->z_id, sizeof (dzp->z_id));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &zp->z_pflags, sizeof (zp->z_pflags));
 
 	if (!(flag & ZNEW)) {
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 		    ctime, sizeof (ctime));
 		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
 		    ctime);
 	}
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 	ASSERT0(error);
 
 	mutex_exit(&zp->z_lock);
 
 	/*
 	 * Update the parent directory under its own lock: one more entry,
 	 * and one more link if zp is itself a directory.  The bulk array is
 	 * reused from index 0.
 	 */
 	mutex_enter(&dzp->z_lock);
 	dzp->z_size++;
 	if (zp_is_dir)
 		inc_nlink(ZTOI(dzp));
 	links = ZTOI(dzp)->i_nlink;
 	count = 0;
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL,
 	    &dzp->z_size, sizeof (dzp->z_size));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL,
 	    &links, sizeof (links));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
 	    mtime, sizeof (mtime));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
 	    ctime, sizeof (ctime));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
 	    &dzp->z_pflags, sizeof (dzp->z_pflags));
 	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
 	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
 	ASSERT0(error);
 	mutex_exit(&dzp->z_lock);
 
 	return (0);
 }
 
 /*
  * Remove the entry named dl->dl_name from directory dzp's ZAP object.
  * When the filesystem has z_norm set the removal is a normalized one and
  * the match type is chosen from the table below; otherwise a plain
  * zap_remove() is used.  Returns the error from the ZAP removal call.
  *
  * The match type in the code for this function should conform to:
  *
  * ------------------------------------------------------------------------
  * fs type  | z_norm      | lookup type | match type
  * ---------|-------------|-------------|----------------------------------
  * CS !norm | 0           |           0 | 0 (exact)
  * CS  norm | formX       |           0 | MT_NORMALIZE
  * CI !norm | upper       |   !ZCIEXACT | MT_NORMALIZE
  * CI !norm | upper       |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
  * CI  norm | upper|formX |   !ZCIEXACT | MT_NORMALIZE
  * CI  norm | upper|formX |    ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE
  * CM !norm | upper       |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
  * CM !norm | upper       |     ZCILOOK | MT_NORMALIZE
  * CM  norm | upper|formX |    !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE
  * CM  norm | upper|formX |     ZCILOOK | MT_NORMALIZE
  *
  * Abbreviations:
  *    CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed
  *    upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER)
  *    formX = unicode normalization form set on fs creation
  */
 static int
 zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx,
     int flag)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	matchtype_t mt;
 
 	/* No normalization configured: exact-name removal. */
 	if (!zfsvfs->z_norm) {
 		return (zap_remove(zfsvfs->z_os, dzp->z_id, dl->dl_name,
 		    tx));
 	}
 
 	mt = MT_NORMALIZE;
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 		if (flag & ZCIEXACT)
 			mt |= MT_MATCH_CASE;
 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
 		if (!(flag & ZCILOOK))
 			mt |= MT_MATCH_CASE;
 	}
 
 	return (zap_remove_norm(zfsvfs->z_os, dzp->z_id, dl->dl_name,
 	    mt, tx));
 }
 
 /*
  * Drop one link from zp and write the new link count to its SA.
  * Both callers visible in this file invoke it with zp->z_lock held.
  *
  *	IN:	zp	  - znode losing a link
  *		tx	  - transaction the SA update is made in
  *
  *	OUT:	unlinkedp - if non-NULL, set to whether zp became unlinked;
  *			    if NULL and zp became unlinked, zp is passed to
  *			    zfs_unlinked_add() here instead
  *
  *	RETURN:	0 on success
  *		ENOTEMPTY if zp is a directory and zfs_dirempty() is false
  */
 static int
 zfs_drop_nlink_locked(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
 {
 	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
 	int		zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
 	boolean_t	unlinked = B_FALSE;
 	sa_bulk_attr_t	bulk[3];
 	uint64_t	mtime[2], ctime[2];
 	uint64_t	links;
 	int		count = 0;
 	int		error;
 
 	if (zp_is_dir && !zfs_dirempty(zp))
 		return (SET_ERROR(ENOTEMPTY));
 
 	/*
 	 * The count is expected to be at least zp_is_dir + 1 before the
 	 * drop.  If it is not, report it and reset it to that minimum so
 	 * the drop_nlink() below lands on zp_is_dir.
 	 */
 	if (ZTOI(zp)->i_nlink <= zp_is_dir) {
 		zfs_panic_recover("zfs: link count on %lu is %u, "
 		    "should be at least %u", zp->z_id,
 		    (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
 		set_nlink(ZTOI(zp), zp_is_dir + 1);
 	}
 	drop_nlink(ZTOI(zp));
 	if (ZTOI(zp)->i_nlink == zp_is_dir) {
 		/* That was the last link: mark unlinked and zero the count. */
 		zp->z_unlinked = B_TRUE;
 		clear_nlink(ZTOI(zp));
 		unlinked = B_TRUE;
 	} else {
 		/* Links remain: ctime and flags are updated with the count. */
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
 		    NULL, &ctime, sizeof (ctime));
 		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
 		    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
 		zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
 		    ctime);
 	}
 	links = ZTOI(zp)->i_nlink;
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
 	    NULL, &links, sizeof (links));
 	error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
 	ASSERT0(error);
 
 	if (unlinkedp != NULL)
 		*unlinkedp = unlinked;
 	else if (unlinked)
 		zfs_unlinked_add(zp, tx);
 
 	return (0);
 }
 
 /*
  * Forcefully drop one nlink reference from zp, marking it for deletion if
  * that was its last link.  This *must* only be used on znodes that have
  * already been zfs_link_destroy()'d with ZRENAMING.  Its sole intended user
  * is the error path of zfs_rename(), which has to repair the nlink count
  * when both linking the target and re-linking the original znodes failed.
  *
  * Takes zp->z_lock around zfs_drop_nlink_locked() and returns its result.
  */
 int
 zfs_drop_nlink(znode_t *zp, dmu_tx_t *tx, boolean_t *unlinkedp)
 {
 	int rc;
 
 	mutex_enter(&zp->z_lock);
 	rc = zfs_drop_nlink_locked(zp, tx, unlinkedp);
 	mutex_exit(&zp->z_lock);
 
 	return (rc);
 }
 
 /*
  * Unlink zp from dl, and mark zp for deletion if this was the last link. Can
  * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY).
  * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list.
  * If it's non-NULL, we use it to indicate whether the znode needs deletion,
  * and it's the caller's job to do it.
  *
  *	IN:	dl	  - directory lock; supplies the entry name and the
  *			    parent directory znode (dl_dzp)
  *		zp	  - znode being unlinked
  *		tx	  - transaction the ZAP and SA updates are made in
  *		flag	  - with ZRENAMING only the name is removed; zp's
  *			    link count is left untouched
  *
  *	OUT:	unlinkedp - see above
  *
  *	RETURN:	0 on success
  *		ENOTEMPTY if zp is a non-empty directory (!ZRENAMING)
  *		the error from zfs_dropname() if the name removal failed
  */
 int
 zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
     boolean_t *unlinkedp)
 {
 	znode_t *dzp = dl->dl_dzp;
 	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
 	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
 	boolean_t unlinked = B_FALSE;
 	sa_bulk_attr_t bulk[5];
 	uint64_t mtime[2], ctime[2];
 	uint64_t links;
 	int count = 0;
 	int error;
 
 	if (!(flag & ZRENAMING)) {
 		mutex_enter(&zp->z_lock);
 
 		if (zp_is_dir && !zfs_dirempty(zp)) {
 			mutex_exit(&zp->z_lock);
 			return (SET_ERROR(ENOTEMPTY));
 		}
 
 		/*
 		 * If we get here, we are going to try to remove the object.
 		 * First try removing the name from the directory; if that
 		 * fails, return the error.
 		 */
 		error = zfs_dropname(dl, zp, dzp, tx, flag);
 		if (error != 0) {
 			mutex_exit(&zp->z_lock);
 			return (error);
 		}
 
 		/* The only error is !zfs_dirempty() and we checked earlier. */
 		error = zfs_drop_nlink_locked(zp, tx, &unlinked);
 		ASSERT0(error);
 		mutex_exit(&zp->z_lock);
 	} else {
 		/* Rename: drop only the name, without taking zp->z_lock. */
 		error = zfs_dropname(dl, zp, dzp, tx, flag);
 		if (error != 0)
 			return (error);
 	}
 
 	/* Update the parent directory under its own lock. */
 	mutex_enter(&dzp->z_lock);
 	dzp->z_size--;		/* one dirent removed */
 	if (zp_is_dir)
 		drop_nlink(ZTOI(dzp));	/* ".." link from zp */
 	links = ZTOI(dzp)->i_nlink;
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
 	    NULL, &links, sizeof (links));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
 	    NULL, &dzp->z_size, sizeof (dzp->z_size));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
 	    NULL, ctime, sizeof (ctime));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
 	    NULL, mtime, sizeof (mtime));
 	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
 	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
 	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
 	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
 	ASSERT0(error);
 	mutex_exit(&dzp->z_lock);
 
 	/*
 	 * zfs_drop_nlink_locked() was given &unlinked, so it did not add zp
 	 * to the unlinked set itself; hand the result to the caller or do
 	 * it here.
 	 */
 	if (unlinkedp != NULL)
 		*unlinkedp = unlinked;
 	else if (unlinked)
 		zfs_unlinked_add(zp, tx);
 
 	return (0);
 }
 
 /*
  * Indicate whether the directory is empty.  Works with or without z_lock
  * held, but can only be considered a hint in the latter case.  Returns true
  * if only "." and ".." remain and there's no work in progress.
  *
  * The internal ZAP size, rather than zp->z_size, needs to be checked since
  * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
  * A failing zap_count() is reported as "not empty".
  */
 boolean_t
 zfs_dirempty(znode_t *dzp)
 {
 	uint64_t nentries;
 
 	/* Outstanding directory locks count as work in progress. */
 	if (dzp->z_dirlocks != NULL)
 		return (B_FALSE);
 
 	if (zap_count(ZTOZSB(dzp)->z_os, dzp->z_id, &nentries) != 0)
 		return (B_FALSE);
 
 	return (nentries == 0 ? B_TRUE : B_FALSE);
 }
 
 /*
  * Create the extended attribute directory znode for zp and record its
  * object id in zp's SA_ZPL_XATTR attribute, all within one transaction.
  *
  *	IN:	zp	- znode the attribute directory is created for
  *		vap	- attributes for the new node, passed to
  *			  zfs_acl_ids_create() and zfs_mknode()
  *		cr	- credentials of caller
  *
  *	OUT:	xzpp	- new attribute directory znode on success;
  *			  set to NULL on entry and left so on failure
  *
  *	RETURN:	0 on success
  *		EDQUOT if zfs_acl_ids_overquota() reports over quota
  *		otherwise the error from zfs_acl_ids_create() or
  *		dmu_tx_assign()
  */
 int
 zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr)
 {
 	zfsvfs_t *zfsvfs = ZTOZSB(zp);
 	znode_t *xzp;
 	dmu_tx_t *tx;
 	int error;
 	zfs_acl_ids_t acl_ids;
 	boolean_t fuid_dirtied;
 #ifdef ZFS_DEBUG
 	uint64_t parent;
 #endif
 
 	*xzpp = NULL;
 
 	if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL,
 	    &acl_ids, zfs_init_idmap)) != 0)
 		return (error);
 	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) {
 		zfs_acl_ids_free(&acl_ids);
 		return (SET_ERROR(EDQUOT));
 	}
 
 	/* Declare everything the transaction will touch, then assign it. */
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
 	    ZFS_SA_BASE_ATTR_SIZE);
 	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
 	dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	error = dmu_tx_assign(tx, DMU_TX_WAIT);
 	if (error) {
 		zfs_acl_ids_free(&acl_ids);
 		dmu_tx_abort(tx);
 		return (error);
 	}
 	zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids);
 
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 
 #ifdef ZFS_DEBUG
 	/* Debug-only check that the new node's parent is zp. */
 	error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
 	    &parent, sizeof (parent));
 	ASSERT(error == 0 && parent == zp->z_id);
 #endif
 
 	/* Point zp at its new attribute directory. */
 	VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id,
 	    sizeof (xzp->z_id), tx));
 
 	/* The creation is only logged while zp is not unlinked. */
 	if (!zp->z_unlinked)
 		zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL,
 		    acl_ids.z_fuidp, vap);
 
 	zfs_acl_ids_free(&acl_ids);
 	dmu_tx_commit(tx);
 
 	*xzpp = xzp;
 
 	return (0);
 }
 
 /*
  * Return a znode for the extended attribute directory for zp.
  * ** If the directory does not already exist, it is created **
  * (only when CREATE_XATTR_DIR is set in flags).
  *
  *	IN:	zp	- znode to obtain attribute directory from
  *		cr	- credentials of caller
  *		flags	- flags from the VOP_LOOKUP call
  *
  *	OUT:	xzpp	- pointer to extended attribute znode
  *
  *	RETURN:	0 on success
  *		ENOENT if the directory is missing and creation was not
  *		requested, EROFS if the filesystem is read-only,
  *		otherwise the error number of the failing call
  */
 int
 zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags)
 {
 	zfsvfs_t	*zfsvfs = ZTOZSB(zp);
 	znode_t		*xzp;
 	zfs_dirlock_t	*dl;
 	vattr_t		va;
 	int		error;
 
 	/* The whole sequence is retried while creation reports ERESTART. */
 	do {
 		error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL,
 		    NULL);
 		if (error)
 			return (error);
 
 		/* Directory already exists: hand it back. */
 		if (xzp != NULL) {
 			*xzpp = xzp;
 			zfs_dirent_unlock(dl);
 			return (0);
 		}
 
 		if (!(flags & CREATE_XATTR_DIR)) {
 			zfs_dirent_unlock(dl);
 			return (SET_ERROR(ENOENT));
 		}
 
 		if (zfs_is_readonly(zfsvfs)) {
 			zfs_dirent_unlock(dl);
 			return (SET_ERROR(EROFS));
 		}
 
 		/*
 		 * The ability to 'create' files in an attribute
 		 * directory comes from the write_xattr permission on the
 		 * base file.
 		 *
 		 * The ability to 'search' an attribute directory requires
 		 * read_xattr permission on the base file.
 		 *
 		 * Once in a directory the ability to read/write attributes
 		 * is controlled by the permissions on the attribute file.
 		 */
 		va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID;
 		va.va_mode = S_IFDIR | S_ISVTX | 0777;
 		zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid);
 
 		va.va_dentry = NULL;
 		error = zfs_make_xattrdir(zp, &va, xzpp, cr);
 		zfs_dirent_unlock(dl);
 
 		/* NB: we already did dmu_tx_wait() if necessary */
 	} while (error == ERESTART);
 
 	return (error);
 }
 
 /*
  * Decide whether it is okay to remove within a sticky directory.
  *
  * In sticky directories, write access is not sufficient;
  * you can remove entries from a directory only if:
  *
  *	you own the directory,
  *	you own the entry,
  *	you have write access to the entry,
  *	or you are privileged (checked in secpolicy...).
  *
  * The function returns 0 if remove access is granted.  Access is also
  * granted unconditionally during replay (z_replay) and when the directory
  * does not have S_ISVTX set.
  */
 int
 zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr)
 {
 	zfsvfs_t	*zfsvfs = ZTOZSB(zdp);
 	uid_t		dir_owner;
 	uid_t		file_owner;
 	uid_t		caller;
 
 	if (zfsvfs->z_replay || (zdp->z_mode & S_ISVTX) == 0)
 		return (0);
 
 	dir_owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid),
 	    cr, ZFS_OWNER);
 	file_owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid),
 	    cr, ZFS_OWNER);
 
 	/* Ownership of either the directory or the entry is enough. */
 	caller = crgetuid(cr);
 	if (caller == dir_owner || caller == file_owner)
 		return (0);
 
 	/* So is write access to the entry itself. */
 	if (zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr,
 	    zfs_init_idmap) == 0)
 		return (0);
 
 	/* Otherwise defer to the privilege policy. */
 	return (secpolicy_vnode_remove(cr));
 }
diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c
index 8e6b569c2100..5a815b59e37b 100644
--- a/module/zfs/dmu_objset.c
+++ b/module/zfs/dmu_objset.c
@@ -1,3092 +1,3093 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2020 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright 2017 Nexenta Systems, Inc.
  * Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
  * Copyright (c) 2018, loli10K <ezomori.nozomu@gmail.com>. All rights reserved.
  * Copyright (c) 2019, Klara Inc.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2022 Hewlett Packard Enterprise Development LP.
  * Copyright (c) 2025, Rob Norris <robn@despairlabs.com>
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/cred.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dnode.h>
 #include <sys/dbuf.h>
 #include <sys/zvol.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
 #include <sys/vdev.h>
 #include <sys/zfeature.h>
 #include <sys/policy.h>
 #include <sys/spa_impl.h>
 #include <sys/dmu_recv.h>
 #include <sys/zfs_project.h>
 #include "zfs_namecheck.h"
 #include <sys/vdev_impl.h>
 #include <sys/arc.h>
 #include <cityhash.h>
 #include <sys/cred.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
  * before it can be safely accessed.
  * Initialized in dmu_objset_init() and destroyed in dmu_objset_fini().
  */
 krwlock_t os_lock;
 
 /*
  * Tunable to override the maximum number of threads for the parallelization
  * of dmu_objset_find_dp, needed to speed up the import of pools with many
  * datasets.
  * Default is 4 times the number of leaf vdevs.
  * NOTE(review): declared const here, so it cannot be changed from this
  * file at runtime -- confirm whether it is still exposed as a tunable.
  */
 static const int dmu_find_threads = 0;
 
 /*
  * Backfill lower metadnode objects after this many have been freed.
  * Backfilling negatively impacts object creation rates, so only do it
  * if there are enough holes to fill.
  */
 static const int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT;
 
 /*
  * NOTE(review): presumably the hold tag used by the objset upgrade code
  * declared below -- confirm against its users.
  */
 static const char *upgrade_tag = "upgrade_tag";
 
 /* Forward declarations of static functions defined later in this file. */
 static void dmu_objset_find_dp_cb(void *arg);
 
 static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb);
 static void dmu_objset_upgrade_stop(objset_t *os);
 
 /*
  * Initialize the global os_lock reader/writer lock.
  */
 void
 dmu_objset_init(void)
 {
 	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
 }
 
 /*
  * Destroy the global os_lock set up by dmu_objset_init().
  */
 void
 dmu_objset_fini(void)
 {
 	rw_destroy(&os_lock);
 }
 
 /*
  * Return the spa_t stored in the objset's os_spa field.
  */
 spa_t *
 dmu_objset_spa(objset_t *os)
 {
 	spa_t *spa = os->os_spa;
 
 	return (spa);
 }
 
 /*
  * Return the zilog_t stored in the objset's os_zil field.
  */
 zilog_t *
 dmu_objset_zil(objset_t *os)
 {
 	zilog_t *zilog = os->os_zil;
 
 	return (zilog);
 }
 
 /*
  * Return the dsl_pool_t for an objset.  When the objset has a dataset with
  * a ds_dir, that directory's dd_pool is returned; otherwise the pool is
  * looked up from the spa via spa_get_dsl().
  */
 dsl_pool_t *
 dmu_objset_pool(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	if (ds == NULL || !ds->ds_dir)
 		return (spa_get_dsl(os->os_spa));
 
 	return (ds->ds_dir->dd_pool);
 }
 
 /*
  * Return the dsl_dataset_t stored in the objset's os_dsl_dataset field.
  */
 dsl_dataset_t *
 dmu_objset_ds(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	return (ds);
 }
 
 /*
  * Return the os_type recorded in the objset's objset_phys_t (os_phys).
  */
 dmu_objset_type_t
 dmu_objset_type(objset_t *os)
 {
 	objset_phys_t *phys = os->os_phys;
 
 	return (phys->os_type);
 }
 
 /*
  * Write the name of the objset's dataset into buf by way of
  * dsl_dataset_name().
  */
 void
 dmu_objset_name(objset_t *os, char *buf)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	dsl_dataset_name(ds, buf);
 }
 
 /*
  * Return the ds_object of the objset's dataset, or 0 when the objset has
  * no dataset.
  */
 uint64_t
 dmu_objset_id(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	if (!ds)
 		return (0);
 
 	return (ds->ds_object);
 }
 
 /*
  * Return the objset's os_dnodesize value.
  */
 uint64_t
 dmu_objset_dnodesize(objset_t *os)
 {
 	uint64_t dnsize = os->os_dnodesize;
 
 	return (dnsize);
 }
 
 /*
  * Return the objset's os_sync value.
  */
 zfs_sync_type_t
 dmu_objset_syncprop(objset_t *os)
 {
 	zfs_sync_type_t sync = os->os_sync;
 
 	return (sync);
 }
 
 /*
  * Return the objset's os_logbias value.
  */
 zfs_logbias_op_t
 dmu_objset_logbias(objset_t *os)
 {
 	zfs_logbias_op_t logbias = os->os_logbias;
 
 	return (logbias);
 }
 
 /*
  * Property callback for "checksum": store the selected checksum in
  * os_checksum.  arg is the objset_t the callback was registered with.
  */
 static void
 checksum_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	/*
 	 * Inheritance should have been done by now.
 	 */
 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
 	objset->os_checksum =
 	    zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
 }
 
 /*
  * Property callback for "compression": derive os_compress from the
  * algorithm part of newval, then os_complevel from its level part and the
  * algorithm just selected.  arg is the objset_t the callback was registered
  * with.
  */
 static void
 compression_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 	spa_t *spa = objset->os_spa;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval != ZIO_COMPRESS_INHERIT);
 
 	objset->os_compress = zio_compress_select(spa,
 	    ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON);
 	objset->os_complevel = zio_complevel_select(spa,
 	    objset->os_compress, ZIO_COMPRESS_LEVEL(newval),
 	    ZIO_COMPLEVEL_DEFAULT);
 }
 
 /*
  * Property callback for "copies": store newval in os_copies.
  * arg is the objset_t the callback was registered with.
  */
 static void
 copies_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval > 0);
 	ASSERT(newval <= spa_max_replication(os->os_spa));
 
 	os->os_copies = newval;
 }
 
 /*
  * Property callback for "dedup": select the dedup checksum and split the
  * result into the checksum proper (os_dedup_checksum) and the verify bit
  * (os_dedup_verify).  arg is the objset_t the callback was registered with.
  */
 static void
 dedup_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 	enum zio_checksum selected;
 
 	/*
 	 * Inheritance should have been done by now.
 	 */
 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
 	selected = zio_checksum_dedup_select(objset->os_spa, newval,
 	    ZIO_CHECKSUM_OFF);
 
 	objset->os_dedup_checksum = selected & ZIO_CHECKSUM_MASK;
 	objset->os_dedup_verify = ((selected & ZIO_CHECKSUM_VERIFY) != 0);
 }
 
 /*
  * Property callback for "primarycache": store newval in os_primary_cache.
  * arg is the objset_t the callback was registered with.
  */
 static void
 primary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 	    newval == ZFS_CACHE_METADATA);
 
 	((objset_t *)arg)->os_primary_cache = newval;
 }
 
 /*
  * Property callback for "secondarycache": store newval in
  * os_secondary_cache.  arg is the objset_t the callback was registered with.
  */
 static void
 secondary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 	    newval == ZFS_CACHE_METADATA);
 
 	((objset_t *)arg)->os_secondary_cache = newval;
 }
 
 /*
  * Property callback for "prefetch": store newval in os_prefetch.
  * arg is the objset_t the callback was registered with.
  */
 static void
 prefetch_changed_cb(void *arg, uint64_t newval)
 {
 	/*
 	 * Inheritance should have been done by now.
 	 */
 	ASSERT(newval == ZFS_PREFETCH_ALL || newval == ZFS_PREFETCH_NONE ||
 	    newval == ZFS_PREFETCH_METADATA);
 
 	((objset_t *)arg)->os_prefetch = newval;
 }
 
 /*
  * Property callback for "sync": store newval in os_sync and, when the
  * objset has a ZIL, pass it on with zil_set_sync().  arg is the objset_t
  * the callback was registered with.
  */
 static void
 sync_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
 	    newval == ZFS_SYNC_DISABLED);
 
 	objset->os_sync = newval;
 
 	if (!objset->os_zil)
 		return;
 
 	zil_set_sync(objset->os_zil, newval);
 }
 
 /*
  * Property callback for "redundant_metadata": store newval in
  * os_redundant_metadata.  arg is the objset_t the callback was registered
  * with.
  */
 static void
 redundant_metadata_changed_cb(void *arg, uint64_t newval)
 {
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
 	    newval == ZFS_REDUNDANT_METADATA_MOST ||
 	    newval == ZFS_REDUNDANT_METADATA_SOME ||
 	    newval == ZFS_REDUNDANT_METADATA_NONE);
 
 	((objset_t *)arg)->os_redundant_metadata = newval;
 }
 
 /*
  * Property callback for "dnodesize": translate the property value into a
  * byte size stored in os_dnodesize.  Values other than the ones handled
  * below leave os_dnodesize untouched.  arg is the objset_t the callback was
  * registered with.
  */
 static void
 dnodesize_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	if (newval == ZFS_DNSIZE_LEGACY) {
 		objset->os_dnodesize = DNODE_MIN_SIZE;
 	} else if (newval == ZFS_DNSIZE_AUTO) {
 		/*
 		 * Choose a dnode size that will work well for most
 		 * workloads if the user specified "auto". Future code
 		 * improvements could dynamically select a dnode size
 		 * based on observed workload patterns.
 		 */
 		objset->os_dnodesize = DNODE_MIN_SIZE * 2;
 	} else if (newval == ZFS_DNSIZE_1K || newval == ZFS_DNSIZE_2K ||
 	    newval == ZFS_DNSIZE_4K || newval == ZFS_DNSIZE_8K ||
 	    newval == ZFS_DNSIZE_16K) {
 		/* Explicit sizes are stored as given. */
 		objset->os_dnodesize = newval;
 	}
 }
 
 /*
  * Property callback for "special_small_blocks": store newval in
  * os_zpl_special_smallblock.  arg is the objset_t the callback was
  * registered with.
  */
 static void
 smallblk_changed_cb(void *arg, uint64_t newval)
 {
 	((objset_t *)arg)->os_zpl_special_smallblock = newval;
 }
 
 /*
  * Property callback for "direct": store newval in os_direct.
  * arg is the objset_t the callback was registered with.
  */
 static void
 direct_changed_cb(void *arg, uint64_t newval)
 {
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_DIRECT_DISABLED || newval == ZFS_DIRECT_STANDARD ||
 	    newval == ZFS_DIRECT_ALWAYS);
 
 	((objset_t *)arg)->os_direct = newval;
 }
 
 /*
  * Property callback for "logbias": store newval in os_logbias and, when the
  * objset has a ZIL, pass it on with zil_set_logbias().  arg is the objset_t
  * the callback was registered with.
  */
 static void
 logbias_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *objset = arg;
 
 	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
 	    newval == ZFS_LOGBIAS_THROUGHPUT);
 
 	objset->os_logbias = newval;
 
 	if (!objset->os_zil)
 		return;
 
 	zil_set_logbias(objset->os_zil, newval);
 }
 
 /*
  * Property callback for "recordsize": store newval in os_recordsize.
  * arg is the objset_t the callback was registered with.
  */
 static void
 recordsize_changed_cb(void *arg, uint64_t newval)
 {
 	((objset_t *)arg)->os_recordsize = newval;
 }
 
 /*
  * Byteswap an objset_phys_t in place.  size selects which layout buf holds:
  * the V1 layout has only the meta dnode, ZIL header, type and flags; V2
  * adds the userused and groupused dnodes; the full objset_phys_t adds the
  * projectused dnode as well.
  */
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
 	objset_phys_t *phys = buf;
 
 	ASSERT(size == OBJSET_PHYS_SIZE_V1 || size == OBJSET_PHYS_SIZE_V2 ||
 	    size == sizeof (objset_phys_t));
 
 	/* Fields present in every layout. */
 	dnode_byteswap(&phys->os_meta_dnode);
 	byteswap_uint64_array(&phys->os_zil_header, sizeof (zil_header_t));
 	phys->os_type = BSWAP_64(phys->os_type);
 	phys->os_flags = BSWAP_64(phys->os_flags);
 
 	if (size < OBJSET_PHYS_SIZE_V2)
 		return;
 
 	dnode_byteswap(&phys->os_userused_dnode);
 	dnode_byteswap(&phys->os_groupused_dnode);
 
 	if (size < sizeof (objset_phys_t))
 		return;
 
 	dnode_byteswap(&phys->os_projectused_dnode);
 }
 
 /*
  * Hash an (objset, object number) pair: cityhash2() over the objset_t
  * pointer value and obj.
  */
 static uint64_t
 dnode_hash(const objset_t *os, uint64_t obj)
 {
 	return (cityhash2((uint64_t)(uintptr_t)os, obj));
 }
 
 /*
  * Multilist index callback for dnodes: map a dnode to a sublist index
  * using the low 32 bits of dnode_hash() modulo the number of sublists.
  */
 static unsigned int
 dnode_multilist_index_func(multilist_t *ml, void *obj)
 {
 	dnode_t *dn = obj;
 	unsigned int hash32;
 
 	/*
 	 * The low order bits of the hash value are thought to be
 	 * distributed evenly. Otherwise, in the case that the multilist
 	 * has a power of two number of sublists, each sublists' usage
 	 * would not be evenly distributed. In this context full 64bit
 	 * division would be a waste of time, so limit it to 32 bits.
 	 */
 	hash32 = (unsigned int)dnode_hash(dn->dn_objset, dn->dn_object);
 
 	return (hash32 % multilist_get_num_sublists(ml));
 }
 
 /*
  * Decide whether reads for this objset may set ARC_FLAG_L2CACHE.
  * Requires os_secondary_cache to be ZFS_CACHE_ALL or ZFS_CACHE_METADATA.
  * When l2arc_exclude_special is non-zero, the answer additionally depends
  * on the vdev that the first DVA of the root block pointer lives on: it is
  * B_FALSE for a missing/hole root bp and for vdevs whose allocation bias is
  * special or dedup.
  */
 static inline boolean_t
 dmu_os_is_l2cacheable(objset_t *os)
 {
 	if (os->os_secondary_cache != ZFS_CACHE_ALL &&
 	    os->os_secondary_cache != ZFS_CACHE_METADATA)
 		return (B_FALSE);
 
 	if (l2arc_exclude_special == 0)
 		return (B_TRUE);
 
 	blkptr_t *rootbp = os->os_rootbp;
 	if (rootbp == NULL || BP_IS_HOLE(rootbp))
 		return (B_FALSE);
 
 	uint64_t vdev_id = DVA_GET_VDEV(rootbp->blk_dva);
 	vdev_t *root_vd = os->os_spa->spa_root_vdev;
 	vdev_t *top_vd = NULL;
 
 	if (vdev_id < root_vd->vdev_children)
 		top_vd = root_vd->vdev_child[vdev_id];
 
 	/* No such child vdev: treated as cacheable. */
 	if (top_vd == NULL)
 		return (B_TRUE);
 
 	if (top_vd->vdev_alloc_bias == VDEV_BIAS_SPECIAL ||
 	    top_vd->vdev_alloc_bias == VDEV_BIAS_DEDUP)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Instantiates the objset_t in-memory structure corresponding to the
  * objset_phys_t that's pointed to by the specified blkptr_t.
  *
  *	IN:	spa	- pool the objset belongs to
  *		ds	- dataset of the objset, or NULL for the meta-objset;
  *			  when non-NULL the caller holds ds_opening_lock and
  *			  the pool config lock (asserted below)
  *		bp	- root block pointer; stored in os_rootbp, not copied
  *
  *	OUT:	osp	- the new objset_t, set only on success
  *
  *	RETURN:	0 on success
  *		EIO if reading the root block failed with ECKSUM
  *		otherwise the error from arc_read() or dsl_prop_register()
  */
 int
 dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     objset_t **osp)
 {
 	objset_t *os;
 	int i, err;
 
 	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 	ASSERT(!BP_IS_REDACTED(bp));
 
 	/*
 	 * We need the pool config lock to get properties.
 	 */
 	ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool));
 
 	/*
 	 * The $ORIGIN dataset (if it exists) doesn't have an associated
 	 * objset, so there's no reason to open it. The $ORIGIN dataset
 	 * will not exist on pools older than SPA_VERSION_ORIGIN.
 	 */
 	if (ds != NULL && spa_get_dsl(spa) != NULL &&
 	    spa_get_dsl(spa)->dp_origin_snap != NULL) {
 		ASSERT3P(ds->ds_dir, !=,
 		    spa_get_dsl(spa)->dp_origin_snap->ds_dir);
 	}
 
 	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
 	os->os_dsl_dataset = ds;
 	os->os_spa = spa;
 	os->os_rootbp = bp;
 	if (!BP_IS_HOLE(os->os_rootbp)) {
 		/* Root bp is not a hole: read the objset_phys_t via the ARC. */
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		zbookmark_phys_t zb;
 		int size;
 		zio_flag_t zio_flags = ZIO_FLAG_CANFAIL;
 		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 		if (dmu_os_is_l2cacheable(os))
 			aflags |= ARC_FLAG_L2CACHE;
 
 		/*
 		 * A non-zero dd_crypto_obj is what sets os_encrypted further
 		 * down; for such datasets the block is read with
 		 * ZIO_FLAG_RAW.
 		 */
 		if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) {
 			ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF);
 			ASSERT(BP_IS_AUTHENTICATED(bp));
 			zio_flags |= ZIO_FLAG_RAW;
 		}
 
 		dprintf_bp(os->os_rootbp, "reading %s", "");
 		err = arc_read(NULL, spa, os->os_rootbp,
 		    arc_getbuf_func, &os->os_phys_buf,
 		    ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb);
 		if (err != 0) {
 			kmem_free(os, sizeof (objset_t));
 			/* convert checksum errors into IO errors */
 			if (err == ECKSUM)
 				err = SET_ERROR(EIO);
 			return (err);
 		}
 
 		/*
 		 * Pick the objset_phys_t size expected for this pool's
 		 * version and enabled features.
 		 */
 		if (spa_version(spa) < SPA_VERSION_USERSPACE)
 			size = OBJSET_PHYS_SIZE_V1;
 		else if (!spa_feature_is_enabled(spa,
 		    SPA_FEATURE_PROJECT_QUOTA))
 			size = OBJSET_PHYS_SIZE_V2;
 		else
 			size = sizeof (objset_phys_t);
 
 		/* Increase the blocksize if we are permitted. */
 		if (arc_buf_size(os->os_phys_buf) < size) {
 			/*
 			 * Copy what was read into a larger, zero-filled
 			 * buffer and use that instead.
 			 */
 			arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
 			    ARC_BUFC_METADATA, size);
 			memset(buf->b_data, 0, size);
 			memcpy(buf->b_data, os->os_phys_buf->b_data,
 			    arc_buf_size(os->os_phys_buf));
 			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 			os->os_phys_buf = buf;
 		}
 
 		os->os_phys = os->os_phys_buf->b_data;
 		os->os_flags = os->os_phys->os_flags;
 	} else {
 		/* Root bp is a hole: start from a zeroed objset_phys_t. */
 		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
 		    sizeof (objset_phys_t) : OBJSET_PHYS_SIZE_V1;
 		os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
 		    ARC_BUFC_METADATA, size);
 		os->os_phys = os->os_phys_buf->b_data;
 		memset(os->os_phys, 0, size);
 	}
 	/*
 	 * These properties will be filled in by the logic in zfs_get_zplprop()
 	 * when they are queried for the first time.
 	 */
 	os->os_version = OBJSET_PROP_UNINITIALIZED;
 	os->os_normalization = OBJSET_PROP_UNINITIALIZED;
 	os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
 	os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
 
 	/*
 	 * Note: the changed_cb will be called once before the register
 	 * func returns, thus changing the checksum/compression from the
 	 * default (fletcher2/off).  Snapshots don't need to know about
 	 * checksum/compression/copies.
 	 */
 	if (ds != NULL) {
 		os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0);
 
 		/*
 		 * Each registration below runs only while err is still 0,
 		 * so the first failure skips all later ones.
 		 */
 		err = dsl_prop_register(ds,
 		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 		    primary_cache_changed_cb, os);
 		if (err == 0) {
 			err = dsl_prop_register(ds,
 			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 			    secondary_cache_changed_cb, os);
 		}
 		if (err == 0) {
 			err = dsl_prop_register(ds,
 			    zfs_prop_to_name(ZFS_PROP_PREFETCH),
 			    prefetch_changed_cb, os);
 		}
 		if (!ds->ds_is_snapshot) {
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 				    checksum_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 				    compression_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_COPIES),
 				    copies_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_DEDUP),
 				    dedup_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 				    logbias_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_SYNC),
 				    sync_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(
 				    ZFS_PROP_REDUNDANT_METADATA),
 				    redundant_metadata_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 				    recordsize_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_DNODESIZE),
 				    dnodesize_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(
 				    ZFS_PROP_SPECIAL_SMALL_BLOCKS),
 				    smallblk_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_DIRECT),
 				    direct_changed_cb, os);
 			}
 		}
 		if (err != 0) {
 			/* Release the phys buffer and the objset itself. */
 			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 			kmem_free(os, sizeof (objset_t));
 			return (err);
 		}
 	} else {
 		/* It's the meta-objset. */
 		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
 		os->os_compress = ZIO_COMPRESS_ON;
 		os->os_complevel = ZIO_COMPLEVEL_DEFAULT;
 		os->os_encrypted = B_FALSE;
 		os->os_copies = spa_max_replication(spa);
 		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
 		os->os_dedup_verify = B_FALSE;
 		os->os_logbias = ZFS_LOGBIAS_LATENCY;
 		os->os_sync = ZFS_SYNC_STANDARD;
 		os->os_primary_cache = ZFS_CACHE_ALL;
 		os->os_secondary_cache = ZFS_CACHE_ALL;
 		os->os_dnodesize = DNODE_MIN_SIZE;
 		os->os_prefetch = ZFS_PREFETCH_ALL;
 	}
 
 	/*
 	 * For snapshots os_zil_header is not copied from disk and stays
 	 * zeroed from the kmem_zalloc() above.
 	 */
 	if (ds == NULL || !ds->ds_is_snapshot)
 		os->os_zil_header = os->os_phys->os_zil_header;
 	os->os_zil = zil_alloc(os, &os->os_zil_header);
 
 	/* One dirty-dnode multilist per TXG_SIZE slot. */
 	for (i = 0; i < TXG_SIZE; i++) {
 		multilist_create(&os->os_dirty_dnodes[i], sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[i]),
 		    dnode_multilist_index_func);
 	}
 	list_create(&os->os_dnodes, sizeof (dnode_t),
 	    offsetof(dnode_t, dn_link));
 	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
 	list_link_init(&os->os_evicting_node);
 
 	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 	/* Zeroed array with boot_ncpus entries. */
 	os->os_obj_next_percpu_len = boot_ncpus;
 	os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len *
 	    sizeof (os->os_obj_next_percpu[0]), KM_SLEEP);
 
 	/*
 	 * Open the special dnodes embedded in the objset_phys_t; the
 	 * accounting dnodes only when the buffer is large enough to hold
 	 * them.
 	 */
 	dnode_special_open(os, &os->os_phys->os_meta_dnode,
 	    DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
 	if (OBJSET_BUF_HAS_USERUSED(os->os_phys_buf)) {
 		dnode_special_open(os, &os->os_phys->os_userused_dnode,
 		    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
 		dnode_special_open(os, &os->os_phys->os_groupused_dnode,
 		    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
 		if (OBJSET_BUF_HAS_PROJECTUSED(os->os_phys_buf))
 			dnode_special_open(os,
 			    &os->os_phys->os_projectused_dnode,
 			    DMU_PROJECTUSED_OBJECT, &os->os_projectused_dnode);
 	}
 
 	mutex_init(&os->os_upgrade_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	*osp = os;
 	return (0);
 }
 
 /*
  * Return the objset backing dataset 'ds' through *osp, opening it on first
  * use and caching the result in ds->ds_objset.
  *
  * Returns 0 on success, otherwise the error from dmu_objset_open_impl().
  * *osp is always loaded from ds->ds_objset, so it is NULL when the open
  * was attempted and failed.
  */
 int
 dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
 {
 	int err = 0;
 
 	/*
 	 * We need the pool_config lock to manipulate the dsl_dataset_t.
 	 * Even if the dataset is long-held, we need the pool_config lock
 	 * to open the objset, as it needs to get properties.
 	 */
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
 
 	/* ds_opening_lock covers the whole check-then-open sequence. */
 	mutex_enter(&ds->ds_opening_lock);
 	if (ds->ds_objset == NULL) {
 		objset_t *os;
 		/* The dataset's block pointer is read under ds_bp_rwlock. */
 		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
 		    ds, dsl_dataset_get_blkptr(ds), &os);
 		rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 		if (err == 0) {
 			/* Publish the new objset under ds_lock. */
 			mutex_enter(&ds->ds_lock);
 			ASSERT0P(ds->ds_objset);
 			ds->ds_objset = os;
 			mutex_exit(&ds->ds_lock);
 		}
 	}
 	*osp = ds->ds_objset;
 	mutex_exit(&ds->ds_opening_lock);
 	return (err);
 }
 
 /*
  * Look up the dataset called 'name' and return its objset in *osp.
  *
  * On success the pool hold and the dataset hold (both taken with 'tag')
  * stay in place.  Because the pool remains held for as long as the objset
  * is held, only one objset can be held at a time.  On failure every hold
  * taken here is dropped before the error is returned.
  */
 int
 dmu_objset_hold_flags(const char *name, boolean_t decrypt, const void *tag,
     objset_t **osp)
 {
 	ds_hold_flags_t hold_flags = DS_HOLD_FLAG_NONE;
 	dsl_pool_t *pool;
 	dsl_dataset_t *dataset;
 	int error;
 
 	if (decrypt)
 		hold_flags = DS_HOLD_FLAG_DECRYPT;
 
 	error = dsl_pool_hold(name, tag, &pool);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_hold_flags(pool, name, hold_flags, tag,
 	    &dataset);
 	if (error != 0)
 		goto out_pool;
 
 	error = dmu_objset_from_ds(dataset, osp);
 	if (error == 0)
 		return (0);
 
 	/* Opening the objset failed: unwind in reverse order. */
 	dsl_dataset_rele_flags(dataset, hold_flags, tag);
 out_pool:
 	dsl_pool_rele(pool, tag);
 	return (error);
 }
 
 /*
  * Convenience form of dmu_objset_hold_flags() that passes B_FALSE for
  * 'decrypt'.
  */
 int
 dmu_objset_hold(const char *name, const void *tag, objset_t **osp)
 {
 	const boolean_t decrypt = B_FALSE;
 
 	return (dmu_objset_hold_flags(name, decrypt, tag, osp));
 }
 
 /*
  * Shared tail of dmu_objset_own() and dmu_objset_own_obj(): fetch the
  * objset of the already-owned dataset 'ds' into *osp and validate the
  * request against it.
  *
  * Returns EINVAL when 'type' is neither DMU_OST_ANY nor the objset's type,
  * EROFS for a writable request on a snapshot or (when decrypting) on a
  * dataset with an incompatible encryption version, or the error from
  * dmu_objset_from_ds() / arc_untransform().  'tag' is unused.
  */
 static int
 dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
     boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
 {
 	(void) tag;
 
 	int error = dmu_objset_from_ds(ds, osp);
 	if (error != 0)
 		return (error);
 
 	objset_t *os = *osp;
 
 	if (type != DMU_OST_ANY && type != os->os_phys->os_type)
 		return (SET_ERROR(EINVAL));
 
 	if (!readonly) {
 		if (dsl_dataset_is_snapshot(ds))
 			return (SET_ERROR(EROFS));
 		if (decrypt &&
 		    dsl_dir_incompatible_encryption_version(ds->ds_dir))
 			return (SET_ERROR(EROFS));
 	}
 
 	/* Nothing more to do unless the phys buf still needs its MACs checked. */
 	if (!decrypt || !arc_is_unauthenticated(os->os_phys_buf))
 		return (0);
 
 	/* if we are decrypting, we can now check MACs in os->os_phys_buf */
 	zbookmark_phys_t zb;
 
 	SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT,
 	    ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	error = arc_untransform(os->os_phys_buf, os->os_spa, &zb, B_FALSE);
 	if (error != 0)
 		return (error);
 
 	ASSERT0(arc_is_unauthenticated(os->os_phys_buf));
 	return (0);
 }
 
 /*
  * Own the dataset called 'name' and return its objset in *osp.
  *
  * dsl_pool must not be held when this is called.
  * Upon successful return, there will be a longhold on the dataset,
  * and the dsl_pool will not be held.
  */
 int
 dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
 {
 	ds_hold_flags_t hold_flags = DS_HOLD_FLAG_NONE;
 	dsl_pool_t *pool;
 	dsl_dataset_t *dataset;
 	boolean_t start_accounting;
 	int error;
 
 	if (decrypt)
 		hold_flags = DS_HOLD_FLAG_DECRYPT;
 
 	error = dsl_pool_hold(name, FTAG, &pool);
 	if (error != 0)
 		return (error);
 
 	error = dsl_dataset_own(pool, name, hold_flags, tag, &dataset);
 	if (error != 0)
 		goto out;
 
 	error = dmu_objset_own_impl(dataset, type, readonly, decrypt, tag,
 	    osp);
 	if (error != 0) {
 		dsl_dataset_disown(dataset, hold_flags, tag);
 		goto out;
 	}
 
 	/*
 	 * User accounting requires the dataset to be decrypted and rw.
 	 * We also don't begin user accounting during claiming to help
 	 * speed up pool import times and to keep this txg reserved
 	 * completely for recovery work.
 	 */
 	start_accounting = !readonly && !pool->dp_spa->spa_claiming &&
 	    (dataset->ds_dir->dd_crypto_obj == 0 || decrypt);
 	if (start_accounting) {
 		if (dmu_objset_userobjspace_upgradable(*osp) ||
 		    dmu_objset_projectquota_upgradable(*osp)) {
 			dmu_objset_id_quota_upgrade(*osp);
 		} else if (dmu_objset_userused_enabled(*osp)) {
 			dmu_objset_userspace_upgrade(*osp);
 		}
 	}
 
 out:
 	/* The pool hold is temporary on both the success and error paths. */
 	dsl_pool_rele(pool, FTAG);
 	return (error);
 }
 
 /*
  * Own the dataset with object number 'obj' in pool 'dp' and return its
  * objset in *osp.  If validation in dmu_objset_own_impl() fails, the
  * dataset is disowned again before the error is returned.
  */
 int
 dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
     boolean_t readonly, boolean_t decrypt, const void *tag, objset_t **osp)
 {
 	ds_hold_flags_t hold_flags = DS_HOLD_FLAG_NONE;
 	dsl_dataset_t *dataset;
 	int error;
 
 	if (decrypt)
 		hold_flags = DS_HOLD_FLAG_DECRYPT;
 
 	error = dsl_dataset_own_obj(dp, obj, hold_flags, tag, &dataset);
 	if (error != 0)
 		return (error);
 
 	error = dmu_objset_own_impl(dataset, type, readonly, decrypt, tag,
 	    osp);
 	if (error != 0)
 		dsl_dataset_disown(dataset, hold_flags, tag);
 
 	return (error);
 }
 
 /*
  * Release the dataset hold and then the pool hold that were taken with
  * 'tag' for objset 'os'.  'decrypt' selects the dataset hold flags used for
  * the release.
  */
 void
 dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, const void *tag)
 {
 	/* Look the pool up before any hold is dropped. */
 	dsl_pool_t *pool = dmu_objset_pool(os);
 	ds_hold_flags_t hold_flags = DS_HOLD_FLAG_NONE;
 
 	if (decrypt)
 		hold_flags = DS_HOLD_FLAG_DECRYPT;
 
 	dsl_dataset_rele_flags(os->os_dsl_dataset, hold_flags, tag);
 	dsl_pool_rele(pool, tag);
 }
 
 /*
  * Convenience form of dmu_objset_rele_flags() that passes B_FALSE for
  * 'decrypt'.
  */
 void
 dmu_objset_rele(objset_t *os, const void *tag)
 {
 	const boolean_t decrypt = B_FALSE;
 
 	dmu_objset_rele_flags(os, decrypt, tag);
 }
 
 /*
  * When we are called, ds MUST refer to a dataset that is owned by 'tag';
  * that is, it is held and long held by 'tag' and ds_owner == tag.  We will
  * then release and reacquire ownership of the dataset while holding the
  * pool config_rwlock, so that no intervening namespace or ownership
  * changes can occur.
  *
  * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
  * release the hold on its dataset and acquire a new one on the dataset of the
  * same name so that it can be partially torn down and reconstructed.
  *
  * The newly owned dataset is returned through *newds.  Failure to re-own
  * the dataset trips VERIFY0().
  */
 void
 dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
     boolean_t decrypt, const void *tag)
 {
 	dsl_pool_t *dp;
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 	ds_hold_flags_t flags;
 
 	flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
 	VERIFY3P(ds, !=, NULL);
 	VERIFY3P(ds->ds_owner, ==, tag);
 	VERIFY(dsl_dataset_long_held(ds));
 
 	/* Capture the name and pool before ownership of 'ds' is given up. */
 	dsl_dataset_name(ds, name);
 	dp = ds->ds_dir->dd_pool;
 	dsl_pool_config_enter(dp, FTAG);
 	dsl_dataset_disown(ds, flags, tag);
 	VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds));
 	dsl_pool_config_exit(dp, FTAG);
 }
 
 /*
  * Give up ownership of the dataset behind 'os' that was taken with 'tag'.
  * The objset's upgrade task is stopped first.
  */
 void
 dmu_objset_disown(objset_t *os, boolean_t decrypt, const void *tag)
 {
 	ds_hold_flags_t hold_flags = DS_HOLD_FLAG_NONE;
 
 	if (decrypt)
 		hold_flags = DS_HOLD_FLAG_DECRYPT;
 
 	/* Stop the upgrading thread, then drop ownership. */
 	dmu_objset_upgrade_stop(os);
 	dsl_dataset_disown(os->os_dsl_dataset, hold_flags, tag);
 }
 
 /*
  * Evict the dbufs of every dnode on os->os_dnodes that currently has a
  * hold, then those of the objset's special dnodes (project/group/user
  * accounting, when present, and finally the meta dnode).
  */
 void
 dmu_objset_evict_dbufs(objset_t *os)
 {
 	dnode_t *dn_marker;
 	dnode_t *dn;
 
 	/*
 	 * The marker is never initialized as a dnode; it is only linked into
 	 * os_dnodes to remember the iteration position while os_lock is
 	 * dropped.
 	 */
 	dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);
 
 	mutex_enter(&os->os_lock);
 	dn = list_head(&os->os_dnodes);
 	while (dn != NULL) {
 		/*
 		 * Skip dnodes without holds.  We have to do this dance
 		 * because dnode_add_ref() only works if there is already a
 		 * hold.  If the dnode has no holds, then it has no dbufs.
 		 */
 		if (dnode_add_ref(dn, FTAG)) {
 			list_insert_after(&os->os_dnodes, dn, dn_marker);
 			mutex_exit(&os->os_lock);
 
 			/* os_lock is not held across the eviction itself. */
 			dnode_evict_dbufs(dn);
 			dnode_rele(dn, FTAG);
 
 			/* Resume from the marker, then unlink it again. */
 			mutex_enter(&os->os_lock);
 			dn = list_next(&os->os_dnodes, dn_marker);
 			list_remove(&os->os_dnodes, dn_marker);
 		} else {
 			dn = list_next(&os->os_dnodes, dn);
 		}
 	}
 	mutex_exit(&os->os_lock);
 
 	kmem_free(dn_marker, sizeof (dnode_t));
 
 	/* The special dnodes are handled explicitly, outside the list walk. */
 	if (DMU_USERUSED_DNODE(os) != NULL) {
 		if (DMU_PROJECTUSED_DNODE(os) != NULL)
 			dnode_evict_dbufs(DMU_PROJECTUSED_DNODE(os));
 		dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
 		dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
 	}
 	dnode_evict_dbufs(DMU_META_DNODE(os));
 }
 
 /*
  * Objset eviction processing is split into two pieces.
  * The first marks the objset as evicting, evicts any dbufs that
  * have a refcount of zero, and then queues up the objset for the
  * second phase of eviction.  Once os->os_dnodes has been cleared by
  * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
  * The second phase closes the special dnodes, dequeues the objset from
  * the list of those undergoing eviction, and finally frees the objset.
  *
  * This function is the first phase; it runs the second phase itself
  * (dmu_objset_evict_done()) only when os->os_dnodes is already empty.
  *
  * NOTE: Due to asynchronous eviction processing (invocation of
  *       dnode_buf_pageout()), it is possible for the meta dnode for the
  *       objset to have no holds even though os->os_dnodes is not empty.
  */
 void
 dmu_objset_evict(objset_t *os)
 {
 	dsl_dataset_t *dataset = os->os_dsl_dataset;
 	boolean_t no_dnodes_left;
 
 	for (int txg = 0; txg < TXG_SIZE; txg++)
 		ASSERT(!dmu_objset_is_dirty(os, txg));
 
 	if (dataset != NULL)
 		dsl_prop_unregister_all(dataset, os);
 
 	if (os->os_sa != NULL)
 		sa_tear_down(os);
 
 	dmu_objset_evict_dbufs(os);
 
 	/* Register as evicting and sample the dnode list under os_lock. */
 	mutex_enter(&os->os_lock);
 	spa_evicting_os_register(os->os_spa, os);
 	no_dnodes_left = list_is_empty(&os->os_dnodes);
 	mutex_exit(&os->os_lock);
 
 	if (no_dnodes_left)
 		dmu_objset_evict_done(os);
 }
 
 /*
  * Second phase of objset eviction: close the special dnodes, free the ZIL
  * and the phys buffer, destroy the objset's locks and dirty lists,
  * deregister it from the spa's evicting list and free the objset_t.
  * os->os_dnodes must already be empty.
  */
 void
 dmu_objset_evict_done(objset_t *os)
 {
 	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 
 	dnode_special_close(&os->os_meta_dnode);
 	if (DMU_USERUSED_DNODE(os)) {
 		if (DMU_PROJECTUSED_DNODE(os))
 			dnode_special_close(&os->os_projectused_dnode);
 		dnode_special_close(&os->os_userused_dnode);
 		dnode_special_close(&os->os_groupused_dnode);
 	}
 	zil_free(os->os_zil);
 
 	arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 
 	/*
 	 * This is a barrier to prevent the objset from going away in
 	 * dnode_move() until we can safely ensure that the objset is still in
 	 * use. We consider the objset valid before the barrier and invalid
 	 * after the barrier.
 	 */
 	rw_enter(&os_lock, RW_READER);
 	rw_exit(&os_lock);
 
 	/* Size must match the allocation: percpu_len entries. */
 	kmem_free(os->os_obj_next_percpu,
 	    os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]));
 
 	mutex_destroy(&os->os_lock);
 	mutex_destroy(&os->os_userused_lock);
 	mutex_destroy(&os->os_obj_lock);
 	mutex_destroy(&os->os_user_ptr_lock);
 	mutex_destroy(&os->os_upgrade_lock);
 	for (int i = 0; i < TXG_SIZE; i++)
 		multilist_destroy(&os->os_dirty_dnodes[i]);
 	/* Deregister while 'os' is still valid; it is freed right after. */
 	spa_evicting_os_deregister(os->os_spa, os);
 	kmem_free(os, sizeof (objset_t));
 }
 
 /*
  * Return dsl_dir_snap_cmtime() for the dsl_dir of the dataset behind 'os'.
  */
 inode_timespec_t
 dmu_objset_snap_cmtime(objset_t *os)
 {
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 
 	return (dsl_dir_snap_cmtime(dd));
 }
 
 /*
  * Initialize a new objset of type 'type' in syncing context and return it.
  *
  * ds     - owning dataset, or NULL (the comment below treats that case as
  *          the MOS); when NULL the objset is opened directly from 'bp'.
  * levels - meta-dnode indirection levels; 0 means compute enough levels
  *          for DN_MAX_OBJECT dnodes (only applied when ds != NULL).
  * blksz  - meta-dnode data block size; 0 selects DNODE_BLOCK_SIZE.
  * ibs    - meta-dnode indirect block shift; 0 selects DN_MAX_INDBLKSHIFT.
  *
  * Failure to obtain the objset trips VERIFY0().
  */
 objset_t *
 dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx)
 {
 	objset_t *os;
 	dnode_t *mdn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	if (blksz == 0)
 		blksz = DNODE_BLOCK_SIZE;
 	if (ibs == 0)
 		ibs = DN_MAX_INDBLKSHIFT;
 
 	if (ds != NULL)
 		VERIFY0(dmu_objset_from_ds(ds, &os));
 	else
 		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
 
 	mdn = DMU_META_DNODE(os);
 
 	dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0,
 	    DNODE_MIN_SLOTS, tx);
 
 	/*
 	 * We don't want to have to increase the meta-dnode's nlevels
 	 * later, because then we could do it in quiescing context while
 	 * we are also accessing it in open context.
 	 *
 	 * This precaution is not necessary for the MOS (ds == NULL),
 	 * because the MOS is only updated in syncing context.
 	 * This is most fortunate: the MOS is the only objset that
 	 * needs to be synced multiple times as spa_sync() iterates
 	 * to convergence, so minimizing its dn_nlevels matters.
 	 */
 	if (ds != NULL) {
 		if (levels == 0) {
 			levels = 1;
 
 			/*
 			 * Determine the number of levels necessary for the
 			 * meta-dnode to contain DN_MAX_OBJECT dnodes.  Note
 			 * that in order to ensure that we do not overflow
 			 * 64 bits, there has to be a nlevels that gives us a
 			 * number of blocks > DN_MAX_OBJECT but < 2^64.
 			 * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)
 			 * (10) must be less than (64 - log2(DN_MAX_OBJECT))
 			 * (16).
 			 */
 			while ((uint64_t)mdn->dn_nblkptr <<
 			    (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) *
 			    (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
 			    DN_MAX_OBJECT)
 				levels++;
 		}
 
 		/* Record the level count for this txg and in the dnode. */
 		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
 		    mdn->dn_nlevels = levels;
 	}
 
 	ASSERT(type != DMU_OST_NONE);
 	ASSERT(type != DMU_OST_ANY);
 	ASSERT(type < DMU_OST_NUMTYPES);
 	os->os_phys->os_type = type;
 
 	/*
 	 * Enable user accounting if it is enabled and this is not an
 	 * encrypted receive.
 	 */
 	if (dmu_objset_userused_enabled(os) &&
 	    (!os->os_encrypted || !dmu_objset_is_receiving(os))) {
 		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 		if (dmu_objset_userobjused_enabled(os)) {
 			ASSERT3P(ds, !=, NULL);
 			ds->ds_feature_activation[
 			    SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
 			os->os_phys->os_flags |=
 			    OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
 		}
 		if (dmu_objset_projectquota_enabled(os)) {
 			ASSERT3P(ds, !=, NULL);
 			ds->ds_feature_activation[
 			    SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
 			os->os_phys->os_flags |=
 			    OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
 		}
 		/* Keep the in-core flags in step with the on-disk copy. */
 		os->os_flags = os->os_phys->os_flags;
 	}
 
 	/*
 	 * NOTE(review): ds may be NULL here; presumably dsl_dataset_dirty()
 	 * tolerates that -- confirm against its definition.
 	 */
 	dsl_dataset_dirty(ds, tx);
 
 	return (os);
 }
 
 /*
  * Called from dsl for the meta-objset.  Same as
  * dmu_objset_create_impl_dnstats() with levels, blksz and ibs all 0, which
  * makes that function pick its defaults.
  */
 objset_t *
 dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_objset_type_t type, dmu_tx_t *tx)
 {
 	const int levels = 0;
 	const int blksz = 0;
 	const int ibs = 0;
 
 	return (dmu_objset_create_impl_dnstats(spa, ds, bp, type,
 	    levels, blksz, ibs, tx));
 }
 
 /*
  * Argument bundle handed by dmu_objset_create() to its sync task
  * (dmu_objset_create_check() / dmu_objset_create_sync()).
  */
 typedef struct dmu_objset_create_arg {
 	/* name of the dataset to create */
 	const char *doca_name;
 	/* credentials captured by dmu_objset_create() */
 	cred_t *doca_cred;
 	/* optional callback run on the new objset in the sync function */
 	void (*doca_userfunc)(objset_t *os, void *arg,
 	    cred_t *cr, dmu_tx_t *tx);
 	/* opaque argument passed to doca_userfunc */
 	void *doca_userarg;
 	/* objset type given to dmu_objset_create_impl() */
 	dmu_objset_type_t doca_type;
 	/* flags passed to dsl_dataset_create_sync() */
 	uint64_t doca_flags;
 	/* crypto parameters; never NULL once dmu_objset_create() sets it */
 	dsl_crypto_params_t *doca_dcp;
 } dmu_objset_create_arg_t;
 
 /*
  * Check function of the dataset-creation sync task.
  *
  * Rejects names containing '@' (EINVAL), names that are too long or fail
  * dataset_nestcheck() (ENAMETOOLONG), names that already exist (EEXIST)
  * and parents that are not DMU_OST_ZFS (ZFS_ERR_WRONG_PARENT).  Errors
  * from the crypto check, the filesystem limit check and the hold/open
  * calls are passed through.  Every hold taken here is released before
  * returning.
  */
 static int
 dmu_objset_create_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_create_arg_t *doca = arg;
 	const char *name = doca->doca_name;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dataset_t *parentds;
 	objset_t *parentos;
 	dsl_dir_t *pdd;
 	const char *tail;
 	int error;
 
 	if (strchr(name, '@') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	if (dataset_nestcheck(name) != 0)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	error = dsl_dir_hold(dp, name, FTAG, &pdd, &tail);
 	if (error != 0)
 		return (error);
 
 	/* A NULL tail means the full name resolved to an existing dir. */
 	if (tail == NULL) {
 		error = SET_ERROR(EEXIST);
 		goto out_dir;
 	}
 
 	error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL);
 	if (error != 0)
 		goto out_dir;
 
 	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 	    doca->doca_cred);
 	if (error != 0)
 		goto out_dir;
 
 	/* can't create below anything but filesystems (eg. no ZVOLs) */
 	error = dsl_dataset_hold_obj(pdd->dd_pool,
 	    dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds);
 	if (error != 0)
 		goto out_dir;
 
 	error = dmu_objset_from_ds(parentds, &parentos);
 	if (error == 0 && dmu_objset_type(parentos) != DMU_OST_ZFS)
 		error = SET_ERROR(ZFS_ERR_WRONG_PARENT);
 
 	dsl_dataset_rele(parentds, FTAG);
 out_dir:
 	dsl_dir_rele(pdd, FTAG);
 	return (error);
 }
 
 /*
  * Sync function of the dataset-creation sync task: create the dataset,
  * initialize its objset, run the optional user callback and, for an
  * encrypted objset, force the dirty data out before the hold taken with
  * DS_HOLD_FLAG_DECRYPT is released.  Logs a "create" history record.
  */
 static void
 dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_create_arg_t *doca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	spa_t *spa = dp->dp_spa;
 	dsl_dir_t *pdd;
 	const char *tail;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 	blkptr_t *bp;
 	objset_t *os;
 	zio_t *rzio;
 
 	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
 
 	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
 	    doca->doca_cred, doca->doca_dcp, tx);
 
 	VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj,
 	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
 	/* The block pointer is only used while ds_bp_rwlock is held. */
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	bp = dsl_dataset_get_blkptr(ds);
 	os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	if (doca->doca_userfunc != NULL) {
 		doca->doca_userfunc(os, doca->doca_userarg,
 		    doca->doca_cred, tx);
 	}
 
 	/*
 	 * The doca_userfunc() may write out some data that needs to be
 	 * encrypted if the dataset is encrypted (specifically the root
 	 * directory).  This data must be written out before the encryption
 	 * key mapping is removed by dsl_dataset_rele_flags().  Force the
 	 * I/O to occur immediately by invoking the relevant sections of
 	 * dsl_pool_sync().
 	 */
 	if (os->os_encrypted) {
 		dsl_dataset_t *tmpds = NULL;
 		boolean_t need_sync_done = B_FALSE;
 
 		/* Mark the dataset owned by this function for the duration. */
 		mutex_enter(&ds->ds_lock);
 		ds->ds_owner = FTAG;
 		mutex_exit(&ds->ds_lock);
 
 		/* First pass: sync the dataset if it is on the dirty list. */
 		rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 		tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
 		    tx->tx_txg);
 		if (tmpds != NULL) {
 			dsl_dataset_sync(ds, rzio, tx);
 			need_sync_done = B_TRUE;
 		}
 		VERIFY0(zio_wait(rzio));
 
 		dmu_objset_sync_done(os, tx);
 		taskq_wait(dp->dp_sync_taskq);
 		if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
 			ASSERT3P(ds->ds_key_mapping, !=, NULL);
 			key_mapping_rele(spa, ds->ds_key_mapping, ds);
 		}
 
 		/* Second pass: the dataset may have been dirtied again. */
 		rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
 		tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds,
 		    tx->tx_txg);
 		if (tmpds != NULL) {
 			dmu_buf_rele(ds->ds_dbuf, ds);
 			dsl_dataset_sync(ds, rzio, tx);
 		}
 		VERIFY0(zio_wait(rzio));
 
 		if (need_sync_done) {
 			ASSERT3P(ds->ds_key_mapping, !=, NULL);
 			key_mapping_rele(spa, ds->ds_key_mapping, ds);
 			dsl_dataset_sync_done(ds, tx);
 			dmu_buf_rele(ds->ds_dbuf, ds);
 		}
 
 		mutex_enter(&ds->ds_lock);
 		ds->ds_owner = NULL;
 		mutex_exit(&ds->ds_lock);
 	}
 
 	spa_history_log_internal_ds(ds, "create", tx, " ");
 
 	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);
 	dsl_dir_rele(pdd, FTAG);
 }
 
 /*
  * Create the dataset called 'name' with objset type 'type' by running the
  * create check/sync pair as a sync task.  'func', when not NULL, is called
  * with 'arg' on the new objset from the sync function.  On success
  * zvol_create_minors() is invoked for the name.
  *
  * Returns the result of dsl_sync_task().
  */
 int
 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg)
 {
 	dsl_crypto_params_t zero_dcp = { 0 };
 	cred_t *cred = CRED();
 	int error;
 
 	/* Keep the credentials alive for the duration of the sync task. */
 	crhold(cred);
 
 	/*
 	 * Some callers (mostly for testing) do not provide a dcp on their
 	 * own but various code inside the sync task will require it to be
 	 * allocated. Rather than adding NULL checks throughout this code
 	 * or adding dummy dcp's to all of the callers we simply create a
 	 * dummy one here and use that. This zero dcp will have the same
 	 * effect as asking for inheritance of all encryption params.
 	 */
 	dmu_objset_create_arg_t doca = {
 		.doca_name = name,
 		.doca_cred = cred,
 		.doca_userfunc = func,
 		.doca_userarg = arg,
 		.doca_type = type,
 		.doca_flags = flags,
 		.doca_dcp = (dcp != NULL) ? dcp : &zero_dcp,
 	};
 
 	error = dsl_sync_task(name,
 	    dmu_objset_create_check, dmu_objset_create_sync, &doca,
 	    6, ZFS_SPACE_CHECK_NORMAL);
 	if (error == 0)
 		zvol_create_minors(name);
 
 	crfree(cred);
 
 	return (error);
 }
 
 int
 dmu_objset_snapshot_one(const char *fsname, const char *snapname)
 {
 	int err;
 	char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
 	nvlist_t *snaps = fnvlist_alloc();
 
 	fnvlist_add_boolean(snaps, longsnap);
 	kmem_strfree(longsnap);
 	err = dsl_dataset_snapshot(snaps, NULL, NULL);
 	fnvlist_free(snaps);
 	return (err);
 }
 
 /*
  * Taskq callback dispatched by dmu_objset_upgrade().  Runs
  * os->os_upgrade_cb unless os_upgrade_exit was already set when the task
  * started.  os_upgrade_status is preset to EINTR and replaced by the
  * callback's return value if the callback runs.  In all cases the task
  * marks itself finished (os_upgrade_exit set, os_upgrade_id cleared) and
  * drops the long hold tagged with upgrade_tag.
  */
 static void
 dmu_objset_upgrade_task_cb(void *data)
 {
 	objset_t *os = data;
 
 	mutex_enter(&os->os_upgrade_lock);
 	os->os_upgrade_status = EINTR;
 	if (!os->os_upgrade_exit) {
 		int status;
 
 		/* The callback runs without os_upgrade_lock held. */
 		mutex_exit(&os->os_upgrade_lock);
 
 		status = os->os_upgrade_cb(os);
 
 		mutex_enter(&os->os_upgrade_lock);
 
 		os->os_upgrade_status = status;
 	}
 	os->os_upgrade_exit = B_TRUE;
 	os->os_upgrade_id = 0;
 	mutex_exit(&os->os_upgrade_lock);
 	dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
 }
 
 /*
  * Dispatch upgrade callback 'cb' for 'os' on the spa's upgrade taskq,
  * unless a task id is already recorded or a non-zero upgrade status is
  * set.  A long hold tagged upgrade_tag is taken for the task and is
  * released here if nothing ends up being dispatched.  A failed dispatch
  * records ENOMEM in os_upgrade_status.
  */
 static void
 dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
 {
 	/* Unlocked early-out; the id is re-checked under the lock below. */
 	if (os->os_upgrade_id != 0)
 		return;
 
 	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
 	dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag);
 
 	mutex_enter(&os->os_upgrade_lock);
 	if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) {
 		os->os_upgrade_exit = B_FALSE;
 		os->os_upgrade_cb = cb;
 		os->os_upgrade_id = taskq_dispatch(
 		    os->os_spa->spa_upgrade_taskq,
 		    dmu_objset_upgrade_task_cb, os, TQ_SLEEP);
 		if (os->os_upgrade_id == TASKQID_INVALID) {
 			dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
 			os->os_upgrade_status = ENOMEM;
 		}
 	} else {
 		dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
 	}
 	mutex_exit(&os->os_upgrade_lock);
 }
 
 /*
  * Ask the upgrade task of 'os' to stop.  os_upgrade_exit is always set.
  * If a task id is recorded it is cleared and the task is cancelled by id;
  * when taskq_cancel_id() returns 0 the long hold tagged upgrade_tag is
  * released here.  The function then waits in txg_wait_synced().
  *
  * NOTE(review): a 0 return presumably means the task was cancelled before
  * it ran, so its callback will not drop the hold itself -- confirm against
  * the taskq_cancel_id() contract.
  */
 static void
 dmu_objset_upgrade_stop(objset_t *os)
 {
 	mutex_enter(&os->os_upgrade_lock);
 	os->os_upgrade_exit = B_TRUE;
 	if (os->os_upgrade_id != 0) {
 		taskqid_t id = os->os_upgrade_id;
 
 		os->os_upgrade_id = 0;
 		mutex_exit(&os->os_upgrade_lock);
 
-		if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) {
+		if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id,
+		    B_TRUE)) == 0) {
 			dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
 		}
 		txg_wait_synced(os->os_spa->spa_dsl_pool, 0);
 	} else {
 		mutex_exit(&os->os_upgrade_lock);
 	}
 }
 
 /*
  * Drain one dirty-dnode sublist: each dnode is removed from 'list', given
  * its parent zio, moved (with a new hold) onto its objset's
  * os_synced_dnodes list and then synced with dnode_sync().
  */
 static void
 dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	while ((dn = multilist_sublist_head(list)) != NULL) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		ASSERT(dn->dn_dbuf->db_data_pending);
 		/*
 		 * Initialize dn_zio outside dnode_sync() because the
 		 * meta-dnode needs to set it outside dnode_sync().
 		 */
 		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
 		ASSERT(dn->dn_zio);
 
 		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
 		multilist_sublist_remove(list, dn);
 
 		/*
 		 * See the comment above dnode_rele_task() for an explanation
 		 * of why this dnode hold is always needed (even when not
 		 * doing user accounting).
 		 */
 		multilist_t *newlist = &dn->dn_objset->os_synced_dnodes;
 		/* The hold is tagged with the destination list. */
 		(void) dnode_add_ref(dn, newlist);
 		multilist_insert(newlist, dn);
 
 		dnode_sync(dn, tx);
 	}
 }
 
 /*
  * "Ready" callback passed to arc_write() by dmu_objset_sync() for the
  * objset root block.  Sets the fill count of the block pointer being
  * written and copies that block pointer into *os->os_rootbp.  'abuf' is
  * unused; 'arg' is the objset.
  */
 static void
 dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	(void) abuf;
 	blkptr_t *bp = zio->io_bp;
 	objset_t *os = arg;
 	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 	uint64_t fill = 0;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
 	ASSERT0(BP_GET_LEVEL(bp));
 
 	/*
 	 * Update rootbp fill count: it should be the number of objects
 	 * allocated in the object set (not counting the "special"
 	 * objects that are stored in the objset_phys_t -- the meta
 	 * dnode and user/group/project accounting objects).
 	 */
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
 		fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
 
 	BP_SET_FILL(bp, fill);
 
 	/* With a dataset, the root bp is updated under ds_bp_rwlock. */
 	if (os->os_dsl_dataset != NULL)
 		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
 	*os->os_rootbp = *bp;
 	if (os->os_dsl_dataset != NULL)
 		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
 }
 
 /*
  * "Done" callback passed to arc_write() by dmu_objset_sync() for the
  * objset root block.  Unless the zio was a rewrite, the original block
  * pointer is killed and the new one is recorded as born.  In both cases
  * the blkptr_t that zio->io_bp points to is freed.  'abuf' is unused;
  * 'arg' is the objset.
  */
 static void
 dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	(void) abuf;
 	objset_t *os = arg;
 	blkptr_t *new_bp = zio->io_bp;
 	blkptr_t *old_bp = &zio->io_bp_orig;
 
 	if ((zio->io_flags & ZIO_FLAG_IO_REWRITE) == 0) {
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 		dmu_tx_t *tx = os->os_synctx;
 
 		(void) dsl_dataset_block_kill(ds, old_bp, tx, B_TRUE);
 		dsl_dataset_block_born(ds, new_bp, tx);
 	} else {
 		/* A rewrite must leave the block pointer unchanged. */
 		ASSERT(BP_EQUAL(new_bp, old_bp));
 	}
 	kmem_free(new_bp, sizeof (*new_bp));
 }
 
 /*
  * Per-objset state shared by the tasks that dmu_objset_sync() dispatches.
  * Allocated in dmu_objset_sync() and freed at the end of
  * sync_meta_dnode_task().
  */
 typedef struct sync_objset_arg {
 	/* root block zio; issued by sync_meta_dnode_task() */
 	zio_t		*soa_zio;
 	/* objset being synced */
 	objset_t	*soa_os;
 	/* syncing transaction */
 	dmu_tx_t	*soa_tx;
 	/* held while sync_dnodes_task() decrements soa_count */
 	kmutex_t	soa_mutex;
 	/* number of non-empty sublists whose task has not finished yet */
 	int		soa_count;
 	/* taskq entry used to dispatch sync_meta_dnode_task() */
 	taskq_ent_t	soa_tq_ent;
 } sync_objset_arg_t;
 
 /*
  * Argument of one sync_dnodes_task() invocation; allocated by
  * dmu_objset_sync() and freed by the task.
  */
 typedef struct sync_dnodes_arg {
 	/* dirty-dnode multilist being synced */
 	multilist_t	*sda_list;
 	/* index of the sublist this task handles */
 	int		sda_sublist_idx;
 	/* NOTE(review): not referenced by the code visible here -- confirm */
 	multilist_t	*sda_newlist;
 	/* shared per-objset sync state */
 	sync_objset_arg_t *sda_soa;
 } sync_dnodes_arg_t;
 
 static void sync_meta_dnode_task(void *arg);
 
 /*
  * Taskq function that syncs one sublist of an objset's dirty dnodes.  It
  * frees its argument and decrements soa_count; the task that brings the
  * count to zero dispatches sync_meta_dnode_task().
  */
 static void
 sync_dnodes_task(void *arg)
 {
 	sync_dnodes_arg_t *sda = arg;
 	sync_objset_arg_t *soa = sda->sda_soa;
 	objset_t *os = soa->soa_os;
 
 	/* An allocator is acquired for the duration of the sublist sync. */
 	uint_t allocator = spa_acq_allocator(os->os_spa);
 	multilist_sublist_t *ms =
 	    multilist_sublist_lock_idx(sda->sda_list, sda->sda_sublist_idx);
 
 	dmu_objset_sync_dnodes(ms, soa->soa_tx);
 
 	multilist_sublist_unlock(ms);
 	spa_rel_allocator(os->os_spa, allocator);
 
 	kmem_free(sda, sizeof (*sda));
 
 	mutex_enter(&soa->soa_mutex);
 	ASSERT(soa->soa_count != 0);
 	if (--soa->soa_count != 0) {
 		/* Other sublist tasks are still outstanding. */
 		mutex_exit(&soa->soa_mutex);
 		return;
 	}
 	mutex_exit(&soa->soa_mutex);
 
 	/* Last one out: hand over to the meta dnode task. */
 	taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
 	    sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
 }
 
 /*
  * Issue the zio_nowait() for all dirty record zios on the meta dnode,
  * then trigger the callback for the zil_sync. This runs once for each
  * objset, only after any/all sublists in the objset have been synced.
  *
  * Finally issues the objset's root block zio (soa_zio) and destroys and
  * frees the sync_objset_arg_t passed in 'arg'.
  */
 static void
 sync_meta_dnode_task(void *arg)
 {
 	sync_objset_arg_t *soa = arg;
 	objset_t *os = soa->soa_os;
 	dmu_tx_t *tx = soa->soa_tx;
 	int txgoff = tx->tx_txg & TXG_MASK;
 	dbuf_dirty_record_t *dr;
 
 	ASSERT0(soa->soa_count);
 
 	/* Drain this txg's dirty records of the meta dnode. */
 	list_t *list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
 	while ((dr = list_remove_head(list)) != NULL) {
 		ASSERT0(dr->dr_dbuf->db_level);
 		zio_nowait(dr->dr_zio);
 	}
 
 	/* Enable dnode backfill if enough objects have been freed. */
 	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
 		os->os_rescan_dnodes = B_TRUE;
 		os->os_freed_dnodes = 0;
 	}
 
 	/*
 	 * Free intent log blocks up to this tx.
 	 */
 	zil_sync(os->os_zil, tx);
 	os->os_phys->os_zil_header = os->os_zil_header;
 	zio_nowait(soa->soa_zio);
 
 	mutex_destroy(&soa->soa_mutex);
 	kmem_free(soa, sizeof (*soa));
 }
 
 /*
  * Called from dsl.  Sync objset 'os' for the syncing transaction 'tx':
  * create the root block write as a child of 'pio', sync the special
  * dnodes, and dispatch the dirty dnode sublists to the pool's sync taskq.
  * The root block zio itself is issued later by sync_meta_dnode_task().
  */
 void
 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 {
 	int txgoff;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	int num_sublists;
 	multilist_t *ml;
 	/* Heap copy of the root bp; freed in dmu_objset_write_done(). */
 	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
 	*blkptr_copy = *os->os_rootbp;
 
 	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", (u_longlong_t)tx->tx_txg);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* XXX the write_done callback should really give us the tx... */
 	os->os_synctx = tx;
 
 	if (os->os_dsl_dataset == NULL) {
 		/*
 		 * This is the MOS.  If we have upgraded,
 		 * spa_max_replication() could change, so reset
 		 * os_copies here.
 		 */
 		os->os_copies = spa_max_replication(os->os_spa);
 	}
 
 	/*
 	 * Create the root block IO
 	 */
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	arc_release(os->os_phys_buf, &os->os_phys_buf);
 
 	dmu_write_policy(os, NULL, 0, 0, &zp);
 
 	/*
 	 * If we are either claiming the ZIL or doing a raw receive, write
 	 * out the os_phys_buf raw. Neither of these actions will effect the
 	 * MAC at this point.
 	 */
 	if (os->os_raw_receive ||
 	    os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
 		ASSERT(os->os_encrypted);
 		arc_convert_to_raw(os->os_phys_buf,
 		    os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER,
 		    DMU_OT_OBJSET, NULL, NULL, NULL);
 	}
 
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
 	    blkptr_copy, os->os_phys_buf, B_FALSE, dmu_os_is_l2cacheable(os),
 	    &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done,
 	    os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
 	 * Sync special dnodes - the parent IO for the sync is the root block
 	 */
 	DMU_META_DNODE(os)->dn_zio = zio;
 	dnode_sync(DMU_META_DNODE(os), tx);
 
 	os->os_phys->os_flags = os->os_flags;
 
 	if (DMU_USERUSED_DNODE(os) &&
 	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
 		DMU_USERUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_USERUSED_DNODE(os), tx);
 		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
 	}
 
 	if (DMU_PROJECTUSED_DNODE(os) &&
 	    DMU_PROJECTUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
 		DMU_PROJECTUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_PROJECTUSED_DNODE(os), tx);
 	}
 
 	txgoff = tx->tx_txg & TXG_MASK;
 
 	/*
 	 * We must create the list here because it uses the
 	 * dn_dirty_link[] of this txg.  But it may already
 	 * exist because we call dsl_dataset_sync() twice per txg.
 	 */
 	if (os->os_synced_dnodes.ml_sublists == NULL) {
 		multilist_create(&os->os_synced_dnodes, sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[txgoff]),
 		    dnode_multilist_index_func);
 	} else {
 		ASSERT3U(os->os_synced_dnodes.ml_offset, ==,
 		    offsetof(dnode_t, dn_dirty_link[txgoff]));
 	}
 
 	/*
 	 * zio_nowait(zio) is done after any/all sublist and meta dnode
 	 * zios have been nowaited, and the zil_sync() has been performed.
 	 * The soa is freed at the end of sync_meta_dnode_task.
 	 */
 	sync_objset_arg_t *soa = kmem_alloc(sizeof (*soa), KM_SLEEP);
 	soa->soa_zio = zio;
 	soa->soa_os = os;
 	soa->soa_tx = tx;
 	taskq_init_ent(&soa->soa_tq_ent);
 	mutex_init(&soa->soa_mutex, NULL, MUTEX_DEFAULT, NULL);
 
 	ml = &os->os_dirty_dnodes[txgoff];
 	soa->soa_count = num_sublists = multilist_get_num_sublists(ml);
 
 	/* Only non-empty sublists get a task, so only those are counted. */
 	for (int i = 0; i < num_sublists; i++) {
 		if (multilist_sublist_is_empty_idx(ml, i))
 			soa->soa_count--;
 	}
 
 	if (soa->soa_count == 0) {
 		/* Nothing dirty: go straight to the meta dnode task. */
 		taskq_dispatch_ent(dmu_objset_pool(os)->dp_sync_taskq,
 		    sync_meta_dnode_task, soa, TQ_FRONT, &soa->soa_tq_ent);
 	} else {
 		/*
 		 * Sync sublists in parallel. The last to finish
 		 * (i.e., when soa->soa_count reaches zero) must
 		 *  dispatch sync_meta_dnode_task.
 		 */
 		for (int i = 0; i < num_sublists; i++) {
 			if (multilist_sublist_is_empty_idx(ml, i))
 				continue;
 			sync_dnodes_arg_t *sda =
 			    kmem_alloc(sizeof (*sda), KM_SLEEP);
 			sda->sda_list = ml;
 			sda->sda_sublist_idx = i;
 			sda->sda_soa = soa;
 			(void) taskq_dispatch(
 			    dmu_objset_pool(os)->dp_sync_taskq,
 			    sync_dnodes_task, sda, 0);
 			/* sync_dnodes_task frees sda */
 		}
 	}
 }
 
 /*
  * Report whether 'os' has any dnodes on its dirty list for the txg slot
  * that 'txg' maps to (txg & TXG_MASK).
  */
 boolean_t
 dmu_objset_is_dirty(objset_t *os, uint64_t txg)
 {
 	multilist_t *dirty = &os->os_dirty_dnodes[txg & TXG_MASK];
 
 	return (!multilist_is_empty(dirty));
 }
 
 /*
  * File info callbacks indexed by objset type and filled in by
  * dmu_objset_register_type().  An entry stays NULL until a callback is
  * registered for that type.
  */
 static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES];
 
 /*
  * Register 'cb' as the file info callback for objset type 'ost',
  * replacing any callback stored for that type before.
  */
 void
 dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb)
 {
 	file_info_cb_t **slot = &file_cbs[ost];
 
 	*slot = cb;
 }
 
 /*
  * Hand bonustype, data and zfi to the file-info callback registered for
  * this objset's type.
  *
  * Returns EINVAL when no callback is registered for the type; otherwise
  * returns the callback's own result.
  */
 int
 dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data,
     zfs_file_info_t *zfi)
 {
 	file_info_cb_t *cb = file_cbs[os->os_phys->os_type];
 
 	/* Originate the error via SET_ERROR() like the rest of this file. */
 	if (cb == NULL)
 		return (SET_ERROR(EINVAL));
 	return (cb(bonustype, data, zfi));
 }
 
 /*
  * User/group space accounting is usable only when the pool version supports
  * it, a file-info callback is registered for this objset type, and the
  * userused dnode exists.
  */
 boolean_t
 dmu_objset_userused_enabled(objset_t *os)
 {
 	if (spa_version(os->os_spa) < SPA_VERSION_USERSPACE)
 		return (B_FALSE);
 	if (file_cbs[os->os_phys->os_type] == NULL)
 		return (B_FALSE);
 	return (DMU_USERUSED_DNODE(os) != NULL);
 }
 
 /*
  * Per-user object accounting requires space accounting to be enabled and
  * the USEROBJ_ACCOUNTING pool feature to be enabled.
  */
 boolean_t
 dmu_objset_userobjused_enabled(objset_t *os)
 {
 	if (!dmu_objset_userused_enabled(os))
 		return (B_FALSE);
 	if (!spa_feature_is_enabled(os->os_spa,
 	    SPA_FEATURE_USEROBJ_ACCOUNTING))
 		return (B_FALSE);
 	return (B_TRUE);
 }
 
 /*
  * Project quota is usable when a file-info callback is registered for this
  * objset type, the projectused dnode exists, and the PROJECT_QUOTA pool
  * feature is enabled.
  */
 boolean_t
 dmu_objset_projectquota_enabled(objset_t *os)
 {
 	if (file_cbs[os->os_phys->os_type] == NULL)
 		return (B_FALSE);
 	if (DMU_PROJECTUSED_DNODE(os) == NULL)
 		return (B_FALSE);
 	if (!spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA))
 		return (B_FALSE);
 	return (B_TRUE);
 }
 
 /*
  * One pending accounting delta, keyed by an id string, kept in one of the
  * AVL trees of a userquota_cache_t.
  */
 typedef struct userquota_node {
 	/* must be the first field, see userquota_update_cache() */
 	char		uqn_id[20 + DMU_OBJACCT_PREFIX_LEN];
 	/* sum of all deltas recorded for uqn_id so far */
 	int64_t		uqn_delta;
 	/* AVL linkage (offset passed to avl_create()) */
 	avl_node_t	uqn_node;
 } userquota_node_t;
 
 /*
  * Per-task accumulation of accounting deltas, one tree of userquota_node_t
  * per id class.  userquota_updates_task() only creates uqc_project_deltas
  * when project quota is enabled.
  */
 typedef struct userquota_cache {
 	avl_tree_t uqc_user_deltas;
 	avl_tree_t uqc_group_deltas;
 	avl_tree_t uqc_project_deltas;
 } userquota_cache_t;
 
 /*
  * AVL comparator for userquota_node_t: orders nodes by their id string.
  */
 static int
 userquota_compare(const void *l, const void *r)
 {
 	const userquota_node_t *lhs = l;
 	const userquota_node_t *rhs = r;
 
 	/*
 	 * Only uqn_id may be read here: userquota_update_cache() searches
 	 * with a bare id string rather than a full userquota_node_t.
 	 */
 	const int order = strcmp(lhs->uqn_id, rhs->uqn_id);
 
 	return (TREE_ISIGN(order));
 }
 
 static void
 do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
 {
 	void *cookie;
 	userquota_node_t *uqn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	cookie = NULL;
 	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
 	    &cookie)) != NULL) {
 		/*
 		 * os_userused_lock protects against concurrent calls to
 		 * zap_increment_int().  It's needed because zap_increment_int()
 		 * is not thread-safe (i.e. not atomic).
 		 */
 		mutex_enter(&os->os_userused_lock);
 		VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT,
 		    uqn->uqn_id, uqn->uqn_delta, tx));
 		mutex_exit(&os->os_userused_lock);
 		kmem_free(uqn, sizeof (*uqn));
 	}
 	avl_destroy(&cache->uqc_user_deltas);
 
 	cookie = NULL;
 	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
 	    &cookie)) != NULL) {
 		mutex_enter(&os->os_userused_lock);
 		VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT,
 		    uqn->uqn_id, uqn->uqn_delta, tx));
 		mutex_exit(&os->os_userused_lock);
 		kmem_free(uqn, sizeof (*uqn));
 	}
 	avl_destroy(&cache->uqc_group_deltas);
 
 	if (dmu_objset_projectquota_enabled(os)) {
 		cookie = NULL;
 		while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas,
 		    &cookie)) != NULL) {
 			mutex_enter(&os->os_userused_lock);
 			VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT,
 			    uqn->uqn_id, uqn->uqn_delta, tx));
 			mutex_exit(&os->os_userused_lock);
 			kmem_free(uqn, sizeof (*uqn));
 		}
 		avl_destroy(&cache->uqc_project_deltas);
 	}
 }
 
 /*
  * Add 'delta' to the node keyed by 'id' in the given tree, creating a
  * zeroed node for that id first if none exists yet.
  */
 static void
 userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta)
 {
 	userquota_node_t *uqn;
 	avl_index_t where;
 
 	ASSERT(strlen(id) < sizeof (uqn->uqn_id));
 
 	/*
 	 * The id string itself serves as the search key: uqn_id is the first
 	 * member of userquota_node_t and avl_find() never looks at anything
 	 * beyond it.
 	 */
 	uqn = avl_find(avl, (const void *)id, &where);
 	if (uqn != NULL) {
 		uqn->uqn_delta += delta;
 		return;
 	}
 
 	uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
 	strlcpy(uqn->uqn_id, id, sizeof (uqn->uqn_id));
 	avl_insert(avl, uqn, where);
 	uqn->uqn_delta += delta;
 }
 
 /*
  * Record a space-usage change of DNODE_MIN_SIZE + used bytes (negated when
  * 'subtract' is set) against the user, group and, when project quota is
  * enabled, project ids.  Does nothing unless 'flags' carries
  * DNODE_FLAG_USERUSED_ACCOUNTED.
  */
 static void
 do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used,
     uint64_t flags, uint64_t user, uint64_t group, uint64_t project,
     boolean_t subtract)
 {
 	char key[20];
 	int64_t delta;
 
 	if (!(flags & DNODE_FLAG_USERUSED_ACCOUNTED))
 		return;
 
 	delta = DNODE_MIN_SIZE + used;
 	if (subtract)
 		delta = -delta;
 
 	/* Ids are keyed by their hexadecimal representation. */
 	(void) snprintf(key, sizeof (key), "%llx", (longlong_t)user);
 	userquota_update_cache(&cache->uqc_user_deltas, key, delta);
 
 	(void) snprintf(key, sizeof (key), "%llx", (longlong_t)group);
 	userquota_update_cache(&cache->uqc_group_deltas, key, delta);
 
 	if (!dmu_objset_projectquota_enabled(os))
 		return;
 
 	(void) snprintf(key, sizeof (key), "%llx", (longlong_t)project);
 	userquota_update_cache(&cache->uqc_project_deltas, key, delta);
 }
 
 /*
  * Record an object-count change of +1 (or -1 when 'subtract' is set) against
  * the user, group and, when project quota is enabled, project ids.  Keys
  * carry the DMU_OBJACCT_PREFIX.  Does nothing unless 'flags' carries
  * DNODE_FLAG_USEROBJUSED_ACCOUNTED.
  */
 static void
 do_userobjquota_update(objset_t *os, userquota_cache_t *cache, uint64_t flags,
     uint64_t user, uint64_t group, uint64_t project, boolean_t subtract)
 {
 	char key[20 + DMU_OBJACCT_PREFIX_LEN];
 	int delta;
 
 	if (!(flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED))
 		return;
 
 	if (subtract)
 		delta = -1;
 	else
 		delta = 1;
 
 	(void) snprintf(key, sizeof (key), DMU_OBJACCT_PREFIX "%llx",
 	    (longlong_t)user);
 	userquota_update_cache(&cache->uqc_user_deltas, key, delta);
 
 	(void) snprintf(key, sizeof (key), DMU_OBJACCT_PREFIX "%llx",
 	    (longlong_t)group);
 	userquota_update_cache(&cache->uqc_group_deltas, key, delta);
 
 	if (!dmu_objset_projectquota_enabled(os))
 		return;
 
 	(void) snprintf(key, sizeof (key), DMU_OBJACCT_PREFIX "%llx",
 	    (longlong_t)project);
 	userquota_update_cache(&cache->uqc_project_deltas, key, delta);
 }
 
 /*
  * Argument handed to userquota_updates_task() and dnode_rele_task().
  * Allocated by dmu_objset_sync_done() and freed by the task.
  */
 typedef struct userquota_updates_arg {
 	objset_t *uua_os;	/* objset whose synced dnodes are processed */
 	int uua_sublist_idx;	/* os_synced_dnodes sublist to process */
 	dmu_tx_t *uua_tx;	/* syncing tx used for the ZAP updates */
 } userquota_updates_arg_t;
 
 /*
  * Taskq callback dispatched by dmu_objset_sync_done() when user accounting
  * updates are needed.  Processes one sublist of os_synced_dnodes:
  *
  *  - for every dnode, the old usage is subtracted and the new usage added
  *    in a task-local delta cache;
  *  - the dnode's id bookkeeping is rolled forward (new ids become old ids),
  *    its dirty count is dropped, it is removed from the sublist and the
  *    hold tagged with &os->os_synced_dnodes is released;
  *  - finally the accumulated deltas are flushed to the accounting ZAPs.
  *
  * arg is a userquota_updates_arg_t, which is freed here.
  */
 static void
 userquota_updates_task(void *arg)
 {
 	userquota_updates_arg_t *uua = arg;
 	objset_t *os = uua->uua_os;
 	dmu_tx_t *tx = uua->uua_tx;
 	dnode_t *dn;
 	userquota_cache_t cache = { { 0 } };
 
 	multilist_sublist_t *list = multilist_sublist_lock_idx(
 	    &os->os_synced_dnodes, uua->uua_sublist_idx);
 
 	ASSERT(multilist_sublist_head(list) == NULL ||
 	    dmu_objset_userused_enabled(os));
 	avl_create(&cache.uqc_user_deltas, userquota_compare,
 	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
 	avl_create(&cache.uqc_group_deltas, userquota_compare,
 	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
 	/* Mirrors do_userquota_cacheflush(), which destroys it conditionally */
 	if (dmu_objset_projectquota_enabled(os))
 		avl_create(&cache.uqc_project_deltas, userquota_compare,
 		    sizeof (userquota_node_t), offsetof(userquota_node_t,
 		    uqn_node));
 
 	while ((dn = multilist_sublist_head(list)) != NULL) {
 		int flags;
 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
 		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
 		    dn->dn_phys->dn_flags &
 		    DNODE_FLAG_USERUSED_ACCOUNTED);
 
 		flags = dn->dn_id_flags;
 		ASSERT(flags);
 		/* Back out what was previously charged to the old ids. */
 		if (flags & DN_ID_OLD_EXIST)  {
 			do_userquota_update(os, &cache, dn->dn_oldused,
 			    dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid,
 			    dn->dn_oldprojid, B_TRUE);
 			do_userobjquota_update(os, &cache, dn->dn_oldflags,
 			    dn->dn_olduid, dn->dn_oldgid,
 			    dn->dn_oldprojid, B_TRUE);
 		}
 		/* Charge the current on-disk usage to the new ids. */
 		if (flags & DN_ID_NEW_EXIST) {
 			do_userquota_update(os, &cache,
 			    DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags,
 			    dn->dn_newuid, dn->dn_newgid,
 			    dn->dn_newprojid, B_FALSE);
 			do_userobjquota_update(os, &cache,
 			    dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid,
 			    dn->dn_newprojid, B_FALSE);
 		}
 
 		/*
 		 * Roll the id state forward under dn_mtx: the new ids (if
 		 * any) become the old ids for the next sync of this dnode.
 		 */
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_oldused = 0;
 		dn->dn_oldflags = 0;
 		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
 			dn->dn_olduid = dn->dn_newuid;
 			dn->dn_oldgid = dn->dn_newgid;
 			dn->dn_oldprojid = dn->dn_newprojid;
 			dn->dn_id_flags |= DN_ID_OLD_EXIST;
 			if (dn->dn_bonuslen == 0)
 				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
 			else
 				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 		}
 		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
 		ASSERT3U(dn->dn_dirtycnt, >, 0);
 		dn->dn_dirtycnt--;
 		mutex_exit(&dn->dn_mtx);
 
 		multilist_sublist_remove(list, dn);
 		dnode_rele(dn, &os->os_synced_dnodes);
 	}
 	/* The flush happens while the sublist is still locked. */
 	do_userquota_cacheflush(os, &cache, tx);
 	multilist_sublist_unlock(list);
 	kmem_free(uua, sizeof (*uua));
 }
 
 /*
  * Drop the dnode holds taken by dmu_objset_sync_dnodes().  While a dnode is
  * being synced (i.e. the zio's for its blocks have been issued) it cannot be
  * evicted, because the block containing it cannot be evicted until it has
  * been written out.  The hold is still required so that the dnode_t is not
  * moved (via dnode_move()) while dbuf_dirty_record_t:dr_dnode refers to it;
  * dr_dnode is needed for dirty_lightweight_leaf-type dirty records.
  *
  * When user-object accounting is being done, userquota_updates_task()
  * performs the dnode_rele() instead of this function.
  *
  * arg is a userquota_updates_arg_t, which is freed here.
  */
 static void
 dnode_rele_task(void *arg)
 {
 	userquota_updates_arg_t *uua = arg;
 	objset_t *os = uua->uua_os;
 	multilist_sublist_t *list;
 	dnode_t *dn;
 
 	list = multilist_sublist_lock_idx(&os->os_synced_dnodes,
 	    uua->uua_sublist_idx);
 
 	/* Pop dnodes off the head until the sublist is empty. */
 	for (dn = multilist_sublist_head(list); dn != NULL;
 	    dn = multilist_sublist_head(list)) {
 		mutex_enter(&dn->dn_mtx);
 		ASSERT3U(dn->dn_dirtycnt, >, 0);
 		dn->dn_dirtycnt--;
 		mutex_exit(&dn->dn_mtx);
 
 		multilist_sublist_remove(list, dn);
 		dnode_rele(dn, &os->os_synced_dnodes);
 	}
 
 	multilist_sublist_unlock(list);
 	kmem_free(uua, sizeof (*uua));
 }
 
 /*
  * Decide whether userquota updates are needed for this sync and, if so,
  * make sure the accounting ZAP objects exist.  Returns B_TRUE when updates
  * are needed.
  */
 static boolean_t
 dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx)
 {
 	/*
 	 * No updates when accounting is not enabled.  A raw receive is also
 	 * skipped; its accounting is handled later, once the keys are
 	 * loaded.  Nor is user accounting done during claiming, since the
 	 * datasets are not owned for the duration of claiming and this txg
 	 * should only be used for recovery.
 	 */
 	if (!dmu_objset_userused_enabled(os) ||
 	    (os->os_encrypted && dmu_objset_is_receiving(os)) ||
 	    tx->tx_txg <= os->os_spa->spa_claim_max_txg)
 		return (B_FALSE);
 
 	/* Create the user and group used objects if they are missing. */
 	if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
 		VERIFY0(zap_create_claim(os, DMU_USERUSED_OBJECT,
 		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 		VERIFY0(zap_create_claim(os, DMU_GROUPUSED_OBJECT,
 		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 	}
 
 	/* Likewise for the project used object, when project quota is on. */
 	if (dmu_objset_projectquota_enabled(os)) {
 		if (DMU_PROJECTUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
 			VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT,
 			    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 		}
 	}
 
 	return (B_TRUE);
 }
 
 /*
  * Queue one task per os_synced_dnodes sublist on dp_sync_taskq.  Each task
  * updates the user accounting (when needed) and releases the dnode holds
  * taken by dmu_objset_sync_dnodes().
  * The caller must taskq_wait(dp_sync_taskq).
  */
 void
 dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx)
 {
 	/*
 	 * Without userquota updates to do, dnode_rele_task() suffices to
 	 * call dnode_rele() on each dnode.
 	 */
 	task_func_t *task = dmu_objset_do_userquota_updates_prep(os, tx) ?
 	    userquota_updates_task : dnode_rele_task;
 	int nsublists = multilist_get_num_sublists(&os->os_synced_dnodes);
 
 	for (int idx = 0; idx < nsublists; idx++) {
 		userquota_updates_arg_t *uua =
 		    kmem_alloc(sizeof (*uua), KM_SLEEP);
 
 		uua->uua_os = os;
 		uua->uua_sublist_idx = idx;
 		uua->uua_tx = tx;
 
 		/* The task takes ownership of uua and frees it. */
 		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
 		    task, uua, 0);
 	}
 }
 
 
 /*
  * Locate the buffer from which the uid/gid should be read for the txg of
  * 'tx'.
  *
  * A dbuf with no dirty records yields its current data.  Otherwise the
  * dirty record for exactly this txg is used; if there is none, NULL is
  * returned and the uid/gid are assumed not to be changing.
  */
 static void *
 dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
 	/* Nothing is changing */
 	if (db->db_dirtycnt == 0) {
 		ASSERT(MUTEX_HELD(&db->db_mtx));
 		return (db->db.db_data);
 	}
 
 	dr = dbuf_find_dirty_eq(db, tx->tx_txg);
 	if (dr == NULL)
 		return (NULL);
 
 	/* For a spill block of a bonus-less dnode, use b_data of dr_data. */
 	if (dr->dr_dnode->dn_bonuslen == 0 &&
 	    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
 		return (dr->dt.dl.dr_data->b_data);
 
 	return (dr->dt.dl.dr_data);
 }
 
 /*
  * Capture the user/group/project ids of a dnode by running the objset
  * type's file-info callback over the dnode's bonus or spill data.
  *
  * With 'before' set the ids are stored in dn_olduid/dn_oldgid/dn_oldprojid
  * and DN_ID_OLD_EXIST is set on success; otherwise they are stored in
  * dn_newuid/dn_newgid/dn_newprojid and DN_ID_NEW_EXIST is set on success.
  * Nothing is done when user accounting is not enabled for the objset.
  */
 void
 dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	void *data = NULL;
 	dmu_buf_impl_t *db = NULL;
 	int flags = dn->dn_id_flags;
 	int error;
 	boolean_t have_spill = B_FALSE;
 
 	if (!dmu_objset_userused_enabled(dn->dn_objset))
 		return;
 
 	/*
 	 * Raw receives introduce a problem with user accounting. Raw
 	 * receives cannot update the user accounting info because the
 	 * user ids and the sizes are encrypted. To guarantee that we
 	 * never end up with bad user accounting, we simply disable it
 	 * during raw receives. We also disable this for normal receives
 	 * so that an incremental raw receive may be done on top of an
 	 * existing non-raw receive.
 	 */
 	if (os->os_encrypted && dmu_objset_is_receiving(os))
 		return;
 
 	/* The "before" ids only need to be captured once. */
 	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
 	    DN_ID_CHKED_SPILL)))
 		return;
 
 	/*
 	 * Pick the data source: the on-disk bonus area, the in-core bonus
 	 * dbuf, or (for SA dnodes without a bonus) the spill block.  When a
 	 * dbuf is used, its db_mtx is held until after the callback ran.
 	 */
 	if (before && dn->dn_bonuslen != 0)
 		data = DN_BONUS(dn->dn_phys);
 	else if (!before && dn->dn_bonuslen != 0) {
 		if (dn->dn_bonus) {
 			db = dn->dn_bonus;
 			mutex_enter(&db->db_mtx);
 			data = dmu_objset_userquota_find_data(db, tx);
 		} else {
 			data = DN_BONUS(dn->dn_phys);
 		}
 	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
 			dmu_flags_t rf = DB_RF_MUST_SUCCEED;
 
 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
 				rf |= DB_RF_HAVESTRUCT;
 			error = dmu_spill_hold_by_dnode(dn, rf,
 			    FTAG, (dmu_buf_t **)&db);
 			ASSERT0(error);
 			mutex_enter(&db->db_mtx);
 			data = (before) ? db->db.db_data :
 			    dmu_objset_userquota_find_data(db, tx);
 			have_spill = B_TRUE;
 	} else {
 		/* No bonus and not SA: nothing to inspect, just mark it. */
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 		mutex_exit(&dn->dn_mtx);
 		return;
 	}
 
 	/*
 	 * Must always call the callback in case the object
 	 * type has changed and that type isn't an object type to track
 	 */
 	zfs_file_info_t zfi;
 	error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);
 
 	if (before) {
 		ASSERT(data);
 		dn->dn_olduid = zfi.zfi_user;
 		dn->dn_oldgid = zfi.zfi_group;
 		dn->dn_oldprojid = zfi.zfi_project;
 	} else if (data) {
 		dn->dn_newuid = zfi.zfi_user;
 		dn->dn_newgid = zfi.zfi_group;
 		dn->dn_newprojid = zfi.zfi_project;
 	}
 
 	/*
 	 * Preserve existing uid/gid when the callback can't determine
 	 * what the new uid/gid are and the callback returned EEXIST.
 	 * The EEXIST error tells us to just use the existing uid/gid.
 	 * If we don't know what the old values are then just assign
 	 * them to 0, since that is a new file  being created.
 	 */
 	if (!before && data == NULL && error == EEXIST) {
 		if (flags & DN_ID_OLD_EXIST) {
 			dn->dn_newuid = dn->dn_olduid;
 			dn->dn_newgid = dn->dn_oldgid;
 			dn->dn_newprojid = dn->dn_oldprojid;
 		} else {
 			dn->dn_newuid = 0;
 			dn->dn_newgid = 0;
 			dn->dn_newprojid = ZFS_DEFAULT_PROJID;
 		}
 		error = 0;
 	}
 
 	/* db is non-NULL exactly when db_mtx was taken above. */
 	if (db)
 		mutex_exit(&db->db_mtx);
 
 	mutex_enter(&dn->dn_mtx);
 	if (error == 0 && before)
 		dn->dn_id_flags |= DN_ID_OLD_EXIST;
 	if (error == 0 && !before)
 		dn->dn_id_flags |= DN_ID_NEW_EXIST;
 
 	/* Record which source was examined, even if the callback failed. */
 	if (have_spill) {
 		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
 	} else {
 		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 	}
 	mutex_exit(&dn->dn_mtx);
 	/* Drop the spill hold taken with FTAG above. */
 	if (have_spill)
 		dmu_buf_rele((dmu_buf_t *)db, FTAG);
 }
 
 /*
  * Return B_TRUE when the on-disk objset flags mark user/group space
  * accounting as complete, B_FALSE otherwise.
  *
  * The masked flags word is compared against zero so that the result is
  * always B_TRUE or B_FALSE rather than the raw flag bit.
  */
 boolean_t
 dmu_objset_userspace_present(objset_t *os)
 {
 	return ((os->os_phys->os_flags &
 	    OBJSET_FLAG_USERACCOUNTING_COMPLETE) != 0);
 }
 
 /*
  * Return B_TRUE when the on-disk objset flags mark per-user object
  * accounting as complete, B_FALSE otherwise.
  *
  * The masked flags word is compared against zero so that the result is
  * always B_TRUE or B_FALSE rather than the raw flag bit.
  */
 boolean_t
 dmu_objset_userobjspace_present(objset_t *os)
 {
 	return ((os->os_phys->os_flags &
 	    OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE) != 0);
 }
 
 /*
  * Return B_TRUE when the on-disk objset flags mark project quota
  * accounting as complete, B_FALSE otherwise.
  *
  * The masked flags word is compared against zero so that the result is
  * always B_TRUE or B_FALSE rather than the raw flag bit.
  */
 boolean_t
 dmu_objset_projectquota_present(objset_t *os)
 {
 	return ((os->os_phys->os_flags &
 	    OBJSET_FLAG_PROJECTQUOTA_COMPLETE) != 0);
 }
 
 /*
  * Walk the objects of the objset and dirty each one's bonus buffer in its
  * own transaction.
  *
  * Returns EINTR when os_upgrade_exit is set or issig() reports a pending
  * signal, and 0 once the walk ends.
  */
 static int
 dmu_objset_space_upgrade(objset_t *os)
 {
 	uint64_t obj;
 	int err = 0;
 
 	/*
 	 * We simply need to mark every object dirty, so that it will be
 	 * synced out and now accounted.  If this is called
 	 * concurrently, or if we already did some work before crashing,
 	 * that's fine, since we track each object's accounted state
 	 * independently.
 	 */
 
 	/*
 	 * The body first runs for obj 0; dmu_object_next() then advances
 	 * obj, and any non-zero result from it ends the loop.
 	 */
 	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
 		dmu_tx_t *tx;
 		dmu_buf_t *db;
 		int objerr;
 
 		/* Check for a requested abort before touching each object. */
 		mutex_enter(&os->os_upgrade_lock);
 		if (os->os_upgrade_exit)
 			err = SET_ERROR(EINTR);
 		mutex_exit(&os->os_upgrade_lock);
 		if (err != 0)
 			return (err);
 
 		if (issig())
 			return (SET_ERROR(EINTR));
 
 		/* Objects whose bonus can't be held are skipped. */
 		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
 		if (objerr != 0)
 			continue;
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, obj);
 		objerr = dmu_tx_assign(tx, DMU_TX_WAIT);
 		if (objerr != 0) {
 			/* Failed assignment: skip this object as well. */
 			dmu_buf_rele(db, FTAG);
 			dmu_tx_abort(tx);
 			continue;
 		}
 		dmu_buf_will_dirty(db, tx);
 		dmu_buf_rele(db, FTAG);
 		dmu_tx_commit(tx);
 	}
 	/*
 	 * NOTE(review): whatever non-zero value dmu_object_next() returned
 	 * to end the loop is not propagated; presumably that is the normal
 	 * end-of-objects indication -- confirm.
 	 */
 	return (0);
 }
 
 /*
  * Upgrade callback for user/group space accounting.  Dirties every object
  * so it gets accounted, marks accounting complete in os_flags, and waits
  * for the change to sync.
  *
  * Returns 0 if accounting is already present or the upgrade succeeded,
  * EINVAL for snapshots, ENOTSUP when accounting is not enabled, or the
  * error from dmu_objset_space_upgrade().
  */
 static int
 dmu_objset_userspace_upgrade_cb(objset_t *os)
 {
 	int err;
 
 	/* Already accounted: nothing to do. */
 	if (dmu_objset_userspace_present(os))
 		return (0);
 
 	if (dmu_objset_is_snapshot(os))
 		return (SET_ERROR(EINVAL));
 
 	if (!dmu_objset_userused_enabled(os))
 		return (SET_ERROR(ENOTSUP));
 
 	err = dmu_objset_space_upgrade(os);
 	if (err != 0)
 		return (err);
 
 	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 	txg_wait_synced(dmu_objset_pool(os), 0);
 
 	return (0);
 }
 
 /*
  * Start the user/group space accounting upgrade for os by handing
  * dmu_objset_userspace_upgrade_cb to dmu_objset_upgrade() (defined outside
  * this part of the file).
  */
 void
 dmu_objset_userspace_upgrade(objset_t *os)
 {
 	dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb);
 }
 
 /*
  * Upgrade callback for per-user object accounting and project quota.
  * Requests activation of whichever of the two features is enabled,
  * dirties every object so it gets accounted, records completion in
  * os_flags, and waits for the change to sync.
  *
  * Returns 0 if both kinds of accounting are already present or the upgrade
  * succeeded, EINVAL for snapshots, ENOTSUP when nothing can be upgraded, or
  * the error from dmu_objset_space_upgrade().
  */
 static int
 dmu_objset_id_quota_upgrade_cb(objset_t *os)
 {
 	int err;
 
 	/* Both already accounted: nothing to do. */
 	if (dmu_objset_userobjspace_present(os) &&
 	    dmu_objset_projectquota_present(os))
 		return (0);
 
 	if (dmu_objset_is_snapshot(os))
 		return (SET_ERROR(EINVAL));
 
 	if (!dmu_objset_userused_enabled(os))
 		return (SET_ERROR(ENOTSUP));
 
 	/* Only project quota is missing, and it is not enabled. */
 	if (!dmu_objset_projectquota_enabled(os) &&
 	    dmu_objset_userobjspace_present(os))
 		return (SET_ERROR(ENOTSUP));
 
 	if (dmu_objset_userobjused_enabled(os)) {
 		dmu_objset_ds(os)->ds_feature_activation[
 		    SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
 	}
 	if (dmu_objset_projectquota_enabled(os)) {
 		dmu_objset_ds(os)->ds_feature_activation[
 		    SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;
 	}
 
 	err = dmu_objset_space_upgrade(os);
 	if (err != 0)
 		return (err);
 
 	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 	if (dmu_objset_userobjused_enabled(os)) {
 		os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
 	}
 	if (dmu_objset_projectquota_enabled(os)) {
 		os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE;
 	}
 
 	txg_wait_synced(dmu_objset_pool(os), 0);
 
 	return (0);
 }
 
 /*
  * Start the object-accounting / project-quota upgrade for os by handing
  * dmu_objset_id_quota_upgrade_cb to dmu_objset_upgrade() (defined outside
  * this part of the file).
  */
 void
 dmu_objset_id_quota_upgrade(objset_t *os)
 {
 	dmu_objset_upgrade(os, dmu_objset_id_quota_upgrade_cb);
 }
 
 /*
  * Per-user object accounting can be upgraded on a writeable, non-snapshot
  * ZFS objset where the feature is enabled but not yet marked complete.
  */
 boolean_t
 dmu_objset_userobjspace_upgradable(objset_t *os)
 {
 	if (dmu_objset_type(os) != DMU_OST_ZFS)
 		return (B_FALSE);
 	if (dmu_objset_is_snapshot(os))
 		return (B_FALSE);
 	if (!dmu_objset_userobjused_enabled(os))
 		return (B_FALSE);
 	if (dmu_objset_userobjspace_present(os))
 		return (B_FALSE);
 	if (!spa_writeable(dmu_objset_spa(os)))
 		return (B_FALSE);
 	return (B_TRUE);
 }
 
 /*
  * Project quota accounting can be upgraded on a writeable, non-snapshot
  * ZFS objset where the feature is enabled but not yet marked complete.
  */
 boolean_t
 dmu_objset_projectquota_upgradable(objset_t *os)
 {
 	if (dmu_objset_type(os) != DMU_OST_ZFS)
 		return (B_FALSE);
 	if (dmu_objset_is_snapshot(os))
 		return (B_FALSE);
 	if (!dmu_objset_projectquota_enabled(os))
 		return (B_FALSE);
 	if (dmu_objset_projectquota_present(os))
 		return (B_FALSE);
 	if (!spa_writeable(dmu_objset_spa(os)))
 		return (B_FALSE);
 	return (B_TRUE);
 }
 
 /*
  * Fill the four out-parameters with the space statistics of the objset's
  * dataset; simply forwards to dsl_dataset_space() with os_dsl_dataset.
  */
 void
 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
 	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
 	    usedobjsp, availobjsp);
 }
 
 /*
  * Return the fsid guid of the objset's dataset, as reported by
  * dsl_dataset_fsid_guid().
  */
 uint64_t
 dmu_objset_fsid_guid(objset_t *os)
 {
 	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
 }
 
 /*
  * Fill in the objset type and, when the objset is backed by a dataset, the
  * dataset's fast stats.
  */
 void
 dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	stat->dds_type = os->os_phys->os_type;
 	if (ds != NULL)
 		dsl_dataset_fast_stat(ds, stat);
 }
 
 /*
  * Add the objset's statistics to nv: the dataset stats (when the objset has
  * a dataset), the objset type, and whether user accounting is present.
  */
 void
 dmu_objset_stats(objset_t *os, nvlist_t *nv)
 {
 	dsl_dataset_t *ds;
 
 	/* Only the meta objset may lack a dataset. */
 	ASSERT(os->os_dsl_dataset ||
 	    os->os_phys->os_type == DMU_OST_META);
 
 	ds = os->os_dsl_dataset;
 	if (ds != NULL)
 		dsl_dataset_stats(ds, nv);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, os->os_phys->os_type);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
 	    dmu_objset_userspace_present(os));
 }
 
 /*
  * Return the dataset's ds_is_snapshot value, or B_FALSE for an objset that
  * has no dataset.
  */
 int
 dmu_objset_is_snapshot(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	if (ds == NULL)
 		return (B_FALSE);
 	return (ds->ds_is_snapshot);
 }
 
 /*
  * Look 'name' up in the dataset's snapshot-names ZAP using a normalized
  * match (MT_NORMALIZE), copying the stored name into 'real' (at most
  * maxlen bytes) and reporting normalization conflicts through 'conflict'.
  *
  * Returns ENOENT when the dataset has no snapshot-names ZAP; otherwise the
  * result of zap_lookup_norm().
  */
 int
 dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen,
     boolean_t *conflict)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	uint64_t snapnames_obj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 	uint64_t unused_value;
 
 	if (snapnames_obj == 0)
 		return (SET_ERROR(ENOENT));
 
 	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
 	    snapnames_obj, name, 8, 1, &unused_value, MT_NORMALIZE, real,
 	    maxlen, conflict));
 }
 
 /*
  * Fetch the snapshot entry at serialized cursor position *offp from the
  * dataset's snapshot-names ZAP.
  *
  * On success the entry's name is copied into 'name', its value is stored in
  * *idp and its normalization-conflict flag in *case_conflict (each only if
  * the pointer is non-NULL), and *offp is advanced to the next entry.
  *
  * Returns ENOENT when there is no snapshot-names ZAP or no entry at the
  * cursor, ENAMETOOLONG when the name does not fit in namelen bytes, and 0
  * otherwise.  The pool config lock must be held.
  */
 int
 dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	zap_cursor_t cursor;
 	zap_attribute_t *attr;
 	int error = 0;
 
 	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
 
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
 		return (SET_ERROR(ENOENT));
 
 	attr = zap_attribute_alloc();
 	zap_cursor_init_serialized(&cursor,
 	    ds->ds_dir->dd_pool->dp_meta_objset,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&cursor, attr) != 0) {
 		error = SET_ERROR(ENOENT);
 		goto out;
 	}
 
 	/* The name plus its terminating NUL must fit in the buffer. */
 	if (strlen(attr->za_name) + 1 > namelen) {
 		error = SET_ERROR(ENAMETOOLONG);
 		goto out;
 	}
 
 	(void) strlcpy(name, attr->za_name, namelen);
 	if (idp != NULL)
 		*idp = attr->za_first_integer;
 	if (case_conflict != NULL)
 		*case_conflict = attr->za_normalization_conflict;
 
 	zap_cursor_advance(&cursor);
 	*offp = zap_cursor_serialize(&cursor);
 
 out:
 	zap_cursor_fini(&cursor);
 	zap_attribute_free(attr);
 
 	return (error);
 }
 
 /*
  * Look up snapshot 'name' of the objset's dataset; forwards to
  * dsl_dataset_snap_lookup(), which stores its result in *value.
  */
 int
 dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value)
 {
 	return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value));
 }
 
 /*
  * Fetch the child-directory entry at serialized cursor position *offp from
  * the child-dir ZAP of the objset's dsl_dir.
  *
  * On success the entry's name is copied into 'name', its value is stored in
  * *idp (if non-NULL), and *offp is advanced to the next entry.
  *
  * Returns ENOENT when the objset is not the head dataset of its dir or no
  * entry exists at the cursor, ENAMETOOLONG when the name does not fit in
  * namelen bytes, and 0 otherwise.
  */
 int
 dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp)
 {
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 	zap_cursor_t cursor;
 	zap_attribute_t *attr;
 	int error = 0;
 
 	/* there is no next dir on a snapshot! */
 	if (os->os_dsl_dataset->ds_object !=
 	    dsl_dir_phys(dd)->dd_head_dataset_obj)
 		return (SET_ERROR(ENOENT));
 
 	attr = zap_attribute_alloc();
 	zap_cursor_init_serialized(&cursor,
 	    dd->dd_pool->dp_meta_objset,
 	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&cursor, attr) != 0) {
 		error = SET_ERROR(ENOENT);
 		goto out;
 	}
 
 	/* The name plus its terminating NUL must fit in the buffer. */
 	if (strlen(attr->za_name) + 1 > namelen) {
 		error = SET_ERROR(ENAMETOOLONG);
 		goto out;
 	}
 
 	(void) strlcpy(name, attr->za_name, namelen);
 	if (idp != NULL)
 		*idp = attr->za_first_integer;
 
 	zap_cursor_advance(&cursor);
 	*offp = zap_cursor_serialize(&cursor);
 
 out:
 	zap_cursor_fini(&cursor);
 	zap_attribute_free(attr);
 
 	return (error);
 }
 
 /*
  * Per-directory work item for dmu_objset_find_dp().  One context is
  * allocated for every dsl_dir visited and freed by
  * dmu_objset_find_dp_impl() when that directory has been handled.
  */
 typedef struct dmu_objset_find_ctx {
 	/* taskq for child dirs; NULL means recurse synchronously */
 	taskq_t		*dc_tq;
 	dsl_pool_t	*dc_dp;
 	/* object number of the dsl_dir to visit */
 	uint64_t	dc_ddobj;
 	char		*dc_ddname; /* last component of ddobj's name */
 	/* callback applied to each dataset found, with dc_arg */
 	int		(*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
 	void		*dc_arg;
 	/* DS_FIND_* flags */
 	int		dc_flags;
 	/* shared by all contexts of one search; protects *dc_error */
 	kmutex_t	*dc_error_lock;
 	/* shared by all contexts of one search; first error seen */
 	int		*dc_error;
 } dmu_objset_find_ctx_t;
 
 /*
  * Visit the dsl_dir described by dcp: hand every child dir to a new context
  * (dispatched to dc_tq, or handled recursively when dc_tq is NULL), apply
  * dc_func to every snapshot when DS_FIND_SNAPSHOTS is set, and finally
  * apply dc_func to the dir's head dataset.
  *
  * Nothing is done if an error has already been recorded for the search.
  * An error hit here is stored in *dc_error unless one is already present.
  * dcp and dc_ddname are always freed before returning.
  */
 static void
 dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
 {
 	dsl_pool_t *dp = dcp->dc_dp;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	uint64_t thisobj;
 	int err = 0;
 
 	/* don't process if there already was an error */
 	if (*dcp->dc_error != 0)
 		goto out;
 
 	/*
 	 * Note: passing the name (dc_ddname) here is optional, but it
 	 * improves performance because we don't need to call
 	 * zap_value_search() to determine the name.
 	 */
 	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
 	if (err != 0)
 		goto out;
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_rele(dd, FTAG);
 		goto out;
 	}
 
 	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	attr = zap_attribute_alloc();
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (dcp->dc_flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT3U(attr->za_integer_length, ==,
 			    sizeof (uint64_t));
 			ASSERT3U(attr->za_num_integers, ==, 1);
 
 			/*
 			 * Each child gets its own copy of the context with
 			 * its own name string; the child frees both.
 			 */
 			dmu_objset_find_ctx_t *child_dcp =
 			    kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
 			*child_dcp = *dcp;
 			child_dcp->dc_ddobj = attr->za_first_integer;
 			child_dcp->dc_ddname = spa_strdup(attr->za_name);
 			if (dcp->dc_tq != NULL)
 				(void) taskq_dispatch(dcp->dc_tq,
 				    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
 			else
 				dmu_objset_find_dp_impl(child_dcp);
 		}
 		zap_cursor_fini(&zc);
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
 	if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 		if (err == 0) {
 			uint64_t snapobj;
 
 			/* Only the snapshot ZAP object number is needed. */
 			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				ASSERT3U(attr->za_integer_length, ==,
 				    sizeof (uint64_t));
 				ASSERT3U(attr->za_num_integers, ==, 1);
 
 				err = dsl_dataset_hold_obj(dp,
 				    attr->za_first_integer, FTAG, &ds);
 				if (err != 0)
 					break;
 				err = dcp->dc_func(dp, ds, dcp->dc_arg);
 				dsl_dataset_rele(ds, FTAG);
 				if (err != 0)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
 	zap_attribute_free(attr);
 
 	if (err != 0) {
 		dsl_dir_rele(dd, FTAG);
 		goto out;
 	}
 
 	/*
 	 * Apply to self.
 	 */
 	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 	/*
 	 * Note: we hold the dir while calling dsl_dataset_hold_obj() so
 	 * that the dir will remain cached, and we won't have to re-instantiate
 	 * it (which could be expensive due to finding its name via
 	 * zap_value_search()).
 	 */
 	dsl_dir_rele(dd, FTAG);
 	if (err != 0)
 		goto out;
 	err = dcp->dc_func(dp, ds, dcp->dc_arg);
 	dsl_dataset_rele(ds, FTAG);
 
 out:
 	if (err != 0) {
 		mutex_enter(dcp->dc_error_lock);
 		/* only keep first error */
 		if (*dcp->dc_error == 0)
 			*dcp->dc_error = err;
 		mutex_exit(dcp->dc_error_lock);
 	}
 
 	if (dcp->dc_ddname != NULL)
 		spa_strfree(dcp->dc_ddname);
 	kmem_free(dcp, sizeof (*dcp));
 }
 
 /*
  * Taskq entry point for one directory of a parallel dmu_objset_find_dp()
  * search: runs dmu_objset_find_dp_impl() on arg (a dmu_objset_find_ctx_t)
  * with the pool config lock held.
  */
 static void
 dmu_objset_find_dp_cb(void *arg)
 {
 	dmu_objset_find_ctx_t *dcp = arg;
 	/* Saved up front: dmu_objset_find_dp_impl() frees dcp. */
 	dsl_pool_t *dp = dcp->dc_dp;
 
 	/*
 	 * We need to get a pool_config_lock here, as there are several
 	 * assert(pool_config_held) down the stack. Getting a lock via
 	 * dsl_pool_config_enter is risky, as it might be stalled by a
 	 * pending writer. This would deadlock, as the write lock can
 	 * only be granted when our parent thread gives up the lock.
 	 * The _prio interface gives us priority over a pending writer.
 	 */
 	dsl_pool_config_enter_prio(dp, FTAG);
 
 	dmu_objset_find_dp_impl(dcp);
 
 	dsl_pool_config_exit(dp, FTAG);
 }
 
 /*
  * Find objsets under and including ddobj, call func(ds) on each.
  * The order for the enumeration is completely undefined.
  * func is called with dsl_pool_config held.
  *
  * The search runs synchronously in the calling thread when
  * DS_FIND_SERIALIZE is given or the config lock is held as writer;
  * otherwise it is spread over a temporary taskq.
  *
  * Returns the first error recorded during the search, ENOMEM if the taskq
  * could not be created, or 0.
  */
 int
 dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
     int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
 {
 	int error = 0;
 	taskq_t *tq = NULL;
 	int ntasks;
 	dmu_objset_find_ctx_t *dcp;
 	kmutex_t err_lock;
 
 	/* Root context; child contexts are copies that share err_lock/error */
 	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
 	dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
 	dcp->dc_tq = NULL;
 	dcp->dc_dp = dp;
 	dcp->dc_ddobj = ddobj;
 	dcp->dc_ddname = NULL;
 	dcp->dc_func = func;
 	dcp->dc_arg = arg;
 	dcp->dc_flags = flags;
 	dcp->dc_error_lock = &err_lock;
 	dcp->dc_error = &error;
 
 	if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
 		/*
 		 * In case a write lock is held we can't make use of
 		 * parallelism, as down the stack of the worker threads
 		 * the lock is asserted via dsl_pool_config_held.
 		 * In case of a read lock this is solved by getting a read
 		 * lock in each worker thread, which isn't possible in case
 		 * of a writer lock. So we fall back to the synchronous path
 		 * here.
 		 * In the future it might be possible to get some magic into
 		 * dsl_pool_config_held in a way that it returns true for
 		 * the worker threads so that a single lock held from this
 		 * thread suffices. For now, stay single threaded.
 		 */
 		/* dcp is freed by dmu_objset_find_dp_impl() */
 		dmu_objset_find_dp_impl(dcp);
 		mutex_destroy(&err_lock);
 
 		return (error);
 	}
 
 	/* dmu_find_threads == 0 selects four tasks per leaf vdev. */
 	ntasks = dmu_find_threads;
 	if (ntasks == 0)
 		ntasks = vdev_count_leaves(dp->dp_spa) * 4;
 	tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks,
 	    INT_MAX, 0);
 	if (tq == NULL) {
 		/* No task will run, so the root context is freed here. */
 		kmem_free(dcp, sizeof (*dcp));
 		mutex_destroy(&err_lock);
 
 		return (SET_ERROR(ENOMEM));
 	}
 	dcp->dc_tq = tq;
 
 	/* dcp will be freed by task */
 	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
 
 	/*
 	 * PORTING: this code relies on the property of taskq_wait to wait
 	 * until no more tasks are queued and no more tasks are active. As
 	 * we always queue new tasks from within other tasks, taskq_wait
 	 * reliably waits for the full recursion to finish, even though we
 	 * enqueue new tasks after taskq_wait has been called.
 	 * On platforms other than illumos, taskq_wait may not have this
 	 * property.
 	 */
 	taskq_wait(tq);
 	taskq_destroy(tq);
 	mutex_destroy(&err_lock);
 
 	return (error);
 }
 
 /*
  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
  * The dp_config_rwlock must not be held when this is called, and it
  * will not be held when the callback is called.
  * Therefore this function should only be used when the pool is not changing
  * (e.g. in syncing context), or the callback can deal with the possible races.
  *
  * Child dirs are visited first (DS_FIND_CHILDREN), then snapshots
  * (DS_FIND_SNAPSHOTS), then 'name' itself.  The walk stops at the first
  * non-zero result, which is returned; hidden ('$'-prefixed) dirs are
  * skipped and yield 0.
  */
 static int
 dmu_objset_find_impl(spa_t *spa, const char *name,
     int func(const char *, void *), void *arg, int flags)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	char *child;
 	uint64_t thisobj;
 	int err;
 
 	dsl_pool_config_enter(dp, FTAG);
 
 	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
 	if (err != 0) {
 		dsl_pool_config_exit(dp, FTAG);
 		return (err);
 	}
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_rele(dd, FTAG);
 		dsl_pool_config_exit(dp, FTAG);
 		return (0);
 	}
 
 	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	attr = zap_attribute_alloc();
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT3U(attr->za_integer_length, ==,
 			    sizeof (uint64_t));
 			ASSERT3U(attr->za_num_integers, ==, 1);
 
 			/*
 			 * The config lock is dropped around the recursive
 			 * call, which takes it again itself.
 			 */
 			child = kmem_asprintf("%s/%s", name, attr->za_name);
 			dsl_pool_config_exit(dp, FTAG);
 			err = dmu_objset_find_impl(spa, child,
 			    func, arg, flags);
 			dsl_pool_config_enter(dp, FTAG);
 			kmem_strfree(child);
 			if (err != 0)
 				break;
 		}
 		zap_cursor_fini(&zc);
 
 		if (err != 0) {
 			dsl_dir_rele(dd, FTAG);
 			dsl_pool_config_exit(dp, FTAG);
 			zap_attribute_free(attr);
 			return (err);
 		}
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
 	if (flags & DS_FIND_SNAPSHOTS) {
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 		if (err == 0) {
 			uint64_t snapobj;
 
 			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				ASSERT3U(attr->za_integer_length, ==,
 				    sizeof (uint64_t));
 				ASSERT3U(attr->za_num_integers, ==, 1);
 
 				/* func runs without the config lock held. */
 				child = kmem_asprintf("%s@%s",
 				    name, attr->za_name);
 				dsl_pool_config_exit(dp, FTAG);
 				err = func(child, arg);
 				dsl_pool_config_enter(dp, FTAG);
 				kmem_strfree(child);
 				if (err != 0)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
 	dsl_dir_rele(dd, FTAG);
 	zap_attribute_free(attr);
 	dsl_pool_config_exit(dp, FTAG);
 
 	if (err != 0)
 		return (err);
 
 	/* Apply to self. */
 	return (func(name, arg));
 }
 
 /*
  * See comment above dmu_objset_find_impl().
  */
 int
 dmu_objset_find(const char *name, int func(const char *, void *), void *arg,
     int flags)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	error = dmu_objset_find_impl(spa, name, func, arg, flags);
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 /*
  * Report the result of dsl_dir_incompatible_encryption_version() for the
  * dsl_dir of the dataset backing 'os'.
  */
 boolean_t
 dmu_objset_incompatible_encryption_version(objset_t *os)
 {
 	boolean_t incompatible;
 
 	incompatible = dsl_dir_incompatible_encryption_version(
 	    os->os_dsl_dataset->ds_dir);
 	return (incompatible);
 }
 
 /*
  * Store 'user_ptr' as the objset's user pointer.
  * The caller must hold os_user_ptr_lock (asserted).
  */
 void
 dmu_objset_set_user(objset_t *os, void *user_ptr)
 {
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	os->os_user_ptr = user_ptr;
 }
 
 /*
  * Return the objset's user pointer.
  * The caller must hold os_user_ptr_lock (asserted).
  */
 void *
 dmu_objset_get_user(objset_t *os)
 {
 	void *user_ptr;
 
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	user_ptr = os->os_user_ptr;
 	return (user_ptr);
 }
 
 /*
  * Determine name of filesystem, given name of snapshot.
  * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
  */
 int
 dmu_fsname(const char *snapname, char *buf)
 {
 	char *atp = strchr(snapname, '@');
 	if (atp == NULL)
 		return (SET_ERROR(EINVAL));
 	if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
 		return (SET_ERROR(ENAMETOOLONG));
 	(void) strlcpy(buf, snapname, atp - snapname + 1);
 	return (0);
 }
 
 /*
  * Record that open context expects to write or free 'space' bytes.
  * This tracks the amount of dirty data in the open txg, which is also
  * the amount of memory that can not be evicted until this txg syncs.
  *
  * Syncing context may also call this, in two situations:
  *
  * [1] The dataset was just created: dirty-data accounting is updated
  *     as usual.
  * [2] MOS data is being dirtied: only the pool's accounting of dirty
  *     data is updated.
  */
 void
 dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	int64_t worst_case = spa_get_worst_case_asize(os->os_spa, space);
 
 	/* An objset without a dataset skips the dsl_dir accounting. */
 	if (ds != NULL)
 		dsl_dir_willuse_space(ds->ds_dir, worst_case, tx);
 
 	/* The pool is charged the raw size, not the worst-case asize. */
 	dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
 }
 
 /*
  * Symbols exported with EXPORT_SYMBOL() in kernel builds only.  The first
  * group is mostly the dmu_objset_* accessors and hold/own/create entry
  * points; the second group is sync, eviction, registration and the
  * user/project quota accounting helpers.
  */
 #if defined(_KERNEL)
 EXPORT_SYMBOL(dmu_objset_zil);
 EXPORT_SYMBOL(dmu_objset_pool);
 EXPORT_SYMBOL(dmu_objset_ds);
 EXPORT_SYMBOL(dmu_objset_type);
 EXPORT_SYMBOL(dmu_objset_name);
 EXPORT_SYMBOL(dmu_objset_hold);
 EXPORT_SYMBOL(dmu_objset_hold_flags);
 EXPORT_SYMBOL(dmu_objset_own);
 EXPORT_SYMBOL(dmu_objset_rele);
 EXPORT_SYMBOL(dmu_objset_rele_flags);
 EXPORT_SYMBOL(dmu_objset_disown);
 EXPORT_SYMBOL(dmu_objset_from_ds);
 EXPORT_SYMBOL(dmu_objset_create);
 EXPORT_SYMBOL(dmu_objset_stats);
 EXPORT_SYMBOL(dmu_objset_fast_stat);
 EXPORT_SYMBOL(dmu_objset_spa);
 EXPORT_SYMBOL(dmu_objset_space);
 EXPORT_SYMBOL(dmu_objset_fsid_guid);
 EXPORT_SYMBOL(dmu_objset_find);
 EXPORT_SYMBOL(dmu_objset_byteswap);
 EXPORT_SYMBOL(dmu_objset_evict_dbufs);
 EXPORT_SYMBOL(dmu_objset_snap_cmtime);
 EXPORT_SYMBOL(dmu_objset_dnodesize);
 
 EXPORT_SYMBOL(dmu_objset_sync);
 EXPORT_SYMBOL(dmu_objset_is_dirty);
 EXPORT_SYMBOL(dmu_objset_create_impl_dnstats);
 EXPORT_SYMBOL(dmu_objset_create_impl);
 EXPORT_SYMBOL(dmu_objset_open_impl);
 EXPORT_SYMBOL(dmu_objset_evict);
 EXPORT_SYMBOL(dmu_objset_register_type);
 EXPORT_SYMBOL(dmu_objset_sync_done);
 EXPORT_SYMBOL(dmu_objset_userquota_get_ids);
 EXPORT_SYMBOL(dmu_objset_userused_enabled);
 EXPORT_SYMBOL(dmu_objset_userspace_upgrade);
 EXPORT_SYMBOL(dmu_objset_userspace_present);
 EXPORT_SYMBOL(dmu_objset_userobjused_enabled);
 EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable);
 EXPORT_SYMBOL(dmu_objset_userobjspace_present);
 EXPORT_SYMBOL(dmu_objset_projectquota_enabled);
 EXPORT_SYMBOL(dmu_objset_projectquota_present);
 EXPORT_SYMBOL(dmu_objset_projectquota_upgradable);
 EXPORT_SYMBOL(dmu_objset_id_quota_upgrade);
 #endif
diff --git a/module/zfs/spa.c b/module/zfs/spa.c
index 34de3f1d9525..c481070e1f2d 100644
--- a/module/zfs/spa.c
+++ b/module/zfs/spa.c
@@ -1,11347 +1,11347 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2024 by Delphix. All rights reserved.
  * Copyright (c) 2018, Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Toomas Soome <tsoome@me.com>
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  * Copyright 2018 Joyent, Inc.
  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2021, Colm Buckley <colm@tuatha.org>
  * Copyright (c) 2023 Hewlett Packard Enterprise Development LP.
  * Copyright (c) 2023, 2024, Klara Inc.
  */
 
 /*
  * SPA: Storage Pool Allocator
  *
  * This file contains all the routines used when modifying on-disk SPA state.
  * This includes opening, importing, destroying, exporting a pool, and syncing a
  * pool.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/fm/fs/zfs.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 #include <sys/dmu.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/brt.h>
 #include <sys/ddt.h>
 #include <sys/vdev_impl.h>
 #include <sys/vdev_removal.h>
 #include <sys/vdev_indirect_mapping.h>
 #include <sys/vdev_indirect_births.h>
 #include <sys/vdev_initialize.h>
 #include <sys/vdev_rebuild.h>
 #include <sys/vdev_trim.h>
 #include <sys/vdev_disk.h>
 #include <sys/vdev_raidz.h>
 #include <sys/vdev_draid.h>
 #include <sys/metaslab.h>
 #include <sys/metaslab_impl.h>
 #include <sys/mmp.h>
 #include <sys/uberblock_impl.h>
 #include <sys/txg.h>
 #include <sys/avl.h>
 #include <sys/bpobj.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dmu_objset.h>
 #include <sys/unique.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_synctask.h>
 #include <sys/fs/zfs.h>
 #include <sys/arc.h>
 #include <sys/callb.h>
 #include <sys/systeminfo.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/dsl_scan.h>
 #include <sys/zfeature.h>
 #include <sys/dsl_destroy.h>
 #include <sys/zvol.h>
 
 #ifdef	_KERNEL
 #include <sys/fm/protocol.h>
 #include <sys/fm/util.h>
 #include <sys/callb.h>
 #include <sys/zone.h>
 #include <sys/vmsystm.h>
 #endif	/* _KERNEL */
 
 #include "zfs_crrd.h"
 #include "zfs_prop.h"
 #include "zfs_comutil.h"
 #include <cityhash.h>
 
 /*
  * spa_thread() existed on Illumos as a parent thread for the various worker
  * threads that actually run the pool, as a way to both reference the entire
  * pool work as a single object, and to share properties like scheduling
  * options. It has not yet been adapted to Linux or FreeBSD. This define is
  * used to mark related parts of the code to make things easier for the reader,
  * and to compile this code out. It can be removed when someone implements it,
  * moves it to some Illumos-specific place, or removes it entirely.
  */
 #undef HAVE_SPA_THREAD
 
 /*
  * The "System Duty Cycle" scheduling class is an Illumos feature to help
  * prevent CPU-intensive kernel threads from affecting latency on interactive
  * threads. It doesn't exist on Linux or FreeBSD, so the supporting code is
  * gated behind a define. On Illumos SDC depends on spa_thread(), but
  * spa_thread() also has other uses, so this is a separate define.
  */
 #undef HAVE_SYSDC
 
 /*
  * The interval, in seconds, at which failed configuration cache file writes
  * should be retried.
  */
 int zfs_ccw_retry_interval = 300;
 
 /*
  * How a zio taskq is sized.  The mode is the first member of each
  * initializer produced by the ZTI_* macros below.
  */
 typedef enum zti_modes {
 	ZTI_MODE_FIXED,			/* value is # of threads (min 1) */
 	ZTI_MODE_SCALE,			/* Taskqs scale with CPUs. */
 	ZTI_MODE_SYNC,			/* sync thread assigned */
 	ZTI_MODE_NULL,			/* don't create a taskq */
 	ZTI_NMODES
 } zti_modes_t;
 
 /*
  * Brace initializers of the form { mode, value, count }.
  */
 #define	ZTI_P(n, q)	{ ZTI_MODE_FIXED, (n), (q) }
 /*
  * NOTE(review): ZTI_MODE_ONLINE_PERCENT is not an enumerator of zti_modes_t
  * above, so ZTI_PCT looks stale and can only compile if that name is
  * defined elsewhere -- confirm before using it.
  */
 #define	ZTI_PCT(n)	{ ZTI_MODE_ONLINE_PERCENT, (n), 1 }
 #define	ZTI_SCALE(min)	{ ZTI_MODE_SCALE, (min), 1 }
 #define	ZTI_SYNC	{ ZTI_MODE_SYNC, 0, 1 }
 #define	ZTI_NULL	{ ZTI_MODE_NULL, 0, 0 }
 
 /* Shorthands: n threads in a single taskq, and one thread in one taskq. */
 #define	ZTI_N(n)	ZTI_P(n, 1)
 #define	ZTI_ONE		ZTI_N(1)
 
 /*
  * One cell of the zio_taskqs[][] table, filled in by a ZTI_* initializer.
  */
 typedef struct zio_taskq_info {
 	zti_modes_t zti_mode;	/* sizing policy */
 	uint_t zti_value;	/* mode argument; # of threads for FIXED */
 	uint_t zti_count;	/* third ZTI_* slot; # of taskqs for ZTI_P */
 } zio_taskq_info_t;
 
 /*
  * Short name for each of the ZIO_TASKQ_TYPES taskq types.
  * NOTE(review): the order appears to follow the ISSUE, ISSUE_HIGH, INTR,
  * INTR_HIGH columns of zio_taskqs[] -- confirm against the enum.
  */
 static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = {
 	"iss", "iss_h", "int", "int_h"
 };
 
 /*
  * This table defines the taskq settings for each ZFS I/O type. When
  * initializing a pool, we use this table to create an appropriately sized
  * taskq. Some operations are low volume and therefore have a small, static
  * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE
  * macros. Other operations process a large amount of data; the ZTI_SCALE
  * macro causes us to create a taskq oriented for throughput. Some operations
  * are so high frequency and short-lived that the taskq itself can become a
  * point of lock contention. The ZTI_P(#, #) macro indicates that we need an
  * additional degree of parallelism specified by the number of threads per-
  * taskq and the number of taskqs; when dispatching an event in this case, the
  * particular taskq is chosen at random. ZTI_SCALE uses a number of taskqs
  * that scales with the number of CPUs.
  *
  * The different taskq priorities are to handle the different contexts (issue
  * and interrupt) and then to reserve threads for high priority I/Os that
  * need to be handled with minimum delay.  Illumos taskq has unfair TQ_FRONT
  * implementation, so separate high priority threads are used there.
  *
  * Layout: one row per ZIO type (named in the trailing comment of each row)
  * and one column per taskq type (named in the header comment).  Only the
  * WRITE row differs between the illumos and non-illumos builds.
  */
 static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = {
 	/* ISSUE	ISSUE_HIGH	INTR		INTR_HIGH */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* NULL */
 	{ ZTI_N(8),	ZTI_NULL,	ZTI_SCALE(0),	ZTI_NULL }, /* READ */
 #ifdef illumos
 	{ ZTI_SYNC,	ZTI_N(5),	ZTI_SCALE(0),	ZTI_N(5) }, /* WRITE */
 #else
 	{ ZTI_SYNC,	ZTI_NULL,	ZTI_SCALE(0),	ZTI_NULL }, /* WRITE */
 #endif
 	{ ZTI_SCALE(32), ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FREE */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* CLAIM */
 	{ ZTI_ONE,	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* FLUSH */
 	{ ZTI_N(4),	ZTI_NULL,	ZTI_ONE,	ZTI_NULL }, /* TRIM */
 };
 
 /*
  * Prototypes for file-local (static) functions.
  */
 static void spa_sync_version(void *arg, dmu_tx_t *tx);
 static void spa_sync_props(void *arg, dmu_tx_t *tx);
 static boolean_t spa_has_active_shared_spare(spa_t *spa);
 static int spa_load_impl(spa_t *spa, spa_import_type_t type,
     const char **ereport);
 static void spa_vdev_resilver_done(spa_t *spa);
 
 /*
  * Percentage of all CPUs that can be used by the metaslab preload taskq.
  */
 static uint_t metaslab_preload_pct = 50;
 
 /*
  * NOTE(review): the names suggest a CPU percentage and a threads-per-taskq
  * count for batch (scaled) zio taskqs -- confirm against the code that
  * consumes them, which is not visible here.
  */
 static uint_t	zio_taskq_batch_pct = 80;	  /* 1 thread per cpu in pset */
 static uint_t	zio_taskq_batch_tpq;		  /* threads per taskq */
 
 /* Only compiled when the Illumos SDC scheduling class is available. */
 #ifdef HAVE_SYSDC
 static const boolean_t	zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
 static const uint_t	zio_taskq_basedc = 80;	  /* base duty cycle */
 #endif
 
 /* Only compiled when spa_thread() is available. */
 #ifdef HAVE_SPA_THREAD
 static const boolean_t spa_create_process = B_TRUE; /* no process => no sysdc */
 #endif
 
 /* Presumably threads per write taskq ("tpq") -- TODO confirm at its use. */
 static uint_t	zio_taskq_write_tpq = 16;
 
 /*
  * Report any spa_load_verify errors found, but do not fail spa_load.
  * This is used by zdb to analyze non-idle pools.
  */
 boolean_t	spa_load_verify_dryrun = B_FALSE;
 
 /*
  * Allow reading spacemaps on a read-only import (spa_mode == SPA_MODE_READ).
  * This is used by zdb for spacemap verification.
  */
 boolean_t	spa_mode_readable_spacemaps = B_FALSE;
 
 /*
  * This (illegal) pool name is used when temporarily importing a spa_t in order
  * to get the vdev stats associated with the imported devices.
  */
 #define	TRYIMPORT_NAME	"$import"
 
 /*
  * For debugging purposes: print out vdev tree during pool import.
  */
 static int		spa_load_print_vdev_tree = B_FALSE;
 
 /*
  * A non-zero value for zfs_max_missing_tvds means that we allow importing
  * pools with missing top-level vdevs. This is strictly intended for advanced
  * pool recovery cases since missing data is almost inevitable. Pools with
  * missing devices can only be imported read-only for safety reasons, and their
  * fail-mode will be automatically set to "continue".
  *
  * With 1 missing vdev we should be able to import the pool and mount all
  * datasets. User data that was not modified after the missing device has been
  * added should be recoverable. This means that snapshots created prior to the
  * addition of that device should be completely intact.
  *
  * With 2 missing vdevs, some datasets may fail to mount since there are
  * dataset statistics that are stored as regular metadata. Some data might be
  * recoverable if those vdevs were added recently.
  *
  * With 3 or more missing vdevs, the pool is severely damaged and MOS entries
  * may be missing entirely. Chances of data recovery are very low. Note that
  * there are also risks of performing an inadvertent rewind as we might be
  * missing all the vdevs with the latest uberblocks.
  */
 uint64_t	zfs_max_missing_tvds = 0;
 
 /*
  * The parameters below are similar to zfs_max_missing_tvds but are only
  * intended for a preliminary open of the pool with an untrusted config which
  * might be incomplete or out-dated.
  *
  * We are more tolerant for pools opened from a cachefile since we could have
  * an out-dated cachefile where a device removal was not registered.
  * We could have set the limit arbitrarily high but in the case where devices
  * are really missing we would want to return the proper error codes; we chose
  * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available
  * and we get a chance to retrieve the trusted config.
  */
 uint64_t	zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1;
 
 /*
  * In the case where config was assembled by scanning device paths (/dev/dsks
  * by default) we are less tolerant since all the existing devices should have
  * been detected and we want spa_load to return the right error codes.
  */
 uint64_t	zfs_max_missing_tvds_scan = 0;
 
 /*
  * Debugging aid that pauses spa_sync() towards the end.
  */
 static const boolean_t	zfs_pause_spa_sync = B_FALSE;
 
 /*
  * Variables to indicate the livelist condense zthr func should wait at certain
  * points for the livelist to be removed - used to test condense/destroy races
  */
 static int zfs_livelist_condense_zthr_pause = 0;
 static int zfs_livelist_condense_sync_pause = 0;
 
 /*
  * Variables to track whether or not condense cancellation has been
  * triggered in testing.
  */
 static int zfs_livelist_condense_sync_cancel = 0;
 static int zfs_livelist_condense_zthr_cancel = 0;
 
 /*
  * Variable to track whether or not extra ALLOC blkptrs were added to a
  * livelist entry while it was being condensed (caused by the way we track
  * remapped blkptrs in dbuf_remap_impl)
  */
 static int zfs_livelist_condense_new_alloc = 0;
 
 /*
  * Time variable to decide how often the txg should be added into the
  * database (in seconds).
  * The smallest available resolution is in minutes, which means an update occurs
  * each time we reach `spa_note_txg_time` and the txg has changed. We provide
  * a 256-slot ring buffer for minute-level resolution. The number is limited by
  * the size of the structure we use and the maximum amount of bytes we can write
  * into ZAP. Setting `spa_note_txg_time` to 10 minutes results in approximately
  * 144 records per day. Given the 256 slots, this provides roughly 1.5 days of
  * high-resolution data.
  *
  * The user can decrease `spa_note_txg_time` to increase resolution within
  * a day, at the cost of retaining fewer days of data. Alternatively, increasing
  * the interval allows storing data over a longer period, but with lower
  * frequency.
  *
  * This parameter does not affect the daily or monthly databases, as those only
  * store one record per day and per month, respectively.
  */
 static uint_t spa_note_txg_time = 10 * 60;
 
 /*
  * How often to flush the txg database to disk (in seconds).
  * We flush data every time we write to it, making it the most reliable option.
  * Since this happens every 10 minutes, it shouldn't introduce any noticeable
  * overhead for the system. In case of failure, we will always have an
  * up-to-date version of the database.
  *
  * The user can adjust the flush interval to a lower value, but it probably
  * doesn't make sense to flush more often than the database is updated.
  * The user can also increase the interval if they're concerned about the
  * performance of writing the entire database to disk.
  */
 static uint_t spa_flush_txg_time = 10 * 60;
 
 /*
  * ==========================================================================
  * SPA properties routines
  * ==========================================================================
  */
 
 /*
  * Add one pool property to 'nvl' as a nested nvlist holding its source and
  * its value.  The value is the string 'strval' when that is non-NULL and
  * the integer 'intval' otherwise.
  */
 static void
 spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, const char *strval,
     uint64_t intval, zprop_source_t src)
 {
 	const char *name = zpool_prop_to_name(prop);
 	nvlist_t *entry = fnvlist_alloc();
 
 	fnvlist_add_uint64(entry, ZPROP_SOURCE, src);
 	if (strval == NULL)
 		fnvlist_add_uint64(entry, ZPROP_VALUE, intval);
 	else
 		fnvlist_add_string(entry, ZPROP_VALUE, strval);
 
 	/* The temporary entry is freed right after it has been added. */
 	fnvlist_add_nvlist(nvl, name, entry);
 	nvlist_free(entry);
 }
 
 /*
  * Look up the single pool property named 'propname' and add it to
  * 'outnvl'.  Only ZPOOL_PROP_DEDUPCACHED is handled; any other name yields
  * EINVAL.  Returns 0 on success or the lookup error.
  */
 static int
 spa_prop_add(spa_t *spa, const char *propname, nvlist_t *outnvl)
 {
 	const zpool_prop_t prop = zpool_name_to_prop(propname);
 	uint64_t value;
 	int error;
 
 	/*
 	 * NB: lookups made through this API do not all need the spa props
 	 * lock, so a case that does need it must take the lock itself here.
 	 */
 	switch (prop) {
 	case ZPOOL_PROP_DEDUPCACHED:
 		error = ddt_get_pool_dedup_cached(spa, &value);
 		if (error != 0)
 			return (SET_ERROR(error));
 		break;
 	default:
 		return (SET_ERROR(EINVAL));
 	}
 
 	spa_prop_add_list(outnvl, prop, NULL, value, ZPROP_SRC_NONE);
 	return (0);
 }
 
 /*
  * Add each of the 'n_props' properties named in 'props' to 'outnvl'.
  * A NULL 'props' is a no-op.  Stops at, and returns, the first error from
  * spa_prop_add(); returns 0 if every lookup succeeded.
  */
 int
 spa_prop_get_nvlist(spa_t *spa, char **props, unsigned int n_props,
     nvlist_t *outnvl)
 {
 	if (props == NULL)
 		return (0);
 
 	for (unsigned int i = 0; i < n_props; i++) {
 		int err = spa_prop_add(spa, props[i], outnvl);
 
 		if (err != 0)
 			return (err);
 	}
 
 	return (0);
 }
 
 /*
  * Add a user property to 'nvl' as a nested nvlist holding its source
  * 'src' and its string value 'strval', keyed by 'propname'.
  * Every nvlist operation is wrapped in VERIFY0().
  */
 static void
 spa_prop_add_user(nvlist_t *nvl, const char *propname, char *strval,
     zprop_source_t src)
 {
 	nvlist_t *entry;
 
 	VERIFY0(nvlist_alloc(&entry, NV_UNIQUE_NAME, KM_SLEEP));
 
 	VERIFY0(nvlist_add_uint64(entry, ZPROP_SOURCE, src));
 	VERIFY0(nvlist_add_string(entry, ZPROP_VALUE, strval));
 
 	/* The temporary entry is freed right after it has been added. */
 	VERIFY0(nvlist_add_nvlist(nvl, propname, entry));
 	nvlist_free(entry);
 }
 
 /*
  * Add to 'nv' the pool properties that come from in-core spa state
  * (the spa configuration) rather than from the MOS pool property object.
  * The caller must hold spa_props_lock (asserted).
  */
 static void
 spa_prop_get_config(spa_t *spa, nvlist_t *nv)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	dsl_pool_t *pool = spa->spa_dsl_pool;
 	const zprop_source_t src = ZPROP_SRC_NONE;
 	metaslab_class_t *normal = spa_normal_class(spa);
 	spa_config_dirent_t *dirent;
 	uint64_t max_blocksize, max_dnodesize;
 
 	ASSERT(MUTEX_HELD(&spa->spa_props_lock));
 
 	/* Space, health and version need a root vdev. */
 	if (rvd != NULL) {
 		uint64_t total, allocated, pct_used, version;
 		zprop_source_t version_src;
 
 		/* Totals span every allocation class, not just normal. */
 		allocated = metaslab_class_get_alloc(normal);
 		allocated += metaslab_class_get_alloc(spa_special_class(spa));
 		allocated += metaslab_class_get_alloc(spa_dedup_class(spa));
 		allocated += metaslab_class_get_alloc(
 		    spa_embedded_log_class(spa));
 		allocated += metaslab_class_get_alloc(
 		    spa_special_embedded_log_class(spa));
 
 		total = metaslab_class_get_space(normal);
 		total += metaslab_class_get_space(spa_special_class(spa));
 		total += metaslab_class_get_space(spa_dedup_class(spa));
 		total += metaslab_class_get_space(spa_embedded_log_class(spa));
 		total += metaslab_class_get_space(
 		    spa_special_embedded_log_class(spa));
 
 		spa_prop_add_list(nv, ZPOOL_PROP_NAME, spa_name(spa), 0, src);
 		spa_prop_add_list(nv, ZPOOL_PROP_SIZE, NULL, total, src);
 		spa_prop_add_list(nv, ZPOOL_PROP_ALLOCATED, NULL, allocated,
 		    src);
 		spa_prop_add_list(nv, ZPOOL_PROP_FREE, NULL,
 		    total - allocated, src);
 		spa_prop_add_list(nv, ZPOOL_PROP_CHECKPOINT, NULL,
 		    spa->spa_checkpoint_info.sci_dspace, src);
 
 		/* These two are reported for the normal class only. */
 		spa_prop_add_list(nv, ZPOOL_PROP_FRAGMENTATION, NULL,
 		    metaslab_class_fragmentation(normal), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_EXPANDSZ, NULL,
 		    metaslab_class_expandable_space(normal), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_READONLY, NULL,
 		    (spa_mode(spa) == SPA_MODE_READ), src);
 
 		/* Guard the percentage against a zero-sized pool. */
 		pct_used = (total == 0) ? 0 : (allocated * 100 / total);
 		spa_prop_add_list(nv, ZPOOL_PROP_CAPACITY, NULL, pct_used,
 		    src);
 
 		spa_prop_add_list(nv, ZPOOL_PROP_DEDUPRATIO, NULL,
 		    ddt_get_pool_dedup_ratio(spa), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_BCLONEUSED, NULL,
 		    brt_get_used(spa), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_BCLONESAVED, NULL,
 		    brt_get_saved(spa), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_BCLONERATIO, NULL,
 		    brt_get_ratio(spa), src);
 
 		spa_prop_add_list(nv, ZPOOL_PROP_DEDUP_TABLE_SIZE, NULL,
 		    ddt_get_ddt_dsize(spa), src);
 		spa_prop_add_list(nv, ZPOOL_PROP_HEALTH, NULL,
 		    rvd->vdev_state, src);
 		spa_prop_add_list(nv, ZPOOL_PROP_LAST_SCRUBBED_TXG, NULL,
 		    spa_get_last_scrubbed_txg(spa), src);
 
 		/* The version source is DEFAULT only for the default value. */
 		version = spa_version(spa);
 		if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION))
 			version_src = ZPROP_SRC_DEFAULT;
 		else
 			version_src = ZPROP_SRC_LOCAL;
 		spa_prop_add_list(nv, ZPOOL_PROP_VERSION, NULL,
 		    version, version_src);
 
 		spa_prop_add_list(nv, ZPOOL_PROP_LOAD_GUID,
 		    NULL, spa_load_guid(spa), src);
 	}
 
 	if (pool != NULL) {
 		uint64_t freeing = 0;
 		uint64_t leaked = 0;
 
 		/*
 		 * The $FREE directory was introduced in SPA_VERSION_DEADLISTS,
 		 * when opening pools before this version freedir will be NULL.
 		 * A missing directory is reported as 0 bytes.
 		 */
 		if (pool->dp_free_dir != NULL) {
 			freeing =
 			    dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes;
 		}
 		spa_prop_add_list(nv, ZPOOL_PROP_FREEING, NULL, freeing, src);
 
 		if (pool->dp_leak_dir != NULL) {
 			leaked =
 			    dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes;
 		}
 		spa_prop_add_list(nv, ZPOOL_PROP_LEAKED, NULL, leaked, src);
 	}
 
 	spa_prop_add_list(nv, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src);
 
 	/* Optional strings are only reported when they are set. */
 	if (spa->spa_comment != NULL) {
 		spa_prop_add_list(nv, ZPOOL_PROP_COMMENT, spa->spa_comment,
 		    0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_compatibility != NULL) {
 		spa_prop_add_list(nv, ZPOOL_PROP_COMPATIBILITY,
 		    spa->spa_compatibility, 0, ZPROP_SRC_LOCAL);
 	}
 
 	if (spa->spa_root != NULL) {
 		spa_prop_add_list(nv, ZPOOL_PROP_ALTROOT, spa->spa_root,
 		    0, ZPROP_SRC_LOCAL);
 	}
 
 	/* Size limits depend on which features are enabled. */
 	max_blocksize = SPA_OLD_MAXBLOCKSIZE;
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
 		max_blocksize = MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE);
 	spa_prop_add_list(nv, ZPOOL_PROP_MAXBLOCKSIZE, NULL,
 	    max_blocksize, ZPROP_SRC_NONE);
 
 	max_dnodesize = DNODE_MIN_SIZE;
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE))
 		max_dnodesize = DNODE_MAX_SIZE;
 	spa_prop_add_list(nv, ZPOOL_PROP_MAXDNODESIZE, NULL,
 	    max_dnodesize, ZPROP_SRC_NONE);
 
 	/*
 	 * cachefile: "none" when the head entry has no path; otherwise the
 	 * path, but only if it differs from spa_config_path.
 	 */
 	dirent = list_head(&spa->spa_config_list);
 	if (dirent == NULL)
 		return;
 
 	if (dirent->scd_path == NULL) {
 		spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE,
 		    "none", 0, ZPROP_SRC_LOCAL);
 	} else if (strcmp(dirent->scd_path, spa_config_path) != 0) {
 		spa_prop_add_list(nv, ZPOOL_PROP_CACHEFILE,
 		    dirent->scd_path, 0, ZPROP_SRC_LOCAL);
 	}
 }
 
 /*
  * Get zpool property values.
  *
  * Fills 'nv' first with the properties taken from the spa config
  * (spa_prop_get_config()) and then, if the pool has a property object,
  * with every entry found in that MOS ZAP object.
  *
  * Returns 0 on success.  The error that ended the ZAP iteration is
  * returned unless it is ENOENT, which is treated as success.
  */
 int
 spa_prop_get(spa_t *spa, nvlist_t *nv)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	zap_cursor_t zc;
 	zap_attribute_t *za;
 	dsl_pool_t *dp;
 	int err = 0;
 
 	/*
 	 * Enter the pool config first, then take spa_props_lock; both are
 	 * released in the reverse order at 'out'.
 	 */
 	dp = spa_get_dsl(spa);
 	dsl_pool_config_enter(dp, FTAG);
 	za = zap_attribute_alloc();
 	mutex_enter(&spa->spa_props_lock);
 
 	/*
 	 * Get properties from the spa config.
 	 */
 	spa_prop_get_config(spa, nv);
 
 	/* If no pool property object, no more prop to get. */
 	if (mos == NULL || spa->spa_pool_props_object == 0)
 		goto out;
 
 	/*
 	 * Get properties from the MOS pool property object.
 	 */
 	for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object);
 	    (err = zap_cursor_retrieve(&zc, za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		uint64_t intval = 0;
 		char *strval = NULL;
 		zprop_source_t src = ZPROP_SRC_DEFAULT;
 		zpool_prop_t prop;
 
 		/* Skip entries that are neither a pool nor a user property. */
 		if ((prop = zpool_name_to_prop(za->za_name)) ==
 		    ZPOOL_PROP_INVAL && !zfs_prop_user(za->za_name))
 			continue;
 
 		/* The ZAP integer width selects integer vs. string handling. */
 		switch (za->za_integer_length) {
 		case 8:
 			/* integer property */
 			if (za->za_first_integer !=
 			    zpool_prop_default_numeric(prop))
 				src = ZPROP_SRC_LOCAL;
 
 			/*
 			 * bootfs is stored as a dataset object number but is
 			 * reported as the dataset's name.
 			 */
 			if (prop == ZPOOL_PROP_BOOTFS) {
 				dsl_dataset_t *ds = NULL;
 
 				err = dsl_dataset_hold_obj(dp,
 				    za->za_first_integer, FTAG, &ds);
 				/*
 				 * NOTE(review): this break only leaves the
 				 * switch.  The loop carries on and the next
 				 * zap_cursor_retrieve() overwrites err, so a
 				 * failed hold just omits bootfs.
 				 */
 				if (err != 0)
 					break;
 
 				strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN,
 				    KM_SLEEP);
 				dsl_dataset_name(ds, strval);
 				dsl_dataset_rele(ds, FTAG);
 			} else {
 				strval = NULL;
 				intval = za->za_first_integer;
 			}
 
 			spa_prop_add_list(nv, prop, strval, intval, src);
 
 			if (strval != NULL)
 				kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN);
 
 			break;
 
 		case 1:
 			/* string property */
 			strval = kmem_alloc(za->za_num_integers, KM_SLEEP);
 			err = zap_lookup(mos, spa->spa_pool_props_object,
 			    za->za_name, 1, za->za_num_integers, strval);
 			/*
 			 * As above, a lookup failure skips only this entry;
 			 * err is overwritten by the next retrieve.
 			 */
 			if (err) {
 				kmem_free(strval, za->za_num_integers);
 				break;
 			}
 			if (prop != ZPOOL_PROP_INVAL) {
 				spa_prop_add_list(nv, prop, strval, 0, src);
 			} else {
 				/* User properties are always reported as local. */
 				src = ZPROP_SRC_LOCAL;
 				spa_prop_add_user(nv, za->za_name, strval,
 				    src);
 			}
 			kmem_free(strval, za->za_num_integers);
 			break;
 
 		default:
 			/* Entries of any other integer width are ignored. */
 			break;
 		}
 	}
 	zap_cursor_fini(&zc);
 out:
 	mutex_exit(&spa->spa_props_lock);
 	dsl_pool_config_exit(dp, FTAG);
 	zap_attribute_free(za);
 
 	/* Presumably ENOENT is the normal end of the cursor -- confirm. */
 	if (err && err != ENOENT)
 		return (err);
 
 	return (0);
 }
 
 /*
  * Validate the given pool properties nvlist and modify the list
  * for the property values to be set.
  *
  * Pairs are checked in list order and checking stops at the first pair
  * that fails.  Returns 0 if every pair is acceptable, otherwise the error
  * for the failing pair.
  *
  * Side effects visible in this function:
  *  - any dedupditto entry is removed from 'props';
  *  - on success a string bootfs entry is replaced by the uint64 object
  *    number of that dataset;
  *  - multihost validation stores the host id in spa->spa_hostid;
  *  - failmode on a suspended pool is applied in core and EIO is returned.
  */
 static int
 spa_prop_validate(spa_t *spa, nvlist_t *props)
 {
 	nvpair_t *elem;
 	int error = 0, reset_bootfs = 0;
 	uint64_t objnum = 0;
 	boolean_t has_feature = B_FALSE;
 
 	elem = NULL;
 	while ((elem = nvlist_next_nvpair(props, elem)) != NULL) {
 		uint64_t intval;
 		const char *strval, *slash, *check, *fname;
 		const char *propname = nvpair_name(elem);
 		zpool_prop_t prop = zpool_name_to_prop(propname);
 
 		switch (prop) {
 		case ZPOOL_PROP_INVAL:
 			/*
 			 * Sanitize the input.
 			 *
 			 * A name that is not a native pool property must be
 			 * either a user property or a feature request.
 			 */
 			if (zfs_prop_user(propname)) {
 				if (strlen(propname) >= ZAP_MAXNAMELEN) {
 					error = SET_ERROR(ENAMETOOLONG);
 					break;
 				}
 
 				if (strlen(fnvpair_value_string(elem)) >=
 				    ZAP_MAXVALUELEN) {
 					error = SET_ERROR(E2BIG);
 					break;
 				}
 			} else if (zpool_prop_feature(propname)) {
 				/* A feature request must be a uint64 equal to 0. */
 				if (nvpair_type(elem) != DATA_TYPE_UINT64) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				if (nvpair_value_uint64(elem, &intval) != 0) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				if (intval != 0) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				/*
 				 * The feature name follows the '@'.  This
 				 * assumes zpool_prop_feature() only accepts
 				 * names containing '@' -- confirm.
 				 */
 				fname = strchr(propname, '@') + 1;
 				if (zfeature_lookup_name(fname, NULL) != 0) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 
 				has_feature = B_TRUE;
 			} else {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 			break;
 
 		case ZPOOL_PROP_VERSION:
 			/*
 			 * The version may not go down, may not exceed
 			 * SPA_VERSION_BEFORE_FEATURES, and may not follow a
 			 * feature request earlier in the same list.
 			 */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error &&
 			    (intval < spa_version(spa) ||
 			    intval > SPA_VERSION_BEFORE_FEATURES ||
 			    has_feature))
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
 			/* Any uint64 value is accepted. */
 			error = nvpair_value_uint64(elem, &intval);
 			break;
 
 		case ZPOOL_PROP_DELEGATION:
 		case ZPOOL_PROP_AUTOREPLACE:
 		case ZPOOL_PROP_LISTSNAPS:
 		case ZPOOL_PROP_AUTOEXPAND:
 		case ZPOOL_PROP_AUTOTRIM:
 			/* Only 0 and 1 are accepted. */
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_MULTIHOST:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > 1)
 				error = SET_ERROR(EINVAL);
 
 			/*
 			 * A non-zero host id is required, and is recorded in
 			 * the spa here, during validation.
 			 */
 			if (!error) {
 				uint32_t hostid = zone_get_hostid(NULL);
 				if (hostid)
 					spa->spa_hostid = hostid;
 				else
 					error = SET_ERROR(ENOTSUP);
 			}
 
 			break;
 
 		case ZPOOL_PROP_BOOTFS:
 			/*
 			 * If the pool version is less than SPA_VERSION_BOOTFS,
 			 * or the pool is still being created (version == 0),
 			 * the bootfs property cannot be set.
 			 */
 			if (spa_version(spa) < SPA_VERSION_BOOTFS) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			/*
 			 * Make sure the vdev config is bootable
 			 */
 			if (!vdev_is_bootable(spa->spa_root_vdev)) {
 				error = SET_ERROR(ENOTSUP);
 				break;
 			}
 
 			/* The string entry is swapped for objnum after the loop. */
 			reset_bootfs = 1;
 
 			error = nvpair_value_string(elem, &strval);
 
 			if (!error) {
 				objset_t *os;
 
 				/* An empty value resets bootfs to its default. */
 				if (strval == NULL || strval[0] == '\0') {
 					objnum = zpool_prop_default_numeric(
 					    ZPOOL_PROP_BOOTFS);
 					break;
 				}
 
 				error = dmu_objset_hold(strval, FTAG, &os);
 				if (error != 0)
 					break;
 
 				/* Must be ZPL. */
 				if (dmu_objset_type(os) != DMU_OST_ZFS) {
 					error = SET_ERROR(ENOTSUP);
 				} else {
 					objnum = dmu_objset_id(os);
 				}
 				dmu_objset_rele(os, FTAG);
 			}
 			break;
 
 		case ZPOOL_PROP_FAILUREMODE:
 			error = nvpair_value_uint64(elem, &intval);
 			if (!error && intval > ZIO_FAILURE_MODE_PANIC)
 				error = SET_ERROR(EINVAL);
 
 			/*
 			 * This is a special case which only occurs when
 			 * the pool has completely failed. This allows
 			 * the user to change the in-core failmode property
 			 * without syncing it out to disk (I/Os might
 			 * currently be blocked). We do this by returning
 			 * EIO to the caller (spa_prop_set) to trick it
 			 * into thinking we encountered a property validation
 			 * error.
 			 */
 			if (!error && spa_suspended(spa)) {
 				spa->spa_failmode = intval;
 				error = SET_ERROR(EIO);
 			}
 			break;
 
 		case ZPOOL_PROP_CACHEFILE:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 
 			/* "" and "none" need no path checks. */
 			if (strval[0] == '\0')
 				break;
 
 			if (strcmp(strval, "none") == 0)
 				break;
 
 			/* Anything else must be an absolute path ... */
 			if (strval[0] != '/') {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 
 			slash = strrchr(strval, '/');
 			ASSERT(slash != NULL);
 
 			/* ... whose last component is not empty, "." or "..". */
 			if (slash[1] == '\0' || strcmp(slash, "/.") == 0 ||
 			    strcmp(slash, "/..") == 0)
 				error = SET_ERROR(EINVAL);
 			break;
 
 		case ZPOOL_PROP_COMMENT:
 			if ((error = nvpair_value_string(elem, &strval)) != 0)
 				break;
 			/*
 			 * Reject non-printable characters.  The break below
 			 * only leaves this for loop, so the length check
 			 * still runs and can replace EINVAL with E2BIG.
 			 */
 			for (check = strval; *check != '\0'; check++) {
 				if (!isprint(*check)) {
 					error = SET_ERROR(EINVAL);
 					break;
 				}
 			}
 			if (strlen(strval) > ZPROP_MAX_COMMENT)
 				error = SET_ERROR(E2BIG);
 			break;
 
 		default:
 			/* Other properties are not validated here. */
 			break;
 		}
 
 		/* Stop at the first pair that failed. */
 		if (error)
 			break;
 	}
 
 	/*
 	 * dedupditto is dropped from the list whether or not validation
 	 * succeeded; the return value is deliberately ignored.
 	 */
 	(void) nvlist_remove_all(props,
 	    zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO));
 
 	/* Replace the bootfs dataset name with the object number found above. */
 	if (!error && reset_bootfs) {
 		error = nvlist_remove(props,
 		    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING);
 
 		if (!error) {
 			error = nvlist_add_uint64(props,
 			    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum);
 		}
 	}
 
 	return (error);
 }
 
 /*
  * Record a new cachefile location for the pool, taken from the "cachefile"
  * entry of nvp.  Nothing happens when nvp has no such string entry.
  *
  * An empty value selects spa_config_path, the literal "none" is stored as a
  * NULL path, and any other value is duplicated as given.  The new entry is
  * inserted at the head of spa_config_list; when need_sync is set an
  * asynchronous config update is requested as well.
  */
 void
 spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync)
 {
 	const char *path;
 	spa_config_dirent_t *dirent;
 
 	if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE),
 	    &path) != 0) {
 		return;
 	}
 
 	dirent = kmem_alloc(sizeof (*dirent), KM_SLEEP);
 
 	if (*path == '\0') {
 		dirent->scd_path = spa_strdup(spa_config_path);
 	} else if (strcmp(path, "none") != 0) {
 		dirent->scd_path = spa_strdup(path);
 	} else {
 		dirent->scd_path = NULL;
 	}
 
 	list_insert_head(&spa->spa_config_list, dirent);
 
 	if (need_sync) {
 		spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 	}
 }
 
 /*
  * Apply the pool properties in nvp to the pool.
  *
  * The list is first checked with spa_prop_validate(); a validation error is
  * returned unchanged.  The pairs are then scanned to decide whether a
  * spa_sync_props() sync task is required, and version changes are pushed
  * through their own spa_sync_version() sync task along the way.
  *
  * Returns 0 on success, otherwise the error from validation or from one of
  * the sync tasks.
  */
 int
 spa_prop_set(spa_t *spa, nvlist_t *nvp)
 {
 	int error;
 	nvpair_t *elem = NULL;
 	boolean_t need_sync = B_FALSE;
 
 	if ((error = spa_prop_validate(spa, nvp)) != 0)
 		return (error);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) {
 		zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem));
 
 		/* These three never force a sync task by themselves. */
 		if (prop == ZPOOL_PROP_CACHEFILE ||
 		    prop == ZPOOL_PROP_ALTROOT ||
 		    prop == ZPOOL_PROP_READONLY)
 			continue;
 
 		/*
 		 * A user property is enough to require spa_sync_props();
 		 * the remaining pairs need not be inspected.
 		 */
 		if (prop == ZPOOL_PROP_INVAL &&
 		    zfs_prop_user(nvpair_name(elem))) {
 			need_sync = B_TRUE;
 			break;
 		}
 
 		/*
 		 * Either an explicit version, or (ZPOOL_PROP_INVAL) a name
 		 * that is asserted to be a feature property, which implies
 		 * SPA_VERSION_FEATURES.
 		 */
 		if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) {
 			uint64_t ver = 0;
 
 			if (prop == ZPOOL_PROP_VERSION) {
 				VERIFY0(nvpair_value_uint64(elem, &ver));
 			} else {
 				ASSERT(zpool_prop_feature(nvpair_name(elem)));
 				ver = SPA_VERSION_FEATURES;
 				need_sync = B_TRUE;
 			}
 
 			/* Save time if the version is already set. */
 			if (ver == spa_version(spa))
 				continue;
 
 			/*
 			 * In addition to the pool directory object, we might
 			 * create the pool properties object, the features for
 			 * read object, the features for write object, or the
 			 * feature descriptions object.
 			 */
 			error = dsl_sync_task(spa->spa_name, NULL,
 			    spa_sync_version, &ver,
 			    6, ZFS_SPACE_CHECK_RESERVED);
 			if (error)
 				return (error);
 			continue;
 		}
 
 		/* Any other property has to be written by spa_sync_props(). */
 		need_sync = B_TRUE;
 		break;
 	}
 
 	/* The whole list is handed to the sync task, not just one pair. */
 	if (need_sync) {
 		return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props,
 		    nvp, 6, ZFS_SPACE_CHECK_RESERVED));
 	}
 
 	return (0);
 }
 
 /*
  * Remove the pool's bootfs property when it currently refers to dsobj.
  * Nothing is done if bootfs names a different object or if the pool has no
  * properties object.
  */
 void
 spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx)
 {
 	if (spa->spa_bootfs != dsobj || spa->spa_pool_props_object == 0)
 		return;
 
 	VERIFY(zap_remove(spa->spa_meta_objset,
 	    spa->spa_pool_props_object,
 	    zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0);
 
 	/* Keep the in-core copy in step with the removed ZAP entry. */
 	spa->spa_bootfs = 0;
 }
 
 /*
  * Check callback for the guid-change sync task (see spa_change_guid()).
  *
  * Fails with a checkpoint-specific error while the pool checkpoint feature
  * is active, and with ENXIO when the root vdev is not healthy.  arg points
  * to the proposed guid, which is only consulted by an assertion.
  */
 static int
 spa_change_guid_check(void *arg, dmu_tx_t *tx)
 {
 	uint64_t *guidp __maybe_unused = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t root_state;
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		if (spa_has_checkpoint(spa))
 			return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS));
 		return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT));
 	}
 
 	/* Sample the root vdev state under the state config lock. */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	root_state = root->vdev_state;
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	if (root_state != VDEV_STATE_HEALTHY) {
 		return (SET_ERROR(ENXIO));
 	}
 
 	ASSERT3U(spa_guid(spa), !=, *guidp);
 
 	return (0);
 }
 
 /*
  * Sync callback for the guid-change sync task (see spa_change_guid()).
  *
  * arg points to the new guid.  The root vdev's guid is replaced, its guid
  * sum is adjusted by the difference between new and old, the root vdev is
  * marked config-dirty, and the change is recorded in the pool history.
  */
 static void
 spa_change_guid_sync(void *arg, dmu_tx_t *tx)
 {
 	const uint64_t guid = *(uint64_t *)arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	vdev_t *root = spa->spa_root_vdev;
 	const uint64_t prev = spa_guid(spa);
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	root->vdev_guid = guid;
 	root->vdev_guid_sum += (guid - prev);
 	vdev_config_dirty(root);
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu",
 	    (u_longlong_t)prev, (u_longlong_t)guid);
 }
 
 /*
  * Change the GUID for the pool.  This is done so that we can later
  * re-import a pool built from a clone of our own vdevs.  We will modify
  * the root vdev's guid, our own pool guid, and then mark all of our
  * vdevs dirty.  Note that we must make sure that all our vdevs are
  * online when we do this, or else any vdevs that weren't present
  * would be orphaned from our pool.  We are also going to issue a
  * sysevent to update any watchers.
  *
  * If guidp is NULL a new GUID is generated; otherwise the pool GUID is
  * changed to the value guidp points to.  That value may not be the
  * reserved value 0 (EINVAL) and may not already exist (EEXIST).  Any
  * other failure is whatever the sync task returned.
  */
 int
 spa_change_guid(spa_t *spa, const uint64_t *guidp)
 {
 	uint64_t guid;
 	int error;
 
 	mutex_enter(&spa->spa_vdev_top_lock);
 	spa_namespace_enter(FTAG);
 
 	if (guidp == NULL) {
 		guid = spa_generate_guid(NULL);
 	} else if (*guidp == 0) {
 		error = SET_ERROR(EINVAL);
 		goto out;
 	} else if (spa_guid_exists(*guidp, 0)) {
 		error = SET_ERROR(EEXIST);
 		goto out;
 	} else {
 		guid = *guidp;
 	}
 
 	error = dsl_sync_task(spa->spa_name, spa_change_guid_check,
 	    spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED);
 	if (error != 0)
 		goto out;
 
 	/*
 	 * Clear the kobj flag from all the vdevs to allow
 	 * vdev_cache_process_kobj_evt() to post events to all the
 	 * vdevs since GUID is updated.
 	 */
 	vdev_clear_kobj_evt(spa->spa_root_vdev);
 	for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 		vdev_clear_kobj_evt(spa->spa_l2cache.sav_vdevs[i]);
 
 	spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID);
 
 out:
 	spa_namespace_exit(FTAG);
 	mutex_exit(&spa->spa_vdev_top_lock);
 
 	return (error);
 }
 
 /*
  * ==========================================================================
  * SPA state manipulation (open/create/destroy/import/export)
  * ==========================================================================
  */
 
 /*
  * AVL comparator for spa_error_entry_t nodes: entries are ordered by a
  * bytewise comparison of their bookmarks, reduced to a sign by TREE_ISIGN().
  */
 static int
 spa_error_entry_compare(const void *a, const void *b)
 {
 	const spa_error_entry_t *lhs = a;
 	const spa_error_entry_t *rhs = b;
 	const int diff = memcmp(&lhs->se_bookmark, &rhs->se_bookmark,
 	    sizeof (zbookmark_phys_t));
 
 	return (TREE_ISIGN(diff));
 }
 
 /*
  * Hand the caller copies of the current "last" and "scrub" error trees and
  * re-initialize both trees in the spa.  The caller must hold
  * spa_errlist_lock.
  */
 void
 spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub)
 {
 	const size_t entry_size = sizeof (spa_error_entry_t);
 	const size_t link_off = offsetof(spa_error_entry_t, se_avl);
 
 	ASSERT(MUTEX_HELD(&spa->spa_errlist_lock));
 
 	/* Copy the tree heads out before they are reset below. */
 	memcpy(last, &spa->spa_errlist_last, sizeof (*last));
 	memcpy(scrub, &spa->spa_errlist_scrub, sizeof (*scrub));
 
 	avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare,
 	    entry_size, link_off);
 	avl_create(&spa->spa_errlist_last, spa_error_entry_compare,
 	    entry_size, link_off);
 }
 
 /*
  * Create the taskqs that serve zio type t / queue type q for this pool.
  *
  * The configuration comes from zio_taskqs[t][q].  Depending on its mode
  * this works out how many taskqs to create (count) and the thread setting
  * for each of them (value, either an absolute number or, when
  * TASKQ_THREADS_CPU_PCT is set in flags, a percentage).  The resulting
  * array is stored in spa->spa_zio_taskq[t][q].
  *
  * ZTI_MODE_NULL creates nothing and leaves the slot with a NULL array and a
  * zero count.  An unknown mode panics.
  */
 static void
 spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	const zio_taskq_info_t *ztip = &zio_taskqs[t][q];
 	enum zti_modes mode = ztip->zti_mode;
 	uint_t value = ztip->zti_value;
 	uint_t count = ztip->zti_count;
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	uint_t cpus, threads, flags = TASKQ_DYNAMIC;
 
 	switch (mode) {
 	case ZTI_MODE_FIXED:
 		/* count and value are used exactly as configured. */
 		ASSERT3U(value, >, 0);
 		break;
 
 	case ZTI_MODE_SYNC:
 
 		/*
 		 * Create one wr_iss taskq for every 'zio_taskq_write_tpq' CPUs,
 		 * not to exceed the number of spa allocators, and align to it.
 		 */
 		threads = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
 		count = MAX(1, threads / MAX(1, zio_taskq_write_tpq));
 		count = MAX(count, (zio_taskq_batch_pct + 99) / 100);
 		count = MIN(count, spa->spa_alloc_count);
 		while (spa->spa_alloc_count % count != 0 &&
 		    spa->spa_alloc_count < count * 2)
 			count--;
 
 		/*
 		 * zio_taskq_batch_pct is unbounded and may exceed 100%, but no
 		 * single taskq may have more threads than 100% of online cpus.
 		 */
 		value = (zio_taskq_batch_pct + count / 2) / count;
 		value = MIN(value, 100);
 		flags |= TASKQ_THREADS_CPU_PCT;
 		break;
 
 	case ZTI_MODE_SCALE:
 		/*
 		 * We want more taskqs to reduce lock contention, but we want
 		 * less for better request ordering and CPU utilization.
 		 */
 		threads = MAX(1, boot_ncpus * zio_taskq_batch_pct / 100);
 		/* The configured value acts as a lower bound on threads. */
 		threads = MAX(threads, value);
 		if (zio_taskq_batch_tpq > 0) {
 			count = MAX(1, (threads + zio_taskq_batch_tpq / 2) /
 			    zio_taskq_batch_tpq);
 		} else {
 			/*
 			 * Prefer 6 threads per taskq, but no more taskqs
 			 * than threads in them on large systems. For 80%:
 			 *
 			 *                 taskq   taskq   total
 			 * cpus    taskqs  percent threads threads
 			 * ------- ------- ------- ------- -------
 			 * 1       1       80%     1       1
 			 * 2       1       80%     1       1
 			 * 4       1       80%     3       3
 			 * 8       2       40%     3       6
 			 * 16      3       27%     4       12
 			 * 32      5       16%     5       25
 			 * 64      7       11%     7       49
 			 * 128     10      8%      10      100
 			 * 256     14      6%      15      210
 			 */
 			cpus = MIN(threads, boot_ncpus);
 			count = 1 + threads / 6;
 			while (count * count > cpus)
 				count--;
 		}
 
 		/*
 		 * Try to represent the number of threads per taskq as percent
 		 * of online CPUs to allow scaling with later online/offline.
 		 * Fall back to absolute numbers if can't.
 		 */
 		value = (threads * 100 + boot_ncpus * count / 2) /
 		    (boot_ncpus * count);
 		if (value < 5 || value > 100)
 			value = MAX(1, (threads + count / 2) / count);
 		else
 			flags |= TASKQ_THREADS_CPU_PCT;
 		break;
 
 	case ZTI_MODE_NULL:
 		/* No taskqs for this slot; spa_taskqs_fini() checks for this. */
 		tqs->stqs_count = 0;
 		tqs->stqs_taskq = NULL;
 		return;
 
 	default:
 		panic("unrecognized mode for %s_%s taskq (%u:%u) in "
 		    "spa_taskqs_init()",
 		    zio_type_name[t], zio_taskq_types[q], mode, value);
 		break;
 	}
 
 	ASSERT3U(count, >, 0);
 	tqs->stqs_count = count;
 	tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP);
 
 	for (uint_t i = 0; i < count; i++) {
 		taskq_t *tq;
 		char name[32];
 
 		/* Only append an index when there is more than one taskq. */
 		if (count > 1)
 			(void) snprintf(name, sizeof (name), "%s_%s_%u",
 			    zio_type_name[t], zio_taskq_types[q], i);
 		else
 			(void) snprintf(name, sizeof (name), "%s_%s",
 			    zio_type_name[t], zio_taskq_types[q]);
 
 #ifdef HAVE_SYSDC
 		if (zio_taskq_sysdc && spa->spa_proc != &p0) {
 			(void) zio_taskq_basedc;
 			tq = taskq_create_sysdc(name, value, 50, INT_MAX,
 			    spa->spa_proc, zio_taskq_basedc, flags);
 		} else {
 #endif
 			/*
 			 * The write issue taskq can be extremely CPU
 			 * intensive.  Run it at slightly less important
 			 * priority than the other taskqs.
 			 */
 			const pri_t pri = (t == ZIO_TYPE_WRITE &&
 			    q == ZIO_TASKQ_ISSUE) ?
 			    wtqclsyspri : maxclsyspri;
 			tq = taskq_create_proc(name, value, pri, 50,
 			    INT_MAX, spa->spa_proc, flags);
 #ifdef HAVE_SYSDC
 		}
 #endif
 
 		tqs->stqs_taskq[i] = tq;
 	}
 }
 
 /*
  * Destroy every taskq that spa_taskqs_init() created for zio type t / queue
  * type q and free the array that held them.
  *
  * A slot without an array (ZTI_MODE_NULL, or one already torn down) is left
  * alone.  stqs_count is reset together with stqs_taskq so that the "no
  * array implies zero count" assertion below still holds if this is called
  * again for the same slot.
  */
 static void
 spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 
 	if (tqs->stqs_taskq == NULL) {
 		ASSERT0(tqs->stqs_count);
 		return;
 	}
 
 	for (uint_t i = 0; i < tqs->stqs_count; i++) {
 		ASSERT3P(tqs->stqs_taskq[i], !=, NULL);
 		taskq_destroy(tqs->stqs_taskq[i]);
 	}
 
 	/* The free size must match the allocation in spa_taskqs_init(). */
 	kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *));
 	tqs->stqs_taskq = NULL;
 	tqs->stqs_count = 0;
 }
 
 #ifdef _KERNEL
 /*
  * The READ and WRITE rows of zio_taskqs are configurable at module load time
  * by setting zio_taskq_read or zio_taskq_write.
  *
  * Example (the defaults for READ and WRITE)
  *   zio_taskq_read='fixed,1,8 null scale null'
  *   zio_taskq_write='sync null scale null'
  *
  * Each sets the entire row at a time.
  *
  * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number
  * of threads per taskq.
  *
  * 'null' can only be set on the high-priority queues (queue selection for
  * high-priority queues will fall back to the regular queue if the high-pri
  * is NULL).
  *
  * The table below holds the mode keywords.  It is indexed by the zti_mode
  * value both when parsing and when printing a row.
  */
 static const char *const modes[ZTI_NMODES] = {
 	"fixed", "scale", "sync", "null"
 };
 
 /*
  * Parse the incoming config string and, if it is valid, install it as the
  * zio_taskqs row for zio type t.
  *
  * cfg is modified in place: the separators after each mode and before its
  * parameters are overwritten with '\0' while tokenising.
  *
  * The string must supply exactly ZIO_TASKQ_TYPES space-separated entries.
  * Parsing goes into a local row first, so zio_taskqs[t] is only written
  * once the whole string has been accepted.
  *
  * Returns 0 on success, or EINVAL if any entry is malformed, an entry is
  * missing, or there is trailing text.
  */
 static int
 spa_taskq_param_set(zio_type_t t, char *cfg)
 {
 	int err = 0;
 
 	zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}};
 
 	char *next = cfg, *tok, *c;
 
 	/*
 	 * Parse out each element from the string and fill `row`. The entire
 	 * row has to be set at once, so any errors are flagged by just
 	 * breaking out of this loop early.
 	 */
 	uint_t q;
 	for (q = 0; q < ZIO_TASKQ_TYPES; q++) {
 		/* `next` is the start of the config */
 		if (next == NULL)
 			break;
 
 		/* Eat up leading space */
 		while (isspace(*next))
 			next++;
 		if (*next == '\0')
 			break;
 
 		/* Mode ends at space or end of string */
 		tok = next;
 		next = strchr(tok, ' ');
 		if (next != NULL) *next++ = '\0';
 
 		/* Parameters start after a comma */
 		c = strchr(tok, ',');
 		if (c != NULL) *c++ = '\0';
 
 		/* Match mode string */
 		uint_t mode;
 		for (mode = 0; mode < ZTI_NMODES; mode++)
 			if (strcmp(tok, modes[mode]) == 0)
 				break;
 		if (mode == ZTI_NMODES)
 			break;
 
 		/*
 		 * Invalid canary: each case below overwrites row[q] only on
 		 * success, so this value surviving the switch means failure.
 		 */
 		row[q].zti_mode = ZTI_NMODES;
 
 		/* Per-mode setup */
 		switch (mode) {
 
 		/*
 		 * FIXED is parameterised: number of queues, and number of
 		 * threads per queue.
 		 */
 		case ZTI_MODE_FIXED: {
 			/* No parameters? */
 			if (c == NULL || *c == '\0')
 				break;
 
 			/* Find next parameter */
 			tok = c;
 			c = strchr(tok, ',');
 			if (c == NULL)
 				break;
 
 			/* Take digits and convert */
 			unsigned long long nq;
 			if (!(isdigit(*tok)))
 				break;
 			err = ddi_strtoull(tok, &tok, 10, &nq);
 			/* Must succeed and also end at the next param sep */
 			if (err != 0 || tok != c)
 				break;
 
 			/* Move past the comma */
 			tok++;
 			/* Need another number */
 			if (!(isdigit(*tok)))
 				break;
 			/* Remember start to make sure we moved */
 			c = tok;
 
 			/* Take digits */
 			unsigned long long ntpq;
 			err = ddi_strtoull(tok, &tok, 10, &ntpq);
 			/* Must succeed, and moved forward */
 			if (err != 0 || tok == c || *tok != '\0')
 				break;
 
 			/*
 			 * sanity; zero queues/threads make no sense, and
 			 * 16K is almost certainly more than anyone will ever
 			 * need and avoids silly numbers like UINT32_MAX
 			 */
 			if (nq == 0 || nq >= 16384 ||
 			    ntpq == 0 || ntpq >= 16384)
 				break;
 
 			const zio_taskq_info_t zti = ZTI_P(ntpq, nq);
 			row[q] = zti;
 			break;
 		}
 
 		/*
 		 * SCALE is optionally parameterised by minimum number of
 		 * threads.
 		 */
 		case ZTI_MODE_SCALE: {
 			unsigned long long mint = 0;
 			if (c != NULL && *c != '\0') {
 				/* Need a number */
 				if (!(isdigit(*c)))
 					break;
 				tok = c;
 
 				/* Take digits */
 				err = ddi_strtoull(tok, &tok, 10, &mint);
 				/* Must succeed, and moved forward */
 				if (err != 0 || tok == c || *tok != '\0')
 					break;
 
 				/* Sanity check */
 				if (mint >= 16384)
 					break;
 			}
 
 			const zio_taskq_info_t zti = ZTI_SCALE(mint);
 			row[q] = zti;
 			break;
 		}
 
 		/* SYNC: any text after a comma is not examined here. */
 		case ZTI_MODE_SYNC: {
 			const zio_taskq_info_t zti = ZTI_SYNC;
 			row[q] = zti;
 			break;
 		}
 
 		case ZTI_MODE_NULL: {
 			/*
 			 * Can only null the high-priority queues; the general-
 			 * purpose ones have to exist.
 			 */
 			if (q != ZIO_TASKQ_ISSUE_HIGH &&
 			    q != ZIO_TASKQ_INTERRUPT_HIGH)
 				break;
 
 			const zio_taskq_info_t zti = ZTI_NULL;
 			row[q] = zti;
 			break;
 		}
 
 		default:
 			break;
 		}
 
 		/* Ensure we set a mode */
 		if (row[q].zti_mode == ZTI_NMODES)
 			break;
 	}
 
 	/* Didn't get a full row, fail */
 	if (q < ZIO_TASKQ_TYPES)
 		return (SET_ERROR(EINVAL));
 
 	/* Eat trailing space */
 	if (next != NULL)
 		while (isspace(*next))
 			next++;
 
 	/* If there's anything left over then fail */
 	if (next != NULL && *next != '\0')
 		return (SET_ERROR(EINVAL));
 
 	/* Success! Copy it into the real config */
 	for (q = 0; q < ZIO_TASKQ_TYPES; q++)
 		zio_taskqs[t][q] = row[q];
 
 	return (0);
 }
 
 /*
  * Format the live zio_taskqs row for zio type t into buf, in the same
  * syntax that spa_taskq_param_set() accepts: entries separated by single
  * spaces, "fixed" followed by its count and value, and "scale" followed by
  * its value only when that value is non-zero.
  *
  * buf is always NUL-terminated; a newline is appended first when
  * add_newline is set.  The size of buf is not checked here.  Returns the
  * number of characters written, not counting the terminator.
  */
 static int
 spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline)
 {
 	char *cur = buf;
 
 	/* Build parameter string from live config */
 	for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) {
 		const zio_taskq_info_t *zti = &zio_taskqs[t][q];
 		const char *sep = (q == 0) ? "" : " ";
 		const char *mode = modes[zti->zti_mode];
 
 		if (zti->zti_mode == ZTI_MODE_FIXED) {
 			cur += sprintf(cur, "%s%s,%u,%u", sep, mode,
 			    zti->zti_count, zti->zti_value);
 		} else if (zti->zti_mode == ZTI_MODE_SCALE &&
 		    zti->zti_value > 0) {
 			cur += sprintf(cur, "%s%s,%u", sep, mode,
 			    zti->zti_value);
 		} else {
 			cur += sprintf(cur, "%s%s", sep, mode);
 		}
 	}
 
 	if (add_newline)
 		*cur++ = '\n';
 	*cur = '\0';
 
 	return ((int)(cur - buf));
 }
 
 #ifdef __linux__
 /*
  * Linux set handler for the READ row of zio_taskqs.  The parser modifies
  * its input, so it is given a private copy of val.  The parser's error
  * code is returned negated.
  */
 static int
 spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp)
 {
 	char *scratch;
 	int rc;
 
 	scratch = kmem_strdup(val);
 	rc = spa_taskq_param_set(ZIO_TYPE_READ, scratch);
 	kmem_strfree(scratch);
 
 	return (-rc);
 }
 
 /*
  * Linux get handler for the READ row of zio_taskqs: formats the row into
  * buf with a trailing newline and returns the length written.
  */
 static int
 spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp)
 {
 	const int len = spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE);
 
 	return (len);
 }
 
 /*
  * Linux set handler for the WRITE row of zio_taskqs.  The parser modifies
  * its input, so it is given a private copy of val.  The parser's error
  * code is returned negated.
  */
 static int
 spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp)
 {
 	char *scratch;
 	int rc;
 
 	scratch = kmem_strdup(val);
 	rc = spa_taskq_param_set(ZIO_TYPE_WRITE, scratch);
 	kmem_strfree(scratch);
 
 	return (-rc);
 }
 
 /*
  * Linux get handler for the WRITE row of zio_taskqs: formats the row into
  * buf with a trailing newline and returns the length written.
  */
 static int
 spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp)
 {
 	const int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE);
 
 	return (len);
 }
 
 /*
  * Linux set handler for the FREE row of zio_taskqs.  The parser modifies
  * its input, so it is given a private copy of val.  The parser's error
  * code is returned negated.
  */
 static int
 spa_taskq_free_param_set(const char *val, zfs_kernel_param_t *kp)
 {
 	char *scratch;
 	int rc;
 
 	scratch = kmem_strdup(val);
 	rc = spa_taskq_param_set(ZIO_TYPE_FREE, scratch);
 	kmem_strfree(scratch);
 
 	return (-rc);
 }
 
 /*
  * Linux get handler for the FREE row of zio_taskqs: formats the row into
  * buf with a trailing newline and returns the length written.
  */
 static int
 spa_taskq_free_param_get(char *buf, zfs_kernel_param_t *kp)
 {
 	const int len = spa_taskq_param_get(ZIO_TYPE_FREE, buf, TRUE);
 
 	return (len);
 }
 #else
 /*
  * On FreeBSD load-time parameters can be set up before malloc() is available,
  * so we have to do all the parsing work on the stack.
  */
 #define	SPA_TASKQ_PARAM_MAX	(128)
 
 /*
  * Non-Linux handler for the READ row of zio_taskqs.  The current row is
  * formatted into a stack buffer and passed to sysctl_handle_string(); if
  * that succeeds and the request carries a new value, the buffer is parsed
  * and installed.
  */
 static int
 spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS)
 {
 	char cfg[SPA_TASKQ_PARAM_MAX];
 	int rc;
 
 	(void) spa_taskq_param_get(ZIO_TYPE_READ, cfg, FALSE);
 
 	rc = sysctl_handle_string(oidp, cfg, sizeof (cfg), req);
 	if (rc != 0 || req->newptr == NULL) {
 		return (rc);
 	}
 
 	return (spa_taskq_param_set(ZIO_TYPE_READ, cfg));
 }
 
 /*
  * Non-Linux handler for the WRITE row of zio_taskqs.  The current row is
  * formatted into a stack buffer and passed to sysctl_handle_string(); if
  * that succeeds and the request carries a new value, the buffer is parsed
  * and installed.
  */
 static int
 spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS)
 {
 	char cfg[SPA_TASKQ_PARAM_MAX];
 	int rc;
 
 	(void) spa_taskq_param_get(ZIO_TYPE_WRITE, cfg, FALSE);
 
 	rc = sysctl_handle_string(oidp, cfg, sizeof (cfg), req);
 	if (rc != 0 || req->newptr == NULL) {
 		return (rc);
 	}
 
 	return (spa_taskq_param_set(ZIO_TYPE_WRITE, cfg));
 }
 
 /*
  * Non-Linux handler for the FREE row of zio_taskqs.  The current row is
  * formatted into a stack buffer and passed to sysctl_handle_string(); if
  * that succeeds and the request carries a new value, the buffer is parsed
  * and installed.
  */
 static int
 spa_taskq_free_param(ZFS_MODULE_PARAM_ARGS)
 {
 	char cfg[SPA_TASKQ_PARAM_MAX];
 	int rc;
 
 	(void) spa_taskq_param_get(ZIO_TYPE_FREE, cfg, FALSE);
 
 	rc = sysctl_handle_string(oidp, cfg, sizeof (cfg), req);
 	if (rc != 0 || req->newptr == NULL) {
 		return (rc);
 	}
 
 	return (spa_taskq_param_set(ZIO_TYPE_FREE, cfg));
 }
 #endif
 #endif /* _KERNEL */
 
 /*
  * Queue func(zio) on one of the taskqs that serve zio type t and queue
  * type q.  A type may be backed by several discrete taskqs to avoid lock
  * contention on the taskq itself.
  *
  * When there is more than one taskq, write-issue zios that have an
  * allocator are mapped by allocator number; everything else is spread
  * using the current high-resolution time.  cutinline requests TQ_FRONT.
  */
 void
 spa_taskq_dispatch(spa_t *spa, zio_type_t t, zio_taskq_type_t q,
     task_func_t *func, zio_t *zio, boolean_t cutinline)
 {
 	spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q];
 	uint64_t which = 0;
 
 	ASSERT3P(tqs->stqs_taskq, !=, NULL);
 	ASSERT3U(tqs->stqs_count, !=, 0);
 
 	/*
 	 * NB: We are assuming that the zio can only be dispatched
 	 * to a single taskq at a time.  It would be a grievous error
 	 * to dispatch the zio to another taskq at the same time.
 	 */
 	ASSERT(zio);
 	ASSERT(taskq_empty_ent(&zio->io_tqent));
 
 	if (tqs->stqs_count != 1) {
 		if ((t == ZIO_TYPE_WRITE) && (q == ZIO_TASKQ_ISSUE) &&
 		    ZIO_HAS_ALLOCATOR(zio)) {
 			which = zio->io_allocator % tqs->stqs_count;
 		} else {
 			which = ((uint64_t)gethrtime()) % tqs->stqs_count;
 		}
 	}
 
 	taskq_dispatch_ent(tqs->stqs_taskq[which], func, zio,
 	    cutinline ? TQ_FRONT : 0, &zio->io_tqent);
 }
 
 /*
  * Initialize the taskqs for every (zio type, taskq type) combination of
  * this pool.
  */
 static void
 spa_create_zio_taskqs(spa_t *spa)
 {
 	int t, q;
 
 	for (t = 0; t < ZIO_TYPES; t++)
 		for (q = 0; q < ZIO_TASKQ_TYPES; q++)
 			spa_taskqs_init(spa, t, q);
 }
 
 #if defined(_KERNEL) && defined(HAVE_SPA_THREAD)
 /*
  * Main function of the per-pool process that spa_activate() starts with
  * newproc().  arg is the spa_t.
  *
  * It names the process "zpool-<poolname>", publishes itself through
  * spa_proc and spa_did, creates the zio taskqs, and moves spa_proc_state to
  * SPA_PROC_ACTIVE.  It then sleeps on spa_proc_cv until the state leaves
  * SPA_PROC_ACTIVE, marks itself SPA_PROC_GONE, resets spa_proc to &p0 and
  * exits.
  */
 static void
 spa_thread(void *arg)
 {
 	/*
 	 * NOTE(review): this is a local fixed at PS_NONE, so the bind branch
 	 * below is never entered as written.
 	 */
 	psetid_t zio_taskq_psrset_bind = PS_NONE;
 	callb_cpr_t cprinfo;
 
 	spa_t *spa = arg;
 	user_t *pu = PTOU(curproc);
 
 	CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr,
 	    spa->spa_name);
 
 	ASSERT(curproc != &p0);
 	(void) snprintf(pu->u_psargs, sizeof (pu->u_psargs),
 	    "zpool-%s", spa->spa_name);
 	(void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm));
 
 	/* bind this thread to the requested psrset */
 	if (zio_taskq_psrset_bind != PS_NONE) {
 		pool_lock();
 		mutex_enter(&cpu_lock);
 		mutex_enter(&pidlock);
 		mutex_enter(&curproc->p_lock);
 
 		if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind,
 		    0, NULL, NULL) == 0)  {
 			curthread->t_bind_pset = zio_taskq_psrset_bind;
 		} else {
 			cmn_err(CE_WARN,
 			    "Couldn't bind process for zfs pool \"%s\" to "
 			    "pset %d\n", spa->spa_name, zio_taskq_psrset_bind);
 		}
 
 		/* Locks are dropped in the reverse order of acquisition. */
 		mutex_exit(&curproc->p_lock);
 		mutex_exit(&pidlock);
 		mutex_exit(&cpu_lock);
 		pool_unlock();
 	}
 
 #ifdef HAVE_SYSDC
 	if (zio_taskq_sysdc) {
 		sysdc_thread_enter(curthread, 100, 0);
 	}
 #endif
 
 	/* spa_proc must be set before the taskqs are created against it. */
 	spa->spa_proc = curproc;
 	spa->spa_did = curthread->t_did;
 
 	spa_create_zio_taskqs(spa);
 
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_CREATED);
 
 	/* Wake spa_activate(), which waits for the state to leave CREATED. */
 	spa->spa_proc_state = SPA_PROC_ACTIVE;
 	cv_broadcast(&spa->spa_proc_cv);
 
 	CALLB_CPR_SAFE_BEGIN(&cprinfo);
 	while (spa->spa_proc_state == SPA_PROC_ACTIVE)
 		cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 	CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock);
 
 	/* spa_deactivate() asked us to go away; acknowledge and exit. */
 	ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE);
 	spa->spa_proc_state = SPA_PROC_GONE;
 	spa->spa_proc = &p0;
 	cv_broadcast(&spa->spa_proc_cv);
 	CALLB_CPR_EXIT(&cprinfo);	/* drops spa_proc_lock */
 
 	mutex_enter(&curproc->p_lock);
 	lwp_exit();
 }
 #endif
 
 extern metaslab_ops_t *metaslab_allocator(spa_t *spa);
 
 /*
  * Activate an uninitialized pool.
  *
  * Moves the spa from POOL_STATE_UNINITIALIZED to POOL_STATE_ACTIVE with the
  * given open mode and creates its in-core infrastructure: the metaslab
  * classes, the zio taskqs (inside a covering process when one can be
  * created, otherwise directly), the per-txg root zios, the dirty/evicting
  * lists, the error trees, the keystore and the auxiliary per-pool taskqs.
  * spa_deactivate() undoes this.
  */
 static void
 spa_activate(spa_t *spa, spa_mode_t mode)
 {
 	metaslab_ops_t *msp = metaslab_allocator(spa);
 	ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED);
 
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_final_txg = UINT64_MAX;
 	spa->spa_mode = mode;
 	spa->spa_read_spacemaps = spa_mode_readable_spacemaps;
 
 	/* All classes share the same metaslab ops. */
 	spa->spa_normal_class = metaslab_class_create(spa, "normal",
 	    msp, B_FALSE);
 	spa->spa_log_class = metaslab_class_create(spa, "log", msp, B_TRUE);
 	spa->spa_embedded_log_class = metaslab_class_create(spa,
 	    "embedded_log", msp, B_TRUE);
 	spa->spa_special_class = metaslab_class_create(spa, "special",
 	    msp, B_FALSE);
 	spa->spa_special_embedded_log_class = metaslab_class_create(spa,
 	    "special_embedded_log", msp, B_TRUE);
 	spa->spa_dedup_class = metaslab_class_create(spa, "dedup",
 	    msp, B_FALSE);
 
 	/* Try to create a covering process */
 	mutex_enter(&spa->spa_proc_lock);
 	ASSERT(spa->spa_proc_state == SPA_PROC_NONE);
 	ASSERT(spa->spa_proc == &p0);
 	spa->spa_did = 0;
 
 #ifdef HAVE_SPA_THREAD
 	/* Only create a process if we're going to be around a while. */
 	if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) {
 		if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri,
 		    NULL, 0) == 0) {
 			spa->spa_proc_state = SPA_PROC_CREATED;
 			/* spa_thread() moves the state on and broadcasts. */
 			while (spa->spa_proc_state == SPA_PROC_CREATED) {
 				cv_wait(&spa->spa_proc_cv,
 				    &spa->spa_proc_lock);
 			}
 			ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 			ASSERT(spa->spa_proc != &p0);
 			ASSERT(spa->spa_did != 0);
 		} else {
 #ifdef _KERNEL
 			cmn_err(CE_WARN,
 			    "Couldn't create process for zfs pool \"%s\"\n",
 			    spa->spa_name);
 #endif
 		}
 	}
 #endif /* HAVE_SPA_THREAD */
 	mutex_exit(&spa->spa_proc_lock);
 
 	/* If we didn't create a process, we need to create our taskqs. */
 	if (spa->spa_proc == &p0) {
 		spa_create_zio_taskqs(spa);
 	}
 
 	/* One root zio per open txg slot; spa_deactivate() waits on them. */
 	for (size_t i = 0; i < TXG_SIZE; i++) {
 		spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL);
 	}
 
 	list_create(&spa->spa_config_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_config_dirty_node));
 	list_create(&spa->spa_evicting_os_list, sizeof (objset_t),
 	    offsetof(objset_t, os_evicting_node));
 	list_create(&spa->spa_state_dirty_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_state_dirty_node));
 
 	txg_list_create(&spa->spa_vdev_txg_list, spa,
 	    offsetof(struct vdev, vdev_txg_node));
 
 	/* The three error trees share one comparator and node layout. */
 	avl_create(&spa->spa_errlist_scrub,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_last,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 	avl_create(&spa->spa_errlist_healed,
 	    spa_error_entry_compare, sizeof (spa_error_entry_t),
 	    offsetof(spa_error_entry_t, se_avl));
 
 	spa_activate_os(spa);
 
 	spa_keystore_init(&spa->spa_keystore);
 
 	/*
 	 * This taskq is used to perform zvol-minor-related tasks
 	 * asynchronously. This has several advantages, including easy
 	 * resolution of various deadlocks.
 	 *
 	 * The taskq must be single threaded to ensure tasks are always
 	 * processed in the order in which they were dispatched.
 	 *
 	 * A taskq per pool allows one to keep the pools independent.
 	 * This way if one pool is suspended, it will not impact another.
 	 *
 	 * The preferred location to dispatch a zvol minor task is a sync
 	 * task. In this context, there is easy access to the spa_t and minimal
 	 * error handling is required because the sync task must succeed.
 	 */
 	spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri,
 	    1, INT_MAX, 0);
 
 	/*
 	 * The taskq to preload metaslabs.
 	 */
 	spa->spa_metaslab_taskq = taskq_create("z_metaslab",
 	    metaslab_preload_pct, maxclsyspri, 1, INT_MAX,
 	    TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	/*
 	 * Taskq dedicated to prefetcher threads: this is used to prevent the
 	 * pool traverse code from monopolizing the global (and limited)
 	 * system_taskq by inappropriately scheduling long running tasks on it.
 	 */
 	spa->spa_prefetch_taskq = taskq_create("z_prefetch", 100,
 	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 
 	/*
 	 * The taskq to upgrade datasets in this pool. Currently used by
 	 * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA.
 	 */
 	spa->spa_upgrade_taskq = taskq_create("z_upgrade", 100,
 	    defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);
 }
 
 /*
  * Opposite of spa_activate().
  *
  * Tears down what spa_activate() set up: the auxiliary taskqs, lists, zio
  * taskqs, per-txg root zios, metaslab classes, error trees and keystore,
  * then returns the spa to POOL_STATE_UNINITIALIZED and shuts down the
  * covering process if there is one.  The assertions below require that
  * syncing is off and that the dsl pool, root vdev and async zio root are
  * already gone.
  */
 static void
 spa_deactivate(spa_t *spa)
 {
 	ASSERT(spa->spa_sync_on == B_FALSE);
 	ASSERT0P(spa->spa_dsl_pool);
 	ASSERT0P(spa->spa_root_vdev);
 	ASSERT0P(spa->spa_async_zio_root);
 	ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED);
 
 	spa_evicting_os_wait(spa);
 
 	/* Each auxiliary taskq is destroyed only if it was created. */
 	if (spa->spa_zvol_taskq) {
 		taskq_destroy(spa->spa_zvol_taskq);
 		spa->spa_zvol_taskq = NULL;
 	}
 
 	if (spa->spa_metaslab_taskq) {
 		taskq_destroy(spa->spa_metaslab_taskq);
 		spa->spa_metaslab_taskq = NULL;
 	}
 
 	if (spa->spa_prefetch_taskq) {
 		taskq_destroy(spa->spa_prefetch_taskq);
 		spa->spa_prefetch_taskq = NULL;
 	}
 
 	if (spa->spa_upgrade_taskq) {
 		taskq_destroy(spa->spa_upgrade_taskq);
 		spa->spa_upgrade_taskq = NULL;
 	}
 
 	txg_list_destroy(&spa->spa_vdev_txg_list);
 
 	list_destroy(&spa->spa_config_dirty_list);
 	list_destroy(&spa->spa_evicting_os_list);
 	list_destroy(&spa->spa_state_dirty_list);
 
-	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
 
 	for (int t = 0; t < ZIO_TYPES; t++) {
 		for (int q = 0; q < ZIO_TASKQ_TYPES; q++) {
 			spa_taskqs_fini(spa, t, q);
 		}
 	}
 
 	/* Wait for the per-txg root zios; they must complete without error. */
 	for (size_t i = 0; i < TXG_SIZE; i++) {
 		ASSERT3P(spa->spa_txg_zio[i], !=, NULL);
 		VERIFY0(zio_wait(spa->spa_txg_zio[i]));
 		spa->spa_txg_zio[i] = NULL;
 	}
 
 	metaslab_class_destroy(spa->spa_normal_class);
 	spa->spa_normal_class = NULL;
 
 	metaslab_class_destroy(spa->spa_log_class);
 	spa->spa_log_class = NULL;
 
 	metaslab_class_destroy(spa->spa_embedded_log_class);
 	spa->spa_embedded_log_class = NULL;
 
 	metaslab_class_destroy(spa->spa_special_class);
 	spa->spa_special_class = NULL;
 
 	metaslab_class_destroy(spa->spa_special_embedded_log_class);
 	spa->spa_special_embedded_log_class = NULL;
 
 	metaslab_class_destroy(spa->spa_dedup_class);
 	spa->spa_dedup_class = NULL;
 
 	/*
 	 * If this was part of an import or the open otherwise failed, we may
 	 * still have errors left in the queues.  Empty them just in case.
 	 */
 	spa_errlog_drain(spa);
 	avl_destroy(&spa->spa_errlist_scrub);
 	avl_destroy(&spa->spa_errlist_last);
 	avl_destroy(&spa->spa_errlist_healed);
 
 	spa_keystore_fini(&spa->spa_keystore);
 
 	spa->spa_state = POOL_STATE_UNINITIALIZED;
 
 	/*
 	 * If a covering process exists, ask it to exit and wait until it
 	 * reports SPA_PROC_GONE (see spa_thread()).
 	 */
 	mutex_enter(&spa->spa_proc_lock);
 	if (spa->spa_proc_state != SPA_PROC_NONE) {
 		ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE);
 		spa->spa_proc_state = SPA_PROC_DEACTIVATE;
 		cv_broadcast(&spa->spa_proc_cv);
 		while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) {
 			ASSERT(spa->spa_proc != &p0);
 			cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock);
 		}
 		ASSERT(spa->spa_proc_state == SPA_PROC_GONE);
 		spa->spa_proc_state = SPA_PROC_NONE;
 	}
 	ASSERT(spa->spa_proc == &p0);
 	mutex_exit(&spa->spa_proc_lock);
 
 	/*
 	 * We want to make sure spa_thread() has actually exited the ZFS
 	 * module, so that the module can't be unloaded out from underneath
 	 * it.
 	 */
 	if (spa->spa_did != 0) {
 		thread_join(spa->spa_did);
 		spa->spa_did = 0;
 	}
 
 	spa_deactivate_os(spa);
 
 }
 
 /*
  * Verify a pool configuration, and construct the vdev tree appropriately.
  * This will create all the necessary vdevs in the appropriate layout, with
  * each vdev in the CLOSED state.  This will prep the pool before
  * open/creation/import.  All vdev validation is done by the vdev_alloc()
  * routine.
  *
  * The vdev for nv is returned through vdp and its children, if any, are
  * parsed recursively.  A non-leaf vdev without a children array is
  * accepted as is.  If the children cannot be looked up (EINVAL) or a child
  * fails to parse (that child's error), the vdev allocated here is freed
  * and *vdp is set to NULL.
  */
 int
 spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent,
     uint_t id, int atype)
 {
 	nvlist_t **kids;
 	uint_t nkids;
 	int error;
 
 	error = vdev_alloc(spa, vdp, nv, parent, id, atype);
 	if (error != 0)
 		return (error);
 
 	/* Leaves have no children to descend into. */
 	if ((*vdp)->vdev_ops->vdev_op_leaf)
 		return (0);
 
 	error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 	    &kids, &nkids);
 	if (error == ENOENT)
 		return (0);
 	if (error != 0) {
 		error = SET_ERROR(EINVAL);
 		goto fail;
 	}
 
 	for (int c = 0; c < nkids; c++) {
 		vdev_t *vd;
 
 		error = spa_config_parse(spa, &vd, kids[c], *vdp, c, atype);
 		if (error != 0)
 			goto fail;
 	}
 
 	ASSERT(*vdp != NULL);
 
 	return (0);
 
 fail:
 	vdev_free(*vdp);
 	*vdp = NULL;
 	return (error);
 }
 
 /*
  * Decide whether spa_unload() should flush all metaslabs first.  Every
  * condition below must hold; they are tested in the listed order.
  */
 static boolean_t
 spa_should_flush_logs_on_unload(spa_t *spa)
 {
 	if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP) &&
 	    spa_writeable(spa) &&
 	    spa->spa_sync_on &&
 	    spa_state(spa) == POOL_STATE_EXPORTED &&
 	    !zfs_keep_log_spacemaps_at_export)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Opens a transaction that will set the flag that will instruct
  * spa_sync to attempt to flush all the metaslabs for that txg.
  */
 static void
 spa_unload_log_sm_flush_all(spa_t *spa)
 {
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT | DMU_TX_SUSPEND));
 
 	/* The flush-all txg must not already be set. */
 	ASSERT0(spa->spa_log_flushall_txg);
 
 	/* Record the txg this tx was assigned to as the flush-all txg. */
 	spa->spa_log_flushall_txg = dmu_tx_get_txg(tx);
 	dmu_tx_commit(tx);
 
 	/* Block until that txg has been synced. */
 	txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg);
 }
 
 /*
  * Free the in-core log space map bookkeeping: every node of
  * spa_sm_logs_by_txg, every entry of spa_log_summary, and the
  * unflushed-stats counters.
  */
 static void
 spa_unload_log_sm_metadata(spa_t *spa)
 {
 	void *walk = NULL;
 
 	for (;;) {
 		spa_log_sm_t *log =
 		    avl_destroy_nodes(&spa->spa_sm_logs_by_txg, &walk);
 		if (log == NULL)
 			break;
 
 		/* No metaslab may still be counted against this log. */
 		VERIFY0(log->sls_mscount);
 		kmem_free(log, sizeof (*log));
 	}
 
 	for (;;) {
 		log_summary_entry_t *entry =
 		    list_remove_head(&spa->spa_log_summary);
 		if (entry == NULL)
 			break;
 
 		VERIFY0(entry->lse_mscount);
 		kmem_free(entry, sizeof (*entry));
 	}
 
 	spa->spa_unflushed_stats.sus_nblocks = 0;
 	spa->spa_unflushed_stats.sus_memused = 0;
 	spa->spa_unflushed_stats.sus_blocklimit = 0;
 }
 
 /*
  * Destroy each auxiliary zthr that currently exists and clear its pointer
  * in the spa.  Threads are handled in the order listed in the table.
  */
 static void
 spa_destroy_aux_threads(spa_t *spa)
 {
 	zthr_t **slots[] = {
 		&spa->spa_condense_zthr,
 		&spa->spa_checkpoint_discard_zthr,
 		&spa->spa_livelist_delete_zthr,
 		&spa->spa_livelist_condense_zthr,
 		&spa->spa_raidz_expand_zthr,
 	};
 
 	for (size_t i = 0; i < sizeof (slots) / sizeof (slots[0]); i++) {
 		zthr_t **slot = slots[i];
 
 		if (*slot == NULL)
 			continue;
 
 		zthr_destroy(*slot);
 		*slot = NULL;
 	}
 }
 
 /*
  * Note the (time, txg) pair in the in-core txg time database and, less
  * often, write the three resolution tables to the MOS directory.  Both
  * steps are gated by their own interval (spa_note_txg_time and
  * spa_flush_txg_time respectively).
  */
 static void
 spa_sync_time_logger(spa_t *spa, uint64_t txg)
 {
 	/* Nothing is recorded for a pool that is not writeable. */
 	if (!spa_writeable(spa))
 		return;
 
 	uint64_t now = gethrestime_sec();
 
 	/* Too soon since the last note: do nothing at all. */
 	if (now < spa->spa_last_noted_txg_time + spa_note_txg_time)
 		return;
 
 	/* Only a txg newer than the last noted one is added. */
 	if (txg > spa->spa_last_noted_txg) {
 		spa->spa_last_noted_txg_time = now;
 		spa->spa_last_noted_txg = txg;
 
 		mutex_enter(&spa->spa_txg_log_time_lock);
 		dbrrd_add(&spa->spa_txg_log_time, now, txg);
 		mutex_exit(&spa->spa_txg_log_time_lock);
 	}
 
 	/* Too soon since the last on-disk flush: keep it in core only. */
 	if (now < spa->spa_last_flush_txg_time + spa_flush_txg_time)
 		return;
 
 	spa->spa_last_flush_txg_time = now;
 
 	dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
 	objset_t *mos = spa_meta_objset(spa);
 
 	VERIFY0(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
 	    &spa->spa_txg_log_time.dbr_minutes, tx));
 	VERIFY0(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
 	    &spa->spa_txg_log_time.dbr_days, tx));
 	VERIFY0(zap_update(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
 	    &spa->spa_txg_log_time.dbr_months, tx));
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Run spa_sync_time_logger() once more during unload, in the txg of a
  * freshly assigned transaction.
  */
 static void
 spa_unload_sync_time_logger(spa_t *spa)
 {
 	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 
 	VERIFY0(dmu_tx_assign(tx, DMU_TX_WAIT));
 
 	uint64_t txg = dmu_tx_get_txg(tx);
 
 	/*
 	 * Reset both timestamps that spa_sync_time_logger() compares
 	 * against in its interval checks before calling it.
 	 */
 	spa->spa_last_noted_txg_time = 0;
 	spa->spa_last_flush_txg_time = 0;
 	spa_sync_time_logger(spa, txg);
 
 	dmu_tx_commit(tx);
 }
 
 /*
  * Read the minute, day and month txg time tables from the MOS directory
  * into spa_txg_log_time.  A missing entry (ENOENT) is silent; any other
  * lookup error is only noted, and the remaining tables are still tried.
  */
 static void
 spa_load_txg_log_time(spa_t *spa)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	int err;
 
 	err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TXG_LOG_TIME_MINUTES, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
 	    &spa->spa_txg_log_time.dbr_minutes);
 	if (!(err == 0 || err == ENOENT)) {
 		spa_load_note(spa, "unable to load a txg time database with "
 		    "minute resolution [error=%d]", err);
 	}
 
 	err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TXG_LOG_TIME_DAYS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
 	    &spa->spa_txg_log_time.dbr_days);
 	if (!(err == 0 || err == ENOENT)) {
 		spa_load_note(spa, "unable to load a txg time database with "
 		    "day resolution [error=%d]", err);
 	}
 
 	err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_TXG_LOG_TIME_MONTHS, RRD_ENTRY_SIZE, RRD_STRUCT_ELEM,
 	    &spa->spa_txg_log_time.dbr_months);
 	if (!(err == 0 || err == ENOENT)) {
 		spa_load_note(spa, "unable to load a txg time database with "
 		    "month resolution [error=%d]", err);
 	}
 }
 
 /*
  * Decide whether spa_unload() should run the txg time logger one last
  * time.  All conditions must hold; they are tested in the listed order.
  */
 static boolean_t
 spa_should_sync_time_logger_on_unload(spa_t *spa)
 {
 	if (spa_writeable(spa) &&
 	    spa->spa_sync_on &&
 	    spa_state(spa) == POOL_STATE_EXPORTED &&
 	    spa->spa_last_noted_txg != 0)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 
 /*
  * Opposite of spa_load().
  *
  * In order: optionally logs the txg time and flushes log space maps,
  * suspends async tasks and stops vdev background activity, stops the sync
  * thread, waits for outstanding async I/O, destroys the removal state and
  * auxiliary threads, then (under SCL_ALL as writer) frees the vdev tree,
  * closes the DSL pool, and releases the spare and l2cache device lists and
  * the cached comment/compatibility strings.
  */
 static void
 spa_unload(spa_t *spa)
 {
 	/* Caller holds the namespace lock or is this pool's export thread. */
 	ASSERT(spa_namespace_held() ||
 	    spa->spa_export_thread == curthread);
 	ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED);
 
 	spa_import_progress_remove(spa_guid(spa));
 	spa_load_note(spa, "UNLOADING");
 
 	spa_wake_waiters(spa);
 
 	/*
 	 * If we have set the spa_final_txg, we have already performed the
 	 * tasks below in spa_export_common(). We should not redo it here since
 	 * we delay the final TXGs beyond what spa_final_txg is set at.
 	 * (UINT64_MAX is the "not yet set" value tested here.)
 	 */
 	if (spa->spa_final_txg == UINT64_MAX) {
 		if (spa_should_sync_time_logger_on_unload(spa))
 			spa_unload_sync_time_logger(spa);
 
 		/*
 		 * If the log space map feature is enabled and the pool is
 		 * getting exported (but not destroyed), we want to spend some
 		 * time flushing as many metaslabs as we can in an attempt to
 		 * destroy log space maps and save import time.
 		 */
 		if (spa_should_flush_logs_on_unload(spa))
 			spa_unload_log_sm_flush_all(spa);
 
 		/*
 		 * Stop async tasks.
 		 */
 		spa_async_suspend(spa);
 
 		/* Stop initialize, TRIM, autotrim, rebuild and L2ARC rebuild. */
 		if (spa->spa_root_vdev) {
 			vdev_t *root_vdev = spa->spa_root_vdev;
 			vdev_initialize_stop_all(root_vdev,
 			    VDEV_INITIALIZE_ACTIVE);
 			vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE);
 			vdev_autotrim_stop_all(spa);
 			vdev_rebuild_stop_all(spa);
 			l2arc_spa_rebuild_stop(spa);
 		}
 
 		/* Fix the final txg while holding all config locks as writer. */
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa->spa_final_txg = spa_last_synced_txg(spa) +
 		    TXG_DEFER_SIZE + 1;
 		spa_config_exit(spa, SCL_ALL, FTAG);
 	}
 
 	/*
 	 * Stop syncing.
 	 */
 	if (spa->spa_sync_on) {
 		txg_sync_stop(spa->spa_dsl_pool);
 		spa->spa_sync_on = B_FALSE;
 	}
 
 	/*
 	 * This ensures that there is no async metaslab prefetching
 	 * while we attempt to unload the spa.
 	 */
 	taskq_wait(spa->spa_metaslab_taskq);
 
 	if (spa->spa_mmp.mmp_thread)
 		mmp_thread_stop(spa);
 
 	/*
 	 * Wait for any outstanding async I/O to complete.
 	 * There is one root zio per CPU slot (max_ncpus entries); the array
 	 * itself is freed once every root has been waited on.
 	 */
 	if (spa->spa_async_zio_root != NULL) {
 		for (int i = 0; i < max_ncpus; i++)
 			(void) zio_wait(spa->spa_async_zio_root[i]);
 		kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *));
 		spa->spa_async_zio_root = NULL;
 	}
 
 	if (spa->spa_vdev_removal != NULL) {
 		spa_vdev_removal_destroy(spa->spa_vdev_removal);
 		spa->spa_vdev_removal = NULL;
 	}
 
 	spa_destroy_aux_threads(spa);
 
 	spa_condense_fini(spa);
 
 	bpobj_close(&spa->spa_deferred_bpobj);
 
 	/*
 	 * Everything from here to the end of the function runs with all
 	 * config locks held as writer, tagged with the spa itself; the
 	 * matching spa_config_exit() is the last statement.
 	 */
 	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
 
 	/*
 	 * Close all vdevs.
 	 */
 	if (spa->spa_root_vdev)
 		vdev_free(spa->spa_root_vdev);
 	/* spa_root_vdev is expected to be NULL once the root is freed. */
 	ASSERT0P(spa->spa_root_vdev);
 
 	/*
 	 * Close the dsl pool.
 	 */
 	if (spa->spa_dsl_pool) {
 		dsl_pool_close(spa->spa_dsl_pool);
 		spa->spa_dsl_pool = NULL;
 		spa->spa_meta_objset = NULL;
 	}
 
 	ddt_unload(spa);
 	brt_unload(spa);
 	spa_unload_log_sm_metadata(spa);
 
 	/*
 	 * Drop and purge level 2 cache
 	 */
 	spa_l2cache_drop(spa);
 
 	/* Free the spare vdevs, their pointer array, and the spare config. */
 	if (spa->spa_spares.sav_vdevs) {
 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			vdev_free(spa->spa_spares.sav_vdevs[i]);
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 		spa->spa_spares.sav_vdevs = NULL;
 	}
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 	}
 	spa->spa_spares.sav_count = 0;
 
 	/*
 	 * Same for the l2cache devices, except that each vdev's stats are
 	 * cleared before it is freed.
 	 */
 	if (spa->spa_l2cache.sav_vdevs) {
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++) {
 			vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]);
 			vdev_free(spa->spa_l2cache.sav_vdevs[i]);
 		}
 		kmem_free(spa->spa_l2cache.sav_vdevs,
 		    spa->spa_l2cache.sav_count * sizeof (void *));
 		spa->spa_l2cache.sav_vdevs = NULL;
 	}
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 	}
 	spa->spa_l2cache.sav_count = 0;
 
 	/* Reset the remaining per-load state back to its initial values. */
 	spa->spa_async_suspended = 0;
 
 	spa->spa_indirect_vdevs_loaded = B_FALSE;
 
 	if (spa->spa_comment != NULL) {
 		spa_strfree(spa->spa_comment);
 		spa->spa_comment = NULL;
 	}
 	if (spa->spa_compatibility != NULL) {
 		spa_strfree(spa->spa_compatibility);
 		spa->spa_compatibility = NULL;
 	}
 
 	spa->spa_raidz_expand = NULL;
 	spa->spa_checkpoint_txg = 0;
 
 	spa_config_exit(spa, SCL_ALL, spa);
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active spares for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_spares.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  */
 void
 spa_load_spares(spa_t *spa)
 {
 	nvlist_t **spares;
 	uint_t nspares;
 	int i;
 	vdev_t *vd, *tvd;
 
 #ifndef _KERNEL
 	/*
 	 * zdb opens both the current state of the pool and the
 	 * checkpointed state (if present), with a different spa_t.
 	 *
 	 * As spare vdevs are shared among open pools, we skip loading
 	 * them when we load the checkpointed state of the pool.
 	 */
 	if (!spa_writeable(spa))
 		return;
 #endif
 
 	/* The caller must hold every config lock as writer. */
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * First, close and free any existing spare vdevs.
 	 */
 	if (spa->spa_spares.sav_vdevs) {
 		for (i = 0; i < spa->spa_spares.sav_count; i++) {
 			vd = spa->spa_spares.sav_vdevs[i];
 
 			/*
 			 * Undo the spa_spare_add()/spa_spare_activate()
 			 * calls made for the in-pool vdev further below.
 			 */
 			if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 			    B_FALSE)) != NULL && tvd->vdev_isspare)
 				spa_spare_remove(tvd);
 			vdev_close(vd);
 			vdev_free(vd);
 		}
 
 		/* sav_vdevs is reset to NULL just below. */
 		kmem_free(spa->spa_spares.sav_vdevs,
 		    spa->spa_spares.sav_count * sizeof (void *));
 	}
 
 	/* Without a config there are no spares to load. */
 	if (spa->spa_spares.sav_config == NULL)
 		nspares = 0;
 	else
 		VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, &spares, &nspares));
 
 	spa->spa_spares.sav_count = (int)nspares;
 	spa->spa_spares.sav_vdevs = NULL;
 
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Construct the array of vdevs, opening them to get status in the
 	 * process.   For each spare, there are potentially two different
 	 * vdev_t structures associated with it: one in the list of spares
 	 * (used only for basic validation purposes) and one in the active
 	 * vdev configuration (if it's spared in).  During this phase we open
 	 * and validate each vdev on the spare list.  If the vdev also exists
 	 * in the active configuration, then we also mark this vdev as an
 	 * active spare.
 	 */
 	spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++) {
 		VERIFY0(spa_config_parse(spa, &vd, spares[i], NULL, 0,
 		    VDEV_ALLOC_SPARE));
 		ASSERT(vd != NULL);
 
 		spa->spa_spares.sav_vdevs[i] = vd;
 
 		/* tvd is the vdev with the same guid inside this pool, if any. */
 		if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid,
 		    B_FALSE)) != NULL) {
 			if (!tvd->vdev_isspare)
 				spa_spare_add(tvd);
 
 			/*
 			 * We only mark the spare active if we were successfully
 			 * able to load the vdev.  Otherwise, importing a pool
 			 * with a bad active spare would result in strange
 			 * behavior, because multiple pools would think the
 			 * spare is actively in use.
 			 *
 			 * There is a vulnerability here to an equally bizarre
 			 * circumstance, where a dead active spare is later
 			 * brought back to life (onlined or otherwise).  Given
 			 * the rarity of this scenario, and the extra complexity
 			 * it adds, we ignore the possibility.
 			 */
 			if (!vdev_is_dead(tvd))
 				spa_spare_activate(tvd);
 		}
 
 		/* A spare is its own top-level vdev and belongs to spa_spares. */
 		vd->vdev_top = vd;
 		vd->vdev_aux = &spa->spa_spares;
 
 		/* A spare that fails to open stays in the list unregistered. */
 		if (vdev_open(vd) != 0)
 			continue;
 
 		if (vdev_validate_aux(vd) == 0)
 			spa_spare_add(vd);
 	}
 
 	/*
 	 * Recompute the stashed list of spares, with status information
 	 * this time.
 	 */
 	fnvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES);
 
 	/*
 	 * 'spares' is reused here for a temporary array of freshly generated
 	 * nvlists; both the nvlists and the array are freed again once they
 	 * have been added to sav_config.
 	 */
 	spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *),
 	    KM_SLEEP);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		spares[i] = vdev_config_generate(spa,
 		    spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE);
 	fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 	    spa->spa_spares.sav_count);
 	for (i = 0; i < spa->spa_spares.sav_count; i++)
 		nvlist_free(spares[i]);
 	kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *));
 }
 
 /*
  * Load (or re-load) the current list of vdevs describing the active l2cache for
  * this pool.  When this is called, we have some form of basic information in
  * 'spa_l2cache.sav_config'.  We parse this into vdevs, try to open them, and
  * then re-generate a more complete list including status information.
  * Devices which are already active have their details maintained, and are
  * not re-opened.
  */
 void
 spa_load_l2cache(spa_t *spa)
 {
 	nvlist_t **l2cache = NULL;
 	uint_t nl2cache;
 	int i, j, oldnvdevs;
 	uint64_t guid;
 	vdev_t *vd, **oldvdevs, **newvdevs;
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 #ifndef _KERNEL
 	/*
 	 * zdb opens both the current state of the pool and the
 	 * checkpointed state (if present), with a different spa_t.
 	 *
 	 * As L2 caches are part of the ARC which is shared among open
 	 * pools, we skip loading them when we load the checkpointed
 	 * state of the pool.
 	 */
 	if (!spa_writeable(spa))
 		return;
 #endif
 
 	/* The caller must hold every config lock as writer. */
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/*
 	 * Detach the current vdev array from sav.  Entries that are still
 	 * wanted are moved into newvdevs below; whatever is left in oldvdevs
 	 * is purged at 'out'.
 	 */
 	oldvdevs = sav->sav_vdevs;
 	oldnvdevs = sav->sav_count;
 	sav->sav_vdevs = NULL;
 	sav->sav_count = 0;
 
 	/* No config: nothing to load, only the old devices to purge. */
 	if (sav->sav_config == NULL) {
 		nl2cache = 0;
 		newvdevs = NULL;
 		goto out;
 	}
 
 	VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
 	newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP);
 
 	/*
 	 * Process new nvlist of vdevs.
 	 */
 	for (i = 0; i < nl2cache; i++) {
 		guid = fnvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID);
 
 		/* Look for an existing vdev with the same guid. */
 		newvdevs[i] = NULL;
 		for (j = 0; j < oldnvdevs; j++) {
 			vd = oldvdevs[j];
 			if (vd != NULL && guid == vd->vdev_guid) {
 				/*
 				 * Retain previous vdev for add/remove ops.
 				 * Clearing the old slot keeps it from being
 				 * purged at 'out'.
 				 */
 				newvdevs[i] = vd;
 				oldvdevs[j] = NULL;
 				break;
 			}
 		}
 
 		if (newvdevs[i] == NULL) {
 			/*
 			 * Create new vdev
 			 */
 			VERIFY0(spa_config_parse(spa, &vd, l2cache[i], NULL, 0,
 			    VDEV_ALLOC_L2CACHE));
 			ASSERT(vd != NULL);
 			newvdevs[i] = vd;
 
 			/*
 			 * Commit this vdev as an l2cache device,
 			 * even if it fails to open.
 			 */
 			spa_l2cache_add(vd);
 
 			vd->vdev_top = vd;
 			vd->vdev_aux = sav;
 
 			spa_l2cache_activate(vd);
 
 			/*
 			 * On open failure the validation, l2arc_add_vdev()
 			 * and TRIM request below are all skipped.
 			 */
 			if (vdev_open(vd) != 0)
 				continue;
 
 			(void) vdev_validate_aux(vd);
 
 			if (!vdev_is_dead(vd))
 				l2arc_add_vdev(spa, vd);
 
 			/*
 			 * Upon cache device addition to a pool or pool
 			 * creation with a cache device or if the header
 			 * of the device is invalid we issue an async
 			 * TRIM command for the whole device which will
 			 * execute if l2arc_trim_ahead > 0.
 			 */
 			spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM);
 		}
 	}
 
 	sav->sav_vdevs = newvdevs;
 	sav->sav_count = (int)nl2cache;
 
 	/*
 	 * Recompute the stashed list of l2cache devices, with status
 	 * information this time.
 	 */
 	fnvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE);
 
 	/*
 	 * From here on 'l2cache' is reused for a temporary array of freshly
 	 * generated nvlists; it is only reallocated when there is at least
 	 * one device, and is freed after 'out' under the same condition.
 	 */
 	if (sav->sav_count > 0)
 		l2cache = kmem_alloc(sav->sav_count * sizeof (void *),
 		    KM_SLEEP);
 	for (i = 0; i < sav->sav_count; i++)
 		l2cache[i] = vdev_config_generate(spa,
 		    sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE);
 	fnvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE,
 	    (const nvlist_t * const *)l2cache, sav->sav_count);
 
 out:
 	/*
 	 * Purge vdevs that were dropped
 	 */
 	if (oldvdevs) {
 		for (i = 0; i < oldnvdevs; i++) {
 			uint64_t pool;
 
 			vd = oldvdevs[i];
 			if (vd != NULL) {
 				ASSERT(vd->vdev_isl2cache);
 
 				/*
 				 * Remove the device from the L2ARC only if it
 				 * is registered with a non-zero pool guid and
 				 * the L2ARC currently knows about it.
 				 */
 				if (spa_l2cache_exists(vd->vdev_guid, &pool) &&
 				    pool != 0ULL && l2arc_vdev_present(vd))
 					l2arc_remove_vdev(vd);
 				vdev_clear_stats(vd);
 				vdev_free(vd);
 			}
 		}
 
 		kmem_free(oldvdevs, oldnvdevs * sizeof (void *));
 	}
 
 	/*
 	 * Free the temporary nvlists and their array.  sav_count is still 0
 	 * when we arrive here through the early 'goto out', so both
 	 * statements are no-ops in that case.
 	 */
 	for (i = 0; i < sav->sav_count; i++)
 		nvlist_free(l2cache[i]);
 	if (sav->sav_count)
 		kmem_free(l2cache, sav->sav_count * sizeof (void *));
 }
 
 /*
  * Read the packed nvlist stored in MOS object 'obj' and unpack it into
  * *value.  *value is set to NULL up front; 0 or the first error hit is
  * returned.
  */
 static int
 load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	dmu_buf_t *bonus;
 	size_t len;
 	char *buf;
 	int err;
 
 	*value = NULL;
 
 	err = dmu_bonus_hold(mos, obj, FTAG, &bonus);
 	if (err != 0)
 		return (err);
 
 	/* The bonus buffer begins with the packed size as a uint64_t. */
 	len = *(uint64_t *)bonus->db_data;
 	dmu_buf_rele(bonus, FTAG);
 
 	buf = vmem_alloc(len, KM_SLEEP);
 	err = dmu_read(mos, obj, 0, len, buf, DMU_READ_PREFETCH);
 	if (err == 0)
 		err = nvlist_unpack(buf, len, value, 0);
 	vmem_free(buf, len);
 
 	return (err);
 }
 
 /*
  * Concrete top-level vdevs that are not missing and are not logs. At every
  * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds.
  */
 static uint64_t
 spa_healthy_core_tvds(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	uint64_t healthy = 0;
 	uint64_t i = 0;
 
 	while (i < root->vdev_children) {
 		vdev_t *tvd = root->vdev_child[i++];
 
 		/* Count non-log children that are concrete and not dead. */
 		if (!tvd->vdev_islog && vdev_is_concrete(tvd) &&
 		    !vdev_is_dead(tvd))
 			healthy++;
 	}
 
 	return (healthy);
 }
 
 /*
  * Checks to see if the given vdev could not be opened, in which case we post a
  * sysevent to notify the autoreplace code that the device has been removed.
  */
 static void
 spa_check_removed(vdev_t *vd)
 {
 	/* Visit the whole subtree first, children before the vdev itself. */
 	uint64_t c = 0;
 	while (c < vd->vdev_children) {
 		spa_check_removed(vd->vdev_child[c]);
 		c++;
 	}
 
 	/* Only dead, concrete leaf vdevs are reported. */
 	if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_dead(vd) ||
 	    !vdev_is_concrete(vd))
 		return;
 
 	zfs_post_autoreplace(vd->vdev_spa, vd);
 	spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK);
 }
 
 /*
  * Look for top-level log vdevs in the VDEV_STATE_CANT_OPEN state.
  *
  * Without ZFS_IMPORT_MISSING_LOG, the missing log devices are recorded in
  * spa_load_info under ZPOOL_CONFIG_MISSING_DEVICES and ENXIO is returned.
  * With the flag, the log state is set to SPA_LOG_CLEAR on the first missing
  * log device and the load continues.  Returns 0 when nothing is missing or
  * when missing logs are tolerated.
  */
 static int
 spa_check_for_missing_logs(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/*
 	 * If we're doing a normal import, then build up any additional
 	 * diagnostic information about missing log devices.
 	 * We'll pass this up to the user for further processing.
 	 */
 	if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) {
 		nvlist_t **child, *nv;
 		uint64_t idx = 0;
 		/*
 		 * One slot per top-level vdev.  The same size is used for
 		 * the allocation and the free so the two cannot diverge.
 		 */
 		size_t child_size = rvd->vdev_children * sizeof (nvlist_t *);
 
 		child = kmem_alloc(child_size, KM_SLEEP);
 		nv = fnvlist_alloc();
 
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 
 			/*
 			 * We consider a device as missing only if it failed
 			 * to open (i.e. offline or faulted is not considered
 			 * as missing).
 			 */
 			if (tvd->vdev_islog &&
 			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 				child[idx++] = vdev_config_generate(spa, tvd,
 				    B_FALSE, VDEV_CONFIG_MISSING);
 			}
 		}
 
 		if (idx > 0) {
 			fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN,
 			    (const nvlist_t * const *)child, idx);
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_MISSING_DEVICES, nv);
 
 			/* The generated configs are freed once added above. */
 			for (uint64_t i = 0; i < idx; i++)
 				nvlist_free(child[i]);
 		}
 		nvlist_free(nv);
 		kmem_free(child, child_size);
 
 		if (idx > 0) {
 			spa_load_failed(spa, "some log devices are missing");
 			vdev_dbgmsg_print_tree(rvd, 2);
 			return (SET_ERROR(ENXIO));
 		}
 	} else {
 		/* Missing logs are tolerated: note it once and drop the ZIL. */
 		for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 			vdev_t *tvd = rvd->vdev_child[c];
 
 			if (tvd->vdev_islog &&
 			    tvd->vdev_state == VDEV_STATE_CANT_OPEN) {
 				spa_set_log_state(spa, SPA_LOG_CLEAR);
 				spa_load_note(spa, "some log devices are "
 				    "missing, ZIL is dropped.");
 				vdev_dbgmsg_print_tree(rvd, 2);
 				break;
 			}
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Check for missing log devices
  */
 static boolean_t
 spa_check_logs(spa_t *spa)
 {
 	dsl_pool_t *dp = spa_get_dsl(spa);
 
 	/*
 	 * Only the UNKNOWN and MISSING states are checked; MISSING is
 	 * rechecked in case the slog has been restored.
 	 */
 	if (spa->spa_log_state != SPA_LOG_MISSING &&
 	    spa->spa_log_state != SPA_LOG_UNKNOWN)
 		return (B_FALSE);
 
 	/* Run zil_check_log_chain over everything under the root dir. */
 	if (dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
 	    zil_check_log_chain, NULL, DS_FIND_CHILDREN) == 0)
 		return (B_FALSE);
 
 	spa_set_log_state(spa, SPA_LOG_MISSING);
 	return (B_TRUE);
 }
 
 /*
  * Passivate any log vdevs (note, does not apply to embedded log metaslabs).
  */
 static boolean_t
 spa_passivate_log(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	boolean_t found = B_FALSE;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	for (int c = 0; c < root->vdev_children; c++) {
 		vdev_t *tvd = root->vdev_child[c];
 
 		if (!tvd->vdev_islog)
 			continue;
 
 		/* A log vdev is expected to have no embedded log group. */
 		ASSERT0P(tvd->vdev_log_mg);
 		metaslab_group_passivate(tvd->vdev_mg);
 		found = B_TRUE;
 	}
 
 	/* B_TRUE if at least one log vdev was passivated. */
 	return (found);
 }
 
 /*
  * Activate any log vdevs (note, does not apply to embedded log metaslabs).
  */
 static void
 spa_activate_log(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER));
 
 	for (int c = 0; c < root->vdev_children; c++) {
 		vdev_t *tvd = root->vdev_child[c];
 
 		if (!tvd->vdev_islog)
 			continue;
 
 		/* A log vdev is expected to have no embedded log group. */
 		ASSERT0P(tvd->vdev_log_mg);
 		metaslab_group_activate(tvd->vdev_mg);
 	}
 }
 
 /*
  * Run zil_reset over every dataset in the pool.  On success, wait for the
  * current txg to sync; on failure, return the error without waiting.
  */
 int
 spa_reset_logs(spa_t *spa)
 {
 	int err = dmu_objset_find(spa_name(spa), zil_reset,
 	    NULL, DS_FIND_CHILDREN);
 
 	if (err != 0)
 		return (err);
 
 	/*
 	 * We successfully offlined the log device, sync out the
 	 * current txg so that the "stubby" block can be removed
 	 * by zil_sync().
 	 */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 
 	return (0);
 }
 
 /*
  * Apply spa_check_removed() to every vdev of an auxiliary vdev list.
  */
 static void
 spa_aux_check_removed(spa_aux_vdev_t *sav)
 {
 	int i = 0;
 
 	while (i < sav->sav_count) {
 		spa_check_removed(sav->sav_vdevs[i]);
 		i++;
 	}
 }
 
 /*
  * zio completion hook: on success, raise spa_claim_max_txg to the birth
  * txg of the zio's block pointer if that is larger.
  */
 void
 spa_claim_notify(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 
 	if (zio->io_error == 0) {
 		mutex_enter(&spa->spa_props_lock);	/* any mutex will do */
 		if (BP_GET_BIRTH(zio->io_bp) > spa->spa_claim_max_txg)
 			spa->spa_claim_max_txg = BP_GET_BIRTH(zio->io_bp);
 		mutex_exit(&spa->spa_props_lock);
 	}
 }
 
 /*
  * Shared state for one spa_load_verify() pass; reached by the traversal
  * callback and the read-completion callback through zio->io_private.
  */
 typedef struct spa_load_error {
 	/* When clear, spa_load_verify_cb() skips non-metadata blocks. */
 	boolean_t	sle_verify_data;
 	/* Failed metadata reads, plus block pointers that failed verify. */
 	uint64_t	sle_meta_count;
 	/* Failed reads that were not counted as metadata. */
 	uint64_t	sle_data_count;
 } spa_load_error_t;
 
 /*
  * Completion callback for the reads issued by spa_load_verify_cb(): frees
  * the read buffer, counts a failure as a metadata or data error, and gives
  * the block's physical size back to the in-flight byte budget.
  */
 static void
 spa_load_verify_done(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	blkptr_t *bp = zio->io_bp;
 	spa_load_error_t *counts = zio->io_private;
 	int err = zio->io_error;
 
 	abd_free(zio->io_abd);
 
 	if (err != 0) {
 		dmu_object_type_t ot = BP_GET_TYPE(bp);
 
 		/*
 		 * Indirect blocks and metadata types count as metadata,
 		 * except for intent log blocks, which count as data.
 		 */
 		if (ot != DMU_OT_INTENT_LOG &&
 		    (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(ot)))
 			atomic_inc_64(&counts->sle_meta_count);
 		else
 			atomic_inc_64(&counts->sle_data_count);
 	}
 
 	/* Wake the traversal, which may be waiting for budget. */
 	mutex_enter(&spa->spa_scrub_lock);
 	spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp);
 	cv_broadcast(&spa->spa_scrub_io_cv);
 	mutex_exit(&spa->spa_scrub_lock);
 }
 
 /*
  * Maximum number of inflight bytes is the log2 fraction of the arc size.
  * By default, we set it to 1/16th of the arc.
  */
 static uint_t spa_load_verify_shift = 4;
 /* When zero, spa_load_verify() does not traverse the pool at all. */
 static int spa_load_verify_metadata = B_TRUE;
 /* When zero, spa_load_verify_cb() skips blocks that are not metadata. */
 static int spa_load_verify_data = B_TRUE;
 
 /*
  * traverse_pool() callback used by spa_load_verify().  'arg' is the root
  * zio whose io_private points at the spa_load_error_t counters.  For each
  * block that qualifies, a speculative scrub read is issued as a child of
  * the root zio.  Always returns 0 so the traversal continues.
  */
 static int
 spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
     const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
 {
 	(void) zilog, (void) dnp;
 
 	zio_t *root = arg;
 	spa_load_error_t *counts = root->io_private;
 
 	/*
 	 * Note: normally this routine will not be called if
 	 * spa_load_verify_metadata is not set.  However, it may be useful
 	 * to manually set the flag after the traversal has begun.
 	 */
 	if (!spa_load_verify_metadata)
 		return (0);
 
 	/*
 	 * Sanity check the block pointer in order to detect obvious damage
 	 * before using the contents in subsequent checks or in zio_read().
 	 * When damaged consider it to be a metadata error since we cannot
 	 * trust the BP_GET_TYPE and BP_GET_LEVEL values.
 	 */
 	if (zfs_blkptr_verify(spa, bp, BLK_CONFIG_NEEDED, BLK_VERIFY_LOG)) {
 		atomic_inc_64(&counts->sle_meta_count);
 		return (0);
 	}
 
 	/* Dnode-level bookmarks carry nothing to read. */
 	if (zb->zb_level == ZB_DNODE_LEVEL)
 		return (0);
 
 	/* Neither do holes, embedded or redacted block pointers. */
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp))
 		return (0);
 
 	/* Data blocks are read only if both switches allow it. */
 	if (!BP_IS_METADATA(bp)) {
 		if (!spa_load_verify_data || !counts->sle_verify_data)
 			return (0);
 	}
 
 	uint64_t budget = arc_target_bytes() >> spa_load_verify_shift;
 	size_t psize = BP_GET_PSIZE(bp);
 
 	/*
 	 * Wait until the in-flight byte count drops below the budget, then
 	 * charge this block against it.  spa_load_verify_done() returns the
 	 * bytes and broadcasts on the condition variable.
 	 */
 	mutex_enter(&spa->spa_scrub_lock);
 	while (spa->spa_load_verify_bytes >= budget)
 		cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
 	spa->spa_load_verify_bytes += psize;
 	mutex_exit(&spa->spa_scrub_lock);
 
 	zio_nowait(zio_read(root, spa, bp, abd_alloc_for_io(psize, B_FALSE),
 	    psize, spa_load_verify_done, root->io_private,
 	    ZIO_PRIORITY_SCRUB,
 	    ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL |
 	    ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb));
 
 	return (0);
 }
 
 /*
  * dmu_objset_find_dp() callback: fail with ENAMETOOLONG when the dataset's
  * name length is ZFS_MAX_DATASET_NAME_LEN or more.
  */
 static int
 verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
 {
 	(void) dp, (void) arg;
 
 	if (dsl_dataset_namelen(ds) < ZFS_MAX_DATASET_NAME_LEN)
 		return (0);
 
 	return (SET_ERROR(ENAMETOOLONG));
 }
 
 /*
  * Verify a freshly loaded pool according to its load policy: check dataset
  * name lengths, then (if spa_load_verify_metadata is set) traverse the pool
  * issuing reads and count metadata and data errors.  Returns 0 when the
  * error counts are within the policy limits (or in dry-run mode), otherwise
  * an error.
  */
 static int
 spa_load_verify(spa_t *spa)
 {
 	zio_t *rio;
 	spa_load_error_t sle = { 0 };
 	zpool_load_policy_t policy;
 	boolean_t verify_ok = B_FALSE;
 	int error = 0;
 
 	zpool_get_load_policy(spa->spa_config, &policy);
 
 	/*
 	 * Nothing to verify when rewind is forbidden or when any number of
 	 * metadata errors is acceptable.
 	 */
 	if (policy.zlp_rewind & ZPOOL_NEVER_REWIND ||
 	    policy.zlp_maxmeta == UINT64_MAX)
 		return (0);
 
 	/* Fail early if any dataset name is too long. */
 	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
 	error = dmu_objset_find_dp(spa->spa_dsl_pool,
 	    spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL,
 	    DS_FIND_CHILDREN);
 	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Verify data only if we are rewinding or an error limit was set.
 	 * Otherwise nothing but dbgmsg cares about the result, so verifying
 	 * data would only waste time.
 	 */
 	sle.sle_verify_data = (policy.zlp_rewind & ZPOOL_REWIND_MASK) ||
 	    (policy.zlp_maxdata < UINT64_MAX);
 
 	/*
 	 * Root zio for all verification reads; its private pointer gives the
 	 * callbacks access to the error counters in 'sle'.
 	 */
 	rio = zio_root(spa, NULL, &sle,
 	    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE);
 
 	if (spa_load_verify_metadata) {
 		if (spa->spa_extreme_rewind) {
 			spa_load_note(spa, "performing a complete scan of the "
 			    "pool since extreme rewind is on. This may take "
 			    "a very long time.\n  (spa_load_verify_data=%u, "
 			    "spa_load_verify_metadata=%u)",
 			    spa_load_verify_data, spa_load_verify_metadata);
 		}
 
 		error = traverse_pool(spa, spa->spa_verify_min_txg,
 		    TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA |
 		    TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio);
 	}
 
 	/*
 	 * Wait for every issued read.  The zio_wait() result is ignored;
 	 * individual failures have been tallied in 'sle' by the callbacks.
 	 */
 	(void) zio_wait(rio);
 	ASSERT0(spa->spa_load_verify_bytes);
 
 	spa->spa_load_meta_errors = sle.sle_meta_count;
 	spa->spa_load_data_errors = sle.sle_data_count;
 
 	if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) {
 		spa_load_note(spa, "spa_load_verify found %llu metadata errors "
 		    "and %llu data errors", (u_longlong_t)sle.sle_meta_count,
 		    (u_longlong_t)sle.sle_data_count);
 	}
 
 	/*
 	 * Accept this uberblock in dry-run mode, or when the traversal
 	 * succeeded and both error counts are within the policy limits.
 	 */
 	if (spa_load_verify_dryrun ||
 	    (!error && sle.sle_meta_count <= policy.zlp_maxmeta &&
 	    sle.sle_data_count <= policy.zlp_maxdata)) {
 		int64_t loss = 0;
 
 		verify_ok = B_TRUE;
 		spa->spa_load_txg = spa->spa_uberblock.ub_txg;
 		spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 		/* Time between the last synced uberblock and the loaded one. */
 		loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts;
 		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME,
 		    spa->spa_load_txg_ts);
 		fnvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME,
 		    loss);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_LOAD_META_ERRORS, sle.sle_meta_count);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count);
 	} else {
 		/* Rejected: record this uberblock's txg as the load limit. */
 		spa->spa_load_max_txg = spa->spa_uberblock.ub_txg;
 	}
 
 	if (spa_load_verify_dryrun)
 		return (0);
 
 	/* Any traversal error other than ENXIO or EIO is reported as EIO. */
 	if (error) {
 		if (error != ENXIO && error != EIO)
 			error = SET_ERROR(EIO);
 		return (error);
 	}
 
 	return (verify_ok ? 0 : EIO);
 }
 
 /*
  * Find a value in the pool props object.
  */
 static void
 spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	const char *propname = zpool_prop_to_name(prop);
 
 	/* The lookup result is deliberately discarded. */
 	(void) zap_lookup(mos, spa->spa_pool_props_object, propname,
 	    sizeof (uint64_t), 1, val);
 }
 
 /*
  * Find a value in the pool directory object.
  */
 static int
 spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent)
 {
 	int err = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    name, sizeof (uint64_t), 1, val);
 
 	if (err == 0)
 		return (0);
 
 	/* ENOENT is reported only when the caller asked for it. */
 	if (err != ENOENT || log_enoent) {
 		spa_load_failed(spa, "couldn't get '%s' value in MOS directory "
 		    "[error=%d]", name, err);
 	}
 
 	return (err);
 }
 
 /*
  * Put 'vdev' into VDEV_STATE_CANT_OPEN with the given aux reason and hand
  * 'err' back (through SET_ERROR) for the caller to return.
  */
 static int
 spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err)
 {
 	int rc;
 
 	vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux);
 	rc = SET_ERROR(err);
 
 	return (rc);
 }
 
 /*
  * B_TRUE when spa_livelists_to_delete is non-zero.
  */
 boolean_t
 spa_livelist_delete_check(spa_t *spa)
 {
 	if (spa->spa_livelists_to_delete == 0)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * zthr check callback: 'arg' is the spa; the zthr handle is unused.
  */
 static boolean_t
 spa_livelist_delete_cb_check(void *arg, zthr_t *z)
 {
 	(void) z;
 
 	return (spa_livelist_delete_check((spa_t *)arg));
 }
 
 /*
  * bplist_iterate() callback: free one block in the tx's txg and subtract
  * its sizes from the pool's free dir accounting.  'arg' is the spa.
  * Always returns 0.
  */
 static int
 delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	spa_t *spa = arg;
 
 	zio_free(spa, tx->tx_txg, bp);
 
 	/* All three deltas are negative: space is being given back. */
 	int64_t used = -bp_get_dsize_sync(spa, bp);
 	int64_t compressed = -BP_GET_PSIZE(bp);
 	int64_t uncompressed = -BP_GET_UCSIZE(bp);
 
 	dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD,
 	    used, compressed, uncompressed, tx);
 
 	return (0);
 }
 
 /*
  * Fetch the first entry of ZAP object 'zap_obj' and store its first
  * integer in *llp.  Returns the zap_cursor_retrieve() result; *llp is
  * written only on success.
  */
 static int
 dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp)
 {
 	zap_cursor_t cursor;
 	zap_attribute_t *attr = zap_attribute_alloc();
 	int rc;
 
 	zap_cursor_init(&cursor, os, zap_obj);
 	rc = zap_cursor_retrieve(&cursor, attr);
 	zap_cursor_fini(&cursor);
 
 	if (rc == 0)
 		*llp = attr->za_first_integer;
 
 	zap_attribute_free(attr);
 
 	return (rc);
 }
 
 /*
  * Components of livelist deletion that must be performed in syncing
  * context: freeing block pointers and updating the pool-wide data
  * structures to indicate how much work is left to do
  */
 typedef struct sublist_delete_arg {
 	/* Pool handed to delete_blkptr_cb() for each freed block. */
 	spa_t *spa;
 	/* Deadlist from which the entry identified by 'key' is removed. */
 	dsl_deadlist_t *ll;
 	/* Key passed to dsl_deadlist_remove_entry(). */
 	uint64_t key;
 	/* Block pointers to free; iterated with delete_blkptr_cb(). */
 	bplist_t *to_free;
 } sublist_delete_arg_t;
 
 /*
  * Sync task: free every block on the to_free list, then remove the
  * sublist's entry from the deadlist.  'arg' is a sublist_delete_arg_t.
  */
 static void
 sublist_delete_sync(void *arg, dmu_tx_t *tx)
 {
 	sublist_delete_arg_t *sda = arg;
 
 	bplist_iterate(sda->to_free, delete_blkptr_cb, sda->spa, tx);
 	dsl_deadlist_remove_entry(sda->ll, sda->key, tx);
 }
 
 /*
  * Argument for livelist_delete_sync().
  */
 typedef struct livelist_delete_arg {
 	/* Pool that owns the livelist. */
 	spa_t *spa;
 	/* Livelist object to free; also removed as an entry of zap_obj. */
 	uint64_t ll_obj;
 	/* ZAP object listing livelists; destroyed once it becomes empty. */
 	uint64_t zap_obj;
 } livelist_delete_arg_t;
 
 /*
  * Sync task: remove one livelist from the to-delete ZAP and free it.  When
  * that ZAP becomes empty, remove its directory entry, destroy it, clear
  * spa_livelists_to_delete and notify waiters.  'arg' is a
  * livelist_delete_arg_t.
  */
 static void
 livelist_delete_sync(void *arg, dmu_tx_t *tx)
 {
 	livelist_delete_arg_t *lda = arg;
 	spa_t *spa = lda->spa;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t remaining;
 
 	/* free the livelist and decrement the feature count */
 	VERIFY0(zap_remove_int(mos, lda->zap_obj, lda->ll_obj, tx));
 	dsl_deadlist_free(mos, lda->ll_obj, tx);
 	spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx);
 
 	VERIFY0(zap_count(mos, lda->zap_obj, &remaining));
 	if (remaining != 0)
 		return;
 
 	/* no more livelists to delete */
 	VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_DELETED_CLONES, tx));
 	VERIFY0(zap_destroy(mos, lda->zap_obj, tx));
 	spa->spa_livelists_to_delete = 0;
 	spa_notify_waiters(spa);
 }
 
 /*
  * Work function of the livelist destroy zthr.  Look up the next livelist
  * queued for deletion and count what is left in it.  If anything remains,
  * open the livelist, determine which block pointers of its first sublist
  * should actually be freed, and run a synctask that frees them and removes
  * that sublist.  Otherwise run a synctask that frees the livelist itself and
  * updates the pool-wide livelist data.
  */
 static void
 spa_livelist_delete_cb(void *arg, zthr_t *z)
 {
 	spa_t *spa = arg;
 	objset_t *mos = spa->spa_meta_objset;
 	uint64_t zap_obj = spa->spa_livelists_to_delete;
 	uint64_t ll_obj = 0;
 	uint64_t count;
 
 	/*
 	 * Determine the next livelist to delete. This function should only
 	 * be called if there is at least one deleted clone.
 	 */
 	VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj));
 	VERIFY0(zap_count(mos, ll_obj, &count));
 
 	if (count == 0) {
 		/* Nothing left inside; retire the livelist itself. */
 		livelist_delete_arg_t lda = {
 		    .spa = spa,
 		    .ll_obj = ll_obj,
 		    .zap_obj = zap_obj
 		};
 
 		zfs_dbgmsg("deletion of livelist %llu completed",
 		    (u_longlong_t)ll_obj);
 		VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync,
 		    &lda, 0, ZFS_SPACE_CHECK_DESTROY));
 		return;
 	}
 
 	dsl_deadlist_t *ll = kmem_zalloc(sizeof (dsl_deadlist_t), KM_SLEEP);
 	VERIFY0(dsl_deadlist_open(ll, mos, ll_obj));
 
 	dsl_deadlist_entry_t *dle = dsl_deadlist_first(ll);
 	ASSERT3P(dle, !=, NULL);
 
 	bplist_t to_free;
 	bplist_create(&to_free);
 
 	int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free,
 	    z, NULL);
 	if (err != 0) {
 		/* EINTR is the only failure tolerated here. */
 		VERIFY3U(err, ==, EINTR);
 	} else {
 		sublist_delete_arg_t sda = {
 		    .spa = spa,
 		    .ll = ll,
 		    .key = dle->dle_mintxg,
 		    .to_free = &to_free
 		};
 
 		zfs_dbgmsg("deleting sublist (id %llu) from"
 		    " livelist %llu, %lld remaining",
 		    (u_longlong_t)dle->dle_bpobj.bpo_object,
 		    (u_longlong_t)ll_obj, (longlong_t)count - 1);
 		VERIFY0(dsl_sync_task(spa_name(spa), NULL,
 		    sublist_delete_sync, &sda, 0,
 		    ZFS_SPACE_CHECK_DESTROY));
 	}
 
 	/* Tear down the scratch list and the livelist handle on both paths. */
 	bplist_clear(&to_free);
 	bplist_destroy(&to_free);
 	dsl_deadlist_close(ll);
 	kmem_free(ll, sizeof (dsl_deadlist_t));
 }
 
 /*
  * Create the "z_livelist_destroy" zthr for this pool and record it in
  * spa_livelist_delete_zthr, which must not be set yet.
  */
 static void
 spa_start_livelist_destroy_thread(spa_t *spa)
 {
 	zthr_t *destroy_zthr;
 
 	ASSERT0P(spa->spa_livelist_delete_zthr);
 
 	destroy_zthr = zthr_create("z_livelist_destroy",
 	    spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa,
 	    minclsyspri);
 	spa->spa_livelist_delete_zthr = destroy_zthr;
 }
 
 /*
  * Destination lists used by livelist_track_new_cb() to sort block pointers.
  */
 typedef struct livelist_new_arg {
 	/* receives block pointers the callback sees as not freed */
 	bplist_t *allocs;
 	/* receives block pointers the callback sees as freed */
 	bplist_t *frees;
 } livelist_new_arg_t;
 
 /*
  * Iteration callback that sorts one block pointer into the frees or allocs
  * list of the livelist_new_arg_t passed as 'arg'.  Each alloc also bumps
  * zfs_livelist_condense_new_alloc.  'tx' must be NULL.  Always returns 0.
  */
 static int
 livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	ASSERT0P(tx);
 
 	livelist_new_arg_t *lists = arg;
 
 	if (bp_freed) {
 		bplist_append(lists->frees, bp);
 		return (0);
 	}
 
 	bplist_append(lists->allocs, bp);
 	zfs_livelist_condense_new_alloc++;
 
 	return (0);
 }
 
 /*
  * State handed from spa_livelist_condense_cb() (open context) to
  * spa_livelist_condense_sync() (synctask), which frees it.
  */
 typedef struct livelist_condense_arg {
 	/* pool being condensed */
 	spa_t *spa;
 	/* block pointers re-inserted into the livelist as ALLOCs */
 	bplist_t to_keep;
 	/* size reported for the first sublist when it was processed */
 	uint64_t first_size;
 	/* size reported for the next sublist when it was processed */
 	uint64_t next_size;
 } livelist_condense_arg_t;
 
 /*
  * Synctask half of a livelist condense.  'arg' is the
  * livelist_condense_arg_t built by spa_livelist_condense_cb(); it is
  * consumed and freed here on every path, including cancellation.
  *
  * Unless cancelled, the two sublists recorded in spa_to_condense are
  * merged: 'first' is cleared, 'next' is removed, and the kept ALLOCs plus
  * any block pointers that arrived since open-context processing are
  * inserted back into the livelist.
  */
 static void
 spa_livelist_condense_sync(void *arg, dmu_tx_t *tx)
 {
 	livelist_condense_arg_t *lca = arg;
 	spa_t *spa = lca->spa;
 	bplist_t new_frees;
 	dsl_dataset_t *ds = spa->spa_to_condense.ds;
 
 	/* Have we been cancelled? */
 	if (spa->spa_to_condense.cancelled) {
 		zfs_livelist_condense_sync_cancel++;
 		goto out;
 	}
 
 	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
 	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
 	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
 
 	/*
 	 * It's possible that the livelist was changed while the zthr was
 	 * running. Therefore, we need to check for new blkptrs in the two
 	 * entries being condensed and continue to track them in the livelist.
 	 * Because of the way we handle remapped blkptrs (see dbuf_remap_impl),
 	 * it's possible that the newly added blkptrs are FREEs or ALLOCs so
 	 * we need to sort them into two different bplists.
 	 */
 	uint64_t first_obj = first->dle_bpobj.bpo_object;
 	uint64_t next_obj = next->dle_bpobj.bpo_object;
 	uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs;
 	uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
 
 	/* New ALLOCs join to_keep directly; new FREEs get their own list. */
 	bplist_create(&new_frees);
 	livelist_new_arg_t new_bps = {
 	    .allocs = &lca->to_keep,
 	    .frees = &new_frees,
 	};
 
 	/*
 	 * Only iterate a bpobj that grew past the size saved in open context.
 	 * NOTE(review): the saved size is presumably the starting index of
 	 * the iteration -- confirm in livelist_bpobj_iterate_from_nofree().
 	 */
 	if (cur_first_size > lca->first_size) {
 		VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj,
 		    livelist_track_new_cb, &new_bps, lca->first_size));
 	}
 	if (cur_next_size > lca->next_size) {
 		VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj,
 		    livelist_track_new_cb, &new_bps, lca->next_size));
 	}
 
 	/* Empty 'first' and drop 'next' before re-inserting what survives. */
 	dsl_deadlist_clear_entry(first, ll, tx);
 	ASSERT(bpobj_is_empty(&first->dle_bpobj));
 	dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx);
 
 	bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx);
 	bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx);
 	bplist_destroy(&new_frees);
 
 	/* first_obj/next_obj and the cur_*_size values predate the merge. */
 	char dsname[ZFS_MAX_DATASET_NAME_LEN];
 	dsl_dataset_name(ds, dsname);
 	zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu "
 	    "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu "
 	    "(%llu blkptrs)", (u_longlong_t)tx->tx_txg, dsname,
 	    (u_longlong_t)ds->ds_object, (u_longlong_t)first_obj,
 	    (u_longlong_t)cur_first_size, (u_longlong_t)next_obj,
 	    (u_longlong_t)cur_next_size,
 	    (u_longlong_t)first->dle_bpobj.bpo_object,
 	    (u_longlong_t)first->dle_bpobj.bpo_phys->bpo_num_blkptrs);
 out:
 	/*
 	 * Common exit: release the dataset's dbuf (tag 'spa'), forget the
 	 * condense target and free the argument.  'syncing' is cleared last;
 	 * spa_livelist_condense_cb_check() requires it to be B_FALSE.
 	 */
 	dmu_buf_rele(ds->ds_dbuf, spa);
 	spa->spa_to_condense.ds = NULL;
 	bplist_clear(&lca->to_keep);
 	bplist_destroy(&lca->to_keep);
 	kmem_free(lca, sizeof (livelist_condense_arg_t));
 	spa->spa_to_condense.syncing = B_FALSE;
 }
 
 /*
  * Work function of the livelist condense zthr.  'arg' is the spa_t and 't'
  * the zthr handle.
  *
  * Processes the two sublists recorded in spa_to_condense in open context,
  * then tries to assign a tx and queue spa_livelist_condense_sync().  On
  * success, ownership of the argument (and of the dataset hold) passes to
  * the synctask.  On any failure, everything is released here.
  */
 static void
 spa_livelist_condense_cb(void *arg, zthr_t *t)
 {
 	/* Tunable-controlled pause; ends on zthr waiters or cancellation. */
 	while (zfs_livelist_condense_zthr_pause &&
 	    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
 		delay(1);
 
 	spa_t *spa = arg;
 	dsl_deadlist_entry_t *first = spa->spa_to_condense.first;
 	dsl_deadlist_entry_t *next = spa->spa_to_condense.next;
 	uint64_t first_size, next_size;
 
 	livelist_condense_arg_t *lca =
 	    kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP);
 	bplist_create(&lca->to_keep);
 
 	/*
 	 * Process the livelists (matching FREEs and ALLOCs) in open context
 	 * so we have minimal work in syncing context to condense.
 	 *
 	 * We save bpobj sizes (first_size and next_size) to use later in
 	 * syncing context to determine if entries were added to these sublists
 	 * while in open context. This is possible because the clone is still
 	 * active and open for normal writes and we want to make sure the new,
 	 * unprocessed blockpointers are inserted into the livelist normally.
 	 *
 	 * Note that dsl_process_sub_livelist() both stores the size number of
 	 * blockpointers and iterates over them while the bpobj's lock is held,
 	 * so the sizes returned to us are consistent with what was actually
 	 * processed.
 	 */
 	int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t,
 	    &first_size);
 	if (err == 0)
 		err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep,
 		    t, &next_size);
 
 	if (err == 0) {
 		/* Second tunable-controlled pause, before the tx is created. */
 		while (zfs_livelist_condense_sync_pause &&
 		    !(zthr_has_waiters(t) || zthr_iscancelled(t)))
 			delay(1);
 
 		dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 		dmu_tx_mark_netfree(tx);
 		dmu_tx_hold_space(tx, 1);
 		err = dmu_tx_assign(tx, DMU_TX_NOWAIT | DMU_TX_NOTHROTTLE);
 		if (err == 0) {
 			/*
 			 * Prevent the condense zthr restarting before
 			 * the synctask completes.
 			 */
 			spa->spa_to_condense.syncing = B_TRUE;
 			lca->spa = spa;
 			lca->first_size = first_size;
 			lca->next_size = next_size;
 			dsl_sync_task_nowait(spa_get_dsl(spa),
 			    spa_livelist_condense_sync, lca, tx);
 			dmu_tx_commit(tx);
 			/* 'lca' now belongs to the synctask; do not free it. */
 			return;
 		}
 	}
 	/*
 	 * Condensing can not continue: either it was externally stopped or
 	 * we were unable to assign to a tx because the pool has run out of
 	 * space. In the second case, we'll just end up trying to condense
 	 * again in a later txg.
 	 */
 	ASSERT(err != 0);
 	bplist_clear(&lca->to_keep);
 	bplist_destroy(&lca->to_keep);
 	kmem_free(lca, sizeof (livelist_condense_arg_t));
 	dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa);
 	spa->spa_to_condense.ds = NULL;
 	if (err == EINTR)
 		zfs_livelist_condense_zthr_cancel++;
 }
 
 /*
  * Check function of the livelist condense zthr.  Work is available only
  * when a dataset is queued for condensing, no condense synctask is pending
  * and condensing has not been cancelled.
  */
 static boolean_t
 spa_livelist_condense_cb_check(void *arg, zthr_t *z)
 {
 	(void) z;
 	spa_t *spa = arg;
 
 	if (spa->spa_to_condense.ds == NULL)
 		return (B_FALSE);
 	if (spa->spa_to_condense.syncing != B_FALSE)
 		return (B_FALSE);
 	if (spa->spa_to_condense.cancelled != B_FALSE)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Reset the pool's condense bookkeeping and create the
  * "z_livelist_condense" zthr, which must not exist yet.
  */
 static void
 spa_start_livelist_condensing_thread(spa_t *spa)
 {
 	zthr_t *condense_zthr;
 
 	/* Nothing queued, nothing syncing, nothing cancelled. */
 	spa->spa_to_condense.cancelled = B_FALSE;
 	spa->spa_to_condense.syncing = B_FALSE;
 	spa->spa_to_condense.next = NULL;
 	spa->spa_to_condense.first = NULL;
 	spa->spa_to_condense.ds = NULL;
 
 	ASSERT0P(spa->spa_livelist_condense_zthr);
 
 	condense_zthr = zthr_create("z_livelist_condense",
 	    spa_livelist_condense_cb_check,
 	    spa_livelist_condense_cb, spa, minclsyspri);
 	spa->spa_livelist_condense_zthr = condense_zthr;
 }
 
 /*
  * Start the pool's auxiliary background threads: raidz expansion, indirect
  * condensing, livelist destroy, livelist condensing and checkpoint discard.
  * The pool must be writeable.
  */
 static void
 spa_spawn_aux_threads(spa_t *spa)
 {
 	zthr_t *discard_zthr;
 
 	ASSERT(spa_writeable(spa));
 
 	spa_start_raidz_expansion_thread(spa);
 	spa_start_indirect_condensing_thread(spa);
 	spa_start_livelist_destroy_thread(spa);
 	spa_start_livelist_condensing_thread(spa);
 
 	ASSERT0P(spa->spa_checkpoint_discard_zthr);
 
 	discard_zthr = zthr_create("z_checkpoint_discard",
 	    spa_checkpoint_discard_thread_check,
 	    spa_checkpoint_discard_thread, spa, minclsyspri);
 	spa->spa_checkpoint_discard_zthr = discard_zthr;
 }
 
 /*
  * Fix up config after a partly-completed split.  This is done with the
  * ZPOOL_CONFIG_SPLIT nvlist.  Both the splitting pool and the split-off
  * pool have that entry in their config, but only the splitting one contains
  * a list of all the guids of the vdevs that are being split off.
  *
  * Given that list there are two outcomes: either the disks are rejoined to
  * the pool, or the split is completed.  To try the rejoin, every listed
  * disk is marked online again and the vdev tree is reopened.  If every such
  * disk then reports VDEV_AUX_SPLIT_POOL, i.e. it was successfully split
  * off, vdev_split() is called on each of them and the split is completed.
  * The split is also completed when any listed guid cannot be found in this
  * pool at all.
  *
  * In every other case the config is left alone, with all the vdevs in
  * place in the original pool.
  */
 static void
 spa_try_repair(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *split_nvl;
 	uint64_t *guids;
 	uint_t nguids;
 	uint_t nextracted = 0;
 	boolean_t try_rejoin = B_TRUE;
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &split_nvl) != 0)
 		return;
 
 	/* check that the config is complete */
 	if (nvlist_lookup_uint64_array(split_nvl, ZPOOL_CONFIG_SPLIT_LIST,
 	    &guids, &nguids) != 0)
 		return;
 
 	size_t vds_size = nguids * sizeof (vdev_t *);
 	vdev_t **vds = kmem_zalloc(vds_size, KM_SLEEP);
 
 	/* attempt to online all the vdevs & validate */
 	for (uint_t i = 0; i < nguids; i++) {
 		/* a zero guid marks a hole; its slot stays NULL */
 		if (guids[i] == 0)
 			continue;
 
 		vds[i] = spa_lookup_by_guid(spa, guids[i], B_FALSE);
 		if (vds[i] != NULL) {
 			/* attempt to re-online it */
 			vds[i]->vdev_offline = B_FALSE;
 		} else {
 			/*
 			 * Don't bother attempting to reopen the disks;
 			 * just do the split.
 			 */
 			try_rejoin = B_FALSE;
 		}
 	}
 
 	if (try_rejoin) {
 		vdev_reopen(spa->spa_root_vdev);
 
 		/* check each device to see what state it's in */
 		for (uint_t i = 0; i < nguids; i++) {
 			if (vds[i] != NULL &&
 			    vds[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL)
 				break;
 			nextracted++;
 		}
 	}
 
 	/*
 	 * If every disk has been moved to the new pool, or if we never
 	 * even attempted to look at them, then we split them off for
 	 * good.
 	 */
 	if (!try_rejoin || nextracted == nguids) {
 		for (uint_t i = 0; i < nguids; i++) {
 			if (vds[i] != NULL)
 				vdev_split(vds[i]);
 		}
 		vdev_reopen(spa->spa_root_vdev);
 	}
 
 	kmem_free(vds, vds_size);
 }
 
 /*
  * Load the pool by wrapping spa_load_impl() with the surrounding
  * bookkeeping: publish the load state to the import progress tracker, stamp
  * the load time, record the minimum reference count, post an ereport on
  * failure and leave the load state as SPA_LOAD_NONE on success or
  * SPA_LOAD_ERROR on failure.  Returns the error from spa_load_impl().
  */
 static int
 spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type)
 {
 	const char *ereport = FM_EREPORT_ZFS_POOL;
 	int error;
 
 	spa->spa_load_state = state;
 	(void) spa_import_progress_set_state(spa_guid(spa),
 	    spa_load_state(spa));
 	spa_import_progress_set_notes(spa, "spa_load()");
 
 	gethrestime(&spa->spa_loaded_ts);
 	error = spa_load_impl(spa, type, &ereport);
 
 	/*
 	 * Don't count references from objsets that are already closed
 	 * and are making their way through the eviction process.
 	 */
 	spa_evicting_os_wait(spa);
 	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
 
 	if (error == 0) {
 		spa->spa_load_state = SPA_LOAD_NONE;
 	} else {
 		/* EEXIST keeps the load timestamp; other errors reset it. */
 		if (error != EEXIST) {
 			spa->spa_loaded_ts.tv_sec = 0;
 			spa->spa_loaded_ts.tv_nsec = 0;
 		}
 		/* No ereport is posted for EBADF. */
 		if (error != EBADF) {
 			(void) zfs_ereport_post(ereport, spa,
 			    NULL, NULL, NULL, 0);
 		}
 		spa->spa_load_state = SPA_LOAD_ERROR;
 	}
 	spa->spa_ena = 0;
 
 	(void) spa_import_progress_set_state(spa_guid(spa),
 	    spa_load_state(spa));
 
 	return (error);
 }
 
 #ifdef ZFS_DEBUG
 /*
  * Walk the vdev tree rooted at 'vd' and return how many per-vdev ZAPs
  * (root, top-level and leaf) its vdevs reference.  Each one found is also
  * asserted to be present in the spa's list of all vdev ZAPs.  The root ZAP
  * is only considered when SPA_FEATURE_AVZ_V2 is active.
  */
 static uint64_t
 vdev_count_verify_zaps(vdev_t *vd)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t nzaps = 0;
 
 	if (spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2) &&
 	    vd->vdev_root_zap != 0) {
 		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, vd->vdev_root_zap));
 		nzaps++;
 	}
 
 	if (vd->vdev_top_zap != 0) {
 		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, vd->vdev_top_zap));
 		nzaps++;
 	}
 
 	if (vd->vdev_leaf_zap != 0) {
 		ASSERT0(zap_lookup_int(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, vd->vdev_leaf_zap));
 		nzaps++;
 	}
 
 	/* Add in everything below this vdev. */
 	for (uint64_t c = 0; c < vd->vdev_children; c++)
 		nzaps += vdev_count_verify_zaps(vd->vdev_child[c]);
 
 	return (nzaps);
 }
 #else
 #define	vdev_count_verify_zaps(vd) ((void) sizeof (vd), 0)
 #endif
 
 /*
  * Determine whether the activity check is required.  Returns B_FALSE as
  * soon as one of the skip conditions below holds and B_TRUE only when none
  * of them does.
  */
 static boolean_t
 spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label,
     nvlist_t *config)
 {
 	uint64_t pool_state = 0;
 	uint64_t label_hostid = 0;
 	uint64_t prev_txg = 0;
 	uint64_t prev_timestamp = 0;
 	uint16_t prev_mmp_seq = 0;
 
 	/* Pick up whatever an earlier tryimport left behind, if anything. */
 	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
 		nvlist_t *load_info = fnvlist_lookup_nvlist(config,
 		    ZPOOL_CONFIG_LOAD_INFO);
 
 		(void) nvlist_lookup_uint64(load_info, ZPOOL_CONFIG_MMP_TXG,
 		    &prev_txg);
 		(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 		    &prev_timestamp);
 		(void) nvlist_lookup_uint16(load_info, ZPOOL_CONFIG_MMP_SEQ,
 		    &prev_mmp_seq);
 	}
 
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    &pool_state);
 
 	/*
 	 * Disable the MMP activity check - This is used by zdb which
 	 * is intended to be used on potentially active pools.
 	 */
 	if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP)
 		return (B_FALSE);
 
 	/*
 	 * Skip the activity check when the MMP feature is disabled.
 	 */
 	if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0)
 		return (B_FALSE);
 
 	/*
 	 * Nonzero prev_ values are the results of an earlier tryimport.
 	 * If all of them match the uberblock we just found, the pool has
 	 * not changed since then and there is no need to test it again.
 	 */
 	if (prev_txg != 0 && prev_txg == ub->ub_txg &&
 	    prev_timestamp != 0 && prev_timestamp == ub->ub_timestamp &&
 	    prev_mmp_seq != 0 &&
 	    prev_mmp_seq == (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0))
 		return (B_FALSE);
 
 	/*
 	 * Allow the activity check to be skipped when importing the pool
 	 * on the same host which last imported it.  Since the hostid from
 	 * configuration may be stale use the one read from the label.
 	 */
 	if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) {
 		label_hostid = fnvlist_lookup_uint64(label,
 		    ZPOOL_CONFIG_HOSTID);
 	}
 	if (label_hostid == spa_get_hostid(spa))
 		return (B_FALSE);
 
 	/*
 	 * Skip the activity test when the pool was cleanly exported.
 	 */
 	return (pool_state == POOL_STATE_ACTIVE ? B_TRUE : B_FALSE);
 }
 
 /*
  * Nanoseconds the activity check must watch for changes on-disk.
  *
  * The starting point is a local minimum: at least one second (NANOSEC), or
  * zfs_multihost_import_intervals (at least 1) times the local multihost
  * interval.  The MMP fields found in the uberblock 'ub' then either replace
  * that value or raise it, depending on which of the four cases below
  * applies.  'spa' is only used for the leaf count in one debug message.
  */
 static uint64_t
 spa_activity_check_duration(spa_t *spa, uberblock_t *ub)
 {
 	uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1);
 	uint64_t multihost_interval = MSEC2NSEC(
 	    MMP_INTERVAL_OK(zfs_multihost_interval));
 	uint64_t import_delay = MAX(NANOSEC, import_intervals *
 	    multihost_interval);
 
 	/*
 	 * Local tunables determine a minimum duration except for the case
 	 * where we know when the remote host will suspend the pool if MMP
 	 * writes do not land.
 	 *
 	 * See Big Theory comment at the top of mmp.c for the reasoning behind
 	 * these cases and times.
 	 */
 
 	ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100);
 
 	if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
 	    MMP_FAIL_INT(ub) > 0) {
 
 		/* MMP on remote host will suspend pool after failed writes */
 		/* This overrides the local minimum rather than raising it. */
 		import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) *
 		    MMP_IMPORT_SAFETY_FACTOR / 100;
 
 		zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp "
 		    "mmp_fails=%llu ub_mmp mmp_interval=%llu "
 		    "import_intervals=%llu", (u_longlong_t)import_delay,
 		    (u_longlong_t)MMP_FAIL_INT(ub),
 		    (u_longlong_t)MMP_INTERVAL(ub),
 		    (u_longlong_t)import_intervals);
 
 	} else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) &&
 	    MMP_FAIL_INT(ub) == 0) {
 
 		/* MMP on remote host will never suspend pool */
 		/* Uses the interval from the uberblock; never lowers the delay. */
 		import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) +
 		    ub->ub_mmp_delay) * import_intervals);
 
 		zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp "
 		    "mmp_interval=%llu ub_mmp_delay=%llu "
 		    "import_intervals=%llu", (u_longlong_t)import_delay,
 		    (u_longlong_t)MMP_INTERVAL(ub),
 		    (u_longlong_t)ub->ub_mmp_delay,
 		    (u_longlong_t)import_intervals);
 
 	} else if (MMP_VALID(ub)) {
 		/*
 		 * zfs-0.7 compatibility case
 		 */
 
 		/* Uses the local interval; never lowers the delay. */
 		import_delay = MAX(import_delay, (multihost_interval +
 		    ub->ub_mmp_delay) * import_intervals);
 
 		zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu "
 		    "import_intervals=%llu leaves=%u",
 		    (u_longlong_t)import_delay,
 		    (u_longlong_t)ub->ub_mmp_delay,
 		    (u_longlong_t)import_intervals,
 		    vdev_count_leaves(spa));
 	} else {
 		/* Using local tunings is the only reasonable option */
 		zfs_dbgmsg("pool last imported on non-MMP aware "
 		    "host using import_delay=%llu multihost_interval=%llu "
 		    "import_intervals=%llu", (u_longlong_t)import_delay,
 		    (u_longlong_t)multihost_interval,
 		    (u_longlong_t)import_intervals);
 	}
 
 	return (import_delay);
 }
 
 /*
  * Remote host activity check.
  *
  * Repeatedly reloads the best uberblock from disk for the duration given by
  * spa_activity_check_duration() (plus a random 0-25%) and compares its txg,
  * timestamp and MMP sequence with the values 'ub' held on entry.  Note that
  * 'ub' is overwritten by each reload.  When 'importing' is set, progress
  * notes are published through the import progress interfaces.
  *
  * error results:
  *          0 - no activity detected
  *  EREMOTEIO - remote activity detected
  *      EINTR - user canceled the operation
  */
 static int
 spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config,
     boolean_t importing)
 {
 	/* Snapshot of the uberblock as passed in; the baseline to compare. */
 	uint64_t txg = ub->ub_txg;
 	uint64_t timestamp = ub->ub_timestamp;
 	uint64_t mmp_config = ub->ub_mmp_config;
 	uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0;
 	uint64_t import_delay;
 	hrtime_t import_expire, now;
 	nvlist_t *mmp_label = NULL;
 	vdev_t *rvd = spa->spa_root_vdev;
 	kcondvar_t cv;
 	kmutex_t mtx;
 	int error = 0;
 
 	/*
 	 * The cv and mutex are local to this call and only serve the timed,
 	 * signal-interruptible wait at the bottom of the loop.
 	 */
 	cv_init(&cv, NULL, CV_DEFAULT, NULL);
 	mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL);
 	mutex_enter(&mtx);
 
 	/*
 	 * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed
 	 * during the earlier tryimport.  If the txg recorded there is 0 then
 	 * the pool is known to be active on another host.
 	 *
 	 * Otherwise, the pool might be in use on another host.  Check for
 	 * changes in the uberblocks on disk if necessary.
 	 */
 	if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) {
 		nvlist_t *nvinfo = fnvlist_lookup_nvlist(config,
 		    ZPOOL_CONFIG_LOAD_INFO);
 
 		if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) &&
 		    fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) {
 			/* Load a label so the remote host can be reported. */
 			vdev_uberblock_load(rvd, ub, &mmp_label);
 			error = SET_ERROR(EREMOTEIO);
 			goto out;
 		}
 	}
 
 	import_delay = spa_activity_check_duration(spa, ub);
 
 	/* Add a small random factor in case of simultaneous imports (0-25%) */
 	import_delay += import_delay * random_in_range(250) / 1000;
 
 	import_expire = gethrtime() + import_delay;
 
 	if (importing) {
 		spa_import_progress_set_notes(spa, "Checking MMP activity, "
 		    "waiting %llu ms", (u_longlong_t)NSEC2MSEC(import_delay));
 	}
 
 	int iterations = 0;
 	while ((now = gethrtime()) < import_expire) {
 		/* Refresh the note on the first and then every 30th pass. */
 		if (importing && iterations++ % 30 == 0) {
 			spa_import_progress_set_notes(spa, "Checking MMP "
 			    "activity, %llu ms remaining",
 			    (u_longlong_t)NSEC2MSEC(import_expire - now));
 		}
 
 		if (importing) {
 			(void) spa_import_progress_set_mmp_check(spa_guid(spa),
 			    NSEC2SEC(import_expire - gethrtime()));
 		}
 
 		vdev_uberblock_load(rvd, ub, &mmp_label);
 
 		/* Any change from the baseline is taken as remote activity. */
 		if (txg != ub->ub_txg || timestamp != ub->ub_timestamp ||
 		    mmp_seq != (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) {
 			zfs_dbgmsg("multihost activity detected "
 			    "txg %llu ub_txg  %llu "
 			    "timestamp %llu ub_timestamp  %llu "
 			    "mmp_config %#llx ub_mmp_config %#llx",
 			    (u_longlong_t)txg, (u_longlong_t)ub->ub_txg,
 			    (u_longlong_t)timestamp,
 			    (u_longlong_t)ub->ub_timestamp,
 			    (u_longlong_t)mmp_config,
 			    (u_longlong_t)ub->ub_mmp_config);
 
 			/* mmp_label is kept so it can be reported below. */
 			error = SET_ERROR(EREMOTEIO);
 			break;
 		}
 
 		/* Free this pass's label before the next load replaces it. */
 		if (mmp_label) {
 			nvlist_free(mmp_label);
 			mmp_label = NULL;
 		}
 
 		/*
 		 * Wait up to hz ticks.  Only a return of -1 lets the loop
 		 * continue; anything else is reported as EINTR.
 		 */
 		error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
 		if (error != -1) {
 			error = SET_ERROR(EINTR);
 			break;
 		}
 		error = 0;
 	}
 
 out:
 	mutex_exit(&mtx);
 	mutex_destroy(&mtx);
 	cv_destroy(&cv);
 
 	/*
 	 * If the pool is determined to be active store the status in the
 	 * spa->spa_load_info nvlist.  If the remote hostname or hostid are
 	 * available from configuration read from disk store them as well.
 	 * This allows 'zpool import' to generate a more useful message.
 	 *
 	 * ZPOOL_CONFIG_MMP_STATE    - observed pool status (mandatory)
 	 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
 	 * ZPOOL_CONFIG_MMP_HOSTID   - hostid from the active pool
 	 */
 	if (error == EREMOTEIO) {
 		if (mmp_label) {
 			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
 				const char *hostname = fnvlist_lookup_string(
 				    mmp_label, ZPOOL_CONFIG_HOSTNAME);
 				fnvlist_add_string(spa->spa_load_info,
 				    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
 			}
 
 			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
 				uint64_t hostid = fnvlist_lookup_uint64(
 				    mmp_label, ZPOOL_CONFIG_HOSTID);
 				fnvlist_add_uint64(spa->spa_load_info,
 				    ZPOOL_CONFIG_MMP_HOSTID, hostid);
 			}
 		}
 
 		/* An MMP_TXG of 0 is the "known active" marker tested above. */
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_TXG, 0);
 
 		error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
 	}
 
 	if (mmp_label)
 		nvlist_free(mmp_label);
 
 	return (error);
 }
 
 /*
  * Called from zfs_ioc_clear for a pool that was suspended
  * after failing mmp write checks.
  *
  * Returns B_TRUE when another host may be writing to the pool.  That is the
  * answer when no label could be loaded, when the label carries a different
  * hostid, when the on-disk MMP settings are not valid or have a fail
  * interval of zero, when the best on-disk uberblock differs from the
  * in-core one, or when the activity check itself reports anything but
  * success.
  */
 boolean_t
 spa_mmp_remote_host_activity(spa_t *spa)
 {
 	ASSERT(spa_multihost(spa) && spa_suspended(spa));
 
 	uberblock_t disk_ub;
 	nvlist_t *disk_label;
 
 	/*
 	 * Locate the best uberblock on disk
 	 */
 	vdev_uberblock_load(spa->spa_root_vdev, &disk_ub, &disk_label);
 	if (disk_label == NULL)
 		return (B_TRUE);
 
 	/*
 	 * confirm that the best hostid matches our hostid
 	 */
 	boolean_t other_host = B_FALSE;
 	if (nvlist_exists(disk_label, ZPOOL_CONFIG_HOSTID) &&
 	    spa_get_hostid(spa) !=
 	    fnvlist_lookup_uint64(disk_label, ZPOOL_CONFIG_HOSTID))
 		other_host = B_TRUE;
 	nvlist_free(disk_label);
 	if (other_host)
 		return (B_TRUE);
 
 	if (!(MMP_VALID(&disk_ub) && MMP_FAIL_INT_VALID(&disk_ub) &&
 	    MMP_FAIL_INT(&disk_ub) != 0))
 		return (B_TRUE);
 
 	uberblock_t *core_ub = &spa->spa_uberblock;
 
 	if (disk_ub.ub_txg != core_ub->ub_txg ||
 	    disk_ub.ub_timestamp != core_ub->ub_timestamp) {
 		zfs_dbgmsg("txg mismatch detected during pool clear "
 		    "txg %llu ub_txg %llu timestamp %llu ub_timestamp %llu",
 		    (u_longlong_t)core_ub->ub_txg,
 		    (u_longlong_t)disk_ub.ub_txg,
 		    (u_longlong_t)core_ub->ub_timestamp,
 		    (u_longlong_t)disk_ub.ub_timestamp);
 		return (B_TRUE);
 	}
 
 	/*
 	 * Perform an activity check looking for any remote writer
 	 */
 	return (spa_activity_check(spa, core_ub, spa->spa_config,
 	    B_FALSE) != 0);
 }
 
 /*
  * Refuse to load a non-root pool whose MOS config records a hostid that
  * differs from this system's.  The check is skipped for root pools, when
  * the config has no hostid, and when either hostid is zero.  Returns 0 when
  * the load may continue, otherwise EBADF after logging a warning.
  */
 static int
 spa_verify_host(spa_t *spa, nvlist_t *mos_config)
 {
 	uint64_t pool_hostid;
 	uint64_t local_hostid = 0;
 	const char *pool_hostname;
 
 	if (spa_is_root(spa))
 		return (0);
 
 	if (nvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID,
 	    &pool_hostid) != 0)
 		return (0);
 
 	pool_hostname = fnvlist_lookup_string(mos_config,
 	    ZPOOL_CONFIG_HOSTNAME);
 	local_hostid = zone_get_hostid(NULL);
 
 	if (pool_hostid == 0 || local_hostid == 0 ||
 	    pool_hostid == local_hostid)
 		return (0);
 
 	cmn_err(CE_WARN, "pool '%s' could not be "
 	    "loaded as it was last accessed by "
 	    "another system (host: %s hostid: 0x%llx). "
 	    "See: https://openzfs.github.io/openzfs-docs/msg/"
 	    "ZFS-8000-EY",
 	    spa_name(spa), pool_hostname, (u_longlong_t)pool_hostid);
 	spa_load_failed(spa, "hostid verification failed: pool "
 	    "last accessed by host: %s (hostid: 0x%llx)",
 	    pool_hostname, (u_longlong_t)pool_hostid);
 
 	return (SET_ERROR(EBADF));
 }
 
 /*
  * Parse the configuration nvlist in spa->spa_config into in-core state.
  * This picks up the version, pool guid, comment, compatibility string,
  * config txg and any pending split list, allocates a fresh spa_load_info,
  * creates the per-CPU async root zios and builds the vdev tree.
  *
  * Returns 0 on success, EINVAL when the pool guid or vdev tree is missing,
  * EEXIST when a pool with the same guid is already open during an import,
  * or the error from spa_config_parse().
  */
 static int
 spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
 {
 	int error = 0;
 	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
 	int parse;
 	vdev_t *rvd;
 	uint64_t pool_guid;
 	const char *comment;
 	const char *compatibility;
 
 	/*
 	 * Versioning wasn't explicitly added to the label until later, so if
 	 * it's not present treat it as the initial version.
 	 */
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
 	    &spa->spa_ubsync.ub_version) != 0)
 		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
 
 	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
 		spa_load_failed(spa, "invalid config provided: '%s' missing",
 		    ZPOOL_CONFIG_POOL_GUID);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If we are doing an import, ensure that the pool is not already
 	 * imported by checking if its pool guid already exists in the
 	 * spa namespace.
 	 *
 	 * The only case that we allow an already imported pool to be
 	 * imported again, is when the pool is checkpointed and we want to
 	 * look at its checkpointed state from userland tools like zdb.
 	 */
 	/* Only the non-kernel build has the read-only checkpoint exception. */
 #ifdef _KERNEL
 	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
 	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
 	    spa_guid_exists(pool_guid, 0)) {
 #else
 	if ((spa->spa_load_state == SPA_LOAD_IMPORT ||
 	    spa->spa_load_state == SPA_LOAD_TRYIMPORT) &&
 	    spa_guid_exists(pool_guid, 0) &&
 	    !spa_importing_readonly_checkpoint(spa)) {
 #endif
 		spa_load_failed(spa, "a pool with guid %llu is already open",
 		    (u_longlong_t)pool_guid);
 		return (SET_ERROR(EEXIST));
 	}
 
 	spa->spa_config_guid = pool_guid;
 
 	/* Start this load with an empty load_info nvlist. */
 	nvlist_free(spa->spa_load_info);
 	spa->spa_load_info = fnvlist_alloc();
 
 	ASSERT0P(spa->spa_comment);
 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0)
 		spa->spa_comment = spa_strdup(comment);
 
 	ASSERT0P(spa->spa_compatibility);
 	if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMPATIBILITY,
 	    &compatibility) == 0)
 		spa->spa_compatibility = spa_strdup(compatibility);
 
 	(void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
 	    &spa->spa_config_txg);
 
 	/* Keep a private copy of the split list, if the config has one. */
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0)
 		spa->spa_config_splitting = fnvlist_dup(nvl);
 
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) {
 		spa_load_failed(spa, "invalid config provided: '%s' missing",
 		    ZPOOL_CONFIG_VDEV_TREE);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	/* One root zio per CPU, max_ncpus in total. */
 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 	    KM_SLEEP);
 	for (int i = 0; i < max_ncpus; i++) {
 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	/*
 	 * Parse the configuration into a vdev tree.  We explicitly set the
 	 * value that will be returned by spa_version() since parsing the
 	 * configuration requires knowing the version number.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/* Import types other than EXISTING are parsed as a split. */
 	parse = (type == SPA_IMPORT_EXISTING ?
 	    VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT);
 	error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0) {
 		spa_load_failed(spa, "unable to parse config [error=%d]",
 		    error);
 		return (error);
 	}
 
 	ASSERT(spa->spa_root_vdev == rvd);
 	ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT);
 	ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT);
 
 	if (type != SPA_IMPORT_ASSEMBLE) {
 		ASSERT(spa_guid(spa) == pool_guid);
 	}
 
 	return (0);
 }
 
 /*
  * Recursively open all vdevs in the vdev tree. This function is called twice:
  * first with the untrusted config, then with the trusted config.
  *
  * Returns the error from vdev_open(), or ENXIO when top-level vdevs are
  * missing while a trusted config is being opened for writing.
  */
 static int
 spa_ld_open_vdevs(spa_t *spa)
 {
 	int error;
 
 	/*
 	 * spa_missing_tvds_allowed defines how many top-level vdevs can be
 	 * missing/unopenable for the root vdev to be still considered openable.
 	 */
 	if (spa->spa_trust_config) {
 		spa->spa_missing_tvds_allowed = zfs_max_missing_tvds;
 	} else {
 		switch (spa->spa_config_source) {
 		case SPA_CONFIG_SRC_CACHEFILE:
 			spa->spa_missing_tvds_allowed =
 			    zfs_max_missing_tvds_cachefile;
 			break;
 		case SPA_CONFIG_SRC_SCAN:
 			spa->spa_missing_tvds_allowed =
 			    zfs_max_missing_tvds_scan;
 			break;
 		default:
 			spa->spa_missing_tvds_allowed = 0;
 			break;
 		}
 	}
 
 	/* zfs_max_missing_tvds always acts as a lower bound. */
 	spa->spa_missing_tvds_allowed =
 	    MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	error = vdev_open(spa->spa_root_vdev);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (spa->spa_missing_tvds != 0) {
 		spa_load_note(spa, "vdev tree has %lld missing top-level "
 		    "vdevs.", (u_longlong_t)spa->spa_missing_tvds);
 
 		if (!spa->spa_trust_config ||
 		    !(spa->spa_mode & SPA_MODE_WRITE)) {
 			spa_load_note(spa, "current settings allow for maximum "
 			    "%lld missing top-level vdevs at this stage.",
 			    (u_longlong_t)spa->spa_missing_tvds_allowed);
 		} else {
 			/*
 			 * Although theoretically we could allow users to open
 			 * incomplete pools in RW mode, we'd need to add a lot
 			 * of extra logic (e.g. adjust pool space to account
 			 * for missing vdevs).
 			 * This limitation also prevents users from accidentally
 			 * opening the pool in RW mode during data recovery and
 			 * damaging it further.
 			 */
 			spa_load_note(spa, "pools with missing top-level "
 			    "vdevs can only be opened in read-only mode.");
 			error = SET_ERROR(ENXIO);
 		}
 	}
 
 	if (error != 0) {
 		spa_load_failed(spa, "unable to open vdev tree [error=%d]",
 		    error);
 	}
 
 	/* Dump the tree whenever something is missing or the open failed. */
 	if (error != 0 || spa->spa_missing_tvds != 0)
 		vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2);
 
 	return (error);
 }
 
 /*
  * Check the vdev labels against the configuration currently in hand.  The
  * load sequence runs this once with the untrusted config and once with the
  * trusted config; validation is stricter in the trusted case.
  *
  * Returns 0 on success, the vdev_validate() error, or ENXIO when the root
  * vdev is left in a state no better than VDEV_STATE_CANT_OPEN.
  */
 static int
 spa_ld_validate_vdevs(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	err = vdev_validate(root);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (err == 0) {
 		if (root->vdev_state > VDEV_STATE_CANT_OPEN)
 			return (0);
 
 		spa_load_failed(spa, "cannot open vdev tree after invalidating "
 		    "some vdevs");
 		vdev_dbgmsg_print_tree(root, 2);
 		return (SET_ERROR(ENXIO));
 	}
 
 	spa_load_failed(spa, "vdev_validate failed [error=%d]", err);
 	return (err);
 }
 
 /*
  * Finish uberblock selection: mark the pool active, copy spa_uberblock into
  * spa_ubsync, and derive the txg fields used by the rest of the load.  The
  * statements are order-sensitive because spa_last_synced_txg() is consulted
  * only after spa_ubsync has been updated.
  */
 static void
 spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub)
 {
 	spa->spa_state = POOL_STATE_ACTIVE;
 	spa->spa_ubsync = spa->spa_uberblock;
 
 	if (spa->spa_extreme_rewind) {
 		spa->spa_verify_min_txg = TXG_INITIAL - 1;
 	} else {
 		spa->spa_verify_min_txg =
 		    spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1;
 	}
 
 	if (spa->spa_last_ubsync_txg) {
 		spa->spa_first_txg = spa->spa_last_ubsync_txg;
 	} else {
 		spa->spa_first_txg = spa_last_synced_txg(spa) + 1;
 	}
 
 	spa->spa_claim_max_txg = spa->spa_first_txg;
 	spa->spa_prev_software_version = ub->ub_software_version;
 }
 
 /*
  * Choose the uberblock the pool will be loaded from and vet it.
  *
  * Unless a read-only checkpoint import is in progress, the best uberblock
  * is loaded from the vdev labels into spa->spa_uberblock.  The function then
  * runs the multihost activity check when it is required, rejects
  * unsupported pool versions and unsupported features_for_read entries,
  * runs spa_try_repair() when a split config is pending (and the import is
  * not an assemble), and finally calls spa_ld_select_uberblock_done().
  *
  * Returns 0 on success.  Failures are returned via spa_vdev_err() or as the
  * error from spa_activity_check().  Once vdev_uberblock_load() has produced
  * the label nvlist, it is released on every exit path.
  */
 static int
 spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	nvlist_t *label;
 	uberblock_t *ub = &spa->spa_uberblock;
 	boolean_t activity_check = B_FALSE;
 
 	/*
 	 * If we are opening the checkpointed state of the pool by
 	 * rewinding to it, at this point we will have written the
 	 * checkpointed uberblock to the vdev labels, so searching
 	 * the labels will find the right uberblock.  However, if
 	 * we are opening the checkpointed state read-only, we have
 	 * not modified the labels. Therefore, we must ignore the
 	 * labels and continue using the spa_uberblock that was set
 	 * by spa_ld_checkpoint_rewind.
 	 *
 	 * Note that it would be fine to ignore the labels when
 	 * rewinding (opening writeable) as well. However, if we
 	 * crash just after writing the labels, we will end up
 	 * searching the labels. Doing so in the common case means
 	 * that this code path gets exercised normally, rather than
 	 * just in the edge case.
 	 */
 	if (ub->ub_checkpoint_txg != 0 &&
 	    spa_importing_readonly_checkpoint(spa)) {
 		spa_ld_select_uberblock_done(spa, ub);
 		return (0);
 	}
 
 	/*
 	 * Find the best uberblock.
 	 */
 	vdev_uberblock_load(rvd, ub, &label);
 
 	/*
 	 * If we weren't able to find a single valid uberblock, return failure.
 	 */
 	if (ub->ub_txg == 0) {
 		nvlist_free(label);
 		spa_load_failed(spa, "no valid uberblock found");
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO));
 	}
 
 	/* UINT64_MAX means no max txg was set; otherwise publish it. */
 	if (spa->spa_load_max_txg != UINT64_MAX) {
 		(void) spa_import_progress_set_max_txg(spa_guid(spa),
 		    (u_longlong_t)spa->spa_load_max_txg);
 	}
 	spa_load_note(spa, "using uberblock with txg=%llu",
 	    (u_longlong_t)ub->ub_txg);
 	if (ub->ub_raidz_reflow_info != 0) {
 		/*
 		 * NOTE(review): the state is printed with %u but the argument
 		 * is cast to int; the signedness does not match the format.
 		 */
 		spa_load_note(spa, "uberblock raidz_reflow_info: "
 		    "state=%u offset=%llu",
 		    (int)RRSS_GET_STATE(ub),
 		    (u_longlong_t)RRSS_GET_OFFSET(ub));
 	}
 
 
 	/*
 	 * For pools which have the multihost property on determine if the
 	 * pool is truly inactive and can be safely imported.  Prevent
 	 * hosts which don't have a hostid set from importing the pool.
 	 *
 	 * NOTE(review): label is handed to spa_activity_check_required()
 	 * before the label == NULL test further down; presumably the callee
 	 * tolerates a NULL label -- confirm.
 	 */
 	activity_check = spa_activity_check_required(spa, ub, label,
 	    spa->spa_config);
 	if (activity_check) {
 		if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay &&
 		    spa_get_hostid(spa) == 0) {
 			nvlist_free(label);
 			fnvlist_add_uint64(spa->spa_load_info,
 			    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
 			return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO));
 		}
 
 		int error =
 		    spa_activity_check(spa, ub, spa->spa_config, B_TRUE);
 		if (error) {
 			nvlist_free(label);
 			return (error);
 		}
 
 		/* The check passed: record the MMP state in the load info. */
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE);
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_TXG, ub->ub_txg);
 		fnvlist_add_uint16(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_SEQ,
 		    (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0));
 	}
 
 	/*
 	 * If the pool has an unsupported version we can't open it.
 	 */
 	if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) {
 		nvlist_free(label);
 		spa_load_failed(spa, "version %llu is not supported",
 		    (u_longlong_t)ub->ub_version);
 		return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP));
 	}
 
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *features;
 
 		/*
 		 * If we weren't able to find what's necessary for reading the
 		 * MOS in the label, return failure.
 		 */
 		if (label == NULL) {
 			/* Nothing to free: label is NULL on this path. */
 			spa_load_failed(spa, "label config unavailable");
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    ENXIO));
 		}
 
 		if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ,
 		    &features) != 0) {
 			nvlist_free(label);
 			spa_load_failed(spa, "invalid label: '%s' missing",
 			    ZPOOL_CONFIG_FEATURES_FOR_READ);
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA,
 			    ENXIO));
 		}
 
 		/*
 		 * Update our in-core representation with the definitive values
 		 * from the label.
 		 */
 		nvlist_free(spa->spa_label_features);
 		spa->spa_label_features = fnvlist_dup(features);
 	}
 
 	/* The label is not referenced past this point. */
 	nvlist_free(label);
 
 	/*
 	 * Look through entries in the label nvlist's features_for_read. If
 	 * there is a feature listed there which we don't understand then we
 	 * cannot open a pool.
 	 */
 	if (ub->ub_version >= SPA_VERSION_FEATURES) {
 		nvlist_t *unsup_feat;
 
 		unsup_feat = fnvlist_alloc();
 
 		for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features,
 		    NULL); nvp != NULL;
 		    nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) {
 			if (!zfeature_is_supported(nvpair_name(nvp))) {
 				fnvlist_add_string(unsup_feat,
 				    nvpair_name(nvp), "");
 			}
 		}
 
 		if (!nvlist_empty(unsup_feat)) {
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 			nvlist_free(unsup_feat);
 			spa_load_failed(spa, "some features are unsupported");
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		nvlist_free(unsup_feat);
 	}
 
 	/*
 	 * A pending split config is consumed here: spa_try_repair() runs
 	 * under the config lock, then the split config is freed and cleared.
 	 */
 	if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) {
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_try_repair(spa, spa->spa_config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		nvlist_free(spa->spa_config_splitting);
 		spa->spa_config_splitting = NULL;
 	}
 
 	/*
 	 * Initialize internal SPA structures.
 	 */
 	spa_ld_select_uberblock_done(spa, ub);
 
 	return (0);
 }
 
 /*
  * Initialize the DSL pool at spa_first_txg and cache its meta objset in
  * spa_meta_objset.  Any dsl_pool_init() failure is logged and reported as
  * EIO through spa_vdev_err().
  */
 static int
 spa_ld_open_rootbp(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	err = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool);
 	if (err == 0) {
 		spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset;
 		return (0);
 	}
 
 	spa_load_failed(spa, "unable to open rootbp in dsl_pool_init "
 	    "[error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 }
 
 /*
  * Switch the load over to the config stored in the MOS.
  *
  * The MOS config object is located and read, a new vdev tree is parsed from
  * it, vdev paths are copied over from the provided tree, and the provided
  * tree is closed and freed.  spa_config is then regenerated, the config is
  * marked trusted, and the new tree is opened and validated.
  *
  * type:	for SPA_IMPORT_ASSEMBLE only the config object is located;
  *		the provided config is already trusted in that case.
  * reloading:	when set and the provided config turns out to miss too many
  *		top-level vdevs, the load fails with EIO instead of asking
  *		for a reload.
  *
  * Returns 0 on success, EAGAIN when the caller should reload using the MOS
  * config, or another error on failure.
  */
 static int
 spa_ld_trusted_config(spa_t *spa, spa_import_type_t type,
     boolean_t reloading)
 {
 	vdev_t *mrvd, *rvd = spa->spa_root_vdev;
 	nvlist_t *nv, *mos_config, *policy;
 	int error = 0, copy_error;
 	uint64_t healthy_tvds, healthy_tvds_mos;
 	uint64_t mos_config_txg;
 
 	if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE)
 	    != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * If we're assembling a pool from a split, the config provided is
 	 * already trusted so there is nothing to do.
 	 */
 	if (type == SPA_IMPORT_ASSEMBLE)
 		return (0);
 
 	/*
 	 * Count taken while the provided tree is still installed; it is
 	 * compared with the count for the MOS-derived tree further down.
 	 */
 	healthy_tvds = spa_healthy_core_tvds(spa);
 
 	if (load_nvlist(spa, spa->spa_config_object, &mos_config)
 	    != 0) {
 		spa_load_failed(spa, "unable to retrieve MOS config");
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * If we are doing an open, pool owner wasn't verified yet, thus do
 	 * the verification here.
 	 */
 	if (spa->spa_load_state == SPA_LOAD_OPEN) {
 		error = spa_verify_host(spa, mos_config);
 		if (error != 0) {
 			nvlist_free(mos_config);
 			return (error);
 		}
 	}
 
 	nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE);
 
 	/* Held across the parse, path copy and root vdev swap below. */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * Build a new vdev tree from the trusted config
 	 */
 	error = spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD);
 	if (error != 0) {
 		nvlist_free(mos_config);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa_load_failed(spa, "spa_config_parse failed [error=%d]",
 		    error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
 	}
 
 	/*
 	 * Vdev paths in the MOS may be obsolete. If the untrusted config was
 	 * obtained by scanning /dev/dsk, then it will have the right vdev
 	 * paths. We update the trusted MOS config with this information.
 	 * We first try to copy the paths with vdev_copy_path_strict, which
 	 * succeeds only when both configs have exactly the same vdev tree.
 	 * If that fails, we fall back to a more flexible method that has a
 	 * best effort policy.
 	 */
 	copy_error = vdev_copy_path_strict(rvd, mrvd);
 	if (copy_error != 0 || spa_load_print_vdev_tree) {
 		spa_load_note(spa, "provided vdev tree:");
 		vdev_dbgmsg_print_tree(rvd, 2);
 		spa_load_note(spa, "MOS vdev tree:");
 		vdev_dbgmsg_print_tree(mrvd, 2);
 	}
 	if (copy_error != 0) {
 		spa_load_note(spa, "vdev_copy_path_strict failed, falling "
 		    "back to vdev_copy_path_relaxed");
 		vdev_copy_path_relaxed(rvd, mrvd);
 	}
 
 	/* Retire the provided tree; rvd refers to the MOS tree from here. */
 	vdev_close(rvd);
 	vdev_free(rvd);
 	spa->spa_root_vdev = mrvd;
 	rvd = mrvd;
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * If 'zpool import' used a cached config, then the on-disk hostid and
 	 * hostname may be different to the cached config in ways that should
 	 * prevent import.  Userspace can't discover this without a scan, but
 	 * we know, so we add these values to LOAD_INFO so the caller can know
 	 * the difference.
 	 *
 	 * Note that we have to do this before the config is regenerated,
 	 * because the new config will have the hostid and hostname for this
 	 * host, in readiness for import.
 	 */
 	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTID))
 		fnvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_HOSTID,
 		    fnvlist_lookup_uint64(mos_config, ZPOOL_CONFIG_HOSTID));
 	if (nvlist_exists(mos_config, ZPOOL_CONFIG_HOSTNAME))
 		fnvlist_add_string(spa->spa_load_info, ZPOOL_CONFIG_HOSTNAME,
 		    fnvlist_lookup_string(mos_config, ZPOOL_CONFIG_HOSTNAME));
 
 	/*
 	 * We will use spa_config if we decide to reload the spa or if spa_load
 	 * fails and we rewind. We must thus regenerate the config using the
 	 * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to
 	 * pass settings on how to load the pool and is not stored in the MOS.
 	 * We copy it over to our new, trusted config.
 	 *
 	 * The mos_config variable is reused below: the nvlist read from the
 	 * MOS is freed and the name then holds the regenerated config.
 	 * NOTE(review): the regenerated nvlist is not freed in this function;
 	 * presumably spa_config_set() takes ownership -- confirm.
 	 */
 	mos_config_txg = fnvlist_lookup_uint64(mos_config,
 	    ZPOOL_CONFIG_POOL_TXG);
 	nvlist_free(mos_config);
 	mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE);
 	if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY,
 	    &policy) == 0)
 		fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy);
 	spa_config_set(spa, mos_config);
 	spa->spa_config_source = SPA_CONFIG_SRC_MOS;
 
 	/*
 	 * Now that we got the config from the MOS, we should be more strict
 	 * in checking blkptrs and can make assumptions about the consistency
 	 * of the vdev tree. spa_trust_config must be set to true before opening
 	 * vdevs in order for them to be writeable.
 	 */
 	spa->spa_trust_config = B_TRUE;
 
 	/*
 	 * Open and validate the new vdev tree
 	 */
 	error = spa_ld_open_vdevs(spa);
 	if (error != 0)
 		return (error);
 
 	error = spa_ld_validate_vdevs(spa);
 	if (error != 0)
 		return (error);
 
 	if (copy_error != 0 || spa_load_print_vdev_tree) {
 		spa_load_note(spa, "final vdev tree:");
 		vdev_dbgmsg_print_tree(rvd, 2);
 	}
 
 	if (spa->spa_load_state != SPA_LOAD_TRYIMPORT &&
 	    !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) {
 		/*
 		 * Sanity check to make sure that we are indeed loading the
 		 * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds
 		 * in the config provided and they happened to be the only ones
 		 * to have the latest uberblock, we could involuntarily perform
 		 * an extreme rewind.
 		 *
 		 * NOTE(review): both counters are uint64_t, so the subtraction
 		 * below wraps if healthy_tvds exceeds healthy_tvds_mos;
 		 * presumably that cannot happen here -- confirm.
 		 */
 		healthy_tvds_mos = spa_healthy_core_tvds(spa);
 		if (healthy_tvds_mos - healthy_tvds >=
 		    SPA_SYNC_MIN_VDEVS) {
 			spa_load_note(spa, "config provided misses too many "
 			    "top-level vdevs compared to MOS (%lld vs %lld). ",
 			    (u_longlong_t)healthy_tvds,
 			    (u_longlong_t)healthy_tvds_mos);
 			spa_load_note(spa, "vdev tree:");
 			vdev_dbgmsg_print_tree(rvd, 2);
 			if (reloading) {
 				spa_load_failed(spa, "config was already "
 				    "provided from MOS. Aborting.");
 				return (spa_vdev_err(rvd,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 			spa_load_note(spa, "spa must be reloaded using MOS "
 			    "config");
 			return (SET_ERROR(EAGAIN));
 		}
 	}
 
 	error = spa_check_for_missing_logs(spa);
 	if (error != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
 
 	/* The MOS tree's guid sum must agree with the chosen uberblock. */
 	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
 		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
 		    "guid sum (%llu != %llu)",
 		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
 		    (u_longlong_t)rvd->vdev_guid_sum);
 		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
 		    ENXIO));
 	}
 
 	return (0);
 }
 
 /*
  * Run spa_remove_init() followed by spa_condense_init().  A failure of the
  * former is reported as EIO, a failure of the latter with its own error
  * code; both go through spa_vdev_err().
  */
 static int
 spa_ld_open_indirect_vdev_metadata(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	/*
 	 * Anything read before spa_remove_init() has to live on concrete
 	 * vdevs, which is why this step comes as early as possible.
 	 */
 	if ((err = spa_remove_init(spa)) != 0) {
 		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
 		    err);
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * Fetch the state needed to condense indirect vdev mappings.
 	 */
 	if ((err = spa_condense_init(spa)) != 0) {
 		spa_load_failed(spa, "spa_condense_init failed [error=%d]",
 		    err);
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, err));
 	}
 
 	return (0);
 }
 
 /*
  * Verify that the features used by the pool are supported and load the
  * per-feature refcounts into spa_feat_refcount_cache.
  *
  * For pools at SPA_VERSION_FEATURES or later this locates the
  * features-for-read, features-for-write and feature-descriptions objects,
  * checks the read features always and the write features when the pool is
  * writeable or the load is a tryimport, and publishes the enabled and
  * unsupported feature lists in spa_load_info.
  *
  * missing_feat_writep:	set to B_TRUE when the write-feature check fails;
  *			this function never clears it.
  *
  * Returns 0 on success, otherwise the result of spa_vdev_err() (ENOTSUP for
  * unsupported features, EIO for lookup failures).
  */
 static int
 spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
 {
 	int error = 0;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
 		boolean_t missing_feat_read = B_FALSE;
 		nvlist_t *unsup_feat, *enabled_feat;
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
 		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
 		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
 		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 		}
 
 		/* Both lists are shared by the read and write checks below. */
 		enabled_feat = fnvlist_alloc();
 		unsup_feat = fnvlist_alloc();
 
 		if (!spa_features_check(spa, B_FALSE,
 		    unsup_feat, enabled_feat))
 			missing_feat_read = B_TRUE;
 
 		if (spa_writeable(spa) ||
 		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
 			if (!spa_features_check(spa, B_TRUE,
 			    unsup_feat, enabled_feat)) {
 				*missing_feat_writep = B_TRUE;
 			}
 		}
 
 		fnvlist_add_nvlist(spa->spa_load_info,
 		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
 
 		if (!nvlist_empty(unsup_feat)) {
 			fnvlist_add_nvlist(spa->spa_load_info,
 			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
 		}
 
 		fnvlist_free(enabled_feat);
 		fnvlist_free(unsup_feat);
 
 		/* No read feature is missing: flag read-only as possible. */
 		if (!missing_feat_read) {
 			fnvlist_add_boolean(spa->spa_load_info,
 			    ZPOOL_CONFIG_CAN_RDONLY);
 		}
 
 		/*
 		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
 		 * twofold: to determine whether the pool is available for
 		 * import in read-write mode and (if it is not) whether the
 		 * pool is available for import in read-only mode. If the pool
 		 * is available for import in read-write mode, it is displayed
 		 * as available in userland; if it is not available for import
 		 * in read-only mode, it is displayed as unavailable in
 		 * userland. If the pool is available for import in read-only
 		 * mode but not read-write mode, it is displayed as unavailable
 		 * in userland with a special note that the pool is actually
 		 * available for open in read-only mode.
 		 *
 		 * As a result, if the state is SPA_LOAD_TRYIMPORT and we are
 		 * missing a feature for write, we must first determine whether
 		 * the pool can be opened read-only before returning to
 		 * userland in order to know whether to display the
 		 * abovementioned note.
 		 */
 		if (missing_feat_read || (*missing_feat_writep &&
 		    spa_writeable(spa))) {
 			spa_load_failed(spa, "pool uses unsupported features");
 			return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT,
 			    ENOTSUP));
 		}
 
 		/*
 		 * Load refcounts for ZFS features from disk into an in-memory
 		 * cache during SPA initialization.
 		 */
 		for (spa_feature_t i = 0; i < SPA_FEATURES; i++) {
 			uint64_t refcount;
 
 			error = feature_get_refcount_from_disk(spa,
 			    &spa_feature_table[i], &refcount);
 			if (error == 0) {
 				spa->spa_feat_refcount_cache[i] = refcount;
 			} else if (error == ENOTSUP) {
 				/* ENOTSUP is cached as a disabled feature. */
 				spa->spa_feat_refcount_cache[i] =
 				    SPA_FEATURE_DISABLED;
 			} else {
 				spa_load_failed(spa, "error getting refcount "
 				    "for feature %s [error=%d]",
 				    spa_feature_table[i].fi_guid, error);
 				return (spa_vdev_err(rvd,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 		}
 	}
 
 	/* The enabled-txg object is looked up only if the feature is active. */
 	if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) {
 		if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG,
 		    &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0)
 			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * Encryption was added before bookmark_v2, even though bookmark_v2
 	 * is now a dependency. If this pool has encryption enabled without
 	 * bookmark_v2, trigger an errata message.
 	 */
 	if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) &&
 	    !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) {
 		spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
 	}
 
 	return (0);
 }
 
 /*
  * Open the DSL pool.  spa_is_initializing is set only for the duration of
  * the dsl_pool_open() call.  A failure is logged and reported as EIO through
  * spa_vdev_err().
  */
 static int
 spa_ld_load_special_directories(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	spa->spa_is_initializing = B_TRUE;
 	err = dsl_pool_open(spa->spa_dsl_pool);
 	spa->spa_is_initializing = B_FALSE;
 
 	if (err == 0)
 		return (0);
 
 	spa_load_failed(spa, "dsl_pool_open failed [error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 }
 
 /*
  * Load pool-wide settings and object references from the MOS into the spa:
  * the checksum salt, the deferred-frees bpobj, a series of optional
  * directory entries (deflate flag, creation version, error logs, last
  * scrubbed txg, livelists to delete, history), the per-vdev ZAP map state,
  * and the pool properties object.
  *
  * Optional directory entries may be absent (ENOENT); any other lookup
  * failure is reported as EIO through spa_vdev_err().  Returns 0 on success.
  */
 static int
 spa_ld_get_props(spa_t *spa)
 {
 	int error = 0;
 	uint64_t obj;
 	vdev_t *rvd = spa->spa_root_vdev;
 
 	/* Grab the checksum salt from the MOS. */
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CHECKSUM_SALT, 1,
 	    sizeof (spa->spa_cksum_salt.zcs_bytes),
 	    spa->spa_cksum_salt.zcs_bytes);
 	if (error == ENOENT) {
 		/* Generate a new salt for subsequent use */
 		(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
 		    sizeof (spa->spa_cksum_salt.zcs_bytes));
 	} else if (error != 0) {
 		spa_load_failed(spa, "unable to retrieve checksum salt from "
 		    "MOS [error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/* Locate and open the deferred-frees bpobj. */
 	if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj);
 	if (error != 0) {
 		spa_load_failed(spa, "error opening deferred-frees bpobj "
 		    "[error=%d]", error);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * Load the bit that tells us to use the new accounting function
 	 * (raid-z deflation).  If we have an older pool, this will not
 	 * be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION,
 	    &spa->spa_creation_version, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/* Load time log */
 	spa_load_txg_log_time(spa);
 
 	/*
 	 * Load the persistent error log.  If we have an older pool, this will
 	 * not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last,
 	    B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB,
 	    &spa->spa_errlog_scrub, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/* Load the last scrubbed txg. */
 	error = spa_dir_prop(spa, DMU_POOL_LAST_SCRUBBED_TXG,
 	    &spa->spa_scrubbed_last_txg, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the livelist deletion field. If a livelist is queued for
 	 * deletion, indicate that in the spa
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES,
 	    &spa->spa_livelists_to_delete, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the history object.  If we have an older pool, this
 	 * will not be present.
 	 */
 	error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE);
 	if (error != 0 && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	/*
 	 * Load the per-vdev ZAP map. If we have an older pool, this will not
 	 * be present; in this case, defer its creation to a later time to
 	 * avoid dirtying the MOS this early / out of sync context. See
 	 * spa_sync_config_object.
 	 */
 
 	/* The sentinel is only available in the MOS config. */
 	nvlist_t *mos_config;
 	if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) {
 		spa_load_failed(spa, "unable to retrieve MOS config");
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP,
 	    &spa->spa_all_vdev_zaps, B_FALSE);
 
 	/*
 	 * Three outcomes are acted on: no map (initialize it later), a lookup
 	 * failure (bail out), or a map without the sentinel (destroy later).
 	 * A map with the sentinel present needs no action here.  mos_config
 	 * is freed on the failure path and once more after the chain.
 	 */
 	if (error == ENOENT) {
 		VERIFY(!nvlist_exists(mos_config,
 		    ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
 		spa->spa_avz_action = AVZ_ACTION_INITIALIZE;
 		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
 	} else if (error != 0) {
 		nvlist_free(mos_config);
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 	} else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) {
 		/*
 		 * An older version of ZFS overwrote the sentinel value, so
 		 * we have orphaned per-vdev ZAPs in the MOS. Defer their
 		 * destruction to later; see spa_sync_config_object.
 		 */
 		spa->spa_avz_action = AVZ_ACTION_DESTROY;
 		/*
 		 * We're assuming that no vdevs have had their ZAPs created
 		 * before this. Better be sure of it.
 		 */
 		ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev));
 	}
 	nvlist_free(mos_config);
 
 	/* Start from the default; a stored property may override it below. */
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 
 	error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object,
 	    B_FALSE);
 	if (error && error != ENOENT)
 		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
 
 	if (error == 0) {
 		uint64_t autoreplace = 0;
 
 		/* Return values of spa_prop_find() are not checked here. */
 		spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace);
 		spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation);
 		spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand);
 		spa_prop_find(spa, ZPOOL_PROP_DEDUP_TABLE_QUOTA,
 		    &spa->spa_dedup_table_quota);
 		spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost);
 		spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim);
 		spa->spa_autoreplace = (autoreplace != 0);
 	}
 
 	/*
 	 * If we are importing a pool with missing top-level vdevs,
 	 * we enforce that the pool doesn't panic or get suspended on
 	 * error since the likelihood of missing data is extremely high.
 	 */
 	if (spa->spa_missing_tvds > 0 &&
 	    spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE &&
 	    spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
 		spa_load_note(spa, "forcing failmode to 'continue' "
 		    "as some top level vdevs are missing");
 		spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE;
 	}
 
 	return (0);
 }
 
 /*
  * Look up the hot spare and L2ARC device objects in the MOS directory and,
  * when present, load them.
  *
  * When the pool is being assembled from the split-off vdevs of an existing
  * pool (SPA_IMPORT_ASSEMBLE), the spares and cache devices are not attached;
  * the corresponding sav_sync flag is set instead.
  *
  * Returns 0 on success, or EIO through spa_vdev_err() when a lookup fails
  * for a reason other than ENOENT or an nvlist cannot be loaded.
  */
 static int
 spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	/*
 	 * Hot spares.
 	 */
 	err = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object,
 	    B_FALSE);
 	if (err == 0) {
 		if (type == SPA_IMPORT_ASSEMBLE) {
 			spa->spa_spares.sav_sync = B_TRUE;
 		} else {
 			ASSERT(spa_version(spa) >= SPA_VERSION_SPARES);
 			if (load_nvlist(spa, spa->spa_spares.sav_object,
 			    &spa->spa_spares.sav_config) != 0) {
 				spa_load_failed(spa,
 				    "error loading spares nvlist");
 				return (spa_vdev_err(root,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa_load_spares(spa);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	} else if (err != ENOENT) {
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	/*
 	 * Level 2 ARC devices.
 	 */
 	err = spa_dir_prop(spa, DMU_POOL_L2CACHE,
 	    &spa->spa_l2cache.sav_object, B_FALSE);
 	if (err == 0) {
 		if (type == SPA_IMPORT_ASSEMBLE) {
 			spa->spa_l2cache.sav_sync = B_TRUE;
 		} else {
 			ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE);
 			if (load_nvlist(spa, spa->spa_l2cache.sav_object,
 			    &spa->spa_l2cache.sav_config) != 0) {
 				spa_load_failed(spa,
 				    "error loading l2cache nvlist");
 				return (spa_vdev_err(root,
 				    VDEV_AUX_CORRUPT_DATA, EIO));
 			}
 
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa_load_l2cache(spa);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	} else if (err != ENOENT) {
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 	}
 
 	return (0);
 }
 
 /*
  * Load per-vdev metadata (metaslabs, DTLs, space map objects and so on) and
  * the log space maps, then reassess DTLs from the root vdev down.
  *
  * Before doing so, refuse the import with EREMOTEIO when multihost is on,
  * the system hostid is zero and ZFS_IMPORT_SKIP_MMP is not set, and run the
  * removed-device checks when autoreplace is enabled.
  *
  * Returns 0 on success or the result of spa_vdev_err() on failure.
  */
 static int
 spa_ld_load_vdev_metadata(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	/*
 	 * With the 'multihost' property set, a pool must never be imported
 	 * while the system hostid is zero.  zdb is the exception; it is
 	 * always allowed to access pools.
 	 */
 	if (spa_multihost(spa) && spa_get_hostid(spa) == 0 &&
 	    (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) {
 		fnvlist_add_uint64(spa->spa_load_info,
 		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID);
 		return (spa_vdev_err(root, VDEV_AUX_ACTIVE, EREMOTEIO));
 	}
 
 	/*
 	 * With 'autoreplace' set, post a resource telling the ZFS DE not to
 	 * issue faults for unopenable devices, and post a sysevent for each
 	 * unopenable vdev so the normal autoreplace handler can take over.
 	 */
 	if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) {
 		spa_check_removed(spa->spa_root_vdev);
 
 		/*
 		 * On import the aux devices are handled in spa_import()
 		 * instead, since at this point the spare definitions come
 		 * from the MOS config and not necessarily from the userland
 		 * config.
 		 */
 		if (spa->spa_load_state != SPA_LOAD_IMPORT) {
 			spa_aux_check_removed(&spa->spa_spares);
 			spa_aux_check_removed(&spa->spa_l2cache);
 		}
 	}
 
 	/*
 	 * Vdev metadata: metaslabs, DTLs, spacemap objects, etc.
 	 */
 	if ((err = vdev_load(root)) != 0) {
 		spa_load_failed(spa, "vdev_load failed [error=%d]", err);
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, err));
 	}
 
 	if ((err = spa_ld_log_spacemaps(spa)) != 0) {
 		spa_load_failed(spa, "spa_ld_log_spacemaps failed [error=%d]",
 		    err);
 		return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, err));
 	}
 
 	/*
 	 * Push the freshly loaded leaf DTLs all the way up the vdev tree.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	vdev_dtl_reassess(root, 0, 0, B_FALSE, B_FALSE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	return (0);
 }
 
 /*
  * Load the dedup tables via ddt_load().  A failure is logged and reported
  * as EIO through spa_vdev_err().
  */
 static int
 spa_ld_load_dedup_tables(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err = ddt_load(spa);
 
 	if (err == 0)
 		return (0);
 
 	spa_load_failed(spa, "ddt_load failed [error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 }
 
 /*
  * Load the BRT via brt_load().  A failure is logged and reported as EIO
  * through spa_vdev_err().
  */
 static int
 spa_ld_load_brt(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err = brt_load(spa);
 
 	if (err == 0)
 		return (0);
 
 	spa_load_failed(spa, "brt_load failed [error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, EIO));
 }
 
 /*
  * Run spa_check_logs() for writeable, non-assemble loads.
  *
  * If the check reports a problem while top-level vdevs are missing, the
  * condition is only noted and the load continues.  Otherwise *ereport is
  * set to FM_EREPORT_ZFS_LOG_REPLAY and ENXIO is returned through
  * spa_vdev_err().  Returns 0 in every other case.
  */
 static int
 spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, const char **ereport)
 {
 	vdev_t *root = spa->spa_root_vdev;
 
 	/* Nothing to verify for assembled or read-only pools. */
 	if (type == SPA_IMPORT_ASSEMBLE || !spa_writeable(spa))
 		return (0);
 
 	if (!spa_check_logs(spa))
 		return (0);
 
 	if (spa->spa_missing_tvds != 0) {
 		spa_load_note(spa, "spa_check_logs failed "
 		    "so dropping the logs");
 		return (0);
 	}
 
 	*ereport = FM_EREPORT_ZFS_LOG_REPLAY;
 	spa_load_failed(spa, "spa_check_logs failed");
 	return (spa_vdev_err(root, VDEV_AUX_BAD_LOG, ENXIO));
 }
 
 /*
  * With the pool successfully opened, confirm via spa_load_verify() that it
  * is ready to start pushing transactions.  The verification is skipped for
  * SPA_LOAD_TRYIMPORT.  A failure is logged and reported with its own error
  * code through spa_vdev_err().
  */
 static int
 spa_ld_verify_pool_data(spa_t *spa)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int err;
 
 	if (spa->spa_load_state == SPA_LOAD_TRYIMPORT)
 		return (0);
 
 	err = spa_load_verify(spa);
 	if (err == 0)
 		return (0);
 
 	spa_load_failed(spa, "spa_load_verify failed "
 	    "[error=%d]", err);
 	return (spa_vdev_err(root, VDEV_AUX_CORRUPT_DATA, err));
 }
 
 /*
  * Claim log blocks that have not been committed yet, then mark the log
  * state as SPA_LOG_GOOD.
  *
  * All claims must land in a single txg, so one transaction assigned to
  * spa_first_txg() is used for the whole dataset walk.  spa_claim_max_txg is
  * updated by spa_claim_notify(), invoked from the i/o done callback of
  * zil_claim_log_block().  The price of a rollback is that the log is
  * abandoned.
  */
 static void
 spa_ld_claim_log_blocks(spa_t *spa)
 {
 	dsl_pool_t *pool = spa_get_dsl(spa);
 	dmu_tx_t *claim_tx;
 
 	/* spa_claiming is set only for the duration of the walk. */
 	spa->spa_claiming = B_TRUE;
 
 	claim_tx = dmu_tx_create_assigned(pool, spa_first_txg(spa));
 	(void) dmu_objset_find_dp(pool, pool->dp_root_dir_obj,
 	    zil_claim, claim_tx, DS_FIND_CHILDREN);
 	dmu_tx_commit(claim_tx);
 
 	spa->spa_claiming = B_FALSE;
 
 	spa_set_log_state(spa, SPA_LOG_GOOD);
 }
 
 /*
  * Decide whether the pool config needs to be rewritten and, if so, queue an
  * asynchronous SPA_ASYNC_CONFIG_UPDATE request.
  *
  * An update is needed when the caller asks for one, when config_cache_txg
  * differs from spa_config_txg (stale cache), when the load is an import or
  * a recovery, when the import is verbatim (the in-core spa_config is
  * trusted and the disk labels get updated), or when any top-level vdev has
  * no metaslab array yet (see spa_vdev_add()).
  */
 static void
 spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg,
     boolean_t update_config_cache)
 {
 	vdev_t *root = spa->spa_root_vdev;
 	int update = B_FALSE;
 
 	if (update_config_cache ||
 	    config_cache_txg != spa->spa_config_txg ||
 	    spa->spa_load_state == SPA_LOAD_IMPORT ||
 	    spa->spa_load_state == SPA_LOAD_RECOVER ||
 	    (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) {
 		update = B_TRUE;
 	}
 
 	/* Look for top-level vdevs whose metaslab array is still unset. */
 	for (int c = 0; c < root->vdev_children; c++) {
 		vdev_t *tvd = root->vdev_child[c];
 
 		if (tvd->vdev_ms_array == 0)
 			update = B_TRUE;
 	}
 
 	if (!update)
 		return;
 
 	/*
 	 * The update is requested asynchronously because this may be the
 	 * root pool, in which case the config cache isn't writable yet.
 	 */
 	spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 }
 
 /*
  * Unload and deactivate the pool, then activate it again with the mode
  * it had before, so that the load sequence can be repeated.
  */
 static void
 spa_ld_prepare_for_reload(spa_t *spa)
 {
 	/*
 	 * spa_unload() resets spa_async_suspended to 0, so remember the
 	 * current value (together with the mode) before tearing down.  It
 	 * is put back before returning because spa_async_resume() might be
 	 * called later.
 	 */
 	int saved_suspended = spa->spa_async_suspended;
 	spa_mode_t saved_mode = spa->spa_mode;
 
 	spa_unload(spa);
 	spa_deactivate(spa);
 	spa_activate(spa, saved_mode);
 
 	spa->spa_async_suspended = saved_suspended;
 }
 
 /*
  * Look up the checkpointed uberblock in the MOS directory and, when one
  * exists, record its txg and timestamp in the spa.
  *
  * Returns 0 when there is no checkpoint entry (ENOENT from zap_lookup())
  * or when the values were recorded; any other zap_lookup() error is
  * returned unchanged.
  */
 static int
 spa_ld_read_checkpoint_txg(spa_t *spa)
 {
 	uberblock_t checkpoint;
 	int err;
 
 	ASSERT0(spa->spa_checkpoint_txg);
 	ASSERT(spa_namespace_held() ||
 	    spa->spa_load_thread == curthread);
 
 	err = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 	if (err != 0) {
 		/* A missing entry just means the pool has no checkpoint. */
 		return (err == ENOENT ? 0 : err);
 	}
 
 	ASSERT3U(checkpoint.ub_txg, !=, 0);
 	ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0);
 	ASSERT3U(checkpoint.ub_timestamp, !=, 0);
 
 	spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp;
 	spa->spa_checkpoint_txg = checkpoint.ub_txg;
 
 	return (0);
 }
 
 /*
  * First stage of a pool load: parse the provided config, open (and
  * normally validate) the vdevs, select an uberblock and open the root
  * block pointer.
  *
  * The namespace lock must be held and a config source must be set
  * (both asserted).  Returns 0 on success or the error of the first
  * step that failed.
  */
 static int
 spa_ld_mos_init(spa_t *spa, spa_import_type_t type)
 {
 	const boolean_t assembling = (type == SPA_IMPORT_ASSEMBLE);
 	int err;
 
 	ASSERT(spa_namespace_held());
 	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
 
 	/*
 	 * Unless we are assembling a pool following a split, the provided
 	 * config is never trusted: neither the blkptrs nor the vdev tree
 	 * in general.  This also effectively puts the spa in read-only
 	 * mode since spa_writeable() checks for spa_trust_config to be
 	 * true.  A trusted config is loaded from the MOS later.
 	 */
 	if (!assembling)
 		spa->spa_trust_config = B_FALSE;
 
 	/* Build a vdev tree from the provided config. */
 	if ((err = spa_ld_parse_config(spa, type)) != 0)
 		return (err);
 
 	spa_import_progress_add(spa);
 
 	/*
 	 * With the vdev tree in place, try to open each vdev.  This
 	 * involves opening the underlying physical device, retrieving its
 	 * geometry and probing the vdev with a dummy I/O.  The state of
 	 * each vdev is set based on the success of those operations, after
 	 * which we are ready to read from the vdevs.
 	 */
 	if ((err = spa_ld_open_vdevs(spa)) != 0)
 		return (err);
 
 	/*
 	 * Read the label of each vdev and check that the GUIDs stored
 	 * there match the GUIDs in the provided config.  A new pool being
 	 * assembled after a split has labels that haven't been updated
 	 * yet, so validation is skipped for it for now.
 	 */
 	if (!assembling && (err = spa_ld_validate_vdevs(spa)) != 0)
 		return (err);
 
 	/*
 	 * Read all vdev labels to find the best uberblock (i.e. latest,
 	 * unless spa_load_max_txg is set) and store it in spa_uberblock.
 	 * The list of features required to read blkptrs in the MOS comes
 	 * from the vdev label with the best uberblock, and we verify that
 	 * our version of zfs supports them all.
 	 */
 	if ((err = spa_ld_select_uberblock(spa, type)) != 0)
 		return (err);
 
 	/*
 	 * Hand that uberblock to the dsl_pool layer, which opens the root
 	 * blkptr.  This blkptr points to the latest version of the MOS and
 	 * allows us to read its contents.
 	 */
 	return (spa_ld_open_rootbp(spa));
 }
 
 /*
  * Replace the pool's in-core uberblock with the checkpointed uberblock
  * stored in the MOS and, if the pool is writeable, sync it out to the
  * vdev labels.
  *
  * The caller must hold the namespace lock and ZFS_IMPORT_CHECKPOINT must
  * be set in spa_import_flags (both asserted).
  *
  * Returns 0 on success, ZFS_ERR_NO_CHECKPOINT when the MOS has no
  * checkpoint entry (ENOENT from zap_lookup()), or the error returned by
  * zap_lookup() or vdev_config_sync().
  */
 static int
 spa_ld_checkpoint_rewind(spa_t *spa)
 {
 	uberblock_t checkpoint;
 	int error = 0;
 
 	ASSERT(spa_namespace_held());
 	ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 
 	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
 	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
 
 	if (error != 0) {
 		spa_load_failed(spa, "unable to retrieve checkpointed "
 		    "uberblock from the MOS config [error=%d]", error);
 
 		/* Report a missing entry with the dedicated error code. */
 		if (error == ENOENT)
 			error = ZFS_ERR_NO_CHECKPOINT;
 
 		return (error);
 	}
 
 	ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
 	ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
 
 	/*
 	 * We need to update the txg and timestamp of the checkpointed
 	 * uberblock to be higher than the latest one. This ensures that
 	 * the checkpointed uberblock is selected if we were to close and
 	 * reopen the pool right after we've written it in the vdev labels.
 	 * (also see block comment in vdev_uberblock_compare)
 	 */
 	checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
 	checkpoint.ub_timestamp = gethrestime_sec();
 
 	/*
 	 * Set current uberblock to be the checkpointed uberblock.
 	 */
 	spa->spa_uberblock = checkpoint;
 
 	/*
 	 * If we are doing a normal rewind, then the pool is open for
 	 * writing and we sync the "updated" checkpointed uberblock to
 	 * disk. Once this is done, we've basically rewound the whole
 	 * pool and there is no way back.
 	 *
 	 * There are cases when we don't want to attempt to sync the
 	 * checkpointed uberblock to disk because we are opening a
 	 * pool as read-only. Specifically, verifying the checkpointed
 	 * state with zdb, and importing the checkpointed state to get
 	 * a "preview" of its content.
 	 */
 	if (spa_writeable(spa)) {
 		vdev_t *rvd = spa->spa_root_vdev;
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
 		int svdcount = 0;
 		int children = rvd->vdev_children;
 		int c0 = random_in_range(children);
 
 		/*
 		 * Collect up to SPA_SYNC_MIN_VDEVS top-level vdevs to sync,
 		 * walking the children circularly from the random index c0.
 		 */
 		for (int c = 0; c < children; c++) {
 			vdev_t *vd = rvd->vdev_child[(c0 + c) % children];
 
 			/* Stop when revisiting the first vdev */
 			if (c > 0 && svd[0] == vd)
 				break;
 
 			/*
 			 * Skip vdevs without a metaslab array, log vdevs
 			 * and vdevs that are not concrete.
 			 */
 			if (vd->vdev_ms_array == 0 || vd->vdev_islog ||
 			    !vdev_is_concrete(vd))
 				continue;
 
 			svd[svdcount++] = vd;
 			if (svdcount == SPA_SYNC_MIN_VDEVS)
 				break;
 		}
 		error = vdev_config_sync(svd, svdcount, spa->spa_first_txg);
 		if (error == 0)
 			spa->spa_last_synced_guid = rvd->vdev_guid;
 		spa_config_exit(spa, SCL_ALL, FTAG);
 
 		/* The failure is reported only after the lock is dropped. */
 		if (error != 0) {
 			spa_load_failed(spa, "failed to write checkpointed "
 			    "uberblock to the vdev labels [error=%d]", error);
 			return (error);
 		}
 	}
 
 	return (0);
 }
 
 /*
  * Open the MOS using the provided config and then switch over to the
  * trusted config stored in the MOS, redoing the whole process once if
  * spa_ld_trusted_config() answers EAGAIN.
  *
  * update_config_cache may be NULL; when it is not, it is set to B_TRUE
  * if the load had to be redone.  Returns 0 on success or the error of
  * the step that failed.
  */
 static int
 spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type,
     boolean_t *update_config_cache)
 {
 	int err;
 
 	/*
 	 * Parse the pool config, open and validate the vdevs, pick an
 	 * uberblock and use that uberblock to open the MOS.
 	 */
 	if ((err = spa_ld_mos_init(spa, type)) != 0)
 		return (err);
 
 	/*
 	 * Fetch the trusted config stored in the MOS and use it to build
 	 * a new, exact version of the vdev tree, then reopen all vdevs.
 	 * Anything other than EAGAIN (success included) is final.
 	 */
 	err = spa_ld_trusted_config(spa, type, B_FALSE);
 	if (err != EAGAIN)
 		return (err);
 
 	if (update_config_cache != NULL)
 		*update_config_cache = B_TRUE;
 
 	/*
 	 * The trusted config is too different from the untrusted one, so
 	 * the loading process is redone with the trusted config.
 	 */
 	spa_ld_prepare_for_reload(spa);
 	spa_load_note(spa, "RELOADING");
 
 	if ((err = spa_ld_mos_init(spa, type)) != 0)
 		return (err);
 
 	return (spa_ld_trusted_config(spa, type, B_TRUE));
 }
 
 /*
  * Load an existing storage pool, using the config provided. This config
  * describes which vdevs are part of the pool and is later validated against
  * partial configs present in each vdev's label and an entire copy of the
  * config stored in the MOS.
  *
  * 'type' is handed to the individual load steps; 'ereport' is passed
  * through to spa_ld_verify_logs().
  *
  * Locking: the namespace lock must be held on entry (asserted). It is
  * dropped once the MOS has been opened with the trusted config (and after
  * the optional checkpoint rewind), with spa_load_thread set to curthread
  * for the remainder of the load. It is taken again at the 'fail' label,
  * which both the success and the failure path reach, so the function
  * returns with the lock held. Errors that occur before the lock is dropped
  * return directly.
  *
  * Returns 0 on success or the error of the first step that failed.
  */
 static int
 spa_load_impl(spa_t *spa, spa_import_type_t type, const char **ereport)
 {
 	int error = 0;
 	boolean_t missing_feat_write = B_FALSE;
 	boolean_t checkpoint_rewind =
 	    (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 	boolean_t update_config_cache = B_FALSE;
 	hrtime_t load_start = gethrtime();
 
 	ASSERT(spa_namespace_held());
 	ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE);
 
 	spa_load_note(spa, "LOADING");
 
 	error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * If we are rewinding to the checkpoint then we need to repeat
 	 * everything we've done so far in this function but this time
 	 * selecting the checkpointed uberblock and using that to open
 	 * the MOS.
 	 */
 	if (checkpoint_rewind) {
 		/*
 		 * If we are rewinding to the checkpoint update config cache
 		 * anyway.
 		 */
 		update_config_cache = B_TRUE;
 
 		/*
 		 * Extract the checkpointed uberblock from the current MOS
 		 * and use this as the pool's uberblock from now on. If the
 		 * pool is imported as writeable we also write the checkpoint
 		 * uberblock to the labels, making the rewind permanent.
 		 */
 		error = spa_ld_checkpoint_rewind(spa);
 		if (error != 0)
 			return (error);
 
 		/*
 		 * Redo the loading process again with the
 		 * checkpointed uberblock.
 		 */
 		spa_ld_prepare_for_reload(spa);
 		spa_load_note(spa, "LOADING checkpointed uberblock");
 		error = spa_ld_mos_with_trusted_config(spa, type, NULL);
 		if (error != 0)
 			return (error);
 	}
 
 	/*
 	 * Drop the namespace lock for the rest of the function.
 	 * From here on every failure must leave through the 'fail' label,
 	 * which takes the lock again.
 	 */
 	spa->spa_load_thread = curthread;
 	spa_namespace_exit(FTAG);
 
 	/*
 	 * Retrieve the checkpoint txg if the pool has a checkpoint.
 	 */
 	spa_import_progress_set_notes(spa, "Loading checkpoint txg");
 	error = spa_ld_read_checkpoint_txg(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Retrieve the mapping of indirect vdevs. Those vdevs were removed
 	 * from the pool and their contents were re-mapped to other vdevs. Note
 	 * that everything that we read before this step must have been
 	 * rewritten on concrete vdevs after the last device removal was
 	 * initiated. Otherwise we could be reading from indirect vdevs before
 	 * we have loaded their mappings.
 	 */
 	spa_import_progress_set_notes(spa, "Loading indirect vdev metadata");
 	error = spa_ld_open_indirect_vdev_metadata(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Retrieve the full list of active features from the MOS and check if
 	 * they are all supported.
 	 */
 	spa_import_progress_set_notes(spa, "Checking feature flags");
 	error = spa_ld_check_features(spa, &missing_feat_write);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Load several special directories from the MOS needed by the dsl_pool
 	 * layer.
 	 */
 	spa_import_progress_set_notes(spa, "Loading special MOS directories");
 	error = spa_ld_load_special_directories(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Retrieve pool properties from the MOS.
 	 */
 	spa_import_progress_set_notes(spa, "Loading properties");
 	error = spa_ld_get_props(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Retrieve the list of auxiliary devices - cache devices and spares -
 	 * and open them.
 	 */
 	spa_import_progress_set_notes(spa, "Loading AUX vdevs");
 	error = spa_ld_open_aux_vdevs(spa, type);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Load the metadata for all vdevs. Also check if unopenable devices
 	 * should be autoreplaced.
 	 */
 	spa_import_progress_set_notes(spa, "Loading vdev metadata");
 	error = spa_ld_load_vdev_metadata(spa);
 	if (error != 0)
 		goto fail;
 
 	spa_import_progress_set_notes(spa, "Loading dedup tables");
 	error = spa_ld_load_dedup_tables(spa);
 	if (error != 0)
 		goto fail;
 
 	spa_import_progress_set_notes(spa, "Loading BRT");
 	error = spa_ld_load_brt(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Verify the logs now to make sure we don't have any unexpected errors
 	 * when we claim log blocks later.
 	 */
 	spa_import_progress_set_notes(spa, "Verifying Log Devices");
 	error = spa_ld_verify_logs(spa, type, ereport);
 	if (error != 0)
 		goto fail;
 
 	if (missing_feat_write) {
 		ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * At this point, we know that we can open the pool in
 		 * read-only mode but not read-write mode. We now have enough
 		 * information and can return to userland.
 		 */
 		error = spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT,
 		    ENOTSUP);
 		goto fail;
 	}
 
 	/*
 	 * Traverse the last txgs to make sure the pool was left off in a safe
 	 * state. When performing an extreme rewind, we verify the whole pool,
 	 * which can take a very long time.
 	 */
 	spa_import_progress_set_notes(spa, "Verifying pool data");
 	error = spa_ld_verify_pool_data(spa);
 	if (error != 0)
 		goto fail;
 
 	/*
 	 * Calculate the deflated space for the pool. This must be done before
 	 * we write anything to the pool because we'd need to update the space
 	 * accounting using the deflated sizes.
 	 */
 	spa_import_progress_set_notes(spa, "Calculating deflated space");
 	spa_update_dspace(spa);
 
 	/*
 	 * We have now retrieved all the information we needed to open the
 	 * pool. If we are importing the pool in read-write mode, a few
 	 * additional steps must be performed to finish the import.
 	 */
 	spa_import_progress_set_notes(spa, "Starting import");
 	if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER ||
 	    spa->spa_load_max_txg == UINT64_MAX)) {
 		uint64_t config_cache_txg = spa->spa_config_txg;
 
 		ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT);
 
 		/*
 		 * Before we do any zio_write's, complete the raidz expansion
 		 * scratch space copying, if necessary.
 		 */
 		if (RRSS_GET_STATE(&spa->spa_uberblock) == RRSS_SCRATCH_VALID)
 			vdev_raidz_reflow_copy_scratch(spa);
 
 		/*
 		 * In case of a checkpoint rewind, log the original txg
 		 * of the checkpointed uberblock.
 		 */
 		if (checkpoint_rewind) {
 			spa_history_log_internal(spa, "checkpoint rewind",
 			    NULL, "rewound state to txg=%llu",
 			    (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
 		}
 
 		spa_import_progress_set_notes(spa, "Claiming ZIL blocks");
 		/*
 		 * Traverse the ZIL and claim all blocks.
 		 */
 		spa_ld_claim_log_blocks(spa);
 
 		/*
 		 * Kick-off the syncing thread.
 		 */
 		spa->spa_sync_on = B_TRUE;
 		txg_sync_start(spa->spa_dsl_pool);
 		mmp_thread_start(spa);
 
 		/*
 		 * Wait for all claims to sync.  We sync up to the highest
 		 * claimed log block birth time so that claimed log blocks
 		 * don't appear to be from the future.  spa_claim_max_txg
 		 * will have been set for us by ZIL traversal operations
 		 * performed above.
 		 */
 		spa_import_progress_set_notes(spa, "Syncing ZIL claims");
 		txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
 
 		/*
 		 * Check if we need to request an update of the config. On the
 		 * next sync, we would update the config stored in vdev labels
 		 * and the cachefile (by default /etc/zfs/zpool.cache).
 		 */
 		spa_import_progress_set_notes(spa, "Updating configs");
 		spa_ld_check_for_config_update(spa, config_cache_txg,
 		    update_config_cache);
 
 		/*
 		 * Check if a rebuild was in progress and if so resume it.
 		 * Then check all DTLs to see if anything needs resilvering.
 		 * The resilver will be deferred if a rebuild was started.
 		 */
 		spa_import_progress_set_notes(spa, "Starting resilvers");
 		if (vdev_rebuild_active(spa->spa_root_vdev)) {
 			vdev_rebuild_restart(spa);
 		} else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
 		    vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
 			spa_async_request(spa, SPA_ASYNC_RESILVER);
 		}
 
 		/*
 		 * Log the fact that we booted up (so that we can detect if
 		 * we rebooted in the middle of an operation).
 		 */
 		spa_history_log_version(spa, "open", NULL);
 
 		spa_import_progress_set_notes(spa,
 		    "Restarting device removals");
 		spa_restart_removal(spa);
 		spa_spawn_aux_threads(spa);
 
 		/*
 		 * Delete any inconsistent datasets.
 		 *
 		 * Note:
 		 * Since we may be issuing deletes for clones here,
 		 * we make sure to do so after we've spawned all the
 		 * auxiliary threads above (of which the livelist
 		 * deletion zthr is a part).
 		 */
 		spa_import_progress_set_notes(spa,
 		    "Cleaning up inconsistent objsets");
 		(void) dmu_objset_find(spa_name(spa),
 		    dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
 
 		/*
 		 * Clean up any stale temporary dataset userrefs.
 		 */
 		spa_import_progress_set_notes(spa,
 		    "Cleaning up temporary userrefs");
 		dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
 
 		/*
 		 * Restart initialize, TRIM and autotrim while holding
 		 * SCL_CONFIG as reader.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_import_progress_set_notes(spa, "Restarting initialize");
 		vdev_initialize_restart(spa->spa_root_vdev);
 		spa_import_progress_set_notes(spa, "Restarting TRIM");
 		vdev_trim_restart(spa->spa_root_vdev);
 		vdev_autotrim_restart(spa);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_import_progress_set_notes(spa, "Finished importing");
 	}
 	zio_handle_import_delay(spa, gethrtime() - load_start);
 
 	spa_import_progress_remove(spa_guid(spa));
 	spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
 
 	spa_load_note(spa, "LOADED");
 	/*
 	 * Common exit for everything after the namespace lock was dropped,
 	 * success included (error is still 0 in that case): take the lock
 	 * again, clear spa_load_thread and call spa_namespace_broadcast().
 	 */
 fail:
 	spa_namespace_enter(FTAG);
 	spa->spa_load_thread = NULL;
 	spa_namespace_broadcast();
 
 	return (error);
 
 }
 
 /*
  * Tear the pool down and attempt the load again, limited to txgs older
  * than the uberblock that was just tried.
  *
  * Returns whatever spa_load() returns for the new attempt.
  */
 static int
 spa_load_retry(spa_t *spa, spa_load_state_t state)
 {
 	const spa_mode_t saved_mode = spa->spa_mode;
 
 	spa_unload(spa);
 	spa_deactivate(spa);
 
 	/* The next attempt may only pick an uberblock below this one. */
 	spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
 
 	spa_activate(spa, saved_mode);
 	spa_async_suspend(spa);
 
 	spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
 	    (u_longlong_t)spa->spa_load_max_txg);
 
 	return (spa_load(spa, state, SPA_IMPORT_EXISTING));
 }
 
 /*
  * If spa_load() fails this function will try loading prior txg's. If
  * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
  * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
  * function will not rewind the pool and will return the same error as
  * spa_load().
  *
  * 'max_request' becomes spa_load_max_txg for the first attempt, unless
  * 'state' is SPA_LOAD_RECOVER and spa_load_txg is already set, in which
  * case spa_load_txg is used instead. 'rewind_flags' is checked for
  * ZPOOL_NEVER_REWIND (give up after the first failure) and
  * ZPOOL_EXTREME_REWIND (allow rewinding all the way to TXG_INITIAL).
  *
  * Every return path taken after a failed first load removes the pool's
  * import progress entry.
  */
 static int
 spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
     int rewind_flags)
 {
 	nvlist_t *loadinfo = NULL;
 	nvlist_t *config = NULL;
 	int load_error, rewind_error;
 	uint64_t safe_rewind_txg;
 	uint64_t min_txg;
 
 	if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
 		spa->spa_load_max_txg = spa->spa_load_txg;
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		spa->spa_load_max_txg = max_request;
 		if (max_request != UINT64_MAX)
 			spa->spa_extreme_rewind = B_TRUE;
 	}
 
 	/* First attempt; rewind_error starts out as the same error. */
 	load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
 	if (load_error == 0)
 		return (0);
 	if (load_error == ZFS_ERR_NO_CHECKPOINT) {
 		/*
 		 * When attempting checkpoint-rewind on a pool with no
 		 * checkpoint, we should not attempt to load uberblocks
 		 * from previous txgs when spa_load fails.
 		 */
 		ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
 		spa_import_progress_remove(spa_guid(spa));
 		return (load_error);
 	}
 
 	/*
 	 * Capture the config generated from the failed attempt, if a vdev
 	 * tree exists; it is either installed or freed further below.
 	 */
 	if (spa->spa_root_vdev != NULL)
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
 	spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
 
 	if (rewind_flags & ZPOOL_NEVER_REWIND) {
 		nvlist_free(config);
 		spa_import_progress_remove(spa_guid(spa));
 		return (load_error);
 	}
 
 	if (state == SPA_LOAD_RECOVER) {
 		/* Price of rolling back is discarding txgs, including log */
 		spa_set_log_state(spa, SPA_LOG_CLEAR);
 	} else {
 		/*
 		 * If we aren't rolling back save the load info from our first
 		 * import attempt so that we can restore it after attempting
 		 * to rewind.
 		 */
 		loadinfo = spa->spa_load_info;
 		spa->spa_load_info = fnvlist_alloc();
 	}
 
 	spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
 	safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
 	min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
 	    TXG_INITIAL : safe_rewind_txg;
 
 	/*
 	 * Continue as long as we're finding errors, we're still within
 	 * the acceptable rewind range, and we're still finding uberblocks
 	 */
 	while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
 	    spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
 		if (spa->spa_load_max_txg < safe_rewind_txg)
 			spa->spa_extreme_rewind = B_TRUE;
 		rewind_error = spa_load_retry(spa, state);
 	}
 
 	/* Rewinding is over; reset the rewind controls. */
 	spa->spa_extreme_rewind = B_FALSE;
 	spa->spa_load_max_txg = UINT64_MAX;
 
 	/*
 	 * Install the config captured after the first failure unless a
 	 * recovery load rewound successfully; otherwise free it.
 	 */
 	if (config && (rewind_error || state != SPA_LOAD_RECOVER))
 		spa_config_set(spa, config);
 	else
 		nvlist_free(config);
 
 	if (state == SPA_LOAD_RECOVER) {
 		ASSERT0P(loadinfo);
 		spa_import_progress_remove(spa_guid(spa));
 		return (rewind_error);
 	} else {
 		/* Store the rewind info as part of the initial load info */
 		fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
 		    spa->spa_load_info);
 
 		/* Restore the initial load info */
 		fnvlist_free(spa->spa_load_info);
 		spa->spa_load_info = loadinfo;
 
 		spa_import_progress_remove(spa_guid(spa));
 		return (load_error);
 	}
 }
 
 /*
  * Pool Open/Import
  *
  * The import case is identical to an open except that the configuration is sent
  * down from userland, instead of grabbed from the configuration cache.  For the
  * case of an open, the pool configuration will exist in the
  * POOL_STATE_UNINITIALIZED state.
  *
  * The optional 'config' output lets the caller obtain the pool config while
  * the pool is being opened, without having to keep around the spa_t in some
  * ambiguous state.
  *
  * 'pool' is the name handed to spa_lookup(). '*spapp' receives the opened
  * spa_t and is NULL on every failure path. 'tag' is the holder passed to
  * spa_open_ref(). 'nvpolicy', when not NULL, is the nvlist the load policy
  * is read from; otherwise spa->spa_config is used. 'config' may be NULL.
  * When the load fails and spa_config exists, '*config' is set to a copy of
  * it with the load info added; on success it is set to a freshly generated
  * config.
  *
  * Returns 0 on success, ENOENT if the pool is not found or its load fails
  * with EBADF, or the error returned by spa_load_best().
  */
 static int
 spa_open_common(const char *pool, spa_t **spapp, const void *tag,
     nvlist_t *nvpolicy, nvlist_t **config)
 {
 	spa_t *spa;
 	spa_load_state_t state = SPA_LOAD_OPEN;
 	int error;
 	int locked = B_FALSE;
 	int firstopen = B_FALSE;
 
 	*spapp = NULL;
 
 	/*
 	 * As disgusting as this is, we need to support recursive calls to this
 	 * function because dsl_dir_open() is called during spa_load(), and ends
 	 * up calling spa_open() again.  The real fix is to figure out how to
 	 * avoid dsl_dir_open() calling this in the first place.
 	 */
 	if (!spa_namespace_held()) {
 		spa_namespace_enter(FTAG);
 		locked = B_TRUE;
 	}
 
 	if ((spa = spa_lookup(pool)) == NULL) {
 		if (locked)
 			spa_namespace_exit(FTAG);
 		return (SET_ERROR(ENOENT));
 	}
 
 	/* The pool is known but has not been loaded yet: load it now. */
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 		zpool_load_policy_t policy;
 
 		firstopen = B_TRUE;
 
 		zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
 		    &policy);
 		if (policy.zlp_rewind & ZPOOL_DO_REWIND)
 			state = SPA_LOAD_RECOVER;
 
 		spa_activate(spa, spa_mode_global);
 
 		if (state != SPA_LOAD_RECOVER)
 			spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
 
 		zfs_dbgmsg("spa_open_common: opening %s", pool);
 		error = spa_load_best(spa, state, policy.zlp_txg,
 		    policy.zlp_rewind);
 
 		if (error == EBADF) {
 			/*
 			 * If vdev_validate() returns failure (indicated by
 			 * EBADF), it indicates that one of the vdevs indicates
 			 * that the pool has been exported or destroyed.  If
 			 * this is the case, the config cache is out of sync and
 			 * we should remove the pool from the namespace.
 			 */
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
 			spa_remove(spa);
 			if (locked)
 				spa_namespace_exit(FTAG);
 			return (SET_ERROR(ENOENT));
 		}
 
 		if (error) {
 			/*
 			 * We can't open the pool, but we still have useful
 			 * information: the state of each vdev after the
 			 * attempted vdev_open().  Return this to the user.
 			 */
 			if (config != NULL && spa->spa_config) {
 				*config = fnvlist_dup(spa->spa_config);
 				fnvlist_add_nvlist(*config,
 				    ZPOOL_CONFIG_LOAD_INFO,
 				    spa->spa_load_info);
 			}
 			spa_unload(spa);
 			spa_deactivate(spa);
 			spa->spa_last_open_failed = error;
 			if (locked)
 				spa_namespace_exit(FTAG);
 			*spapp = NULL;
 			return (error);
 		}
 	}
 
 	spa_open_ref(spa, tag);
 
 	if (config != NULL)
 		*config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 
 	/*
 	 * If we've recovered the pool, pass back any information we
 	 * gathered while doing the load.
 	 */
 	if (state == SPA_LOAD_RECOVER && config != NULL) {
 		fnvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info);
 	}
 
 	/*
 	 * Only the call that took the namespace lock resets the
 	 * failure/rewind bookkeeping and releases the lock.
 	 */
 	if (locked) {
 		spa->spa_last_open_failed = 0;
 		spa->spa_last_ubsync_txg = 0;
 		spa->spa_load_txg = 0;
 		spa_namespace_exit(FTAG);
 	}
 
 	/* Minors are created only by the call that loaded the pool. */
 	if (firstopen)
 		zvol_create_minors(spa_name(spa));
 
 	*spapp = spa;
 
 	return (0);
 }
 
 /*
  * Open the named pool, handing the caller's load policy and config
  * output pointer straight through to spa_open_common().
  */
 int
 spa_open_rewind(const char *name, spa_t **spapp, const void *tag,
     nvlist_t *policy, nvlist_t **config)
 {
 	int error = spa_open_common(name, spapp, tag, policy, config);
 
 	return (error);
 }
 
 /*
  * Open the named pool through spa_open_common() with no load policy
  * and no config output.
  */
 int
 spa_open(const char *name, spa_t **spapp, const void *tag)
 {
 	int error = spa_open_common(name, spapp, tag, NULL, NULL);
 
 	return (error);
 }
 
 /*
  * Find the spa_t with the given name and bump its inject reference
  * count, which keeps it from being exported or destroyed.  The lookup
  * and the increment both happen under the namespace lock.
  *
  * Returns the spa_t, or NULL if no pool with that name exists.
  */
 spa_t *
 spa_inject_addref(char *name)
 {
 	spa_t *spa;
 
 	spa_namespace_enter(FTAG);
 	spa = spa_lookup(name);
 	if (spa != NULL)
 		spa->spa_inject_ref++;
 	spa_namespace_exit(FTAG);
 
 	return (spa);
 }
 
 /*
  * Drop one inject reference on the pool, under the namespace lock.
  */
 void
 spa_inject_delref(spa_t *spa)
 {
 	spa_namespace_enter(FTAG);
 	spa->spa_inject_ref -= 1;
 	spa_namespace_exit(FTAG);
 }
 
 /*
  * Copy the pool's spare device list into the vdev tree of 'config' and
  * refresh the state recorded in each spare's vdev stats.
  *
  * SCL_CONFIG must be held as reader (asserted).  Nothing is added when
  * the pool has no spares.
  */
 static void
 spa_add_spares(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t **spares;
 	nvlist_t *nvroot;
 	vdev_stat_t *vs;
 	uint64_t guid;
 	uint64_t pool;
 	uint_t nspares;
 	uint_t vsc;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (spa->spa_spares.sav_count == 0)
 		return;
 
 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config,
 	    ZPOOL_CONFIG_SPARES, &spares, &nspares));
 	if (nspares == 0)
 		return;
 
 	/*
 	 * Add the array to nvroot, then look it up again from nvroot;
 	 * presumably so that the stats updates below land in the entries
 	 * held by 'config' rather than in sav_config.
 	 */
 	fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    (const nvlist_t * const *)spares, nspares);
 	VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares));
 
 	/*
 	 * Walk the spares looking for any which have since been repurposed
 	 * as an active spare, and update their status accordingly.
 	 */
 	for (uint_t i = 0; i < nspares; i++) {
 		guid = fnvlist_lookup_uint64(spares[i],
 		    ZPOOL_CONFIG_GUID);
 		VERIFY0(nvlist_lookup_uint64_array(spares[i],
 		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
 
 		if (!spa_spare_exists(guid, &pool, NULL) || pool == 0ULL) {
 			/* Not in active use: report the vdev's own state. */
 			vs->vs_state =
 			    spa->spa_spares.sav_vdevs[i]->vdev_state;
 			continue;
 		}
 
 		vs->vs_state = VDEV_STATE_CANT_OPEN;
 		vs->vs_aux = VDEV_AUX_SPARED;
 	}
 }
 
 /*
  * Copy the pool's l2cache device list into the vdev tree of 'config',
  * filling in the vdev stats of every cache device.
  *
  * SCL_CONFIG must be held as reader (asserted).  Nothing is added when
  * the pool has no l2cache devices.
  */
 static void
 spa_add_l2cache(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t **l2cache;
 	nvlist_t *nvroot;
 	vdev_stat_t *vs;
 	vdev_t *vd;
 	uint64_t guid;
 	uint_t i, j, nl2cache;
 	uint_t vsc;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	if (spa->spa_l2cache.sav_count == 0)
 		return;
 
 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 	VERIFY0(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config,
 	    ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache));
 	if (nl2cache == 0)
 		return;
 
 	/* Add the array to nvroot, then look it up again from nvroot. */
 	fnvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    (const nvlist_t * const *)l2cache, nl2cache);
 	VERIFY0(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache));
 
 	/*
 	 * Refresh the stats of each level 2 cache device.
 	 */
 	for (i = 0; i < nl2cache; i++) {
 		guid = fnvlist_lookup_uint64(l2cache[i],
 		    ZPOOL_CONFIG_GUID);
 
 		/* Find the in-core vdev whose guid matches this entry. */
 		vd = NULL;
 		for (j = 0; vd == NULL &&
 		    j < spa->spa_l2cache.sav_count; j++) {
 			vdev_t *cand = spa->spa_l2cache.sav_vdevs[j];
 
 			if (cand->vdev_guid == guid)
 				vd = cand;
 		}
 		ASSERT(vd != NULL);
 
 		VERIFY0(nvlist_lookup_uint64_array(l2cache[i],
 		    ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc));
 		vdev_get_stats(vd, vs);
 		vdev_config_generate_stats(vd, l2cache[i]);
 	}
 }
 
 /*
  * Fill 'features' with one uint64 per entry found in the pool's
  * features-for-read and features-for-write ZAP objects, read through
  * ZAP cursors on the meta objset.  An object number of 0 is skipped.
  */
 static void
 spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features)
 {
 	zap_attribute_t *za = zap_attribute_alloc();
 	zap_cursor_t zc;
 
 	/* Pass 0 walks the for-read object, pass 1 the for-write object. */
 	for (int pass = 0; pass < 2; pass++) {
 		uint64_t zapobj = (pass == 0) ?
 		    spa->spa_feat_for_read_obj :
 		    spa->spa_feat_for_write_obj;
 
 		if (zapobj == 0)
 			continue;
 
 		zap_cursor_init(&zc, spa->spa_meta_objset, zapobj);
 		while (zap_cursor_retrieve(&zc, za) == 0) {
 			ASSERT(za->za_integer_length == sizeof (uint64_t) &&
 			    za->za_num_integers == 1);
 			VERIFY0(nvlist_add_uint64(features, za->za_name,
 			    za->za_first_integer));
 			zap_cursor_advance(&zc);
 		}
 		zap_cursor_fini(&zc);
 	}
 
 	zap_attribute_free(za);
 }
 
 /*
  * Fill 'features' with the reference count of every feature in
  * spa_feature_table for which feature_get_refcount() succeeds, keyed
  * by the feature's guid.
  */
 static void
 spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features)
 {
 	for (int i = 0; i < SPA_FEATURES; i++) {
 		zfeature_info_t feature = spa_feature_table[i];
 		uint64_t refcount;
 
 		/* Features without an obtainable refcount are left out. */
 		if (feature_get_refcount(spa, &feature, &refcount) == 0) {
 			VERIFY0(nvlist_add_uint64(features, feature.fi_guid,
 			    refcount));
 		}
 	}
 }
 
 /*
  * Add the list of pool features and their reference counts to 'config'
  * under ZPOOL_CONFIG_FEATURE_STATS.
  *
  * On the first call for a spa a new nvlist is allocated, saved in
  * spa_feat_stats and populated from disk.  Later calls reuse the saved
  * nvlist and refresh its values from the cached reference counts, so
  * that we don't block here on I/O on a suspended pool and 'zpool clear'
  * can resume the pool.
  *
  * SCL_CONFIG must be held as reader (asserted); the saved list is
  * accessed under spa_feat_stats_lock.
  */
 static void
 spa_add_feature_stats(spa_t *spa, nvlist_t *config)
 {
 	nvlist_t *features;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER));
 
 	mutex_enter(&spa->spa_feat_stats_lock);
 
 	features = spa->spa_feat_stats;
 	if (features == NULL) {
 		/* First call: build the list and remember it. */
 		VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP));
 		spa->spa_feat_stats = features;
 		spa_feature_stats_from_disk(spa, features);
 	} else {
 		spa_feature_stats_from_cache(spa, features);
 	}
 
 	VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS,
 	    features));
 
 	mutex_exit(&spa->spa_feat_stats_lock);
 }
 
 /*
  * Open the named pool, return its config in '*config' and, if 'altroot'
  * is not NULL, write the pool's alternate root into that buffer of
  * 'buflen' bytes.
  *
  * '*config' is NULL when no config could be produced.  The return value
  * is the result of spa_open_common(); the altroot is filled in even
  * when the open failed, as long as the pool can still be looked up.
  */
 int
 spa_get_stats(const char *name, nvlist_t **config,
     char *altroot, size_t buflen)
 {
 	spa_t *spa;
 	int error;
 
 	*config = NULL;
 	error = spa_open_common(name, &spa, FTAG, NULL, config);
 
 	if (spa == NULL) {
 		/*
 		 * We want the alternate root even for faulted pools, so we
 		 * cheat and call spa_lookup() directly.
 		 */
 		if (altroot) {
 			spa_t *found;
 
 			spa_namespace_enter(FTAG);
 			found = spa_lookup(name);
 			if (found)
 				spa_altroot(found, altroot, buflen);
 			else
 				altroot[0] = '\0';
 			spa_namespace_exit(FTAG);
 		}
 
 		return (error);
 	}
 
 	/*
 	 * This still leaves a window of inconsistency where the spares
 	 * or l2cache devices could change and the config would be
 	 * self-inconsistent.
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	if (*config != NULL) {
 		uint64_t loadtimes[2] = {
 			spa->spa_loaded_ts.tv_sec,
 			spa->spa_loaded_ts.tv_nsec
 		};
 
 		fnvlist_add_uint64_array(*config,
 		    ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2);
 
 		fnvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT,
 		    spa_approx_errlog_size(spa));
 
 		if (spa_suspended(spa)) {
 			fnvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED,
 			    spa->spa_failmode);
 			fnvlist_add_uint64(*config,
 			    ZPOOL_CONFIG_SUSPENDED_REASON,
 			    spa->spa_suspended);
 		}
 
 		spa_add_spares(spa, *config);
 		spa_add_l2cache(spa, *config);
 		spa_add_feature_stats(spa, *config);
 	}
 
 	if (altroot)
 		spa_altroot(spa, altroot, buflen);
 
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 	spa_close(spa, FTAG);
 
 	return (error);
 }
 
 /*
  * Check that the auxiliary device array stored under 'config' in 'nvroot'
  * is well formed: it must be a non-empty array of nvlists, each of which
  * parses as a leaf vdev.  Every device is opened and labeled, and on
  * success its guid is written back into its nvlist.  For the
  * VDEV_ALLOC_SPARE and VDEV_ALLOC_L2CACHE modes an open or label failure
  * is forgiven, so a damaged but well-formed device is still accepted.
  *
  * A missing array is not an error.  Returns EINVAL for an empty array or a
  * non-leaf device, ENOTSUP if the pool version is older than 'version',
  * or the first unforgiven parse/open/label error.
  */
 static int
 spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
     spa_aux_vdev_t *sav, const char *config, uint64_t version,
     vdev_labeltype_t label)
 {
 	nvlist_t **dev;
 	uint_t ndev;
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	/* Having no devices of this kind at all is fine. */
 	if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
 		return (0);
 
 	if (ndev == 0)
 		return (SET_ERROR(EINVAL));
 
 	/* The on-disk version must be new enough for this device type. */
 	if (spa_version(spa) < version)
 		return (SET_ERROR(ENOTSUP));
 
 	/*
 	 * Publish the pending device list so that device in-use checking
 	 * is handled correctly while we probe each entry.
 	 */
 	sav->sav_pending = dev;
 	sav->sav_npending = ndev;
 
 	for (uint_t i = 0; i < ndev; i++) {
 		vdev_t *vd;
 
 		error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode);
 		if (error != 0)
 			break;
 
 		if (!vd->vdev_ops->vdev_op_leaf) {
 			vdev_free(vd);
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		vd->vdev_top = vd;
 
 		error = vdev_open(vd);
 		if (error == 0)
 			error = vdev_label_init(vd, crtxg, label);
 		if (error == 0) {
 			fnvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID,
 			    vd->vdev_guid);
 		}
 
 		vdev_free(vd);
 
 		if (error != 0) {
 			/* Only spare and l2cache modes forgive the failure. */
 			if (mode != VDEV_ALLOC_SPARE &&
 			    mode != VDEV_ALLOC_L2CACHE)
 				break;
 			error = 0;
 		}
 	}
 
 	sav->sav_pending = NULL;
 	sav->sav_npending = 0;
 	return (error);
 }
 
 /*
  * Validate the spares and then the l2cache devices listed in 'nvroot'.
  * The l2cache list is only examined if the spares validated cleanly.
  */
 static int
 spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode)
 {
 	int error;
 
 	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
 
 	error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 	    &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES,
 	    VDEV_LABEL_SPARE);
 
 	if (error == 0) {
 		error = spa_validate_aux_devs(spa, nvroot, crtxg, mode,
 		    &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE,
 		    SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE);
 	}
 
 	return (error);
 }
 
 /*
  * Record 'ndevs' device nvlists from 'devs' under the key 'config' in
  * sav->sav_config.  If no config exists yet a new one is allocated;
  * otherwise the new devices are appended after the ones already present.
  */
 static void
 spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs,
     const char *config)
 {
 	nvlist_t **olddevs, **merged;
 	uint_t oldndevs, total;
 	int i;
 
 	if (sav->sav_config == NULL) {
 		/* Nothing to merge with: start a fresh dev list. */
 		sav->sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(sav->sav_config, config,
 		    (const nvlist_t * const *)devs, ndevs);
 		return;
 	}
 
 	/* Build the concatenation of the current list and the new one. */
 	VERIFY0(nvlist_lookup_nvlist_array(sav->sav_config, config,
 	    &olddevs, &oldndevs));
 	total = ndevs + oldndevs;
 
 	merged = kmem_alloc(sizeof (void *) * total, KM_SLEEP);
 	for (i = 0; i < oldndevs; i++)
 		merged[i] = fnvlist_dup(olddevs[i]);
 	for (i = 0; i < ndevs; i++)
 		merged[i + oldndevs] = fnvlist_dup(devs[i]);
 
 	/* Swap the old array for the merged one. */
 	fnvlist_remove(sav->sav_config, config);
 	fnvlist_add_nvlist_array(sav->sav_config, config,
 	    (const nvlist_t * const *)merged, total);
 
 	/* Release our temporary duplicates and the scratch array. */
 	for (i = 0; i < total; i++)
 		nvlist_free(merged[i]);
 	kmem_free(merged, total * sizeof (void *));
 }
 
 /*
  * Stop and drop level 2 ARC devices: every l2cache vdev of this pool that
  * is registered with a non-zero pool guid and is present in the L2ARC is
  * removed from it.
  */
 void
 spa_l2cache_drop(spa_t *spa)
 {
 	spa_aux_vdev_t *sav = &spa->spa_l2cache;
 
 	for (int i = 0; i < sav->sav_count; i++) {
 		vdev_t *vd = sav->sav_vdevs[i];
 		uint64_t pool;
 
 		ASSERT(vd != NULL);
 
 		if (!spa_l2cache_exists(vd->vdev_guid, &pool))
 			continue;
 		if (pool == 0ULL || !l2arc_vdev_present(vd))
 			continue;
 
 		l2arc_remove_vdev(vd);
 	}
 }
 
 /*
  * Verify encryption parameters for spa creation.  Asking for any crypt
  * setting other than "off" or "inherit" without the encryption feature
  * enabled fails with ENOTSUP; everything else is delegated to
  * dmu_objset_create_crypt_check().
  */
 static int
 spa_create_check_encryption_params(dsl_crypto_params_t *dcp,
     boolean_t has_encryption)
 {
 	if (!has_encryption) {
 		if (dcp->cp_crypt != ZIO_CRYPT_OFF &&
 		    dcp->cp_crypt != ZIO_CRYPT_INHERIT) {
 			return (SET_ERROR(ENOTSUP));
 		}
 	}
 
 	return (dmu_objset_create_crypt_check(NULL, dcp, NULL));
 }
 
 /*
  * Pool Creation
  *
  * Create a new pool named 'pool' from the vdev tree described by 'nvroot'.
  *
  *	pool		pool name recorded in the pool config
  *	nvroot		vdev tree; may also carry spares and l2cache arrays
  *	props		pool properties, may be NULL; a ZPOOL_PROP_TNAME
  *			entry supplies the name the spa is registered under
  *	zplprops	handed to dsl_pool_create()
  *	dcp		encryption parameters, may be NULL
  *
  * Returns 0 on success.  Fails with EEXIST if a spa with the chosen name
  * already exists, ENOTSUP if encryption is requested without the
  * encryption feature or if zfs_special_devs() reports special devices
  * without the allocation_classes feature, EINVAL if nvroot has no
  * allocatable devices, or with the error from property validation, vdev
  * creation, dRAID spare creation or aux device validation.
  *
  * The namespace lock is taken on entry and dropped on every return path.
  */
 int
 spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops, dsl_crypto_params_t *dcp)
 {
 	spa_t *spa;
 	const char *altroot = NULL;
 	vdev_t *rvd;
 	dsl_pool_t *dp;
 	dmu_tx_t *tx;
 	int error = 0;
 	uint64_t txg = TXG_INITIAL;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 	uint64_t version, obj, ndraid = 0;
 	boolean_t has_features;
 	boolean_t has_encryption;
 	boolean_t has_allocclass;
 	spa_feature_t feat;
 	const char *feat_name;
 	const char *poolname;
 	nvlist_t *nvl;
 
 	/*
 	 * Without a temporary name property the spa is registered under
 	 * the real pool name.  Both pointers are const, so no cast is needed.
 	 */
 	if (props == NULL ||
 	    nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_TNAME), &poolname) != 0)
 		poolname = pool;
 
 	/*
 	 * If this pool already exists, return failure.
 	 */
 	spa_namespace_enter(FTAG);
 	if (spa_lookup(poolname) != NULL) {
 		spa_namespace_exit(FTAG);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Allocate a new spa_t structure.
 	 */
 	nvl = fnvlist_alloc();
 	fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool);
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	spa = spa_add(poolname, nvl, altroot);
 	fnvlist_free(nvl);
 	spa_activate(spa, spa_mode_global);
 
 	if (props && (error = spa_prop_validate(spa, props))) {
 		spa_deactivate(spa);
 		spa_remove(spa);
 		spa_namespace_exit(FTAG);
 		return (error);
 	}
 
 	/*
 	 * Temporary pool names should never be written to disk.
 	 */
 	if (poolname != pool)
 		spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME;
 
 	/*
 	 * Scan the properties for feature@... entries and note the two
 	 * features that gate checks further down.
 	 */
 	has_features = B_FALSE;
 	has_encryption = B_FALSE;
 	has_allocclass = B_FALSE;
 	for (nvpair_t *elem = nvlist_next_nvpair(props, NULL);
 	    elem != NULL; elem = nvlist_next_nvpair(props, elem)) {
 		if (zpool_prop_feature(nvpair_name(elem))) {
 			has_features = B_TRUE;
 
 			feat_name = strchr(nvpair_name(elem), '@') + 1;
 			VERIFY0(zfeature_lookup_name(feat_name, &feat));
 			if (feat == SPA_FEATURE_ENCRYPTION)
 				has_encryption = B_TRUE;
 			if (feat == SPA_FEATURE_ALLOCATION_CLASSES)
 				has_allocclass = B_TRUE;
 		}
 	}
 
 	/* verify encryption params, if they were provided */
 	if (dcp != NULL) {
 		error = spa_create_check_encryption_params(dcp, has_encryption);
 		if (error != 0) {
 			spa_deactivate(spa);
 			spa_remove(spa);
 			spa_namespace_exit(FTAG);
 			return (error);
 		}
 	}
 	if (!has_allocclass && zfs_special_devs(nvroot, NULL)) {
 		spa_deactivate(spa);
 		spa_remove(spa);
 		spa_namespace_exit(FTAG);
 		/* Use SET_ERROR() like every other error generated here. */
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	/*
 	 * Any feature property forces the current SPA_VERSION; otherwise an
 	 * explicit version property is honored when present.
 	 */
 	if (has_features || nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) {
 		version = SPA_VERSION;
 	}
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 
 	spa->spa_first_txg = txg;
 	spa->spa_uberblock.ub_txg = txg - 1;
 	spa->spa_uberblock.ub_version = version;
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa->spa_load_state = SPA_LOAD_CREATE;
 	spa->spa_removing_phys.sr_state = DSS_NONE;
 	spa->spa_removing_phys.sr_removing_vdev = -1;
 	spa->spa_removing_phys.sr_prev_indirect_vdev = -1;
 	spa->spa_indirect_vdevs_loaded = B_TRUE;
 
 	/*
 	 * Create "The Godfather" zio to hold all async IOs
 	 */
 	spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *),
 	    KM_SLEEP);
 	for (int i = 0; i < max_ncpus; i++) {
 		spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
 		    ZIO_FLAG_GODFATHER);
 	}
 
 	/*
 	 * Create the root vdev.
 	 */
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD);
 
 	ASSERT(error != 0 || rvd != NULL);
 	ASSERT(error != 0 || spa->spa_root_vdev == rvd);
 
 	if (error == 0 && !zfs_allocatable_devs(nvroot))
 		error = SET_ERROR(EINVAL);
 
 	if (error == 0 &&
 	    (error = vdev_create(rvd, txg, B_FALSE)) == 0 &&
 	    (error = vdev_draid_spare_create(nvroot, rvd, &ndraid, 0)) == 0 &&
 	    (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) {
 		/*
 		 * instantiate the metaslab groups (this will dirty the vdevs)
 		 * we can no longer error exit past this point
 		 */
 		for (int c = 0; error == 0 && c < rvd->vdev_children; c++) {
 			vdev_t *vd = rvd->vdev_child[c];
 
 			vdev_metaslab_set_size(vd);
 			vdev_expand(vd, txg);
 		}
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (error != 0) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		spa_namespace_exit(FTAG);
 		return (error);
 	}
 
 	/*
 	 * Get the list of spares, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		spa->spa_spares.sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 		    nspares);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * Get the list of level 2 cache devices, if specified.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		VERIFY0(nvlist_alloc(&spa->spa_l2cache.sav_config,
 		    NV_UNIQUE_NAME, KM_SLEEP));
 		fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
 		    nl2cache);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/* spa_is_initializing is set only around dsl_pool_create(). */
 	spa->spa_is_initializing = B_TRUE;
 	spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg);
 	spa->spa_is_initializing = B_FALSE;
 
 	/*
 	 * Create DDTs (dedup tables).
 	 */
 	ddt_create(spa);
 	/*
 	 * Create BRT table and BRT table object.
 	 */
 	brt_create(spa);
 
 	spa_update_dspace(spa);
 
 	/* Everything below up to dmu_tx_commit() is part of the first txg. */
 	tx = dmu_tx_create_assigned(dp, txg);
 
 	/*
 	 * Create the pool's history object.
 	 */
 	if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history)
 		spa_history_create_obj(spa, tx);
 
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE);
 	spa_history_log_version(spa, "create", tx);
 
 	/*
 	 * Create the pool config object.
 	 */
 	spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset,
 	    DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE,
 	    DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx);
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG,
 	    sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool config");
 	}
 
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION,
 	    sizeof (uint64_t), 1, &version, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add pool version");
 	}
 
 	/* Newly created pools with the right version are always deflated. */
 	if (version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		spa->spa_deflate = TRUE;
 		if (zap_add(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 		    sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) {
 			cmn_err(CE_PANIC, "failed to add deflate");
 		}
 	}
 
 	/*
 	 * Create the deferred-free bpobj.  Turn off compression
 	 * because sync-to-convergence takes longer if the blocksize
 	 * keeps changing.
 	 */
 	obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx);
 	dmu_object_set_compress(spa->spa_meta_objset, obj,
 	    ZIO_COMPRESS_OFF, tx);
 	if (zap_add(spa->spa_meta_objset,
 	    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ,
 	    sizeof (uint64_t), 1, &obj, tx) != 0) {
 		cmn_err(CE_PANIC, "failed to add bpobj");
 	}
 	VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj,
 	    spa->spa_meta_objset, obj));
 
 	/*
 	 * Generate some random noise for salted checksums to operate on.
 	 */
 	(void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes,
 	    sizeof (spa->spa_cksum_salt.zcs_bytes));
 
 	/*
 	 * Set pool properties.
 	 */
 	spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS);
 	spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION);
 	spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE);
 	spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND);
 	spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST);
 	spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM);
 	spa->spa_dedup_table_quota =
 	    zpool_prop_default_numeric(ZPOOL_PROP_DEDUP_TABLE_QUOTA);
 
 	if (props != NULL) {
 		spa_configfile_set(spa, props, B_FALSE);
 		spa_sync_props(props, tx);
 	}
 
 	/* One feature reference per dRAID vdev counted at creation above. */
 	for (uint64_t i = 0; i < ndraid; i++)
 		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
 
 	dmu_tx_commit(tx);
 
 	spa->spa_sync_on = B_TRUE;
 	txg_sync_start(dp);
 	mmp_thread_start(spa);
 	txg_wait_synced(dp, txg);
 
 	spa_spawn_aux_threads(spa);
 
 	spa_write_cachefile(spa, B_FALSE, B_TRUE, B_TRUE);
 
 	/*
 	 * Don't count references from objsets that are already closed
 	 * and are making their way through the eviction process.
 	 */
 	spa_evicting_os_wait(spa);
 	spa->spa_minref = zfs_refcount_count(&spa->spa_refcount);
 	spa->spa_load_state = SPA_LOAD_NONE;
 
 	spa_import_os(spa);
 
 	spa_namespace_exit(FTAG);
 
 	return (0);
 }
 
 /*
  * Import a non-root pool into the system.
  *
  *	pool	name to register the pool under
  *	config	pool configuration; on return it also carries the load
  *		info under ZPOOL_CONFIG_LOAD_INFO (non-verbatim imports)
  *	props	pool properties to apply, may be NULL
  *	flags	ZFS_IMPORT_* flags, stored in spa_import_flags
  *
  * Returns 0 on success, EEXIST if a spa with this name already exists, or
  * the error from loading the pool or from setting the properties.  On
  * failure the spa is unloaded, deactivated and removed again.
  */
 int
 spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags)
 {
 	spa_t *spa;
 	const char *altroot = NULL;
 	spa_load_state_t state = SPA_LOAD_IMPORT;
 	zpool_load_policy_t policy;
 	spa_mode_t mode = spa_mode_global;
 	uint64_t readonly = B_FALSE;
 	int error;
 	nvlist_t *nvroot;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	/*
 	 * If a pool with this name exists, return failure.
 	 */
 	spa_namespace_enter(FTAG);
 	if (spa_lookup(pool) != NULL) {
 		spa_namespace_exit(FTAG);
 		return (SET_ERROR(EEXIST));
 	}
 
 	/*
 	 * Create and initialize the spa structure.
 	 */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 	(void) nvlist_lookup_uint64(props,
 	    zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly);
 	/* A readonly property overrides the global open mode. */
 	if (readonly)
 		mode = SPA_MODE_READ;
 	spa = spa_add(pool, config, altroot);
 	spa->spa_import_flags = flags;
 
 	/*
 	 * Verbatim import - Take a pool and insert it into the namespace
 	 * as if it had been loaded at boot.
 	 */
 	if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) {
 		if (props != NULL)
 			spa_configfile_set(spa, props, B_FALSE);
 
 		spa_write_cachefile(spa, B_FALSE, B_TRUE, B_FALSE);
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 		zfs_dbgmsg("spa_import: verbatim import of %s", pool);
 		spa_namespace_exit(FTAG);
 		return (0);
 	}
 
 	spa_activate(spa, mode);
 
 	/*
 	 * Don't start async tasks until we know everything is healthy.
 	 */
 	spa_async_suspend(spa);
 
 	/* A rewind request in the load policy switches to recovery mode. */
 	zpool_get_load_policy(config, &policy);
 	if (policy.zlp_rewind & ZPOOL_DO_REWIND)
 		state = SPA_LOAD_RECOVER;
 
 	spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT;
 
 	if (state != SPA_LOAD_RECOVER) {
 		spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
 		zfs_dbgmsg("spa_import: importing %s", pool);
 	} else {
 		zfs_dbgmsg("spa_import: importing %s, max_txg=%lld "
 		    "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg);
 	}
 	/* The result is checked further down, after the config fix-ups. */
 	error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind);
 
 	/*
 	 * Propagate anything learned while loading the pool and pass it
 	 * back to caller (i.e. rewind info, missing devices, etc).
 	 */
 	fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info);
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	/*
 	 * Toss any existing sparelist, as it doesn't have any validity
 	 * anymore, and conflicts with spa_has_spare().
 	 */
 	if (spa->spa_spares.sav_config) {
 		nvlist_free(spa->spa_spares.sav_config);
 		spa->spa_spares.sav_config = NULL;
 		spa_load_spares(spa);
 	}
 	/* Likewise for any existing l2cache list. */
 	if (spa->spa_l2cache.sav_config) {
 		nvlist_free(spa->spa_l2cache.sav_config);
 		spa->spa_l2cache.sav_config = NULL;
 		spa_load_l2cache(spa);
 	}
 
 	nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	if (props != NULL)
 		spa_configfile_set(spa, props, B_FALSE);
 
 	/*
 	 * Bail out if the load failed, or if the pool is writeable and the
 	 * caller's properties could not be set.
 	 */
 	if (error != 0 || (props && spa_writeable(spa) &&
 	    (error = spa_prop_set(spa, props)))) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 		spa_remove(spa);
 		spa_namespace_exit(FTAG);
 		return (error);
 	}
 
 	spa_async_resume(spa);
 
 	/*
 	 * Override any spares and level 2 cache devices as specified by
 	 * the user, as these may have correct device names/devids, etc.
 	 */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
 	    &spares, &nspares) == 0) {
 		if (spa->spa_spares.sav_config)
 			fnvlist_remove(spa->spa_spares.sav_config,
 			    ZPOOL_CONFIG_SPARES);
 		else
 			spa->spa_spares.sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(spa->spa_spares.sav_config,
 		    ZPOOL_CONFIG_SPARES, (const nvlist_t * const *)spares,
 		    nspares);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_spares(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_spares.sav_sync = B_TRUE;
 		spa->spa_spares.sav_label_sync = B_TRUE;
 	}
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
 	    &l2cache, &nl2cache) == 0) {
 		if (spa->spa_l2cache.sav_config)
 			fnvlist_remove(spa->spa_l2cache.sav_config,
 			    ZPOOL_CONFIG_L2CACHE);
 		else
 			spa->spa_l2cache.sav_config = fnvlist_alloc();
 		fnvlist_add_nvlist_array(spa->spa_l2cache.sav_config,
 		    ZPOOL_CONFIG_L2CACHE, (const nvlist_t * const *)l2cache,
 		    nl2cache);
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 		spa_load_l2cache(spa);
 		spa_config_exit(spa, SCL_ALL, FTAG);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 		spa->spa_l2cache.sav_label_sync = B_TRUE;
 	}
 
 	/*
 	 * Check for any removed devices.
 	 */
 	if (spa->spa_autoreplace) {
 		spa_aux_check_removed(&spa->spa_spares);
 		spa_aux_check_removed(&spa->spa_l2cache);
 	}
 
 	if (spa_writeable(spa)) {
 		/*
 		 * Update the config cache to include the newly-imported pool.
 		 */
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	}
 
 	/*
 	 * It's possible that the pool was expanded while it was exported.
 	 * We kick off an async task to handle this for us.
 	 */
 	spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
 
 	spa_history_log_version(spa, "import", NULL);
 
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT);
 
 	spa_namespace_exit(FTAG);
 
 	/* These two run after the namespace lock has been dropped. */
 	zvol_create_minors(pool);
 
 	spa_import_os(spa);
 
 	return (0);
 }
 
 /*
  * Load the pool described by 'tryconfig' read-only under a temporary,
  * thread-specific name, generate its current config, and tear the
  * temporary spa down again.
  *
  * Returns a newly generated config nvlist, or NULL if 'tryconfig' lacks a
  * pool name or pool state, or if no root vdev could be set up from it.
  */
 nvlist_t *
 spa_tryimport(nvlist_t *tryconfig)
 {
 	nvlist_t *config = NULL;
 	const char *poolname, *cachefile;
 	spa_t *spa;
 	uint64_t state;
 	int error;
 	zpool_load_policy_t policy;
 
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname))
 		return (NULL);
 
 	if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state))
 		return (NULL);
 
 	/*
 	 * Create and initialize the spa structure.  The temporary name is
 	 * built from TRYIMPORT_NAME, the current thread pointer and the
 	 * real pool name.
 	 */
 	char *name = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 	(void) snprintf(name, MAXPATHLEN, "%s-%llx-%s",
 	    TRYIMPORT_NAME, (u_longlong_t)(uintptr_t)curthread, poolname);
 
 	spa_namespace_enter(FTAG);
 	spa = spa_add(name, tryconfig, NULL);
 	spa_activate(spa, SPA_MODE_READ);
 	kmem_free(name, MAXPATHLEN);
 
 	/*
 	 * Rewind pool if a max txg was provided.
 	 */
 	zpool_get_load_policy(spa->spa_config, &policy);
 	if (policy.zlp_txg != UINT64_MAX) {
 		spa->spa_load_max_txg = policy.zlp_txg;
 		spa->spa_extreme_rewind = B_TRUE;
 		zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld",
 		    poolname, (longlong_t)policy.zlp_txg);
 	} else {
 		zfs_dbgmsg("spa_tryimport: importing %s", poolname);
 	}
 
 	/* Record whether the config came from a cachefile or a scan. */
 	if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile)
 	    == 0) {
 		zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile);
 		spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
 	} else {
 		spa->spa_config_source = SPA_CONFIG_SRC_SCAN;
 	}
 
 	/*
 	 * spa_import() relies on a pool config fetched by spa_tryimport()
 	 * for spare/cache devices. Import flags are not passed to
 	 * spa_tryimport(), which would make it return early because of a
 	 * missing log device and so never retrieve the cache and spare
 	 * devices. Setting ZFS_IMPORT_MISSING_LOG here makes it fetch the
 	 * correct configuration regardless of the missing log device.
 	 */
 	spa->spa_import_flags |= ZFS_IMPORT_MISSING_LOG;
 
 	error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING);
 
 	/*
 	 * If 'tryconfig' was at least parsable, return the current config.
 	 */
 	if (spa->spa_root_vdev != NULL) {
 		config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
 		/* Report the real pool name, not the temporary one. */
 		fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname);
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state);
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP,
 		    spa->spa_uberblock.ub_timestamp);
 		fnvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO,
 		    spa->spa_load_info);
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA,
 		    spa->spa_errata);
 
 		/*
 		 * If the bootfs property exists on this pool then we
 		 * copy it out so that external consumers can tell which
 		 * pools are bootable.
 		 */
 		if ((!error || error == EEXIST) && spa->spa_bootfs) {
 			char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 			/*
 			 * We have to play games with the name since the
 			 * pool was opened as TRYIMPORT_NAME.
 			 */
 			if (dsl_dsobj_to_dsname(spa_name(spa),
 			    spa->spa_bootfs, tmpname) == 0) {
 				char *cp;
 				char *dsname;
 
 				dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP);
 
 				/*
 				 * Replace everything before the first '/'
 				 * with the real pool name; without a '/'
 				 * the name is copied unchanged.
 				 */
 				cp = strchr(tmpname, '/');
 				if (cp == NULL) {
 					(void) strlcpy(dsname, tmpname,
 					    MAXPATHLEN);
 				} else {
 					(void) snprintf(dsname, MAXPATHLEN,
 					    "%s/%s", poolname, ++cp);
 				}
 				fnvlist_add_string(config, ZPOOL_CONFIG_BOOTFS,
 				    dsname);
 				kmem_free(dsname, MAXPATHLEN);
 			}
 			kmem_free(tmpname, MAXPATHLEN);
 		}
 
 		/*
 		 * Add the list of hot spares and level 2 cache devices.
 		 */
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_add_spares(spa, config);
 		spa_add_l2cache(spa, config);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	/* The temporary spa is always torn down, whatever the outcome. */
 	spa_unload(spa);
 	spa_deactivate(spa);
 	spa_remove(spa);
 	spa_namespace_exit(FTAG);
 
 	return (config);
 }
 
 /*
  * Pool export/destroy
  *
  * The act of destroying or exporting a pool is very simple.  We make sure there
  * is no more pending I/O and any references to the pool are gone.  Then, we
  * update the pool state and sync all the labels to disk, removing the
  * configuration from the cache afterwards. If the 'hardforce' flag is set, then
  * we don't sync the labels or remove the configuration cache.
  *
  *	pool		name of the pool to act on
  *	new_state	POOL_STATE_EXPORTED, POOL_STATE_DESTROYED, or
  *			POOL_STATE_UNINITIALIZED (unload only; the spa_t is
  *			kept and its exporting flag is reset)
  *	oldconfig	if non-NULL, receives a copy of spa_config (or NULL)
  *	force		allow export despite an active shared spare
  *	hardforce	skip dirtying the labels and rewriting the cachefile
  *
  * Returns 0 on success, EROFS if the global mode is not writeable, ENOENT
  * if the pool is not found, ZFS_ERR_EXPORT_IN_PROGRESS if another thread
  * is already exporting it, EBUSY if references remain, or EXDEV if an
  * active shared spare blocks a non-forced export.
  */
 static int
 spa_export_common(const char *pool, int new_state, nvlist_t **oldconfig,
     boolean_t force, boolean_t hardforce)
 {
 	int error = 0;
 	spa_t *spa;
 	hrtime_t export_start = gethrtime();
 
 	if (oldconfig)
 		*oldconfig = NULL;
 
 	if (!(spa_mode_global & SPA_MODE_WRITE))
 		return (SET_ERROR(EROFS));
 
 	spa_namespace_enter(FTAG);
 	if ((spa = spa_lookup(pool)) == NULL) {
 		spa_namespace_exit(FTAG);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (spa->spa_is_exporting) {
 		/* the pool is being exported by another thread */
 		spa_namespace_exit(FTAG);
 		return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
 	}
 	spa->spa_is_exporting = B_TRUE;
 
 	/*
 	 * Put a hold on the pool, drop the namespace lock, stop async tasks
 	 * and see if we can export.
 	 */
 	spa_open_ref(spa, FTAG);
 	spa_namespace_exit(FTAG);
 	spa_async_suspend(spa);
 	if (spa->spa_zvol_taskq) {
 		zvol_remove_minors(spa, spa_name(spa), B_TRUE);
 		taskq_wait(spa->spa_zvol_taskq);
 	}
 	spa_namespace_enter(FTAG);
 	spa->spa_export_thread = curthread;
 	spa_close(spa, FTAG);
 
 	/* Nothing was loaded, so skip straight to the removal step. */
 	if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
 		spa_namespace_exit(FTAG);
 		goto export_spa;
 	}
 
 	/*
 	 * The pool will be in core if it's openable, in which case we can
 	 * modify its state.  Objsets may be open only because they're dirty,
 	 * so we have to force it to sync before checking spa_refcnt.
 	 */
 	if (spa->spa_sync_on) {
 		txg_wait_synced(spa->spa_dsl_pool, 0);
 		spa_evicting_os_wait(spa);
 	}
 
 	/*
 	 * A pool cannot be exported or destroyed if there are active
 	 * references.  If we are resetting a pool, allow references by
 	 * fault injection handlers.
 	 *
 	 * NOTE(review): as written, any non-zero spa_inject_ref yields EBUSY
 	 * regardless of new_state -- confirm against the intent above.
 	 */
 	if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0)) {
 		error = SET_ERROR(EBUSY);
 		goto fail;
 	}
 
 	spa_namespace_exit(FTAG);
 	/*
 	 * At this point we no longer hold the spa_namespace_lock and
 	 * there were no references on the spa. Future spa_lookups will
 	 * notice the spa->spa_export_thread and wait until we signal
 	 * that we are finished.
 	 */
 
 	if (spa->spa_sync_on) {
 		vdev_t *rvd = spa->spa_root_vdev;
 		/*
 		 * A pool cannot be exported if it has an active shared spare.
 		 * This is to prevent other pools stealing the active spare
 		 * from an exported pool. If the user insists, such a pool
 		 * can be forcibly exported.
 		 */
 		if (!force && new_state == POOL_STATE_EXPORTED &&
 		    spa_has_active_shared_spare(spa)) {
 			error = SET_ERROR(EXDEV);
 			/* The fail path expects the namespace lock held. */
 			spa_namespace_enter(FTAG);
 			goto fail;
 		}
 
 		/*
 		 * We're about to export or destroy this pool. Make sure
 		 * we stop all initialization and trim activity here before
 		 * we set the spa_final_txg. This will ensure that all
 		 * dirty data resulting from the initialization is
 		 * committed to disk before we unload the pool.
 		 */
 		vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE);
 		vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE);
 		vdev_autotrim_stop_all(spa);
 		vdev_rebuild_stop_all(spa);
 		l2arc_spa_rebuild_stop(spa);
 
 		/*
 		 * We want this to be reflected on every label,
 		 * so mark them all dirty.  spa_unload() will do the
 		 * final sync that pushes these changes out.
 		 */
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_state = new_state;
 			vdev_config_dirty(rvd);
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 
 		if (spa_should_sync_time_logger_on_unload(spa))
 			spa_unload_sync_time_logger(spa);
 
 		/*
 		 * If the log space map feature is enabled and the pool is
 		 * getting exported (but not destroyed), we want to spend some
 		 * time flushing as many metaslabs as we can in an attempt to
 		 * destroy log space maps and save import time. This has to be
 		 * done before we set the spa_final_txg, otherwise
 		 * spa_sync() -> spa_flush_metaslabs() may dirty the final TXGs.
 		 * spa_should_flush_logs_on_unload() should be called after
 		 * spa_state has been set to the new_state.
 		 */
 		if (spa_should_flush_logs_on_unload(spa))
 			spa_unload_log_sm_flush_all(spa);
 
 		if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) {
 			spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 			spa->spa_final_txg = spa_last_synced_txg(spa) +
 			    TXG_DEFER_SIZE + 1;
 			spa_config_exit(spa, SCL_ALL, FTAG);
 		}
 	}
 
 export_spa:
 	/* Reached without the namespace lock held. */
 	spa_export_os(spa);
 
 	if (new_state == POOL_STATE_DESTROYED)
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY);
 	else if (new_state == POOL_STATE_EXPORTED)
 		spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT);
 
 	if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 		spa_unload(spa);
 		spa_deactivate(spa);
 	}
 
 	/* Hand the caller its own copy of the config, if one was asked for. */
 	if (oldconfig && spa->spa_config)
 		*oldconfig = fnvlist_dup(spa->spa_config);
 
 	if (new_state == POOL_STATE_EXPORTED)
 		zio_handle_export_delay(spa, gethrtime() - export_start);
 
 	/*
 	 * Take the namespace lock for the actual spa_t removal
 	 */
 	spa_namespace_enter(FTAG);
 	if (new_state != POOL_STATE_UNINITIALIZED) {
 		if (!hardforce)
 			spa_write_cachefile(spa, B_TRUE, B_TRUE, B_FALSE);
 		spa_remove(spa);
 	} else {
 		/*
 		 * If spa_remove() is not called for this spa_t and
 		 * there is any possibility that it can be reused,
 		 * we make sure to reset the exporting flag.
 		 */
 		spa->spa_is_exporting = B_FALSE;
 		spa->spa_export_thread = NULL;
 	}
 
 	/*
 	 * Wake up any waiters in spa_lookup()
 	 */
 	spa_namespace_broadcast();
 	spa_namespace_exit(FTAG);
 	return (0);
 
 fail:
 	/* Entered with the namespace lock held; undo the export markers. */
 	spa->spa_is_exporting = B_FALSE;
 	spa->spa_export_thread = NULL;
 
 	spa_async_resume(spa);
 	/*
 	 * Wake up any waiters in spa_lookup()
 	 */
 	spa_namespace_broadcast();
 	spa_namespace_exit(FTAG);
 	return (error);
 }
 
 /*
  * Destroy a storage pool.
  */
 int
 spa_destroy(const char *pool)
 {
 	return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL,
 	    B_FALSE, B_FALSE));
 }
 
 /*
  * Export a storage pool: run the common export path with the new state
  * POOL_STATE_EXPORTED, passing the caller's 'oldconfig', 'force' and
  * 'hardforce' straight through.
  */
 int
 spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce)
 {
 	int error;
 
 	error = spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig,
 	    force, hardforce);
 
 	return (error);
 }
 
 /*
  * Similar to spa_export(), this unloads the spa_t without actually removing it
  * from the namespace in any way.
  */
 int
 spa_reset(const char *pool)
 {
 	return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL,
 	    B_FALSE, B_FALSE));
 }
 
 /*
  * ==========================================================================
  * Device manipulation
  * ==========================================================================
  */
 
 /*
  * This is called as a synctask to increment the draid feature flag.
  * 'arg' carries the number of increments, encoded as an integer in the
  * pointer value.
  */
 static void
 spa_draid_feature_incr(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 
 	for (int remaining = (int)(uintptr_t)arg; remaining > 0; remaining--)
 		spa_feature_incr(spa, SPA_FEATURE_DRAID, tx);
 }
 
 /*
  * Add a device to a storage pool.
  *
  * The vdev tree described by 'nvroot' is parsed with VDEV_ALLOC_ADD.  Its
  * children are moved under the pool's root vdev, and any spares or l2cache
  * devices listed in 'nvroot' are installed as aux vdevs.  If 'check_ashift'
  * is set and the pool's minimum and maximum ashift are equal, every new
  * child must have that same ashift or the add fails with
  * ZFS_ERR_ASHIFT_MISMATCH.
  *
  * Returns 0 on success.  Each failure path returns the result of
  * spa_vdev_exit() called with the error that was detected.
  */
 int
 spa_vdev_add(spa_t *spa, nvlist_t *nvroot, boolean_t check_ashift)
 {
 	uint64_t txg, ndraid = 0;
 	int error;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd, *tvd;
 	nvlist_t **spares, **l2cache;
 	uint_t nspares, nl2cache;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	spa->spa_pending_vdev = vd;	/* spa_vdev_exit() will clear this */
 
 	/* A missing spares or l2cache array simply means there are none. */
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares,
 	    &nspares) != 0)
 		nspares = 0;
 
 	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache,
 	    &nl2cache) != 0)
 		nl2cache = 0;
 
 	/* Nothing to add: no children, no spares and no l2cache devices. */
 	if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0)
 		return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
 	if (vd->vdev_children != 0 &&
 	    (error = vdev_create(vd, txg, B_FALSE)) != 0) {
 		return (spa_vdev_exit(spa, vd, txg, error));
 	}
 
 	/*
 	 * The virtual dRAID spares must be added after vdev tree is created
 	 * and the vdev guids are generated.  The guid of their associated
 	 * dRAID is stored in the config and used when opening the spare.
 	 *
 	 * When any dRAID was counted (ndraid > 0) the spares array is looked
 	 * up again, replacing the values fetched above.
 	 */
 	if ((error = vdev_draid_spare_create(nvroot, vd, &ndraid,
 	    rvd->vdev_children)) == 0) {
 		if (ndraid > 0 && nvlist_lookup_nvlist_array(nvroot,
 		    ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0)
 			nspares = 0;
 	} else {
 		return (spa_vdev_exit(spa, vd, txg, error));
 	}
 
 	/*
 	 * We must validate the spares and l2cache devices after checking the
 	 * children.  Otherwise, vdev_inuse() will blindly overwrite the spare.
 	 */
 	if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
 		return (spa_vdev_exit(spa, vd, txg, error));
 
 	/*
 	 * If we are in the middle of a device removal, we can only add
 	 * devices which match the existing devices in the pool.
 	 * If we are in the middle of a removal, or have some indirect
 	 * vdevs, we can not add raidz or dRAID top levels.
 	 */
 	if (spa->spa_vdev_removal != NULL ||
 	    spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			tvd = vd->vdev_child[c];
 			/* The ashift restriction applies only mid-removal. */
 			if (spa->spa_vdev_removal != NULL &&
 			    tvd->vdev_ashift != spa->spa_max_ashift) {
 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
 			}
 			/* Fail if top level vdev is raidz or a dRAID */
 			if (vdev_get_nparity(tvd) != 0)
 				return (spa_vdev_exit(spa, vd, txg, EINVAL));
 
 			/*
 			 * Need the top level mirror to be
 			 * a mirror of leaf vdevs only
 			 */
 			if (tvd->vdev_ops == &vdev_mirror_ops) {
 				for (uint64_t cid = 0;
 				    cid < tvd->vdev_children; cid++) {
 					vdev_t *cvd = tvd->vdev_child[cid];
 					if (!cvd->vdev_ops->vdev_op_leaf) {
 						return (spa_vdev_exit(spa, vd,
 						    txg, EINVAL));
 					}
 				}
 			}
 		}
 	}
 
 	/*
 	 * When the pool has a single uniform ashift (min == max) and the
 	 * caller asked for the check, require the new vdevs to match it.
 	 */
 	if (check_ashift && spa->spa_max_ashift == spa->spa_min_ashift) {
 		for (int c = 0; c < vd->vdev_children; c++) {
 			tvd = vd->vdev_child[c];
 			if (tvd->vdev_ashift != spa->spa_max_ashift) {
 				return (spa_vdev_exit(spa, vd, txg,
 				    ZFS_ERR_ASHIFT_MISMATCH));
 			}
 		}
 	}
 
 	/*
 	 * Move each parsed child from the temporary root 'vd' to the pool's
 	 * real root vdev, using the root's current child count as its id,
 	 * and mark its config dirty.
 	 */
 	for (int c = 0; c < vd->vdev_children; c++) {
 		tvd = vd->vdev_child[c];
 		vdev_remove_child(vd, tvd);
 		tvd->vdev_id = rvd->vdev_children;
 		vdev_add_child(rvd, tvd);
 		vdev_config_dirty(tvd);
 	}
 
 	/* Install and load any requested spares, then set sav_sync. */
 	if (nspares != 0) {
 		spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
 		    ZPOOL_CONFIG_SPARES);
 		spa_load_spares(spa);
 		spa->spa_spares.sav_sync = B_TRUE;
 	}
 
 	/* Same treatment for any requested l2cache devices. */
 	if (nl2cache != 0) {
 		spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
 		    ZPOOL_CONFIG_L2CACHE);
 		spa_load_l2cache(spa);
 		spa->spa_l2cache.sav_sync = B_TRUE;
 	}
 
 	/*
 	 * We can't increment a feature while holding spa_vdev so we
 	 * have to do it in a synctask.
 	 */
 	if (ndraid != 0) {
 		dmu_tx_t *tx;
 
 		/* The dRAID count travels to the synctask as a pointer. */
 		tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 		dsl_sync_task_nowait(spa->spa_dsl_pool, spa_draid_feature_incr,
 		    (void *)(uintptr_t)ndraid, tx);
 		dmu_tx_commit(tx);
 	}
 
 	/*
 	 * We have to be careful when adding new vdevs to an existing pool.
 	 * If other threads start allocating from these vdevs before we
 	 * sync the config cache, and we lose power, then upon reboot we may
 	 * fail to open the pool because there are DVAs that the config cache
 	 * can't translate.  Therefore, we first add the vdevs without
 	 * initializing metaslabs; sync the config cache (via spa_vdev_exit());
 	 * and then let spa_config_update() initialize the new metaslabs.
 	 *
 	 * spa_load() checks for added-but-not-initialized vdevs, so that
 	 * if we lose power at any point in this sequence, the remaining
 	 * steps will be completed the next time we load the pool.
 	 */
 	(void) spa_vdev_exit(spa, vd, txg, 0);
 
 	/* Update the pool config and post the add event under the namespace lock. */
 	spa_namespace_enter(FTAG);
 	spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 	spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
 	spa_namespace_exit(FTAG);
 
 	return (0);
 }
 
 /*
  * Given a vdev to be replaced and its parent, check for a possible
  * "double spare" condition if a vdev is to be replaced by a spare.  When this
  * happens, you can get two spares assigned to one failed vdev.
  *
  * To trigger a double spare condition:
  *
  * 1. disk1 fails
  * 2. 1st spare is kicked in for disk1 and it resilvers
  * 3. Someone replaces disk1 with a new blank disk
  * 4. New blank disk starts resilvering
  * 5. While resilvering, new blank disk has IO errors and faults
  * 6. 2nd spare is kicked in for new blank disk
  * 7. At this point two spares are kicked in for the original disk1.
  *
  * It looks like this:
  *
  * NAME                                            STATE     READ WRITE CKSUM
  * tank2                                           DEGRADED     0     0     0
  *   draid2:6d:10c:2s-0                            DEGRADED     0     0     0
  *     scsi-0QEMU_QEMU_HARDDISK_d1                 ONLINE       0     0     0
  *     scsi-0QEMU_QEMU_HARDDISK_d2                 ONLINE       0     0     0
  *     scsi-0QEMU_QEMU_HARDDISK_d3                 ONLINE       0     0     0
  *     scsi-0QEMU_QEMU_HARDDISK_d4                 ONLINE       0     0     0
  *     scsi-0QEMU_QEMU_HARDDISK_d5                 ONLINE       0     0     0
  *     scsi-0QEMU_QEMU_HARDDISK_d6                 ONLINE       0     0     0
  *     scsi-0QEMU_QEMU_HARDDISK_d7                 ONLINE       0     0     0
  *     scsi-0QEMU_QEMU_HARDDISK_d8                 ONLINE       0     0     0
  *     scsi-0QEMU_QEMU_HARDDISK_d9                 ONLINE       0     0     0
  *     spare-9                                     DEGRADED     0     0     0
  *       replacing-0                               DEGRADED     0    93     0
  *         scsi-0QEMU_QEMU_HARDDISK_d10-part1/old  UNAVAIL      0     0     0
  *         spare-1                                 DEGRADED     0     0     0
  *           scsi-0QEMU_QEMU_HARDDISK_d10          REMOVED      0     0     0
  *           draid2-0-0                            ONLINE       0     0     0
  *       draid2-0-1                                ONLINE       0     0     0
  * spares
  *   draid2-0-0                                    INUSE     currently in use
  *   draid2-0-1                                    INUSE     currently in use
  *
  * ARGS:
  *
  * newvd:  New spare disk
  * pvd:    Parent vdev_t the spare should attach to
  *
  * This function returns B_TRUE if adding the new vdev would create a double
  * spare condition, B_FALSE otherwise.
  */
 static boolean_t
 spa_vdev_new_spare_would_cause_double_spares(vdev_t *newvd, vdev_t *pvd)
 {
 	vdev_t *grandparent = pvd->vdev_parent;
 
 	/* With no grandparent there is no spare layer to double up on. */
 	if (grandparent == NULL)
 		return (B_FALSE);
 
 	/*
 	 * A double spare results only when all three of these hold:
 	 *
 	 * 1. the grandparent's op is a spare or a draid spare
 	 * 2. the parent's op is replacing
 	 * 3. the new disk is itself a spare
 	 */
 	boolean_t under_spare =
 	    (grandparent->vdev_ops == &vdev_spare_ops ||
 	    grandparent->vdev_ops == &vdev_draid_spare_ops);
 	boolean_t parent_replacing = (pvd->vdev_ops == &vdev_replacing_ops);
 
 	if (under_spare && parent_replacing && newvd->vdev_isspare)
 		return (B_TRUE);
 
 	return (B_FALSE);
 }
 
 /*
  * Attach a device to a vdev specified by its guid.  The vdev type can be
  * a mirror, a raidz, or a leaf device that is also a top-level (e.g. a
  * single device). When the vdev is a single device, a mirror vdev will be
  * automatically inserted.
  *
  * If 'replacing' is specified, the new device is intended to replace the
  * existing device; in this case the two devices are made into their own
  * mirror using the 'replacing' vdev, which is functionally identical to
  * the mirror vdev (it actually reuses all the same ops) but has a few
  * extra rules: you can't attach to it after it's been created, and upon
  * completion of resilvering, the first disk (the one being replaced)
  * is automatically detached.
  *
  * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
  * should be performed instead of traditional healing reconstruction.  From
  * an administrator's perspective these are both resilver operations.
  *
  * 'nvroot' must describe exactly one leaf vdev (the device being attached).
  *
  * Returns 0 on success.  Each failure path returns the result of
  * spa_vdev_exit() called with the error that was detected.
  */
 int
 spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing,
     int rebuild)
 {
 	uint64_t txg, dtl_max_txg;
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd;
 	vdev_ops_t *pvops;
 	char *oldvdpath, *newvdpath;
 	int newvd_isspare = B_FALSE;
 	int error;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	/* The NULL check on oldvd is deferred until after the checks below. */
 	oldvd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	ASSERT(spa_namespace_held());
 	/* No attach while the pool checkpoint feature is active. */
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/*
 	 * Rebuild and resilver are mutually exclusive: refuse a rebuild while
 	 * a resilver is running or scheduled, and refuse a non-rebuild attach
 	 * while a rebuild is active.
 	 */
 	if (rebuild) {
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD))
 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 		if (dsl_scan_resilvering(spa_get_dsl(spa)) ||
 		    dsl_scan_resilver_scheduled(spa_get_dsl(spa))) {
 			return (spa_vdev_exit(spa, NULL, txg,
 			    ZFS_ERR_RESILVER_IN_PROGRESS));
 		}
 	} else {
 		if (vdev_rebuild_active(rvd))
 			return (spa_vdev_exit(spa, NULL, txg,
 			    ZFS_ERR_REBUILD_IN_PROGRESS));
 	}
 
 	/* No attach while a device removal is in progress. */
 	if (spa->spa_vdev_removal != NULL) {
 		return (spa_vdev_exit(spa, NULL, txg,
 		    ZFS_ERR_DEVRM_IN_PROGRESS));
 	}
 
 	if (oldvd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	/* Attaching directly to a raidz vdev is treated as a raidz expansion. */
 	boolean_t raidz = oldvd->vdev_ops == &vdev_raidz_ops;
 
 	if (raidz) {
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_RAIDZ_EXPANSION))
 			return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 		/*
 		 * Can't expand a raidz while prior expand is in progress.
 		 */
 		if (spa->spa_raidz_expand != NULL) {
 			return (spa_vdev_exit(spa, NULL, txg,
 			    ZFS_ERR_RAIDZ_EXPAND_IN_PROGRESS));
 		}
 	} else if (!oldvd->vdev_ops->vdev_op_leaf) {
 		/* Any other non-leaf target is unsupported. */
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 	}
 
 	/*
 	 * For raidz the new disk becomes a child of the raidz vdev itself;
 	 * otherwise it is attached alongside oldvd under oldvd's parent.
 	 */
 	if (raidz)
 		pvd = oldvd;
 	else
 		pvd = oldvd->vdev_parent;
 
 	/* Parse the new device; it must be a root holding a single leaf. */
 	if (spa_config_parse(spa, &newrootvd, nvroot, NULL, 0,
 	    VDEV_ALLOC_ATTACH) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	if (newrootvd->vdev_children != 1)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	newvd = newrootvd->vdev_child[0];
 
 	if (!newvd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, newrootvd, txg, EINVAL));
 
 	if ((error = vdev_create(newrootvd, txg, replacing)) != 0)
 		return (spa_vdev_exit(spa, newrootvd, txg, error));
 
 	/*
 	 * log, dedup and special vdevs should not be replaced by spares.
 	 */
 	if ((oldvd->vdev_top->vdev_alloc_bias != VDEV_BIAS_NONE ||
 	    oldvd->vdev_top->vdev_islog) && newvd->vdev_isspare) {
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 	}
 
 	/*
 	 * A dRAID spare can only replace a child of its parent dRAID vdev.
 	 */
 	if (newvd->vdev_ops == &vdev_draid_spare_ops &&
 	    oldvd->vdev_top != vdev_draid_spare_get_parent(newvd)) {
 		return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 	}
 
 	if (rebuild) {
 		/*
 		 * For rebuilds, the top vdev must support reconstruction
 		 * using only space maps.  This means the only allowable
 		 * vdevs types are the root vdev, a mirror, or dRAID.
 		 */
 		tvd = pvd;
 		if (pvd->vdev_top != NULL)
 			tvd = pvd->vdev_top;
 
 		if (tvd->vdev_ops != &vdev_mirror_ops &&
 		    tvd->vdev_ops != &vdev_root_ops &&
 		    tvd->vdev_ops != &vdev_draid_ops) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 	}
 
 	/* Choose the ops (pvops) of the interior vdev that will hold both disks. */
 	if (!replacing) {
 		/*
 		 * For attach, the only allowable parent is a mirror or
 		 * the root vdev. A raidz vdev can be attached to, but
 		 * you cannot attach to a raidz child.
 		 */
 		if (pvd->vdev_ops != &vdev_mirror_ops &&
 		    pvd->vdev_ops != &vdev_root_ops &&
 		    !raidz)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		pvops = &vdev_mirror_ops;
 	} else {
 		/*
 		 * Active hot spares can only be replaced by inactive hot
 		 * spares.
 		 */
 		if (pvd->vdev_ops == &vdev_spare_ops &&
 		    oldvd->vdev_isspare &&
 		    !spa_has_spare(spa, newvd->vdev_guid))
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		/*
 		 * If the source is a hot spare, and the parent isn't already a
 		 * spare, then we want to create a new hot spare.  Otherwise, we
 		 * want to create a replacing vdev.  The user is not allowed to
 		 * attach to a spared vdev child unless the 'isspare' state is
 		 * the same (spare replaces spare, non-spare replaces
 		 * non-spare).
 		 */
 		if (pvd->vdev_ops == &vdev_replacing_ops &&
 		    spa_version(spa) < SPA_VERSION_MULTI_REPLACE) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		} else if (pvd->vdev_ops == &vdev_spare_ops &&
 		    newvd->vdev_isspare != oldvd->vdev_isspare) {
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 		}
 
 		if (spa_vdev_new_spare_would_cause_double_spares(newvd, pvd)) {
 			vdev_dbgmsg(newvd,
 			    "disk would create double spares, ignore.");
 			return (spa_vdev_exit(spa, newrootvd, txg, EEXIST));
 		}
 
 		if (newvd->vdev_isspare)
 			pvops = &vdev_spare_ops;
 		else
 			pvops = &vdev_replacing_ops;
 	}
 
 	/*
 	 * Make sure the new device is big enough.  For raidz the comparison
 	 * is made against the first child of the raidz vdev.
 	 */
 	vdev_t *min_vdev = raidz ? oldvd->vdev_child[0] : oldvd;
 	if (newvd->vdev_asize < vdev_get_min_asize(min_vdev))
 		return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW));
 
 	/*
 	 * The new device cannot have a higher alignment requirement
 	 * than the top-level vdev.
 	 */
 	if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) {
 		return (spa_vdev_exit(spa, newrootvd, txg,
 		    ZFS_ERR_ASHIFT_MISMATCH));
 	}
 
 	/*
 	 * RAIDZ-expansion-specific checks.
 	 */
 	if (raidz) {
 		if (vdev_raidz_attach_check(newvd) != 0)
 			return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP));
 
 		/*
 		 * Fail early if a child is not healthy or being replaced
 		 */
 		for (int i = 0; i < oldvd->vdev_children; i++) {
 			if (vdev_is_dead(oldvd->vdev_child[i]) ||
 			    !oldvd->vdev_child[i]->vdev_ops->vdev_op_leaf) {
 				return (spa_vdev_exit(spa, newrootvd, txg,
 				    ENXIO));
 			}
 			/* Also fail if reserved boot area is in-use */
 			if (vdev_check_boot_reserve(spa, oldvd->vdev_child[i])
 			    != 0) {
 				return (spa_vdev_exit(spa, newrootvd, txg,
 				    EADDRINUSE));
 			}
 		}
 	}
 
 	/*
 	 * Capture printable names for both vdevs now; they are used for the
 	 * in-place replacement check below and for the history log at the end.
 	 */
 	if (raidz) {
 		/*
 		 * Note: oldvdpath is freed by spa_strfree(),  but
 		 * kmem_asprintf() is freed by kmem_strfree(), so we have to
 		 * move it to a spa_strdup-ed string.
 		 */
 		char *tmp = kmem_asprintf("raidz%u-%u",
 		    (uint_t)vdev_get_nparity(oldvd), (uint_t)oldvd->vdev_id);
 		oldvdpath = spa_strdup(tmp);
 		kmem_strfree(tmp);
 	} else {
 		oldvdpath = spa_strdup(oldvd->vdev_path);
 	}
 	newvdpath = spa_strdup(newvd->vdev_path);
 
 	/*
 	 * If this is an in-place replacement, update oldvd's path and devid
 	 * to make it distinguishable from newvd, and unopenable from now on.
 	 */
 	if (strcmp(oldvdpath, newvdpath) == 0) {
 		spa_strfree(oldvd->vdev_path);
 		/* +5 leaves room for the "/old" suffix and the terminator. */
 		oldvd->vdev_path = kmem_alloc(strlen(newvdpath) + 5,
 		    KM_SLEEP);
 		(void) sprintf(oldvd->vdev_path, "%s/old",
 		    newvdpath);
 		if (oldvd->vdev_devid != NULL) {
 			spa_strfree(oldvd->vdev_devid);
 			oldvd->vdev_devid = NULL;
 		}
 		/* Keep the logged name in step with the renamed path. */
 		spa_strfree(oldvdpath);
 		oldvdpath = spa_strdup(oldvd->vdev_path);
 	}
 
 	/*
 	 * If the parent is not a mirror, or if we're replacing, insert the new
 	 * mirror/replacing/spare vdev above oldvd.
 	 */
 	if (!raidz && pvd->vdev_ops != pvops) {
 		pvd = vdev_add_parent(oldvd, pvops);
 		ASSERT(pvd->vdev_ops == pvops);
 		ASSERT(oldvd->vdev_parent == pvd);
 	}
 
 	ASSERT(pvd->vdev_top->vdev_parent == rvd);
 
 	/*
 	 * Extract the new device from its root and add it to pvd.
 	 */
 	vdev_remove_child(newrootvd, newvd);
 	newvd->vdev_id = pvd->vdev_children;
 	newvd->vdev_crtxg = oldvd->vdev_crtxg;
 	vdev_add_child(pvd, newvd);
 
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
 	vdev_propagate_state(pvd);
 
 	tvd = newvd->vdev_top;
 	ASSERT(pvd->vdev_top == tvd);
 	ASSERT(tvd->vdev_parent == rvd);
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account
 	 * for any dmu_sync-ed blocks.  It will propagate upward when
 	 * spa_vdev_exit() calls vdev_dtl_reassess().
 	 */
 	dtl_max_txg = txg + TXG_CONCURRENT_STATES;
 
 	if (raidz) {
 		/*
 		 * Wait for the youngest allocations and frees to sync,
 		 * and then wait for the deferral of those frees to finish.
 		 */
 		spa_vdev_config_exit(spa, NULL,
 		    txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG);
 
 		/* Stop initialize, TRIM and autotrim activity on this top-level. */
 		vdev_initialize_stop_all(tvd, VDEV_INITIALIZE_ACTIVE);
 		vdev_trim_stop_all(tvd, VDEV_TRIM_ACTIVE);
 		vdev_autotrim_stop_wait(tvd);
 
 		/* Re-enter the config; dtl_max_txg now holds the txg it returns. */
 		dtl_max_txg = spa_vdev_config_enter(spa);
 
 		tvd->vdev_rz_expanding = B_TRUE;
 
 		vdev_dirty_leaves(tvd, VDD_DTL, dtl_max_txg);
 		vdev_config_dirty(tvd);
 
 		/* vdev_raidz_attach_sync() is dispatched as a synctask. */
 		dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool,
 		    dtl_max_txg);
 		dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_raidz_attach_sync,
 		    newvd, tx);
 		dmu_tx_commit(tx);
 	} else {
 		vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL,
 		    dtl_max_txg - TXG_INITIAL);
 
 		if (newvd->vdev_isspare) {
 			spa_spare_activate(newvd);
 			spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE);
 		}
 
 		/* Remembered for the history message logged below. */
 		newvd_isspare = newvd->vdev_isspare;
 
 		/*
 		 * Mark newvd's DTL dirty in this txg.
 		 */
 		vdev_dirty(tvd, VDD_DTL, newvd, txg);
 
 		/*
 		 * Schedule the resilver or rebuild to restart in the future.
 		 * We do this to ensure that dmu_sync-ed blocks have been
 		 * stitched into the respective datasets.
 		 */
 		if (rebuild) {
 			newvd->vdev_rebuild_txg = txg;
 
 			vdev_rebuild(tvd);
 		} else {
 			newvd->vdev_resilver_txg = txg;
 
 			/*
 			 * With a resilver already running and the
 			 * resilver_defer feature enabled, defer this one
 			 * instead of restarting the resilver.
 			 */
 			if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
 			    spa_feature_is_enabled(spa,
 			    SPA_FEATURE_RESILVER_DEFER)) {
 				vdev_defer_resilver(newvd);
 			} else {
 				dsl_scan_restart_resilver(spa->spa_dsl_pool,
 				    dtl_max_txg);
 			}
 		}
 	}
 
 	/* An extra event is posted when the pool has a bootfs set. */
 	if (spa->spa_bootfs)
 		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
 
 	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
 
 	/*
 	 * Commit the config
 	 */
 	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
 
 	/* Logged as "spare in", "replace" or "attach" depending on the flags. */
 	spa_history_log_internal(spa, "vdev attach", NULL,
 	    "%s vdev=%s %s vdev=%s",
 	    replacing && newvd_isspare ? "spare in" :
 	    replacing ? "replace" : "attach", newvdpath,
 	    replacing ? "for" : "to", oldvdpath);
 
 	spa_strfree(oldvdpath);
 	spa_strfree(newvdpath);
 
 	return (0);
 }
 
 /*
  * Detach a device from a mirror or replacing vdev.
  *
  * 'guid' identifies the leaf vdev to detach.  When 'pguid' is non-zero it
  * must match the guid of that vdev's parent, otherwise the call fails with
  * EBUSY; a 'pguid' of zero skips this check.
  *
  * If 'replace_done' is specified, only detach if the parent
  * is a replacing or a spare vdev.
  *
  * Returns the result of spa_vdev_exit(): called with the detected error on
  * the failure paths, and with 0 once the detach has been carried out.
  */
 int
 spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
 {
 	uint64_t txg;
 	int error;
 	vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
 	vdev_t *vd, *pvd, *cvd, *tvd;
 	boolean_t unspare = B_FALSE;
 	uint64_t unspare_guid = 0;
 	char *vdpath;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_detach_enter(spa, guid);
 
 	/* The NULL check on vd is deferred until after the checkpoint test. */
 	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 
 	/*
 	 * Besides being called directly from the userland through the
 	 * ioctl interface, spa_vdev_detach() can be potentially called
 	 * at the end of spa_vdev_resilver_done().
 	 *
 	 * In the regular case, when we have a checkpoint this shouldn't
 	 * happen as we never empty the DTLs of a vdev during the scrub
 	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
 	 * should never get here when we have a checkpoint.
 	 *
 	 * That said, even in a case when we checkpoint the pool exactly
 	 * as spa_vdev_resilver_done() calls this function everything
 	 * should be fine as the resilver will return right away.
 	 */
 	ASSERT(spa_namespace_held());
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	if (vd == NULL)
 		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
 
 	/* Only leaf vdevs can be detached. */
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	pvd = vd->vdev_parent;
 
 	/*
 	 * If the parent/child relationship is not as expected, don't do it.
 	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
 	 * vdev that's replacing B with C.  The user's intent in replacing
 	 * is to go from M(A,B) to M(A,C).  If the user decides to cancel
 	 * the replace by detaching C, the expected behavior is to end up
 	 * M(A,B).  But suppose that right after deciding to detach C,
 	 * the replacement of B completes.  We would have M(A,C), and then
 	 * ask to detach C, which would leave us with just A -- not what
 	 * the user wanted.  To prevent this, we make sure that the
 	 * parent/child relationship hasn't changed -- in this example,
 	 * that C's parent is still the replacing vdev R.
 	 */
 	if (pvd->vdev_guid != pguid && pguid != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	/*
 	 * Only 'replacing' or 'spare' vdevs can be replaced.
 	 */
 	if (replace_done && pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	ASSERT(pvd->vdev_ops != &vdev_spare_ops ||
 	    spa_version(spa) >= SPA_VERSION_SPARES);
 
 	/*
 	 * Only mirror, replacing, and spare vdevs support detach.
 	 */
 	if (pvd->vdev_ops != &vdev_replacing_ops &&
 	    pvd->vdev_ops != &vdev_mirror_ops &&
 	    pvd->vdev_ops != &vdev_spare_ops)
 		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
 
 	/*
 	 * If this device has the only valid copy of some data,
 	 * we cannot safely detach it.
 	 */
 	if (vdev_dtl_required(vd))
 		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
 
 	ASSERT(pvd->vdev_children >= 2);
 
 	/*
 	 * If we are detaching the second disk from a replacing vdev, then
 	 * check to see if we changed the original vdev's path to have "/old"
 	 * at the end in spa_vdev_attach().  If so, undo that change now.
 	 */
 	if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 &&
 	    vd->vdev_path != NULL) {
 		size_t len = strlen(vd->vdev_path);
 
 		for (int c = 0; c < pvd->vdev_children; c++) {
 			cvd = pvd->vdev_child[c];
 
 			if (cvd == vd || cvd->vdev_path == NULL)
 				continue;
 
 			/* Match a sibling whose path is exactly "<vd path>/old". */
 			if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 &&
 			    strcmp(cvd->vdev_path + len, "/old") == 0) {
 				spa_strfree(cvd->vdev_path);
 				cvd->vdev_path = spa_strdup(vd->vdev_path);
 				break;
 			}
 		}
 	}
 
 	/*
 	 * If we are detaching the original disk from a normal spare, then it
 	 * implies that the spare should become a real disk, and be removed
 	 * from the active spare list for the pool.  dRAID spares on the
 	 * other hand are coupled to the pool and thus should never be removed
 	 * from the spares list.
 	 */
 	if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0) {
 		vdev_t *last_cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 		if (last_cvd->vdev_isspare &&
 		    last_cvd->vdev_ops != &vdev_draid_spare_ops) {
 			unspare = B_TRUE;
 		}
 	}
 
 	/*
 	 * Erase the disk labels so the disk can be used for other things.
 	 * This must be done after all other error cases are handled,
 	 * but before we disembowel vd (so we can still do I/O to it).
 	 * But if we can't do it, don't treat the error as fatal --
 	 * it may be that the unwritability of the disk is the reason
 	 * it's being detached!
 	 */
 	(void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE);
 
 	/*
 	 * Remove vd from its parent and compact the parent's children.
 	 */
 	vdev_remove_child(pvd, vd);
 	vdev_compact_children(pvd);
 
 	/*
 	 * Remember one of the remaining children so we can get tvd below.
 	 */
 	cvd = pvd->vdev_child[pvd->vdev_children - 1];
 
 	/*
 	 * If we need to remove the remaining child from the list of hot spares,
 	 * do it now, marking the vdev as no longer a spare in the process.
 	 * We must do this before vdev_remove_parent(), because that can
 	 * change the GUID if it creates a new toplevel GUID.  For a similar
 	 * reason, we must remove the spare now, in the same txg as the detach;
 	 * otherwise someone could attach a new sibling, change the GUID, and
 	 * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail.
 	 */
 	if (unspare) {
 		ASSERT(cvd->vdev_isspare);
 		spa_spare_remove(cvd);
 		unspare_guid = cvd->vdev_guid;
 		(void) spa_vdev_remove(spa, unspare_guid, B_TRUE);
 		cvd->vdev_unspare = B_TRUE;
 	}
 
 	/*
 	 * If the parent mirror/replacing vdev only has one child,
 	 * the parent is no longer needed.  Remove it from the tree.
 	 * When that parent is a spare vdev, vdev_unspare is cleared on the
 	 * remaining child first.
 	 */
 	if (pvd->vdev_children == 1) {
 		if (pvd->vdev_ops == &vdev_spare_ops)
 			cvd->vdev_unspare = B_FALSE;
 		vdev_remove_parent(cvd);
 	}
 
 	/*
 	 * We don't set tvd until now because the parent we just removed
 	 * may have been the previous top-level vdev.
 	 */
 	tvd = cvd->vdev_top;
 	ASSERT(tvd->vdev_parent == rvd);
 
 	/*
 	 * Reevaluate the parent vdev state.
 	 */
 	vdev_propagate_state(cvd);
 
 	/*
 	 * If the 'autoexpand' property is set on the pool then automatically
 	 * try to expand the size of the pool. For example if the device we
 	 * just detached was smaller than the others, it may be possible to
 	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
 	 * first so that we can obtain the updated sizes of the leaf vdevs.
 	 */
 	if (spa->spa_autoexpand) {
 		vdev_reopen(tvd);
 		vdev_expand(tvd, txg);
 	}
 
 	vdev_config_dirty(tvd);
 
 	/*
 	 * Mark vd's DTL as dirty in this txg.  vdev_dtl_sync() will see that
 	 * vd->vdev_detached is set and free vd's DTL object in syncing context.
 	 * But first make sure we're not on any *other* txg's DTL list, to
 	 * prevent vd from being accessed after it's freed.
 	 *
 	 * The path is copied here because it is needed for the history log
 	 * after vd has been handed to spa_vdev_exit().
 	 */
 	vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
 	for (int t = 0; t < TXG_SIZE; t++)
 		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
 	vd->vdev_detached = B_TRUE;
 	vdev_dirty(tvd, VDD_DTL, vd, txg);
 
 	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
 	spa_notify_waiters(spa);
 
 	/* hang on to the spa before we release the lock */
 	spa_open_ref(spa, FTAG);
 
 	error = spa_vdev_exit(spa, vd, txg, 0);
 
 	spa_history_log_internal(spa, "detach", NULL,
 	    "vdev=%s", vdpath);
 	spa_strfree(vdpath);
 
 	/*
 	 * If this was the removal of the original device in a hot spare vdev,
 	 * then we want to go through and remove the device from the hot spare
 	 * list of every other pool.
 	 */
 	if (unspare) {
 		spa_t *altspa = NULL;
 
 		spa_namespace_enter(FTAG);
 		while ((altspa = spa_next(altspa)) != NULL) {
 			if (altspa->spa_state != POOL_STATE_ACTIVE ||
 			    altspa == spa)
 				continue;
 
 			/*
 			 * The namespace lock is dropped around
 			 * spa_vdev_remove(); the reference taken here
 			 * presumably keeps altspa valid meanwhile.
 			 */
 			spa_open_ref(altspa, FTAG);
 			spa_namespace_exit(FTAG);
 			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
 			spa_namespace_enter(FTAG);
 			spa_close(altspa, FTAG);
 		}
 		spa_namespace_exit(FTAG);
 
 		/* search the rest of the vdevs for spares to remove */
 		spa_vdev_resilver_done(spa);
 	}
 
 	/* all done with the spa; OK to release */
 	spa_namespace_enter(FTAG);
 	spa_close(spa, FTAG);
 	spa_namespace_exit(FTAG);
 
 	return (error);
 }
 
 /*
  * Apply one initialize command (start, cancel, suspend or uninit) to the
  * leaf vdev identified by 'guid'.  The caller must hold the namespace lock.
  *
  * The vdev is looked up and validated under SCL_CONFIG | SCL_STATE, and its
  * vdev_initialize_lock is taken before those config locks are dropped.
  * 'vd_list' is handed to vdev_initialize_stop() for cancel and suspend.
  *
  * Returns 0 on success, or ENODEV, EINVAL, EROFS, EBUSY or ESRCH when the
  * vdev or its current initialize state does not permit the command.
  */
 static int
 spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
     list_t *vd_list)
 {
 	int rc = 0;
 
 	ASSERT(spa_namespace_held());
 
 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 
 	/* Find the vdev; it must be a writeable, concrete leaf. */
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 	if (vd == NULL || vd->vdev_detached)
 		rc = ENODEV;
 	else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd))
 		rc = EINVAL;
 	else if (!vdev_writeable(vd))
 		rc = EROFS;
 
 	if (rc != 0) {
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		return (SET_ERROR(rc));
 	}
 
 	/* Take the per-vdev lock before letting go of the config locks. */
 	mutex_enter(&vd->vdev_initialize_lock);
 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
 	switch (cmd_type) {
 	case POOL_INITIALIZE_START:
 		/*
 		 * The thread pointer is tested rather than
 		 * vdev_initialize_state because an earlier initialization
 		 * may have completed while its thread has not yet exited.
 		 */
 		if (vd->vdev_initialize_thread != NULL ||
 		    vd->vdev_top->vdev_removing ||
 		    vd->vdev_top->vdev_rz_expanding)
 			rc = EBUSY;
 		else
 			vdev_initialize(vd);
 		break;
 	case POOL_INITIALIZE_CANCEL:
 		/* Only an active or suspended initialize can be cancelled. */
 		if (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE &&
 		    vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)
 			rc = ESRCH;
 		else
 			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
 			    vd_list);
 		break;
 	case POOL_INITIALIZE_SUSPEND:
 		/* Only an active initialize can be suspended. */
 		if (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE)
 			rc = ESRCH;
 		else
 			vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED,
 			    vd_list);
 		break;
 	case POOL_INITIALIZE_UNINIT:
 		/* Refuse while an initialize thread still exists. */
 		if (vd->vdev_initialize_thread != NULL)
 			rc = EBUSY;
 		else
 			vdev_uninitialize(vd);
 		break;
 	default:
 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
 	}
 	mutex_exit(&vd->vdev_initialize_lock);
 
 	if (rc != 0)
 		return (SET_ERROR(rc));
 
 	return (0);
 }
 
 /*
  * Apply the initialize command 'cmd_type' to every vdev guid listed in 'nv'.
  * A guid whose command fails is recorded in 'vdev_errlist', keyed by the
  * guid printed in decimal with the error as its value.
  *
  * Returns the number of vdevs for which the command failed.
  */
 int
 spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     nvlist_t *vdev_errlist)
 {
 	list_t stopping;
 	nvpair_t *elem = NULL;
 	int nfailed = 0;
 
 	list_create(&stopping, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_initialize_node));
 
 	/*
 	 * The namespace lock is held for the whole function so the pool
 	 * cannot change while initialization is being started or stopped.
 	 * The config and state locks are taken per vdev by the helper so
 	 * the vdev state can be assessed before committing to the operation.
 	 */
 	spa_namespace_enter(FTAG);
 
 	while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) {
 		char key[MAXNAMELEN];
 		uint64_t target = fnvpair_value_uint64(elem);
 		int rc = spa_vdev_initialize_impl(spa, target, cmd_type,
 		    &stopping);
 
 		if (rc == 0)
 			continue;
 
 		(void) snprintf(key, sizeof (key),
 		    "%llu", (unsigned long long)target);
 		fnvlist_add_int64(vdev_errlist, key, rc);
 		nfailed++;
 	}
 
 	/* Block until every initialize thread being stopped has stopped. */
 	vdev_initialize_stop_wait(spa, &stopping);
 
 	/* Push the initializing state out to disk. */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 	spa_namespace_exit(FTAG);
 
 	list_destroy(&stopping);
 
 	return (nfailed);
 }
 
 /*
  * Apply one TRIM command (start, cancel or suspend) to the leaf vdev
  * identified by 'guid'.  The caller must hold the namespace lock.
  *
  * The vdev is looked up and validated under SCL_CONFIG | SCL_STATE, and its
  * vdev_trim_lock is taken before those config locks are dropped.  'rate',
  * 'partial' and 'secure' are passed to vdev_trim() on start; 'vd_list' is
  * handed to vdev_trim_stop() for cancel and suspend.
  *
  * Returns 0 on success, or ENODEV, EINVAL, EROFS, EOPNOTSUPP, EBUSY or
  * ESRCH when the vdev or its current TRIM state does not permit the command.
  */
 static int
 spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
     uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list)
 {
 	int rc = 0;
 
 	ASSERT(spa_namespace_held());
 
 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 
 	/*
 	 * Find the vdev; it must be a writeable, concrete leaf that
 	 * supports the kind of TRIM being requested.
 	 */
 	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 	if (vd == NULL || vd->vdev_detached)
 		rc = ENODEV;
 	else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd))
 		rc = EINVAL;
 	else if (!vdev_writeable(vd))
 		rc = EROFS;
 	else if (!vd->vdev_has_trim ||
 	    (secure && !vd->vdev_has_securetrim))
 		rc = EOPNOTSUPP;
 
 	if (rc != 0) {
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		return (SET_ERROR(rc));
 	}
 
 	/* Take the per-vdev lock before letting go of the config locks. */
 	mutex_enter(&vd->vdev_trim_lock);
 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 
 	switch (cmd_type) {
 	case POOL_TRIM_START:
 		/*
 		 * The thread pointer is tested rather than vdev_trim_state
 		 * because an earlier TRIM may have completed while its
 		 * thread has not yet exited.
 		 */
 		if (vd->vdev_trim_thread != NULL ||
 		    vd->vdev_top->vdev_removing ||
 		    vd->vdev_top->vdev_rz_expanding)
 			rc = EBUSY;
 		else
 			vdev_trim(vd, rate, partial, secure);
 		break;
 	case POOL_TRIM_CANCEL:
 		/* Only an active or suspended TRIM can be cancelled. */
 		if (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
 		    vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)
 			rc = ESRCH;
 		else
 			vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
 		break;
 	case POOL_TRIM_SUSPEND:
 		/* Only an active TRIM can be suspended. */
 		if (vd->vdev_trim_state != VDEV_TRIM_ACTIVE)
 			rc = ESRCH;
 		else
 			vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
 		break;
 	default:
 		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
 	}
 	mutex_exit(&vd->vdev_trim_lock);
 
 	if (rc != 0)
 		return (SET_ERROR(rc));
 
 	return (0);
 }
 
 /*
  * Run a manual TRIM command against every vdev named in nv.  Each nvpair
  * value in nv is a vdev guid; the command is applied to each one through
  * spa_vdev_trim_impl().
  *
  * A failure for a vdev is recorded in vdev_errlist, keyed by the guid
  * printed as a decimal string.  The return value is the number of vdevs
  * for which the command failed.
  */
 int
 spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
     boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
 {
 	list_t stopped_vdevs;
 	int nfailed = 0;
 
 	list_create(&stopped_vdevs, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_trim_node));
 
 	/*
 	 * The namespace lock is held for the whole operation so that the
 	 * pool cannot change while TRIM is being started or stopped.  The
 	 * config and state locks are taken per vdev so that its state can
 	 * be assessed before committing to the TRIM operation.
 	 */
 	spa_namespace_enter(FTAG);
 
 	nvpair_t *elem = nvlist_next_nvpair(nv, NULL);
 	while (elem != NULL) {
 		uint64_t leaf_guid = fnvpair_value_uint64(elem);
 		int rc = spa_vdev_trim_impl(spa, leaf_guid, cmd_type,
 		    rate, partial, secure, &stopped_vdevs);
 
 		if (rc != 0) {
 			char key[MAXNAMELEN];
 
 			(void) snprintf(key, sizeof (key),
 			    "%llu", (unsigned long long)leaf_guid);
 			fnvlist_add_int64(vdev_errlist, key, rc);
 			nfailed++;
 		}
 
 		elem = nvlist_next_nvpair(nv, elem);
 	}
 
 	/* Block until every TRIM thread we asked to stop has done so. */
 	vdev_trim_stop_wait(spa, &stopped_vdevs);
 
 	/* Push the new TRIM state out to disk before releasing the lock. */
 	txg_wait_synced(spa->spa_dsl_pool, 0);
 	spa_namespace_exit(FTAG);
 
 	list_destroy(&stopped_vdevs);
 
 	return (nfailed);
 }
 
 /*
  * Split a set of devices from their mirrors, and create a new pool from them.
  *
  * spa	   pool being split; must be writeable.
  * newname name of the pool to create; EEXIST if a pool by that name is
  *	   already known.
  * config  nvlist whose ZPOOL_CONFIG_VDEV_TREE children name the devices
  *	   to split off.  It is modified in place (pool name, state,
  *	   version, txg, guid, per-vdev details) and passed to spa_add().
  * props   properties applied to the new pool with spa_prop_set(); that
  *	   step is skipped when props is NULL.
  * exp	   if set, the new pool is created in POOL_STATE_EXPORTED and is
  *	   exported before returning.
  *
  * Returns 0 on success or an error code.  If loading the new pool or
  * setting its properties fails, the new pool is torn down and the
  * offlined devices are brought back online in the original pool.
  */
 int
 spa_vdev_split_mirror(spa_t *spa, const char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp)
 {
 	int error = 0;
 	uint64_t txg, *glist;
 	spa_t *newspa;
 	uint_t c, children, lastlog;
 	nvlist_t **child, *nvl, *tmp;
 	dmu_tx_t *tx;
 	const char *altroot = NULL;
 	vdev_t *rvd, **vml = NULL;			/* vdev modify list */
 	boolean_t activate_slog;
 
 	ASSERT(spa_writeable(spa));
 
 	txg = spa_vdev_enter(spa);
 
 	ASSERT(spa_namespace_held());
 	/* refuse to split while the pool checkpoint feature is active */
 	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
 		error = (spa_has_checkpoint(spa)) ?
 		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/* clear the log and flush everything up to now */
 	activate_slog = spa_passivate_log(spa);
 	(void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 	error = spa_reset_logs(spa);
 	txg = spa_vdev_config_enter(spa);
 
 	if (activate_slog)
 		spa_activate_log(spa);
 
 	if (error != 0)
 		return (spa_vdev_exit(spa, NULL, txg, error));
 
 	/* check new spa name before going any further */
 	if (spa_lookup(newname) != NULL)
 		return (spa_vdev_exit(spa, NULL, txg, EEXIST));
 
 	/*
 	 * scan through all the children to ensure they're all mirrors
 	 */
 	if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 ||
 	    nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child,
 	    &children) != 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* first, check to ensure we've got the right child count */
 	rvd = spa->spa_root_vdev;
 	/*
 	 * lastlog records the index at which the current run of skipped
 	 * (log or non-concrete, non-indirect) top-level vdevs began; it is
 	 * reset to 0 each time a counted vdev is seen, so after the loop a
 	 * non-zero value marks the start of the trailing skipped run.
 	 */
 	lastlog = 0;
 	for (c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *vd = rvd->vdev_child[c];
 
 		/* don't count the holes & logs as children */
 		if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops &&
 		    !vdev_is_concrete(vd))) {
 			if (lastlog == 0)
 				lastlog = c;
 			continue;
 		}
 
 		lastlog = 0;
 	}
 	if (children != (lastlog != 0 ? lastlog : rvd->vdev_children))
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/* next, ensure no spare or cache devices are part of the split */
 	if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 ||
 	    nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0)
 		return (spa_vdev_exit(spa, NULL, txg, EINVAL));
 
 	/*
 	 * vml[c] and glist[c] stay zero for slots that are skipped below
 	 * (holes, logs and indirect vdevs); later loops test vml[c] != NULL.
 	 */
 	vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP);
 	glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP);
 
 	/* then, loop over each vdev and validate it */
 	for (c = 0; c < children; c++) {
 		uint64_t is_hole = 0;
 
 		(void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE,
 		    &is_hole);
 
 		if (is_hole != 0) {
 			if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole ||
 			    spa->spa_root_vdev->vdev_child[c]->vdev_islog) {
 				continue;
 			} else {
 				error = SET_ERROR(EINVAL);
 				break;
 			}
 		}
 
 		/* deal with indirect vdevs */
 		if (spa->spa_root_vdev->vdev_child[c]->vdev_ops ==
 		    &vdev_indirect_ops)
 			continue;
 
 		/* which disk is going to be split? */
 		if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID,
 		    &glist[c]) != 0) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		/* look it up in the spa */
 		vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE);
 		if (vml[c] == NULL) {
 			error = SET_ERROR(ENODEV);
 			break;
 		}
 
 		/* make sure there's nothing stopping the split */
 		if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops ||
 		    vml[c]->vdev_islog ||
 		    !vdev_is_concrete(vml[c]) ||
 		    vml[c]->vdev_isspare ||
 		    vml[c]->vdev_isl2cache ||
 		    !vdev_writeable(vml[c]) ||
 		    vml[c]->vdev_children != 0 ||
 		    vml[c]->vdev_state != VDEV_STATE_HEALTHY ||
 		    c != spa->spa_root_vdev->vdev_child[c]->vdev_id) {
 			error = SET_ERROR(EINVAL);
 			break;
 		}
 
 		if (vdev_dtl_required(vml[c]) ||
 		    vdev_resilver_needed(vml[c], NULL, NULL)) {
 			error = SET_ERROR(EBUSY);
 			break;
 		}
 
 		/* we need certain info from the top level */
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY,
 		    vml[c]->vdev_top->vdev_ms_array);
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT,
 		    vml[c]->vdev_top->vdev_ms_shift);
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE,
 		    vml[c]->vdev_top->vdev_asize);
 		fnvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT,
 		    vml[c]->vdev_top->vdev_ashift);
 
 		/* transfer per-vdev ZAPs */
 		ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0);
 		VERIFY0(nvlist_add_uint64(child[c],
 		    ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap));
 
 		/*
 		 * NOTE(review): the assertion checks vdev_top->vdev_top_zap
 		 * while the value stored comes from vdev_parent; presumably
 		 * the mirror parent is the top-level vdev here -- confirm.
 		 */
 		ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0);
 		VERIFY0(nvlist_add_uint64(child[c],
 		    ZPOOL_CONFIG_VDEV_TOP_ZAP,
 		    vml[c]->vdev_parent->vdev_top_zap));
 	}
 
 	/* validation failed: nothing has been modified yet, just clean up */
 	if (error != 0) {
 		kmem_free(vml, children * sizeof (vdev_t *));
 		kmem_free(glist, children * sizeof (uint64_t));
 		return (spa_vdev_exit(spa, NULL, txg, error));
 	}
 
 	/* stop writers from using the disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_TRUE;
 	}
 	vdev_reopen(spa->spa_root_vdev);
 
 	/*
 	 * Temporarily record the splitting vdevs in the spa config.  This
 	 * will disappear once the config is regenerated.
 	 */
 	nvl = fnvlist_alloc();
 	fnvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children);
 	kmem_free(glist, children * sizeof (uint64_t));
 
 	mutex_enter(&spa->spa_props_lock);
 	fnvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl);
 	mutex_exit(&spa->spa_props_lock);
 	spa->spa_config_splitting = nvl;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	/* configure and create the new pool */
 	fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE,
 	    exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa));
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg);
 	fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID,
 	    spa_generate_guid(NULL));
 	VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS));
 	/* altroot stays NULL if the lookup fails */
 	(void) nvlist_lookup_string(props,
 	    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot);
 
 	/* add the new pool to the namespace */
 	newspa = spa_add(newname, config, altroot);
 	newspa->spa_avz_action = AVZ_ACTION_REBUILD;
 	newspa->spa_config_txg = spa->spa_config_txg;
 	spa_set_log_state(newspa, SPA_LOG_CLEAR);
 
 	/* release the spa config lock, retaining the namespace lock */
 	spa_vdev_config_exit(spa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 1);
 
 	spa_activate(newspa, spa_mode_global);
 	spa_async_suspend(newspa);
 
 	/*
 	 * Temporarily stop the initializing and TRIM activity.  We set the
 	 * state to ACTIVE so that we know to resume initializing or TRIM
 	 * once the split has completed.
 	 */
 	list_t vd_initialize_list;
 	list_create(&vd_initialize_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_initialize_node));
 
 	list_t vd_trim_list;
 	list_create(&vd_trim_list, sizeof (vdev_t),
 	    offsetof(vdev_t, vdev_trim_node));
 
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
 			mutex_enter(&vml[c]->vdev_initialize_lock);
 			vdev_initialize_stop(vml[c],
 			    VDEV_INITIALIZE_ACTIVE, &vd_initialize_list);
 			mutex_exit(&vml[c]->vdev_initialize_lock);
 
 			mutex_enter(&vml[c]->vdev_trim_lock);
 			vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list);
 			mutex_exit(&vml[c]->vdev_trim_lock);
 		}
 	}
 
 	vdev_initialize_stop_wait(spa, &vd_initialize_list);
 	vdev_trim_stop_wait(spa, &vd_trim_list);
 
 	list_destroy(&vd_initialize_list);
 	list_destroy(&vd_trim_list);
 
 	newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT;
 	newspa->spa_is_splitting = B_TRUE;
 
 	/* create the new pool from the disks of the original pool */
 	error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE);
 	if (error)
 		goto out;
 
 	/* if that worked, generate a real config for the new pool */
 	if (newspa->spa_root_vdev != NULL) {
 		newspa->spa_config_splitting = fnvlist_alloc();
 		fnvlist_add_uint64(newspa->spa_config_splitting,
 		    ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa));
 		spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL,
 		    B_TRUE));
 	}
 
 	/* set the props */
 	if (props != NULL) {
 		spa_configfile_set(newspa, props, B_FALSE);
 		error = spa_prop_set(newspa, props);
 		if (error)
 			goto out;
 	}
 
 	/* flush everything */
 	txg = spa_vdev_config_enter(newspa);
 	vdev_config_dirty(newspa->spa_root_vdev);
 	(void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 2);
 
 	spa_async_resume(newspa);
 
 	/* finally, update the original pool's config */
 	txg = spa_vdev_config_enter(spa);
 	tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
 	error = dmu_tx_assign(tx, DMU_TX_WAIT);
 	/*
 	 * If the tx cannot be assigned it is aborted here.  The vdevs are
 	 * still split off below; only the per-vdev "detach" history
 	 * records and the final dmu_tx_commit() are skipped.
 	 */
 	if (error != 0)
 		dmu_tx_abort(tx);
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) {
 			vdev_t *tvd = vml[c]->vdev_top;
 
 			/*
 			 * Need to be sure the detachable VDEV is not
 			 * on any *other* txg's DTL list to prevent it
 			 * from being accessed after it's freed.
 			 */
 			for (int t = 0; t < TXG_SIZE; t++) {
 				(void) txg_list_remove_this(
 				    &tvd->vdev_dtl_list, vml[c], t);
 			}
 
 			vdev_split(vml[c]);
 			if (error == 0)
 				spa_history_log_internal(spa, "detach", tx,
 				    "vdev=%s", vml[c]->vdev_path);
 
 			vdev_free(vml[c]);
 		}
 	}
 	spa->spa_avz_action = AVZ_ACTION_REBUILD;
 	vdev_config_dirty(spa->spa_root_vdev);
 	spa->spa_config_splitting = NULL;
 	nvlist_free(nvl);
 	if (error == 0)
 		dmu_tx_commit(tx);
 	(void) spa_vdev_exit(spa, NULL, txg, 0);
 
 	if (zio_injection_enabled)
 		zio_handle_panic_injection(spa, FTAG, 3);
 
 	/* split is complete; log a history record */
 	spa_history_log_internal(newspa, "split", NULL,
 	    "from pool %s", spa_name(spa));
 
 	newspa->spa_is_splitting = B_FALSE;
 	kmem_free(vml, children * sizeof (vdev_t *));
 
 	/* if we're not going to mount the filesystems in userland, export */
 	if (exp)
 		error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL,
 		    B_FALSE, B_FALSE);
 
 	return (error);
 
 out:
 	/* failure after the new pool was added: tear it down and undo */
 	spa_unload(newspa);
 	spa_deactivate(newspa);
 	spa_remove(newspa);
 
 	txg = spa_vdev_config_enter(spa);
 
 	/* re-online all offlined disks */
 	for (c = 0; c < children; c++) {
 		if (vml[c] != NULL)
 			vml[c]->vdev_offline = B_FALSE;
 	}
 
 	/* restart initializing or trimming disks as necessary */
 	spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART);
 	spa_async_request(spa, SPA_ASYNC_TRIM_RESTART);
 	spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART);
 
 	vdev_reopen(spa->spa_root_vdev);
 
 	nvlist_free(spa->spa_config_splitting);
 	spa->spa_config_splitting = NULL;
 	(void) spa_vdev_exit(spa, NULL, txg, error);
 
 	kmem_free(vml, children * sizeof (vdev_t *));
 	return (error);
 }
 
 /*
  * Search the vdev tree rooted at vd for a device that is ready to be
  * detached: the old side of a finished replacement, or a member of a
  * spare vdev that is no longer needed.  Children are searched before vd
  * itself.  Returns the vdev to detach, or NULL if there is none.
  */
 static vdev_t *
 spa_vdev_resilver_done_hunt(vdev_t *vd)
 {
 	for (int i = 0; i < vd->vdev_children; i++) {
 		vdev_t *hit = spa_vdev_resilver_done_hunt(vd->vdev_child[i]);
 
 		if (hit != NULL)
 			return (hit);
 	}
 
 	/*
 	 * Completed replacement.  The first child is treated as the
 	 * oldest device and the last child as the newest (see
 	 * spa_vdev_attach()).  A faulted newest device is never removed
 	 * automatically after a resilver; the admin has to decide which
 	 * disk to keep.
 	 */
 	if (vd->vdev_ops == &vdev_replacing_ops) {
 		ASSERT(vd->vdev_children > 1);
 
 		vdev_t *newest = vd->vdev_child[vd->vdev_children - 1];
 		vdev_t *oldest = vd->vdev_child[0];
 
 		if (vdev_dtl_empty(newest, DTL_MISSING) &&
 		    vdev_dtl_empty(newest, DTL_OUTAGE) &&
 		    !vdev_dtl_required(oldest))
 			return (oldest);
 	}
 
 	/* Everything below applies to spare vdevs only. */
 	if (vd->vdev_ops != &vdev_spare_ops)
 		return (NULL);
 
 	vdev_t *first = vd->vdev_child[0];
 	vdev_t *last = vd->vdev_child[vd->vdev_children - 1];
 	vdev_t *detach = NULL;
 	vdev_t *keep = NULL;
 
 	/*
 	 * Completed resilver with the 'unspare' flag set: the child
 	 * carrying the flag is kept and the opposite end is detached.
 	 */
 	if (last->vdev_unspare) {
 		detach = first;
 		keep = last;
 	} else if (first->vdev_unspare) {
 		detach = last;
 		keep = first;
 	}
 
 	if (detach != NULL &&
 	    vdev_dtl_empty(keep, DTL_MISSING) &&
 	    vdev_dtl_empty(keep, DTL_OUTAGE) &&
 	    !vdev_dtl_required(detach))
 		return (detach);
 
 	/* Also potentially update the faulted state of the spare vdev. */
 	vdev_propagate_state(vd);
 
 	/*
 	 * With more than two children, try to release a spare that is not
 	 * required so that other pools can use it.  Once only a single
 	 * disk+spare pair remains, no more are removed.
 	 */
 	if (vd->vdev_children > 2) {
 		vdev_t *second = vd->vdev_child[1];
 
 		if (second->vdev_isspare && last->vdev_isspare &&
 		    vdev_dtl_empty(last, DTL_MISSING) &&
 		    vdev_dtl_empty(last, DTL_OUTAGE) &&
 		    !vdev_dtl_required(second))
 			return (second);
 	}
 
 	return (NULL);
 }
 
 /*
  * Detach every device that spa_vdev_resilver_done_hunt() reports as
  * finished.  The config lock is dropped around each spa_vdev_detach()
  * call and retaken before hunting again.  If a detach fails, the function
  * returns immediately without notifying waiters.
  */
 static void
 spa_vdev_resilver_done(spa_t *spa)
 {
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	for (;;) {
 		vdev_t *vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev);
 
 		if (vd == NULL)
 			break;
 
 		/* Capture the guids now; the lock is dropped below. */
 		vdev_t *pvd = vd->vdev_parent;
 		vdev_t *ppvd = pvd->vdev_parent;
 		uint64_t guid = vd->vdev_guid;
 		uint64_t pguid = pvd->vdev_guid;
 		uint64_t ppguid = ppvd->vdev_guid;
 		uint64_t sguid = 0;
 
 		/*
 		 * A finished replacement of a hot spared device also
 		 * requires detaching the grandparent's second child (the
 		 * original hot spare).
 		 */
 		if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 &&
 		    ppvd->vdev_children == 2) {
 			ASSERT(pvd->vdev_ops == &vdev_replacing_ops);
 			sguid = ppvd->vdev_child[1]->vdev_guid;
 		}
 		ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd));
 
 		spa_config_exit(spa, SCL_ALL, FTAG);
 
 		if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0)
 			return;
 		if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0)
 			return;
 
 		spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 	}
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/*
 	 * If no detach happened above, replace waiters have not been
 	 * notified yet, so do it here.
 	 */
 	spa_notify_waiters(spa);
 }
 
 /*
  * Update the stored path or FRU for this vdev.
  *
  * spa	   pool containing the vdev; must be writeable.
  * guid	   guid of the vdev to update.
  * value   new string to store; it is copied with spa_strdup().
  * ispath  B_TRUE to update vdev_path, B_FALSE to update vdev_fru.
  *
  * Returns ENOENT if no vdev has the given guid and ENOTSUP if the vdev
  * is not a leaf; otherwise returns the result of spa_vdev_state_exit().
  * The vdev is handed to spa_vdev_state_exit() only when the stored string
  * actually changed.
  */
 static int
 spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value,
     boolean_t ispath)
 {
 	vdev_t *vd;
 	boolean_t sync = B_FALSE;
 
 	ASSERT(spa_writeable(spa));
 
 	spa_vdev_state_enter(spa, SCL_ALL);
 
 	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
 		return (spa_vdev_state_exit(spa, NULL, ENOENT));
 
 	if (!vd->vdev_ops->vdev_op_leaf)
 		return (spa_vdev_state_exit(spa, NULL, ENOTSUP));
 
 	/*
 	 * Both strings are handled the same way.  Either one may be unset
 	 * (NULL), in which case there is nothing to compare against or to
 	 * free, so the new value is simply stored.
 	 */
 	char **fieldp = ispath ? &vd->vdev_path : &vd->vdev_fru;
 
 	if (*fieldp == NULL) {
 		*fieldp = spa_strdup(value);
 		sync = B_TRUE;
 	} else if (strcmp(value, *fieldp) != 0) {
 		spa_strfree(*fieldp);
 		*fieldp = spa_strdup(value);
 		sync = B_TRUE;
 	}
 
 	return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0));
 }
 
 /*
  * Store newpath as the path of the vdev identified by guid.  Thin wrapper
  * around spa_vdev_set_common(); returns its result.
  */
 int
 spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath)
 {
 	const boolean_t is_path = B_TRUE;
 
 	return (spa_vdev_set_common(spa, guid, newpath, is_path));
 }
 
 /*
  * Store newfru as the FRU of the vdev identified by guid.  Thin wrapper
  * around spa_vdev_set_common(); returns its result.
  */
 int
 spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru)
 {
 	const boolean_t is_path = B_FALSE;
 
 	return (spa_vdev_set_common(spa, guid, newfru, is_path));
 }
 
 /*
  * ==========================================================================
  * SPA Scanning
  * ==========================================================================
  */
 /*
  * Pause or resume a scrub according to cmd.  Fails with EBUSY while a
  * resilver is in progress; otherwise returns the result of
  * dsl_scrub_set_pause_resume().  The caller must not hold any config
  * lock as writer.
  */
 int
 spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd)
 {
 	ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER));
 
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	if (!dsl_scan_resilvering(dp))
 		return (dsl_scrub_set_pause_resume(dp, cmd));
 
 	return (SET_ERROR(EBUSY));
 }
 
 /*
  * Cancel the current scan.  Fails with EBUSY while a resilver is in
  * progress; otherwise returns the result of dsl_scan_cancel().  The
  * caller must not hold any config lock as writer.
  */
 int
 spa_scan_stop(spa_t *spa)
 {
 	ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER));
 
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 
 	if (!dsl_scan_resilvering(dp))
 		return (dsl_scan_cancel(dp));
 
 	return (SET_ERROR(EBUSY));
 }
 
 /*
  * Start a scan of type func with no txg range restriction.  Equivalent
  * to spa_scan_range() with both txg bounds set to 0.
  */
 int
 spa_scan(spa_t *spa, pool_scan_func_t func)
 {
 	const uint64_t no_txg_bound = 0;
 
 	return (spa_scan_range(spa, func, no_txg_bound, no_txg_bound));
 }
 
 /*
  * Start a scan of type func, optionally limited to the txg range
  * [txgstart, txgend].  A non-zero range is accepted for POOL_SCAN_SCRUB
  * only.
  *
  * Returns ENOTSUP for an unknown function, for a txg range on anything
  * but a scrub, or when the feature the function depends on is not
  * enabled.  A resilver request with nothing to resilver posts
  * SPA_ASYNC_RESILVER_DONE and returns 0.  Otherwise returns the result
  * of dsl_scan().  The caller must not hold any config lock as writer.
  */
 int
 spa_scan_range(spa_t *spa, pool_scan_func_t func, uint64_t txgstart,
     uint64_t txgend)
 {
 	ASSERT0(spa_config_held(spa, SCL_ALL, RW_WRITER));
 
 	if (func == POOL_SCAN_NONE || func >= POOL_SCAN_FUNCS)
 		return (SET_ERROR(ENOTSUP));
 
 	if (func != POOL_SCAN_SCRUB && (txgstart != 0 || txgend != 0))
 		return (SET_ERROR(ENOTSUP));
 
 	switch (func) {
 	case POOL_SCAN_RESILVER:
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER))
 			return (SET_ERROR(ENOTSUP));
 
 		/*
 		 * A resilver with no DTL on any writeable leaf device has
 		 * nothing to do.
 		 */
 		if (!vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
 			spa_async_request(spa, SPA_ASYNC_RESILVER_DONE);
 			return (0);
 		}
 		break;
 	case POOL_SCAN_ERRORSCRUB:
 		if (!spa_feature_is_enabled(spa, SPA_FEATURE_HEAD_ERRLOG))
 			return (SET_ERROR(ENOTSUP));
 		break;
 	default:
 		break;
 	}
 
 	return (dsl_scan(spa->spa_dsl_pool, func, txgstart, txgend));
 }
 
 /*
  * ==========================================================================
  * SPA async task processing
  * ==========================================================================
  */
 
 /*
  * Walk the vdev tree rooted at vd and move every vdev flagged with
  * vdev_remove_wanted to VDEV_STATE_REMOVED, clearing its error counters
  * and posting a removal event.  by_kernel is passed through to
  * zfs_post_remove().
  */
 static void
 spa_async_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel)
 {
 	if (vd->vdev_remove_wanted) {
 		vdev_stat_t *vs = &vd->vdev_stat;
 
 		vd->vdev_remove_wanted = B_FALSE;
 		vd->vdev_delayed_close = B_FALSE;
 		vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE);
 
 		/*
 		 * Only the error counters are reset.  A full vdev_clear()
 		 * would also discard degraded/faulted state and try to
 		 * reopen the device, which is wasted work here.
 		 */
 		vs->vs_read_errors = 0;
 		vs->vs_write_errors = 0;
 		vs->vs_checksum_errors = 0;
 
 		vdev_state_dirty(vd->vdev_top);
 
 		/* Let userspace know the vdev has gone away. */
 		zfs_post_remove(spa, vd, by_kernel);
 	}
 
 	int child = 0;
 	while (child < vd->vdev_children) {
 		spa_async_remove(spa, vd->vdev_child[child], by_kernel);
 		child++;
 	}
 }
 
 /*
  * Walk the vdev tree rooted at vd and change the state of every vdev
  * flagged with vdev_fault_wanted.  The vdev is normally faulted; if it
  * holds the only valid copy of the data it is degraded instead and
  * *suspend is set to B_TRUE.  *suspend is never cleared here.
  */
 static void
 spa_async_fault_vdev(vdev_t *vd, boolean_t *suspend)
 {
 	if (vd->vdev_fault_wanted) {
 		vdev_state_t newstate;
 
 		vd->vdev_fault_wanted = B_FALSE;
 
 		if (vd->vdev_top->vdev_islog || vd->vdev_aux != NULL ||
 		    !vdev_dtl_required(vd)) {
 			newstate = VDEV_STATE_FAULTED;
 		} else {
 			/*
 			 * This device has the only valid copy of the
 			 * data: back off to degraded, and since a required
 			 * disk is missing ask for the pool to be suspended.
 			 */
 			newstate = VDEV_STATE_DEGRADED;
 			*suspend = B_TRUE;
 		}
 		vdev_set_state(vd, B_TRUE, newstate, VDEV_AUX_ERR_EXCEEDED);
 	}
 
 	int child = 0;
 	while (child < vd->vdev_children) {
 		spa_async_fault_vdev(vd->vdev_child[child], suspend);
 		child++;
 	}
 }
 
 /*
  * If the pool has autoexpand set, post ESC_ZFS_VDEV_AUTOEXPAND for every
  * leaf vdev under vd that has a physical path.  Children are visited
  * before vd itself.
  */
 static void
 spa_async_autoexpand(spa_t *spa, vdev_t *vd)
 {
 	if (spa->spa_autoexpand) {
 		for (int i = 0; i < vd->vdev_children; i++)
 			spa_async_autoexpand(spa, vd->vdev_child[i]);
 
 		if (vd->vdev_ops->vdev_op_leaf && vd->vdev_physpath != NULL) {
 			spa_event_notify(vd->vdev_spa, vd, NULL,
 			    ESC_ZFS_VDEV_AUTOEXPAND);
 		}
 	}
 }
 
 /*
  * Body of the pool's async worker thread; arg is the spa_t.
  *
  * Under spa_async_lock it takes a snapshot of spa_async_tasks and clears
  * the field, then services each SPA_ASYNC_* bit found in the snapshot in
  * the fixed order below.  When done it clears spa_async_thread and
  * broadcasts spa_async_cv (spa_async_suspend() waits on that condition),
  * then ends via thread_exit(); it never returns to its caller.
  */
 static __attribute__((noreturn)) void
 spa_async_thread(void *arg)
 {
 	spa_t *spa = (spa_t *)arg;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	int tasks;
 
 	ASSERT(spa->spa_sync_on);
 
 	/* claim all currently requested tasks in one step */
 	mutex_enter(&spa->spa_async_lock);
 	tasks = spa->spa_async_tasks;
 	spa->spa_async_tasks = 0;
 	mutex_exit(&spa->spa_async_lock);
 
 	/*
 	 * See if the config needs to be updated.
 	 */
 	if (tasks & SPA_ASYNC_CONFIG_UPDATE) {
 		uint64_t old_space, new_space;
 
 		/* total space across the classes before the update */
 		spa_namespace_enter(FTAG);
 		old_space = metaslab_class_get_space(spa_normal_class(spa));
 		old_space += metaslab_class_get_space(spa_special_class(spa));
 		old_space += metaslab_class_get_space(spa_dedup_class(spa));
 		old_space += metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
 		old_space += metaslab_class_get_space(
 		    spa_special_embedded_log_class(spa));
 
 		spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
 
 		/* the same sum again, after the update */
 		new_space = metaslab_class_get_space(spa_normal_class(spa));
 		new_space += metaslab_class_get_space(spa_special_class(spa));
 		new_space += metaslab_class_get_space(spa_dedup_class(spa));
 		new_space += metaslab_class_get_space(
 		    spa_embedded_log_class(spa));
 		new_space += metaslab_class_get_space(
 		    spa_special_embedded_log_class(spa));
 		spa_namespace_exit(FTAG);
 
 		/*
 		 * If the pool grew as a result of the config update,
 		 * then log an internal history event.
 		 */
 		if (new_space != old_space) {
 			spa_history_log_internal(spa, "vdev online", NULL,
 			    "pool '%s' size: %llu(+%llu)",
 			    spa_name(spa), (u_longlong_t)new_space,
 			    (u_longlong_t)(new_space - old_space));
 		}
 	}
 
 	/*
 	 * See if any devices need to be marked REMOVED.
 	 */
 	if (tasks & (SPA_ASYNC_REMOVE | SPA_ASYNC_REMOVE_BY_USER)) {
 		/* a BY_USER request takes precedence when both bits are set */
 		boolean_t by_kernel = B_TRUE;
 		if (tasks & SPA_ASYNC_REMOVE_BY_USER)
 			by_kernel = B_FALSE;
 		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_remove(spa, spa->spa_root_vdev, by_kernel);
 		for (int i = 0; i < spa->spa_l2cache.sav_count; i++)
 			spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i],
 			    by_kernel);
 		for (int i = 0; i < spa->spa_spares.sav_count; i++)
 			spa_async_remove(spa, spa->spa_spares.sav_vdevs[i],
 			    by_kernel);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
 
 	/* autoexpand is skipped while the pool is suspended */
 	if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) {
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		spa_async_autoexpand(spa, spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 	}
 
 	/*
 	 * See if any devices need to be marked faulted.
 	 */
 	if (tasks & SPA_ASYNC_FAULT_VDEV) {
 		spa_vdev_state_enter(spa, SCL_NONE);
 		boolean_t suspend = B_FALSE;
 		spa_async_fault_vdev(spa->spa_root_vdev, &suspend);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 		/* suspend the pool only after the state lock is dropped */
 		if (suspend)
 			zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
 	}
 
 	/*
 	 * If any devices are done replacing, detach them.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER_DONE ||
 	    tasks & SPA_ASYNC_REBUILD_DONE ||
 	    tasks & SPA_ASYNC_DETACH_SPARE) {
 		spa_vdev_resilver_done(spa);
 	}
 
 	/*
 	 * Kick off a resilver.
 	 */
 	/*
 	 * Skipped while a rebuild is active, and also when a resilver is
 	 * already running and the RESILVER_DEFER feature is enabled.
 	 */
 	if (tasks & SPA_ASYNC_RESILVER &&
 	    !vdev_rebuild_active(spa->spa_root_vdev) &&
 	    (!dsl_scan_resilvering(dp) ||
 	    !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)))
 		dsl_scan_restart_resilver(dp, 0);
 
 	/*
 	 * The restart and L2 cache tasks below each run with the namespace
 	 * lock held and a config lock held as reader.
 	 */
 	if (tasks & SPA_ASYNC_INITIALIZE_RESTART) {
 		spa_namespace_enter(FTAG);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_initialize_restart(spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_namespace_exit(FTAG);
 	}
 
 	if (tasks & SPA_ASYNC_TRIM_RESTART) {
 		spa_namespace_enter(FTAG);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_trim_restart(spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_namespace_exit(FTAG);
 	}
 
 	if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) {
 		spa_namespace_enter(FTAG);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_autotrim_restart(spa);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_namespace_exit(FTAG);
 	}
 
 	/*
 	 * Kick off L2 cache whole device TRIM.
 	 */
 	if (tasks & SPA_ASYNC_L2CACHE_TRIM) {
 		spa_namespace_enter(FTAG);
 		spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 		vdev_trim_l2arc(spa);
 		spa_config_exit(spa, SCL_CONFIG, FTAG);
 		spa_namespace_exit(FTAG);
 	}
 
 	/*
 	 * Kick off L2 cache rebuilding.
 	 */
 	if (tasks & SPA_ASYNC_L2CACHE_REBUILD) {
 		spa_namespace_enter(FTAG);
 		spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER);
 		l2arc_spa_rebuild_start(spa);
 		spa_config_exit(spa, SCL_L2ARC, FTAG);
 		spa_namespace_exit(FTAG);
 	}
 
 	/*
 	 * Let the world know that we're done.
 	 */
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_thread = NULL;
 	cv_broadcast(&spa->spa_async_cv);
 	mutex_exit(&spa->spa_async_lock);
 	thread_exit();
 }
 
 /*
  * Suspend async activity for the pool.  Increments spa_async_suspended,
  * waits for any running async thread to finish, suspends device removal
  * and cancels each of the pool's zthrs that exists.  Calls nest: each
  * one is undone by a matching spa_async_resume().
  */
 void
 spa_async_suspend(spa_t *spa)
 {
 	zthr_t *zt;
 
 	mutex_enter(&spa->spa_async_lock);
 	spa->spa_async_suspended++;
 	while (spa->spa_async_thread != NULL)
 		cv_wait(&spa->spa_async_cv, &spa->spa_async_lock);
 	mutex_exit(&spa->spa_async_lock);
 
 	spa_vdev_remove_suspend(spa);
 
 	/* Each zthr pointer is read exactly once before it is used. */
 	if ((zt = spa->spa_condense_zthr) != NULL)
 		zthr_cancel(zt);
 
 	if ((zt = spa->spa_raidz_expand_zthr) != NULL)
 		zthr_cancel(zt);
 
 	if ((zt = spa->spa_checkpoint_discard_zthr) != NULL)
 		zthr_cancel(zt);
 
 	if ((zt = spa->spa_livelist_delete_zthr) != NULL)
 		zthr_cancel(zt);
 
 	if ((zt = spa->spa_livelist_condense_zthr) != NULL)
 		zthr_cancel(zt);
 }
 
 /*
  * Undo one spa_async_suspend().  Decrements spa_async_suspended, restarts
  * device removal and resumes each of the pool's zthrs that exists.
  */
 void
 spa_async_resume(spa_t *spa)
 {
 	zthr_t *zt;
 
 	mutex_enter(&spa->spa_async_lock);
 	ASSERT(spa->spa_async_suspended != 0);
 	spa->spa_async_suspended--;
 	mutex_exit(&spa->spa_async_lock);
 
 	spa_restart_removal(spa);
 
 	/* Each zthr pointer is read exactly once before it is used. */
 	if ((zt = spa->spa_condense_zthr) != NULL)
 		zthr_resume(zt);
 
 	if ((zt = spa->spa_raidz_expand_zthr) != NULL)
 		zthr_resume(zt);
 
 	if ((zt = spa->spa_checkpoint_discard_zthr) != NULL)
 		zthr_resume(zt);
 
 	if ((zt = spa->spa_livelist_delete_zthr) != NULL)
 		zthr_resume(zt);
 
 	if ((zt = spa->spa_livelist_condense_zthr) != NULL)
 		zthr_resume(zt);
 }
 
 /*
  * Report whether the pool has async work that should run now.  Any task
  * other than SPA_ASYNC_CONFIG_UPDATE always counts.  A pending config
  * update counts unless spa_ccw_fail_time is set and fewer than
  * zfs_ccw_retry_interval seconds have elapsed since then.
  */
 static boolean_t
 spa_async_tasks_pending(spa_t *spa)
 {
 	uint_t pending = spa->spa_async_tasks;
 
 	if ((pending & ~SPA_ASYNC_CONFIG_UPDATE) != 0)
 		return (B_TRUE);
 
 	/* Only a config update can be pending from here on. */
 	if ((pending & SPA_ASYNC_CONFIG_UPDATE) == 0)
 		return (B_FALSE);
 
 	if (spa->spa_ccw_fail_time == 0)
 		return (B_TRUE);
 
 	/* The config update is held back until the retry interval passes. */
 	return (!((gethrtime() - spa->spa_ccw_fail_time) <
 	    ((hrtime_t)zfs_ccw_retry_interval * NANOSEC)));
 }
 
 /*
  * Start the pool's async thread if there is pending work, async activity
  * is not suspended and no async thread is already running.  All three
  * conditions are checked under spa_async_lock.
  */
 static void
 spa_async_dispatch(spa_t *spa)
 {
 	mutex_enter(&spa->spa_async_lock);
 	if (spa_async_tasks_pending(spa)) {
 		if (!spa->spa_async_suspended) {
 			if (spa->spa_async_thread == NULL) {
 				spa->spa_async_thread = thread_create(NULL, 0,
 				    spa_async_thread, spa, 0, &p0, TS_RUN,
 				    maxclsyspri);
 			}
 		}
 	}
 	mutex_exit(&spa->spa_async_lock);
 }
 
 /*
  * Record a request for the async task(s) in 'task' by OR-ing the bits
  * into spa_async_tasks under spa_async_lock.  The request is logged
  * first; this function does not start the async thread itself.
  */
 void
 spa_async_request(spa_t *spa, int task)
 {
 	kmutex_t *lock = &spa->spa_async_lock;
 
 	zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task);
 
 	mutex_enter(lock);
 	spa->spa_async_tasks |= task;
 	mutex_exit(lock);
 }
 
 /*
  * Return the pool's current set of requested async task bits.  The field
  * is read without taking spa_async_lock.
  */
 int
 spa_async_tasks(spa_t *spa)
 {
 	int requested = spa->spa_async_tasks;
 
 	return (requested);
 }
 
 /*
  * ==========================================================================
  * SPA syncing routines
  * ==========================================================================
  */
 
 
 /*
  * Callback that appends bp to the bpobj passed as arg, forwarding the
  * bp_freed flag and tx to bpobj_enqueue().  Always returns 0.
  */
 static int
 bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	bpobj_enqueue((bpobj_t *)arg, bp, bp_freed, tx);
 
 	return (0);
 }
 
 /*
  * Three-argument callback form of bpobj_enqueue_cb() that enqueues bp
  * with bp_freed set to B_FALSE.
  */
 int
 bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	const boolean_t freed = B_FALSE;
 
 	return (bpobj_enqueue_cb(arg, bp, freed, tx));
 }
 
 /*
  * Three-argument callback form of bpobj_enqueue_cb() that enqueues bp
  * with bp_freed set to B_TRUE.
  */
 int
 bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	const boolean_t freed = B_TRUE;
 
 	return (bpobj_enqueue_cb(arg, bp, freed, tx));
 }
 
 /*
  * Callback that issues an asynchronous free of bp in tx's txg.  arg is
  * the parent zio; the free zio is created as its child, using the
  * parent's spa and flags.  Always returns 0.
  */
 static int
 spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
 {
 	zio_t *parent = arg;
 	zio_t *free_zio = zio_free_sync(parent, parent->io_spa,
 	    dmu_tx_get_txg(tx), bp, parent->io_flags);
 
 	zio_nowait(free_zio);
 
 	return (0);
 }
 
 /*
  * Four-argument callback form of spa_free_sync_cb().  The entry must not
  * be marked as freed; bp_freed is only asserted, not forwarded.
  */
 static int
 bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
     dmu_tx_t *tx)
 {
 	ASSERT(!bp_freed);
 
 	int rc = spa_free_sync_cb(arg, bp, tx);
 
 	return (rc);
 }
 
 /*
  * Issue a free for every block pointer on bpl under a single root zio
  * and wait for all of them to complete.
  *
  * Note: this is deliberately a separate, non-inlined function so that
  * the time spent syncing frees is easy to measure with dtrace.
  */
 static void
 spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx)
 {
 	zio_t *zio;
 
 	zio = zio_root(spa, NULL, NULL, 0);
 
 	bplist_iterate(bpl, spa_free_sync_cb, zio, tx);
 
 	VERIFY0(zio_wait(zio));
 }
 
 /*
  * Free every block recorded in spa_deferred_bpobj under a single root
  * zio and wait for completion.  Work is done in sync pass 1 only; in any
  * other pass the function does nothing.
  *
  * Note: this is deliberately a separate, non-inlined function so that
  * the time spent syncing deferred frees is easy to measure with dtrace.
  */
 static void
 spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx)
 {
 	if (spa_sync_pass(spa) == 1) {
 		/*
 		 * Note:
 		 * Once the log space map feature is active, frees are no
 		 * longer deferred to the next TXG, so spa_deferred_bpobj
 		 * should be empty and this would amount to a no-op.
 		 *
 		 * The iteration is still performed (rather than returning
 		 * early) to cover the edge case where the feature was
 		 * activated in this TXG while deferred frees from the
 		 * previous TXG are still queued.
 		 */
 		zio_t *zio = zio_root(spa, NULL, NULL, 0);
 
 		VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj,
 		    bpobj_spa_free_sync_cb, zio, tx), ==, 0);
 		VERIFY0(zio_wait(zio));
 	}
 }
 
 /*
  * Pack nv in XDR encoding and write it to MOS object obj, zero-padded to a
  * multiple of SPA_CONFIG_BLOCKSIZE.  The exact packed length is stored as
  * a uint64_t at the start of the object's bonus buffer.
  */
 static void
 spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx)
 {
 	size_t nvsize = 0;
 	size_t padded;
 	char *packed;
 	dmu_buf_t *db;
 
 	VERIFY0(nvlist_size(nv, &nvsize, NV_ENCODE_XDR));
 
 	/*
 	 * Always write whole SPA_CONFIG_BLOCKSIZE blocks of configuration
 	 * data.  Doing so sidesteps the dmu_buf_will_dirty() path and
 	 * spares us a pre-read of data we have no interest in.
 	 */
 	padded = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE);
 	packed = vmem_alloc(padded, KM_SLEEP);
 
 	VERIFY0(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR,
 	    KM_SLEEP));
 
 	/* Clear the padding between the packed data and the buffer end. */
 	memset(&packed[nvsize], 0, padded - nvsize);
 
 	dmu_write(spa->spa_meta_objset, obj, 0, padded, packed, tx,
 	    DMU_READ_NO_PREFETCH);
 	vmem_free(packed, padded);
 
 	/* Remember the unpadded size in the bonus buffer. */
 	VERIFY0(dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db));
 	dmu_buf_will_dirty(db, tx);
 	*(uint64_t *)db->db_data = nvsize;
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * Write the list of auxiliary vdevs described by sav to the MOS, if it
  * has been flagged for syncing (sav_sync).  The backing object is
  * allocated and linked into the pool directory under "entry" on first
  * use; the vdev configs are stored as an nvlist array named "config".
  * Clears sav_sync when done.
  */
 static void
 spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx,
     const char *config, const char *entry)
 {
 	nvlist_t *nvroot;
 
 	if (!sav->sav_sync)
 		return;
 
 	/*
 	 * Refresh the MOS nvlist that describes the available devices.
 	 * By now spa_validate_aux() has verified that this nvlist is
 	 * valid and that the vdevs carry appropriate labels.
 	 */
 	if (sav->sav_object == 0) {
 		sav->sav_object = dmu_object_alloc(spa->spa_meta_objset,
 		    DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE,
 		    sizeof (uint64_t), tx);
 		VERIFY(zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1,
 		    &sav->sav_object, tx) == 0);
 	}
 
 	nvroot = fnvlist_alloc();
 	if (sav->sav_count != 0) {
 		size_t listsz = sav->sav_count * sizeof (void *);
 		nvlist_t **list = kmem_alloc(listsz, KM_SLEEP);
 		int i;
 
 		/* Generate one config nvlist per aux vdev. */
 		for (i = 0; i < sav->sav_count; i++) {
 			list[i] = vdev_config_generate(spa, sav->sav_vdevs[i],
 			    B_FALSE, VDEV_CONFIG_L2CACHE);
 		}
 		fnvlist_add_nvlist_array(nvroot, config,
 		    (const nvlist_t * const *)list, sav->sav_count);
 
 		/* The temporary per-vdev nvlists and array are released. */
 		for (i = 0; i < sav->sav_count; i++) {
 			nvlist_free(list[i]);
 		}
 		kmem_free(list, listsz);
 	} else {
 		fnvlist_add_nvlist_array(nvroot, config,
 		    (const nvlist_t * const *)NULL, 0);
 	}
 
 	spa_sync_nvlist(spa, sav->sav_object, nvroot, tx);
 	nvlist_free(nvroot);
 
 	sav->sav_sync = B_FALSE;
 }
 
 /*
  * Rebuild spa's all-vdev ZAP (avz) from the per-vdev ZAP object numbers
  * recorded in vd and, recursively, in all of its children.  A root ZAP is
  * only added when the AVZ_V2 feature is active.
  * The all-vdev ZAP must be empty.
  */
 static void
 spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx)
 {
 	spa_t *spa = vd->vdev_spa;
 	uint64_t i = 0;
 
 	if (vd->vdev_root_zap != 0 &&
 	    spa_feature_is_active(spa, SPA_FEATURE_AVZ_V2)) {
 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 		    vd->vdev_root_zap, tx));
 	}
 
 	if (vd->vdev_top_zap != 0) {
 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 		    vd->vdev_top_zap, tx));
 	}
 
 	if (vd->vdev_leaf_zap != 0) {
 		VERIFY0(zap_add_int(spa->spa_meta_objset, avz,
 		    vd->vdev_leaf_zap, tx));
 	}
 
 	/* Descend into every child vdev. */
 	while (i < vd->vdev_children) {
 		spa_avz_build(vd->vdev_child[i], avz, tx);
 		i++;
 	}
 }
 
 /*
  * Regenerate the pool configuration nvlist and write it to the MOS config
  * object (spa_config_object) as part of tx.
  *
  * Before generating the config, any pending all-vdev-ZAP (AVZ) action in
  * spa_avz_action is carried out: AVZ_ACTION_REBUILD builds a fresh AVZ and
  * destroys per-vdev ZAPs that are no longer referenced, AVZ_ACTION_DESTROY
  * destroys every listed ZAP together with the AVZ itself.  An AVZ is then
  * (re)created if none exists and vdev_construct_zaps() is called on the
  * root vdev.
  *
  * Returns without doing anything when the config dirty list is empty and
  * no AVZ action is pending.  The generated nvlist is kept in
  * spa_config_syncing, replacing (and freeing) any previous value.
  */
 static void
 spa_sync_config_object(spa_t *spa, dmu_tx_t *tx)
 {
 	nvlist_t *config;
 
 	/*
 	 * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS,
 	 * its config may not be dirty but we still need to build per-vdev ZAPs.
 	 * Similarly, if the pool is being assembled (e.g. after a split), we
 	 * need to rebuild the AVZ although the config may not be dirty.
 	 */
 	if (list_is_empty(&spa->spa_config_dirty_list) &&
 	    spa->spa_avz_action == AVZ_ACTION_NONE)
 		return;
 
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 	/* REBUILD and DESTROY both require an existing AVZ. */
 	ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE ||
 	    spa->spa_avz_action == AVZ_ACTION_INITIALIZE ||
 	    spa->spa_all_vdev_zaps != 0);
 
 	if (spa->spa_avz_action == AVZ_ACTION_REBUILD) {
 		/* Make and build the new AVZ */
 		uint64_t new_avz = zap_create(spa->spa_meta_objset,
 		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
 		spa_avz_build(spa->spa_root_vdev, new_avz, tx);
 
 		/* Diff old AVZ with new one */
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t vdzap = za->za_first_integer;
 			if (zap_lookup_int(spa->spa_meta_objset, new_avz,
 			    vdzap) == ENOENT) {
 				/*
 				 * ZAP is listed in old AVZ but not in new one;
 				 * destroy it
 				 */
 				VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap,
 				    tx));
 			}
 		}
 
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 
 		/* Destroy the old AVZ */
 		VERIFY0(zap_destroy(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, tx));
 
 		/* Replace the old AVZ in the dir obj with the new one */
 		VERIFY0(zap_update(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP,
 		    sizeof (new_avz), 1, &new_avz, tx));
 
 		spa->spa_all_vdev_zaps = new_avz;
 	} else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) {
 		zap_cursor_t zc;
 		zap_attribute_t *za = zap_attribute_alloc();
 
 		/* Walk through the AVZ and destroy all listed ZAPs */
 		for (zap_cursor_init(&zc, spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps);
 		    zap_cursor_retrieve(&zc, za) == 0;
 		    zap_cursor_advance(&zc)) {
 			uint64_t zap = za->za_first_integer;
 			VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx));
 		}
 
 		zap_cursor_fini(&zc);
 		zap_attribute_free(za);
 
 		/* Destroy and unlink the AVZ itself */
 		VERIFY0(zap_destroy(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, tx));
 		VERIFY0(zap_remove(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx));
 		spa->spa_all_vdev_zaps = 0;
 	}
 
 	/*
 	 * No AVZ at this point (never created, or just destroyed above):
 	 * create an empty one and link it into the pool directory.
 	 */
 	if (spa->spa_all_vdev_zaps == 0) {
 		spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset,
 		    DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT,
 		    DMU_POOL_VDEV_ZAP_MAP, tx);
 	}
 	/* Whatever action was pending has now been handled. */
 	spa->spa_avz_action = AVZ_ACTION_NONE;
 
 	/* Create ZAPs for vdevs that don't have them. */
 	vdev_construct_zaps(spa->spa_root_vdev, tx);
 
 	config = spa_config_generate(spa, spa->spa_root_vdev,
 	    dmu_tx_get_txg(tx), B_FALSE);
 
 	/*
 	 * If we're upgrading the spa version then make sure that
 	 * the config object gets updated with the correct version.
 	 */
 	if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version)
 		fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION,
 		    spa->spa_uberblock.ub_version);
 
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	/* Drop any previously generated config and remember the new one. */
 	nvlist_free(spa->spa_config_syncing);
 	spa->spa_config_syncing = config;
 
 	spa_sync_nvlist(spa, spa->spa_config_object, config, tx);
 }
 
 /*
  * Store a new pool version in spa_uberblock, dirty the root vdev's config
  * and log the change to the pool history.  arg points to the uint64_t
  * version to set.
  */
 static void
 spa_sync_version(void *arg, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	uint64_t version = *(uint64_t *)arg;
 
 	/*
 	 * Pool creation sets the version through a special-cased path,
 	 * so this must never run in the initial txg.
 	 */
 	ASSERT(tx->tx_txg != TXG_INITIAL);
 
 	/* Only supported versions, and never a downgrade. */
 	ASSERT(SPA_VERSION_IS_SUPPORTED(version));
 	ASSERT(version >= spa_version(spa));
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_history_log_internal(spa, "set", tx, "version=%lld",
 	    (longlong_t)version);
 }
 
 /*
  * Set zpool properties.
  *
  * arg is an nvlist of property name/value pairs.  Each pair is processed
  * in turn while holding spa_props_lock:
  *  - version, altroot, readonly and cachefile are not written here;
  *  - comment and compatibility update the in-core strings on the spa_t
  *    and, outside the initial txg, dirty the vdev config and request an
  *    async config update;
  *  - feature names (as recognized by zpool_prop_feature()) enable the
  *    corresponding feature;
  *  - everything else (regular properties and user properties) is stored
  *    in the pool props ZAP object in the MOS, which is created on first
  *    use; selected numeric properties are also mirrored into spa_t fields.
  * Changes are recorded with spa_history_log_internal().
  */
 static void
 spa_sync_props(void *arg, dmu_tx_t *tx)
 {
 	nvlist_t *nvp = arg;
 	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
 	objset_t *mos = spa->spa_meta_objset;
 	nvpair_t *elem = NULL;
 
 	mutex_enter(&spa->spa_props_lock);
 
 	while ((elem = nvlist_next_nvpair(nvp, elem))) {
 		uint64_t intval;
 		const char *strval, *fname;
 		zpool_prop_t prop;
 		const char *propname;
 		const char *elemname = nvpair_name(elem);
 		zprop_type_t proptype;
 		spa_feature_t fid;
 
 		switch (prop = zpool_name_to_prop(elemname)) {
 		case ZPOOL_PROP_VERSION:
 			intval = fnvpair_value_uint64(elem);
 			/*
 			 * The version is synced separately before other
 			 * properties and should be correct by now.
 			 */
 			ASSERT3U(spa_version(spa), >=, intval);
 			break;
 
 		case ZPOOL_PROP_ALTROOT:
 			/*
 			 * 'altroot' is a non-persistent property. It should
 			 * have been set temporarily at creation or import time.
 			 */
 			ASSERT(spa->spa_root != NULL);
 			break;
 
 		case ZPOOL_PROP_READONLY:
 		case ZPOOL_PROP_CACHEFILE:
 			/*
 			 * 'readonly' and 'cachefile' are also non-persistent
 			 * properties.
 			 */
 			break;
 		case ZPOOL_PROP_COMMENT:
 			strval = fnvpair_value_string(elem);
 			/* Replace the in-core copy of the comment. */
 			if (spa->spa_comment != NULL)
 				spa_strfree(spa->spa_comment);
 			spa->spa_comment = spa_strdup(strval);
 			/*
 			 * We need to dirty the configuration on all the vdevs
 			 * so that their labels get updated.  We also need to
 			 * update the cache file to keep it in sync with the
 			 * MOS version. It's unnecessary to do this for pool
 			 * creation since the vdev's configuration has already
 			 * been dirtied.
 			 */
 			if (tx->tx_txg != TXG_INITIAL) {
 				vdev_config_dirty(spa->spa_root_vdev);
 				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 			}
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=%s", elemname, strval);
 			break;
 		case ZPOOL_PROP_COMPATIBILITY:
 			strval = fnvpair_value_string(elem);
 			/* Replace the in-core copy of the compatibility string. */
 			if (spa->spa_compatibility != NULL)
 				spa_strfree(spa->spa_compatibility);
 			spa->spa_compatibility = spa_strdup(strval);
 			/*
 			 * Dirty the configuration on vdevs as above.
 			 */
 			if (tx->tx_txg != TXG_INITIAL) {
 				vdev_config_dirty(spa->spa_root_vdev);
 				spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 			}
 
 			spa_history_log_internal(spa, "set", tx,
 			    "%s=%s", nvpair_name(elem), strval);
 			break;
 
 		case ZPOOL_PROP_INVAL:
 			/*
 			 * Not a native pool property: either a feature
 			 * name, a user property, or an invalid name.
 			 */
 			if (zpool_prop_feature(elemname)) {
 				/* The feature name follows the '@'. */
 				fname = strchr(elemname, '@') + 1;
 				VERIFY0(zfeature_lookup_name(fname, &fid));
 
 				spa_feature_enable(spa, fid, tx);
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=enabled", elemname);
 				break;
 			} else if (!zfs_prop_user(elemname)) {
 				/*
 				 * Neither a feature nor a user property.
 				 * zpool_prop_feature() was just false for
 				 * this name, so this assertion is expected
 				 * to trip in debug builds; otherwise the
 				 * pair is skipped.
 				 */
 				ASSERT(zpool_prop_feature(elemname));
 				break;
 			}
 			/* User property: store it like any other property. */
 			zfs_fallthrough;
 		default:
 			/*
 			 * Set pool property values in the poolprops mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
 				spa->spa_pool_props_object =
 				    zap_create_link(mos, DMU_OT_POOL_PROPS,
 				    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS,
 				    tx);
 			}
 
 			/* normalize the property name */
 			if (prop == ZPOOL_PROP_INVAL) {
 				propname = elemname;
 				proptype = PROP_TYPE_STRING;
 			} else {
 				propname = zpool_prop_to_name(prop);
 				proptype = zpool_prop_get_type(prop);
 			}
 
 			if (nvpair_type(elem) == DATA_TYPE_STRING) {
 				ASSERT(proptype == PROP_TYPE_STRING);
 				strval = fnvpair_value_string(elem);
 				if (strlen(strval) == 0) {
 					/* remove the property if value == "" */
 					(void) zap_remove(mos,
 					    spa->spa_pool_props_object,
 					    propname, tx);
 				} else {
 					/* Stored including the terminating NUL. */
 					VERIFY0(zap_update(mos,
 					    spa->spa_pool_props_object,
 					    propname, 1, strlen(strval) + 1,
 					    strval, tx));
 				}
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%s", elemname, strval);
 			} else if (nvpair_type(elem) == DATA_TYPE_UINT64) {
 				intval = fnvpair_value_uint64(elem);
 
 				/* Index values must map to a known string. */
 				if (proptype == PROP_TYPE_INDEX) {
 					const char *unused;
 					VERIFY0(zpool_prop_index_to_string(
 					    prop, intval, &unused));
 				}
 				VERIFY0(zap_update(mos,
 				    spa->spa_pool_props_object, propname,
 				    8, 1, &intval, tx));
 				spa_history_log_internal(spa, "set", tx,
 				    "%s=%lld", elemname,
 				    (longlong_t)intval);
 
 				/* Mirror selected values into the spa_t. */
 				switch (prop) {
 				case ZPOOL_PROP_DELEGATION:
 					spa->spa_delegation = intval;
 					break;
 				case ZPOOL_PROP_BOOTFS:
 					spa->spa_bootfs = intval;
 					break;
 				case ZPOOL_PROP_FAILUREMODE:
 					spa->spa_failmode = intval;
 					break;
 				case ZPOOL_PROP_AUTOTRIM:
 					spa->spa_autotrim = intval;
 					spa_async_request(spa,
 					    SPA_ASYNC_AUTOTRIM_RESTART);
 					break;
 				case ZPOOL_PROP_AUTOEXPAND:
 					spa->spa_autoexpand = intval;
 					if (tx->tx_txg != TXG_INITIAL)
 						spa_async_request(spa,
 						    SPA_ASYNC_AUTOEXPAND);
 					break;
 				case ZPOOL_PROP_MULTIHOST:
 					spa->spa_multihost = intval;
 					break;
 				case ZPOOL_PROP_DEDUP_TABLE_QUOTA:
 					spa->spa_dedup_table_quota = intval;
 					break;
 				default:
 					break;
 				}
 			} else {
 				ASSERT(0); /* not allowed */
 			}
 		}
 
 	}
 
 	mutex_exit(&spa->spa_props_lock);
 }
 
 /*
  * Perform one-time upgrade on-disk changes.  spa_version() does not
  * reflect the new version this txg, so there must be no changes this
  * txg to anything that the upgrade code depends on after it executes.
  * Therefore this must be called after dsl_pool_sync() does the sync
  * tasks.
  */
 static void
 spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
 {
 	if (spa_sync_pass(spa) != 1)
 		return;
 
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
 		dsl_pool_create_origin(dp, tx);
 
 		/* Keeping the origin open increases spa_minref */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
 		dsl_pool_upgrade_clones(dp, tx);
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
 		dsl_pool_upgrade_dir_clones(dp, tx);
 
 		/* Keeping the freedir open increases spa_minref */
 		spa->spa_minref += 3;
 	}
 
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		spa_feature_create_zap_objects(spa, tx);
 	}
 
 	/*
 	 * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable
 	 * when possibility to use lz4 compression for metadata was added
 	 * Old pools that have this feature enabled must be upgraded to have
 	 * this feature active
 	 */
 	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
 		boolean_t lz4_en = spa_feature_is_enabled(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 		boolean_t lz4_ac = spa_feature_is_active(spa,
 		    SPA_FEATURE_LZ4_COMPRESS);
 
 		if (lz4_en && !lz4_ac)
 			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
 	}
 
 	/*
 	 * If we haven't written the salt, do so now.  Note that the
 	 * feature may not be activated yet, but that's fine since
 	 * the presence of this ZAP entry is backwards compatible.
 	 */
 	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
 	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
 		VERIFY0(zap_add(spa->spa_meta_objset,
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
 		    sizeof (spa->spa_cksum_salt.zcs_bytes),
 		    spa->spa_cksum_salt.zcs_bytes, tx));
 	}
 
 	rrw_exit(&dp->dp_config_rwlock, FTAG);
 }
 
 /*
  * Consistency checks on a vdev's indirect/obsolete state, run at the
  * start of syncing.  Every check is an ASSERT, so this only has an effect
  * in builds where assertions are enabled.
  */
 static void
 vdev_indirect_state_sync_verify(vdev_t *vd)
 {
 	vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
 	vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;
 
 	/* An indirect vdev must have both a mapping and a births object. */
 	if (vd->vdev_ops == &vdev_indirect_ops) {
 		ASSERT(vim != NULL);
 		ASSERT(vib != NULL);
 	}
 
 	uint64_t obsolete_sm_object = 0;
 	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
 	if (obsolete_sm_object != 0) {
 		/*
 		 * An obsolete space map exists on disk: it must be open,
 		 * belong to a removing or indirect vdev with a non-empty
 		 * mapping, and not account for more space than is mapped.
 		 */
 		ASSERT(vd->vdev_obsolete_sm != NULL);
 		ASSERT(vd->vdev_removing ||
 		    vd->vdev_ops == &vdev_indirect_ops);
 		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
 		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
 		ASSERT3U(obsolete_sm_object, ==,
 		    space_map_object(vd->vdev_obsolete_sm));
 		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
 		    space_map_allocated(vd->vdev_obsolete_sm));
 	}
 	ASSERT(vd->vdev_obsolete_segments != NULL);
 
 	/*
 	 * Since frees / remaps to an indirect vdev can only
 	 * happen in syncing context, the obsolete segments
 	 * tree must be empty when we start syncing.
 	 */
 	ASSERT0(zfs_range_tree_space(vd->vdev_obsolete_segments));
 }
 
 /*
  * Set the top-level vdev's max queue depth. Evaluate each top-level's
  * async write queue depth in case it changed. The max queue depth will
  * not change in the middle of syncing out this txg.
  *
  * This is done by calling metaslab_class_balance() on the pool's normal,
  * special and dedup metaslab classes, in that order.  The pool must be
  * writeable.
  */
 static void
 spa_sync_adjust_vdev_max_queue_depth(spa_t *spa)
 {
 	ASSERT(spa_writeable(spa));
 
 	metaslab_class_balance(spa_normal_class(spa), B_TRUE);
 	metaslab_class_balance(spa_special_class(spa), B_TRUE);
 	metaslab_class_balance(spa_dedup_class(spa), B_TRUE);
 }
 
 /*
  * Walk the top-level vdevs in order, running the indirect-state sanity
  * checks on each, and start condensing the first one for which
  * vdev_indirect_should_condense() says so.  At most one condense is
  * started per call; vdevs after it are not visited.
  */
 static void
 spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx)
 {
 	vdev_t *rvd;
 	int c = 0;
 
 	ASSERT(spa_writeable(spa));
 
 	rvd = spa->spa_root_vdev;
 	while (c < rvd->vdev_children) {
 		vdev_t *vd = rvd->vdev_child[c++];
 
 		vdev_indirect_state_sync_verify(vd);
 
 		if (!vdev_indirect_should_condense(vd))
 			continue;
 
 		spa_condense_indirect_start_sync(vd, tx);
 		break;
 	}
 }
 
 /*
  * Run sync passes for tx's txg until the MOS is no longer dirty in that
  * txg.  Every pass increments spa_sync_pass and syncs the config object,
  * aux devices, error log, DSL pool, frees, BRT, DDT, scans, removal state,
  * upgrades, metaslabs and dirty vdevs, in that order.
  *
  * The loop also ends after pass 1 if that pass turned out to be a no-op
  * (the uberblock's root bp was not born in this txg and the MOS is clean),
  * in which case deferred frees are not synced.
  */
 static void
 spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx)
 {
 	objset_t *mos = spa->spa_meta_objset;
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	uint64_t txg = tx->tx_txg;
 	/* Per-txg list of blocks to free, indexed by txg & TXG_MASK. */
 	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 
 	do {
 		int pass = ++spa->spa_sync_pass;
 
 		spa_sync_config_object(spa, tx);
 		spa_sync_aux_dev(spa, &spa->spa_spares, tx,
 		    ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES);
 		spa_sync_aux_dev(spa, &spa->spa_l2cache, tx,
 		    ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE);
 		spa_errlog_sync(spa, txg);
 		dsl_pool_sync(dp, txg);
 
 		/*
 		 * Early passes (or any pass with log space maps active)
 		 * free blocks immediately; later passes instead move them
 		 * to the deferred bpobj.
 		 */
 		if (pass < zfs_sync_pass_deferred_free ||
 		    spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) {
 			/*
 			 * If the log space map feature is active we don't
 			 * care about deferred frees and the deferred bpobj
 			 * as the log space map should effectively have the
 			 * same results (i.e. appending only to one object).
 			 */
 			spa_sync_frees(spa, free_bpl, tx);
 		} else {
 			/*
 			 * We can not defer frees in pass 1, because
 			 * we sync the deferred frees later in pass 1.
 			 */
 			ASSERT3U(pass, >, 1);
 			bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb,
 			    &spa->spa_deferred_bpobj, tx);
 		}
 
 		brt_sync(spa, txg);
 		ddt_sync(spa, txg);
 		dsl_scan_sync(dp, tx);
 		dsl_errorscrub_sync(dp, tx);
 		svr_sync(spa, tx);
 		spa_sync_upgrades(spa, tx);
 
 		spa_flush_metaslabs(spa, tx);
 
 		/* Sync every vdev that was dirtied in this txg. */
 		vdev_t *vd = NULL;
 		while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg))
 		    != NULL)
 			vdev_sync(vd, txg);
 
 		if (pass == 1) {
 			/*
 			 * dsl_pool_sync() -> dp_sync_tasks may have dirtied
 			 * the config. If that happens, this txg should not
 			 * be a no-op. So we must sync the config to the MOS
 			 * before checking for no-op.
 			 *
 			 * Note that when the config is dirty, it will
 			 * be written to the MOS (i.e. the MOS will be
 			 * dirtied) every time we call spa_sync_config_object()
 			 * in this txg.  Therefore we can't call this after
 			 * dsl_pool_sync() every pass, because it would
 			 * prevent us from converging, since we'd dirty
 			 * the MOS every pass.
 			 *
 			 * Sync tasks can only be processed in pass 1, so
 			 * there's no need to do this in later passes.
 			 */
 			spa_sync_config_object(spa, tx);
 		}
 
 		/*
 		 * Note: We need to check if the MOS is dirty because we could
 		 * have marked the MOS dirty without updating the uberblock
 		 * (e.g. if we have sync tasks but no dirty user data). We need
 		 * to check the uberblock's rootbp because it is updated if we
 		 * have synced out dirty data (though in this case the MOS will
 		 * most likely also be dirty due to second order effects, we
 		 * don't want to rely on that here).
 		 */
 		if (pass == 1 &&
 		    BP_GET_LOGICAL_BIRTH(&spa->spa_uberblock.ub_rootbp) < txg &&
 		    !dmu_objset_is_dirty(mos, txg)) {
 			/*
 			 * Nothing changed on the first pass, therefore this
 			 * TXG is a no-op. Avoid syncing deferred frees, so
 			 * that we can keep this TXG as a no-op.
 			 */
 			ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 			ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 			ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg));
 			ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg));
 			break;
 		}
 
 		/* Only does work in pass 1; see spa_sync_deferred_frees(). */
 		spa_sync_deferred_frees(spa, tx);
 	} while (dmu_objset_is_dirty(mos, txg));
 }
 
 /*
  * Rewrite the vdev configuration (which includes the uberblock) to
  * commit the transaction group.
  *
  * If there are no dirty vdevs, we sync the uberblock to a few random
  * top-level vdevs that are known to be visible in the config cache
  * (see spa_vdev_add() for a complete description). If there *are* dirty
  * vdevs, sync the uberblock to all vdevs.
  *
  * This does not return until vdev_config_sync() succeeds: on failure the
  * pool's I/O is suspended, we wait for it to be resumed, and then retry.
  */
 static void
 spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t txg = tx->tx_txg;
 
 	for (;;) {
 		int error = 0;
 
 		/*
 		 * We hold SCL_STATE to prevent vdev open/close/etc.
 		 * while we're attempting to write the vdev labels.
 		 */
 		spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 
 		if (list_is_empty(&spa->spa_config_dirty_list)) {
 			/*
 			 * Pick up to SPA_SYNC_MIN_VDEVS eligible top-level
 			 * vdevs, starting the scan at a random child.
 			 */
 			vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL };
 			int svdcount = 0;
 			int children = rvd->vdev_children;
 			int c0 = random_in_range(children);
 
 			for (int c = 0; c < children; c++) {
 				vdev_t *vd =
 				    rvd->vdev_child[(c0 + c) % children];
 
 				/* Stop when revisiting the first vdev */
 				if (c > 0 && svd[0] == vd)
 					break;
 
 				/*
 				 * Skip vdevs without a metaslab array, log
 				 * devices, and non-concrete vdevs.
 				 */
 				if (vd->vdev_ms_array == 0 ||
 				    vd->vdev_islog ||
 				    !vdev_is_concrete(vd))
 					continue;
 
 				svd[svdcount++] = vd;
 				if (svdcount == SPA_SYNC_MIN_VDEVS)
 					break;
 			}
 			error = vdev_config_sync(svd, svdcount, txg);
 		} else {
 			error = vdev_config_sync(rvd->vdev_child,
 			    rvd->vdev_children, txg);
 		}
 
 		/* Record the root vdev guid the labels were written with. */
 		if (error == 0)
 			spa->spa_last_synced_guid = rvd->vdev_guid;
 
 		spa_config_exit(spa, SCL_STATE, FTAG);
 
 		if (error == 0)
 			break;
 		/* Label write failed: suspend, wait for resume, retry. */
 		zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR);
 		zio_resume_wait(spa);
 	}
 }
 
 /*
  * Sync the specified transaction group.  New blocks may be dirtied as
  * part of the process, so we iterate until it converges.
  *
  * In outline: wait for this txg's open-context i/os, apply pending BRT
  * updates, take SCL_CONFIG as reader, turn pending vdev state changes
  * into config changes, arm the deadman timer, iterate the sync passes to
  * convergence, rewrite the vdev labels/uberblock, commit the tx, and then
  * perform the post-sync cleanup and bookkeeping.  The pool must be
  * writeable.
  */
 void
 spa_sync(spa_t *spa, uint64_t txg)
 {
 	vdev_t *vd = NULL;
 
 	VERIFY(spa_writeable(spa));
 
 	/*
 	 * Wait for i/os issued in open context that need to complete
 	 * before this txg syncs.
 	 */
 	(void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]);
 	/* Install a fresh root zio in the slot for its next use. */
 	spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL,
 	    ZIO_FLAG_CANFAIL);
 
 	/*
 	 * Now that there can be no more cloning in this transaction group,
 	 * but we are still before issuing frees, we can process pending BRT
 	 * updates.
 	 */
 	brt_pending_apply(spa, txg);
 
 	spa_sync_time_logger(spa, txg);
 
 	/*
 	 * Lock out configuration changes.
 	 */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 
 	spa->spa_syncing_txg = txg;
 	spa->spa_sync_pass = 0;
 
 	/*
 	 * If there are any pending vdev state changes, convert them
 	 * into config changes that go out with this transaction group.
 	 */
 	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
 	while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
 		/* Avoid holding the write lock unless actually necessary */
 		if (vd->vdev_aux == NULL) {
 			vdev_state_clean(vd);
 			vdev_config_dirty(vd);
 			continue;
 		}
 		/*
 		 * We need the write lock here because, for aux vdevs,
 		 * calling vdev_config_dirty() modifies sav_config.
 		 * This is ugly and will become unnecessary when we
 		 * eliminate the aux vdev wart by integrating all vdevs
 		 * into the root vdev tree.
 		 */
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER);
 		while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) {
 			vdev_state_clean(vd);
 			vdev_config_dirty(vd);
 		}
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	}
 	spa_config_exit(spa, SCL_STATE, FTAG);
 
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
 
 	spa->spa_sync_starttime = gethrtime();
 
 	/*
 	 * Re-arm the deadman: cancel any previously dispatched deadman
 	 * task and dispatch a new one to run spa_deadman() once
 	 * spa_deadman_synctime has elapsed from now.
 	 */
-	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
 	spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq,
 	    spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() +
 	    NSEC_TO_TICK(spa->spa_deadman_synctime));
 
 	/*
 	 * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg,
 	 * set spa_deflate if we have no raid-z vdevs.
 	 */
 	if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE &&
 	    spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) {
 		vdev_t *rvd = spa->spa_root_vdev;
 
 		int i;
 		for (i = 0; i < rvd->vdev_children; i++) {
 			vd = rvd->vdev_child[i];
 			if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE)
 				break;
 		}
 		/* The loop ran to completion: every child passed the test. */
 		if (i == rvd->vdev_children) {
 			spa->spa_deflate = TRUE;
 			VERIFY0(zap_add(spa->spa_meta_objset,
 			    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE,
 			    sizeof (uint64_t), 1, &spa->spa_deflate, tx));
 		}
 	}
 
 	spa_sync_adjust_vdev_max_queue_depth(spa);
 
 	spa_sync_condense_indirect(spa, tx);
 
 	spa_sync_iterate_to_convergence(spa, tx);
 
 #ifdef ZFS_DEBUG
 	if (!list_is_empty(&spa->spa_config_dirty_list)) {
 		/*
 		 * Make sure that the number of ZAPs for all the vdevs matches
 		 * the number of ZAPs in the per-vdev ZAP list. This only gets
 		 * called if the config is dirty; otherwise there may be
 		 * outstanding AVZ operations that weren't completed in
 		 * spa_sync_config_object.
 		 */
 		uint64_t all_vdev_zap_entry_count;
 		ASSERT0(zap_count(spa->spa_meta_objset,
 		    spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count));
 		ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==,
 		    all_vdev_zap_entry_count);
 	}
 #endif
 
 	if (spa->spa_vdev_removal != NULL) {
 		ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]);
 	}
 
 	spa_sync_rewrite_vdev_config(spa, tx);
 	dmu_tx_commit(tx);
 
 	/* The txg is on disk: disarm the deadman for this sync. */
-	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid);
+	taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid, B_TRUE);
 	spa->spa_deadman_tqid = 0;
 
 	/*
 	 * Clear the dirty config list.
 	 */
 	while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL)
 		vdev_config_clean(vd);
 
 	/*
 	 * Now that the new config has synced transactionally,
 	 * let it become visible to the config cache.
 	 */
 	if (spa->spa_config_syncing != NULL) {
 		spa_config_set(spa, spa->spa_config_syncing);
 		spa->spa_config_txg = txg;
 		spa->spa_config_syncing = NULL;
 	}
 
 	dsl_pool_sync_done(dp, txg);
 
 	/*
 	 * Update usable space statistics.
 	 */
 	while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)))
 	    != NULL)
 		vdev_sync_done(vd, txg);
 
 	metaslab_class_evict_old(spa->spa_normal_class, txg);
 	metaslab_class_evict_old(spa->spa_log_class, txg);
 	/* Embedded log classes have only one metaslab per vdev. */
 	metaslab_class_evict_old(spa->spa_special_class, txg);
 	metaslab_class_evict_old(spa->spa_dedup_class, txg);
 
 	spa_sync_close_syncing_log_sm(spa);
 
 	spa_update_dspace(spa);
 
 	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
 		vdev_autotrim_kick(spa);
 
 	/*
 	 * It had better be the case that we didn't dirty anything
 	 * since vdev_config_sync().
 	 */
 	ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg));
 	ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg));
 	ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg));
 
 	/* Hold the sync here for as long as zfs_pause_spa_sync is set. */
 	while (zfs_pause_spa_sync)
 		delay(1);
 
 	spa->spa_sync_pass = 0;
 
 	/*
 	 * Update the last synced uberblock here. We want to do this at
 	 * the end of spa_sync() so that consumers of spa_last_synced_txg()
 	 * will be guaranteed that all the processing associated with
 	 * that txg has been completed.
 	 */
 	spa->spa_ubsync = spa->spa_uberblock;
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_handle_ignored_writes(spa);
 
 	/*
 	 * If any async tasks have been requested, kick them off.
 	 */
 	spa_async_dispatch(spa);
 }
 
 /*
  * Wait for a txg sync on every active, writeable, non-suspended pool.
  * The namespace lock is not held across the wait itself: each pool is
  * pinned with a reference, the lock is dropped for the duration of the
  * sync, and then retaken before the reference is released.
  */
 void
 spa_sync_allpools(void)
 {
 	spa_t *spa;
 
 	spa_namespace_enter(FTAG);
 	for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) {
 		if (spa_state(spa) == POOL_STATE_ACTIVE &&
 		    spa_writeable(spa) && !spa_suspended(spa)) {
 			spa_open_ref(spa, FTAG);
 			spa_namespace_exit(FTAG);
 
 			txg_wait_synced(spa_get_dsl(spa), 0);
 
 			spa_namespace_enter(FTAG);
 			spa_close(spa, FTAG);
 		}
 	}
 	spa_namespace_exit(FTAG);
 }
 
 /*
  * Create the pool's sync taskq with one thread per allocator and record
  * every thread in the spa_syncthreads array, initially paired with the
  * allocator whose index equals the thread's position.  The pool must not
  * already have a sync taskq.  Returns the new taskq.
  */
 taskq_t *
 spa_sync_tq_create(spa_t *spa, const char *name)
 {
 	kthread_t **kthreads;
 	int nthreads;
 
 	ASSERT0P(spa->spa_sync_tq);
 	ASSERT3S(spa->spa_alloc_count, <=, boot_ncpus);
 
 	/*
 	 * - the number of allocators never exceeds the number of cpus.
 	 * - the number of cpus may exceed the number of allocators.
 	 * - sync taskq threads never outnumber allocators or cpus.
 	 */
 	nthreads = spa->spa_alloc_count;
 	spa->spa_syncthreads = kmem_zalloc(sizeof (spa_syncthread_info_t) *
 	    nthreads, KM_SLEEP);
 
 	spa->spa_sync_tq = taskq_create_synced(name, nthreads, minclsyspri,
 	    nthreads, INT_MAX, TASKQ_PREPOPULATE, &kthreads);
 	VERIFY(spa->spa_sync_tq != NULL);
 	VERIFY(kthreads != NULL);
 
 	for (int i = 0; i < nthreads; i++) {
 		spa_syncthread_info_t *ti = &spa->spa_syncthreads[i];
 
 		ti->sti_thread = kthreads[i];
 		ti->sti_allocator = i;
 	}
 
 	/* The thread array handed back by the taskq is ours to free. */
 	kmem_free(kthreads, sizeof (*kthreads) * nthreads);
 
 	return (spa->spa_sync_tq);
 }
 
 /*
  * Tear down what spa_sync_tq_create() set up: wait on and destroy the
  * sync taskq, then free the spa_syncthreads array.  Both pointers are
  * cleared afterwards so the spa_t is not left holding a reference to
  * freed memory.  The pool must currently have a sync taskq.
  */
 void
 spa_sync_tq_destroy(spa_t *spa)
 {
 	ASSERT(spa->spa_sync_tq != NULL);
 
 	taskq_wait(spa->spa_sync_tq);
 	taskq_destroy(spa->spa_sync_tq);
 	kmem_free(spa->spa_syncthreads,
 	    sizeof (spa_syncthread_info_t) * spa->spa_alloc_count);
 	/* Do not leave a dangling pointer to the freed array behind. */
 	spa->spa_syncthreads = NULL;
 	spa->spa_sync_tq = NULL;
 }
 
 /*
  * Claim an unused allocator for the calling sync thread and return its
  * index.  With a single allocator this returns 0 without any bookkeeping.
  * Otherwise, under sau_lock, the slots are scanned round-robin starting
  * just after sau_rotor until one is found that is not in use; it is
  * marked in use and becomes the new rotor.  The choice is then recorded
  * in the calling thread's spa_syncthreads entry.
  */
 uint_t
 spa_acq_allocator(spa_t *spa)
 {
 	uint_t r;
 	int i;
 
 	if (spa->spa_alloc_count == 1)
 		return (0);
 
 	mutex_enter(&spa->spa_allocs_use->sau_lock);
 	r = spa->spa_allocs_use->sau_rotor;
 	for (;;) {
 		/* Step to the next slot, wrapping around at the end. */
 		if (++r == spa->spa_alloc_count)
 			r = 0;
 		if (!spa->spa_allocs_use->sau_inuse[r])
 			break;
 	}
 	spa->spa_allocs_use->sau_inuse[r] = B_TRUE;
 	spa->spa_allocs_use->sau_rotor = r;
 	mutex_exit(&spa->spa_allocs_use->sau_lock);
 
 	/* Find the caller's slot and remember which allocator it owns. */
 	for (i = 0; i < spa->spa_alloc_count; i++) {
 		if (spa->spa_syncthreads[i].sti_thread != curthread)
 			continue;
 		spa->spa_syncthreads[i].sti_allocator = r;
 		break;
 	}
 	ASSERT3S(i, <, spa->spa_alloc_count);
 
 	return (r);
 }
 
 /*
  * Mark the given allocator as no longer in use.  Nothing is tracked (and
  * nothing is done here) when the pool has a single allocator.
  */
 void
 spa_rel_allocator(spa_t *spa, uint_t allocator)
 {
 	if (spa->spa_alloc_count <= 1)
 		return;
 
 	spa->spa_allocs_use->sau_inuse[allocator] = B_FALSE;
 }
 
 /*
  * Pick the allocator for a write zio and store it in zio->io_allocator.
  * A zio that already has an allocator is left alone.  If the caller is
  * one of the pool's sync taskq threads, the allocator recorded for that
  * thread is used; otherwise one is derived from a hash of the zio's
  * bookmark.
  */
 void
 spa_select_allocator(zio_t *zio)
 {
 	spa_t *spa = zio->io_spa;
 	zbookmark_phys_t *bm = &zio->io_bookmark;
 
 	ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 
 	/*
 	 * The zio may already carry an allocator, for instance a gang
 	 * block that inherited its parent's; then we are done.
 	 */
 	if (ZIO_HAS_ALLOCATOR(zio))
 		return;
 
 	ASSERT(spa != NULL);
 	ASSERT(bm != NULL);
 
 	/*
 	 * Prefer the allocator assigned to the current syncthread, which
 	 * also determines the write issue taskq for that allocator.
 	 * Note that this requires the pool to be open.
 	 */
 	if (spa->spa_sync_tq != NULL) {
 		for (int i = 0; i < spa->spa_alloc_count; i++) {
 			spa_syncthread_info_t *ti = &spa->spa_syncthreads[i];
 
 			if (ti->sti_thread != curthread)
 				continue;
 
 			zio->io_allocator = ti->sti_allocator;
 			return;
 		}
 	}
 
 	/*
 	 * Spreading writes over as many allocators as possible helps
 	 * performance, yet logically adjacent IOs should also end up
 	 * physically adjacent for the sake of sequential reads.  To get
 	 * both, every object is chunked into regions of 2^20 blocks and
 	 * the hash covers the objset, object, level and region.
 	 */
 	uint64_t hv = cityhash4(bm->zb_objset, bm->zb_object, bm->zb_level,
 	    bm->zb_blkid >> 20);
 
 	zio->io_allocator = (uint_t)hv % spa->spa_alloc_count;
 }
 
 /*
  * ==========================================================================
  * Miscellaneous routines
  * ==========================================================================
  */
 
 /*
  * Remove every pool in the system.
  */
 void
 spa_evict_all(void)
 {
 	spa_t *spa;
 
 	/*
 	 * Drop all cached state.  Every pool should already be closed, so
 	 * each spa in the AVL tree should be unreferenced.
 	 */
 	spa_namespace_enter(FTAG);
 	for (;;) {
 		spa = spa_next(NULL);
 		if (spa == NULL)
 			break;
 
 		/*
 		 * Stop async tasks.  The async thread may have to detach a
 		 * replaced device, which takes spa_namespace_lock, so the
 		 * lock is dropped around the suspend; the temporary
 		 * reference is held across that window.
 		 */
 		spa_open_ref(spa, FTAG);
 		spa_namespace_exit(FTAG);
 		spa_async_suspend(spa);
 		spa_namespace_enter(FTAG);
 		spa_close(spa, FTAG);
 
 		if (spa->spa_state != POOL_STATE_UNINITIALIZED) {
 			spa_unload(spa);
 			spa_deactivate(spa);
 		}
 		spa_remove(spa);
 	}
 	spa_namespace_exit(FTAG);
 }
 
 /*
  * Look up a vdev by guid.  The tree under spa_root_vdev is searched first;
  * when 'aux' is set, the l2cache devices and then the spares are searched
  * as well.  Returns NULL when nothing matches.
  */
 vdev_t *
 spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux)
 {
 	vdev_t *vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid);
 
 	/* Done if found, or if the aux devices are not to be searched. */
 	if (vd != NULL || !aux)
 		return (vd);
 
 	spa_aux_vdev_t *l2cache = &spa->spa_l2cache;
 	for (int c = 0; c < l2cache->sav_count; c++) {
 		if (l2cache->sav_vdevs[c]->vdev_guid == guid)
 			return (l2cache->sav_vdevs[c]);
 	}
 
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 	for (int s = 0; s < spares->sav_count; s++) {
 		if (spares->sav_vdevs[s]->vdev_guid == guid)
 			return (spares->sav_vdevs[s]);
 	}
 
 	return (NULL);
 }
 
 /*
  * Raise the pool's version to 'version'.
  *
  * The new value is stored in the in-core uberblock while all config locks
  * are held as writer, the root vdev's config is marked dirty, and the
  * function calls txg_wait_synced() before returning.  The pool must be
  * writeable and 'version' must not be lower than the current uberblock
  * version; both are asserted.
  */
 void
 spa_upgrade(spa_t *spa, uint64_t version)
 {
 	ASSERT(spa_writeable(spa));
 
 	spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER);
 
 	/*
 	 * This should only be called for a non-faulted pool, and since a
 	 * future version would result in an unopenable pool, this shouldn't be
 	 * possible.
 	 */
 	ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version));
 	ASSERT3U(version, >=, spa->spa_uberblock.ub_version);
 
 	spa->spa_uberblock.ub_version = version;
 	vdev_config_dirty(spa->spa_root_vdev);
 
 	spa_config_exit(spa, SCL_ALL, FTAG);
 
 	/* Called only after the config lock has been released. */
 	txg_wait_synced(spa_get_dsl(spa), 0);
 }
 
 /*
  * Return B_TRUE if 'guid' names a device in the aux vdev set 'sav', either
  * among its current vdevs or among its pending configs.
  */
 static boolean_t
 spa_has_aux_vdev(spa_t *spa, uint64_t guid, spa_aux_vdev_t *sav)
 {
 	(void) spa;
 
 	/* Devices already present in the set. */
 	for (int c = 0; c < sav->sav_count; c++) {
 		if (sav->sav_vdevs[c]->vdev_guid == guid)
 			return (B_TRUE);
 	}
 
 	/* Pending entries, identified by the guid in their nvlist. */
 	for (int p = 0; p < sav->sav_npending; p++) {
 		uint64_t pending_guid;
 
 		if (nvlist_lookup_uint64(sav->sav_pending[p],
 		    ZPOOL_CONFIG_GUID, &pending_guid) != 0)
 			continue;
 		if (pending_guid == guid)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /* Return B_TRUE if 'guid' is one of the pool's l2cache devices. */
 boolean_t
 spa_has_l2cache(spa_t *spa, uint64_t guid)
 {
 	spa_aux_vdev_t *l2cache = &spa->spa_l2cache;
 
 	return (spa_has_aux_vdev(spa, guid, l2cache));
 }
 
 /* Return B_TRUE if 'guid' is one of the pool's spare devices. */
 boolean_t
 spa_has_spare(spa_t *spa, uint64_t guid)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 
 	return (spa_has_aux_vdev(spa, guid, spares));
 }
 
 /*
  * Report whether the pool has an active spare that is also shared.
  * An active spare holds two references (as a spare and as a replace), so a
  * spare registered to this pool with a count above 2 qualifies.
  */
 static boolean_t
 spa_has_active_shared_spare(spa_t *spa)
 {
 	spa_aux_vdev_t *spares = &spa->spa_spares;
 
 	for (int s = 0; s < spares->sav_count; s++) {
 		uint64_t pool;
 		int refcnt;
 
 		if (!spa_spare_exists(spares->sav_vdevs[s]->vdev_guid, &pool,
 		    &refcnt))
 			continue;
 
 		if (pool != 0ULL && pool == spa_guid(spa) && refcnt > 2)
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * Sum vdev_ms_count over the root vdev's children, skipping any child that
  * is not concrete.
  */
 uint64_t
 spa_total_metaslabs(spa_t *spa)
 {
 	vdev_t *rvd = spa->spa_root_vdev;
 	uint64_t total = 0;
 
 	for (uint64_t c = 0; c < rvd->vdev_children; c++) {
 		vdev_t *cvd = rvd->vdev_child[c];
 
 		if (vdev_is_concrete(cvd))
 			total += cvd->vdev_ms_count;
 	}
 
 	return (total);
 }
 
 /*
  * Notify any waiting threads that some activity has switched from being in-
  * progress to not-in-progress so that the thread can wake up and determine
  * whether it is finished waiting.
  *
  * The wakeup is a broadcast on spa_activities_cv; each woken waiter
  * re-evaluates its own activity rather than being told which one changed.
  */
 void
 spa_notify_waiters(spa_t *spa)
 {
 	/*
 	 * Acquiring spa_activities_lock here prevents the cv_broadcast from
 	 * happening between the waiting thread's check and cv_wait.
 	 */
 	mutex_enter(&spa->spa_activities_lock);
 	cv_broadcast(&spa->spa_activities_cv);
 	mutex_exit(&spa->spa_activities_lock);
 }
 
 /*
  * Notify any waiting threads that the pool is exporting, and then block until
  * they are finished using the spa_t.
  *
  * spa_waiters_cancel is raised for the duration of the call and cleared
  * again once spa_waiters has dropped to zero.
  */
 void
 spa_wake_waiters(spa_t *spa)
 {
 	mutex_enter(&spa->spa_activities_lock);
 	/* Tell waiters to give up, then wake them so they notice. */
 	spa->spa_waiters_cancel = B_TRUE;
 	cv_broadcast(&spa->spa_activities_cv);
 	/* Each departing waiter signals spa_waiters_cv; wait for the last. */
 	while (spa->spa_waiters != 0)
 		cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock);
 	spa->spa_waiters_cancel = B_FALSE;
 	mutex_exit(&spa->spa_activities_lock);
 }
 
 /*
  * Whether the vdev or any of its descendants are being initialized/trimmed.
  *
  * 'activity' must be ZPOOL_WAIT_INITIALIZE or ZPOOL_WAIT_TRIM.  The caller
  * holds the config lock (SCL_CONFIG | SCL_STATE) as reader and
  * spa_activities_lock; the latter is dropped and re-taken while each vdev is
  * examined, and is held again on return.
  */
 static boolean_t
 spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity)
 {
 	spa_t *spa = vd->vdev_spa;
 
 	ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER));
 	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
 	ASSERT(activity == ZPOOL_WAIT_INITIALIZE ||
 	    activity == ZPOOL_WAIT_TRIM);
 
 	/* The per-vdev lock that guards the state being inspected. */
 	kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ?
 	    &vd->vdev_initialize_lock : &vd->vdev_trim_lock;
 
 	/*
 	 * Release spa_activities_lock so that the vdev's lock is acquired
 	 * first, then take spa_activities_lock again before reading state.
 	 */
 	mutex_exit(&spa->spa_activities_lock);
 	mutex_enter(lock);
 	mutex_enter(&spa->spa_activities_lock);
 
 	boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ?
 	    (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) :
 	    (vd->vdev_trim_state == VDEV_TRIM_ACTIVE);
 	mutex_exit(lock);
 
 	if (in_progress)
 		return (B_TRUE);
 
 	/* Not active here; any active descendant still counts. */
 	for (int i = 0; i < vd->vdev_children; i++) {
 		if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i],
 		    activity))
 			return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 /*
  * If use_guid is true, this checks whether the vdev specified by guid is
  * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool
  * is being initialized/trimmed.
  *
  * The caller must hold spa_activities_lock.  It is dropped briefly so that
  * the config lock (SCL_CONFIG | SCL_STATE, as reader) can be taken first;
  * the config lock is acquired and released by this function, and
  * spa_activities_lock is held again on return.
  *
  * Returns EINVAL when use_guid is set and the guid does not resolve to a leaf
  * vdev (*in_progress is left untouched); otherwise stores the result in
  * *in_progress and returns 0.
  */
 static int
 spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid,
     zpool_wait_activity_t activity, boolean_t *in_progress)
 {
 	mutex_exit(&spa->spa_activities_lock);
 	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 	mutex_enter(&spa->spa_activities_lock);
 
 	vdev_t *vd;
 	if (use_guid) {
 		/* Aux devices (l2cache, spares) are not searched. */
 		vd = spa_lookup_by_guid(spa, guid, B_FALSE);
 		if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) {
 			spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 			return (EINVAL);
 		}
 	} else {
 		vd = spa->spa_root_vdev;
 	}
 
 	*in_progress = spa_vdev_activity_in_progress_impl(vd, activity);
 
 	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 	return (0);
 }
 
 /*
  * Locking for waiting threads
  * ---------------------------
  *
  * Waiting threads need a way to check whether a given activity is in progress,
  * and then, if it is, wait for it to complete. Each activity will have some
  * in-memory representation of the relevant on-disk state which can be used to
  * determine whether or not the activity is in progress. The in-memory state and
  * the locking used to protect it will be different for each activity, and may
  * not be suitable for use with a cvar (e.g., some state is protected by the
  * config lock). To allow waiting threads to wait without any races, another
  * lock, spa_activities_lock, is used.
  *
  * When the state is checked, both the activity-specific lock (if there is one)
  * and spa_activities_lock are held. In some cases, the activity-specific lock
  * is acquired explicitly (e.g. the config lock). In others, the locking is
  * internal to some check (e.g. bpobj_is_empty). After checking, the waiting
  * thread releases the activity-specific lock and, if the activity is in
  * progress, then cv_waits using spa_activities_lock.
  *
  * The waiting thread is woken when another thread, one completing some
  * activity, updates the state of the activity and then calls
  * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only
  * needs to hold its activity-specific lock when updating the state, and this
  * lock can (but doesn't have to) be dropped before calling spa_notify_waiters.
  *
  * Because spa_notify_waiters acquires spa_activities_lock before broadcasting,
  * and because it is held when the waiting thread checks the state of the
  * activity, it can never be the case that the completing thread both updates
  * the activity state and cv_broadcasts in between the waiting thread's check
  * and cv_wait. Thus, a waiting thread can never miss a wakeup.
  *
  * In order to prevent deadlock, when the waiting thread does its check, in some
  * cases it will temporarily drop spa_activities_lock in order to acquire the
  * activity-specific lock. The order in which spa_activities_lock and the
  * activity specific lock are acquired in the waiting thread is determined by
  * the order in which they are acquired in the completing thread; if the
  * completing thread calls spa_notify_waiters with the activity-specific lock
  * held, then the waiting thread must also acquire the activity-specific lock
  * first.
  */
 
 /*
  * Determine whether 'activity' is currently in progress on the pool and
  * store the answer in *in_progress.
  *
  * use_tag and tag are only consulted for ZPOOL_WAIT_INITIALIZE and
  * ZPOOL_WAIT_TRIM, where they are passed on as the vdev guid selector.
  *
  * Must be called with spa_activities_lock held.  Some cases drop and re-take
  * it in order to acquire the config lock first; it is held again on return.
  *
  * Returns 0, or the error from spa_vdev_activity_in_progress().  Panics on an
  * unrecognized activity value.
  */
 static int
 spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity,
     boolean_t use_tag, uint64_t tag, boolean_t *in_progress)
 {
 	int error = 0;
 
 	ASSERT(MUTEX_HELD(&spa->spa_activities_lock));
 
 	switch (activity) {
 	case ZPOOL_WAIT_CKPT_DISCARD:
 		/*
 		 * Feature active, but the checkpoint entry is absent from
 		 * the pool directory object.
 		 */
 		*in_progress =
 		    (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) &&
 		    zap_contains(spa_meta_objset(spa),
 		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) ==
 		    ENOENT);
 		break;
 	case ZPOOL_WAIT_FREE:
 		/* Any one of the three conditions counts as freeing. */
 		*in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS &&
 		    !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) ||
 		    spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) ||
 		    spa_livelist_delete_check(spa));
 		break;
 	case ZPOOL_WAIT_INITIALIZE:
 	case ZPOOL_WAIT_TRIM:
 		error = spa_vdev_activity_in_progress(spa, use_tag, tag,
 		    activity, in_progress);
 		break;
 	case ZPOOL_WAIT_REPLACE:
 		/* Take the config lock ahead of spa_activities_lock. */
 		mutex_exit(&spa->spa_activities_lock);
 		spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
 		mutex_enter(&spa->spa_activities_lock);
 
 		*in_progress = vdev_replace_in_progress(spa->spa_root_vdev);
 		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
 		break;
 	case ZPOOL_WAIT_REMOVE:
 		*in_progress = (spa->spa_removing_phys.sr_state ==
 		    DSS_SCANNING);
 		break;
 	case ZPOOL_WAIT_RESILVER:
 		/*
 		 * An active rebuild counts as a resilver; otherwise fall
 		 * through to the scan-based check shared with scrub.
 		 */
 		*in_progress = vdev_rebuild_active(spa->spa_root_vdev);
 		if (*in_progress)
 			break;
 		zfs_fallthrough;
 	case ZPOOL_WAIT_SCRUB:
 	{
 		boolean_t scanning, paused, is_scrub;
 		dsl_scan_t *scn =  spa->spa_dsl_pool->dp_scan;
 
 		is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB);
 		scanning = (scn->scn_phys.scn_state == DSS_SCANNING);
 		paused = dsl_scan_is_paused_scrub(scn);
 		/*
 		 * A running, unpaused scan matches only when its kind
 		 * (scrub or not) agrees with the activity being asked about.
 		 */
 		*in_progress = (scanning && !paused &&
 		    is_scrub == (activity == ZPOOL_WAIT_SCRUB));
 		break;
 	}
 	case ZPOOL_WAIT_RAIDZ_EXPAND:
 	{
 		vdev_raidz_expand_t *vre = spa->spa_raidz_expand;
 		*in_progress = (vre != NULL && vre->vre_state == DSS_SCANNING);
 		break;
 	}
 	default:
 		panic("unrecognized value for activity %d", activity);
 	}
 
 	return (error);
 }
 
 /*
  * Block until 'activity' is no longer in progress on the named pool.
  *
  * pool      name passed to spa_open()
  * activity  which activity to wait for
  * use_tag   whether 'tag' selects a particular instance of the activity
  * tag       instance selector (only valid for initialize and trim)
  * waited    set to B_TRUE if the thread actually had to wait, B_FALSE if
  *           not; not written when the function fails before the pool is
  *           opened
  *
  * Returns 0 on success (including when the wait is cancelled through
  * spa_waiters_cancel), EINVAL for a bad activity/tag combination, EINTR if
  * cv_wait_sig() reports an interruption, or an error from spa_open() or the
  * in-progress check.
  */
 static int
 spa_wait_common(const char *pool, zpool_wait_activity_t activity,
     boolean_t use_tag, uint64_t tag, boolean_t *waited)
 {
 	/*
 	 * The tag is used to distinguish between instances of an activity.
 	 * 'initialize' and 'trim' are the only activities that we use this for.
 	 * The other activities can only have a single instance in progress in a
 	 * pool at one time, making the tag unnecessary.
 	 *
 	 * There can be multiple devices being replaced at once, but since they
 	 * all finish once resilvering finishes, we don't bother keeping track
 	 * of them individually, we just wait for them all to finish.
 	 */
 	if (use_tag && activity != ZPOOL_WAIT_INITIALIZE &&
 	    activity != ZPOOL_WAIT_TRIM)
 		return (EINVAL);
 
 	if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES)
 		return (EINVAL);
 
 	spa_t *spa;
 	int error = spa_open(pool, &spa, FTAG);
 	if (error != 0)
 		return (error);
 
 	/*
 	 * Increment the spa's waiter count so that we can call spa_close and
 	 * still ensure that the spa_t doesn't get freed before this thread is
 	 * finished with it when the pool is exported. We want to call spa_close
 	 * before we start waiting because otherwise the additional ref would
 	 * prevent the pool from being exported or destroyed throughout the
 	 * potentially long wait.
 	 */
 	mutex_enter(&spa->spa_activities_lock);
 	spa->spa_waiters++;
 	spa_close(spa, FTAG);
 
 	*waited = B_FALSE;
 	for (;;) {
 		boolean_t in_progress;
 		error = spa_activity_in_progress(spa, activity, use_tag, tag,
 		    &in_progress);
 
 		/* Stop on error, on completion, or when told to give up. */
 		if (error || !in_progress || spa->spa_waiters_cancel)
 			break;
 
 		*waited = B_TRUE;
 
 		/* A zero return from cv_wait_sig() is reported as EINTR. */
 		if (cv_wait_sig(&spa->spa_activities_cv,
 		    &spa->spa_activities_lock) == 0) {
 			error = EINTR;
 			break;
 		}
 	}
 
 	/* Let spa_wake_waiters() know this waiter is done with the spa_t. */
 	spa->spa_waiters--;
 	cv_signal(&spa->spa_waiters_cv);
 	mutex_exit(&spa->spa_activities_lock);
 
 	return (error);
 }
 
 /*
  * Wait for one particular instance of the specified activity to complete;
  * 'tag' identifies the instance.
  */
 int
 spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag,
     boolean_t *waited)
 {
 	int error = spa_wait_common(pool, activity, B_TRUE, tag, waited);
 
 	return (error);
 }
 
 /*
  * Wait for all instances of the specified activity to complete.
  */
 int
 spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited)
 {
 	int error = spa_wait_common(pool, activity, B_FALSE, 0, waited);
 
 	return (error);
 }
 
 /*
  * Build a sysevent for 'name'.  In the kernel this wraps the nvlist returned
  * by zfs_event_create(); NULL is returned if that yields nothing, and always
  * outside the kernel.
  */
 sysevent_t *
 spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 {
 	sysevent_t *ev = NULL;
 #ifdef _KERNEL
 	nvlist_t *resource =
 	    zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl);
 
 	if (resource != NULL) {
 		ev = kmem_alloc(sizeof (*ev), KM_SLEEP);
 		ev->resource = resource;
 	}
 #else
 	(void) spa;
 	(void) vd;
 	(void) hist_nvl;
 	(void) name;
 #endif
 	return (ev);
 }
 
 /*
  * Post a sysevent made by spa_event_create() and free the wrapper.  A NULL
  * event is ignored; outside the kernel this does nothing.
  */
 void
 spa_event_post(sysevent_t *ev)
 {
 #ifdef _KERNEL
 	if (ev == NULL)
 		return;
 
 	zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb);
 	kmem_free(ev, sizeof (*ev));
 #else
 	(void) ev;
 #endif
 }
 
 /*
  * Post the zevent that corresponds to the given sysevent.  'name' must be
  * one of the event definitions in sys/sysevent/eventdefs.h; the payload is
  * filled in from the spa and, optionally, the vdev.  In the userland
  * libzpool this is a no-op, so that consumers don't mistake ztest or zdb
  * activity for real changes.
  */
 void
 spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name)
 {
 	sysevent_t *ev = spa_event_create(spa, vd, hist_nvl, name);
 
 	spa_event_post(ev);
 }
 
 /* state manipulation functions */
 EXPORT_SYMBOL(spa_open);
 EXPORT_SYMBOL(spa_open_rewind);
 EXPORT_SYMBOL(spa_get_stats);
 EXPORT_SYMBOL(spa_create);
 EXPORT_SYMBOL(spa_import);
 EXPORT_SYMBOL(spa_tryimport);
 EXPORT_SYMBOL(spa_destroy);
 EXPORT_SYMBOL(spa_export);
 EXPORT_SYMBOL(spa_reset);
 EXPORT_SYMBOL(spa_async_request);
 EXPORT_SYMBOL(spa_async_suspend);
 EXPORT_SYMBOL(spa_async_resume);
 EXPORT_SYMBOL(spa_inject_addref);
 EXPORT_SYMBOL(spa_inject_delref);
 EXPORT_SYMBOL(spa_scan_stat_init);
 EXPORT_SYMBOL(spa_scan_get_stats);
 
 /* device manipulation */
 EXPORT_SYMBOL(spa_vdev_add);
 EXPORT_SYMBOL(spa_vdev_attach);
 EXPORT_SYMBOL(spa_vdev_detach);
 EXPORT_SYMBOL(spa_vdev_setpath);
 EXPORT_SYMBOL(spa_vdev_setfru);
 EXPORT_SYMBOL(spa_vdev_split_mirror);
 
 /* spare state (which is global across all pools) */
 EXPORT_SYMBOL(spa_spare_add);
 EXPORT_SYMBOL(spa_spare_remove);
 EXPORT_SYMBOL(spa_spare_exists);
 EXPORT_SYMBOL(spa_spare_activate);
 
 /* L2ARC state (which is global across all pools) */
 EXPORT_SYMBOL(spa_l2cache_add);
 EXPORT_SYMBOL(spa_l2cache_remove);
 EXPORT_SYMBOL(spa_l2cache_exists);
 EXPORT_SYMBOL(spa_l2cache_activate);
 EXPORT_SYMBOL(spa_l2cache_drop);
 
 /* scanning */
 EXPORT_SYMBOL(spa_scan);
 EXPORT_SYMBOL(spa_scan_range);
 EXPORT_SYMBOL(spa_scan_stop);
 
 /* spa syncing */
 EXPORT_SYMBOL(spa_sync); /* only for DMU use */
 EXPORT_SYMBOL(spa_sync_allpools);
 
 /* properties */
 EXPORT_SYMBOL(spa_prop_set);
 EXPORT_SYMBOL(spa_prop_get);
 EXPORT_SYMBOL(spa_prop_clear_bootfs);
 
 /* asynchronous event notification */
 EXPORT_SYMBOL(spa_event_notify);
 
 /*
  * Module parameters.  Every entry below is declared ZMOD_RW and ends with
  * its description string.
  * NOTE(review): the leading macro arguments look like scope, variable-name
  * prefix, parameter name and type -- confirm against the ZFS_MODULE_PARAM
  * definition.
  */
 ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_pct, UINT, ZMOD_RW,
 	"Percentage of CPUs to run a metaslab preload taskq");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, UINT, ZMOD_RW,
 	"log2 fraction of arc that can be used by inflight I/Os when "
 	"verifying pool during import");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW,
 	"Set to traverse metadata on pool import");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW,
 	"Set to traverse data on pool import");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW,
 	"Print vdev tree to zfs_dbgmsg during pool import");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RW,
 	"Percentage of CPUs to run an IO worker thread");
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_tpq, UINT, ZMOD_RW,
 	"Number of threads per IO worker taskqueue");
 
 ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, U64, ZMOD_RW,
 	"Allow importing pool with up to this number of missing top-level "
 	"vdevs (in read-only mode)");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT,
 	ZMOD_RW, "Set the livelist condense zthr to pause");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT,
 	ZMOD_RW, "Set the livelist condense synctask to pause");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel,
 	INT, ZMOD_RW,
 	"Whether livelist condensing was canceled in the synctask");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel,
 	INT, ZMOD_RW,
 	"Whether livelist condensing was canceled in the zthr function");
 
 ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT,
 	ZMOD_RW,
 	"Whether extra ALLOC blkptrs were added to a livelist entry while it "
 	"was being condensed");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, note_txg_time, UINT, ZMOD_RW,
 	"How frequently TXG timestamps are stored internally (in seconds)");
 
 ZFS_MODULE_PARAM(zfs_spa, spa_, flush_txg_time, UINT, ZMOD_RW,
 	"How frequently the TXG timestamps database should be flushed "
 	"to disk (in seconds)");
 
 /*
  * Kernel-only parameters that go through explicit set/get callbacks
  * (spa_taskq_*_param_set / spa_taskq_*_param_get) rather than naming a
  * variable directly.
  */
 #ifdef _KERNEL
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read,
 	spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RW,
 	"Configure IO queues for read IO");
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write,
 	spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RW,
 	"Configure IO queues for write IO");
 ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_free,
 	spa_taskq_free_param_set, spa_taskq_free_param_get, ZMOD_RW,
 	"Configure IO queues for free IO");
 #endif
 
 ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_write_tpq, UINT, ZMOD_RW,
 	"Number of CPUs per write issue taskq");
diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c
index 4a0d41c24eed..eb18296ec3f2 100644
--- a/module/zfs/zfs_fm.c
+++ b/module/zfs/zfs_fm.c
@@ -1,1617 +1,1618 @@
 // SPDX-License-Identifier: CDDL-1.0
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or https://opensource.org/licenses/CDDL-1.0.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012,2021 by Delphix. All rights reserved.
  */
 
 #include <sys/spa.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev.h>
 #include <sys/vdev_impl.h>
 #include <sys/zio.h>
 #include <sys/zio_checksum.h>
 
 #include <sys/fm/fs/zfs.h>
 #include <sys/fm/protocol.h>
 #include <sys/fm/util.h>
 #include <sys/sysevent.h>
 
 /*
  * This general routine is responsible for generating all the different ZFS
  * ereports.  The payload is dependent on the class, and which arguments are
  * supplied to the function:
  *
  * 	EREPORT			POOL	VDEV	IO
  * 	block			X	X	X
  * 	data			X		X
  * 	device			X	X
  * 	pool			X
  *
  * If we are in a loading state, all errors are chained together by the same
  * SPA-wide ENA (Error Numeric Association).
  *
  * For isolated I/O requests, we get the ENA from the zio_t. The propagation
  * gets very complicated due to RAID-Z, gang blocks, and vdev caching.  We want
  * to chain together all ereports associated with a logical piece of data.  For
  * read I/Os, there  are basically three 'types' of I/O, which form a roughly
  * layered diagram:
  *
  * 	+---------------+
  * 	| Aggregate I/O |	No associated logical data or device
  * 	+---------------+
  *              |
  *              V
  * 	+---------------+	Reads associated with a piece of logical data.
  * 	|   Read I/O    |	This includes reads on behalf of RAID-Z,
  * 	+---------------+       mirrors, gang blocks, retries, etc.
  *              |
  *              V
  * 	+---------------+	Reads associated with a particular device, but
  * 	| Physical I/O  |	no logical data.  Issued as part of vdev caching
  * 	+---------------+	and I/O aggregation.
  *
  * Note that 'physical I/O' here is not the same terminology as used in the rest
  * of ZIO.  Typically, 'physical I/O' simply means that there is no attached
  * blockpointer.  But I/O with no associated block pointer can still be related
  * to a logical piece of data (i.e. RAID-Z requests).
  *
  * Purely physical I/O always have unique ENAs.  They are not related to a
  * particular piece of logical data, and therefore cannot be chained together.
  * We still generate an ereport, but the DE doesn't correlate it with any
  * logical piece of data.  When such an I/O fails, the delegated I/O requests
  * will issue a retry, which will trigger the 'real' ereport with the correct
  * ENA.
  *
  * We keep track of the ENA for a ZIO chain through the 'io_logical' member.
  * When a new logical I/O is issued, we set this to point to itself.  Child I/Os
  * then inherit this pointer, so that when it is first set subsequent failures
  * will use the same ENA.  For vdev cache fill and queue aggregation I/O,
  * this pointer is set to NULL, and no ereport will be generated (since it
  * doesn't actually correspond to any particular device or piece of data,
  * and the caller will always retry without caching or queueing anyway).
  *
  * For checksum errors, we want to include more information about the actual
  * error which occurs.  Accordingly, we build an ereport when the error is
  * noticed, but instead of sending it in immediately, we hang it off of the
  * io_cksum_report field of the logical IO.  When the logical IO completes
  * (successfully or not), zfs_ereport_finish_checksum() is called with the
  * good and bad versions of the buffer (if available), and we annotate the
  * ereport with information about the differences.
  */
 
 #ifdef _KERNEL
 /*
  * Duplicate ereport Detection
  *
  * Some ereports are retained momentarily for detecting duplicates.  These
  * are kept in a recent_events_node_t in both a time-ordered list and an AVL
  * tree of recent unique ereports.
  *
  * The lifespan of these recent ereports is bounded (15 mins) and a cleaner
  * task is used to purge stale entries.
  */
 /* Recent ereports in time order: newest at the head, oldest at the tail. */
 static list_t recent_events_list;
 /* The same nodes, keyed by recent_events_compare() for duplicate lookup. */
 static avl_tree_t recent_events_tree;
 /* Held around every access to the list, the tree and the cleaner id. */
 static kmutex_t recent_events_lock;
 /* Id of the dispatched cleaner task; 0 when no cleaner is scheduled. */
 static taskqid_t recent_events_cleaner_tqid;
 
 /*
  * Upper bound on the number of recent ereport nodes that are retained.
  *
  * Each node is about 128 bytes so 2,000 would consume 1/4 MiB.
  *
  * This setting can be changed dynamically and setting it to zero
  * disables duplicate detection.
  */
 static unsigned int zfs_zevent_retain_max = 2000;
 
 /*
  * The lifespan, in seconds, for a recent ereport entry. The default of
  * 15 minutes is intended to outlive the zfs diagnosis engine's threshold
  * of 10 errors over a period of 10 minutes.
  */
 static unsigned int zfs_zevent_retain_expire_secs = 900;
 
 /*
  * Ereport subclasses that take part in duplicate detection.
  */
 typedef enum zfs_subclass {
 	ZSC_IO,		/* FM_EREPORT_ZFS_IO */
 	ZSC_DATA,	/* FM_EREPORT_ZFS_DATA */
 	ZSC_CHECKSUM	/* FM_EREPORT_ZFS_CHECKSUM */
 } zfs_subclass_t;
 
 /*
  * One recently posted ereport.  The "criteria" fields are what
  * recent_events_compare() looks at; two nodes that agree on all of them are
  * duplicates.  The "internal state" fields do not take part in comparison.
  */
 typedef struct {
 	/* common criteria */
 	uint64_t	re_pool_guid;
 	uint64_t	re_vdev_guid;
 	int		re_io_error;
 	uint64_t	re_io_size;
 	uint64_t	re_io_offset;
 	zfs_subclass_t	re_subclass;
 	zio_priority_t	re_io_priority;
 
 	/* logical zio criteria (optional) */
 	zbookmark_phys_t re_io_bookmark;	/* all zero if not supplied */
 
 	/* internal state */
 	avl_node_t	re_tree_link;	/* linkage in recent_events_tree */
 	list_node_t	re_list_link;	/* linkage in recent_events_list */
 	uint64_t	re_timestamp;	/* gethrtime() when last seen */
 } recent_events_node_t;
 
 /*
  * AVL comparator for recent_events_node_t.
  *
  * Fields are compared one after another and the first difference decides
  * the result.  The particular order is somewhat arbitrary; what matters is
  * that 0 is returned only when every criterion matches, i.e. the two nodes
  * describe duplicate ereports.
  */
 static int
 recent_events_compare(const void *a, const void *b)
 {
 	const recent_events_node_t *lhs = a;
 	const recent_events_node_t *rhs = b;
 	const zbookmark_phys_t *lzb = &lhs->re_io_bookmark;
 	const zbookmark_phys_t *rzb = &rhs->re_io_bookmark;
 	int cmp;
 
 	/* common criteria */
 	cmp = TREE_CMP(lhs->re_subclass, rhs->re_subclass);
 	if (cmp == 0)
 		cmp = TREE_CMP(lhs->re_pool_guid, rhs->re_pool_guid);
 	if (cmp == 0)
 		cmp = TREE_CMP(lhs->re_vdev_guid, rhs->re_vdev_guid);
 	if (cmp == 0)
 		cmp = TREE_CMP(lhs->re_io_error, rhs->re_io_error);
 	if (cmp == 0)
 		cmp = TREE_CMP(lhs->re_io_priority, rhs->re_io_priority);
 	if (cmp == 0)
 		cmp = TREE_CMP(lhs->re_io_size, rhs->re_io_size);
 	if (cmp == 0)
 		cmp = TREE_CMP(lhs->re_io_offset, rhs->re_io_offset);
 
 	/* bookmark criteria */
 	if (cmp == 0)
 		cmp = TREE_CMP(lzb->zb_objset, rzb->zb_objset);
 	if (cmp == 0)
 		cmp = TREE_CMP(lzb->zb_object, rzb->zb_object);
 	if (cmp == 0)
 		cmp = TREE_CMP(lzb->zb_level, rzb->zb_level);
 	if (cmp == 0)
 		cmp = TREE_CMP(lzb->zb_blkid, rzb->zb_blkid);
 
 	return (cmp);
 }
 
 /*
  * workaround: vdev properties don't have inheritance
  *
  * Starting at 'vd' and moving up through vdev_parent, return the first value
  * of 'prop' that differs from the property's default.  If no vdev on the
  * path has a non-default value, or 'prop' is not one of the properties
  * handled here, the default is returned.
  */
 static uint64_t
 vdev_prop_get_inherited(vdev_t *vd, vdev_prop_t prop)
 {
 	for (;;) {
 		uint64_t propdef = vdev_prop_default_numeric(prop);
 		uint64_t propval;
 
 		switch (prop) {
 		case VDEV_PROP_CHECKSUM_N:
 			propval = vd->vdev_checksum_n;
 			break;
 		case VDEV_PROP_CHECKSUM_T:
 			propval = vd->vdev_checksum_t;
 			break;
 		case VDEV_PROP_IO_N:
 			propval = vd->vdev_io_n;
 			break;
 		case VDEV_PROP_IO_T:
 			propval = vd->vdev_io_t;
 			break;
 		case VDEV_PROP_SLOW_IO_EVENTS:
 			propval = vd->vdev_slow_io_events;
 			break;
 		case VDEV_PROP_SLOW_IO_N:
 			propval = vd->vdev_slow_io_n;
 			break;
 		case VDEV_PROP_SLOW_IO_T:
 			propval = vd->vdev_slow_io_t;
 			break;
 		default:
 			propval = propdef;
 			break;
 		}
 
 		/* A value set on this vdev wins. */
 		if (propval != propdef)
 			return (propval);
 
 		/* Reached the top without finding an override. */
 		if (vd->vdev_parent == NULL)
 			return (propdef);
 
 		vd = vd->vdev_parent;
 	}
 }
 
 static void zfs_ereport_schedule_cleaner(void);
 
 /*
  * Background task that frees stale recent-event nodes.
  *
  * Nodes are taken from the tail of recent_events_list for as long as they
  * are older than zfs_zevent_retain_expire_secs.  If anything is left
  * afterwards, another cleaner run is scheduled.  'arg' is unused.
  */
 static void
 zfs_ereport_cleaner(void *arg)
 {
 	uint64_t now = gethrtime();
 
 	mutex_enter(&recent_events_lock);
 
 	for (;;) {
 		recent_events_node_t *oldest = list_tail(&recent_events_list);
 
 		if (oldest == NULL)
 			break;
 
 		/* Stop at the first node that has not expired yet. */
 		uint64_t age = NSEC2SEC(now - oldest->re_timestamp);
 		if (age <= zfs_zevent_retain_expire_secs)
 			break;
 
 		avl_remove(&recent_events_tree, oldest);
 		list_remove(&recent_events_list, oldest);
 		kmem_free(oldest, sizeof (*oldest));
 	}
 
 	/* This run is over; start another only if nodes remain. */
 	recent_events_cleaner_tqid = 0;
 	if (!list_is_empty(&recent_events_list))
 		zfs_ereport_schedule_cleaner();
 
 	mutex_exit(&recent_events_lock);
 }
 
 /*
  * Dispatch zfs_ereport_cleaner() on system_delay_taskq so that it runs one
  * second after the retention period (zfs_zevent_retain_expire_secs) has
  * passed, and remember the task id in recent_events_cleaner_tqid.
  *
  * The caller must hold recent_events_lock.
  */
 static void
 zfs_ereport_schedule_cleaner(void)
 {
 	ASSERT(MUTEX_HELD(&recent_events_lock));
 
 	/* Delay in nanoseconds: the retention period plus one second. */
 	uint64_t timeout = SEC2NSEC(zfs_zevent_retain_expire_secs + 1);
 
 	recent_events_cleaner_tqid = taskq_dispatch_delay(
 	    system_delay_taskq, zfs_ereport_cleaner, NULL, TQ_SLEEP,
 	    ddi_get_lbolt() + NSEC_TO_TICK(timeout));
 }
 
 /*
  * Drop recent-event entries.  With a vdev, entries for that vdev are
  * removed; with vd == NULL, entries for every vdev in the pool 'spa' are
  * removed.  At least one of the two arguments must be non-NULL.
  */
 void
 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
 {
 	ASSERT(vd != NULL || spa != NULL);
 
 	/* Only one of the two guids is meaningful; the other is 0. */
 	uint64_t vdev_guid = (vd == NULL) ? 0 : vd->vdev_guid;
 	uint64_t pool_guid = (vd == NULL) ? spa_guid(spa) : 0;
 
 	mutex_enter(&recent_events_lock);
 
 	recent_events_node_t *entry, *next;
 	for (entry = list_head(&recent_events_list); entry != NULL;
 	    entry = next) {
 		/* Fetch the successor first; 'entry' may be freed below. */
 		next = list_next(&recent_events_list, entry);
 
 		if (entry->re_vdev_guid != vdev_guid &&
 		    entry->re_pool_guid != pool_guid)
 			continue;
 
 		avl_remove(&recent_events_tree, entry);
 		list_remove(&recent_events_list, entry);
 		kmem_free(entry, sizeof (*entry));
 	}
 
 	mutex_exit(&recent_events_lock);
 }
 
 /*
  * Check if an ereport would be a duplicate of one recently posted.
  *
  * An ereport is considered a duplicate if the set of criteria in
  * recent_events_node_t all match.
  *
  * Only FM_EREPORT_ZFS_IO, FM_EREPORT_ZFS_DATA, and FM_EREPORT_ZFS_CHECKSUM
  * are candidates for duplicate checking.
  *
  * subclass      ereport subclass string
  * spa, vd, zio  source of the pool guid, vdev guid, error and priority;
  *               B_FALSE is returned right away if vd or zio is NULL
  * zb            optional bookmark; NULL leaves the bookmark criteria zero
  * offset, size  used in place of the zio's offset and size when size != 0
  *
  * Returns B_TRUE if a matching entry no older than
  * zfs_zevent_retain_expire_secs was found.  Otherwise returns B_FALSE and,
  * unless the check was skipped, the ereport is now recorded as recent.
  */
 static boolean_t
 zfs_ereport_is_duplicate(const char *subclass, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t offset, uint64_t size)
 {
 	recent_events_node_t search = {0}, *entry;
 
 	if (vd == NULL || zio == NULL)
 		return (B_FALSE);
 
 	/* A limit of zero turns duplicate detection off. */
 	if (zfs_zevent_retain_max == 0)
 		return (B_FALSE);
 
 	if (strcmp(subclass, FM_EREPORT_ZFS_IO) == 0)
 		search.re_subclass = ZSC_IO;
 	else if (strcmp(subclass, FM_EREPORT_ZFS_DATA) == 0)
 		search.re_subclass = ZSC_DATA;
 	else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0)
 		search.re_subclass = ZSC_CHECKSUM;
 	else
 		return (B_FALSE);
 
 	search.re_pool_guid = spa_guid(spa);
 	search.re_vdev_guid = vd->vdev_guid;
 	search.re_io_error = zio->io_error;
 	search.re_io_priority = zio->io_priority;
 	/* if size is supplied use it over what's in zio */
 	if (size) {
 		search.re_io_size = size;
 		search.re_io_offset = offset;
 	} else {
 		search.re_io_size = zio->io_size;
 		search.re_io_offset = zio->io_offset;
 	}
 
 	/* grab optional logical zio criteria */
 	if (zb != NULL) {
 		search.re_io_bookmark.zb_objset = zb->zb_objset;
 		search.re_io_bookmark.zb_object = zb->zb_object;
 		search.re_io_bookmark.zb_level = zb->zb_level;
 		search.re_io_bookmark.zb_blkid = zb->zb_blkid;
 	}
 
 	uint64_t now = gethrtime();
 
 	mutex_enter(&recent_events_lock);
 
 	/* check if we have seen this one recently */
 	entry = avl_find(&recent_events_tree, &search, NULL);
 	if (entry != NULL) {
 		/* Age is measured before the timestamp is refreshed below. */
 		uint64_t age = NSEC2SEC(now - entry->re_timestamp);
 
 		/*
 		 * There is still an active cleaner (since we're here).
 		 * Reset the last seen time for this duplicate entry
 		 * so that its lifespan gets extended.
 		 */
 		list_remove(&recent_events_list, entry);
 		list_insert_head(&recent_events_list, entry);
 		entry->re_timestamp = now;
 
 		zfs_zevent_track_duplicate();
 		mutex_exit(&recent_events_lock);
 
 		/* A match that had already expired is not a duplicate. */
 		return (age <= zfs_zevent_retain_expire_secs);
 	}
 
 	if (avl_numnodes(&recent_events_tree) >= zfs_zevent_retain_max) {
 		/* recycle oldest node */
 		entry = list_tail(&recent_events_list);
 		ASSERT(entry != NULL);
 		list_remove(&recent_events_list, entry);
 		avl_remove(&recent_events_tree, entry);
 	} else {
 		entry = kmem_alloc(sizeof (recent_events_node_t), KM_SLEEP);
 	}
 
 	/* record this as a recent ereport */
 	*entry = search;
 	avl_add(&recent_events_tree, entry);
 	list_insert_head(&recent_events_list, entry);
 	entry->re_timestamp = now;
 
 	/* Start a cleaner if not already scheduled */
 	if (recent_events_cleaner_tqid == 0)
 		zfs_ereport_schedule_cleaner();
 
 	mutex_exit(&recent_events_lock);
 	return (B_FALSE);
 }
 
 /*
  * Cleanup callback passed to zfs_zevent_post(): destroys the event nvlist
  * and the detector nvlist.  Either argument may be NULL.
  */
 void
 zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector)
 {
 	if (nvl != NULL) {
 		fm_nvlist_destroy(nvl, FM_NVA_FREE);
 	}
 
 	if (detector != NULL) {
 		fm_nvlist_destroy(detector, FM_NVA_FREE);
 	}
 }
 
 /*
  * We want to rate limit ZIO delay, deadman, and checksum events so as to not
  * flood zevent consumers when a disk is acting up.
  *
  * The limiter consulted is the one in 'vd' that matches 'subclass'; any
  * other subclass is never rate limited.  Because the limiters live in the
  * vdev, an event without a vdev is never rate limited either.
  *
  * Returns 1 if we're ratelimiting (after calling
  * fm_erpt_dropped_increment()), 0 if not.
  */
 static int
 zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd)
 {
 	int rc = 0;
 
 	/*
 	 * The other ereport helpers treat the vdev as optional, so do not
 	 * dereference a NULL one here.
 	 */
 	if (vd == NULL)
 		return (0);
 
 	/*
 	 * zfs_ratelimit() returns 1 if we're *not* ratelimiting and 0 if we
 	 * are.  Invert it to get our return value.
 	 */
 	if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
 		rc = !zfs_ratelimit(&vd->vdev_delay_rl);
 	} else if (strcmp(subclass, FM_EREPORT_ZFS_DEADMAN) == 0) {
 		rc = !zfs_ratelimit(&vd->vdev_deadman_rl);
 	} else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
 		rc = !zfs_ratelimit(&vd->vdev_checksum_rl);
 	}
 
 	if (rc) {
 		/* We're rate limiting */
 		fm_erpt_dropped_increment();
 	}
 
 	return (rc);
 }
 
 /*
  * Build (but do not post) the ereport and detector nvlists for 'subclass'.
  *
  * The payload is assembled while holding spa->spa_errlist_lock, from
  * whichever of 'vd', 'zio' and 'zb' are non-NULL.  'stateoroffset' is
  * reported as the I/O offset when both 'zio' and 'vd' are set and 'size'
  * is non-zero, and as the previous vdev state when there is a vdev but no
  * zio.
  *
  * Return B_TRUE with *ereport_out and *detector_out filled in, or B_FALSE
  * (outputs not written) if either nvlist could not be created.
  */
 static boolean_t
 zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out,
     const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     zio_t *zio, uint64_t stateoroffset, uint64_t size)
 {
 	nvlist_t *ereport, *detector;
 
 	uint64_t ena;
 	char class[64];
 
 	if ((ereport = fm_nvlist_create(NULL)) == NULL)
 		return (B_FALSE);
 
 	if ((detector = fm_nvlist_create(NULL)) == NULL) {
 		/* Don't leak the first nvlist when the second one fails. */
 		fm_nvlist_destroy(ereport, FM_NVA_FREE);
 		return (B_FALSE);
 	}
 
 	/*
 	 * Serialize ereport generation
 	 */
 	mutex_enter(&spa->spa_errlist_lock);
 
 	/*
 	 * Determine the ENA to use for this event.  If we are in a loading
 	 * state, use a SPA-wide ENA.  Otherwise, if we are in an I/O state, use
 	 * a root zio-wide ENA.  Otherwise, simply use a unique ENA.
 	 */
 	if (spa_load_state(spa) != SPA_LOAD_NONE) {
 		if (spa->spa_ena == 0)
 			spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1);
 		ena = spa->spa_ena;
 	} else if (zio != NULL && zio->io_logical != NULL) {
 		if (zio->io_logical->io_ena == 0)
 			zio->io_logical->io_ena =
 			    fm_ena_generate(0, FM_ENA_FMT1);
 		ena = zio->io_logical->io_ena;
 	} else {
 		ena = fm_ena_generate(0, FM_ENA_FMT1);
 	}
 
 	/*
 	 * Construct the full class, detector, and other standard FMA fields.
 	 */
 	(void) snprintf(class, sizeof (class), "%s.%s",
 	    ZFS_ERROR_CLASS, subclass);
 
 	fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa),
 	    vd != NULL ? vd->vdev_guid : 0);
 
 	fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL);
 
 	/*
 	 * Construct the per-ereport payload, depending on which parameters are
 	 * passed in.
 	 */
 
 	/*
 	 * Generic payload members common to all ereports.
 	 */
 	fm_payload_set(ereport,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa),
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa),
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64,
 	    (uint64_t)spa_state(spa),
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32,
 	    (int32_t)spa_load_state(spa), NULL);
 
 	/* Any failmode other than WAIT or CONTINUE is reported as PANIC. */
 	fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE,
 	    DATA_TYPE_STRING,
 	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ?
 	    FM_EREPORT_FAILMODE_WAIT :
 	    spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ?
 	    FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC,
 	    NULL);
 
 	/*
 	 * Payload describing the vdev, its parent, and the pool's spares.
 	 */
 	if (vd != NULL) {
 		vdev_t *pvd = vd->vdev_parent;
 		vdev_queue_t *vq = &vd->vdev_queue;
 		vdev_stat_t *vs = &vd->vdev_stat;
 		vdev_t *spare_vd;
 		uint64_t *spare_guids;
 		char **spare_paths;
 		int i, spare_count;
 
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID,
 		    DATA_TYPE_UINT64, vd->vdev_guid,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE,
 		    DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL);
 		if (vd->vdev_path != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH,
 			    DATA_TYPE_STRING, vd->vdev_path, NULL);
 		if (vd->vdev_devid != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID,
 			    DATA_TYPE_STRING, vd->vdev_devid, NULL);
 		if (vd->vdev_fru != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU,
 			    DATA_TYPE_STRING, vd->vdev_fru, NULL);
 		if (vd->vdev_enc_sysfs_path != NULL)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 			    DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL);
 		if (vd->vdev_ashift)
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT,
 			    DATA_TYPE_UINT64, vd->vdev_ashift, NULL);
 
 		/*
 		 * vq and vs were taken as addresses of members of *vd above,
 		 * so with vd non-NULL the two checks below are expected to
 		 * always pass.
 		 */
 		if (vq != NULL) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS,
 			    DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL);
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS,
 			    DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL);
 		}
 
 		if (vs != NULL) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_read_errors,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_write_errors,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_checksum_errors,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS,
 			    DATA_TYPE_UINT64, vs->vs_slow_ios,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DIO_VERIFY_ERRORS,
 			    DATA_TYPE_UINT64, vs->vs_dio_verify_errors,
 			    NULL);
 		}
 
 		if (pvd != NULL) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID,
 			    DATA_TYPE_UINT64, pvd->vdev_guid,
 			    FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE,
 			    DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type,
 			    NULL);
 			if (pvd->vdev_path)
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH,
 				    DATA_TYPE_STRING, pvd->vdev_path, NULL);
 			if (pvd->vdev_devid)
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID,
 				    DATA_TYPE_STRING, pvd->vdev_devid, NULL);
 		}
 
 		/*
 		 * Gather the path and guid of every spare into two temporary
 		 * arrays.  Slots whose sav_vdevs entry is NULL stay zeroed.
 		 */
 		spare_count = spa->spa_spares.sav_count;
 		spare_paths = kmem_zalloc(sizeof (char *) * spare_count,
 		    KM_SLEEP);
 		spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count,
 		    KM_SLEEP);
 
 		for (i = 0; i < spare_count; i++) {
 			spare_vd = spa->spa_spares.sav_vdevs[i];
 			if (spare_vd) {
 				spare_paths[i] = spare_vd->vdev_path;
 				spare_guids[i] = spare_vd->vdev_guid;
 			}
 		}
 
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS,
 		    DATA_TYPE_STRING_ARRAY, spare_count, spare_paths,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS,
 		    DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL);
 
 		/* The temporary arrays are released with the sizes used above. */
 		kmem_free(spare_guids, sizeof (uint64_t) * spare_count);
 		kmem_free(spare_paths, sizeof (char *) * spare_count);
 	}
 
 	if (zio != NULL) {
 		/*
 		 * Payload common to all I/Os.
 		 */
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR,
 		    DATA_TYPE_INT32, zio->io_error, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS,
 		    DATA_TYPE_UINT64, zio->io_flags, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE,
 		    DATA_TYPE_UINT32, zio->io_stage, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE,
 		    DATA_TYPE_UINT32, zio->io_pipeline, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY,
 		    DATA_TYPE_UINT64, zio->io_delay, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP,
 		    DATA_TYPE_UINT64, zio->io_timestamp, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA,
 		    DATA_TYPE_UINT64, zio->io_delta, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TYPE,
 		    DATA_TYPE_UINT32, zio->io_type, NULL);
 		fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PRIORITY,
 		    DATA_TYPE_UINT32, zio->io_priority, NULL);
 
 		/*
 		 * If the 'size' parameter is non-zero, it indicates this is a
 		 * RAID-Z or other I/O where the physical offset and length are
 		 * provided for us, instead of within the zio_t.
 		 */
 		if (vd != NULL) {
 			if (size)
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
 				    DATA_TYPE_UINT64, stateoroffset,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
 				    DATA_TYPE_UINT64, size, NULL);
 			else
 				fm_payload_set(ereport,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET,
 				    DATA_TYPE_UINT64, zio->io_offset,
 				    FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE,
 				    DATA_TYPE_UINT64, zio->io_size, NULL);
 		}
 	} else if (vd != NULL) {
 		/*
 		 * If we have a vdev but no zio, this is a device fault, and the
 		 * 'stateoroffset' parameter indicates the previous state of the
 		 * vdev.
 		 */
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_PREV_STATE,
 		    DATA_TYPE_UINT64, stateoroffset, NULL);
 	}
 
 	/*
 	 * Payload for I/Os with corresponding logical information.
 	 * The bookmark is also reported when there is no zio at all.
 	 */
 	if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET,
 		    DATA_TYPE_UINT64, zb->zb_objset,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT,
 		    DATA_TYPE_UINT64, zb->zb_object,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL,
 		    DATA_TYPE_INT64, zb->zb_level,
 		    FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID,
 		    DATA_TYPE_UINT64, zb->zb_blkid, NULL);
 	}
 
 	/*
 	 * Payload for tuning the zed.  Each of the three sections below adds
 	 * its N and T vdev properties only when they differ from the
 	 * property's default value.
 	 */
 	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) {
 		uint64_t cksum_n, cksum_t;
 
 		cksum_n = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_N);
 		if (cksum_n != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_N))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_N,
 			    DATA_TYPE_UINT64,
 			    cksum_n,
 			    NULL);
 
 		cksum_t = vdev_prop_get_inherited(vd, VDEV_PROP_CHECKSUM_T);
 		if (cksum_t != vdev_prop_default_numeric(VDEV_PROP_CHECKSUM_T))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_T,
 			    DATA_TYPE_UINT64,
 			    cksum_t,
 			    NULL);
 	}
 
 	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_IO) == 0) {
 		uint64_t io_n, io_t;
 
 		io_n = vdev_prop_get_inherited(vd, VDEV_PROP_IO_N);
 		if (io_n != vdev_prop_default_numeric(VDEV_PROP_IO_N))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_N,
 			    DATA_TYPE_UINT64,
 			    io_n,
 			    NULL);
 
 		io_t = vdev_prop_get_inherited(vd, VDEV_PROP_IO_T);
 		if (io_t != vdev_prop_default_numeric(VDEV_PROP_IO_T))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_IO_T,
 			    DATA_TYPE_UINT64,
 			    io_t,
 			    NULL);
 	}
 
 	if (vd != NULL && strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) {
 		uint64_t slow_io_n, slow_io_t;
 
 		slow_io_n = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_N);
 		if (slow_io_n != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_N))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_N,
 			    DATA_TYPE_UINT64,
 			    slow_io_n,
 			    NULL);
 
 		slow_io_t = vdev_prop_get_inherited(vd, VDEV_PROP_SLOW_IO_T);
 		if (slow_io_t != vdev_prop_default_numeric(VDEV_PROP_SLOW_IO_T))
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_SLOW_IO_T,
 			    DATA_TYPE_UINT64,
 			    slow_io_t,
 			    NULL);
 	}
 
 	mutex_exit(&spa->spa_errlist_lock);
 
 	/* Hand both nvlists to the caller. */
 	*ereport_out = ereport;
 	*detector_out = detector;
 	return (B_TRUE);
 }
 
 /*
  * if it's <= 128 bytes, save the corruption directly
  * (ZFM_MAX_INLINE is that limit expressed in uint64_t words)
  */
 #define	ZFM_MAX_INLINE		(128 / sizeof (uint64_t))
 
 /* Capacity of the zei_ranges, zei_range_sets and zei_range_clears arrays. */
 #define	MAX_RANGES		16
 
 /*
  * Summary of the differences between a good and a bad buffer, as built by
  * annotate_ecksum().
  */
 typedef struct zfs_ecksum_info {
 	/* inline arrays of bits set and cleared. */
 	uint64_t zei_bits_set[ZFM_MAX_INLINE];
 	uint64_t zei_bits_cleared[ZFM_MAX_INLINE];
 
 	/*
 	 * for each range, the number of bits set and cleared.  The Hamming
 	 * distance between the good and bad buffers is the sum of them all.
 	 */
 	uint32_t zei_range_sets[MAX_RANGES];
 	uint32_t zei_range_clears[MAX_RANGES];
 
 	/*
 	 * The differing ranges themselves.  annotate_ecksum() hands this
 	 * array to the ereport as a flat uint32_t array of start/end pairs,
 	 * so the member order and types here must not change.
 	 */
 	struct zei_ranges {
 		uint32_t	zr_start;
 		uint32_t	zr_end;
 	} zei_ranges[MAX_RANGES];
 
 	/* Number of zei_ranges entries currently in use. */
 	size_t	zei_range_count;
 	/* Smallest gap seen between adjacent recorded ranges. */
 	uint32_t zei_mingap;
 	/* Ranges closer together than this are merged into one. */
 	uint32_t zei_allowed_mingap;
 
 } zfs_ecksum_info_t;
 
 /*
  * Add the number of one bits in 'value_arg' to '*count'.
  */
 static void
 update_bad_bits(uint64_t value_arg, uint32_t *count)
 {
 	/* We store the bits in big-endian (largest-first) order */
 	uint64_t remaining = BE_64(value_arg);
 	size_t nbits = 0;
 
 	/* Each pass clears the lowest one bit, so this runs once per bit. */
 	while (remaining != 0) {
 		remaining &= remaining - 1;
 		nbits++;
 	}
 
 	/* update the count of bits changed */
 	*count += nbits;
 }
 
 /*
  * We've now filled up the range array, and need to increase "mingap" and
  * shrink the range list accordingly.  zei_mingap is always the smallest
  * distance between array entries, so we set the new_allowed_gap to be
  * one greater than that.  We then go through the list, joining together
  * any ranges which are closer than the new_allowed_gap.
  *
  * By construction, there will be at least one.  We also update zei_mingap
  * to the new smallest gap, to prepare for our next invocation.
  *
  * Every input range survives, either unchanged or folded into a merged
  * range.  In particular the final range is kept even when it is too far
  * from its predecessor to be merged.  At least two ranges are required on
  * entry, since a single range has nothing to merge with.
  */
 static void
 zei_shrink_ranges(zfs_ecksum_info_t *eip)
 {
 	uint32_t mingap = UINT32_MAX;
 	uint32_t new_allowed_gap = eip->zei_mingap + 1;
 
 	size_t idx;
 	size_t output = 0;
 	size_t max = eip->zei_range_count;
 
 	struct zei_ranges *r = eip->zei_ranges;
 
 	ASSERT3U(eip->zei_range_count, >, 1);
 	ASSERT3U(eip->zei_range_count, <=, MAX_RANGES);
 
 	/* The (possibly merged) range currently being accumulated. */
 	uint32_t start = r[0].zr_start;
 	uint32_t end = r[0].zr_end;
 
 	for (idx = 1; idx < max; idx++) {
 		uint32_t gap = r[idx].zr_start - end;
 
 		if (gap < new_allowed_gap) {
 			/* Close enough: absorb r[idx] into the current range. */
 			end = r[idx].zr_end;
 			continue;
 		}
 		if (gap < mingap)
 			mingap = gap;
 
 		/*
 		 * r[idx] begins a new range, so emit the finished one.
 		 * output is always below idx here, so the store cannot
 		 * clobber an entry that has not been read yet.
 		 */
 		r[output].zr_start = start;
 		r[output].zr_end = end;
 		output++;
 
 		start = r[idx].zr_start;
 		end = r[idx].zr_end;
 	}
 
 	/* Emit the range that was still open when the input ran out. */
 	r[output].zr_start = start;
 	r[output].zr_end = end;
 	output++;
 
 	ASSERT3U(output, <, eip->zei_range_count);
 	eip->zei_range_count = output;
 	eip->zei_mingap = mingap;
 	eip->zei_allowed_mingap = new_allowed_gap;
 }
 
 /*
  * Record the range 'start'..'end' in eip's range list.
  *
  * A full list is first compacted with zei_shrink_ranges().  A range that
  * lies closer than zei_allowed_mingap to the last recorded range extends
  * that range instead of taking a new slot; otherwise zei_mingap is lowered
  * if this gap is the smallest seen and the range is appended.
  */
 static void
 zei_add_range(zfs_ecksum_info_t *eip, int start, int end)
 {
 	struct zei_ranges *ranges = eip->zei_ranges;
 	size_t nranges = eip->zei_range_count;
 
 	if (nranges >= MAX_RANGES) {
 		zei_shrink_ranges(eip);
 		nranges = eip->zei_range_count;
 	}
 
 	if (nranges != 0) {
 		struct zei_ranges *last = &ranges[nranges - 1];
 		int gap = start - last->zr_end;
 
 		if (gap < eip->zei_allowed_mingap) {
 			/* Too close to the previous range: just extend it. */
 			last->zr_end = end;
 			return;
 		}
 		if (gap < eip->zei_mingap)
 			eip->zei_mingap = gap;
 	} else {
 		/* First range: set up the gap bookkeeping. */
 		eip->zei_mingap = UINT32_MAX;
 		eip->zei_allowed_mingap = 1;
 	}
 
 	ranges[nranges].zr_start = start;
 	ranges[nranges].zr_end = end;
 	eip->zei_range_count++;
 }
 
 /*
  * Return the sum of (zr_end - zr_start) over all recorded ranges.
  */
 static size_t
 zei_range_total_size(zfs_ecksum_info_t *eip)
 {
 	const struct zei_ranges *cur = eip->zei_ranges;
 	const struct zei_ranges *stop = cur + eip->zei_range_count;
 	size_t total = 0;
 
 	while (cur < stop) {
 		total += (cur->zr_end - cur->zr_start);
 		cur++;
 	}
 
 	return (total);
 }
 
 /*
  * Add checksum-error details to 'ereport' and return a freshly allocated
  * zfs_ecksum_info_t describing them.  The callers in this file release the
  * returned structure with kmem_free().
  *
  * - For an injected error (info->zbc_injected) nothing is added and the
  *   zeroed structure is returned.
  * - If 'info' carries a checksum, the algorithm name (and a byteswap flag,
  *   if set) is added.
  * - If either buffer is NULL, nothing further is added.
  * - Otherwise the two buffers are compared one uint64_t at a time, the
  *   differing ranges are collected, and range and bit statistics are added
  *   to the ereport.  The raw set/cleared bits are included only when the
  *   differing words fit in ZFM_MAX_INLINE.
  *
  * Returns NULL only when the buffers are identical and 'drop_if_identical'
  * is set.
  */
 static zfs_ecksum_info_t *
 annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info,
     const abd_t *goodabd, const abd_t *badabd, size_t size,
     boolean_t drop_if_identical)
 {
 	const uint64_t *good;
 	const uint64_t *bad;
 
 	size_t nui64s = size / sizeof (uint64_t);
 
 	size_t inline_size;
 	int no_inline = 0;
 	size_t idx;
 	size_t range;
 
 	size_t offset = 0;
 	/* Index where the current run of differing words began, or -1. */
 	ssize_t start = -1;
 
 	zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP);
 
 	/* don't do any annotation for injected checksum errors */
 	if (info != NULL && info->zbc_injected)
 		return (eip);
 
 	if (info != NULL && info->zbc_has_cksum) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO,
 		    DATA_TYPE_STRING,
 		    info->zbc_checksum_name,
 		    NULL);
 
 		if (info->zbc_byteswapped) {
 			fm_payload_set(ereport,
 			    FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP,
 			    DATA_TYPE_BOOLEAN, 1,
 			    NULL);
 		}
 	}
 
 	/* Without both buffers there is nothing to compare. */
 	if (badabd == NULL || goodabd == NULL)
 		return (eip);
 
 	ASSERT3U(nui64s, <=, UINT32_MAX);
 	ASSERT3U(size, ==, nui64s * sizeof (uint64_t));
 	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(size, <=, UINT32_MAX);
 
 	/* Both borrowed buffers are returned on every exit path below. */
 	good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size);
 	bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size);
 
 	/* build up the range list by comparing the two buffers. */
 	for (idx = 0; idx < nui64s; idx++) {
 		if (good[idx] == bad[idx]) {
 			if (start == -1)
 				continue;
 
 			/* A run of differences just ended at idx. */
 			zei_add_range(eip, start, idx);
 			start = -1;
 		} else {
 			if (start != -1)
 				continue;
 
 			start = idx;
 		}
 	}
 	/* Close a run that extends to the end of the buffers. */
 	if (start != -1)
 		zei_add_range(eip, start, idx);
 
 	/* See if it will fit in our inline buffers */
 	inline_size = zei_range_total_size(eip);
 	if (inline_size > ZFM_MAX_INLINE)
 		no_inline = 1;
 
 	/*
 	 * If there is no change and we want to drop if the buffers are
 	 * identical, do so.
 	 */
 	if (inline_size == 0 && drop_if_identical) {
 		kmem_free(eip, sizeof (*eip));
 		abd_return_buf((abd_t *)goodabd, (void *)good, size);
 		abd_return_buf((abd_t *)badabd, (void *)bad, size);
 		return (NULL);
 	}
 
 	/*
 	 * Now walk through the ranges, filling in the details of the
 	 * differences.  Also convert our uint64_t-array offsets to byte
 	 * offsets.
 	 */
 	for (range = 0; range < eip->zei_range_count; range++) {
 		/* These locals shadow the function-scope 'start' above. */
 		size_t start = eip->zei_ranges[range].zr_start;
 		size_t end = eip->zei_ranges[range].zr_end;
 
 		for (idx = start; idx < end; idx++) {
 			uint64_t set, cleared;
 
 			// bits set in bad, but not in good
 			set = ((~good[idx]) & bad[idx]);
 			// bits set in good, but not in bad
 			cleared = (good[idx] & (~bad[idx]));
 
 			if (!no_inline) {
 				ASSERT3U(offset, <, inline_size);
 				eip->zei_bits_set[offset] = set;
 				eip->zei_bits_cleared[offset] = cleared;
 				offset++;
 			}
 
 			update_bad_bits(set, &eip->zei_range_sets[range]);
 			update_bad_bits(cleared, &eip->zei_range_clears[range]);
 		}
 
 		/* convert to byte offsets */
 		eip->zei_ranges[range].zr_start	*= sizeof (uint64_t);
 		eip->zei_ranges[range].zr_end	*= sizeof (uint64_t);
 	}
 
 	abd_return_buf((abd_t *)goodabd, (void *)good, size);
 	abd_return_buf((abd_t *)badabd, (void *)bad, size);
 
 	/* From here on the gap and the inline size are in bytes as well. */
 	eip->zei_allowed_mingap	*= sizeof (uint64_t);
 	inline_size		*= sizeof (uint64_t);
 
 	/* fill in ereport */
 	fm_payload_set(ereport,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES,
 	    DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count,
 	    (uint32_t *)eip->zei_ranges,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP,
 	    DATA_TYPE_UINT32, eip->zei_allowed_mingap,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS,
 	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets,
 	    FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS,
 	    DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears,
 	    NULL);
 
 	/* The raw bit arrays are only included when they fit inline. */
 	if (!no_inline) {
 		fm_payload_set(ereport,
 		    FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS,
 		    DATA_TYPE_UINT8_ARRAY,
 		    inline_size, (uint8_t *)eip->zei_bits_set,
 		    FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS,
 		    DATA_TYPE_UINT8_ARRAY,
 		    inline_size, (uint8_t *)eip->zei_bits_cleared,
 		    NULL);
 	}
 	return (eip);
 }
 #else
 /*
  * No-op variant of zfs_ereport_clear() for the #else side of the build
  * conditional; it only marks its arguments as used.
  */
 void
 zfs_ereport_clear(spa_t *spa, vdev_t *vd)
 {
 	(void) spa;
 	(void) vd;
 }
 #endif
 
 /*
  * Decide whether an event of 'subclass' is still worth reporting for the
  * given zio/vdev/pool.  For example, we don't want to keep logging events
  * for a faulted or missing vdev.  'vd' and 'zio' may each be NULL.
  *
  * Outside the kernel every event is considered valid.
  */
 boolean_t
 zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio)
 {
 #ifdef _KERNEL
 	/*
 	 * Errors seen during a spa_tryimport() or in recovery mode are
 	 * ignored.
 	 */
 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT ||
 	    spa_load_state(spa) == SPA_LOAD_RECOVER) {
 		return (B_FALSE);
 	}
 
 	/*
 	 * While a pool is being opened after a previous failed attempt,
 	 * new ereports would only lead to the same diagnosis, so skip them.
 	 */
 	if (spa_load_state(spa) != SPA_LOAD_NONE &&
 	    spa->spa_last_open_failed) {
 		return (B_FALSE);
 	}
 
 	/* Only read and write zios are of interest. */
 	if (zio != NULL && zio->io_type != ZIO_TYPE_READ &&
 	    zio->io_type != ZIO_TYPE_WRITE) {
 		return (B_FALSE);
 	}
 
 	if (zio != NULL && vd != NULL) {
 		/*
 		 * If the vdev has already been marked as failing due
 		 * to a failed probe, then ignore any subsequent I/O
 		 * errors, as the DE will automatically fault the vdev
 		 * on the first such failure.  This also catches cases
 		 * where vdev_remove_wanted is set and the device has
 		 * not yet been asynchronously placed into the REMOVED
 		 * state.
 		 */
 		if (zio->io_vd == vd && !vdev_accessible(vd, zio)) {
 			return (B_FALSE);
 		}
 
 		/*
 		 * Checksum errors on reads from DTL regions of leaf
 		 * vdevs are ignored.
 		 */
 		if (zio->io_type == ZIO_TYPE_READ &&
 		    zio->io_error == ECKSUM &&
 		    vd->vdev_ops->vdev_op_leaf &&
 		    vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) {
 			return (B_FALSE);
 		}
 	}
 
 	/*
 	 * A probe failure is not reported once the device has been
 	 * removed (or its removal requested) in the meantime.
 	 */
 	if (vd != NULL &&
 	    strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 &&
 	    (vd->vdev_remove_wanted ||
 	    vd->vdev_state == VDEV_STATE_REMOVED)) {
 		return (B_FALSE);
 	}
 
 	/* Ignore bogus delay events (like from ioctls or unqueued IOs) */
 	if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0 &&
 	    zio != NULL && zio->io_timestamp == 0) {
 		return (B_FALSE);
 	}
 #else
 	(void) subclass;
 	(void) spa;
 	(void) vd;
 	(void) zio;
 #endif
 	return (B_TRUE);
 }
 
 /*
  * Post an ereport for the given subclass
  *
  * In the kernel the event is filtered through zfs_ereport_is_valid(),
  * zfs_ereport_is_duplicate() and zfs_is_ratelimiting_event() before the
  * nvlists are built and handed to zfs_zevent_post().  Outside the kernel
  * nothing is done and 0 is returned.
  *
  * Returns
  * - 0 if an event was posted
  * - EINVAL if there was a problem posting event
  * - EBUSY if the event was rate limited
  * - EALREADY if the event was already posted (duplicate)
  * Otherwise the value returned by zfs_zevent_post() is passed through.
  */
 int
 zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t state)
 {
 	int rc = 0;
 #ifdef _KERNEL
 	nvlist_t *ereport = NULL;
 	nvlist_t *detector = NULL;
 
 	/* Use SET_ERROR() here too, like every other error return below. */
 	if (!zfs_ereport_is_valid(subclass, spa, vd, zio))
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_ereport_is_duplicate(subclass, spa, vd, zb, zio, 0, 0))
 		return (SET_ERROR(EALREADY));
 
 	if (zfs_is_ratelimiting_event(subclass, vd))
 		return (SET_ERROR(EBUSY));
 
 	/* A size of 0 makes 'state' the previous vdev state, not an offset. */
 	if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd,
 	    zb, zio, state, 0))
 		return (SET_ERROR(EINVAL));	/* couldn't post event */
 
 	if (ereport == NULL)
 		return (SET_ERROR(EINVAL));
 
 	/* Cleanup is handled by the callback function */
 	rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 #else
 	(void) subclass, (void) spa, (void) vd, (void) zb, (void) zio,
 	    (void) state;
 #endif
 	return (rc);
 }
 
 /*
  * Prepare a checksum ereport and queue it on the logical zio.
  *
  * In the kernel the event is first filtered through zfs_ereport_is_valid(),
  * zfs_ereport_is_duplicate() and zfs_is_ratelimiting_event().  A
  * zio_cksum_report_t is then allocated, given a copy of 'info' (if any),
  * and pushed onto zio->io_logical->io_cksum_report under
  * spa->spa_errlist_lock.  Nothing is posted by this function.
  *
  * Returns
  * - 0 if the report was queued, or (in the kernel) if the ereport could
  *   not be built, in which case the report is freed again
  * - EINVAL if the event is not valid for this zio/vdev/pool
  * - EBUSY if the event was rate limited
  * - EALREADY if the event was already posted (duplicate)
  */
 int
 zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     struct zio *zio, uint64_t offset, uint64_t length, zio_bad_cksum_t *info)
 {
 	zio_cksum_report_t *report;
 
 #ifdef _KERNEL
 	if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
 	    offset, length))
 		return (SET_ERROR(EALREADY));
 
 	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
 		return (SET_ERROR(EBUSY));
 #else
 	(void) zb, (void) offset;
 #endif
 
 	report = kmem_zalloc(sizeof (*report), KM_SLEEP);
 
 	/*
 	 * NOTE(review): this presumably installs the report's default
 	 * callbacks, including the zcr_free used by
 	 * zfs_ereport_free_checksum() -- confirm against its definition.
 	 */
 	zio_vsd_default_cksum_report(zio, report);
 
 	/* copy the checksum failure information if it was provided */
 	if (info != NULL) {
 		report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP);
 		memcpy(report->zcr_ckinfo, info, sizeof (*info));
 	}
 
 	/* vd is dereferenced unconditionally from here on. */
 	report->zcr_sector = 1ULL << vd->vdev_top->vdev_ashift;
 	report->zcr_align =
 	    vdev_psize_to_asize(vd->vdev_top, report->zcr_sector);
 	report->zcr_length = length;
 
 #ifdef _KERNEL
 	(void) zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector,
 	    FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length);
 
 	/* The ereport could not be built: drop the report, still return 0. */
 	if (report->zcr_ereport == NULL) {
 		zfs_ereport_free_checksum(report);
 		return (0);
 	}
 #endif
 
 	/* Push the report onto the front of the logical zio's report list. */
 	mutex_enter(&spa->spa_errlist_lock);
 	report->zcr_next = zio->io_logical->io_cksum_report;
 	zio->io_logical->io_cksum_report = report;
 	mutex_exit(&spa->spa_errlist_lock);
 	return (0);
 }
 
 /*
  * Finish a checksum report: annotate its ereport with the comparison of
  * 'good_data' and 'bad_data', then post it.  If annotate_ecksum() returns
  * NULL the ereport and detector are released through zfs_zevent_post_cb()
  * instead of being posted.  In both cases the report's ereport and
  * detector pointers are cleared afterwards.
  *
  * Outside the kernel this does nothing.
  */
 void
 zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data,
     const abd_t *bad_data, boolean_t drop_if_identical)
 {
 #ifdef _KERNEL
 	zfs_ecksum_info_t *eip = annotate_ecksum(report->zcr_ereport,
 	    report->zcr_ckinfo, good_data, bad_data, report->zcr_length,
 	    drop_if_identical);
 
 	if (eip == NULL) {
 		zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector);
 	} else {
 		zfs_zevent_post(report->zcr_ereport,
 		    report->zcr_detector, zfs_zevent_post_cb);
 	}
 
 	/* The nvlists are no longer the report's to free. */
 	report->zcr_detector = NULL;
 	report->zcr_ereport = NULL;
 
 	if (eip != NULL) {
 		kmem_free(eip, sizeof (*eip));
 	}
 #else
 	(void) report;
 	(void) good_data;
 	(void) bad_data;
 	(void) drop_if_identical;
 #endif
 }
 
 /*
  * Release a checksum report: its ereport and detector nvlists (kernel
  * only, and only if an ereport is still attached), whatever its zcr_free
  * callback releases, the copied checksum info if any, and finally the
  * report itself.
  */
 void
 zfs_ereport_free_checksum(zio_cksum_report_t *rpt)
 {
 #ifdef _KERNEL
 	if (rpt->zcr_ereport != NULL) {
 		fm_nvlist_destroy(rpt->zcr_ereport, FM_NVA_FREE);
 		fm_nvlist_destroy(rpt->zcr_detector, FM_NVA_FREE);
 	}
 #endif
 	/* Let the report's owner release its callback data. */
 	rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo);
 
 	if (rpt->zcr_ckinfo != NULL) {
 		kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo));
 	}
 
 	kmem_free(rpt, sizeof (*rpt));
 }
 
 /*
  * Post a checksum ereport
  *
  * In the kernel the event is filtered through zfs_ereport_is_valid(),
  * zfs_ereport_is_duplicate() and zfs_is_ratelimiting_event(), then built,
  * annotated with the good/bad buffer comparison, and posted.  Outside the
  * kernel nothing is done and 0 is returned.
  *
  * Returns
  * - 0 if an event was posted
  * - EINVAL if there was a problem posting event
  * - EBUSY if the event was rate limited
  * - EALREADY if the event was already posted (duplicate)
  * Otherwise the value returned by zfs_zevent_post() is passed through.
  */
 int
 zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb,
     struct zio *zio, uint64_t offset, uint64_t length,
     const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc)
 {
 	int rc = 0;
 #ifdef _KERNEL
 	nvlist_t *ereport = NULL;
 	nvlist_t *detector = NULL;
 	zfs_ecksum_info_t *info;
 
 	if (!zfs_ereport_is_valid(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio))
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_ereport_is_duplicate(FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio,
 	    offset, length))
 		return (SET_ERROR(EALREADY));
 
 	if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd))
 		return (SET_ERROR(EBUSY));
 
 	if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM,
 	    spa, vd, zb, zio, offset, length) || (ereport == NULL)) {
 		return (SET_ERROR(EINVAL));
 	}
 
 	info = annotate_ecksum(ereport, zbc, good_data, bad_data, length,
 	    B_FALSE);
 
 	if (info != NULL) {
 		/* Cleanup of the nvlists is handled by the callback. */
 		rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb);
 		kmem_free(info, sizeof (*info));
 	} else {
 		/*
 		 * Nothing will be posted, so release the nvlists here
 		 * instead of leaking them, as
 		 * zfs_ereport_finish_checksum() does.
 		 */
 		zfs_zevent_post_cb(ereport, detector);
 	}
 #else
 	(void) spa, (void) vd, (void) zb, (void) zio, (void) offset,
 	    (void) length, (void) good_data, (void) bad_data, (void) zbc;
 #endif
 	return (rc);
 }
 
 /*
  * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of
  * change in the pool.  All sysevents are listed in sys/sysevent/eventdefs.h
  * and are designed to be consumed by the ZFS Event Daemon (ZED).  For
  * additional details refer to the zed(8) man page.
  *
  * Build the nvlist for such an event.  The class is "<type>.<ZFS error
  * class>.<name>"; pool members are always added, vdev members when 'vd' is
  * given, and every pair of 'aux' is copied in when it is given.
  *
  * Returns the new nvlist, or NULL during a SPA_LOAD_TRYIMPORT, when the
  * nvlist cannot be created, or outside the kernel.
  */
 nvlist_t *
 zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name,
     nvlist_t *aux)
 {
 	nvlist_t *resource = NULL;
 #ifdef _KERNEL
 	char fullclass[64];
 
 	if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT)
 		return (NULL);
 
 	resource = fm_nvlist_create(NULL);
 	if (resource == NULL)
 		return (NULL);
 
 	(void) snprintf(fullclass, sizeof (fullclass), "%s.%s.%s", type,
 	    ZFS_ERROR_CLASS, name);
 
 	/* Members present in every event. */
 	VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION));
 	VERIFY0(nvlist_add_string(resource, FM_CLASS, fullclass));
 	VERIFY0(nvlist_add_string(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa)));
 	VERIFY0(nvlist_add_uint64(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa)));
 	VERIFY0(nvlist_add_uint64(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa)));
 	VERIFY0(nvlist_add_int32(resource,
 	    FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa)));
 
 	/* Members describing the vdev; the string ones only when set. */
 	if (vd != NULL) {
 		VERIFY0(nvlist_add_uint64(resource,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid));
 		VERIFY0(nvlist_add_uint64(resource,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state));
 		if (vd->vdev_path != NULL) {
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path));
 		}
 		if (vd->vdev_devid != NULL) {
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid));
 		}
 		if (vd->vdev_fru != NULL) {
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru));
 		}
 		if (vd->vdev_enc_sysfs_path != NULL) {
 			VERIFY0(nvlist_add_string(resource,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 			    vd->vdev_enc_sysfs_path));
 		}
 	}
 
 	/* also copy any optional payload data */
 	if (aux != NULL) {
 		nvpair_t *pair;
 
 		for (pair = nvlist_next_nvpair(aux, NULL); pair != NULL;
 		    pair = nvlist_next_nvpair(aux, pair)) {
 			(void) nvlist_add_nvpair(resource, pair);
 		}
 	}
 #else
 	(void) spa;
 	(void) vd;
 	(void) type;
 	(void) name;
 	(void) aux;
 #endif
 	return (resource);
 }
 
 /*
  * Create the event described by the arguments with zfs_event_create() and,
  * if one was produced, post it.  Outside the kernel this does nothing.
  */
 static void
 zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name,
     nvlist_t *aux)
 {
 #ifdef _KERNEL
 	nvlist_t *event = zfs_event_create(spa, vd, type, name, aux);
 
 	if (event != NULL) {
 		zfs_zevent_post(event, NULL, zfs_zevent_post_cb);
 	}
 #else
 	(void) spa;
 	(void) vd;
 	(void) type;
 	(void) name;
 	(void) aux;
 #endif
 }
 
 /*
  * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev
  * has been removed from the system.  This will cause the DE to ignore any
  * recent I/O errors, inferring that they are due to the asynchronous device
  * removal.
  *
  * When 'by_kernel' is set, a boolean "by_kernel" key is added to the
  * event payload.
  */
 void
 zfs_post_remove(spa_t *spa, vdev_t *vd, boolean_t by_kernel)
 {
 	nvlist_t *extra = NULL;
 
 	/* Add optional supplemental keys to payload */
 	if (by_kernel) {
 		extra = fm_nvlist_create(NULL);
 		if (extra != NULL) {
 			fnvlist_add_boolean(extra, "by_kernel");
 		}
 	}
 
 	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, extra);
 
 	/* 'extra' is only ever non-NULL when 'by_kernel' was set. */
 	if (by_kernel && extra != NULL) {
 		fm_nvlist_destroy(extra, FM_NVA_FREE);
 	}
 }
 
 /*
  * Post the 'resource.fs.zfs.autoreplace' event for 'vd'.  It is an internal
  * signal that the pool has the 'autoreplace' property set, and therefore
  * any broken vdevs will be handled by higher level logic, and no vdev
  * fault should be generated.
  */
 void
 zfs_post_autoreplace(spa_t *spa, vdev_t *vd)
 {
 	/* This event carries no supplemental payload. */
 	zfs_post_common(spa, vd,
 	    FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE,
 	    NULL);
 }
 
 /*
  * The 'resource.fs.zfs.statechange' event is an internal signal that the
  * given vdev has transitioned its state to DEGRADED or HEALTHY.  This will
  * cause the retire agent to repair any outstanding fault management cases
  * open because the device was not found (fault.fs.zfs.device).
  */
 void
 zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate)
 {
 #ifdef _KERNEL
 	nvlist_t *payload = fm_nvlist_create(NULL);
 
 	/*
 	 * Supplemental keys are attached only when both the payload list
 	 * and the vdev exist: the optional device paths first, then the
 	 * previous state.
 	 */
 	if (payload != NULL && vd != NULL) {
 		if (vd->vdev_physpath != NULL) {
 			fnvlist_add_string(payload,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH,
 			    vd->vdev_physpath);
 		}
 		if (vd->vdev_enc_sysfs_path != NULL) {
 			fnvlist_add_string(payload,
 			    FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH,
 			    vd->vdev_enc_sysfs_path);
 		}
 
 		fnvlist_add_uint64(payload,
 		    FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate);
 	}
 
 	/* The event is posted even when no payload could be built. */
 	zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE,
 	    payload);
 
 	if (payload != NULL)
 		fm_nvlist_destroy(payload, FM_NVA_FREE);
 #else
 	(void) spa, (void) vd, (void) laststate;
 #endif
 }
 
 #ifdef _KERNEL
 void
 zfs_ereport_init(void)
 {
 	/*
 	 * Set up the lock, list and AVL tree used to track recent events.
 	 * Both containers hold recent_events_node_t entries, linked
 	 * through re_list_link and re_tree_link respectively.
 	 */
 	mutex_init(&recent_events_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	list_create(&recent_events_list,
 	    sizeof (recent_events_node_t),
 	    offsetof(recent_events_node_t, re_list_link));
 
 	avl_create(&recent_events_tree, recent_events_compare,
 	    sizeof (recent_events_node_t),
 	    offsetof(recent_events_node_t, re_tree_link));
 }
 
 /*
  * This 'early' fini needs to run before zfs_fini() which on Linux waits
  * for the system_delay_taskq to drain.
  */
 void
 zfs_ereport_taskq_fini(void)
 {
 	/*
 	 * The cleaner task id is inspected and reset while holding
 	 * recent_events_lock.
 	 */
 	mutex_enter(&recent_events_lock);
 	/* An id of 0 is treated as "no cleaner task to cancel". */
 	if (recent_events_cleaner_tqid != 0) {
-		taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid);
+		taskq_cancel_id(system_delay_taskq, recent_events_cleaner_tqid,
+		    B_TRUE);
 		/* Clear the id so the cancel is not attempted again. */
 		recent_events_cleaner_tqid = 0;
 	}
 	mutex_exit(&recent_events_lock);
 }
 
 void
 zfs_ereport_fini(void)
 {
 	/*
 	 * Drain the list from the head; each node is also unlinked from
 	 * the AVL tree before its memory is released.
 	 */
 	for (;;) {
 		recent_events_node_t *node =
 		    list_remove_head(&recent_events_list);
 
 		if (node == NULL)
 			break;
 
 		avl_remove(&recent_events_tree, node);
 		kmem_free(node, sizeof (*node));
 	}
 
 	/* With both containers empty, tear down what init created. */
 	avl_destroy(&recent_events_tree);
 	list_destroy(&recent_events_list);
 	mutex_destroy(&recent_events_lock);
 }
 
 /*
  * Post a resource event of the given subclass for 'spa', carrying the
  * snapshot name as its supplemental payload.
  *
  * subclass: event subclass passed through to zfs_post_common()
  * spa:      pool the event is posted for
  * name:     snapshot name stored under FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME
  */
 void
 zfs_ereport_snapshot_post(const char *subclass, spa_t *spa, const char *name)
 {
 	nvlist_t *aux = fm_nvlist_create(NULL);
 
 	/*
 	 * zfs_post_remove() and zfs_post_state_change() both allow for
 	 * fm_nvlist_create() returning NULL.  Do the same here: without a
 	 * list there is nowhere to record the snapshot name, so skip the
 	 * event instead of handing NULL to fnvlist_add_string() and
 	 * fm_nvlist_destroy().
 	 */
 	if (aux == NULL)
 		return;
 
 	fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_SNAPSHOT_NAME, name);
 
 	zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
 	fm_nvlist_destroy(aux, FM_NVA_FREE);
 }
 
 /*
  * Post an event when a zvol is created or removed
  *
  * This is currently only used by macOS, since it uses the event to create
  * symlinks between the volume name (mypool/myvol) and the actual /dev
  * device (/dev/disk3).  For example:
  *
  * /var/run/zfs/dsk/mypool/myvol -> /dev/disk3
  *
  * name: The full name of the zvol ("mypool/myvol")
  * dev_name: The full /dev name for the zvol ("/dev/disk3")
  * raw_name: The raw  /dev name for the zvol ("/dev/rdisk3")
  */
 void
 zfs_ereport_zvol_post(const char *subclass, const char *name,
     const char *dev_name, const char *raw_name)
 {
 	nvlist_t *aux;
 	const char *r;
 
 	/*
 	 * Look the pool up by 'name'.  The namespace is only entered and
 	 * exited here when spa_namespace_held() reports that it is not
 	 * already held.
 	 */
 	boolean_t locked = spa_namespace_held();
 	if (!locked)
 		spa_namespace_enter(FTAG);
 	spa_t *spa = spa_lookup(name);
 	if (!locked)
 		spa_namespace_exit(FTAG);
 
 	/* No matching pool: nothing to post. */
 	if (spa == NULL)
 		return;
 
 	/*
 	 * zfs_post_remove() and zfs_post_state_change() both allow for
 	 * fm_nvlist_create() returning NULL.  Without a list the device
 	 * names cannot be recorded, so skip the event instead of handing
 	 * NULL to fnvlist_add_string() and fm_nvlist_destroy().
 	 */
 	aux = fm_nvlist_create(NULL);
 	if (aux == NULL)
 		return;
 
 	fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_DEVICE_NAME, dev_name);
 	fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_RAW_DEVICE_NAME,
 	    raw_name);
 
 	/*
 	 * The volume key is whatever follows the first '/' in 'name'; it
 	 * is omitted when there is no '/' or nothing after it.
 	 */
 	r = strchr(name, '/');
 	if (r != NULL && r[1] != '\0')
 		fnvlist_add_string(aux, FM_EREPORT_PAYLOAD_ZFS_VOLUME, &r[1]);
 
 	zfs_post_common(spa, NULL, FM_RSRC_CLASS, subclass, aux);
 	fm_nvlist_destroy(aux, FM_NVA_FREE);
 }
 
 /* Event-posting entry points made available through EXPORT_SYMBOL(). */
 EXPORT_SYMBOL(zfs_ereport_post);
 EXPORT_SYMBOL(zfs_ereport_is_valid);
 EXPORT_SYMBOL(zfs_ereport_post_checksum);
 EXPORT_SYMBOL(zfs_post_remove);
 EXPORT_SYMBOL(zfs_post_autoreplace);
 EXPORT_SYMBOL(zfs_post_state_change);
 
 /*
  * Tunables registered as UINT / ZMOD_RW under the zfs_zevent_ prefix; the
  * string on each entry is its description.
  */
 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_max, UINT, ZMOD_RW,
 	"Maximum recent zevents records to retain for duplicate checking");
 ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, retain_expire_secs, UINT, ZMOD_RW,
 	"Expiration time for recent zevents records");
 #endif /* _KERNEL */