diff --git a/include/os/freebsd/spl/sys/kstat.h b/include/os/freebsd/spl/sys/kstat.h
index f5157c7f4fe3..947dfee62393 100644
--- a/include/os/freebsd/spl/sys/kstat.h
+++ b/include/os/freebsd/spl/sys/kstat.h
@@ -1,234 +1,230 @@
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _SPL_KSTAT_H
 #define	_SPL_KSTAT_H
 
 #include <sys/types.h>
 #ifndef _STANDALONE
 #include <sys/sysctl.h>
 #endif
 struct list_head {};	/* empty stub type (GNU C ext.) for list members */
 #include <sys/mutex.h>
 #include <sys/proc.h>
 
 #define	KSTAT_STRLEN		255
 #define	KSTAT_RAW_MAX		(128*1024)
 
 /*
  * For reference, valid kstat classes are:
  * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc
  */
 
 #define	KSTAT_TYPE_RAW		0 /* can be anything; ks_ndata >= 1 */
 #define	KSTAT_TYPE_NAMED	1 /* name/value pair; ks_ndata >= 1 */
 #define	KSTAT_TYPE_INTR		2 /* interrupt stats; ks_ndata == 1 */
 #define	KSTAT_TYPE_IO		3 /* I/O stats; ks_ndata == 1 */
 #define	KSTAT_TYPE_TIMER	4 /* event timer; ks_ndata >= 1 */
 #define	KSTAT_NUM_TYPES		5
 
 #define	KSTAT_DATA_CHAR		0
 #define	KSTAT_DATA_INT32	1
 #define	KSTAT_DATA_UINT32	2
 #define	KSTAT_DATA_INT64	3
 #define	KSTAT_DATA_UINT64	4
 #define	KSTAT_DATA_LONG		5
 #define	KSTAT_DATA_ULONG	6
 #define	KSTAT_DATA_STRING	7
 #define	KSTAT_NUM_DATAS		8
 
 #define	KSTAT_INTR_HARD		0
 #define	KSTAT_INTR_SOFT		1
 #define	KSTAT_INTR_WATCHDOG	2
 #define	KSTAT_INTR_SPURIOUS	3
 #define	KSTAT_INTR_MULTSVC	4
 #define	KSTAT_NUM_INTRS		5
 
 #define	KSTAT_FLAG_VIRTUAL	0x01
 #define	KSTAT_FLAG_VAR_SIZE	0x02
 #define	KSTAT_FLAG_WRITABLE	0x04
 #define	KSTAT_FLAG_PERSISTENT	0x08
 #define	KSTAT_FLAG_DORMANT	0x10
 #define	KSTAT_FLAG_INVALID	0x20
 #define	KSTAT_FLAG_LONGSTRINGS	0x40
 #define	KSTAT_FLAG_NO_HEADERS	0x80
 
 #define	KS_MAGIC		0x9d9d9d9d
 
 /* Dynamic updates */
 #define	KSTAT_READ		0
 #define	KSTAT_WRITE		1
 
 struct kstat_s;
 typedef struct kstat_s kstat_t;
 
 typedef int kid_t;				/* unique kstat id */
 typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */
 
 struct seq_file {
 	char *sf_buf;		/* presumably the output buffer - confirm */
 	size_t sf_size;		/* presumably size of sf_buf - confirm */
 };
 
 void seq_printf(struct seq_file *m, const char *fmt, ...);
 
 
 typedef struct kstat_module {
 	char ksm_name[KSTAT_STRLEN+1];		/* module name */
 	struct list_head ksm_module_list;	/* module linkage */
 	struct list_head ksm_kstat_list;	/* list of kstat entries */
 	struct proc_dir_entry *ksm_proc;	/* proc entry */
 } kstat_module_t;
 
 typedef struct kstat_raw_ops {
 	int (*headers)(char *buf, size_t size);
 	int (*seq_headers)(struct seq_file *);
 	int (*data)(char *buf, size_t size, void *data);
 	void *(*addr)(kstat_t *ksp, loff_t index);
 } kstat_raw_ops_t;
 
 struct kstat_s {
 	int		ks_magic;		/* magic value */
 	kid_t		ks_kid;			/* unique kstat ID */
 	hrtime_t	ks_crtime;		/* creation time */
 	hrtime_t	ks_snaptime;		/* last access time */
 	char		ks_module[KSTAT_STRLEN+1]; /* provider module name */
 	int		ks_instance;		/* provider module instance */
 	char		ks_name[KSTAT_STRLEN+1]; /* kstat name */
 	char		ks_class[KSTAT_STRLEN+1]; /* kstat class */
 	uchar_t		ks_type;		/* kstat data type */
 	uchar_t		ks_flags;		/* kstat flags */
 	void		*ks_data;		/* kstat type-specific data */
 	uint_t		ks_ndata;		/* # of data records */
 	size_t		ks_data_size;		/* size of kstat data section */
 	kstat_update_t	*ks_update;		/* dynamic update callback */
 	void		*ks_private;		/* private data */
 	void		*ks_private1;		/* private data */
 	kmutex_t	ks_private_lock;	/* kstat private data lock */
 	kmutex_t	*ks_lock;		/* kstat data lock */
 	struct list_head ks_list;		/* kstat linkage */
 	kstat_module_t	*ks_owner;		/* kstat module linkage */
 	kstat_raw_ops_t	ks_raw_ops;		/* ops table for raw type */
 	char		*ks_raw_buf;		/* buf used for raw ops */
 	size_t		ks_raw_bufsize;		/* size of raw ops buffer */
 #ifndef _STANDALONE
 	struct sysctl_ctx_list ks_sysctl_ctx;	/* sysctl context list */
 	struct sysctl_oid *ks_sysctl_root;	/* presumably the root oid */
 #endif /* _STANDALONE */
 };
 
 typedef struct kstat_named_s {
 	char	name[KSTAT_STRLEN];	/* name of counter */
 	uchar_t	data_type;		/* type; presumably KSTAT_DATA_* */
 	union {
 		char c[16];	/* 16 raw bytes; fits a 128-bit int */
 		int32_t	i32;	/* 32-bit signed int */
 		uint32_t ui32;	/* 32-bit unsigned int */
 		int64_t i64;	/* 64-bit signed int */
 		uint64_t ui64;	/* 64-bit unsigned int */
 		long l;		/* native signed long */
 		ulong_t ul;	/* native unsigned long */
 		struct {
 			union {
 				char *ptr;	/* NUL-terminated string */
 				char __pad[8];	/* pads addr to 64 bits */
 			} addr;
 			uint32_t len;		/* # bytes for strlen + '\0' */
 		} string;
 	} value;
 } kstat_named_t;
 
 #define	KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr)
 #define	KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len)
 
 typedef struct kstat_intr {
 	uint_t intrs[KSTAT_NUM_INTRS];
 } kstat_intr_t;
 
 typedef struct kstat_io {
 	u_longlong_t	nread;		/* number of bytes read */
 	u_longlong_t	nwritten;	/* number of bytes written */
 	uint_t		reads;		/* number of read operations */
 	uint_t		writes;		/* number of write operations */
 	hrtime_t	wtime;		/* cumulative wait (pre-service) time */
 	hrtime_t	wlentime;	/* cumulative wait len*time product */
 	hrtime_t	wlastupdate;	/* last time wait queue changed */
 	hrtime_t	rtime;		/* cumulative run (service) time */
 	hrtime_t	rlentime;	/* cumulative run length*time product */
 	hrtime_t	rlastupdate;	/* last time run queue changed */
 	uint_t		wcnt;		/* count of elements in wait state */
 	uint_t		rcnt;		/* count of elements in run state */
 } kstat_io_t;
 
 typedef struct kstat_timer {
 	char		name[KSTAT_STRLEN+1]; /* event name */
 	u_longlong_t	num_events;	 /* number of events */
 	hrtime_t	elapsed_time;	 /* cumulative elapsed time */
 	hrtime_t	min_time;	 /* shortest event duration */
 	hrtime_t	max_time;	 /* longest event duration */
 	hrtime_t	start_time;	 /* previous event start time */
 	hrtime_t	stop_time;	 /* previous event stop time */
 } kstat_timer_t;
 
 int spl_kstat_init(void);
 void spl_kstat_fini(void);
 
 extern void __kstat_set_raw_ops(kstat_t *ksp,
     int (*headers)(char *buf, size_t size),
     int (*data)(char *buf, size_t size, void *data),
     void* (*addr)(kstat_t *ksp, loff_t index));
 
 extern void __kstat_set_seq_raw_ops(kstat_t *ksp,
     int (*headers)(struct seq_file *),
     int (*data)(char *buf, size_t size, void *data),
     void* (*addr)(kstat_t *ksp, loff_t index));
 
 
 extern kstat_t *__kstat_create(const char *ks_module, int ks_instance,
     const char *ks_name, const char *ks_class, uchar_t ks_type,
     uint_t ks_ndata, uchar_t ks_flags);
 
 extern void __kstat_install(kstat_t *ksp);
 extern void __kstat_delete(kstat_t *ksp);
-extern void kstat_waitq_enter(kstat_io_t *);
-extern void kstat_waitq_exit(kstat_io_t *);
-extern void kstat_runq_enter(kstat_io_t *);
-extern void kstat_runq_exit(kstat_io_t *);
 
 #define	kstat_set_seq_raw_ops(k, h, d, a) \
     __kstat_set_seq_raw_ops(k, h, d, a)
 #define	kstat_set_raw_ops(k, h, d, a) \
     __kstat_set_raw_ops(k, h, d, a)
 #ifndef _STANDALONE
 #define	kstat_create(m, i, n, c, t, s, f) \
     __kstat_create(m, i, n, c, t, s, f)
 
 #define	kstat_install(k)		__kstat_install(k)
 #define	kstat_delete(k)			__kstat_delete(k)
 #else /* _STANDALONE */
 #define	kstat_create(m, i, n, c, t, s, f)	((kstat_t *)0) /* stub */
 #define	kstat_install(k)	/* expands to nothing */
 #define	kstat_delete(k)		/* expands to nothing */
 #endif /* _STANDALONE */
 
 #endif  /* _SPL_KSTAT_H */
diff --git a/include/os/linux/spl/sys/kstat.h b/include/os/linux/spl/sys/kstat.h
index 905d8257c8d3..928f70757545 100644
--- a/include/os/linux/spl/sys/kstat.h
+++ b/include/os/linux/spl/sys/kstat.h
@@ -1,222 +1,218 @@
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  */
 
 #ifndef _SPL_KSTAT_H
 #define	_SPL_KSTAT_H
 
 #include <linux/module.h>
 #include <sys/types.h>
 #include <sys/time.h>
 #include <sys/kmem.h>
 #include <sys/mutex.h>
 #include <sys/proc.h>
 
 #define	KSTAT_STRLEN		255
 #define	KSTAT_RAW_MAX		(128*1024)
 
 /*
  * For reference valid classes are:
  * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc
  */
 
 #define	KSTAT_TYPE_RAW		0 /* can be anything; ks_ndata >= 1 */
 #define	KSTAT_TYPE_NAMED	1 /* name/value pair; ks_ndata >= 1 */
 #define	KSTAT_TYPE_INTR		2 /* interrupt stats; ks_ndata == 1 */
 #define	KSTAT_TYPE_IO		3 /* I/O stats; ks_ndata == 1 */
 #define	KSTAT_TYPE_TIMER	4 /* event timer; ks_ndata >= 1 */
 #define	KSTAT_NUM_TYPES		5
 
 #define	KSTAT_DATA_CHAR		0
 #define	KSTAT_DATA_INT32	1
 #define	KSTAT_DATA_UINT32	2
 #define	KSTAT_DATA_INT64	3
 #define	KSTAT_DATA_UINT64	4
 #define	KSTAT_DATA_LONG		5
 #define	KSTAT_DATA_ULONG	6
 #define	KSTAT_DATA_STRING	7
 #define	KSTAT_NUM_DATAS		8
 
 #define	KSTAT_INTR_HARD		0
 #define	KSTAT_INTR_SOFT		1
 #define	KSTAT_INTR_WATCHDOG	2
 #define	KSTAT_INTR_SPURIOUS	3
 #define	KSTAT_INTR_MULTSVC	4
 #define	KSTAT_NUM_INTRS		5
 
 #define	KSTAT_FLAG_VIRTUAL	0x01
 #define	KSTAT_FLAG_VAR_SIZE	0x02
 #define	KSTAT_FLAG_WRITABLE	0x04
 #define	KSTAT_FLAG_PERSISTENT	0x08
 #define	KSTAT_FLAG_DORMANT	0x10
 #define	KSTAT_FLAG_INVALID	0x20
 #define	KSTAT_FLAG_LONGSTRINGS	0x40
 #define	KSTAT_FLAG_NO_HEADERS	0x80
 
 #define	KS_MAGIC		0x9d9d9d9d
 
 /* Dynamic updates */
 #define	KSTAT_READ		0
 #define	KSTAT_WRITE		1
 
 struct kstat_s;
 typedef struct kstat_s kstat_t;
 
 typedef int kid_t;				/* unique kstat id */
 typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */
 
 typedef struct kstat_module {
 	char ksm_name[KSTAT_STRLEN+1];		/* module name */
 	struct list_head ksm_module_list;	/* module linkage */
 	struct list_head ksm_kstat_list;	/* list of kstat entries */
 	struct proc_dir_entry *ksm_proc;	/* proc entry */
 } kstat_module_t;
 
 typedef struct kstat_raw_ops {
 	int (*headers)(char *buf, size_t size);
 	int (*data)(char *buf, size_t size, void *data);
 	void *(*addr)(kstat_t *ksp, loff_t index);
 } kstat_raw_ops_t;
 
 typedef struct kstat_proc_entry {
 	char	kpe_name[KSTAT_STRLEN+1];	/* kstat name */
 	char	kpe_module[KSTAT_STRLEN+1];	/* provider module name */
 	kstat_module_t		*kpe_owner;	/* kstat module linkage */
 	struct list_head	kpe_list;	/* kstat linkage */
 	struct proc_dir_entry	*kpe_proc;	/* procfs entry */
 } kstat_proc_entry_t;
 
 struct kstat_s {
 	int		ks_magic;		/* magic value */
 	kid_t		ks_kid;			/* unique kstat ID */
 	hrtime_t	ks_crtime;		/* creation time */
 	hrtime_t	ks_snaptime;		/* last access time */
 	int		ks_instance;		/* provider module instance */
 	char		ks_class[KSTAT_STRLEN+1]; /* kstat class */
 	uchar_t		ks_type;		/* kstat data type */
 	uchar_t		ks_flags;		/* kstat flags */
 	void		*ks_data;		/* kstat type-specific data */
 	uint_t		ks_ndata;		/* # of data records */
 	size_t		ks_data_size;		/* size of kstat data section */
 	kstat_update_t	*ks_update;		/* dynamic updates */
 	void		*ks_private;		/* private data */
 	kmutex_t	ks_private_lock;	/* kstat private data lock */
 	kmutex_t	*ks_lock;		/* kstat data lock */
 	kstat_raw_ops_t	ks_raw_ops;		/* ops table for raw type */
 	char		*ks_raw_buf;		/* buf used for raw ops */
 	size_t		ks_raw_bufsize;		/* size of raw ops buffer */
 	kstat_proc_entry_t	ks_proc;	/* data for procfs entry */
 };
 
 typedef struct kstat_named_s {
 	char	name[KSTAT_STRLEN];	/* name of counter */
 	uchar_t	data_type;		/* type; presumably KSTAT_DATA_* */
 	union {
 		char c[16];	/* 16 raw bytes; fits a 128-bit int */
 		int32_t	i32;	/* 32-bit signed int */
 		uint32_t ui32;	/* 32-bit unsigned int */
 		int64_t i64;	/* 64-bit signed int */
 		uint64_t ui64;	/* 64-bit unsigned int */
 		long l;		/* native signed long */
 		ulong_t ul;	/* native unsigned long */
 		struct {
 			union {
 				char *ptr;	/* NUL-terminated string */
 				char __pad[8];	/* pads addr to 64 bits */
 			} addr;
 			uint32_t len;		/* # bytes for strlen + '\0' */
 		} string;
 	} value;
 } kstat_named_t;
 
 #define	KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr)
 #define	KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len)
 
 #ifdef HAVE_PROC_OPS_STRUCT
 typedef struct proc_ops kstat_proc_op_t;
 #else /* !HAVE_PROC_OPS_STRUCT */
 typedef struct file_operations kstat_proc_op_t;
 #endif /* HAVE_PROC_OPS_STRUCT */
 
 typedef struct kstat_intr {
 	uint_t intrs[KSTAT_NUM_INTRS];
 } kstat_intr_t;
 
 typedef struct kstat_io {
 	u_longlong_t	nread;		/* number of bytes read */
 	u_longlong_t	nwritten;	/* number of bytes written */
 	uint_t		reads;		/* number of read operations */
 	uint_t		writes;		/* number of write operations */
 	hrtime_t	wtime;		/* cumulative wait (pre-service) time */
 	hrtime_t	wlentime;	/* cumulative wait len*time product */
 	hrtime_t	wlastupdate;	/* last time wait queue changed */
 	hrtime_t	rtime;		/* cumulative run (service) time */
 	hrtime_t	rlentime;	/* cumulative run length*time product */
 	hrtime_t	rlastupdate;	/* last time run queue changed */
 	uint_t		wcnt;		/* count of elements in wait state */
 	uint_t		rcnt;		/* count of elements in run state */
 } kstat_io_t;
 
 typedef struct kstat_timer {
 	char		name[KSTAT_STRLEN+1]; /* event name */
 	u_longlong_t	num_events;	 /* number of events */
 	hrtime_t	elapsed_time;	 /* cumulative elapsed time */
 	hrtime_t	min_time;	 /* shortest event duration */
 	hrtime_t	max_time;	 /* longest event duration */
 	hrtime_t	start_time;	 /* previous event start time */
 	hrtime_t	stop_time;	 /* previous event stop time */
 } kstat_timer_t;
 
 int spl_kstat_init(void);
 void spl_kstat_fini(void);
 
 extern void __kstat_set_raw_ops(kstat_t *ksp,
     int (*headers)(char *buf, size_t size),
     int (*data)(char *buf, size_t size, void *data),
     void* (*addr)(kstat_t *ksp, loff_t index));
 
 extern kstat_t *__kstat_create(const char *ks_module, int ks_instance,
     const char *ks_name, const char *ks_class, uchar_t ks_type,
     uint_t ks_ndata, uchar_t ks_flags);
 
 extern void kstat_proc_entry_init(kstat_proc_entry_t *kpep,
     const char *module, const char *name);
 extern void kstat_proc_entry_delete(kstat_proc_entry_t *kpep);
 extern void kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode,
     const kstat_proc_op_t *file_ops, void *data);
 
 extern void __kstat_install(kstat_t *ksp);
 extern void __kstat_delete(kstat_t *ksp);
-extern void kstat_waitq_enter(kstat_io_t *);
-extern void kstat_waitq_exit(kstat_io_t *);
-extern void kstat_runq_enter(kstat_io_t *);
-extern void kstat_runq_exit(kstat_io_t *);
 
 #define	kstat_set_raw_ops(k, h, d, a) \
     __kstat_set_raw_ops(k, h, d, a)
 #define	kstat_create(m, i, n, c, t, s, f) \
     __kstat_create(m, i, n, c, t, s, f)
 
 #define	kstat_install(k)		__kstat_install(k)
 #define	kstat_delete(k)			__kstat_delete(k)
 
 #endif  /* _SPL_KSTAT_H */
diff --git a/include/sys/spa.h b/include/sys/spa.h
index 374d36e7327e..d37c6c923d8c 100644
--- a/include/sys/spa.h
+++ b/include/sys/spa.h
@@ -1,1211 +1,1210 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2021 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Joyent, Inc.
  * Copyright (c) 2017, 2019, Datto Inc. All rights reserved.
  * Copyright (c) 2017, Intel Corporation.
  * Copyright (c) 2019, Allan Jude
  * Copyright (c) 2019, Klara Inc.
  */
 
 #ifndef _SYS_SPA_H
 #define	_SYS_SPA_H
 
 #include <sys/avl.h>
 #include <sys/zfs_context.h>
 #include <sys/kstat.h>
 #include <sys/nvpair.h>
 #include <sys/sysmacros.h>
 #include <sys/types.h>
 #include <sys/fs/zfs.h>
 #include <sys/spa_checksum.h>
 #include <sys/dmu.h>
 #include <sys/space_map.h>
 #include <sys/bitops.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 /*
  * Forward references that lots of things need.
  */
 typedef struct spa spa_t;
 typedef struct vdev vdev_t;
 typedef struct metaslab metaslab_t;
 typedef struct metaslab_group metaslab_group_t;
 typedef struct metaslab_class metaslab_class_t;
 typedef struct zio zio_t;
 typedef struct zilog zilog_t;
 typedef struct spa_aux_vdev spa_aux_vdev_t;
 typedef struct ddt ddt_t;
 typedef struct ddt_entry ddt_entry_t;
 typedef struct zbookmark_phys zbookmark_phys_t;
 
 struct bpobj;
 struct bplist;
 struct dsl_pool;
 struct dsl_dataset;
 struct dsl_crypto_params;
 
 /*
  * Alignment Shift (ashift) is an immutable, internal top-level vdev property
  * which can only be set at vdev creation time. Physical writes are always done
  * according to it, which makes 2^ashift the smallest possible IO on a vdev.
  *
  * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB
  * (2^16 = 65,536).
  */
 #define	ASHIFT_MIN		9
 #define	ASHIFT_MAX		16
 
 /*
  * Size of block to hold the configuration data (a packed nvlist)
  */
 #define	SPA_CONFIG_BLOCKSIZE	(1ULL << 14)
 
 /*
  * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB.
  * The ASIZE encoding should be at least 64 times larger (6 more bits)
  * to support up to 4-way RAID-Z mirror mode with worst-case gang block
  * overhead, three DVAs per bp, plus one more bit in case we do anything
  * else that expands the ASIZE.
  */
 #define	SPA_LSIZEBITS		16	/* LSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_PSIZEBITS		16	/* PSIZE up to 32M (2^16 * 512)	*/
 #define	SPA_ASIZEBITS		24	/* ASIZE up to 64 times larger	*/
 
 #define	SPA_COMPRESSBITS	7	/* 'comp' width; see bp layout */
 #define	SPA_VDEVBITS		24	/* vdev field width in dva_word[0] */
 #define	SPA_COMPRESSMASK	((1U << SPA_COMPRESSBITS) - 1)
 
 /*
  * All SPA data is represented by 128-bit data virtual addresses (DVAs).
  * The members of the dva_t should be considered opaque outside the SPA.
  */
 typedef struct dva {
 	uint64_t	dva_word[2];	/* 2 x 64 bits = 128-bit DVA */
 } dva_t;
 
 
 /*
  * Some checksums/hashes need a 256-bit initialization salt. This salt is kept
  * secret and is suitable for use in MAC algorithms as the key.
  */
 typedef struct zio_cksum_salt {
 	uint8_t		zcs_bytes[32];	/* 32 bytes = 256-bit salt */
 } zio_cksum_salt_t;
 
 /*
  * Each block is described by its DVAs, time of birth, checksum, etc.
  * The word-by-word, bit-by-bit layout of the blkptr is as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|  pad  |	  vdev1         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 2	|  pad  |	  vdev2         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 4	|  pad  |	  vdev3         | GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|G|			 offset3				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|			fill count				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * c	|			checksum[0]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * d	|			checksum[1]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * e	|			checksum[2]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * f	|			checksum[3]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * vdev		virtual device ID
  * offset	offset into virtual device
  * LSIZE	logical size
  * PSIZE	physical size (after compression)
  * ASIZE	allocated size (including RAID-Z parity and gang block headers)
  * GRID		RAID-Z layout information (reserved for future use)
  * cksum	checksum function
  * comp		compression function
  * G		gang block indicator
  * B		byteorder (endianness)
  * D		dedup
  * X		encryption
  * E		blkptr_t contains embedded data (see below)
  * lvl		level of indirection
  * type		DMU object type
  * phys birth	txg when dva[0] was written; zero if same as logical birth txg
  *              note that typically all the dva's would be written in this
  *              txg, but they could be different if they were moved by
  *              device removal.
  * log. birth	transaction group in which the block was logically born
  * fill count	number of non-zero blocks under this bp
  * checksum[4]	256-bit checksum of the data this bp describes
  */
 
 /*
  * The blkptr_t's of encrypted blocks also need to store the encryption
  * parameters so that the block can be decrypted. This layout is as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|		vdev1		| GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 1	|G|			 offset1				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 2	|		vdev2		| GRID  |	  ASIZE		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 3	|G|			 offset2				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 4	|			salt					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 5	|			IV1					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| cksum |E| comp|    PSIZE	|     LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 8	|			padding					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 9	|			physical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|		IV2		|	    fill count		|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * c	|			checksum[0]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * d	|			checksum[1]				|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * e	|			MAC[0]					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * f	|			MAC[1]					|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * salt		Salt for generating encryption keys
  * IV1		First 64 bits of encryption IV
  * X		Block requires encryption handling (set to 1)
  * E		blkptr_t contains embedded data (set to 0, see below)
  * fill count	number of non-zero blocks under this bp (truncated to 32 bits)
  * IV2		Last 32 bits of encryption IV
  * checksum[2]	128-bit checksum of the data this bp describes
  * MAC[2]	128-bit message authentication code for this data
  *
  * The X bit being set indicates that this block is one of 3 types. If this is
  * a level 0 block with an encrypted object type, the block is encrypted
  * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted
  * object type, this block is authenticated with an HMAC (see
  * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC
  * words to store a checksum-of-MACs from the level below (see
  * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED()
  * refers to both encrypted and authenticated blocks and BP_USES_CRYPT()
  * refers to any of these 3 kinds of blocks.
  *
  * The additional encryption parameters are the salt, IV, and MAC which are
  * explained in greater detail in the block comment at the top of zio_crypt.c.
  * The MAC occupies half of the checksum space since it serves a very similar
  * purpose: to prevent data corruption on disk. The only functional difference
  * is that the checksum is used to detect on-disk corruption whether or not the
  * encryption key is loaded and the MAC provides additional protection against
  * malicious disk tampering. We use the 3rd DVA to store the salt and first
  * 64 bits of the IV. As a result encrypted blocks can only have 2 copies
  * maximum instead of the normal 3. The last 32 bits of the IV are stored in
  * the upper bits of what is usually the fill count. Note that only blocks at
  * level 0 or -2 are ever encrypted, which allows us to guarantee that these
  * 32 bits are not trampled over by other code (see zio_crypt.c for details).
  * The salt and IV are not used for authenticated bps or bps with an indirect
  * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits
  * for the fill count.
  */
 
 /*
  * "Embedded" blkptr_t's don't actually point to a block, instead they
  * have a data payload embedded in the blkptr_t itself.  See the comment
  * in blkptr.c for more details.
  *
  * The blkptr_t is laid out as follows:
  *
  *	64	56	48	40	32	24	16	8	0
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 0	|      payload                                                  |
  * 1	|      payload                                                  |
  * 2	|      payload                                                  |
  * 3	|      payload                                                  |
  * 4	|      payload                                                  |
  * 5	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 6	|BDX|lvl| type	| etype |E| comp| PSIZE|              LSIZE	|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * 7	|      payload                                                  |
  * 8	|      payload                                                  |
  * 9	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * a	|			logical birth txg			|
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  * b	|      payload                                                  |
  * c	|      payload                                                  |
  * d	|      payload                                                  |
  * e	|      payload                                                  |
  * f	|      payload                                                  |
  *	+-------+-------+-------+-------+-------+-------+-------+-------+
  *
  * Legend:
  *
  * payload		contains the embedded data
  * B (byteorder)	byteorder (endianness)
  * D (dedup)		padding (set to zero)
  * X			encryption (set to zero)
  * E (embedded)		set to one
  * lvl			indirection level
  * type			DMU object type
  * etype		how to interpret embedded data (BP_EMBEDDED_TYPE_*)
  * comp			compression function of payload
  * PSIZE		size of payload after compression, in bytes
  * LSIZE		logical size of payload, in bytes
  *			note that 25 bits is enough to store the largest
  *			"normal" BP's LSIZE (2^16 * 2^9) in bytes
  * log. birth		transaction group in which the block was logically born
  *
  * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded
  * bp's they are stored in units of SPA_MINBLOCKSHIFT.
  * Generally, the generic BP_GET_*() macros can be used on embedded BP's.
  * The B, D, X, lvl, type, and comp fields are stored the same as with normal
  * BP's so the BP_SET_* macros can be used with them.  etype, PSIZE, LSIZE must
  * be set with the BPE_SET_* macros.  BP_SET_EMBEDDED() should be called before
  * other macros, as they assert that they are only used on BP's of the correct
  * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use
  * the payload space for encryption parameters (see the comment above on
  * how encryption parameters are stored).
  */
 
 #define	BPE_GET_ETYPE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET((bp)->blk_prop, 40, 8))
 #define	BPE_SET_ETYPE(bp, t)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET((bp)->blk_prop, 40, 8, t); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BPE_GET_LSIZE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1))
 #define	BPE_SET_LSIZE(bp, x)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 #define	BPE_GET_PSIZE(bp)	\
 	(ASSERT(BP_IS_EMBEDDED(bp)), \
 	BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1))
 #define	BPE_SET_PSIZE(bp, x)	do { \
 	ASSERT(BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 typedef enum bp_embedded_type {
 	BP_EMBEDDED_TYPE_DATA,
 	BP_EMBEDDED_TYPE_RESERVED, /* Reserved for Delphix byteswap feature. */
 	BP_EMBEDDED_TYPE_REDACTED,
 	NUM_BP_EMBEDDED_TYPES	/* count of types above; keep last */
 } bp_embedded_type_t;
 
 #define	BPE_NUM_WORDS 14	/* 16 bp words less blk_prop, blk_birth */
 #define	BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) /* 112 bytes */
 #define	BPE_IS_PAYLOADWORD(bp, wp) \
 	((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth)
 
 #define	SPA_BLKPTRSHIFT	7		/* blkptr_t is 128 bytes	*/
 #define	SPA_DVAS_PER_BP	3		/* Number of DVAs in a bp	*/
 #define	SPA_SYNC_MIN_VDEVS 3		/* min vdevs to update during sync */
 
 /*
  * A block is a hole when it has either 1) never been written to, or
  * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads
  * without physically allocating disk space. Holes are represented in the
  * blkptr_t structure by zeroed blk_dva. Correct checking for holes is
  * done through the BP_IS_HOLE macro. Holes that were written to at some
  * point (i.e. were punched after having been filled) additionally keep
  * their logical size, level, DMU object type, and birth times.
  */
 typedef struct blkptr {
 	dva_t		blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */
 	uint64_t	blk_prop;	/* size, compression, type, etc	    */
 	uint64_t	blk_pad[2];	/* Extra space for the future	    */
 	/* BP_SET_BIRTH() stores 0 here when physical == logical birth. */
 	uint64_t	blk_phys_birth;	/* txg when block was allocated	    */
 	uint64_t	blk_birth;	/* transaction group at birth	    */
 	/*
 	 * For encrypted BPs, BP_GET_FILL() reads BF64 field (0, 32) and
 	 * BP_GET_IV2() reads BF64 field (32, 32) of this word; otherwise
 	 * BP_GET_FILL() returns the whole word.
 	 */
 	uint64_t	blk_fill;	/* fill count			    */
 	zio_cksum_t	blk_cksum;	/* 256-bit checksum		    */
 } blkptr_t;
 
 /*
  * Macros to get and set fields in a bp or DVA.
  */
 
 /*
  * Note, for gang blocks, DVA_GET_ASIZE() is the total space allocated for
  * this gang DVA including its children BP's.  The space allocated at this
  * DVA's vdev/offset is vdev_gang_header_asize(vdev).
  */
 /*
  * dva_word[0] carries the asize, grid and vdev fields; dva_word[1] carries
  * the offset and the gang flag.  ASIZE and OFFSET go through the *_SB
  * helpers with SPA_MINBLOCKSHIFT and a bias of 0.
  */
 #define	DVA_GET_ASIZE(dva)	\
 	BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_ASIZE(dva, x)	\
 	BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \
 	SPA_MINBLOCKSHIFT, 0, x)
 
 /* GRID: BF64 field (24, 8) of dva_word[0]. */
 #define	DVA_GET_GRID(dva)	BF64_GET((dva)->dva_word[0], 24, 8)
 #define	DVA_SET_GRID(dva, x)	BF64_SET((dva)->dva_word[0], 24, 8, x)
 
 /* VDEV: BF64 field (32, SPA_VDEVBITS) of dva_word[0]. */
 #define	DVA_GET_VDEV(dva)	BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS)
 #define	DVA_SET_VDEV(dva, x)	\
 	BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x)
 
 /* OFFSET: BF64_*_SB field (0, 63) of dva_word[1]. */
 #define	DVA_GET_OFFSET(dva)	\
 	BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0)
 #define	DVA_SET_OFFSET(dva, x)	\
 	BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x)
 
 /* GANG flag: BF64 field (63, 1) of dva_word[1]. */
 #define	DVA_GET_GANG(dva)	BF64_GET((dva)->dva_word[1], 63, 1)
 #define	DVA_SET_GANG(dva, x)	BF64_SET((dva)->dva_word[1], 63, 1, x)
 
 /*
  * Logical size.  For an embedded BP this yields BPE_GET_LSIZE() only when
  * the embedded type is BP_EMBEDDED_TYPE_DATA and 0 for any other etype; for
  * a normal BP it decodes the SPA_LSIZEBITS-wide field of blk_prop.
  */
 #define	BP_GET_LSIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ?	\
 	(BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \
 	BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1))
 /* Only valid on non-embedded BPs (asserted). */
 #define	BP_SET_LSIZE(bp, x)	do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, \
 	    0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 /* Physical size: always 0 for an embedded BP, else the PSIZE field. */
 #define	BP_GET_PSIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1))
 /* Only valid on non-embedded BPs (asserted). */
 #define	BP_SET_PSIZE(bp, x)	do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET_SB((bp)->blk_prop, \
 	    16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \
 _NOTE(CONSTCOND) } while (0)
 
 /* Compression: BF64 field (32, SPA_COMPRESSBITS) of blk_prop. */
 #define	BP_GET_COMPRESS(bp)		\
 	BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS)
 #define	BP_SET_COMPRESS(bp, x)		\
 	BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x)
 
 /* Embedded flag: BF64 field (39, 1) of blk_prop. */
 #define	BP_IS_EMBEDDED(bp)		BF64_GET((bp)->blk_prop, 39, 1)
 #define	BP_SET_EMBEDDED(bp, x)		BF64_SET((bp)->blk_prop, 39, 1, x)
 
 /*
  * Checksum: reported as ZIO_CHECKSUM_OFF for embedded BPs.  The (40, 8)
  * field used here is the same one BPE_GET_ETYPE()/BPE_SET_ETYPE() use for
  * embedded BPs, and the setter asserts the BP is not embedded.
  */
 #define	BP_GET_CHECKSUM(bp)		\
 	(BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \
 	BF64_GET((bp)->blk_prop, 40, 8))
 #define	BP_SET_CHECKSUM(bp, x)		do { \
 	ASSERT(!BP_IS_EMBEDDED(bp)); \
 	BF64_SET((bp)->blk_prop, 40, 8, x); \
 _NOTE(CONSTCOND) } while (0)
 
 /* Object type: BF64 field (48, 8) of blk_prop. */
 #define	BP_GET_TYPE(bp)			BF64_GET((bp)->blk_prop, 48, 8)
 #define	BP_SET_TYPE(bp, x)		BF64_SET((bp)->blk_prop, 48, 8, x)
 
 /* Level: BF64 field (56, 5) of blk_prop. */
 #define	BP_GET_LEVEL(bp)		BF64_GET((bp)->blk_prop, 56, 5)
 #define	BP_SET_LEVEL(bp, x)		BF64_SET((bp)->blk_prop, 56, 5, x)
 
 /* encrypted, authenticated, and MAC cksum bps use the same bit */
 #define	BP_USES_CRYPT(bp)		BF64_GET((bp)->blk_prop, 61, 1)
 #define	BP_SET_CRYPT(bp, x)		BF64_SET((bp)->blk_prop, 61, 1, x)
 
 /*
  * The three predicates below split BP_USES_CRYPT() BPs by level and type:
  * level <= 0 with an encrypted object type is "encrypted", level <= 0 with
  * any other type is "authenticated", and level > 0 carries an indirect MAC
  * checksum.  At most one of the three holds for a given bp.
  */
 #define	BP_IS_ENCRYPTED(bp)			\
 	(BP_USES_CRYPT(bp) &&			\
 	BP_GET_LEVEL(bp) <= 0 &&		\
 	DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp)))
 
 #define	BP_IS_AUTHENTICATED(bp)			\
 	(BP_USES_CRYPT(bp) &&			\
 	BP_GET_LEVEL(bp) <= 0 &&		\
 	!DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp)))
 
 #define	BP_HAS_INDIRECT_MAC_CKSUM(bp)		\
 	(BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0)
 
 /* Encrypted or authenticated; indirect-MAC BPs are not included. */
 #define	BP_IS_PROTECTED(bp)			\
 	(BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp))
 
 /* Dedup flag: BF64 field (62, 1) of blk_prop. */
 #define	BP_GET_DEDUP(bp)		BF64_GET((bp)->blk_prop, 62, 1)
 #define	BP_SET_DEDUP(bp, x)		BF64_SET((bp)->blk_prop, 62, 1, x)
 
 /* Byte order flag: BF64 field (63, 1); compared to ZFS_HOST_BYTEORDER. */
 #define	BP_GET_BYTEORDER(bp)		BF64_GET((bp)->blk_prop, 63, 1)
 #define	BP_SET_BYTEORDER(bp, x)		BF64_SET((bp)->blk_prop, 63, 1, x)
 
 /* Free flag: BF64 field (0, 1) of blk_fill (not blk_prop). */
 #define	BP_GET_FREE(bp)			BF64_GET((bp)->blk_fill, 0, 1)
 #define	BP_SET_FREE(bp, x)		BF64_SET((bp)->blk_fill, 0, 1, x)
 
 /*
  * Physical birth txg: 0 for embedded BPs; otherwise blk_phys_birth when it
  * is non-zero, falling back to blk_birth (BP_SET_BIRTH stores 0 in
  * blk_phys_birth when both births are equal).
  */
 #define	BP_PHYSICAL_BIRTH(bp)		\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth)
 
 /*
  * Record both birth txgs on a non-embedded BP (asserted).  'logical' is
  * evaluated twice.  Expands to a bare { } block rather than do/while(0), so
  * "if (c) BP_SET_BIRTH(...); else ..." does not compile.
  */
 #define	BP_SET_BIRTH(bp, logical, physical)	\
 {						\
 	ASSERT(!BP_IS_EMBEDDED(bp));		\
 	(bp)->blk_birth = (logical);		\
 	(bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \
 }
 
 /*
  * Fill count.  Encrypted BPs keep it in BF64 field (0, 32) of blk_fill,
  * embedded BPs always report 1, and every other BP uses the whole word.
  */
 #define	BP_GET_FILL(bp)				\
 	((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \
 	((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill))
 
 /*
  * Counterpart of BP_GET_FILL(): only the (0, 32) field is written for
  * encrypted BPs, the whole word otherwise.  Bare { } block, not
  * do/while(0).
  */
 #define	BP_SET_FILL(bp, fill)			\
 {						\
 	if (BP_IS_ENCRYPTED(bp))			\
 		BF64_SET((bp)->blk_fill, 0, 32, fill); \
 	else					\
 		(bp)->blk_fill = fill;		\
 }
 
 /* IV2: BF64 field (32, 32) of blk_fill; encrypted BPs only (asserted). */
 #define	BP_GET_IV2(bp)				\
 	(ASSERT(BP_IS_ENCRYPTED(bp)),		\
 	BF64_GET((bp)->blk_fill, 32, 32))
 #define	BP_SET_IV2(bp, iv2)			\
 {						\
 	ASSERT(BP_IS_ENCRYPTED(bp));		\
 	BF64_SET((bp)->blk_fill, 32, 32, iv2);	\
 }
 
 /* Metadata means any level above 0, or a metadata object type. */
 #define	BP_IS_METADATA(bp)	\
 	(BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))
 
 /*
  * Sum of the allocated sizes of the DVAs; 0 for embedded BPs.  The third
  * DVA is multiplied by !BP_IS_ENCRYPTED(bp), i.e. left out for encrypted
  * BPs (SNPRINTF_BLKPTR prints blk_dva[2] as salt/iv for those).
  */
 #define	BP_GET_ASIZE(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	(DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))
 
 /* PSIZE for metadata BPs, LSIZE for everything else. */
 #define	BP_GET_UCSIZE(bp)	\
 	(BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp))
 
 /* Number of DVAs with a non-zero asize, with the same DVA[2] exclusion. */
 #define	BP_GET_NDVAS(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \
 	!!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \
 	(!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))
 
 /* Number of DVAs with the gang flag set, with the same DVA[2] exclusion. */
 #define	BP_COUNT_GANG(bp)	\
 	(BP_IS_EMBEDDED(bp) ? 0 : \
 	(DVA_GET_GANG(&(bp)->blk_dva[0]) + \
 	DVA_GET_GANG(&(bp)->blk_dva[1]) + \
 	(DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))))
 
 /* Two DVAs are equal when neither of their two words differs. */
 #define	DVA_EQUAL(dva1, dva2)	\
 	(!((dva1)->dva_word[1] != (dva2)->dva_word[1] || \
 	(dva1)->dva_word[0] != (dva2)->dva_word[0]))
 
 /*
  * Block pointers compare equal when their physical birth, logical birth
  * and all three DVAs match; any single mismatch makes them unequal.
  */
 #define	BP_EQUAL(bp1, bp2)	\
 	(!(BP_PHYSICAL_BIRTH(bp1) != BP_PHYSICAL_BIRTH(bp2) ||	\
 	(bp1)->blk_birth != (bp2)->blk_birth ||			\
 	!DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) ||	\
 	!DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) ||	\
 	!DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])))
 
 
 /* A DVA counts as valid once it has a non-zero allocated size. */
 #define	DVA_IS_VALID(dva)	(0 != DVA_GET_ASIZE(dva))
 
 /* First DVA of a non-embedded BP (asserted). */
 #define	BP_IDENTITY(bp)		(ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0])
 /* Gang status comes from the identity DVA; embedded BPs are never gang. */
 #define	BP_IS_GANG(bp)		\
 	(!BP_IS_EMBEDDED(bp) ? DVA_GET_GANG(BP_IDENTITY(bp)) : B_FALSE)
 /* Both words of the DVA are zero. */
 #define	DVA_IS_EMPTY(dva)	(!((dva)->dva_word[0] != 0ULL ||	\
 				(dva)->dva_word[1] != 0ULL))
 /* A hole is a non-embedded BP whose identity DVA is empty. */
 #define	BP_IS_HOLE(bp) \
 	(!(BP_IS_EMBEDDED(bp) || !DVA_IS_EMPTY(BP_IDENTITY(bp))))
 
 /*
  * A redacted BP is an embedded BP whose etype is
  * BP_EMBEDDED_TYPE_REDACTED.  The embedded flag is set first because
  * BPE_SET_ETYPE() asserts it.  Bare { } block, not do/while(0).
  */
 #define	BP_SET_REDACTED(bp) \
 {							\
 	BP_SET_EMBEDDED(bp, B_TRUE);			\
 	BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED);	\
 }
 /* Every redacted BP therefore also satisfies BP_IS_EMBEDDED(). */
 #define	BP_IS_REDACTED(bp) \
 	(BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED)
 
 /* BP_IS_RAIDZ(bp) assumes no block compression */
 #define	BP_IS_RAIDZ(bp)		(DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \
 				BP_GET_PSIZE(bp))
 
 /*
  * Clear every member of a blkptr_t: the three DVAs, blk_prop, both pad
  * words, both birth txgs, blk_fill, and (through ZIO_SET_CHECKSUM) the
  * checksum.  'bp' is evaluated once per assignment, so it must be free of
  * side effects.  Bare { } block, not do/while(0).
  */
 #define	BP_ZERO(bp)				\
 {						\
 	(bp)->blk_dva[0].dva_word[0] = 0;	\
 	(bp)->blk_dva[0].dva_word[1] = 0;	\
 	(bp)->blk_dva[1].dva_word[0] = 0;	\
 	(bp)->blk_dva[1].dva_word[1] = 0;	\
 	(bp)->blk_dva[2].dva_word[0] = 0;	\
 	(bp)->blk_dva[2].dva_word[1] = 0;	\
 	(bp)->blk_prop = 0;			\
 	(bp)->blk_pad[0] = 0;			\
 	(bp)->blk_pad[1] = 0;			\
 	(bp)->blk_phys_birth = 0;		\
 	(bp)->blk_birth = 0;			\
 	(bp)->blk_fill = 0;			\
 	ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0);	\
 }
 
 /* Host byte-order flag: 1 unless the build defines _ZFS_BIG_ENDIAN. */
 #ifndef _ZFS_BIG_ENDIAN
 #define	ZFS_HOST_BYTEORDER	(1ULL)
 #else
 #define	ZFS_HOST_BYTEORDER	(0ULL)
 #endif
 
 /* A BP needs swapping when its byte-order flag differs from the host's. */
 #define	BP_SHOULD_BYTESWAP(bp)	(ZFS_HOST_BYTEORDER != BP_GET_BYTEORDER(bp))
 
 #define	BP_SPRINTF_LEN	400
 
 /*
  * This macro allows code sharing between zfs, libzpool, and mdb.
  * 'func' is either snprintf() or mdb_snprintf().
  * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line.
  *
  * Formats 'bp' into 'buf', handing the remaining space (size - len) to
  * every 'func' call, and labels it with the caller-supplied 'type',
  * 'checksum' and 'compress' strings.  One of five layouts is produced:
  * "<NULL>", HOLE, REDACTED, EMBEDDED, or the full DVA/property/checksum
  * dump.  Redacted BPs are embedded BPs (see BP_IS_REDACTED), so they are
  * tested for before the generic embedded case.
  *
  * The macro expands to a bare { } block, evaluates 'bp' many times, and
  * asserts on exit that the formatted length stayed below 'size'.
  */
 
 #define	SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
 {									\
 	static const char *copyname[] =					\
 	    { "zero", "single", "double", "triple" };			\
 	int len = 0;							\
 	int copies = 0;							\
 	/* Default covers bp == NULL and BPs without any crypt flag. */ \
 	const char *crypt_type = "unencrypted";				\
 	if (bp != NULL) {						\
 		if (BP_IS_ENCRYPTED(bp)) {				\
 			crypt_type = "encrypted";			\
 			/* LINTED E_SUSPICIOUS_COMPARISON */		\
 		} else if (BP_IS_AUTHENTICATED(bp)) {			\
 			crypt_type = "authenticated";			\
 		} else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) {		\
 			crypt_type = "indirect-MAC";			\
 		}							\
 	}								\
 	if (bp == NULL) {						\
 		len += func(buf + len, size - len, "<NULL>");		\
 	} else if (BP_IS_HOLE(bp)) {					\
 		len += func(buf + len, size - len,			\
 		    "HOLE [L%llu %s] "					\
 		    "size=%llxL birth=%lluL",				\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth);			\
 	} else if (BP_IS_REDACTED(bp)) {				\
 		/* Checked before BP_IS_EMBEDDED, which it implies. */	\
 		len += func(buf + len, size - len,			\
 		    "REDACTED [L%llu %s] size=%llxL birth=%lluL",	\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth);			\
 	} else if (BP_IS_EMBEDDED(bp)) {				\
 		len += func(buf + len, size - len,			\
 		    "EMBEDDED [L%llu %s] et=%u %s "			\
 		    "size=%llxL/%llxP birth=%lluL",			\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    (int)BPE_GET_ETYPE(bp),				\
 		    compress,						\
 		    (u_longlong_t)BPE_GET_LSIZE(bp),			\
 		    (u_longlong_t)BPE_GET_PSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth);			\
 	} else {							\
 		/* One "DVA[n]=<vdev:offset:asize>" entry per DVA. */	\
 		for (int d = 0; d < BP_GET_NDVAS(bp); d++) {		\
 			const dva_t *dva = &bp->blk_dva[d];		\
 			if (DVA_IS_VALID(dva))				\
 				copies++;				\
 			len += func(buf + len, size - len,		\
 			    "DVA[%d]=<%llu:%llx:%llx>%c", d,		\
 			    (u_longlong_t)DVA_GET_VDEV(dva),		\
 			    (u_longlong_t)DVA_GET_OFFSET(dva),		\
 			    (u_longlong_t)DVA_GET_ASIZE(dva),		\
 			    ws);					\
 		}							\
 		/* Encrypted BPs print blk_dva[2] as salt and IV. */	\
 		if (BP_IS_ENCRYPTED(bp)) {				\
 			len += func(buf + len, size - len,		\
 			    "salt=%llx iv=%llx:%llx%c",			\
 			    (u_longlong_t)bp->blk_dva[2].dva_word[0],	\
 			    (u_longlong_t)bp->blk_dva[2].dva_word[1],	\
 			    (u_longlong_t)BP_GET_IV2(bp),		\
 			    ws);					\
 		}							\
 		if (BP_IS_GANG(bp) &&					\
 		    DVA_GET_ASIZE(&bp->blk_dva[2]) <=			\
 		    DVA_GET_ASIZE(&bp->blk_dva[1]) / 2)			\
 			copies--;					\
 		len += func(buf + len, size - len,			\
 		    "[L%llu %s] %s %s %s %s %s %s %s%c"			\
 		    "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c"	\
 		    "cksum=%llx:%llx:%llx:%llx",			\
 		    (u_longlong_t)BP_GET_LEVEL(bp),			\
 		    type,						\
 		    checksum,						\
 		    compress,						\
 		    crypt_type,						\
 		    BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE",		\
 		    BP_IS_GANG(bp) ? "gang" : "contiguous",		\
 		    BP_GET_DEDUP(bp) ? "dedup" : "unique",		\
 		    copyname[copies],					\
 		    ws,							\
 		    (u_longlong_t)BP_GET_LSIZE(bp),			\
 		    (u_longlong_t)BP_GET_PSIZE(bp),			\
 		    (u_longlong_t)bp->blk_birth,			\
 		    (u_longlong_t)BP_PHYSICAL_BIRTH(bp),		\
 		    (u_longlong_t)BP_GET_FILL(bp),			\
 		    ws,							\
 		    (u_longlong_t)bp->blk_cksum.zc_word[0],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[1],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[2],		\
 		    (u_longlong_t)bp->blk_cksum.zc_word[3]);		\
 	}								\
 	ASSERT(len < size);						\
 }
 
 /* Selects ARC_BUFC_DATA for non-metadata BPs and ARC_BUFC_METADATA else. */
 #define	BP_GET_BUFC_TYPE(bp)						\
 	(!BP_IS_METADATA(bp) ? ARC_BUFC_DATA : ARC_BUFC_METADATA)
 
 /* Import flavours, numbered explicitly from zero. */
 typedef enum spa_import_type {
 	SPA_IMPORT_EXISTING = 0,
 	SPA_IMPORT_ASSEMBLE = 1
 } spa_import_type_t;
 
 /*
  * Pool open mode, as taken by spa_init() and returned by spa_mode().
  * READ and WRITE occupy distinct bits; UNINIT is zero.
  */
 typedef enum spa_mode {
 	SPA_MODE_UNINIT	= 0x0,
 	SPA_MODE_READ	= 0x1,
 	SPA_MODE_WRITE	= 0x2
 } spa_mode_t;
 
 /*
  * Send TRIM commands in-line during normal pool operation while deleting.
  *	OFF: no
  *	ON: yes
  * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources.
  */
 typedef enum {
 	SPA_AUTOTRIM_OFF = 0,	/* default */
 	SPA_AUTOTRIM_ON = 1,
 	/* Alias for ON inside the FreeBSD base tree, for OFF elsewhere. */
 #if defined(IN_FREEBSD_BASE)
 	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON
 #else
 	SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF
 #endif
 } spa_autotrim_t;
 
 /*
  * Reason TRIM command was issued, used internally for accounting purposes.
  */
 typedef enum trim_type {
 	TRIM_TYPE_MANUAL,	/* 0 */
 	TRIM_TYPE_AUTO,		/* 1 */
 	TRIM_TYPE_SIMPLE	/* 2 */
 } trim_type_t;
 
 /* state manipulation functions */
 extern int spa_open(const char *pool, spa_t **, void *tag);
 extern int spa_open_rewind(const char *pool, spa_t **, void *tag,
     nvlist_t *policy, nvlist_t **config);
 extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot,
     size_t buflen);
 extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
     nvlist_t *zplprops, struct dsl_crypto_params *dcp);
 extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props,
     uint64_t flags);
 extern nvlist_t *spa_tryimport(nvlist_t *tryconfig);
 extern int spa_destroy(const char *pool);
 extern int spa_checkpoint(const char *pool);
 extern int spa_checkpoint_discard(const char *pool);
 extern int spa_export(const char *pool, nvlist_t **oldconfig, boolean_t force,
     boolean_t hardforce);
 extern int spa_reset(const char *pool);
 extern void spa_async_request(spa_t *spa, int flag);
 extern void spa_async_unrequest(spa_t *spa, int flag);
 extern void spa_async_suspend(spa_t *spa);
 extern void spa_async_resume(spa_t *spa);
 extern int spa_async_tasks(spa_t *spa);
 extern spa_t *spa_inject_addref(char *pool);
 extern void spa_inject_delref(spa_t *spa);
 extern void spa_scan_stat_init(spa_t *spa);
 extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps);
 extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
 
 #define	SPA_ASYNC_CONFIG_UPDATE			0x01
 #define	SPA_ASYNC_REMOVE			0x02
 #define	SPA_ASYNC_PROBE				0x04
 #define	SPA_ASYNC_RESILVER_DONE			0x08
 #define	SPA_ASYNC_RESILVER			0x10
 #define	SPA_ASYNC_AUTOEXPAND			0x20
 #define	SPA_ASYNC_REMOVE_DONE			0x40
 #define	SPA_ASYNC_REMOVE_STOP			0x80
 #define	SPA_ASYNC_INITIALIZE_RESTART		0x100
 #define	SPA_ASYNC_TRIM_RESTART			0x200
 #define	SPA_ASYNC_AUTOTRIM_RESTART		0x400
 #define	SPA_ASYNC_L2CACHE_REBUILD		0x800
 #define	SPA_ASYNC_L2CACHE_TRIM			0x1000
 #define	SPA_ASYNC_REBUILD_DONE			0x2000
 
 /* device manipulation */
 extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot);
 extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot,
     int replacing, int rebuild);
 extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid,
     int replace_done);
 extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare);
 extern boolean_t spa_vdev_remove_active(spa_t *spa);
 extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     nvlist_t *vdev_errlist);
 extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type,
     uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist);
 extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath);
 extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru);
 extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config,
     nvlist_t *props, boolean_t exp);
 
 /* spare state (which is global across all pools) */
 extern void spa_spare_add(vdev_t *vd);
 extern void spa_spare_remove(vdev_t *vd);
 extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt);
 extern void spa_spare_activate(vdev_t *vd);
 
 /* L2ARC state (which is global across all pools) */
 extern void spa_l2cache_add(vdev_t *vd);
 extern void spa_l2cache_remove(vdev_t *vd);
 extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool);
 extern void spa_l2cache_activate(vdev_t *vd);
 extern void spa_l2cache_drop(spa_t *spa);
 
 /* scanning */
 extern int spa_scan(spa_t *spa, pool_scan_func_t func);
 extern int spa_scan_stop(spa_t *spa);
 extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag);
 
 /* spa syncing */
 extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
 extern void spa_sync_allpools(void);
 
 extern int zfs_sync_pass_deferred_free;
 
 /* spa namespace global mutex */
 extern kmutex_t spa_namespace_lock;
 
 /*
  * SPA configuration functions in spa_config.c
  */
 
 #define	SPA_CONFIG_UPDATE_POOL	0
 #define	SPA_CONFIG_UPDATE_VDEVS	1
 
 extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t);
 extern void spa_config_load(void);
 extern nvlist_t *spa_all_configs(uint64_t *);
 extern void spa_config_set(spa_t *spa, nvlist_t *config);
 extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg,
     int getstats);
 extern void spa_config_update(spa_t *spa, int what);
 extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv,
     vdev_t *parent, uint_t id, int atype);
 
 
 /*
  * Miscellaneous SPA routines in spa_misc.c
  */
 
 /* Namespace manipulation */
 extern spa_t *spa_lookup(const char *name);
 extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot);
 extern void spa_remove(spa_t *spa);
 extern spa_t *spa_next(spa_t *prev);
 
 /* Refcount functions */
 extern void spa_open_ref(spa_t *spa, void *tag);
 extern void spa_close(spa_t *spa, void *tag);
 extern void spa_async_close(spa_t *spa, void *tag);
 extern boolean_t spa_refcount_zero(spa_t *spa);
 
 /*
  * Lock bit masks, one bit per lock; SCL_LOCKS is the number of bits and
  * SCL_ALL is the mask covering all of them.
  * NOTE(review): presumably the 'locks' argument of spa_config_enter() and
  * friends -- confirm against the callers.
  */
 #define	SCL_NONE	0
 #define	SCL_CONFIG	(1 << 0)
 #define	SCL_STATE	(1 << 1)
 #define	SCL_L2ARC	(1 << 2)	/* hack until L2ARC 2.0 */
 #define	SCL_ALLOC	(1 << 3)
 #define	SCL_ZIO		(1 << 4)
 #define	SCL_FREE	(1 << 5)
 #define	SCL_VDEV	(1 << 6)
 #define	SCL_LOCKS	7
 #define	SCL_ALL		((1 << SCL_LOCKS) - 1)
 #define	SCL_STATE_ALL	(SCL_ZIO | SCL_L2ARC | SCL_STATE)
 
 /* Historical pool statistics */
 /*
  * A statistic that is exposed through a kstat_t.
  * NOTE(review): member roles are not visible in this header -- 'lock'
  * presumably guards the other members; confirm in the implementation.
  */
 typedef struct spa_history_kstat {
 	kmutex_t		lock;
 	uint64_t		count;
 	uint64_t		size;
 	kstat_t			*kstat;
 	void			*priv;
 	list_t			list;
 } spa_history_kstat_t;
 
 /*
  * A history that is kept in a procfs_list_t.
  * NOTE(review): 'size' is presumably the entry count or limit of that
  * list -- confirm in the implementation.
  */
 typedef struct spa_history_list {
 	uint64_t		size;
 	procfs_list_t		procfs_list;
 } spa_history_list_t;
 
 /* Aggregate of the history lists and kstat-backed statistics above. */
 typedef struct spa_stats {
 	spa_history_list_t	read_history;
 	spa_history_list_t	txg_history;
 	spa_history_kstat_t	tx_assign_histogram;
-	spa_history_kstat_t	io_history;
 	spa_history_list_t	mmp_history;
 	spa_history_kstat_t	state;		/* pool state */
 	spa_history_kstat_t	iostats;
 } spa_stats_t;
 
 /* States reported to spa_txg_history_set(); consecutive from zero. */
 typedef enum txg_state {
 	TXG_STATE_BIRTH,		/* 0 */
 	TXG_STATE_OPEN,			/* 1 */
 	TXG_STATE_QUIESCED,		/* 2 */
 	TXG_STATE_WAIT_FOR_SYNC,	/* 3 */
 	TXG_STATE_SYNCED,		/* 4 */
 	TXG_STATE_COMMITTED		/* 5 */
 } txg_state_t;
 
 /*
  * Handed out by spa_txg_history_init_io() and taken back by
  * spa_txg_history_fini_io().
  * NOTE(review): vs1/vs2 look like a pair of vdev_stat_t snapshots taken
  * around the txg's I/O -- confirm in the implementation.
  */
 typedef struct txg_stat {
 	vdev_stat_t		vs1;
 	vdev_stat_t		vs2;
 	uint64_t		txg;
 	uint64_t		ndirty;
 } txg_stat_t;
 
 /* Assorted pool IO kstats */
 /*
  * Three groups of six named kstats -- {extents, bytes} x {written, skipped,
  * failed} -- prefixed trim_, autotrim_ and simple_trim_.  The groups line up
  * with the three trim_type_t values and the counter arguments of
  * spa_iostats_trim_add().
  */
 typedef struct spa_iostats {
 	kstat_named_t	trim_extents_written;
 	kstat_named_t	trim_bytes_written;
 	kstat_named_t	trim_extents_skipped;
 	kstat_named_t	trim_bytes_skipped;
 	kstat_named_t	trim_extents_failed;
 	kstat_named_t	trim_bytes_failed;
 	kstat_named_t	autotrim_extents_written;
 	kstat_named_t	autotrim_bytes_written;
 	kstat_named_t	autotrim_extents_skipped;
 	kstat_named_t	autotrim_bytes_skipped;
 	kstat_named_t	autotrim_extents_failed;
 	kstat_named_t	autotrim_bytes_failed;
 	kstat_named_t	simple_trim_extents_written;
 	kstat_named_t	simple_trim_bytes_written;
 	kstat_named_t	simple_trim_extents_skipped;
 	kstat_named_t	simple_trim_bytes_skipped;
 	kstat_named_t	simple_trim_extents_failed;
 	kstat_named_t	simple_trim_bytes_failed;
 } spa_iostats_t;
 
 extern void spa_stats_init(spa_t *spa);
 extern void spa_stats_destroy(spa_t *spa);
 extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb,
     uint32_t aflags);
 extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time);
 extern int spa_txg_history_set(spa_t *spa,  uint64_t txg,
     txg_state_t completed_state, hrtime_t completed_time);
 extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t,
     struct dsl_pool *);
 extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *);
 extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs);
 extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id);
 extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error,
     hrtime_t duration);
 extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
     uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id,
     int error);
 extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type,
     uint64_t extents_written, uint64_t bytes_written,
     uint64_t extents_skipped, uint64_t bytes_skipped,
     uint64_t extents_failed, uint64_t bytes_failed);
 extern void spa_import_progress_add(spa_t *spa);
 extern void spa_import_progress_remove(uint64_t spa_guid);
 extern int spa_import_progress_set_mmp_check(uint64_t pool_guid,
     uint64_t mmp_sec_remaining);
 extern int spa_import_progress_set_max_txg(uint64_t pool_guid,
     uint64_t max_txg);
 extern int spa_import_progress_set_state(uint64_t pool_guid,
     spa_load_state_t spa_load_state);
 
 /* Pool configuration locks */
 extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw);
 extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw);
 extern void spa_config_exit(spa_t *spa, int locks, const void *tag);
 extern int spa_config_held(spa_t *spa, int locks, krw_t rw);
 
 /* Pool vdev add/remove lock */
 extern uint64_t spa_vdev_enter(spa_t *spa);
 extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid);
 extern uint64_t spa_vdev_config_enter(spa_t *spa);
 extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg,
     int error, char *tag);
 extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error);
 
 /* Pool vdev state change lock */
 extern void spa_vdev_state_enter(spa_t *spa, int oplock);
 extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error);
 
 /* Log state */
 typedef enum spa_log_state {
 	SPA_LOG_UNKNOWN	= 0,	/* unknown log state */
 	SPA_LOG_MISSING	= 1,	/* missing log(s) */
 	SPA_LOG_CLEAR	= 2,	/* clear the log(s) */
 	SPA_LOG_GOOD	= 3	/* log(s) are good */
 } spa_log_state_t;
 
 extern spa_log_state_t spa_get_log_state(spa_t *spa);
 extern void spa_set_log_state(spa_t *spa, spa_log_state_t state);
 extern int spa_reset_logs(spa_t *spa);
 
 /* Log claim callback */
 extern void spa_claim_notify(zio_t *zio);
 extern void spa_deadman(void *);
 
 /* Accessor functions */
 extern boolean_t spa_shutting_down(spa_t *spa);
 extern struct dsl_pool *spa_get_dsl(spa_t *spa);
 extern boolean_t spa_is_initializing(spa_t *spa);
 extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa);
 extern blkptr_t *spa_get_rootblkptr(spa_t *spa);
 extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp);
 extern void spa_altroot(spa_t *, char *, size_t);
 extern int spa_sync_pass(spa_t *spa);
 extern char *spa_name(spa_t *spa);
 extern uint64_t spa_guid(spa_t *spa);
 extern uint64_t spa_load_guid(spa_t *spa);
 extern uint64_t spa_last_synced_txg(spa_t *spa);
 extern uint64_t spa_first_txg(spa_t *spa);
 extern uint64_t spa_syncing_txg(spa_t *spa);
 extern uint64_t spa_final_dirty_txg(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern pool_state_t spa_state(spa_t *spa);
 extern spa_load_state_t spa_load_state(spa_t *spa);
 extern uint64_t spa_freeze_txg(spa_t *spa);
 extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize);
 extern uint64_t spa_get_dspace(spa_t *spa);
 extern uint64_t spa_get_checkpoint_space(spa_t *spa);
 extern uint64_t spa_get_slop_space(spa_t *spa);
 extern void spa_update_dspace(spa_t *spa);
 extern uint64_t spa_version(spa_t *spa);
 extern boolean_t spa_deflate(spa_t *spa);
 extern metaslab_class_t *spa_normal_class(spa_t *spa);
 extern metaslab_class_t *spa_log_class(spa_t *spa);
 extern metaslab_class_t *spa_embedded_log_class(spa_t *spa);
 extern metaslab_class_t *spa_special_class(spa_t *spa);
 extern metaslab_class_t *spa_dedup_class(spa_t *spa);
 extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size,
     dmu_object_type_t objtype, uint_t level, uint_t special_smallblk);
 
 extern void spa_evicting_os_register(spa_t *, objset_t *os);
 extern void spa_evicting_os_deregister(spa_t *, objset_t *os);
 extern void spa_evicting_os_wait(spa_t *spa);
 extern int spa_max_replication(spa_t *spa);
 extern int spa_prev_software_version(spa_t *spa);
 extern uint64_t spa_get_failmode(spa_t *spa);
 extern uint64_t spa_get_deadman_failmode(spa_t *spa);
 extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode);
 extern boolean_t spa_suspended(spa_t *spa);
 extern uint64_t spa_bootfs(spa_t *spa);
 extern uint64_t spa_delegation(spa_t *spa);
 extern objset_t *spa_meta_objset(spa_t *spa);
 extern space_map_t *spa_syncing_log_sm(spa_t *spa);
 extern uint64_t spa_deadman_synctime(spa_t *spa);
 extern uint64_t spa_deadman_ziotime(spa_t *spa);
 extern uint64_t spa_dirty_data(spa_t *spa);
 extern spa_autotrim_t spa_get_autotrim(spa_t *spa);
 
 /* Miscellaneous support routines */
 extern void spa_load_failed(spa_t *spa, const char *fmt, ...);
 extern void spa_load_note(spa_t *spa, const char *fmt, ...);
 extern void spa_activate_mos_feature(spa_t *spa, const char *feature,
     dmu_tx_t *tx);
 extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature);
 extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid);
 extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid);
 extern char *spa_strdup(const char *);
 extern void spa_strfree(char *);
 extern uint64_t spa_get_random(uint64_t range);
 extern uint64_t spa_generate_guid(spa_t *spa);
 extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp);
 extern void spa_freeze(spa_t *spa);
 extern int spa_change_guid(spa_t *spa);
 extern void spa_upgrade(spa_t *spa, uint64_t version);
 extern void spa_evict_all(void);
 extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid,
     boolean_t l2cache);
 extern boolean_t spa_has_spare(spa_t *, uint64_t guid);
 extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva);
 extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp);
 extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp);
 extern boolean_t spa_has_slogs(spa_t *spa);
 extern boolean_t spa_is_root(spa_t *spa);
 extern boolean_t spa_writeable(spa_t *spa);
 extern boolean_t spa_has_pending_synctask(spa_t *spa);
 extern int spa_maxblocksize(spa_t *spa);
 extern int spa_maxdnodesize(spa_t *spa);
 extern boolean_t spa_has_checkpoint(spa_t *spa);
 extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa);
 extern boolean_t spa_suspend_async_destroy(spa_t *spa);
 extern uint64_t spa_min_claim_txg(spa_t *spa);
 extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva,
     const blkptr_t *bp);
 typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size,
     void *arg);
 extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp,
     spa_remap_cb_t callback, void *arg);
 extern uint64_t spa_get_last_removal_txg(spa_t *spa);
 extern boolean_t spa_trust_config(spa_t *spa);
 extern uint64_t spa_missing_tvds_allowed(spa_t *spa);
 extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing);
 extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa);
 extern uint64_t spa_total_metaslabs(spa_t *spa);
 extern boolean_t spa_multihost(spa_t *spa);
 extern uint32_t spa_get_hostid(spa_t *spa);
 extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *);
 extern boolean_t spa_livelist_delete_check(spa_t *spa);
 
 extern spa_mode_t spa_mode(spa_t *spa);
 extern uint64_t zfs_strtonum(const char *str, char **nptr);
 
 extern char *spa_his_ievent_table[];
 
 extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx);
 extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read,
     char *his_buf);
 extern int spa_history_log(spa_t *spa, const char *his_buf);
 extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl);
 extern void spa_history_log_version(spa_t *spa, const char *operation,
     dmu_tx_t *tx);
 extern void spa_history_log_internal(spa_t *spa, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5);
 extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op,
     dmu_tx_t *tx, const char *fmt, ...)  __printflike(4, 5);
 extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation,
     dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5);
 
 extern const char *spa_state_to_name(spa_t *spa);
 
 /* error handling */
 struct zbookmark_phys;
 extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb);
 extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd,
     const zbookmark_phys_t *zb, zio_t *zio, uint64_t state);
 extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd,
     zio_t *zio);
 extern void zfs_ereport_taskq_fini(void);
 extern void zfs_ereport_clear(spa_t *spa, vdev_t *vd);
 extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type,
     const char *name, nvlist_t *aux);
 extern void zfs_post_remove(spa_t *spa, vdev_t *vd);
 extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate);
 extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd);
 extern uint64_t spa_get_errlog_size(spa_t *spa);
 extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count);
 extern void spa_errlog_rotate(spa_t *spa);
 extern void spa_errlog_drain(spa_t *spa);
 extern void spa_errlog_sync(spa_t *spa, uint64_t txg);
 extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub);
 
 /* vdev cache */
 extern void vdev_cache_stat_init(void);
 extern void vdev_cache_stat_fini(void);
 
 /* vdev mirror */
 extern void vdev_mirror_stat_init(void);
 extern void vdev_mirror_stat_fini(void);
 
 /* Initialization and termination */
 extern void spa_init(spa_mode_t mode);
 extern void spa_fini(void);
 extern void spa_boot_init(void);
 
 /* properties */
 extern int spa_prop_set(spa_t *spa, nvlist_t *nvp);
 extern int spa_prop_get(spa_t *spa, nvlist_t **nvp);
 extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx);
 extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t);
 
 /* asynchronous event notification */
 extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl,
     const char *name);
 
 /* waiting for pool activities to complete */
 extern int spa_wait(const char *pool, zpool_wait_activity_t activity,
     boolean_t *waited);
 extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity,
     uint64_t tag, boolean_t *waited);
 extern void spa_notify_waiters(spa_t *spa);
 extern void spa_wake_waiters(spa_t *spa);
 
 /* module param call functions */
 int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS);
 int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS);
 int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS);
 int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS);
 
 /*
  * dprintf() with a formatted block pointer appended.  When ZFS_DEBUG is
  * defined and ZFS_DEBUG_DPRINTF is set in zfs_flags, 'bp' is rendered by
  * snprintf_blkptr() into a BP_SPRINTF_LEN-byte buffer obtained with
  * kmem_alloc(KM_SLEEP), printed after the caller's message, and the buffer
  * is freed again.  'fmt' must be a string literal (it is concatenated with
  * " %s\n") and at least one variadic argument is required because
  * __VA_ARGS__ is followed by a comma.  Without ZFS_DEBUG the macro expands
  * to nothing, so its arguments are not evaluated.
  */
 #ifdef ZFS_DEBUG
 #define	dprintf_bp(bp, fmt, ...) do {				\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
 	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp));	\
 	dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf);		\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	} \
 _NOTE(CONSTCOND) } while (0)
 #else
 #define	dprintf_bp(bp, fmt, ...)
 #endif
 
 extern spa_mode_t spa_mode_global;
 extern int zfs_deadman_enabled;
 extern unsigned long zfs_deadman_synctime_ms;
 extern unsigned long zfs_deadman_ziotime_ms;
 extern unsigned long zfs_deadman_checktime_ms;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_SPA_H */
diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h
index 89afa98253f7..aa4338ed2859 100644
--- a/include/sys/zfs_context.h
+++ b/include/sys/zfs_context.h
@@ -1,776 +1,770 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  */
 
 #ifndef _SYS_ZFS_CONTEXT_H
 #define	_SYS_ZFS_CONTEXT_H
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
 /*
  * This code compiles in three different contexts. When __KERNEL__ is defined,
  * the code uses "unix-like" kernel interfaces. When _STANDALONE is defined, the
  * code is running in a reduced capacity environment of the boot loader which is
  * generally a subset of both POSIX and kernel interfaces (with a few unique
  * interfaces too). When neither is defined, it's in a userland POSIX or
  * similar environment.
  */
 #if defined(__KERNEL__) || defined(_STANDALONE)
 #include <sys/note.h>
 #include <sys/types.h>
 #include <sys/atomic.h>
 #include <sys/sysmacros.h>
 #include <sys/vmsystm.h>
 #include <sys/condvar.h>
 #include <sys/cmn_err.h>
 #include <sys/kmem.h>
 #include <sys/kmem_cache.h>
 #include <sys/vmem.h>
 #include <sys/taskq.h>
 #include <sys/param.h>
 #include <sys/disp.h>
 #include <sys/debug.h>
 #include <sys/random.h>
 #include <sys/strings.h>
 #include <sys/byteorder.h>
 #include <sys/list.h>
 #include <sys/time.h>
 #include <sys/zone.h>
 #include <sys/kstat.h>
 #include <sys/zfs_debug.h>
 #include <sys/sysevent.h>
 #include <sys/sysevent/eventdefs.h>
 #include <sys/zfs_delay.h>
 #include <sys/sunddi.h>
 #include <sys/ctype.h>
 #include <sys/disp.h>
 #include <sys/trace.h>
 #include <sys/procfs_list.h>
 #include <sys/mod.h>
 #include <sys/uio_impl.h>
 #include <sys/zfs_context_os.h>
 #else /* __KERNEL__ || _STANDALONE */
 
 #define	_SYS_MUTEX_H
 #define	_SYS_RWLOCK_H
 #define	_SYS_CONDVAR_H
 #define	_SYS_VNODE_H
 #define	_SYS_VFS_H
 #define	_SYS_SUNDDI_H
 #define	_SYS_CALLB_H
 
 #include <stdio.h>
 #include <stdlib.h>
 #include <stddef.h>
 #include <stdarg.h>
 #include <fcntl.h>
 #include <unistd.h>
 #include <errno.h>
 #include <string.h>
 #include <strings.h>
 #include <pthread.h>
 #include <setjmp.h>
 #include <assert.h>
 #include <umem.h>
 #include <limits.h>
 #include <atomic.h>
 #include <dirent.h>
 #include <time.h>
 #include <ctype.h>
 #include <signal.h>
 #include <sys/mman.h>
 #include <sys/note.h>
 #include <sys/types.h>
 #include <sys/cred.h>
 #include <sys/sysmacros.h>
 #include <sys/resource.h>
 #include <sys/byteorder.h>
 #include <sys/list.h>
 #include <sys/mod.h>
 #include <sys/uio.h>
 #include <sys/zfs_debug.h>
 #include <sys/kstat.h>
 #include <sys/u8_textprep.h>
 #include <sys/sysevent.h>
 #include <sys/sysevent/eventdefs.h>
 #include <sys/sunddi.h>
 #include <sys/debug.h>
 #include <sys/utsname.h>
 #include <sys/trace_zfs.h>
 
 #include <sys/zfs_context_os.h>
 
 /*
  * Stack
  */
 
 #define	noinline	__attribute__((noinline))
 #define	likely(x)	__builtin_expect((x), 1)
 #define	unlikely(x)	__builtin_expect((x), 0)
 
 /*
  * Debugging
  */
 
 /*
  * Note that we are not using the debugging levels.
  */
 
 #define	CE_CONT		0	/* continuation		*/
 #define	CE_NOTE		1	/* notice		*/
 #define	CE_WARN		2	/* warning		*/
 #define	CE_PANIC	3	/* panic		*/
 #define	CE_IGNORE	4	/* print nothing	*/
 
 /*
  * ZFS debugging
  */
 
 extern void dprintf_setup(int *argc, char **argv);
 
 extern void cmn_err(int, const char *, ...);
 extern void vcmn_err(int, const char *, va_list);
 extern void panic(const char *, ...)  __NORETURN;
 extern void vpanic(const char *, va_list)  __NORETURN;
 
 #define	fm_panic	panic
 
 /*
  * DTrace SDT probes have different signatures in userland than they do in
  * the kernel.  If they're being used in kernel code, re-define them out of
  * existence for their counterparts in libzpool.
  *
  * Here's an example of how to use the set-error probes in userland:
  * zfs$target:::set-error /arg0 == EBUSY/ {stack();}
  *
  * Here's an example of how to use DTRACE_PROBE probes in userland:
  * If there is a probe declared as follows:
  * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn);
  * Then you can use it as follows:
  * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/
  *     {printf("%u %p\n", arg1, arg2);}
  */
 
 #ifdef DTRACE_PROBE
 #undef	DTRACE_PROBE
 #endif	/* DTRACE_PROBE */
 #define	DTRACE_PROBE(a)
 
 #ifdef DTRACE_PROBE1
 #undef	DTRACE_PROBE1
 #endif	/* DTRACE_PROBE1 */
 #define	DTRACE_PROBE1(a, b, c)
 
 #ifdef DTRACE_PROBE2
 #undef	DTRACE_PROBE2
 #endif	/* DTRACE_PROBE2 */
 #define	DTRACE_PROBE2(a, b, c, d, e)
 
 #ifdef DTRACE_PROBE3
 #undef	DTRACE_PROBE3
 #endif	/* DTRACE_PROBE3 */
 #define	DTRACE_PROBE3(a, b, c, d, e, f, g)
 
 #ifdef DTRACE_PROBE4
 #undef	DTRACE_PROBE4
 #endif	/* DTRACE_PROBE4 */
 #define	DTRACE_PROBE4(a, b, c, d, e, f, g, h, i)
 
 /*
  * Tunables.
  */
 typedef struct zfs_kernel_param {
 	const char *name;	/* unused stub */
 } zfs_kernel_param_t;
 
 #define	ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc)
 #define	ZFS_MODULE_PARAM_ARGS void
 #define	ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \
 	getfunc, perm, desc)
 
 /*
  * Threads.
  */
 typedef pthread_t	kthread_t;
 
 #define	TS_RUN		0x00000002
 #define	TS_JOINABLE	0x00000004
 
 #define	curthread	((void *)(uintptr_t)pthread_self())
 #define	kpreempt(x)	yield()
 #define	getcomm()	"unknown"
 
 #define	thread_create_named(name, stk, stksize, func, arg, len, \
     pp, state, pri)	\
 	zk_thread_create(func, arg, stksize, state)
 #define	thread_create(stk, stksize, func, arg, len, pp, state, pri)	\
 	zk_thread_create(func, arg, stksize, state)
 #define	thread_exit()	pthread_exit(NULL)
 #define	thread_join(t)	pthread_join((pthread_t)(t), NULL)
 
 #define	newproc(f, a, cid, pri, ctp, pid)	(ENOSYS)
 
 /* in libzpool, p0 exists only to have its address taken */
 typedef struct proc {
 	uintptr_t	this_is_never_used_dont_dereference_it;
 } proc_t;
 
 extern struct proc p0;
 #define	curproc		(&p0)
 
 #define	PS_NONE		-1
 
 extern kthread_t *zk_thread_create(void (*func)(void *), void *arg,
     size_t stksize, int state);
 
 #define	issig(why)	(FALSE)
 #define	ISSIG(thr, why)	(FALSE)
 
 #define	kpreempt_disable()	((void)0)
 #define	kpreempt_enable()	((void)0)
 #define	cond_resched()		sched_yield()
 
 /*
  * Mutexes
  */
 typedef struct kmutex {
 	pthread_mutex_t		m_lock;
 	pthread_t		m_owner;
 } kmutex_t;
 
 /*
  * Mutex type constants and ownership predicates for the userland kmutex_t.
  *
  * MUTEX_HELD(mp) is non-zero when the m_owner recorded in `mp` compares
  * equal (pthread_equal) to the calling thread; MUTEX_NOT_HELD(mp) is its
  * logical negation.  The negated form is fully parenthesized so that it
  * behaves as a single operand wherever it is expanded.
  */
 #define	MUTEX_DEFAULT		0
 #define	MUTEX_NOLOCKDEP		MUTEX_DEFAULT
 #define	MUTEX_HELD(mp)		pthread_equal((mp)->m_owner, pthread_self())
 #define	MUTEX_NOT_HELD(mp)	(!MUTEX_HELD(mp))
 
 extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie);
 extern void mutex_destroy(kmutex_t *mp);
 extern void mutex_enter(kmutex_t *mp);
 extern void mutex_exit(kmutex_t *mp);
 extern int mutex_tryenter(kmutex_t *mp);
 
 #define	NESTED_SINGLE 1
 #define	mutex_enter_nested(mp, class) mutex_enter(mp)
 /*
  * RW locks
  */
 typedef struct krwlock {
 	pthread_rwlock_t	rw_lock;
 	pthread_t		rw_owner;
 	uint_t			rw_readers;
 } krwlock_t;
 
 typedef int krw_t;
 
 #define	RW_READER		0
 #define	RW_WRITER		1
 #define	RW_DEFAULT		RW_READER
 #define	RW_NOLOCKDEP		RW_READER
 
 #define	RW_READ_HELD(rw)	((rw)->rw_readers > 0)
 #define	RW_WRITE_HELD(rw)	pthread_equal((rw)->rw_owner, pthread_self())
 #define	RW_LOCK_HELD(rw)	(RW_READ_HELD(rw) || RW_WRITE_HELD(rw))
 
 extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg);
 extern void rw_destroy(krwlock_t *rwlp);
 extern void rw_enter(krwlock_t *rwlp, krw_t rw);
 extern int rw_tryenter(krwlock_t *rwlp, krw_t rw);
 extern int rw_tryupgrade(krwlock_t *rwlp);
 extern void rw_exit(krwlock_t *rwlp);
 #define	rw_downgrade(rwlp) do { } while (0)
 
 /*
  * Credentials
  */
 extern uid_t crgetuid(cred_t *cr);
 extern uid_t crgetruid(cred_t *cr);
 extern gid_t crgetgid(cred_t *cr);
 extern int crgetngroups(cred_t *cr);
 extern gid_t *crgetgroups(cred_t *cr);
 
 /*
  * Condition variables
  */
 typedef pthread_cond_t		kcondvar_t;
 
 #define	CV_DEFAULT		0
 #define	CALLOUT_FLAG_ABSOLUTE	0x2
 
 extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg);
 extern void cv_destroy(kcondvar_t *cv);
 extern void cv_wait(kcondvar_t *cv, kmutex_t *mp);
 extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp);
 extern int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime);
 extern int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim,
     hrtime_t res, int flag);
 extern void cv_signal(kcondvar_t *cv);
 extern void cv_broadcast(kcondvar_t *cv);
 
 #define	cv_timedwait_io(cv, mp, at)		cv_timedwait(cv, mp, at)
 #define	cv_timedwait_idle(cv, mp, at)		cv_timedwait(cv, mp, at)
 #define	cv_timedwait_sig(cv, mp, at)		cv_timedwait(cv, mp, at)
 #define	cv_wait_io(cv, mp)			cv_wait(cv, mp)
 #define	cv_wait_idle(cv, mp)			cv_wait(cv, mp)
 #define	cv_wait_io_sig(cv, mp)			cv_wait_sig(cv, mp)
 #define	cv_timedwait_sig_hires(cv, mp, t, r, f) \
 	cv_timedwait_hires(cv, mp, t, r, f)
 #define	cv_timedwait_idle_hires(cv, mp, t, r, f) \
 	cv_timedwait_hires(cv, mp, t, r, f)
 
 /*
  * Thread-specific data
  */
 /*
  * tsd_get(k)        - value bound to key `k` for the calling thread, as
  *                     returned by pthread_getspecific().
  * tsd_set(k, v)     - bind `v` to key `k`; yields pthread_setspecific()'s
  *                     return value.
  * tsd_create(kp, d) - create a key at `kp` with destructor `d`; yields
  *                     pthread_key_create()'s return value.  `kp` is cast
  *                     to (pthread_key_t *), and is parenthesized so the
  *                     cast covers the whole argument expression.
  *                     NOTE(review): the cast presumably exists because
  *                     callers store keys in another integer type; confirm
  *                     that type is at least as wide as pthread_key_t.
  * tsd_destroy(kp)   - expands to nothing; the key is not deleted.
  */
 #define	tsd_get(k) pthread_getspecific(k)
 #define	tsd_set(k, v) pthread_setspecific(k, v)
 #define	tsd_create(kp, d) pthread_key_create((pthread_key_t *)(kp), d)
 #define	tsd_destroy(kp) /* nothing */
 #ifdef __FreeBSD__
 typedef off_t loff_t;
 #endif
 
 /*
  * kstat creation, installation and deletion
  */
 extern kstat_t *kstat_create(const char *, int,
     const char *, const char *, uchar_t, ulong_t, uchar_t);
 extern void kstat_install(kstat_t *);
 extern void kstat_delete(kstat_t *);
-extern void kstat_waitq_enter(kstat_io_t *);
-extern void kstat_waitq_exit(kstat_io_t *);
-extern void kstat_runq_enter(kstat_io_t *);
-extern void kstat_runq_exit(kstat_io_t *);
-extern void kstat_waitq_to_runq(kstat_io_t *);
-extern void kstat_runq_back_to_waitq(kstat_io_t *);
 extern void kstat_set_raw_ops(kstat_t *ksp,
     int (*headers)(char *buf, size_t size),
     int (*data)(char *buf, size_t size, void *data),
     void *(*addr)(kstat_t *ksp, loff_t index));
 
 /*
  * procfs list manipulation
  */
 
 typedef struct procfs_list {
 	void		*pl_private;
 	kmutex_t	pl_lock;
 	list_t		pl_list;
 	uint64_t	pl_next_id;
 	size_t		pl_node_offset;
 } procfs_list_t;
 
 #ifndef __cplusplus
 struct seq_file { };
 void seq_printf(struct seq_file *m, const char *fmt, ...);
 
 typedef struct procfs_list_node {
 	list_node_t	pln_link;
 	uint64_t	pln_id;
 } procfs_list_node_t;
 
 void procfs_list_install(const char *module,
     const char *submodule,
     const char *name,
     mode_t mode,
     procfs_list_t *procfs_list,
     int (*show)(struct seq_file *f, void *p),
     int (*show_header)(struct seq_file *f),
     int (*clear)(procfs_list_t *procfs_list),
     size_t procfs_list_node_off);
 void procfs_list_uninstall(procfs_list_t *procfs_list);
 void procfs_list_destroy(procfs_list_t *procfs_list);
 void procfs_list_add(procfs_list_t *procfs_list, void *p);
 #endif
 
 /*
  * Kernel memory
  */
 #define	KM_SLEEP		UMEM_NOFAIL
 #define	KM_PUSHPAGE		KM_SLEEP
 #define	KM_NOSLEEP		UMEM_DEFAULT
 #define	KM_NORMALPRI		0	/* not needed with UMEM_DEFAULT */
 #define	KMC_NODEBUG		UMC_NODEBUG
 #define	KMC_KVMEM		0x0
 #define	kmem_alloc(_s, _f)	umem_alloc(_s, _f)
 #define	kmem_zalloc(_s, _f)	umem_zalloc(_s, _f)
 #define	kmem_free(_b, _s)	umem_free(_b, _s)
 #define	vmem_alloc(_s, _f)	kmem_alloc(_s, _f)
 #define	vmem_zalloc(_s, _f)	kmem_zalloc(_s, _f)
 #define	vmem_free(_b, _s)	kmem_free(_b, _s)
 #define	kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \
 	umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i)
 #define	kmem_cache_destroy(_c)	umem_cache_destroy(_c)
 #define	kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f)
 #define	kmem_cache_free(_c, _b)	umem_cache_free(_c, _b)
 #define	kmem_debugging()	0
 /*
  * Forward to umem_cache_reap_now().  The expansion deliberately carries no
  * trailing ';' so the macro is a single expression and stays usable as the
  * body of an if/else without braces.
  */
 #define	kmem_cache_reap_now(_c)	umem_cache_reap_now(_c)
 #define	kmem_cache_set_move(_c, _cb)	/* nothing */
 #define	POINTER_INVALIDATE(_pp)		/* nothing */
 #define	POINTER_IS_VALID(_p)	0
 
 typedef umem_cache_t kmem_cache_t;
 
 typedef enum kmem_cbrc {
 	KMEM_CBRC_YES,
 	KMEM_CBRC_NO,
 	KMEM_CBRC_LATER,
 	KMEM_CBRC_DONT_NEED,
 	KMEM_CBRC_DONT_KNOW
 } kmem_cbrc_t;
 
 /*
  * Task queues
  */
 
 #define	TASKQ_NAMELEN	31
 
 typedef uintptr_t taskqid_t;
 typedef void (task_func_t)(void *);
 
 typedef struct taskq_ent {
 	struct taskq_ent	*tqent_next;
 	struct taskq_ent	*tqent_prev;
 	task_func_t		*tqent_func;
 	void			*tqent_arg;
 	uintptr_t		tqent_flags;
 } taskq_ent_t;
 
 typedef struct taskq {
 	char		tq_name[TASKQ_NAMELEN + 1];
 	kmutex_t	tq_lock;
 	krwlock_t	tq_threadlock;
 	kcondvar_t	tq_dispatch_cv;
 	kcondvar_t	tq_wait_cv;
 	kthread_t	**tq_threadlist;
 	int		tq_flags;
 	int		tq_active;
 	int		tq_nthreads;
 	int		tq_nalloc;
 	int		tq_minalloc;
 	int		tq_maxalloc;
 	kcondvar_t	tq_maxalloc_cv;
 	int		tq_maxalloc_wait;
 	taskq_ent_t	*tq_freelist;
 	taskq_ent_t	tq_task;
 } taskq_t;
 
 #define	TQENT_FLAG_PREALLOC	0x1	/* taskq_dispatch_ent used */
 
 #define	TASKQ_PREPOPULATE	0x0001
 #define	TASKQ_CPR_SAFE		0x0002	/* Use CPR safe protocol */
 #define	TASKQ_DYNAMIC		0x0004	/* Use dynamic thread scheduling */
 #define	TASKQ_THREADS_CPU_PCT	0x0008	/* Scale # threads by # cpus */
 #define	TASKQ_DC_BATCH		0x0010	/* Mark threads as batch */
 
 #define	TQ_SLEEP	KM_SLEEP	/* Can block for memory */
 #define	TQ_NOSLEEP	KM_NOSLEEP	/* cannot block for memory; may fail */
 #define	TQ_NOQUEUE	0x02		/* Do not enqueue if can't dispatch */
 #define	TQ_FRONT	0x08		/* Queue in front */
 
 #define	TASKQID_INVALID		((taskqid_t)0)
 
 extern taskq_t *system_taskq;
 extern taskq_t *system_delay_taskq;
 
 extern taskq_t	*taskq_create(const char *, int, pri_t, int, int, uint_t);
 #define	taskq_create_proc(a, b, c, d, e, p, f) \
 	    (taskq_create(a, b, c, d, e, f))
 #define	taskq_create_sysdc(a, b, d, e, p, dc, f) \
 	    (taskq_create(a, b, maxclsyspri, d, e, f))
 extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t);
 extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t,
     clock_t);
 extern void	taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t,
     taskq_ent_t *);
 extern int	taskq_empty_ent(taskq_ent_t *);
 extern void	taskq_init_ent(taskq_ent_t *);
 extern void	taskq_destroy(taskq_t *);
 extern void	taskq_wait(taskq_t *);
 extern void	taskq_wait_id(taskq_t *, taskqid_t);
 extern void	taskq_wait_outstanding(taskq_t *, taskqid_t);
 extern int	taskq_member(taskq_t *, kthread_t *);
 extern taskq_t	*taskq_of_curthread(void);
 extern int	taskq_cancel_id(taskq_t *, taskqid_t);
 extern void	system_taskq_init(void);
 extern void	system_taskq_fini(void);
 
 #define	XVA_MAPSIZE	3
 #define	XVA_MAGIC	0x78766174
 
 extern char *vn_dumpdir;
 #define	AV_SCANSTAMP_SZ	32		/* length of anti-virus scanstamp */
 
 typedef struct xoptattr {
 	inode_timespec_t xoa_createtime;	/* Create time of file */
 	uint8_t		xoa_archive;
 	uint8_t		xoa_system;
 	uint8_t		xoa_readonly;
 	uint8_t		xoa_hidden;
 	uint8_t		xoa_nounlink;
 	uint8_t		xoa_immutable;
 	uint8_t		xoa_appendonly;
 	uint8_t		xoa_nodump;
 	uint8_t		xoa_settable;
 	uint8_t		xoa_opaque;
 	uint8_t		xoa_av_quarantined;
 	uint8_t		xoa_av_modified;
 	uint8_t		xoa_av_scanstamp[AV_SCANSTAMP_SZ];
 	uint8_t		xoa_reparse;
 	uint8_t		xoa_offline;
 	uint8_t		xoa_sparse;
 } xoptattr_t;
 
 typedef struct vattr {
 	uint_t		va_mask;	/* bit-mask of attributes */
 	u_offset_t	va_size;	/* file size in bytes */
 } vattr_t;
 
 
 typedef struct xvattr {
 	vattr_t		xva_vattr;	/* Embedded vattr structure */
 	uint32_t	xva_magic;	/* Magic Number */
 	uint32_t	xva_mapsize;	/* Size of attr bitmap (32-bit words) */
 	uint32_t	*xva_rtnattrmapp;	/* Ptr to xva_rtnattrmap[] */
 	uint32_t	xva_reqattrmap[XVA_MAPSIZE];	/* Requested attrs */
 	uint32_t	xva_rtnattrmap[XVA_MAPSIZE];	/* Returned attrs */
 	xoptattr_t	xva_xoptattrs;	/* Optional attributes */
 } xvattr_t;
 
 typedef struct vsecattr {
 	uint_t		vsa_mask;	/* See below */
 	int		vsa_aclcnt;	/* ACL entry count */
 	void		*vsa_aclentp;	/* pointer to ACL entries */
 	int		vsa_dfaclcnt;	/* default ACL entry count */
 	void		*vsa_dfaclentp;	/* pointer to default ACL entries */
 	size_t		vsa_aclentsz;	/* ACE size in bytes of vsa_aclentp */
 } vsecattr_t;
 
 #define	AT_MODE		0x00002
 #define	AT_UID		0x00004
 #define	AT_GID		0x00008
 #define	AT_FSID		0x00010
 #define	AT_NODEID	0x00020
 #define	AT_NLINK	0x00040
 #define	AT_SIZE		0x00080
 #define	AT_ATIME	0x00100
 #define	AT_MTIME	0x00200
 #define	AT_CTIME	0x00400
 #define	AT_RDEV		0x00800
 #define	AT_BLKSIZE	0x01000
 #define	AT_NBLOCKS	0x02000
 #define	AT_SEQ		0x08000
 #define	AT_XVATTR	0x10000
 
 #define	CRCREAT		0
 
 #define	F_FREESP	11
 #define	FIGNORECASE	0x80000 /* request case-insensitive lookups */
 
 /*
  * Random stuff
  */
 #define	ddi_get_lbolt()		(gethrtime() >> 23)
 #define	ddi_get_lbolt64()	(gethrtime() >> 23)
 #define	hz	119	/* frequency when using gethrtime() >> 23 for lbolt */
 
 /*
  * Tick-count ordering helpers (plain and 64-bit spellings).
  *
  * Every argument is parenthesized in the expansion so that compound
  * expressions such as `x & mask` or `c ? a : b` are compared as a whole
  * instead of being re-associated around the `<` operator.  Each argument
  * is evaluated exactly once.  The comparison is a plain `<`; counter
  * wrap-around is not compensated for.
  */
 #define	ddi_time_before(a, b)		((a) < (b))
 #define	ddi_time_after(a, b)		ddi_time_before(b, a)
 #define	ddi_time_before_eq(a, b)	(!ddi_time_after(a, b))
 #define	ddi_time_after_eq(a, b)		ddi_time_before_eq(b, a)
 
 #define	ddi_time_before64(a, b)		((a) < (b))
 #define	ddi_time_after64(a, b)		ddi_time_before64(b, a)
 #define	ddi_time_before_eq64(a, b)	(!ddi_time_after64(a, b))
 #define	ddi_time_after_eq64(a, b)	ddi_time_before_eq64(b, a)
 
 extern void delay(clock_t ticks);
 
 #define	SEC_TO_TICK(sec)	((sec) * hz)
 #define	MSEC_TO_TICK(msec)	(howmany((hrtime_t)(msec) * hz, MILLISEC))
 #define	USEC_TO_TICK(usec)	(howmany((hrtime_t)(usec) * hz, MICROSEC))
 #define	NSEC_TO_TICK(nsec)	(howmany((hrtime_t)(nsec) * hz, NANOSEC))
 
 #define	max_ncpus	64
 #define	boot_ncpus	(sysconf(_SC_NPROCESSORS_ONLN))
 
 /*
  * Process priorities as defined by setpriority(2) and getpriority(2).
  */
 #define	minclsyspri	19
 #define	maxclsyspri	-20
 #define	defclsyspri	0
 
 #define	CPU_SEQID	((uintptr_t)pthread_self() & (max_ncpus - 1))
 #define	CPU_SEQID_UNSTABLE	CPU_SEQID
 
 #define	kcred		NULL
 #define	CRED()		NULL
 
 #define	ptob(x)		((x) * PAGESIZE)
 
 #define	NN_DIVISOR_1000	(1U << 0)
 #define	NN_NUMBUF_SZ	(6)
 
 extern uint64_t physmem;
 extern const char *random_path;
 extern const char *urandom_path;
 
 extern int highbit64(uint64_t i);
 extern int lowbit64(uint64_t i);
 extern int random_get_bytes(uint8_t *ptr, size_t len);
 extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len);
 
 extern void kernel_init(int mode);
 extern void kernel_fini(void);
 extern void random_init(void);
 extern void random_fini(void);
 
 struct spa;
 extern void show_pool_stats(struct spa *);
 extern int set_global_var(char const *arg);
 
 typedef struct callb_cpr {
 	kmutex_t	*cc_lockp;
 } callb_cpr_t;
 
 #define	CALLB_CPR_INIT(cp, lockp, func, name)	{		\
 	(cp)->cc_lockp = lockp;					\
 }
 
 #define	CALLB_CPR_SAFE_BEGIN(cp) {				\
 	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
 }
 
 #define	CALLB_CPR_SAFE_END(cp, lockp) {				\
 	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
 }
 
 #define	CALLB_CPR_EXIT(cp) {					\
 	ASSERT(MUTEX_HELD((cp)->cc_lockp));			\
 	mutex_exit((cp)->cc_lockp);				\
 }
 
 #define	zone_dataset_visible(x, y)	(1)
 #define	INGLOBALZONE(z)			(1)
 extern uint32_t zone_get_hostid(void *zonep);
 
 extern char *kmem_vasprintf(const char *fmt, va_list adx);
 extern char *kmem_asprintf(const char *fmt, ...);
 #define	kmem_strfree(str) kmem_free((str), strlen(str) + 1)
 #define	kmem_strdup(s)  strdup(s)
 
 /*
  * Hostname information
  */
 extern char hw_serial[];	/* for userland-emulated hostid access */
 extern int ddi_strtoul(const char *str, char **nptr, int base,
     unsigned long *result);
 
 extern int ddi_strtoull(const char *str, char **nptr, int base,
     u_longlong_t *result);
 
 typedef struct utsname	utsname_t;
 extern utsname_t *utsname(void);
 
 /* ZFS Boot Related stuff. */
 
 struct _buf {
 	intptr_t	_fd;
 };
 
 struct bootstat {
 	uint64_t st_size;
 };
 
 typedef struct ace_object {
 	uid_t		a_who;
 	uint32_t	a_access_mask;
 	uint16_t	a_flags;
 	uint16_t	a_type;
 	uint8_t		a_obj_type[16];
 	uint8_t		a_inherit_obj_type[16];
 } ace_object_t;
 
 
 #define	ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE	0x05
 #define	ACE_ACCESS_DENIED_OBJECT_ACE_TYPE	0x06
 #define	ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE	0x07
 #define	ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE	0x08
 
 extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr);
 extern int zfs_secpolicy_rename_perms(const char *from, const char *to,
     cred_t *cr);
 extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr);
 extern int secpolicy_zfs(const cred_t *cr);
 extern int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc);
 extern zoneid_t getzoneid(void);
 
 /* SID stuff */
 typedef struct ksiddomain {
 	uint_t	kd_ref;
 	uint_t	kd_len;
 	char	*kd_name;
 } ksiddomain_t;
 
 ksiddomain_t *ksid_lookupdomain(const char *);
 void ksiddomain_rele(ksiddomain_t *);
 
 #define	DDI_SLEEP	KM_SLEEP
 #define	ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \
 	sysevent_post_event(_c, _d, _b, "libzpool", _e, _f)
 
 /*
  * zfs_sleep_until(wakeup): sleep until gethrtime() reaches `wakeup`.
  *
  * The remaining time (wakeup - gethrtime(), in nanoseconds) is split into
  * a struct timespec and handed to a single nanosleep() call.  If the
  * deadline has already passed, nothing is done; a negative interval is
  * never passed to nanosleep().  nanosleep()'s result is discarded, so an
  * interrupted sleep is not resumed.
  *
  * `wakeup` is parenthesized and evaluated once.  The locals carry a zsu_
  * prefix to avoid capturing identifiers used in the caller's expression.
  */
 #define	zfs_sleep_until(wakeup)						\
 	do {								\
 		hrtime_t zsu_delta = (wakeup) - gethrtime();		\
 		if (zsu_delta > 0) {					\
 			struct timespec zsu_ts;				\
 			zsu_ts.tv_sec = zsu_delta / NANOSEC;		\
 			zsu_ts.tv_nsec = zsu_delta % NANOSEC;		\
 			(void) nanosleep(&zsu_ts, NULL);		\
 		}							\
 	} while (0)
 
 typedef int fstrans_cookie_t;
 
 extern fstrans_cookie_t spl_fstrans_mark(void);
 extern void spl_fstrans_unmark(fstrans_cookie_t);
 extern int __spl_pf_fstrans_check(void);
 extern int kmem_cache_reap_active(void);
 
 #define	____cacheline_aligned
 
 /*
  * Kernel modules
  */
 #define	__init
 #define	__exit
 
 #endif  /* __KERNEL__ || _STANDALONE */
 
 #ifdef __cplusplus
 };
 #endif
 
 #endif	/* _SYS_ZFS_CONTEXT_H */
diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h
index 69fb6d401fc7..f73fb92eb797 100644
--- a/lib/libspl/include/sys/kstat.h
+++ b/lib/libspl/include/sys/kstat.h
@@ -1,822 +1,816 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 #ifndef	_SYS_KSTAT_H
 #define	_SYS_KSTAT_H
 
 
 
 /*
  * Definition of general kernel statistics structures and /dev/kstat ioctls
  */
 
 #include <sys/types.h>
 #include <sys/time.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 typedef int	kid_t;		/* unique kstat id */
 
 /*
  * Kernel statistics driver (/dev/kstat) ioctls
  */
 
 /*
  * ioctl command numbers: 'K' in the second byte, command index in the low
  * byte.  Each replacement list is parenthesized so the constant acts as a
  * single operand; without that, `cmd == KSTAT_IOC_READ` would parse as
  * `(cmd == KSTAT_IOC_BASE) | 0x02`, which is always non-zero.
  */
 #define	KSTAT_IOC_BASE		('K' << 8)
 
 #define	KSTAT_IOC_CHAIN_ID	(KSTAT_IOC_BASE | 0x01)
 #define	KSTAT_IOC_READ		(KSTAT_IOC_BASE | 0x02)
 #define	KSTAT_IOC_WRITE		(KSTAT_IOC_BASE | 0x03)
 
 /*
  * /dev/kstat ioctl usage (kd denotes /dev/kstat descriptor):
  *
  *	kcid = ioctl(kd, KSTAT_IOC_CHAIN_ID, NULL);
  *	kcid = ioctl(kd, KSTAT_IOC_READ, kstat_t *);
  *	kcid = ioctl(kd, KSTAT_IOC_WRITE, kstat_t *);
  */
 
 #define	KSTAT_STRLEN	255	/* 254 chars + NULL; must be 16 * n - 1 */
 
 /*
  * The generic kstat header
  */
 
 typedef struct kstat {
 	/*
 	 * Fields relevant to both kernel and user
 	 */
 	hrtime_t	ks_crtime;	/* creation time (from gethrtime()) */
 	struct kstat	*ks_next;	/* kstat chain linkage */
 	kid_t		ks_kid;		/* unique kstat ID */
 	char		ks_module[KSTAT_STRLEN]; /* provider module name */
 	uchar_t		ks_resv;	/* reserved, currently just padding */
 	int		ks_instance;	/* provider module's instance */
 	char		ks_name[KSTAT_STRLEN]; /* kstat name */
 	uchar_t		ks_type;	/* kstat data type */
 	char		ks_class[KSTAT_STRLEN]; /* kstat class */
 	uchar_t		ks_flags;	/* kstat flags */
 	void		*ks_data;	/* kstat type-specific data */
 	uint_t		ks_ndata;	/* # of type-specific data records */
 	size_t		ks_data_size;	/* total size of kstat data section */
 	hrtime_t	ks_snaptime;	/* time of last data snapshot */
 	/*
 	 * Fields relevant to kernel only
 	 */
 	int		(*ks_update)(struct kstat *, int); /* dynamic update */
 	void		*ks_private;	/* arbitrary provider-private data */
 	int		(*ks_snapshot)(struct kstat *, void *, int);
 	void		*ks_lock;	/* protects this kstat's data */
 } kstat_t;
 
 #ifdef _SYSCALL32
 
 typedef int32_t kid32_t;
 
 typedef struct kstat32 {
 	/*
 	 * Fields relevant to both kernel and user
 	 */
 	hrtime_t	ks_crtime;
 	caddr32_t	ks_next;		/* struct kstat pointer */
 	kid32_t		ks_kid;
 	char		ks_module[KSTAT_STRLEN];
 	uint8_t		ks_resv;
 	int32_t		ks_instance;
 	char		ks_name[KSTAT_STRLEN];
 	uint8_t		ks_type;
 	char		ks_class[KSTAT_STRLEN];
 	uint8_t		ks_flags;
 	caddr32_t	ks_data;		/* type-specific data */
 	uint32_t	ks_ndata;
 	size32_t	ks_data_size;
 	hrtime_t	ks_snaptime;
 	/*
 	 * Fields relevant to kernel only (only needed here for padding)
 	 */
 	int32_t		_ks_update;
 	caddr32_t	_ks_private;
 	int32_t		_ks_snapshot;
 	caddr32_t	_ks_lock;
 } kstat32_t;
 
 #endif	/* _SYSCALL32 */
 
 /*
  * kstat structure and locking strategy
  *
  * Each kstat consists of a header section (a kstat_t) and a data section.
  * The system maintains a set of kstats, protected by kstat_chain_lock.
  * kstat_chain_lock protects all additions to/deletions from this set,
  * as well as all changes to kstat headers.  kstat data sections are
  * *optionally* protected by the per-kstat ks_lock.  If ks_lock is non-NULL,
  * kstat clients (e.g. /dev/kstat) will acquire this lock for all of their
  * operations on that kstat.  It is up to the kstat provider to decide whether
  * guaranteeing consistent data to kstat clients is sufficiently important
  * to justify the locking cost.  Note, however, that most statistic updates
  * already occur under one of the provider's mutexes, so if the provider sets
  * ks_lock to point to that mutex, then kstat data locking is free.
  *
  * NOTE: variable-size kstats MUST employ kstat data locking, to prevent
  * data-size races with kstat clients.
  *
  * NOTE: ks_lock is really of type (kmutex_t *); it is declared as (void *)
  * in the kstat header so that users don't have to be exposed to all of the
  * kernel's lock-related data structures.
  */
 
 #if	defined(_KERNEL)
 
 /*
  * Acquire the kstat's optional data lock: if (k)->ks_lock is non-NULL it
  * is passed to mutex_enter(); otherwise nothing happens.  Expands to a
  * brace-enclosed block, not an expression.
  */
 #define	KSTAT_ENTER(k)	\
 	{ kmutex_t *lp = (k)->ks_lock; if (lp) mutex_enter(lp); }
 
 /*
  * Counterpart of KSTAT_ENTER(): calls mutex_exit() on (k)->ks_lock when it
  * is non-NULL.  Also expands to a brace-enclosed block.
  */
 #define	KSTAT_EXIT(k)	\
 	{ kmutex_t *lp = (k)->ks_lock; if (lp) mutex_exit(lp); }
 
 /* Call the kstat's ks_update callback with (k, rw); yields its result. */
 #define	KSTAT_UPDATE(k, rw)		(*(k)->ks_update)((k), (rw))
 
 /* Call the kstat's ks_snapshot callback with (k, buf, rw); yields its result. */
 #define	KSTAT_SNAPSHOT(k, buf, rw)	(*(k)->ks_snapshot)((k), (buf), (rw))
 
 #endif	/* defined(_KERNEL) */
 
 /*
  * kstat time
  *
  * All times associated with kstats (e.g. creation time, snapshot time,
  * kstat_timer_t and kstat_io_t timestamps, etc.) are 64-bit nanosecond values,
  * as returned by gethrtime().  The accuracy of these timestamps is machine
  * dependent, but the precision (units) is the same across all platforms.
  */
 
 /*
  * kstat identity (KID)
  *
  * Each kstat is assigned a unique KID (kstat ID) when it is added to the
  * global kstat chain.  The KID is used as a cookie by /dev/kstat to
  * request information about the corresponding kstat.  There is also
  * an identity associated with the entire kstat chain, kstat_chain_id,
  * which is bumped each time a kstat is added or deleted.  /dev/kstat uses
  * the chain ID to detect changes in the kstat chain (e.g., a new disk
  * coming online) between ioctl()s.
  */
 
 /*
  * kstat module, kstat instance
  *
  * ks_module and ks_instance contain the name and instance of the module
  * that created the kstat.  In cases where there can only be one instance,
  * ks_instance is 0.  The kernel proper (/kernel/unix) uses "unix" as its
  * module name.
  */
 
 /*
  * kstat name
  *
  * ks_name gives a meaningful name to a kstat.  The full kstat namespace
  * is module.instance.name, so the name only need be unique within a
  * module.  kstat_create() will fail if you try to create a kstat with
  * an already-used (ks_module, ks_instance, ks_name) triplet.  Spaces are
  * allowed in kstat names, but strongly discouraged, since they hinder
  * awk-style processing at user level.
  */
 
 /*
  * kstat type
  *
  * The kstat mechanism provides several flavors of kstat data, defined
  * below.  The "raw" kstat type is just treated as an array of bytes; you
  * can use this to export any kind of data you want.
  *
  * Some kstat types allow multiple data structures per kstat, e.g.
  * KSTAT_TYPE_NAMED; others do not.  This is part of the spec for each
  * kstat data type.
  *
  * User-level tools should *not* rely on the #define KSTAT_NUM_TYPES.  To
  * get this information, read out the standard system kstat "kstat_types".
  */
 
 #define	KSTAT_TYPE_RAW		0	/* can be anything */
 					/* ks_ndata >= 1 */
 #define	KSTAT_TYPE_NAMED	1	/* name/value pair */
 					/* ks_ndata >= 1 */
 #define	KSTAT_TYPE_INTR		2	/* interrupt statistics */
 					/* ks_ndata == 1 */
 #define	KSTAT_TYPE_IO		3	/* I/O statistics */
 					/* ks_ndata == 1 */
 #define	KSTAT_TYPE_TIMER	4	/* event timer */
 					/* ks_ndata >= 1 */
 
 #define	KSTAT_NUM_TYPES		5
 
 /*
  * kstat class
  *
  * Each kstat can be characterized as belonging to some broad class
  * of statistics, e.g. disk, tape, net, vm, streams, etc.  This field
  * can be used as a filter to extract related kstats.  The following
  * values are currently in use: disk, tape, net, controller, vm, kvm,
  * hat, streams, kstat, and misc.  (The kstat class encompasses things
  * like kstat_types.)
  */
 
 /*
  * kstat flags
  *
  * Any of the following flags may be passed to kstat_create().  They are
  * all zero by default.
  *
  *	KSTAT_FLAG_VIRTUAL:
  *
  *		Tells kstat_create() not to allocate memory for the
  *		kstat data section; instead, you will set the ks_data
  *		field to point to the data you wish to export.  This
  *		provides a convenient way to export existing data
  *		structures.
  *
  *	KSTAT_FLAG_VAR_SIZE:
  *
  *		The size of the kstat you are creating will vary over time.
  *		For example, you may want to use the kstat mechanism to
  *		export a linked list.  NOTE: The kstat framework does not
  *		manage the data section, so all variable-size kstats must be
  *		virtual kstats.  Moreover, variable-size kstats MUST employ
  *		kstat data locking to prevent data-size races with kstat
  *		clients.  See the section on "kstat snapshot" for details.
  *
  *	KSTAT_FLAG_WRITABLE:
  *
  *		Makes the kstat's data section writable by root.
  *		The ks_snapshot routine (see below) does not need to check for
  *		this; permission checking is handled in the kstat driver.
  *
  *	KSTAT_FLAG_PERSISTENT:
  *
  *		Indicates that this kstat is to be persistent over time.
  *		For persistent kstats, kstat_delete() simply marks the
  *		kstat as dormant; a subsequent kstat_create() reactivates
  *		the kstat.  This feature is provided so that statistics
  *		are not lost across driver close/open (e.g., raw disk I/O
  *		on a disk with no mounted partitions.)
  *		NOTE: Persistent kstats cannot be virtual, since ks_data
  *		points to garbage as soon as the driver goes away.
  *
  * The following flags are maintained by the kstat framework:
  *
  *	KSTAT_FLAG_DORMANT:
  *
  *		For persistent kstats, indicates that the kstat is in the
  *		dormant state (e.g., the corresponding device is closed).
  *
  *	KSTAT_FLAG_INVALID:
  *
  *		This flag is set when a kstat is in a transitional state,
  *		e.g. between kstat_create() and kstat_install().
  *		kstat clients must not attempt to access the kstat's data
  *		if this flag is set.
  */
 
 /* Flags that may be passed to kstat_create(). */
 #define	KSTAT_FLAG_VIRTUAL		0x01
 #define	KSTAT_FLAG_VAR_SIZE		0x02
 #define	KSTAT_FLAG_WRITABLE		0x04
 #define	KSTAT_FLAG_PERSISTENT		0x08
 /* Flags maintained by the kstat framework. */
 #define	KSTAT_FLAG_DORMANT		0x10
 #define	KSTAT_FLAG_INVALID		0x20
 /*
  * NOTE(review): the two flags below are not covered by the "kstat flags"
  * comment above; their semantics should be documented there.
  */
 #define	KSTAT_FLAG_LONGSTRINGS		0x40
 #define	KSTAT_FLAG_NO_HEADERS		0x80
 
 /*
  * Dynamic update support
  *
  * The kstat mechanism allows for an optional ks_update function to update
  * kstat data.  This is useful for drivers where the underlying device
  * keeps cheap hardware stats, but extraction is expensive.  Instead of
  * constantly keeping the kstat data section up to date, you can supply a
  * ks_update function which updates the kstat's data section on demand.
  * To take advantage of this feature, simply set the ks_update field before
  * calling kstat_install().
  *
  * The ks_update function, if supplied, must have the following structure:
  *
  *	int
  *	foo_kstat_update(kstat_t *ksp, int rw)
  *	{
  *		if (rw == KSTAT_WRITE) {
  *			... update the native stats from ksp->ks_data;
  *				return EACCES if you don't support this
  *		} else {
  *			... update ksp->ks_data from the native stats
  *		}
  *		return (0);
  *	}
  *
  * The ks_update return codes are: 0 for success, EACCES if you don't allow
  * KSTAT_WRITE, and EIO for any other type of error.
  *
  * In general, the ks_update function may need to refer to provider-private
  * data; for example, it may need a pointer to the provider's raw statistics.
  * The ks_private field is available for this purpose.  Its use is entirely
  * at the provider's discretion.
  *
  * All variable-size kstats MUST supply a ks_update routine, which computes
  * and sets ks_data_size (and ks_ndata if that is meaningful), since these
  * are needed to perform kstat snapshots (see below).
  *
  * No kstat locking should be done inside the ks_update routine.  The caller
  * will already be holding the kstat's ks_lock (to ensure consistent data).
  */
 
 /*
  * Values of the "rw" argument of the ks_update and ks_snapshot routines:
  * KSTAT_READ moves data from the kstat to the caller, KSTAT_WRITE moves
  * data from the caller into the kstat.
  */
 #define	KSTAT_READ	0
 #define	KSTAT_WRITE	1
 
 /*
  * Kstat snapshot
  *
  * In order to get a consistent view of a kstat's data, clients must obey
  * the kstat's locking strategy.  However, these clients may need to perform
  * operations on the data which could cause a fault (e.g. copyout()), or
  * operations which are simply expensive.  Doing so could cause deadlock
  * (e.g. if you're holding a disk's kstat lock which is ultimately required
  * to resolve a copyout() fault), performance degradation (since the providers'
  * activity is serialized at the kstat lock), device timing problems, etc.
  *
  * To avoid these problems, kstat data is provided via snapshots.  Taking
  * a snapshot is a simple process: allocate a wired-down kernel buffer,
  * acquire the kstat's data lock, copy the data into the buffer ("take the
  * snapshot"), and release the lock.  This ensures that the kstat's data lock
  * will be held as briefly as possible, and that no faults will occur while
  * the lock is held.
  *
  * Normally, the snapshot is taken by default_kstat_snapshot(), which
  * timestamps the data (sets ks_snaptime), copies it, and does a little
  * massaging to deal with incomplete transactions on i/o kstats.  However,
  * this routine only works for kstats with contiguous data (the typical case).
  * If you create a kstat whose data is, say, a linked list, you must provide
  * your own ks_snapshot routine.  The routine you supply must have the
  * following prototype (replace "foo" with something appropriate):
  *
  *	int foo_kstat_snapshot(kstat_t *ksp, void *buf, int rw);
  *
  * The minimal snapshot routine -- one which copies contiguous data that
  * doesn't need any massaging -- would be this:
  *
  *	ksp->ks_snaptime = gethrtime();
  *	if (rw == KSTAT_WRITE)
  *		bcopy(buf, ksp->ks_data, ksp->ks_data_size);
  *	else
  *		bcopy(ksp->ks_data, buf, ksp->ks_data_size);
  *	return (0);
  *
  * A more illuminating example is taking a snapshot of a linked list:
  *
  *	ksp->ks_snaptime = gethrtime();
  *	if (rw == KSTAT_WRITE)
  *		return (EACCES);		... See below ...
  *	for (foo = first_foo; foo; foo = foo->next) {
  *		bcopy((char *) foo, (char *) buf, sizeof (struct foo));
  *		buf = ((struct foo *) buf) + 1;
  *	}
  *	return (0);
  *
  * In the example above, we have decided that we don't want to allow
  * KSTAT_WRITE access, so we return EACCES if this is attempted.
  *
  * The key points are:
  *
  *	(1) ks_snaptime must be set (via gethrtime()) to timestamp the data.
  *	(2) Data gets copied from the kstat to the buffer on KSTAT_READ,
  *		and from the buffer to the kstat on KSTAT_WRITE.
  *	(3) ks_snapshot return values are: 0 for success, EACCES if you
  *		don't allow KSTAT_WRITE, and EIO for any other type of error.
  *
  * Named kstats (see section on "Named statistics" below) containing long
  * strings (KSTAT_DATA_STRING) need special handling.  The kstat driver
  * assumes that all strings are copied into the buffer after the array of
  * named kstats, and the pointers (KSTAT_NAMED_STR_PTR()) are updated to point
  * into the copy within the buffer. The default snapshot routine does this,
  * but overriding routines should contain at least the following:
  *
  * if (rw == KSTAT_READ) {
  * 	kstat_named_t *knp = buf;
  * 	char *end = (char *)(knp + ksp->ks_ndata);
  * 	uint_t i;
  *
  * 	... Do the regular copy ...
  * 	bcopy(ksp->ks_data, buf, sizeof (kstat_named_t) * ksp->ks_ndata);
  *
  * 	for (i = 0; i < ksp->ks_ndata; i++, knp++) {
  *		if (knp->data_type == KSTAT_DATA_STRING &&
  *		    KSTAT_NAMED_STR_PTR(knp) != NULL) {
  *			bcopy(KSTAT_NAMED_STR_PTR(knp), end,
  *			    KSTAT_NAMED_STR_BUFLEN(knp));
  *			KSTAT_NAMED_STR_PTR(knp) = end;
  *			end += KSTAT_NAMED_STR_BUFLEN(knp);
  *		}
  *	}
  * }
  */
 
 /*
  * Named statistics.
  *
  * List of arbitrary name=value statistics.
  */
 
 /*
  * A single named statistic: a fixed-size name, a data_type tag holding one
  * of the KSTAT_DATA_* values, and a union holding the value itself.
  */
 typedef struct kstat_named {
 	char	name[KSTAT_STRLEN];	/* name of counter */
 	uchar_t	data_type;		/* data type */
 	union {
 		char		c[16];	/* enough for 128-bit ints */
 		int32_t		i32;
 		uint32_t	ui32;
 		/*
 		 * String value; accessed through KSTAT_NAMED_STR_PTR()
 		 * and KSTAT_NAMED_STR_BUFLEN().
 		 */
 		struct {
 			union {
 				char 		*ptr;	/* NULL-term string */
 #if defined(_KERNEL) && defined(_MULTI_DATAMODEL)
 				caddr32_t	ptr32;
 #endif
 				char 		__pad[8]; /* 64-bit padding */
 			} addr;
 			uint32_t	len;	/* # bytes for strlen + '\0' */
 		} str;
 /*
  * The int64_t and uint64_t types are not valid for a maximally conformant
  * 32-bit compilation environment (cc -Xc) using compilers prior to the
  * introduction of C99 conforming compiler (reference ISO/IEC 9899:1990).
  * In these cases, the visibility of i64 and ui64 is only permitted for
  * 64-bit compilation environments or 32-bit non-maximally conformant
  * C89 or C90 ANSI C compilation environments (cc -Xt and cc -Xa). In the
  * C99 ANSI C compilation environment, the long long type is supported.
  * The _INT64_TYPE is defined by the implementation (see sys/int_types.h).
  */
 #if defined(_INT64_TYPE)
 		int64_t		i64;
 		uint64_t	ui64;
 #endif
 		long		l;
 		ulong_t		ul;
 
 		/* These structure members are obsolete */
 
 		longlong_t	ll;
 		u_longlong_t	ull;
 		float		f;
 		double		d;
 	} value;			/* value of counter */
 } kstat_named_t;
 
 /* Values stored in the data_type member of kstat_named_t. */
 #define	KSTAT_DATA_CHAR		0
 #define	KSTAT_DATA_INT32	1
 #define	KSTAT_DATA_UINT32	2
 #define	KSTAT_DATA_INT64	3
 #define	KSTAT_DATA_UINT64	4
 
 /*
  * KSTAT_DATA_LONG/ULONG: without _LP64 these alias the 32-bit types.  With
  * _LP64 they alias the 64-bit types outside the kernel, while the kernel
  * uses the distinct values 7 and 8.
  */
 #if !defined(_LP64)
 #define	KSTAT_DATA_LONG		KSTAT_DATA_INT32
 #define	KSTAT_DATA_ULONG	KSTAT_DATA_UINT32
 #else
 #if !defined(_KERNEL)
 #define	KSTAT_DATA_LONG		KSTAT_DATA_INT64
 #define	KSTAT_DATA_ULONG	KSTAT_DATA_UINT64
 #else
 #define	KSTAT_DATA_LONG		7	/* only visible to the kernel */
 #define	KSTAT_DATA_ULONG	8	/* only visible to the kernel */
 #endif	/* !_KERNEL */
 #endif	/* !_LP64 */
 
 /*
  * Statistics exporting named kstats with long strings (KSTAT_DATA_STRING)
  * may not make the assumption that ks_data_size is equal to (ks_ndata * sizeof
  * (kstat_named_t)).  ks_data_size in these cases is equal to the sum of the
  * amount of space required to store the strings (ie, the sum of
  * KSTAT_NAMED_STR_BUFLEN() for all KSTAT_DATA_STRING statistics) plus the
  * space required to store the kstat_named_t's.
  *
  * The default update routine will update ks_data_size automatically for
  * variable-length kstats containing long strings (using the default update
  * routine only makes sense if the string is the only thing that is changing
  * in size, and ks_ndata is constant).  Fixed-length kstats containing long
  * strings must explicitly change ks_data_size (after creation but before
  * initialization) to reflect the correct amount of space required for the
  * long strings and the kstat_named_t's.
  */
 #define	KSTAT_DATA_STRING	9	/* long string, held in value.str */
 
 /* These types are obsolete */
 
 #define	KSTAT_DATA_LONGLONG	KSTAT_DATA_INT64
 #define	KSTAT_DATA_ULONGLONG	KSTAT_DATA_UINT64
 #define	KSTAT_DATA_FLOAT	5
 #define	KSTAT_DATA_DOUBLE	6
 
 /* View a kstat's ks_data as a kstat_named_t pointer (plain cast). */
 #define	KSTAT_NAMED_PTR(kptr)	((kstat_named_t *)(kptr)->ks_data)
 
 /*
  * Retrieve the pointer to the string contained in the given named kstat
  * (the value.str.addr.ptr member).  The expansion is an lvalue, so it can
  * also be assigned to.
  */
 #define	KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.str.addr.ptr)
 
 /*
  * Retrieve the length of the buffer required to store the string in the given
  * named kstat (the value.str.len member).
  */
 #define	KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.str.len)
 
 /*
  * Interrupt statistics.
  *
  * An interrupt is a hard interrupt (sourced from the hardware device
  * itself), a soft interrupt (induced by the system via the use of
  * some system interrupt source), a watchdog interrupt (induced by
  * a periodic timer call), spurious (an interrupt entry point was
  * entered but there was no interrupt condition to service),
  * or multiple service (an interrupt condition was detected and
  * serviced just prior to returning from any of the other types).
  *
  * Measurement of the spurious class of interrupts is useful for
  * autovectored devices in order to pinpoint any interrupt latency
  * problems in a particular system configuration.
  *
  * Devices that have more than one interrupt of the same
  * type should use multiple structures.
  */
 
 #define	KSTAT_INTR_HARD			0	/* hard interrupt */
 #define	KSTAT_INTR_SOFT			1	/* soft interrupt */
 #define	KSTAT_INTR_WATCHDOG		2	/* watchdog interrupt */
 #define	KSTAT_INTR_SPURIOUS		3	/* spurious interrupt */
 #define	KSTAT_INTR_MULTSVC		4	/* multiple service */
 
 /* Number of KSTAT_INTR_* classes defined above. */
 #define	KSTAT_NUM_INTRS			5
 
 /* Interrupt counters: one array slot per KSTAT_INTR_* class. */
 typedef struct kstat_intr {
 	uint_t	intrs[KSTAT_NUM_INTRS];	/* interrupt counters */
 } kstat_intr_t;
 
 /* View a kstat's ks_data as a kstat_intr_t pointer (plain cast). */
 #define	KSTAT_INTR_PTR(kptr)	((kstat_intr_t *)(kptr)->ks_data)
 
 /*
  * I/O statistics.
  */
 
 typedef struct kstat_io {
 
 	/*
 	 * Basic counters.
 	 *
 	 * The counters should be updated at the end of service
 	 * (e.g., just prior to calling biodone()).
 	 */
 
 	u_longlong_t	nread;		/* number of bytes read */
 	u_longlong_t	nwritten;	/* number of bytes written */
 	uint_t		reads;		/* number of read operations */
 	uint_t		writes;		/* number of write operations */
 
 	/*
 	 * Accumulated time and queue length statistics.
 	 *
 	 * Accumulated time statistics are kept as a running sum
 	 * of "active" time.  Queue length statistics are kept as a
 	 * running sum of the product of queue length and elapsed time
 	 * at that length -- i.e., a Riemann sum for queue length
 	 * integrated against time.  (You can also think of the active time
 	 * as a Riemann sum, for the boolean function (queue_length > 0)
 	 * integrated against time, or you can think of it as the
 	 * Lebesgue measure of the set on which queue_length > 0.)
 	 *
 	 *		^
 	 *		|			_________
 	 *		8			| i4	|
 	 *		|			|	|
 	 *	Queue	6			|	|
 	 *	Length	|	_________	|	|
 	 *		4	| i2	|_______|	|
 	 *		|	|	    i3		|
 	 *		2_______|			|
 	 *		|    i1				|
 	 *		|_______________________________|
 	 *		Time->	t1	t2	t3	t4
 	 *
 	 * At each change of state (entry or exit from the queue),
 	 * we add the elapsed time (since the previous state change)
 	 * to the active time if the queue length was non-zero during
 	 * that interval; and we add the product of the elapsed time
 	 * times the queue length to the running length*time sum.
 	 *
 	 * This method is generalizable to measuring residency
 	 * in any defined system: instead of queue lengths, think
 	 * of "outstanding RPC calls to server X".
 	 *
 	 * A large number of I/O subsystems have at least two basic
 	 * "lists" of transactions they manage: one for transactions
 	 * that have been accepted for processing but for which processing
 	 * has yet to begin, and one for transactions which are actively
 	 * being processed (but not done). For this reason, two cumulative
 	 * time statistics are defined here: wait (pre-service) time,
 	 * and run (service) time.
 	 *
 	 * All times are 64-bit nanoseconds (hrtime_t), as returned by
 	 * gethrtime().
 	 *
 	 * The units of cumulative busy time are accumulated nanoseconds.
 	 * The units of cumulative length*time products are elapsed time
 	 * times queue length.
 	 *
 	 * Updates to the fields below are performed implicitly by calls to
 	 * these six functions:
 	 *
 	 *	kstat_waitq_enter()
 	 *	kstat_waitq_exit()
 	 *	kstat_runq_enter()
 	 *	kstat_runq_exit()
 	 *
 	 *	kstat_waitq_to_runq()		(see below)
 	 *	kstat_runq_back_to_waitq()	(see below)
 	 *
 	 * Since kstat_waitq_exit() is typically followed immediately
 	 * by kstat_runq_enter(), there is a single kstat_waitq_to_runq()
 	 * function which performs both operations.  This is a performance
 	 * win since only one timestamp is required.
 	 *
 	 * In some instances, it may be necessary to move a request from
 	 * the run queue back to the wait queue, e.g. for write throttling.
 	 * For these situations, call kstat_runq_back_to_waitq().
 	 *
 	 * These fields should never be updated by any other means.
 	 */
 
 	hrtime_t wtime;		/* cumulative wait (pre-service) time */
 	hrtime_t wlentime;	/* cumulative wait length*time product */
 	hrtime_t wlastupdate;	/* last time wait queue changed */
 	hrtime_t rtime;		/* cumulative run (service) time */
 	hrtime_t rlentime;	/* cumulative run length*time product */
 	hrtime_t rlastupdate;	/* last time run queue changed */
 
 	uint_t	wcnt;		/* count of elements in wait state */
 	uint_t	rcnt;		/* count of elements in run state */
 
 } kstat_io_t;
 
 /* View a kstat's ks_data as a kstat_io_t pointer (plain cast). */
 #define	KSTAT_IO_PTR(kptr)	((kstat_io_t *)(kptr)->ks_data)
 
 /*
  * Event timer statistics - cumulative elapsed time and number of events.
  *
  * Updates to these fields are performed implicitly by calls to
  * kstat_timer_start() and kstat_timer_stop().
  */
 
 /* Per-event timer record: event count plus cumulative and extreme times. */
 typedef struct kstat_timer {
 	char		name[KSTAT_STRLEN];	/* event name */
 	uchar_t		resv;			/* reserved */
 	u_longlong_t	num_events;		/* number of events */
 	hrtime_t	elapsed_time;		/* cumulative elapsed time */
 	hrtime_t	min_time;		/* shortest event duration */
 	hrtime_t	max_time;		/* longest event duration */
 	hrtime_t	start_time;		/* previous event start time */
 	hrtime_t	stop_time;		/* previous event stop time */
 } kstat_timer_t;
 
 /* View a kstat's ks_data as a kstat_timer_t pointer (plain cast). */
 #define	KSTAT_TIMER_PTR(kptr)	((kstat_timer_t *)(kptr)->ks_data)
 
 #if	defined(_KERNEL)
 
 #include <sys/t_lock.h>
 
 extern kid_t	kstat_chain_id;		/* bumped at each state change */
 extern void	kstat_init(void);	/* initialize kstat framework */
 
 /*
  * Adding and deleting kstats.
  *
  * The typical sequence to add a kstat is:
  *
  *	ksp = kstat_create(module, instance, name, class, type, ndata, flags);
  *	if (ksp) {
  *		... provider initialization, if necessary
  *		kstat_install(ksp);
  *	}
  *
  * There are three logically distinct steps here:
  *
  * Step 1: System Initialization (kstat_create)
  *
  * kstat_create() performs system initialization.  kstat_create()
  * allocates memory for the entire kstat (header plus data), initializes
  * all header fields, initializes the data section to all zeroes, assigns
  * a unique KID, and puts the kstat onto the system's kstat chain.
  * The returned kstat is marked invalid (KSTAT_FLAG_INVALID is set),
  * because the provider (caller) has not yet had a chance to initialize
  * the data section.
  *
  * By default, kstats are exported to all zones on the system.  A kstat may be
  * created via kstat_create_zone() to specify a zone to which the statistics
  * should be exported.  kstat_zone_add() may be used to specify additional
  * zones to which the statistics are to be exported.
  *
  * Step 2: Provider Initialization
  *
  * The provider performs any necessary initialization of the data section,
  * e.g. setting the name fields in a KSTAT_TYPE_NAMED.  Virtual kstats set
  * the ks_data field at this time.  The provider may also set the ks_update,
  * ks_snapshot, ks_private, and ks_lock fields if necessary.
  *
  * Step 3: Installation (kstat_install)
  *
  * Once the kstat is completely initialized, kstat_install() clears the
  * INVALID flag, thus making the kstat accessible to the outside world.
  * kstat_install() also clears the DORMANT flag for persistent kstats.
  *
  * Removing a kstat from the system
  *
  * kstat_delete(ksp) removes ksp from the kstat chain and frees all
  * associated system resources.  NOTE: When you call kstat_delete(),
  * you must NOT be holding that kstat's ks_lock.  Otherwise, you may
  * deadlock with a kstat reader.
  *
  * Persistent kstats
  *
  * From the provider's point of view, persistence is transparent.  The only
  * difference between ephemeral (normal) kstats and persistent kstats
  * is that you pass KSTAT_FLAG_PERSISTENT to kstat_create().  Magically,
  * this has the effect of making your data visible even when you're
  * not home.  Persistence is important to tools like iostat, which want
  * to get a meaningful picture of disk activity.  Without persistence,
  * raw disk i/o statistics could never accumulate: they would come and
  * go with each open/close of the raw device.
  *
  * The magic of persistence works by slightly altering the behavior of
  * kstat_create() and kstat_delete().  The first call to kstat_create()
  * creates a new kstat, as usual.  However, kstat_delete() does not
  * actually delete the kstat: it performs one final update of the data
  * (i.e., calls the ks_update routine), marks the kstat as dormant, and
  * sets the ks_lock, ks_update, ks_private, and ks_snapshot fields back
  * to their default values (since they might otherwise point to garbage,
  * e.g. if the provider is going away).  kstat clients can still access
  * the dormant kstat just like a live kstat; they just continue to see
  * the final data values as long as the kstat remains dormant.
  * All subsequent kstat_create() calls simply find the already-existing,
  * dormant kstat and return a pointer to it, without altering any fields.
  * The provider then performs its usual initialization sequence, and
  * calls kstat_install().  kstat_install() uses the old data values to
  * initialize the native data (i.e., ks_update is called with KSTAT_WRITE),
  * thus making it seem like you were never gone.
  */
 
 /*
  * Creation, installation and removal of kstats; the life cycle is described
  * in "Adding and deleting kstats" above.  The kstat_create() arguments follow
  * the example given there: module, instance, name, class, type, ndata, flags.
  * kstat_create_zone() takes an additional zoneid_t.
  *
  * NOTE(review): the setstr/set_string/delete_byname/named_init/timer_init
  * routines are not described in this header; see their implementations.
  */
 extern kstat_t *kstat_create(const char *, int, const char *, const char *,
     uchar_t, uint_t, uchar_t);
 extern kstat_t *kstat_create_zone(const char *, int, const char *,
     const char *, uchar_t, uint_t, uchar_t, zoneid_t);
 extern void kstat_install(kstat_t *);
 extern void kstat_delete(kstat_t *);
 extern void kstat_named_setstr(kstat_named_t *knp, const char *src);
 extern void kstat_set_string(char *, const char *);
 extern void kstat_delete_byname(const char *, int, const char *);
 extern void kstat_delete_byname_zone(const char *, int, const char *, zoneid_t);
 extern void kstat_named_init(kstat_named_t *, const char *, uchar_t);
 extern void kstat_timer_init(kstat_timer_t *, const char *);
-extern void kstat_waitq_enter(kstat_io_t *);
-extern void kstat_waitq_exit(kstat_io_t *);
-extern void kstat_runq_enter(kstat_io_t *);
-extern void kstat_runq_exit(kstat_io_t *);
-extern void kstat_waitq_to_runq(kstat_io_t *);
-extern void kstat_runq_back_to_waitq(kstat_io_t *);
 /* Event timer updates; see "Event timer statistics" above. */
 extern void kstat_timer_start(kstat_timer_t *);
 extern void kstat_timer_stop(kstat_timer_t *);
 
 /*
  * Zone visibility.  kstat_zone_add() exports a kstat to an additional zone
  * (see Step 1 of "Adding and deleting kstats" above).
  * NOTE(review): kstat_zone_remove() and kstat_zone_find() are not described
  * in this header -- presumably the inverse operation and a lookup; confirm.
  */
 extern void kstat_zone_add(kstat_t *, zoneid_t);
 extern void kstat_zone_remove(kstat_t *, zoneid_t);
 extern int kstat_zone_find(kstat_t *, zoneid_t);
 
 /*
  * NOTE(review): hold/release routines; their reference-counting contract is
  * not described in this header -- confirm against the implementation.
  */
 extern kstat_t *kstat_hold_bykid(kid_t kid, zoneid_t);
 extern kstat_t *kstat_hold_byname(const char *, int, const char *, zoneid_t);
 extern void kstat_rele(kstat_t *);
 
 #endif	/* defined(_KERNEL) */
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_KSTAT_H */
diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c
index e96a1d7521d9..cc8e534e7eb5 100644
--- a/lib/libzpool/kernel.c
+++ b/lib/libzpool/kernel.c
@@ -1,1417 +1,1387 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  * Copyright (c) 2016 Actifio, Inc. All rights reserved.
  */
 
 #include <assert.h>
 #include <fcntl.h>
 #include <libgen.h>
 #include <poll.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
 #include <sys/crypto/icp.h>
 #include <sys/processor.h>
 #include <sys/rrwlock.h>
 #include <sys/spa.h>
 #include <sys/stat.h>
 #include <sys/systeminfo.h>
 #include <sys/time.h>
 #include <sys/utsname.h>
 #include <sys/zfs_context.h>
 #include <sys/zfs_onexit.h>
 #include <sys/zfs_vfsops.h>
 #include <sys/zstd/zstd.h>
 #include <sys/zvol.h>
 #include <zfs_fletcher.h>
 #include <zlib.h>
 
 /*
  * Emulation of kernel services in userland.
  */
 
 /* NOTE(review): presumably the emulated physical memory size -- confirm. */
 uint64_t physmem;
 /* Host ID as a decimal string; parsed by zone_get_hostid(). */
 char hw_serial[HW_HOSTID_LEN];
 /* NOTE(review): presumably filled from uname(2) during init -- confirm. */
 struct utsname hw_utsname;
 
 /* If set, all blocks read will be copied to the specified directory. */
 char *vn_dumpdir = NULL;
 
 /* this only exists to have its address taken */
 struct proc p0;
 
 /*
  * =========================================================================
  * threads
  * =========================================================================
  *
  * TS_STACK_MIN is dictated by the minimum allowed pthread stack size.  While
  * TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for
  * the expected stack depth while small enough to avoid exhausting address
  * space with high thread counts.
  */
 #define	TS_STACK_MIN	MAX(PTHREAD_STACK_MIN, 32768)	/* >= 32 KiB */
 #define	TS_STACK_MAX	(256 * 1024)			/* 256 KiB */
 
 /*
  * Start a pthread running func(arg) and return its id disguised as a
  * kthread_t pointer.  The thread is detached unless TS_JOINABLE is set in
  * state.  A stksize of 0 selects the default stack size (see below); the
  * final size is never below TS_STACK_MIN and is rounded up to PAGESIZE.
  * Any pthread failure trips a VERIFY.
  */
 /*ARGSUSED*/
 kthread_t *
 zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state)
 {
 	pthread_attr_t attr;
 	pthread_t tid;
 	int detachstate;
 
 	detachstate = (state & TS_JOINABLE) ?
 	    PTHREAD_CREATE_JOINABLE : PTHREAD_CREATE_DETACHED;
 
 	VERIFY0(pthread_attr_init(&attr));
 	VERIFY0(pthread_attr_setdetachstate(&attr, detachstate));
 
 	/*
 	 * With no explicit size, the ZFS_STACK_SIZE environment variable
 	 * provides the default, which makes it convenient to observe and
 	 * debug stack overruns in user space.  Explicitly requested sizes
 	 * are honored as given.  ZFS_STACK_SIZE is discussed further in the
 	 * ENVIRONMENT VARIABLES sections of the ztest(1) man page.
 	 */
 	if (stksize == 0) {
 		char *env_size = getenv("ZFS_STACK_SIZE");
 
 		if (env_size != NULL)
 			stksize = MAX(atoi(env_size), TS_STACK_MIN);
 		else
 			stksize = TS_STACK_MAX;
 	}
 
 	VERIFY3S(stksize, >, 0);
 	stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE);
 
 	/*
 	 * Should setting the stack size ever fail, a size that is not a
 	 * multiple of the system page size is a likely cause.
 	 */
 	VERIFY0(pthread_attr_setstacksize(&attr, stksize));
 	VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE));
 
 	VERIFY0(pthread_create(&tid, &attr, (void *(*)(void *))func, arg));
 	VERIFY0(pthread_attr_destroy(&attr));
 
 	return ((void *)(uintptr_t)tid);
 }
 
 /*
  * =========================================================================
  * kstats
  * =========================================================================
  */
 /*
  * Userland stand-in for kstat_create(): nothing is allocated, every
  * argument is ignored, and the function always returns NULL.
  */
 /*ARGSUSED*/
 kstat_t *
 kstat_create(const char *module, int instance, const char *name,
     const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag)
 {
 	return (NULL);
 }
 
 /* Userland stand-in for kstat_install(): does nothing. */
 /*ARGSUSED*/
 void
 kstat_install(kstat_t *ksp)
 {}
 
 /* Userland stand-in for kstat_delete(): does nothing. */
 /*ARGSUSED*/
 void
 kstat_delete(kstat_t *ksp)
 {}
 
-/*ARGSUSED*/
-void
-kstat_waitq_enter(kstat_io_t *kiop)
-{}
-
-/*ARGSUSED*/
-void
-kstat_waitq_exit(kstat_io_t *kiop)
-{}
-
-/*ARGSUSED*/
-void
-kstat_runq_enter(kstat_io_t *kiop)
-{}
-
-/*ARGSUSED*/
-void
-kstat_runq_exit(kstat_io_t *kiop)
-{}
-
-/*ARGSUSED*/
-void
-kstat_waitq_to_runq(kstat_io_t *kiop)
-{}
-
-/*ARGSUSED*/
-void
-kstat_runq_back_to_waitq(kstat_io_t *kiop)
-{}
-
 /*
  * Userland stand-in for kstat_set_raw_ops(): the callbacks are accepted
  * but never stored or invoked.
  */
 void
 kstat_set_raw_ops(kstat_t *ksp,
     int (*headers)(char *buf, size_t size),
     int (*data)(char *buf, size_t size, void *data),
     void *(*addr)(kstat_t *ksp, loff_t index))
 {}
 
 /*
  * =========================================================================
  * mutexes
  * =========================================================================
  */
 
 /*
  * Initialize mp: create the underlying pthread mutex with default
  * attributes and clear the recorded owner.  name, type and cookie are
  * not used.
  */
 void
 mutex_init(kmutex_t *mp, char *name, int type, void *cookie)
 {
 	VERIFY0(pthread_mutex_init(&mp->m_lock, NULL));
 	memset(&mp->m_owner, 0, sizeof (pthread_t));
 }
 
 /* Destroy the pthread mutex inside mp; failure trips a VERIFY. */
 void
 mutex_destroy(kmutex_t *mp)
 {
 	VERIFY0(pthread_mutex_destroy(&mp->m_lock));
 }
 
 /* Block until mp is acquired, then record the calling thread as owner. */
 void
 mutex_enter(kmutex_t *mp)
 {
 	VERIFY0(pthread_mutex_lock(&mp->m_lock));
 	mp->m_owner = pthread_self();
 }
 
 /*
  * Try to acquire mp without blocking.  Returns 1 and records the calling
  * thread as owner on success, 0 if the mutex is busy.  Any failure other
  * than EBUSY trips a VERIFY.
  */
 int
 mutex_tryenter(kmutex_t *mp)
 {
 	int error = pthread_mutex_trylock(&mp->m_lock);
 
 	if (error != 0) {
 		/* The only tolerated failure is "already locked". */
 		VERIFY3S(error, ==, EBUSY);
 		return (0);
 	}
 
 	mp->m_owner = pthread_self();
 	return (1);
 }
 
 /* Clear the recorded owner, then release the underlying pthread mutex. */
 void
 mutex_exit(kmutex_t *mp)
 {
 	memset(&mp->m_owner, 0, sizeof (pthread_t));
 	VERIFY0(pthread_mutex_unlock(&mp->m_lock));
 }
 
 /*
  * =========================================================================
  * rwlocks
  * =========================================================================
  */
 
 /*
  * Initialize rwlp: create the pthread rwlock with default attributes and
  * reset the reader count and writer owner.  name, type and arg are not
  * used.
  */
 void
 rw_init(krwlock_t *rwlp, char *name, int type, void *arg)
 {
 	VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL));
 	rwlp->rw_readers = 0;
 	rwlp->rw_owner = 0;
 }
 
 /* Destroy the pthread rwlock inside rwlp; failure trips a VERIFY. */
 void
 rw_destroy(krwlock_t *rwlp)
 {
 	VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock));
 }
 
 /*
  * Acquire rwlp, blocking as needed.  RW_READER takes the lock shared and
  * bumps the reader count; any other mode takes it exclusively and records
  * the calling thread as the writer.
  */
 void
 rw_enter(krwlock_t *rwlp, krw_t rw)
 {
 	if (rw != RW_READER) {
 		VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock));
 		rwlp->rw_owner = pthread_self();
 		return;
 	}
 
 	VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock));
 	atomic_inc_uint(&rwlp->rw_readers);
 }
 
 /*
  * Release rwlp.  The bookkeeping is undone before unlocking: a reader
  * (per RW_READ_HELD) drops the reader count, otherwise the writer owner
  * is cleared.
  */
 void
 rw_exit(krwlock_t *rwlp)
 {
 	if (RW_READ_HELD(rwlp))
 		atomic_dec_uint(&rwlp->rw_readers);
 	else
 		rwlp->rw_owner = 0;
 
 	VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock));
 }
 
 /*
  * Try to acquire rwlp in the requested mode without blocking.  Returns 1
  * on success (after updating the reader count or writer owner) and 0 when
  * the lock is busy.  Any failure other than EBUSY trips a VERIFY.
  */
 int
 rw_tryenter(krwlock_t *rwlp, krw_t rw)
 {
 	int error;
 
 	if (rw == RW_READER) {
 		error = pthread_rwlock_tryrdlock(&rwlp->rw_lock);
 		if (error == 0) {
 			atomic_inc_uint(&rwlp->rw_readers);
 			return (1);
 		}
 	} else {
 		error = pthread_rwlock_trywrlock(&rwlp->rw_lock);
 		if (error == 0) {
 			rwlp->rw_owner = pthread_self();
 			return (1);
 		}
 	}
 
 	/* Not acquired: the only tolerated reason is contention. */
 	VERIFY3S(error, ==, EBUSY);
 
 	return (0);
 }
 
 /*
  * Return the emulated host ID by parsing hw_serial as a base-10 number.
  * zonep is ignored.
  */
 /* ARGSUSED */
 uint32_t
 zone_get_hostid(void *zonep)
 {
 	/*
 	 * We're emulating the system's hostid in userland.
 	 */
 	return (strtoul(hw_serial, NULL, 10));
 }
 
 /*
  * Always returns 0 without touching rwlp, i.e. this implementation never
  * performs an upgrade.
  */
 int
 rw_tryupgrade(krwlock_t *rwlp)
 {
 	return (0);
 }
 
 /*
  * =========================================================================
  * condition variables
  * =========================================================================
  */
 
 /*
  * Initialize cv as a pthread condition variable with default attributes.
  * name, type and arg are not used.
  */
 void
 cv_init(kcondvar_t *cv, char *name, int type, void *arg)
 {
 	VERIFY0(pthread_cond_init(cv, NULL));
 }
 
 /* Destroy the pthread condition variable cv; failure trips a VERIFY. */
 void
 cv_destroy(kcondvar_t *cv)
 {
 	VERIFY0(pthread_cond_destroy(cv));
 }
 
 /*
  * Wait on cv with mp's pthread mutex.  The recorded owner is cleared before
  * blocking and set back to the calling thread once the wait returns.
  */
 void
 cv_wait(kcondvar_t *cv, kmutex_t *mp)
 {
 	memset(&mp->m_owner, 0, sizeof (pthread_t));
 	VERIFY0(pthread_cond_wait(cv, &mp->m_lock));
 	mp->m_owner = pthread_self();
 }
 
 /* Same as cv_wait(); always returns 1. */
 int
 cv_wait_sig(kcondvar_t *cv, kmutex_t *mp)
 {
 	cv_wait(cv, mp);
 	return (1);
 }
 
 /*
  * Wait on cv until signalled or until the lbolt value abstime is reached.
  * Returns -1 if the deadline has already passed or the wait timed out, and
  * 1 if the wait ended for any other reason.  Errors other than ETIMEDOUT
  * trip a VERIFY.
  */
 int
 cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime)
 {
 	struct timeval tv;
 	struct timespec ts;
 	clock_t delta = abstime - ddi_get_lbolt();
 	int error;
 
 	/* Deadline already reached: report a timeout without waiting. */
 	if (delta <= 0)
 		return (-1);
 
 	VERIFY(gettimeofday(&tv, NULL) == 0);
 
 	/* Turn the remaining delta (in units of 1/hz s) into a deadline. */
 	ts.tv_sec = tv.tv_sec + delta / hz;
 	ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz);
 	if (ts.tv_nsec >= NANOSEC) {
 		ts.tv_sec++;
 		ts.tv_nsec -= NANOSEC;
 	}
 
 	/* The owner record is cleared for the duration of the wait. */
 	memset(&mp->m_owner, 0, sizeof (pthread_t));
 	error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
 	mp->m_owner = pthread_self();
 
 	if (error != ETIMEDOUT) {
 		VERIFY0(error);
 		return (1);
 	}
 
 	return (-1);
 }
 
 /*
  * High-resolution variant of cv_timedwait().  tim is a relative time in
  * nanoseconds, or an absolute gethrtime() value when flag is
  * CALLOUT_FLAG_ABSOLUTE.  res is not used.  Returns -1 if the time has
  * already passed or the wait timed out, and 1 otherwise.  Errors other
  * than ETIMEDOUT trip a VERIFY.
  */
 /*ARGSUSED*/
 int
 cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res,
     int flag)
 {
 	struct timeval tv;
 	struct timespec ts;
 	hrtime_t delta;
 	int error;
 
 	ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE);
 
 	/* An absolute time is converted into a delta relative to now. */
 	delta = (flag & CALLOUT_FLAG_ABSOLUTE) ? tim - gethrtime() : tim;
 	if (delta <= 0)
 		return (-1);
 
 	VERIFY0(gettimeofday(&tv, NULL));
 
 	/* Build the deadline from the current time plus the delta. */
 	ts.tv_sec = tv.tv_sec + delta / NANOSEC;
 	ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC);
 	if (ts.tv_nsec >= NANOSEC) {
 		ts.tv_sec++;
 		ts.tv_nsec -= NANOSEC;
 	}
 
 	/* The owner record is cleared for the duration of the wait. */
 	memset(&mp->m_owner, 0, sizeof (pthread_t));
 	error = pthread_cond_timedwait(cv, &mp->m_lock, &ts);
 	mp->m_owner = pthread_self();
 
 	if (error != ETIMEDOUT) {
 		VERIFY0(error);
 		return (1);
 	}
 
 	return (-1);
 }
 
 /* Signal cv via pthread_cond_signal(); failure trips a VERIFY. */
 void
 cv_signal(kcondvar_t *cv)
 {
 	VERIFY0(pthread_cond_signal(cv));
 }
 
 /* Broadcast on cv via pthread_cond_broadcast(); failure trips a VERIFY. */
 void
 cv_broadcast(kcondvar_t *cv)
 {
 	VERIFY0(pthread_cond_broadcast(cv));
 }
 
 /*
  * =========================================================================
  * procfs list
  * =========================================================================
  */
 
 /* Userland stand-in for seq_printf(): discards its arguments. */
 void
 seq_printf(struct seq_file *m, const char *fmt, ...)
 {}
 
 /*
  * Prepare procfs_list for use: initialize its lock and list, start ids at
  * 1 and remember where the procfs_list_node_t is embedded in each object
  * (procfs_list_node_off).  module, submodule, name, mode and the show,
  * show_header and clear callbacks are not used here.
  */
 void
 procfs_list_install(const char *module,
     const char *submodule,
     const char *name,
     mode_t mode,
     procfs_list_t *procfs_list,
     int (*show)(struct seq_file *f, void *p),
     int (*show_header)(struct seq_file *f),
     int (*clear)(procfs_list_t *procfs_list),
     size_t procfs_list_node_off)
 {
 	mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
 	/* Size and link offset are both relative to the embedded node. */
 	list_create(&procfs_list->pl_list,
 	    procfs_list_node_off + sizeof (procfs_list_node_t),
 	    procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
 	procfs_list->pl_next_id = 1;
 	procfs_list->pl_node_offset = procfs_list_node_off;
 }
 
 /* Stub: does nothing. */
 void
 procfs_list_uninstall(procfs_list_t *procfs_list)
 {}
 
 /*
  * Tear down a procfs list.  The list is asserted to be empty before the
  * list and its lock are destroyed.
  */
 void
 procfs_list_destroy(procfs_list_t *procfs_list)
 {
 	ASSERT(list_is_empty(&procfs_list->pl_list));
 	list_destroy(&procfs_list->pl_list);
 	mutex_destroy(&procfs_list->pl_lock);
 }
 
 /*
  * Lvalue for the pln_id of the procfs_list_node_t embedded in obj at the
  * list's pl_node_offset.  Both arguments are parenthesized so that any
  * expression can be passed safely.
  */
 #define	NODE_ID(procfs_list, obj) \
 		(((procfs_list_node_t *)(((char *)(obj)) + \
 		(procfs_list)->pl_node_offset))->pln_id)
 
 /*
  * Append p to the list, giving it the next sequential id.
  * The caller must hold pl_lock.
  */
 void
 procfs_list_add(procfs_list_t *procfs_list, void *p)
 {
 	ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
 	NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
 	list_insert_tail(&procfs_list->pl_list, p);
 }
 
 /*
  * =========================================================================
  * vnode operations
  * =========================================================================
  */
 
 /*
  * =========================================================================
  * Figure out which debugging statements to print
  * =========================================================================
  */
 
 /* Comma-separated debug selectors, set up by dprintf_setup(). */
 static char *dprintf_string;
 /* Non-zero once the "on" selector has been seen. */
 static int dprintf_print_all;
 
 /*
  * Return 1 if string is one of the comma-separated entries of
  * dprintf_string, 0 otherwise (including when no selectors are set).
  *
  * String format: file1.c,function_name1,file2.c,file3.c
  */
 int
 dprintf_find_string(const char *string)
 {
 	int len = strlen(string);
 	const char *entry;
 
 	for (entry = dprintf_string; entry != NULL; ) {
 		/* The entry must match in full, not merely as a prefix. */
 		if (strncmp(entry, string, len) == 0 &&
 		    (entry[len] == ',' || entry[len] == '\0'))
 			return (1);
 
 		/* Move to the entry after the next comma, if any. */
 		entry = strchr(entry, ',');
 		if (entry != NULL)
 			entry++;
 	}
 
 	return (0);
 }
 
 /*
  * Pick up the debug selector string from the command line or the
  * environment, and strip any "debug=..." arguments from argv.
  *
  * argc - in/out argument count; decremented for each argument removed
  * argv - argument vector; compacted in place
  */
 void
 dprintf_setup(int *argc, char **argv)
 {
 	int i, j;
 
 	/*
 	 * Debugging can be specified two ways: by setting the
 	 * environment variable ZFS_DEBUG, or by including a
 	 * "debug=..."  argument on the command line.  The command
 	 * line setting overrides the environment variable.
 	 */
 
 	for (i = 1; i < *argc; i++) {
 		int len = strlen("debug=");
 		/* First look for a command line argument */
 		if (strncmp("debug=", argv[i], len) == 0) {
 			dprintf_string = argv[i] + len;
 			/* Remove from args */
 			/*
 			 * Shifts every later entry down by one slot.
 			 * NOTE(review): i still advances afterwards, so an
 			 * argument that moved into slot i is not examined.
 			 */
 			for (j = i; j < *argc; j++)
 				argv[j] = argv[j+1];
 			argv[j] = NULL;
 			(*argc)--;
 		}
 	}
 
 	if (dprintf_string == NULL) {
 		/* Look for ZFS_DEBUG environment variable */
 		dprintf_string = getenv("ZFS_DEBUG");
 	}
 
 	/*
 	 * Are we just turning on all debugging?
 	 */
 	if (dprintf_find_string("on"))
 		dprintf_print_all = 1;
 
 	/* Any selector at all turns on the dprintf flag in zfs_flags. */
 	if (dprintf_string != NULL)
 		zfs_flags |= ZFS_DEBUG_DPRINTF;
 }
 
 /*
  * =========================================================================
  * debug printfs
  * =========================================================================
  */
 /*
  * Emit a debug message.
  *
  * dprint - B_TRUE: print to stdout now, subject to the selector string;
  *          otherwise format the message and pass it to __zfs_dbgmsg()
  * file, func, line - location of the caller
  * fmt, ... - printf-style message
  */
 void
 __dprintf(boolean_t dprint, const char *file, const char *func,
     int line, const char *fmt, ...)
 {
 	const char *newfile;
 	va_list adx;
 
 	/*
 	 * Get rid of annoying "../common/" prefix to filename.
 	 */
 	newfile = strrchr(file, '/');
 	if (newfile != NULL) {
 		newfile = newfile + 1; /* Get rid of leading / */
 	} else {
 		newfile = file;
 	}
 
 	if (dprint) {
 		/* dprintf messages are printed immediately */
 
 		/* Skip unless everything, this file or this function is on. */
 		if (!dprintf_print_all &&
 		    !dprintf_find_string(newfile) &&
 		    !dprintf_find_string(func))
 			return;
 
 		/* Print out just the function name if requested */
 		/* stdout is locked so the whole message is written together. */
 		flockfile(stdout);
 		if (dprintf_find_string("pid"))
 			(void) printf("%d ", getpid());
 		if (dprintf_find_string("tid"))
 			(void) printf("%ju ",
 			    (uintmax_t)(uintptr_t)pthread_self());
 		if (dprintf_find_string("cpu"))
 			(void) printf("%u ", getcpuid());
 		if (dprintf_find_string("time"))
 			(void) printf("%llu ", gethrtime());
 		if (dprintf_find_string("long"))
 			(void) printf("%s, line %d: ", newfile, line);
 		(void) printf("dprintf: %s: ", func);
 		va_start(adx, fmt);
 		(void) vprintf(fmt, adx);
 		va_end(adx);
 		funlockfile(stdout);
 	} else {
 		/* zfs_dbgmsg is logged for dumping later */
 		size_t size;
 		char *buf;
 		int i;
 
 		size = 1024;
 		buf = umem_alloc(size, UMEM_NOFAIL);
 		i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func);
 
 		/* Append the message only if the prefix left room for it. */
 		if (i < size) {
 			va_start(adx, fmt);
 			(void) vsnprintf(buf + i, size - i, fmt, adx);
 			va_end(adx);
 		}
 
 		__zfs_dbgmsg(buf);
 
 		umem_free(buf, size);
 	}
 }
 
 /*
  * =========================================================================
  * cmn_err() and panic()
  * =========================================================================
  */
 /*
  * Text printed before and after a message, indexed by the ce level that
  * vcmn_err() receives.
  */
 static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" };
 static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" };
 
 /*
  * Write "error: " followed by the formatted message and a newline to
  * stderr, then abort().  Does not return.
  */
 void
 vpanic(const char *fmt, va_list adx)
 {
 	(void) fputs("error: ", stderr);
 	(void) vfprintf(stderr, fmt, adx);
 	(void) fputc('\n', stderr);
 
 	/* think of it as a "user-level crash dump" */
 	abort();
 }
 
 /* Variadic front end to vpanic(): report the formatted message and abort. */
 void
 panic(const char *fmt, ...)
 {
 	va_list adx;
 
 	va_start(adx, fmt);
 	vpanic(fmt, adx);
 	va_end(adx);
 }
 
 /*
  * Report a message at level ce on stderr.
  *
  * CE_PANIC goes to vpanic().  CE_NOTE messages are dropped.  Every other
  * level is printed between its ce_prefix[] and ce_suffix[] strings.
  */
 void
 vcmn_err(int ce, const char *fmt, va_list adx)
 {
 	if (ce == CE_PANIC)
 		vpanic(fmt, adx);
 
 	/* suppress noise in userland stress testing */
 	if (ce == CE_NOTE)
 		return;
 
 	(void) fprintf(stderr, "%s", ce_prefix[ce]);
 	(void) vfprintf(stderr, fmt, adx);
 	(void) fprintf(stderr, "%s", ce_suffix[ce]);
 }
 
 /* Variadic front end to vcmn_err(). */
 /*PRINTFLIKE2*/
 void
 cmn_err(int ce, const char *fmt, ...)
 {
 	va_list adx;
 
 	va_start(adx, fmt);
 	vcmn_err(ce, fmt, adx);
 	va_end(adx);
 }
 
 /*
  * =========================================================================
  * misc routines
  * =========================================================================
  */
 
 /*
  * Sleep for the given number of clock ticks by calling poll() with no
  * descriptors and a millisecond timeout of ticks * (1000 / hz).
  * NOTE(review): 1000 / hz is integer division; confirm hz divides 1000.
  */
 void
 delay(clock_t ticks)
 {
 	(void) poll(0, 0, ticks * (1000 / hz));
 }
 
 /*
  * Find highest one bit set.
  * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
  * The __builtin_clzll() function is supported by both GCC and Clang.
  */
 int
 highbit64(uint64_t i)
 {
 	/* __builtin_clzll() is undefined for 0, so handle it up front. */
 	if (i == 0)
 		return (0);
 
 	return (NBBY * sizeof (uint64_t) - __builtin_clzll(i));
 }
 
 /*
  * Find lowest one bit set.
  * Returns bit number + 1 of lowest bit that is set, otherwise returns 0.
  * The __builtin_ffsll() function is supported by both GCC and Clang.
  */
 int
 lowbit64(uint64_t i)
 {
 	return (i == 0 ? 0 : __builtin_ffsll(i));
 }
 
 /* Device paths, and the descriptors opened on them by random_init(). */
 const char *random_path = "/dev/random";
 const char *urandom_path = "/dev/urandom";
 static int random_fd = -1, urandom_fd = -1;
 
 /*
  * Open both random devices read-only with close-on-exec.
  * Failure to open either one is fatal.
  */
 void
 random_init(void)
 {
 	random_fd = open(random_path, O_RDONLY | O_CLOEXEC);
 	VERIFY(random_fd != -1);
 
 	urandom_fd = open(urandom_path, O_RDONLY | O_CLOEXEC);
 	VERIFY(urandom_fd != -1);
 }
 
 /* Close both random devices and mark their descriptors as unopened. */
 void
 random_fini(void)
 {
 	close(random_fd);
 	close(urandom_fd);
 
 	random_fd = -1;
 	urandom_fd = -1;
 }
 
 /*
  * Fill ptr with len bytes read from fd, repeating short reads until the
  * whole request is satisfied.  Always returns 0.
  */
 static int
 random_get_bytes_common(uint8_t *ptr, size_t len, int fd)
 {
 	size_t resid;
 
 	ASSERT(fd != -1);
 
 	for (resid = len; resid != 0; ) {
 		ssize_t bytes = read(fd, ptr, resid);
 
 		ASSERT3S(bytes, >=, 0);
 		ptr += bytes;
 		resid -= bytes;
 	}
 
 	return (0);
 }
 
 /* Fill ptr with len bytes read from the random_path device. */
 int
 random_get_bytes(uint8_t *ptr, size_t len)
 {
 	return (random_get_bytes_common(ptr, len, random_fd));
 }
 
 /* Fill ptr with len bytes read from the urandom_path device. */
 int
 random_get_pseudo_bytes(uint8_t *ptr, size_t len)
 {
 	return (random_get_bytes_common(ptr, len, urandom_fd));
 }
 
 /*
  * Convert a string to an unsigned long using strtoul().
  *
  * hw_serial - string to parse
  * nptr - if not NULL, receives a pointer to the first unparsed character
  * base - numeric base handed to strtoul()
  * result - receives the converted value
  *
  * Returns 0 on success, or the errno set by strtoul() (for example ERANGE
  * on overflow).
  */
 int
 ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result)
 {
 	char *end;
 
 	/*
 	 * strtoul() reports failure only through errno, so clear it first;
 	 * otherwise a stale value is mistaken for an error when the parsed
 	 * number is legitimately 0.
 	 */
 	errno = 0;
 	*result = strtoul(hw_serial, &end, base);
 	if (nptr != NULL)
 		*nptr = end;
 	if (errno != 0)
 		return (errno);
 	return (0);
 }
 
 /*
  * Convert a string to an unsigned long long using strtoull().
  *
  * str - string to parse
  * nptr - if not NULL, receives a pointer to the first unparsed character
  * base - numeric base handed to strtoull()
  * result - receives the converted value
  *
  * Returns 0 on success, or the errno set by strtoull() (for example ERANGE
  * on overflow).
  */
 int
 ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result)
 {
 	char *end;
 
 	/*
 	 * strtoull() reports failure only through errno, so clear it first;
 	 * otherwise a stale value is mistaken for an error when the parsed
 	 * number is legitimately 0.
 	 */
 	errno = 0;
 	*result = strtoull(str, &end, base);
 	if (nptr != NULL)
 		*nptr = end;
 	if (errno != 0)
 		return (errno);
 	return (0);
 }
 
 /* Return a pointer to hw_utsname, which kernel_init() fills via uname(). */
 utsname_t *
 utsname(void)
 {
 	return (&hw_utsname);
 }
 
 /*
  * =========================================================================
  * kernel emulation setup & teardown
  * =========================================================================
  */
 /*
  * Callback registered with umem_nofail_callback() by kernel_init():
  * print a message to stderr and abort().
  */
 static int
 umem_out_of_memory(void)
 {
 	static const char errmsg[] = "out of memory -- generating core dump\n";
 
 	(void) fputs(errmsg, stderr);
 	abort();
 	return (0);
 }
 
 /*
  * Bring up the emulated kernel environment.
  *
  * mode - spa mode bits; the host id is recorded in hw_serial only when
  *        SPA_MODE_WRITE is set, otherwise 0 is recorded
  */
 void
 kernel_init(int mode)
 {
 	extern uint_t rrw_tsd_key;
 
 	/* Allocation failures end in umem_out_of_memory(). */
 	umem_nofail_callback(umem_out_of_memory);
 
 	physmem = sysconf(_SC_PHYS_PAGES);
 
 	dprintf("physmem = %llu pages (%.2f GB)\n", physmem,
 	    (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30));
 
 	(void) snprintf(hw_serial, sizeof (hw_serial), "%ld",
 	    (mode & SPA_MODE_WRITE) ? get_system_hostid() : 0);
 
 	random_init();
 
 	VERIFY0(uname(&hw_utsname));
 
 	/* kernel_fini() shuts these subsystems down again. */
 	system_taskq_init();
 	icp_init();
 
 	zstd_init();
 
 	spa_init((spa_mode_t)mode);
 
 	fletcher_4_init();
 
 	tsd_create(&rrw_tsd_key, rrw_tsd_destroy);
 }
 
 /* Shut down the subsystems that kernel_init() started. */
 void
 kernel_fini(void)
 {
 	fletcher_4_fini();
 	spa_fini();
 
 	zstd_fini();
 
 	icp_fini();
 	system_taskq_fini();
 
 	random_fini();
 }
 
 /* Stub: the credential is ignored and uid 0 is always returned. */
 uid_t
 crgetuid(cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: the credential is ignored and uid 0 is always returned. */
 uid_t
 crgetruid(cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: the credential is ignored and gid 0 is always returned. */
 gid_t
 crgetgid(cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: always reports zero groups. */
 int
 crgetngroups(cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: always returns NULL. */
 gid_t *
 crgetgroups(cred_t *cr)
 {
 	return (NULL);
 }
 
 /* Stub: no check is performed; always returns 0. */
 int
 zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: no check is performed; always returns 0. */
 int
 zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: no check is performed; always returns 0. */
 int
 zfs_secpolicy_destroy_perms(const char *name, cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: no check is performed; always returns 0. */
 int
 secpolicy_zfs(const cred_t *cr)
 {
 	return (0);
 }
 
 /* Stub: no check is performed; always returns 0. */
 int
 secpolicy_zfs_proc(const cred_t *cr, proc_t *proc)
 {
 	return (0);
 }
 
 /*
  * Allocate a zeroed ksiddomain_t whose kd_name is a copy of dom.
  * Release it with ksiddomain_rele().
  */
 ksiddomain_t *
 ksid_lookupdomain(const char *dom)
 {
 	ksiddomain_t *domain = umem_zalloc(sizeof (*domain), UMEM_NOFAIL);
 
 	domain->kd_name = spa_strdup(dom);
 
 	return (domain);
 }
 
 /* Free a ksiddomain_t and the name string it holds. */
 void
 ksiddomain_rele(ksiddomain_t *ksid)
 {
 	spa_strfree(ksid->kd_name);
 	umem_free(ksid, sizeof (ksiddomain_t));
 }
 
 /*
  * Format into a newly allocated string, as vasprintf() does.
  * A copy of adx is consumed, so the caller's va_list is left untouched.
  * A formatting failure is fatal.
  */
 char *
 kmem_vasprintf(const char *fmt, va_list adx)
 {
 	va_list ap;
 	char *out = NULL;
 	int rc;
 
 	va_copy(ap, adx);
 	rc = vasprintf(&out, fmt, ap);
 	VERIFY(rc != -1);
 	va_end(ap);
 
 	return (out);
 }
 
 /*
  * Format into a newly allocated string, as asprintf() does.
  * A formatting failure is fatal.
  */
 char *
 kmem_asprintf(const char *fmt, ...)
 {
 	va_list ap;
 	char *out = NULL;
 	int rc;
 
 	va_start(ap, fmt);
 	rc = vasprintf(&out, fmt, ap);
 	VERIFY(rc != -1);
 	va_end(ap);
 
 	return (out);
 }
 
 /* Stub: stores minor 0 in *minorp and returns 0; fd is ignored. */
 /* ARGSUSED */
 int
 zfs_onexit_fd_hold(int fd, minor_t *minorp)
 {
 	*minorp = 0;
 	return (0);
 }
 
 /* Stub: does nothing. */
 /* ARGSUSED */
 void
 zfs_onexit_fd_rele(int fd)
 {
 }
 
 /*
  * Stub: the callback is not recorded and *action_handle is not written.
  * Always returns 0.
  */
 /* ARGSUSED */
 int
 zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
     uint64_t *action_handle)
 {
 	return (0);
 }
 
 /* Stub: always returns a zero cookie. */
 fstrans_cookie_t
 spl_fstrans_mark(void)
 {
 	return ((fstrans_cookie_t)0);
 }
 
 /* Stub: the cookie is ignored. */
 void
 spl_fstrans_unmark(fstrans_cookie_t cookie)
 {
 }
 
 /* Stub: always returns 0. */
 int
 __spl_pf_fstrans_check(void)
 {
 	return (0);
 }
 
 /* Stub: always returns 0. */
 int
 kmem_cache_reap_active(void)
 {
 	return (0);
 }
 
 /* Tag value; it points at a string literal, so it must not be written to. */
 void *zvol_tag = "zvol_tag";
 
 /* Stub: does nothing. */
 void
 zvol_create_minor(const char *name)
 {
 }
 
 /* Stub: does nothing. */
 void
 zvol_create_minors_recursive(const char *name)
 {
 }
 
 /* Stub: does nothing. */
 void
 zvol_remove_minors(spa_t *spa, const char *name, boolean_t async)
 {
 }
 
 /* Stub: does nothing. */
 void
 zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname,
     boolean_t async)
 {
 }
 
 /*
  * Open file
  *
  * path - fully qualified path to file
  * flags - open(2) flags; O_DIRECT is added for an existing block device
  * mode - permission bits used when O_CREAT creates the file
  * fpp - pointer to return file pointer
  *
  * When creating, the process umask is cleared for the duration of the open
  * and restored afterwards, whether or not the open succeeds.
  *
  * Returns 0 on success underlying error on failure.
  */
 int
 zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
 {
 	int fd = -1;
 	int dump_fd = -1;
 	int err;
 	int old_umask = 0;
 	zfs_file_t *fp;
 	struct stat64 st;
 
 	/* st is filled in, and consulted, only when not creating. */
 	if (!(flags & O_CREAT) && stat64(path, &st) == -1)
 		return (errno);
 
 	if (!(flags & O_CREAT) && S_ISBLK(st.st_mode))
 		flags |= O_DIRECT;
 
 	if (flags & O_CREAT)
 		old_umask = umask(0);
 
 	fd = open64(path, flags, mode);
 	/* Save errno before anything else can overwrite it. */
 	err = errno;
 
 	/* Restore the umask on failure too, so it is never left at 0. */
 	if (flags & O_CREAT)
 		(void) umask(old_umask);
 
 	if (fd == -1)
 		return (err);
 
 	if (vn_dumpdir != NULL) {
 		char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL);
 		char *inpath = basename((char *)(uintptr_t)path);
 
 		/* The dump file is named after the last path component. */
 		(void) snprintf(dumppath, MAXPATHLEN,
 		    "%s/%s", vn_dumpdir, inpath);
 		dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666);
 		umem_free(dumppath, MAXPATHLEN);
 		if (dump_fd == -1) {
 			err = errno;
 			close(fd);
 			return (err);
 		}
 	} else {
 		dump_fd = -1;
 	}
 
 	(void) fcntl(fd, F_SETFD, FD_CLOEXEC);
 
 	fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL);
 	fp->f_fd = fd;
 	fp->f_dump_fd = dump_fd;
 	*fpp = fp;
 
 	return (0);
 }
 
 /*
  * Close the file, and its dump descriptor if one was opened, then free fp.
  */
 void
 zfs_file_close(zfs_file_t *fp)
 {
 	close(fp->f_fd);
 	if (fp->f_dump_fd != -1)
 		close(fp->f_dump_fd);
 
 	umem_free(fp, sizeof (zfs_file_t));
 }
 
 /*
  * Stateful write - the descriptor's own file offset decides where the data
  * lands and is advanced by the write.
  *
  * fp -  pointer to file (pipe, socket, etc) to write to
  * buf - buffer to write
  * count - # of bytes to write
  * resid -  if not NULL, receives the number of bytes left unwritten
  *
  * Returns 0 on success, errno on failure.  A short write is reported as EIO
  * when resid is NULL.
  */
 int
 zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
 {
 	ssize_t written = write(fp->f_fd, buf, count);
 
 	if (written < 0)
 		return (errno);
 
 	if (resid != NULL) {
 		*resid = count - written;
 		return (0);
 	}
 
 	return (written != count ? EIO : 0);
 }
 
 /*
  * Stateless write - os internal file pointer is not updated.
  *
  * fp -  pointer to file (pipe, socket, etc) to write to
  * buf - buffer to write
  * count - # of bytes to write
  * pos - file offset to write to (only valid for seekable types)
  * resid -  pointer to count of unwritten bytes
  *
  * Returns 0 on success errno on failure.
  */
 int
 zfs_file_pwrite(zfs_file_t *fp, const void *buf,
     size_t count, loff_t pos, ssize_t *resid)
 {
 	ssize_t rc, split, done;
 	int sectors;
 
 	/*
 	 * To simulate partial disk writes, we split writes into two
 	 * system calls so that the process can be killed in between.
 	 * This is used by ztest to simulate realistic failure modes.
 	 */
 	/* The split point is a random multiple of 1 << SPA_MINBLOCKSHIFT. */
 	sectors = count >> SPA_MINBLOCKSHIFT;
 	split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT;
 	rc = pwrite64(fp->f_fd, buf, split, pos);
 	if (rc != -1) {
 		/* done is set only on this path; rc < 0 returns before use. */
 		done = rc;
 		rc = pwrite64(fp->f_fd, (char *)buf + split,
 		    count - split, pos + split);
 	}
 #ifdef __linux__
 	if (rc == -1 && errno == EINVAL) {
 		/*
 		 * Under Linux, this most likely means an alignment issue
 		 * (memory or disk) due to O_DIRECT, so we abort() in order
 		 * to catch the offender.
 		 */
 		abort();
 	}
 #endif
 
 	if (rc < 0)
 		return (errno);
 
 	done += rc;
 
 	/* Without a resid pointer, a short write is reported as EIO. */
 	if (resid) {
 		*resid = count - done;
 	} else if (done != count) {
 		return (EIO);
 	}
 
 	return (0);
 }
 
 /*
  * Stateful read - use os internal file pointer to determine where to
  * read and update on successful completion.
  *
  * fp -  pointer to file (pipe, socket, etc) to read from
  * buf - buffer to read into
  * count - # of bytes to read
  * resid -  pointer to count of unread bytes (if short read)
  *
  * Returns 0 on success errno on failure.  A short read is reported as EIO
  * when resid is NULL.
  */
 int
 zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
 {
 	/* ssize_t, not int: read() can return more than INT_MAX bytes. */
 	ssize_t rc;
 
 	rc = read(fp->f_fd, buf, count);
 	if (rc < 0)
 		return (errno);
 
 	if (resid) {
 		*resid = count - rc;
 	} else if (rc != count) {
 		return (EIO);
 	}
 
 	return (0);
 }
 
 /*
  * Stateless read - os internal file pointer is not updated.
  *
  * fp -  pointer to file (pipe, socket, etc) to read from
  * buf - buffer to read into
  * count - # of bytes to read
  * off - file offset to read from (only valid for seekable types)
  * resid -  pointer to count of unread bytes (if short read)
  *
  * Returns 0 on success errno on failure.
  */
 int
 zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
     ssize_t *resid)
 {
 	ssize_t rc;
 
 	rc = pread64(fp->f_fd, buf, count, off);
 	if (rc < 0) {
 #ifdef __linux__
 		/*
 		 * Under Linux, this most likely means an alignment issue
 		 * (memory or disk) due to O_DIRECT, so we abort() in order to
 		 * catch the offender.
 		 */
 		if (errno == EINVAL)
 			abort();
 #endif
 		return (errno);
 	}
 
 	/* Copy what was read to the dump file at the same offset. */
 	if (fp->f_dump_fd != -1) {
 		int status;
 
 		status = pwrite64(fp->f_dump_fd, buf, rc, off);
 		ASSERT(status != -1);
 	}
 
 	/* Without a resid pointer, a short read is reported as EIO. */
 	if (resid) {
 		*resid = count - rc;
 	} else if (rc != count) {
 		return (EIO);
 	}
 
 	return (0);
 }
 
 /*
  * lseek - set / get file pointer
  *
  * fp -  pointer to file (pipe, socket, etc) to read from
  * offp - in: offset to seek by; out: resulting offset reported by lseek()
  * whence - see man pages for standard lseek whence values
  *
  * Returns 0 on success errno on failure (ESPIPE for non seekable types)
  */
 int
 zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
 {
 	loff_t rc;
 
 	rc = lseek(fp->f_fd, *offp, whence);
 	if (rc < 0)
 		return (errno);
 
 	/* *offp is updated only on success. */
 	*offp = rc;
 
 	return (0);
 }
 
 /*
  * Get file attributes
  *
  * fp - file pointer
  * zfattr - pointer to file attr structure
  *
  * Currently only used for fetching size and file mode
  *
  * Returns 0 on success or error code of underlying getattr call on failure.
  */
 int
 zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
 {
 	struct stat64 st;
 
 	if (fstat64_blk(fp->f_fd, &st) == -1)
 		return (errno);
 
 	/* Only the size and mode are copied out. */
 	zfattr->zfa_size = st.st_size;
 	zfattr->zfa_mode = st.st_mode;
 
 	return (0);
 }
 
 /*
  * Sync file to disk
  *
  * fp - file pointer
  * flags - O_SYNC and or O_DSYNC; not consulted here, fsync() is always used
  *
  * Returns 0 on success or error code of underlying sync call on failure.
  */
 int
 zfs_file_fsync(zfs_file_t *fp, int flags)
 {
 	int rc;
 
 	rc = fsync(fp->f_fd);
 	if (rc < 0)
 		return (errno);
 
 	return (0);
 }
 
 /*
  * fallocate - allocate or free space on disk
  *
  * fp - file pointer
  * mode (non-standard options for hole punching etc)
  * offset - offset to start allocating or freeing from
  * len - length to free / allocate
  *
  * On Linux this returns the result of fallocate() unchanged; elsewhere it
  * returns EOPNOTSUPP.
  * NOTE(review): fallocate(2) signals failure with -1 and errno, whereas the
  * other zfs_file_* calls here return the errno value - confirm callers.
  *
  * OPTIONAL
  */
 int
 zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len)
 {
 #ifdef __linux__
 	return (fallocate(fp->f_fd, mode, offset, len));
 #else
 	return (EOPNOTSUPP);
 #endif
 }
 
 /*
  * Request current file pointer offset
  *
  * fp - pointer to file
  *
  * Returns current file offset, or the (negative) result of lseek() on
  * failure.  The file position is not changed.
  */
 loff_t
 zfs_file_off(zfs_file_t *fp)
 {
 	/* lseek(fd, offset, whence): a zero-byte move from the current spot. */
 	return (lseek(fp->f_fd, 0, SEEK_CUR));
 }
 
 /*
  * unlink file
  *
  * path - fully qualified file path
  *
  * Returns 0 on success; otherwise the result of remove() is passed back
  * unchanged.
  *
  * OPTIONAL
  */
 int
 zfs_file_unlink(const char *path)
 {
 	return (remove(path));
 }
 
 /*
  * Get reference to file pointer
  *
  * fd - input file descriptor
  * fpp - pointer to file pointer
  *
  * Unsupported in user space: this implementation calls abort(), so the
  * EOPNOTSUPP return that follows is not reached.
  */
 int
 zfs_file_get(int fd, zfs_file_t **fpp)
 {
 	abort();
 
 	return (EOPNOTSUPP);
 }
 
 /*
  * Drop reference to file pointer
  *
  * fd - input file descriptor
  *
  * Unsupported in user space: this implementation calls abort().
  */
 void
 zfs_file_put(int fd)
 {
 	abort();
 }
 
 /* Stub: does nothing. */
 void
 zfsvfs_update_fromname(const char *oldname, const char *newname)
 {
 }
diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c
index e591921ace1b..059ada235c4a 100644
--- a/module/os/freebsd/spl/spl_kstat.c
+++ b/module/os/freebsd/spl/spl_kstat.c
@@ -1,572 +1,510 @@
 /*
  * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
  * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
  *    notice, this list of conditions and the following disclaimer in the
  *    documentation and/or other materials provided with the distribution.
  *
  * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  * SUCH DAMAGE.
  *
  * Links to Illumos.org for more information on kstat function:
  * [1] https://illumos.org/man/1M/kstat
  * [2] https://illumos.org/man/9f/kstat_create
  */
 
 #include <sys/cdefs.h>
 __FBSDID("$FreeBSD$");
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/kernel.h>
 #include <sys/systm.h>
 #include <sys/malloc.h>
 #include <sys/sysctl.h>
 #include <sys/kstat.h>
 #include <sys/sbuf.h>
 
 /* malloc(9) type used for the kstat_t structures allocated in this file. */
 static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics");
 
 /* Root "kstat" sysctl node; every kstat tree is created beneath it. */
 SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics");
 
 /*
  * Install the raw-kstat callbacks used by kstat_sysctl_raw():
  * headers - formats the header text into buf
  * data - formats one data element into buf
  * addr - returns the element at the given index, or NULL past the end
  */
 void
 __kstat_set_raw_ops(kstat_t *ksp,
     int (*headers)(char *buf, size_t size),
     int (*data)(char *buf, size_t size, void *data),
     void *(*addr)(kstat_t *ksp, loff_t index))
 {
 	ksp->ks_raw_ops.headers = headers;
 	ksp->ks_raw_ops.data    = data;
 	ksp->ks_raw_ops.addr    = addr;
 }
 
 /*
  * As __kstat_set_raw_ops(), except that the header callback takes a
  * struct seq_file and is stored in seq_headers.
  */
 void
 __kstat_set_seq_raw_ops(kstat_t *ksp,
     int (*headers)(struct seq_file *f),
     int (*data)(char *buf, size_t size, void *data),
     void *(*addr)(kstat_t *ksp, loff_t index))
 {
 	ksp->ks_raw_ops.seq_headers = headers;
 	ksp->ks_raw_ops.data    = data;
 	ksp->ks_raw_ops.addr    = addr;
 }
 
 /*
  * Default ks_update callback installed by __kstat_create(): writes are
  * refused with EACCES, anything else succeeds without doing any work.
  */
 static int
 kstat_default_update(kstat_t *ksp, int rw)
 {
 	ASSERT3P(ksp, !=, NULL);
 
 	return (rw == KSTAT_WRITE ? EACCES : 0);
 }
 
 /*
  * Replace the raw formatting buffer with one twice the size, capped at
  * KSTAT_RAW_MAX.  The old contents are discarded.
  *
  * Returns 0 on success, or ENOMEM if the buffer is already at the cap.
  */
 static int
 kstat_resize_raw(kstat_t *ksp)
 {
 	if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX)
 		return (ENOMEM);
 
 	free(ksp->ks_raw_buf, M_TEMP);
 	ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX);
 	ksp->ks_raw_buf = malloc(ksp->ks_raw_bufsize, M_TEMP, M_WAITOK);
 
 	return (0);
 }
 
 /*
  * Address callback used when a raw kstat supplies none: the kstat is
  * treated as a single element, ks_data, at index 0.
  */
 static void *
 kstat_raw_default_addr(kstat_t *ksp, loff_t n)
 {
 	return (n == 0 ? ksp->ks_data : NULL);
 }
 
 /*
  * sysctl handler for a numeric named kstat entry.
  * arg1 is the kstat_t and arg2 is the index of the entry within ks_data.
  */
 static int
 kstat_sysctl(SYSCTL_HANDLER_ARGS)
 {
 	kstat_t *ksp = arg1;
 	kstat_named_t *ksent;
 	uint64_t val;
 
 	ksent = ksp->ks_data;
 	/* Select the correct element */
 	ksent += arg2;
 	/* Update the aggsums before reading */
 	(void) ksp->ks_update(ksp, KSTAT_READ);
 	/* The value is read as 64 bits whatever the entry's declared type. */
 	val = ksent->value.ui64;
 
 	return (sysctl_handle_64(oidp, &val, 0, req));
 }
 
 /*
  * sysctl handler for a string named kstat entry.
  * arg1 is the kstat_t and arg2 is the index of the entry within ks_data.
  */
 static int
 kstat_sysctl_string(SYSCTL_HANDLER_ARGS)
 {
 	kstat_t *ksp = arg1;
 	kstat_named_t *ksent = ksp->ks_data;
 	char *val;
 	uint32_t len = 0;
 
 	/* Select the correct element */
 	ksent += arg2;
 	/* Update the aggsums before reading */
 	(void) ksp->ks_update(ksp, KSTAT_READ);
 	val = KSTAT_NAMED_STR_PTR(ksent);
 	len = KSTAT_NAMED_STR_BUFLEN(ksent);
 	/*
 	 * Force termination inside the buffer.
 	 * NOTE(review): a len of 0 would index before the buffer - confirm
 	 * string kstats always report a non-zero buffer length.
 	 */
 	val[len-1] = '\0';
 
 	return (sysctl_handle_string(oidp, val, len, req));
 }
 
 /*
  * sysctl handler for an I/O kstat: formats the kstat_io_t counters as one
  * line of text and copies it out.  arg1 is the kstat_t.
  */
 static int
 kstat_sysctl_io(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf *sb;
 	kstat_t *ksp = arg1;
 	kstat_io_t *kip = ksp->ks_data;
 	int rc;
 
 	sb = sbuf_new_auto();
 	if (sb == NULL)
 		return (ENOMEM);
 	/* Update the aggsums before reading */
 	(void) ksp->ks_update(ksp, KSTAT_READ);
 
 	/* though wlentime & friends are signed, they will never be negative */
 	sbuf_printf(sb,
 	    "%-8llu %-8llu %-8u %-8u %-8llu %-8llu "
 	    "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n",
 	    kip->nread, kip->nwritten,
 	    kip->reads, kip->writes,
 	    kip->wtime, kip->wlentime, kip->wlastupdate,
 	    kip->rtime, kip->rlentime, kip->rlastupdate,
 	    kip->wcnt,  kip->rcnt);
 	rc = sbuf_finish(sb);
 	if (rc == 0)
 		rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
 	/* The sbuf is released on every path. */
 	sbuf_delete(sb);
 	return (rc);
 }
 
 /*
  * sysctl handler for a raw kstat.  arg1 is the kstat_t.
  *
  * The optional header and then each element returned by the addr callback
  * are formatted into a temporary buffer and appended to an sbuf, which is
  * copied out at the end.  When a callback returns ENOMEM the temporary
  * buffer is grown and the callback is retried.
  */
 static int
 kstat_sysctl_raw(SYSCTL_HANDLER_ARGS)
 {
 	struct sbuf *sb;
 	void *data;
 	kstat_t *ksp = arg1;
 	void *(*addr_op)(kstat_t *ksp, loff_t index);
 	int n, has_header, rc = 0;
 
 	sb = sbuf_new_auto();
 	if (sb == NULL)
 		return (ENOMEM);
 
 	/* Without an addr callback the kstat is a single element. */
 	if (ksp->ks_raw_ops.addr)
 		addr_op = ksp->ks_raw_ops.addr;
 	else
 		addr_op = kstat_raw_default_addr;
 
 	mutex_enter(ksp->ks_lock);
 
 	/* Update the aggsums before reading */
 	(void) ksp->ks_update(ksp, KSTAT_READ);
 
 	/* Start with one page; kstat_resize_raw() grows it on demand. */
 	ksp->ks_raw_bufsize = PAGE_SIZE;
 	ksp->ks_raw_buf = malloc(PAGE_SIZE, M_TEMP, M_WAITOK);
 
 	n = 0;
 	has_header = (ksp->ks_raw_ops.headers ||
 	    ksp->ks_raw_ops.seq_headers);
 
 restart_headers:
 	if (ksp->ks_raw_ops.headers) {
 		rc = ksp->ks_raw_ops.headers(
 		    ksp->ks_raw_buf, ksp->ks_raw_bufsize);
 	} else if (ksp->ks_raw_ops.seq_headers) {
 		struct seq_file f;
 
 		f.sf_buf = ksp->ks_raw_buf;
 		f.sf_size = ksp->ks_raw_bufsize;
 		rc = ksp->ks_raw_ops.seq_headers(&f);
 	}
 	if (has_header) {
 		/* Retry with a larger buffer unless it is already at the cap. */
 		if (rc == ENOMEM && !kstat_resize_raw(ksp))
 			goto restart_headers;
 		if (rc == 0)
 			sbuf_printf(sb, "\n%s", ksp->ks_raw_buf);
 	}
 
 	while ((data = addr_op(ksp, n)) != NULL) {
 restart:
 		if (ksp->ks_raw_ops.data) {
 			rc = ksp->ks_raw_ops.data(ksp->ks_raw_buf,
 			    ksp->ks_raw_bufsize, data);
 			if (rc == ENOMEM && !kstat_resize_raw(ksp))
 				goto restart;
 			/* An element whose callback failed is left out. */
 			if (rc == 0)
 				sbuf_printf(sb, "%s", ksp->ks_raw_buf);
 
 		} else {
 			/* No data callback: hex-dump ks_data instead. */
 			ASSERT3U(ksp->ks_ndata, ==, 1);
 			sbuf_hexdump(sb, ksp->ks_data,
 			    ksp->ks_data_size, NULL, 0);
 		}
 		n++;
 	}
 	free(ksp->ks_raw_buf, M_TEMP);
 	mutex_exit(ksp->ks_lock);
 	sbuf_trim(sb);
 	/* rc from the callbacks is replaced here by the sbuf status. */
 	rc = sbuf_finish(sb);
 	if (rc == 0)
 		rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb));
 	sbuf_delete(sb);
 	return (rc);
 }
 
 /*
  * Allocate a kstat and create the sysctl nodes it will live under.
  *
  * module - module name, optionally "module/pool" to add a pool node
  * instance - must be 0
  * name - kstat name
  * class - kstat class; NULL selects "misc"
  * ks_type - one of the KSTAT_TYPE_* values
  * ks_ndata - element count, or the byte size for KSTAT_TYPE_RAW
  * flags - KSTAT_FLAG_* bits; with KSTAT_FLAG_VIRTUAL no data is allocated
  *
  * Returns the new kstat, or NULL if a sysctl node could not be created, in
  * which case everything allocated here has been released again.
  */
 kstat_t *
 __kstat_create(const char *module, int instance, const char *name,
     const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags)
 {
 	char buf[KSTAT_STRLEN];
 	struct sysctl_oid *root;
 	kstat_t *ksp;
 	char *pool;
 
 	KASSERT(instance == 0, ("instance=%d", instance));
 	if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
 		ASSERT3U(ks_ndata, ==, 1);
 
 	if (class == NULL)
 		class = "misc";
 
 	/*
 	 * Allocate the main structure. We don't need to keep a copy of
 	 * module in here, because it is only used for sysctl node creation
 	 * done in this function.
 	 */
 	ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO);
 
 	ksp->ks_crtime = gethrtime();
 	ksp->ks_snaptime = ksp->ks_crtime;
 	ksp->ks_instance = instance;
 	(void) strlcpy(ksp->ks_name, name, KSTAT_STRLEN);
 	(void) strlcpy(ksp->ks_class, class, KSTAT_STRLEN);
 	ksp->ks_type = ks_type;
 	ksp->ks_flags = flags;
 	ksp->ks_update = kstat_default_update;
 
 	mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
 	ksp->ks_lock = &ksp->ks_private_lock;
 
 	switch (ksp->ks_type) {
 	case KSTAT_TYPE_RAW:
 		/* For raw kstats ks_ndata carries the size in bytes. */
 		ksp->ks_ndata = 1;
 		ksp->ks_data_size = ks_ndata;
 		break;
 	case KSTAT_TYPE_NAMED:
 		ksp->ks_ndata = ks_ndata;
 		ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t);
 		break;
 	case KSTAT_TYPE_INTR:
 		ksp->ks_ndata = ks_ndata;
 		ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t);
 		break;
 	case KSTAT_TYPE_IO:
 		ksp->ks_ndata = ks_ndata;
 		ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t);
 		break;
 	case KSTAT_TYPE_TIMER:
 		ksp->ks_ndata = ks_ndata;
 		ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t);
 		break;
 	default:
 		panic("Undefined kstat type %d\n", ksp->ks_type);
 	}
 
 	if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL)
 		ksp->ks_data = NULL;
 	else
 		ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
 
 	/*
 	 * Some kstats use a module name like "zfs/poolname" to distinguish a
 	 * set of kstats belonging to a specific pool.  Split on '/' to add an
 	 * extra node for the pool name if needed.
 	 */
 	(void) strlcpy(buf, module, KSTAT_STRLEN);
 	module = buf;
 	pool = strchr(module, '/');
 	if (pool != NULL)
 		*pool++ = '\0';
 
 	/*
 	 * Create sysctl tree for those statistics:
 	 *
 	 *	kstat.<module>[.<pool>].<class>.<name>
 	 */
 	sysctl_ctx_init(&ksp->ks_sysctl_ctx);
 	root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
 	    SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0,
 	    "");
 	if (root == NULL) {
 		printf("%s: Cannot create kstat.%s tree!\n", __func__, module);
 		goto fail;
 	}
 	if (pool != NULL) {
 		root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
 		    SYSCTL_CHILDREN(root), OID_AUTO, pool, CTLFLAG_RW, 0, "");
 		if (root == NULL) {
 			printf("%s: Cannot create kstat.%s.%s tree!\n",
 			    __func__, module, pool);
 			goto fail;
 		}
 	}
 	root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root),
 	    OID_AUTO, class, CTLFLAG_RW, 0, "");
 	if (root == NULL) {
 		if (pool != NULL)
 			printf("%s: Cannot create kstat.%s.%s.%s tree!\n",
 			    __func__, module, pool, class);
 		else
 			printf("%s: Cannot create kstat.%s.%s tree!\n",
 			    __func__, module, class);
 		goto fail;
 	}
 	/* Only named kstats get a node of their own for the entries. */
 	if (ksp->ks_type == KSTAT_TYPE_NAMED) {
 		root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx,
 		    SYSCTL_CHILDREN(root),
 		    OID_AUTO, name, CTLFLAG_RW, 0, "");
 		if (root == NULL) {
 			if (pool != NULL)
 				printf("%s: Cannot create kstat.%s.%s.%s.%s "
 				    "tree!\n", __func__, module, pool, class,
 				    name);
 			else
 				printf("%s: Cannot create kstat.%s.%s.%s "
 				    "tree!\n", __func__, module, class, name);
 			goto fail;
 		}
 
 	}
 	ksp->ks_sysctl_root = root;
 
 	return (ksp);
 
 fail:
 	/*
 	 * Undo everything set up above, mirroring kstat_delete(), so that a
 	 * failed create leaks neither the data buffer nor the private lock.
 	 */
 	sysctl_ctx_free(&ksp->ks_sysctl_ctx);
 	ksp->ks_lock = NULL;
 	mutex_destroy(&ksp->ks_private_lock);
 	if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
 		kmem_free(ksp->ks_data, ksp->ks_data_size);
 	free(ksp, M_KSTAT);
 	return (NULL);
 }
 
 /*
  * Add one sysctl leaf per entry of a named kstat beneath ks_sysctl_root.
  * Each leaf receives the kstat and the entry's index as its handler
  * arguments.
  */
 static void
 kstat_install_named(kstat_t *ksp)
 {
 	kstat_named_t *ksent;
 	char *namelast;
 	int typelast;
 
 	ksent = ksp->ks_data;
 
 	VERIFY((ksp->ks_flags & KSTAT_FLAG_VIRTUAL) || ksent != NULL);
 
 	typelast = 0;
 	namelast = NULL;
 
 	for (int i = 0; i < ksp->ks_ndata; i++, ksent++) {
 		/*
 		 * An entry whose data_type is 0 reuses the type and name of
 		 * the most recent entry that had one.
 		 */
 		if (ksent->data_type != 0) {
 			typelast = ksent->data_type;
 			namelast = ksent->name;
 		}
 		switch (typelast) {
 		case KSTAT_DATA_CHAR:
 			/* Not Implemented */
 			break;
 		case KSTAT_DATA_INT32:
 			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
 			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
 			    OID_AUTO, namelast,
 			    CTLTYPE_S32 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 			    ksp, i, kstat_sysctl, "I", namelast);
 			break;
 		case KSTAT_DATA_UINT32:
 			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
 			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
 			    OID_AUTO, namelast,
 			    CTLTYPE_U32 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 			    ksp, i, kstat_sysctl, "IU", namelast);
 			break;
 		case KSTAT_DATA_INT64:
 			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
 			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
 			    OID_AUTO, namelast,
 			    CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 			    ksp, i, kstat_sysctl, "Q", namelast);
 			break;
 		case KSTAT_DATA_UINT64:
 			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
 			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
 			    OID_AUTO, namelast,
 			    CTLTYPE_U64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
 			    ksp, i, kstat_sysctl, "QU", namelast);
 			break;
 		case KSTAT_DATA_LONG:
 			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
 			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
 			    OID_AUTO, namelast,
 			    CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
 			    ksp, i, kstat_sysctl, "L", namelast);
 			break;
 		case KSTAT_DATA_ULONG:
 			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
 			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
 			    OID_AUTO, namelast,
 			    CTLTYPE_ULONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
 			    ksp, i, kstat_sysctl, "LU", namelast);
 			break;
 		case KSTAT_DATA_STRING:
 			/* Strings go through their own handler. */
 			SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
 			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
 			    OID_AUTO, namelast,
 			    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
 			    ksp, i, kstat_sysctl_string, "A", namelast);
 			break;
 		default:
 			panic("unsupported type: %d", typelast);
 		}
 	}
 }
 
 /*
  * Publish a kstat by adding its sysctl leaf (or leaves, for named kstats)
  * beneath the nodes made by __kstat_create().
  *
  * Named kstats are handled by kstat_install_named().  Raw and I/O kstats
  * get a single leaf named after the kstat, which then becomes
  * ks_sysctl_root.  Timer and interrupt kstats are unsupported and panic.
  */
 void
 kstat_install(kstat_t *ksp)
 {
 	struct sysctl_oid *root;
 
 	if (ksp->ks_ndata == UINT32_MAX)
 		VERIFY3U(ksp->ks_type, ==, KSTAT_TYPE_RAW);
 
 	switch (ksp->ks_type) {
 	case KSTAT_TYPE_NAMED:
 		/* A void function must not return an expression in ISO C. */
 		kstat_install_named(ksp);
 		return;
 	case KSTAT_TYPE_RAW:
 		/* With a data callback the leaf is text, otherwise opaque. */
 		if (ksp->ks_raw_ops.data) {
 			root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
 			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
 			    OID_AUTO, ksp->ks_name, CTLTYPE_STRING | CTLFLAG_RD
 			    | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
 			    ksp, 0, kstat_sysctl_raw, "A", ksp->ks_name);
 		} else {
 			root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
 			    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
 			    OID_AUTO, ksp->ks_name, CTLTYPE_OPAQUE | CTLFLAG_RD
 			    | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
 			    ksp, 0, kstat_sysctl_raw, "", ksp->ks_name);
 		}
 		break;
 	case KSTAT_TYPE_IO:
 		root = SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx,
 		    SYSCTL_CHILDREN(ksp->ks_sysctl_root),
 		    OID_AUTO, ksp->ks_name,
 		    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
 		    ksp, 0, kstat_sysctl_io, "A", ksp->ks_name);
 		break;
 	case KSTAT_TYPE_TIMER:
 	case KSTAT_TYPE_INTR:
 	default:
 		panic("unsupported kstat type %d\n", ksp->ks_type);
 	}
 	VERIFY3P(root, !=, NULL);
 	ksp->ks_sysctl_root = root;
 }
 
 /*
  * Destroy a kstat: release its sysctl context, its private lock, its data
  * buffer (unless the kstat is virtual) and the kstat itself.
  */
 void
 kstat_delete(kstat_t *ksp)
 {
 
 	sysctl_ctx_free(&ksp->ks_sysctl_ctx);
 	ksp->ks_lock = NULL;
 	mutex_destroy(&ksp->ks_private_lock);
 	/* Only non-virtual kstats had a buffer allocated at create time. */
 	if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL))
 		kmem_free(ksp->ks_data, ksp->ks_data_size);
 	free(ksp, M_KSTAT);
 }
-
-void
-kstat_waitq_enter(kstat_io_t *kiop)
-{
-	hrtime_t new, delta;
-	ulong_t wcnt;
-
-	new = gethrtime();
-	delta = new - kiop->wlastupdate;
-	kiop->wlastupdate = new;
-	wcnt = kiop->wcnt++;
-	if (wcnt != 0) {
-		kiop->wlentime += delta * wcnt;
-		kiop->wtime += delta;
-	}
-}
-
-void
-kstat_waitq_exit(kstat_io_t *kiop)
-{
-	hrtime_t new, delta;
-	ulong_t wcnt;
-
-	new = gethrtime();
-	delta = new - kiop->wlastupdate;
-	kiop->wlastupdate = new;
-	wcnt = kiop->wcnt--;
-	ASSERT3S(wcnt, >, 0);
-	kiop->wlentime += delta * wcnt;
-	kiop->wtime += delta;
-}
-
-void
-kstat_runq_enter(kstat_io_t *kiop)
-{
-	hrtime_t new, delta;
-	ulong_t rcnt;
-
-	new = gethrtime();
-	delta = new - kiop->rlastupdate;
-	kiop->rlastupdate = new;
-	rcnt = kiop->rcnt++;
-	if (rcnt != 0) {
-		kiop->rlentime += delta * rcnt;
-		kiop->rtime += delta;
-	}
-}
-
-void
-kstat_runq_exit(kstat_io_t *kiop)
-{
-	hrtime_t new, delta;
-	ulong_t rcnt;
-
-	new = gethrtime();
-	delta = new - kiop->rlastupdate;
-	kiop->rlastupdate = new;
-	rcnt = kiop->rcnt--;
-	ASSERT3S(rcnt, >, 0);
-	kiop->rlentime += delta * rcnt;
-	kiop->rtime += delta;
-}
diff --git a/module/os/linux/spl/spl-kstat.c b/module/os/linux/spl/spl-kstat.c
index c7f1aadf784e..0c46708326d8 100644
--- a/module/os/linux/spl/spl-kstat.c
+++ b/module/os/linux/spl/spl-kstat.c
@@ -1,781 +1,715 @@
 /*
  *  Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
  *  Copyright (C) 2007 The Regents of the University of California.
  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
  *  Written by Brian Behlendorf <behlendorf1@llnl.gov>.
  *  UCRL-CODE-235197
  *
  *  This file is part of the SPL, Solaris Porting Layer.
  *
  *  The SPL is free software; you can redistribute it and/or modify it
  *  under the terms of the GNU General Public License as published by the
  *  Free Software Foundation; either version 2 of the License, or (at your
  *  option) any later version.
  *
  *  The SPL is distributed in the hope that it will be useful, but WITHOUT
  *  ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  *  FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  *  for more details.
  *
  *  You should have received a copy of the GNU General Public License along
  *  with the SPL.  If not, see <http://www.gnu.org/licenses/>.
  *
  *  Solaris Porting Layer (SPL) Kstat Implementation.
  *
  *  Links to Illumos.org for more information on kstat function:
  *  [1] https://illumos.org/man/1M/kstat
  *  [2] https://illumos.org/man/9f/kstat_create
  */
 
 #include <linux/seq_file.h>
 #include <sys/kstat.h>
 #include <sys/vmem.h>
 #include <sys/cmn_err.h>
 #include <sys/sysmacros.h>
 
 static kmutex_t kstat_module_lock;
 static struct list_head kstat_module_list;
 static kid_t kstat_id;
 
 /*
  * Replace the raw output buffer with one twice as large, capped at
  * KSTAT_RAW_MAX.  The old contents are discarded.
  *
  * Returns 0 on success, or ENOMEM when the buffer is already at the cap.
  */
 static int
 kstat_resize_raw(kstat_t *ksp)
 {
 	if (ksp->ks_raw_bufsize != KSTAT_RAW_MAX) {
 		vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
 		ksp->ks_raw_bufsize =
 		    MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX);
 		ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
 		return (0);
 	}
 
 	/* Already at the maximum size; cannot grow any further. */
 	return (ENOMEM);
 }
 
-void
-kstat_waitq_enter(kstat_io_t *kiop)
-{
-	hrtime_t new, delta;
-	ulong_t wcnt;
-
-	new = gethrtime();
-	delta = new - kiop->wlastupdate;
-	kiop->wlastupdate = new;
-	wcnt = kiop->wcnt++;
-	if (wcnt != 0) {
-		kiop->wlentime += delta * wcnt;
-		kiop->wtime += delta;
-	}
-}
-EXPORT_SYMBOL(kstat_waitq_enter);
-
-void
-kstat_waitq_exit(kstat_io_t *kiop)
-{
-	hrtime_t new, delta;
-	ulong_t wcnt;
-
-	new = gethrtime();
-	delta = new - kiop->wlastupdate;
-	kiop->wlastupdate = new;
-	wcnt = kiop->wcnt--;
-	ASSERT((int)wcnt > 0);
-	kiop->wlentime += delta * wcnt;
-	kiop->wtime += delta;
-}
-EXPORT_SYMBOL(kstat_waitq_exit);
-
-void
-kstat_runq_enter(kstat_io_t *kiop)
-{
-	hrtime_t new, delta;
-	ulong_t rcnt;
-
-	new = gethrtime();
-	delta = new - kiop->rlastupdate;
-	kiop->rlastupdate = new;
-	rcnt = kiop->rcnt++;
-	if (rcnt != 0) {
-		kiop->rlentime += delta * rcnt;
-		kiop->rtime += delta;
-	}
-}
-EXPORT_SYMBOL(kstat_runq_enter);
-
-void
-kstat_runq_exit(kstat_io_t *kiop)
-{
-	hrtime_t new, delta;
-	ulong_t rcnt;
-
-	new = gethrtime();
-	delta = new - kiop->rlastupdate;
-	kiop->rlastupdate = new;
-	rcnt = kiop->rcnt--;
-	ASSERT((int)rcnt > 0);
-	kiop->rlentime += delta * rcnt;
-	kiop->rtime += delta;
-}
-EXPORT_SYMBOL(kstat_runq_exit);
-
 static int
 kstat_seq_show_headers(struct seq_file *f)
 {
 	kstat_t *ksp = (kstat_t *)f->private;
 	int rc = 0;
 
 	ASSERT(ksp->ks_magic == KS_MAGIC);
 
 	seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n",
 	    ksp->ks_kid, ksp->ks_type, ksp->ks_flags,
 	    ksp->ks_ndata, (int)ksp->ks_data_size,
 	    ksp->ks_crtime, ksp->ks_snaptime);
 
 	switch (ksp->ks_type) {
 		case KSTAT_TYPE_RAW:
 restart:
 			if (ksp->ks_raw_ops.headers) {
 				rc = ksp->ks_raw_ops.headers(
 				    ksp->ks_raw_buf, ksp->ks_raw_bufsize);
 				if (rc == ENOMEM && !kstat_resize_raw(ksp))
 					goto restart;
 				if (!rc)
 					seq_puts(f, ksp->ks_raw_buf);
 			} else {
 				seq_printf(f, "raw data\n");
 			}
 			break;
 		case KSTAT_TYPE_NAMED:
 			seq_printf(f, "%-31s %-4s %s\n",
 			    "name", "type", "data");
 			break;
 		case KSTAT_TYPE_INTR:
 			seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n",
 			    "hard", "soft", "watchdog",
 			    "spurious", "multsvc");
 			break;
 		case KSTAT_TYPE_IO:
 			seq_printf(f,
 			    "%-8s %-8s %-8s %-8s %-8s %-8s "
 			    "%-8s %-8s %-8s %-8s %-8s %-8s\n",
 			    "nread", "nwritten", "reads", "writes",
 			    "wtime", "wlentime", "wupdate",
 			    "rtime", "rlentime", "rupdate",
 			    "wcnt", "rcnt");
 			break;
 		case KSTAT_TYPE_TIMER:
 			seq_printf(f,
 			    "%-31s %-8s "
 			    "%-8s %-8s %-8s %-8s %-8s\n",
 			    "name", "events", "elapsed",
 			    "min", "max", "start", "stop");
 			break;
 		default:
 			PANIC("Undefined kstat type %d\n", ksp->ks_type);
 	}
 
 	return (-rc);
 }
 
 /*
  * Hex dump 'l' bytes starting at 'p', sixteen bytes per row, each row
  * prefixed with its row index in hex.  Always returns 0.
  */
 static int
 kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l)
 {
 	int row = 0;
 	int col;
 
 	for (;;) {
 		seq_printf(f, "%03x:", row);
 
 		for (col = 0; col < 16; col++) {
 			int off = row * 16 + col;
 
 			/* Ran out of data: terminate the row and finish. */
 			if (off >= l) {
 				seq_printf(f, "\n");
 				return (0);
 			}
 
 			seq_printf(f, " %02x", (unsigned char)p[off]);
 		}
 
 		seq_printf(f, "\n");
 		row++;
 	}
 }
 
 /*
  * Print one named kstat entry as "<name> <data_type> <value>".  The value
  * is formatted according to knp->data_type; an unknown data type panics.
  * Character and string values are forcibly terminated in place before
  * they are printed.  Always returns 0.
  */
 static int
 kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp)
 {
 	seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type);
 
 	switch (knp->data_type) {
 		case KSTAT_DATA_CHAR:
 			knp->value.c[15] = '\0'; /* NUL terminate */
 			seq_printf(f, "%-16s", knp->value.c);
 			break;
 		/*
 		 * NOTE - We need to be more careful about which format
 		 * specifiers are used for each arch; for now this is
 		 * correct for x86_64.
 		 */
 		case KSTAT_DATA_INT32:
 			seq_printf(f, "%d", knp->value.i32);
 			break;
 		case KSTAT_DATA_UINT32:
 			seq_printf(f, "%u", knp->value.ui32);
 			break;
 		case KSTAT_DATA_INT64:
 			seq_printf(f, "%lld", (signed long long)knp->value.i64);
 			break;
 		case KSTAT_DATA_UINT64:
 			seq_printf(f, "%llu",
 			    (unsigned long long)knp->value.ui64);
 			break;
 		case KSTAT_DATA_LONG:
 			seq_printf(f, "%ld", knp->value.l);
 			break;
 		case KSTAT_DATA_ULONG:
 			seq_printf(f, "%lu", knp->value.ul);
 			break;
 		case KSTAT_DATA_STRING:
 			/* Force termination at the last byte of the buffer. */
 			KSTAT_NAMED_STR_PTR(knp)
 				[KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0';
 			seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp));
 			break;
 		default:
 			PANIC("Undefined kstat data type %d\n", knp->data_type);
 	}
 
 	seq_printf(f, "\n");
 
 	return (0);
 }
 
 /*
  * Print the five interrupt counters of an interrupt kstat on one line, in
  * the same column order as the header written by kstat_seq_show_headers().
  * Always returns 0.
  */
 static int
 kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip)
 {
 	seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n",
 	    kip->intrs[KSTAT_INTR_HARD],
 	    kip->intrs[KSTAT_INTR_SOFT],
 	    kip->intrs[KSTAT_INTR_WATCHDOG],
 	    kip->intrs[KSTAT_INTR_SPURIOUS],
 	    kip->intrs[KSTAT_INTR_MULTSVC]);
 
 	return (0);
 }
 
 /*
  * Print the fields of an I/O kstat on one line, in the same column order
  * as the header written by kstat_seq_show_headers().  Always returns 0.
  */
 static int
 kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip)
 {
 	/* though wlentime & friends are signed, they will never be negative */
 	seq_printf(f,
 	    "%-8llu %-8llu %-8u %-8u %-8llu %-8llu "
 	    "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n",
 	    kip->nread, kip->nwritten,
 	    kip->reads, kip->writes,
 	    kip->wtime, kip->wlentime, kip->wlastupdate,
 	    kip->rtime, kip->rlentime, kip->rlastupdate,
 	    kip->wcnt,  kip->rcnt);
 
 	return (0);
 }
 
 /*
  * Print the fields of a timer kstat on one line, in the same column order
  * as the header written by kstat_seq_show_headers().  Always returns 0.
  */
 static int
 kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp)
 {
 	seq_printf(f,
 	    "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n",
 	    ktp->name, ktp->num_events, ktp->elapsed_time,
 	    ktp->min_time, ktp->max_time,
 	    ktp->start_time, ktp->stop_time);
 
 	return (0);
 }
 
 static int
 kstat_seq_show(struct seq_file *f, void *p)
 {
 	kstat_t *ksp = (kstat_t *)f->private;
 	int rc = 0;
 
 	ASSERT(ksp->ks_magic == KS_MAGIC);
 
 	switch (ksp->ks_type) {
 		case KSTAT_TYPE_RAW:
 restart:
 			if (ksp->ks_raw_ops.data) {
 				rc = ksp->ks_raw_ops.data(
 				    ksp->ks_raw_buf, ksp->ks_raw_bufsize, p);
 				if (rc == ENOMEM && !kstat_resize_raw(ksp))
 					goto restart;
 				if (!rc)
 					seq_puts(f, ksp->ks_raw_buf);
 			} else {
 				ASSERT(ksp->ks_ndata == 1);
 				rc = kstat_seq_show_raw(f, ksp->ks_data,
 				    ksp->ks_data_size);
 			}
 			break;
 		case KSTAT_TYPE_NAMED:
 			rc = kstat_seq_show_named(f, (kstat_named_t *)p);
 			break;
 		case KSTAT_TYPE_INTR:
 			rc = kstat_seq_show_intr(f, (kstat_intr_t *)p);
 			break;
 		case KSTAT_TYPE_IO:
 			rc = kstat_seq_show_io(f, (kstat_io_t *)p);
 			break;
 		case KSTAT_TYPE_TIMER:
 			rc = kstat_seq_show_timer(f, (kstat_timer_t *)p);
 			break;
 		default:
 			PANIC("Undefined kstat type %d\n", ksp->ks_type);
 	}
 
 	return (-rc);
 }
 
 /*
  * Default ks_update handler installed by __kstat_create(): writes are
  * refused with EACCES, every other request succeeds without doing anything.
  */
 static int
 kstat_default_update(kstat_t *ksp, int rw)
 {
 	ASSERT(ksp != NULL);
 
 	return ((rw == KSTAT_WRITE) ? EACCES : 0);
 }
 
 /*
  * Return the address of the n-th record of a kstat.  Raw kstats defer to
  * the addr callback when one is installed and otherwise return ks_data;
  * all other types index into ks_data by the size of their record type.
  */
 static void *
 kstat_seq_data_addr(kstat_t *ksp, loff_t n)
 {
 	void *rc = NULL;
 	size_t stride = 0;
 
 	switch (ksp->ks_type) {
 	case KSTAT_TYPE_RAW:
 		if (ksp->ks_raw_ops.addr)
 			rc = ksp->ks_raw_ops.addr(ksp, n);
 		else
 			rc = ksp->ks_data;
 		break;
 	case KSTAT_TYPE_NAMED:
 		stride = sizeof (kstat_named_t);
 		break;
 	case KSTAT_TYPE_INTR:
 		stride = sizeof (kstat_intr_t);
 		break;
 	case KSTAT_TYPE_IO:
 		stride = sizeof (kstat_io_t);
 		break;
 	case KSTAT_TYPE_TIMER:
 		stride = sizeof (kstat_timer_t);
 		break;
 	default:
 		PANIC("Undefined kstat type %d\n", ksp->ks_type);
 	}
 
 	/* Fixed-size record types: step n records into the data area. */
 	if (stride != 0)
 		rc = ksp->ks_data + n * stride;
 
 	return (rc);
 }
 
 /*
  * seq_file ->start callback.  Takes ks_lock (released again in
  * kstat_seq_stop()), allocates the raw buffer for raw kstats, refreshes
  * the kstat through ks_update and prints the headers when starting at
  * position 0.
  *
  * Returns the address of record *pos, or NULL when the headers failed or
  * *pos is past the last record.
  */
 static void *
 kstat_seq_start(struct seq_file *f, loff_t *pos)
 {
 	kstat_t *ksp = (kstat_t *)f->private;
 	loff_t n = *pos;
 
 	ASSERT(ksp->ks_magic == KS_MAGIC);
 
 	mutex_enter(ksp->ks_lock);
 
 	/* Raw kstats start every pass with a one page buffer. */
 	if (ksp->ks_type == KSTAT_TYPE_RAW) {
 		ksp->ks_raw_bufsize = PAGE_SIZE;
 		ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP);
 	}
 
 	/* Dynamically update kstat, on error existing kstats are used */
 	(void) ksp->ks_update(ksp, KSTAT_READ);
 
 	ksp->ks_snaptime = gethrtime();
 
 	/* Headers are only printed once, at the very first position. */
 	if (n == 0 && (ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) == 0) {
 		if (kstat_seq_show_headers(f) != 0)
 			return (NULL);
 	}
 
 	if (n >= ksp->ks_ndata)
 		return (NULL);
 
 	return (kstat_seq_data_addr(ksp, n));
 }
 
 /*
  * seq_file ->next callback: advance *pos by one and return the address of
  * that record, or NULL once *pos has reached ks_ndata.
  */
 static void *
 kstat_seq_next(struct seq_file *f, void *p, loff_t *pos)
 {
 	kstat_t *ksp = (kstat_t *)f->private;
 	loff_t next;
 
 	ASSERT(ksp->ks_magic == KS_MAGIC);
 
 	next = ++*pos;
 	if (next >= ksp->ks_ndata)
 		return (NULL);
 
 	return (kstat_seq_data_addr(ksp, next));
 }
 
 /*
  * seq_file ->stop callback: free the raw buffer allocated by
  * kstat_seq_start() for raw kstats and drop ks_lock.
  */
 static void
 kstat_seq_stop(struct seq_file *f, void *v)
 {
 	kstat_t *ksp = (kstat_t *)f->private;
 
 	ASSERT(ksp->ks_magic == KS_MAGIC);
 
 	if (ksp->ks_type == KSTAT_TYPE_RAW) {
 		vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize);
 	}
 
 	mutex_exit(ksp->ks_lock);
 }
 
 /* seq_file iterator callbacks handed to seq_open() in proc_kstat_open(). */
 static struct seq_operations kstat_seq_ops = {
 	.start = kstat_seq_start,
 	.next  = kstat_seq_next,
 	.show  = kstat_seq_show,
 	.stop  = kstat_seq_stop,
 };
 
 /*
  * Look up a kstat module by name on kstat_module_list.  Names are compared
  * over at most KSTAT_STRLEN characters.  Returns the module or NULL.
  */
 static kstat_module_t *
 kstat_find_module(char *name)
 {
 	kstat_module_t *module = NULL;
 
 	list_for_each_entry(module, &kstat_module_list, ksm_module_list) {
 		if (strncmp(name, module->ksm_name, KSTAT_STRLEN) != 0)
 			continue;
 
 		return (module);
 	}
 
 	return (NULL);
 }
 
 /*
  * Create the proc directory for a kstat module beneath proc_spl_kstat and
  * append a new kstat_module_t for it to kstat_module_list.
  *
  * Returns the new module, or NULL when the directory cannot be created.
  */
 static kstat_module_t *
 kstat_create_module(char *name)
 {
 	struct proc_dir_entry *dir;
 	kstat_module_t *module;
 
 	dir = proc_mkdir(name, proc_spl_kstat);
 	if (dir == NULL)
 		return (NULL);
 
 	module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP);
 	strlcpy(module->ksm_name, name, KSTAT_STRLEN+1);
 	module->ksm_proc = dir;
 	INIT_LIST_HEAD(&module->ksm_kstat_list);
 	list_add_tail(&module->ksm_module_list, &kstat_module_list);
 
 	return (module);
 }
 
 /*
  * Tear down a kstat module: remove its proc directory, unlink it from
  * kstat_module_list and free it.  The module must no longer have any
  * entries on its ksm_kstat_list.
  */
 static void
 kstat_delete_module(kstat_module_t *module)
 {
 	ASSERT(list_empty(&module->ksm_kstat_list));
 	remove_proc_entry(module->ksm_name, proc_spl_kstat);
 	list_del(&module->ksm_module_list);
 	kmem_free(module, sizeof (kstat_module_t));
 }
 
 /*
  * proc open handler: set up a seq_file using kstat_seq_ops and store the
  * proc entry's data pointer in its private field.
  *
  * Returns 0 on success or the error from seq_open().
  */
 static int
 proc_kstat_open(struct inode *inode, struct file *filp)
 {
 	int rc = seq_open(filp, &kstat_seq_ops);
 
 	if (rc == 0) {
 		struct seq_file *f = filp->private_data;
 
 		f->private = PDE_DATA(inode);
 	}
 
 	return (rc);
 }
 
 /*
  * proc write handler.  The written bytes are not examined; any write just
  * invokes ks_update with KSTAT_WRITE while holding ks_lock.
  *
  * Returns 'len' (after advancing *ppos by it) on success, or the negated
  * error returned by ks_update.
  */
 static ssize_t
 proc_kstat_write(struct file *filp, const char __user *buf, size_t len,
     loff_t *ppos)
 {
 	struct seq_file *f = filp->private_data;
 	kstat_t *ksp = f->private;
 	int error;
 
 	ASSERT(ksp->ks_magic == KS_MAGIC);
 
 	mutex_enter(ksp->ks_lock);
 	error = ksp->ks_update(ksp, KSTAT_WRITE);
 	mutex_exit(ksp->ks_lock);
 
 	if (error == 0) {
 		*ppos += len;
 		return (len);
 	}
 
 	return (-error);
 }
 
 /*
  * proc file operations for kstat entries, passed to
  * kstat_proc_entry_install() by __kstat_install().  The member names
  * depend on HAVE_PROC_OPS_STRUCT; both variants install the same handlers.
  */
 static const kstat_proc_op_t proc_kstat_operations = {
 #ifdef HAVE_PROC_OPS_STRUCT
 	.proc_open	= proc_kstat_open,
 	.proc_write	= proc_kstat_write,
 	.proc_read	= seq_read,
 	.proc_lseek	= seq_lseek,
 	.proc_release	= seq_release,
 #else
 	.open		= proc_kstat_open,
 	.write		= proc_kstat_write,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= seq_release,
 #endif
 };
 
 /*
  * Install the callbacks used to render a raw kstat:
  *   headers - fills the buffer with the header text
  *   data    - fills the buffer with the text for one record
  *   addr    - returns the address of the record at a given index
  * A NULL callback selects the built-in behaviour for that step.
  */
 void
 __kstat_set_raw_ops(kstat_t *ksp,
     int (*headers)(char *buf, size_t size),
     int (*data)(char *buf, size_t size, void *data),
     void *(*addr)(kstat_t *ksp, loff_t index))
 {
 	ksp->ks_raw_ops.addr = addr;
 	ksp->ks_raw_ops.data = data;
 	ksp->ks_raw_ops.headers = headers;
 }
 EXPORT_SYMBOL(__kstat_set_raw_ops);
 
 /*
  * Initialise a kstat proc entry: no owner, no proc node, an empty list
  * link, and copies of the module and entry names.
  *
  * The names are copied with strlcpy() bounded by the destination arrays so
  * the stored strings are always NUL terminated, even when the entry was
  * not zeroed beforehand or a name is longer than the array can hold (in
  * which case it is truncated).
  */
 void
 kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module,
     const char *name)
 {
 	kpep->kpe_owner = NULL;
 	kpep->kpe_proc = NULL;
 	INIT_LIST_HEAD(&kpep->kpe_list);
 	strlcpy(kpep->kpe_module, module, sizeof (kpep->kpe_module));
 	strlcpy(kpep->kpe_name, name, sizeof (kpep->kpe_name));
 }
 EXPORT_SYMBOL(kstat_proc_entry_init);
 
 /*
  * Allocate and initialise a kstat; it is not visible under proc until
  * __kstat_install() is called.
  *
  * ks_instance is asserted to be 0.  For KSTAT_TYPE_RAW, ks_ndata is taken
  * as the data size in bytes; for every other type it is the number of
  * records.  Unless KSTAT_FLAG_VIRTUAL is set a zeroed data buffer is
  * allocated; with that flag ks_data is left NULL.
  *
  * Returns the new kstat, or NULL if an allocation returned NULL.
  */
 kstat_t *
 __kstat_create(const char *ks_module, int ks_instance, const char *ks_name,
     const char *ks_class, uchar_t ks_type, uint_t ks_ndata,
     uchar_t ks_flags)
 {
 	kstat_t *ksp;
 	size_t recsize = 0;
 
 	ASSERT(ks_module);
 	ASSERT(ks_instance == 0);
 	ASSERT(ks_name);
 
 	if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO))
 		ASSERT(ks_ndata == 1);
 
 	ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP);
 	if (ksp == NULL)
 		return (ksp);
 
 	/* Hand out the next kstat id under the module lock. */
 	mutex_enter(&kstat_module_lock);
 	ksp->ks_kid = kstat_id;
 	kstat_id++;
 	mutex_exit(&kstat_module_lock);
 
 	ksp->ks_magic = KS_MAGIC;
 	mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL);
 	ksp->ks_lock = &ksp->ks_private_lock;
 
 	ksp->ks_crtime = gethrtime();
 	ksp->ks_snaptime = ksp->ks_crtime;
 	ksp->ks_instance = ks_instance;
 	strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN);
 	ksp->ks_type = ks_type;
 	ksp->ks_flags = ks_flags;
 	ksp->ks_update = kstat_default_update;
 	ksp->ks_private = NULL;
 	ksp->ks_raw_ops.headers = NULL;
 	ksp->ks_raw_ops.data = NULL;
 	ksp->ks_raw_ops.addr = NULL;
 	ksp->ks_raw_buf = NULL;
 	ksp->ks_raw_bufsize = 0;
 	kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name);
 
 	switch (ksp->ks_type) {
 	case KSTAT_TYPE_RAW:
 		/* A raw kstat is one record of ks_ndata bytes. */
 		ksp->ks_ndata = 1;
 		ksp->ks_data_size = ks_ndata;
 		break;
 	case KSTAT_TYPE_NAMED:
 		recsize = sizeof (kstat_named_t);
 		break;
 	case KSTAT_TYPE_INTR:
 		recsize = sizeof (kstat_intr_t);
 		break;
 	case KSTAT_TYPE_IO:
 		recsize = sizeof (kstat_io_t);
 		break;
 	case KSTAT_TYPE_TIMER:
 		recsize = sizeof (kstat_timer_t);
 		break;
 	default:
 		PANIC("Undefined kstat type %d\n", ksp->ks_type);
 	}
 
 	/* Fixed-size record types: ks_ndata records of recsize bytes. */
 	if (recsize != 0) {
 		ksp->ks_ndata = ks_ndata;
 		ksp->ks_data_size = ks_ndata * recsize;
 	}
 
 	if ((ksp->ks_flags & KSTAT_FLAG_VIRTUAL) == 0) {
 		ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP);
 		if (ksp->ks_data == NULL) {
 			kmem_free(ksp, sizeof (*ksp));
 			ksp = NULL;
 		}
 	} else {
 		ksp->ks_data = NULL;
 	}
 
 	return (ksp);
 }
 EXPORT_SYMBOL(__kstat_create);
 
 /*
  * Check whether the module path of 'kpep' clashes with an existing entry.
  * The module name is split at its last '/'; when a module matching the
  * part before the slash exists and already holds an entry named like the
  * part after it, the names collide.
  *
  * Returns EEXIST on a collision and 0 otherwise, including when the module
  * name contains no '/'.
  */
 static int
 kstat_detect_collision(kstat_proc_entry_t *kpep)
 {
 	kstat_module_t *module;
 	kstat_proc_entry_t *tmp = NULL;
 	char *parent;
 	char *cp;
 	int rc = 0;
 
 	parent = kmem_asprintf("%s", kpep->kpe_module);
 
 	cp = strrchr(parent, '/');
 	if (cp != NULL) {
 		/* Cut 'parent' at the last slash; cp+1 is the leaf name. */
 		*cp = '\0';
 
 		module = kstat_find_module(parent);
 		if (module != NULL) {
 			list_for_each_entry(tmp, &module->ksm_kstat_list,
 			    kpe_list) {
 				if (strncmp(tmp->kpe_name, cp+1,
 				    KSTAT_STRLEN) == 0) {
 					rc = EEXIST;
 					break;
 				}
 			}
 		}
 	}
 
 	kmem_strfree(parent);
 	return (rc);
 }
 
 /*
  * Add a file to the proc filesystem under the kstat namespace (i.e.
  * /proc/spl/kstat/). The file need not necessarily be implemented as a
  * kstat.
  *
  * The module directory is created on demand.  The function returns nothing;
  * on any failure (namespace collision, module creation failure, duplicate
  * entry name, proc_create_data() failure) the entry is simply not
  * installed.  After a proc_create_data() failure kpe_proc is NULL.
  * All work is done while holding kstat_module_lock.
  */
 void
 kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode,
     const kstat_proc_op_t *proc_ops, void *data)
 {
 	kstat_module_t *module;
 	kstat_proc_entry_t *tmp = NULL;
 
 	ASSERT(kpep);
 
 	mutex_enter(&kstat_module_lock);
 
 	module = kstat_find_module(kpep->kpe_module);
 	if (module == NULL) {
 		/* New module: refuse names that clash with an entry. */
 		if (kstat_detect_collision(kpep) != 0) {
 			cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \
 			    " collision", kpep->kpe_module, kpep->kpe_name);
 			goto out;
 		}
 		module = kstat_create_module(kpep->kpe_module);
 		if (module == NULL)
 			goto out;
 	}
 
 	/*
 	 * Only one entry by this name per-module, on failure the module
 	 * shouldn't be deleted because we know it has at least one entry.
 	 */
 	list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) {
 		if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0)
 			goto out;
 	}
 
 	list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list);
 
 	kpep->kpe_owner = module;
 	kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode,
 	    module->ksm_proc, proc_ops, data);
 	if (kpep->kpe_proc == NULL) {
 		/* Undo the list insertion; drop the module if now empty. */
 		list_del_init(&kpep->kpe_list);
 		if (list_empty(&module->ksm_kstat_list))
 			kstat_delete_module(module);
 	}
 out:
 	mutex_exit(&kstat_module_lock);
 
 }
 EXPORT_SYMBOL(kstat_proc_entry_install);
 
 /*
  * Publish a kstat under proc using proc_kstat_operations.  The entry named
  * "dbufs" is created with mode 0600, every other entry with mode 0644.
  */
 void
 __kstat_install(kstat_t *ksp)
 {
 	ASSERT(ksp);
 	mode_t mode;
 
 	/* Specify permission modes for different kstats */
 	mode = (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) ?
 	    0600 : 0644;
 
 	kstat_proc_entry_install(
 	    &ksp->ks_proc, mode, &proc_kstat_operations, ksp);
 }
 EXPORT_SYMBOL(__kstat_install);
 
 /*
  * Remove a kstat proc entry: delete its proc node if it has one, unlink it
  * from its module's entry list and, if that leaves the module without
  * entries, delete the module as well.
  */
 void
 kstat_proc_entry_delete(kstat_proc_entry_t *kpep)
 {
 	kstat_module_t *module = kpep->kpe_owner;
 	/* The proc node is removed before kstat_module_lock is taken. */
 	if (kpep->kpe_proc)
 		remove_proc_entry(kpep->kpe_name, module->ksm_proc);
 
 	mutex_enter(&kstat_module_lock);
 	list_del_init(&kpep->kpe_list);
 
 	/*
 	 * Remove top level module directory if it wasn't empty before, but now
 	 * is.
 	 */
 	if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list))
 		kstat_delete_module(module);
 	mutex_exit(&kstat_module_lock);
 
 }
 EXPORT_SYMBOL(kstat_proc_entry_delete);
 
 /*
  * Destroy a kstat: remove its proc entry, free the data buffer unless the
  * kstat is virtual, destroy the private lock and free the kstat itself.
  */
 void
 __kstat_delete(kstat_t *ksp)
 {
 	kstat_proc_entry_delete(&ksp->ks_proc);
 
 	/* The data buffer is only freed here when the kstat is not virtual. */
 	if ((ksp->ks_flags & KSTAT_FLAG_VIRTUAL) == 0) {
 		kmem_free(ksp->ks_data, ksp->ks_data_size);
 	}
 
 	ksp->ks_lock = NULL;
 	mutex_destroy(&ksp->ks_private_lock);
 
 	kmem_free(ksp, sizeof (*ksp));
 }
 EXPORT_SYMBOL(__kstat_delete);
 
 /*
  * Set up the kstat subsystem state: reset the id counter, initialise the
  * module list and its lock.  Always returns 0.
  */
 int
 spl_kstat_init(void)
 {
 	kstat_id = 0;
 	INIT_LIST_HEAD(&kstat_module_list);
 	mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	return (0);
 }
 
 /*
  * Tear down the kstat subsystem state.  Every module is expected to have
  * been removed from kstat_module_list by this point.
  */
 void
 spl_kstat_fini(void)
 {
 	ASSERT(list_empty(&kstat_module_list));
 
 	mutex_destroy(&kstat_module_lock);
 }
diff --git a/module/zfs/spa_stats.c b/module/zfs/spa_stats.c
index c3eacc14239e..534ac72fee7b 100644
--- a/module/zfs/spa_stats.c
+++ b/module/zfs/spa_stats.c
@@ -1,1029 +1,979 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 #include <sys/zfs_context.h>
 #include <sys/spa_impl.h>
 #include <sys/vdev_impl.h>
 #include <sys/spa.h>
 #include <zfs_comutil.h>
 
 /*
  * Keeps stats on last N reads per spa_t, disabled by default.
  * spa_read_history_add() truncates the read history list to this many
  * entries after every addition.
  */
 int zfs_read_history = 0;
 
 /*
  * Include cache hits in history, disabled by default.
  * When 0, reads flagged ARC_FLAG_CACHED are not recorded.
  */
 int zfs_read_history_hits = 0;
 
 /*
  * Keeps stats on the last 100 txgs by default.
  * When 0, the spa_txg_history_set*() and *_io() routines do nothing.
  */
 int zfs_txg_history = 100;
 
 /*
  * Keeps stats on the last N MMP updates, disabled by default.
  */
 int zfs_multihost_history = 0;
 
 /*
  * ==========================================================================
  * SPA Read History Routines
  * ==========================================================================
  */
 
 /*
  * Read statistics - Information exported regarding each arc_read call
  */
 /*
  * One entry of the per-pool read history list; allocated and filled in by
  * spa_read_history_add() and printed by spa_read_history_show().
  */
 typedef struct spa_read_history {
 	hrtime_t	start;		/* time read completed */
 	uint64_t	objset;		/* read from this objset */
 	uint64_t	object;		/* read of this object number */
 	uint64_t	level;		/* block's indirection level */
 	uint64_t	blkid;		/* read of this block id */
 	char		origin[24];	/* read originated from here */
 	uint32_t	aflags;		/* ARC flags (cached, prefetch, etc.) */
 	pid_t		pid;		/* PID of task doing read */
 	char		comm[16];	/* process name of task doing read */
 	procfs_list_node_t	srh_node;	/* procfs list linkage */
 } spa_read_history_t;
 
 /*
  * Print the column header line of the read history file.  The columns
  * match the fields printed by spa_read_history_show().  Always returns 0.
  */
 static int
 spa_read_history_show_header(struct seq_file *f)
 {
 	seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s "
 	    "%-24s %-8s %-16s\n", "UID", "start", "objset", "object",
 	    "level", "blkid", "aflags", "origin", "pid", "process");
 
 	return (0);
 }
 
 /*
  * Print one read history entry ('data' is a spa_read_history_t) as a single
  * line.  The first column is the entry's procfs list node id.
  * Always returns 0.
  */
 static int
 spa_read_history_show(struct seq_file *f, void *data)
 {
 	spa_read_history_t *srh = (spa_read_history_t *)data;
 
 	seq_printf(f, "%-8llu %-16llu 0x%-6llx "
 	    "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n",
 	    (u_longlong_t)srh->srh_node.pln_id, srh->start,
 	    (longlong_t)srh->objset, (longlong_t)srh->object,
 	    (longlong_t)srh->level, (longlong_t)srh->blkid,
 	    srh->aflags, srh->origin, srh->pid, srh->comm);
 
 	return (0);
 }
 
 /*
  * Drop entries from the head of the read history list until at most 'size'
  * remain, freeing each one.
  */
 static void
 spa_read_history_truncate(spa_history_list_t *shl, unsigned int size)
 {
 	while (shl->size > size) {
 		spa_read_history_t *oldest;
 
 		oldest = list_remove_head(&shl->procfs_list.pl_list);
 		ASSERT3P(oldest, !=, NULL);
 		kmem_free(oldest, sizeof (spa_read_history_t));
 		shl->size--;
 	}
 
 	/* A full truncation must leave the list itself empty as well. */
 	if (size == 0)
 		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
 }
 
 /*
  * Clear callback for the read history procfs list: drop every entry while
  * holding the list lock.  Always returns 0.
  */
 static int
 spa_read_history_clear(procfs_list_t *procfs_list)
 {
 	spa_history_list_t *history = procfs_list->pl_private;
 
 	mutex_enter(&procfs_list->pl_lock);
 	spa_read_history_truncate(history, 0);
 	mutex_exit(&procfs_list->pl_lock);
 
 	return (0);
 }
 
 /*
  * Start with an empty read history for 'spa' and install it as the procfs
  * list "reads" (mode 0600) for module "zfs", submodule spa_name(spa).
  */
 static void
 spa_read_history_init(spa_t *spa)
 {
 	spa_history_list_t *history = &spa->spa_stats.read_history;
 
 	history->size = 0;
 	/* Lets the clear callback get back to the history list. */
 	history->procfs_list.pl_private = history;
 
 	procfs_list_install("zfs", spa_name(spa), "reads", 0600,
 	    &history->procfs_list,
 	    spa_read_history_show,
 	    spa_read_history_show_header,
 	    spa_read_history_clear,
 	    offsetof(spa_read_history_t, srh_node));
 }
 
 /*
  * Uninstall the read history procfs list, free all of its entries and
  * destroy the list.
  */
 static void
 spa_read_history_destroy(spa_t *spa)
 {
 	spa_history_list_t *history = &spa->spa_stats.read_history;
 
 	procfs_list_uninstall(&history->procfs_list);
 	spa_read_history_truncate(history, 0);
 	procfs_list_destroy(&history->procfs_list);
 }
 
 /*
  * Record a read of bookmark 'zb' with ARC flags 'aflags' in the pool's read
  * history, then trim the list to zfs_read_history entries.
  *
  * Nothing is recorded when the history is disabled and already empty, or
  * when the read was a cache hit and zfs_read_history_hits is 0.
  */
 void
 spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags)
 {
 	spa_history_list_t *history = &spa->spa_stats.read_history;
 	spa_read_history_t *entry;
 
 	ASSERT3P(spa, !=, NULL);
 	ASSERT3P(zb,  !=, NULL);
 
 	/*
 	 * With the history disabled, only continue while old entries
 	 * remain so the truncation below can drain them.
 	 */
 	if (zfs_read_history == 0 && history->size == 0)
 		return;
 
 	if ((aflags & ARC_FLAG_CACHED) && zfs_read_history_hits == 0)
 		return;
 
 	entry = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP);
 	strlcpy(entry->comm, getcomm(), sizeof (entry->comm));
 	entry->start = gethrtime();
 	entry->objset = zb->zb_objset;
 	entry->object = zb->zb_object;
 	entry->level = zb->zb_level;
 	entry->blkid = zb->zb_blkid;
 	entry->aflags = aflags;
 	entry->pid = getpid();
 
 	mutex_enter(&history->procfs_list.pl_lock);
 	procfs_list_add(&history->procfs_list, entry);
 	history->size++;
 	spa_read_history_truncate(history, zfs_read_history);
 	mutex_exit(&history->procfs_list.pl_lock);
 }
 
 /*
  * ==========================================================================
  * SPA TXG History Routines
  * ==========================================================================
  */
 
 /*
  * Txg statistics - Information exported regarding each txg sync
  */
 
 /*
  * One entry of the per-pool txg history list; created by
  * spa_txg_history_add(), updated by spa_txg_history_set() and
  * spa_txg_history_set_io(), and printed by spa_txg_history_show().
  */
 typedef struct spa_txg_history {
 	uint64_t	txg;		/* txg id */
 	txg_state_t	state;		/* active txg state */
 	uint64_t	nread;		/* number of bytes read */
 	uint64_t	nwritten;	/* number of bytes written */
 	uint64_t	reads;		/* number of read operations */
 	uint64_t	writes;		/* number of write operations */
 	uint64_t	ndirty;		/* number of dirty bytes */
 	hrtime_t	times[TXG_STATE_COMMITTED]; /* completion times */
 	procfs_list_node_t	sth_node;	/* procfs list linkage */
 } spa_txg_history_t;
 
 /*
  * Print the column header line of the txg history file.  The columns match
  * the fields printed by spa_txg_history_show().  Always returns 0.
  */
 static int
 spa_txg_history_show_header(struct seq_file *f)
 {
 	seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s "
 	    "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state",
 	    "ndirty", "nread", "nwritten", "reads", "writes",
 	    "otime", "qtime", "wtime", "stime");
 	return (0);
 }
 
 /*
  * Print one txg history entry ('data' is a spa_txg_history_t) as a single
  * line.  The state is shown as a single letter, and the otime/qtime/wtime/
  * stime columns are the differences between consecutive state completion
  * times; a duration is printed as 0 until its end time has been recorded.
  * Always returns 0.
  */
 static int
 spa_txg_history_show(struct seq_file *f, void *data)
 {
 	spa_txg_history_t *sth = (spa_txg_history_t *)data;
 	uint64_t open = 0, quiesce = 0, wait = 0, sync = 0;
 	char state;
 
 	/* Map the txg state to its one-letter code; '?' if unknown. */
 	switch (sth->state) {
 		case TXG_STATE_BIRTH:		state = 'B';	break;
 		case TXG_STATE_OPEN:		state = 'O';	break;
 		case TXG_STATE_QUIESCED:	state = 'Q';	break;
 		case TXG_STATE_WAIT_FOR_SYNC:	state = 'W';	break;
 		case TXG_STATE_SYNCED:		state = 'S';	break;
 		case TXG_STATE_COMMITTED:	state = 'C';	break;
 		default:			state = '?';	break;
 	}
 
 	/* Each duration is computed only once its end time is non-zero. */
 	if (sth->times[TXG_STATE_OPEN])
 		open = sth->times[TXG_STATE_OPEN] -
 		    sth->times[TXG_STATE_BIRTH];
 
 	if (sth->times[TXG_STATE_QUIESCED])
 		quiesce = sth->times[TXG_STATE_QUIESCED] -
 		    sth->times[TXG_STATE_OPEN];
 
 	if (sth->times[TXG_STATE_WAIT_FOR_SYNC])
 		wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] -
 		    sth->times[TXG_STATE_QUIESCED];
 
 	if (sth->times[TXG_STATE_SYNCED])
 		sync = sth->times[TXG_STATE_SYNCED] -
 		    sth->times[TXG_STATE_WAIT_FOR_SYNC];
 
 	seq_printf(f, "%-8llu %-16llu %-5c %-12llu "
 	    "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n",
 	    (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state,
 	    (u_longlong_t)sth->ndirty,
 	    (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten,
 	    (u_longlong_t)sth->reads, (u_longlong_t)sth->writes,
 	    (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait,
 	    (u_longlong_t)sync);
 
 	return (0);
 }
 
 /*
  * Drop entries from the head of the txg history list until at most 'size'
  * remain, freeing each one.
  */
 static void
 spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size)
 {
 	while (shl->size > size) {
 		spa_txg_history_t *oldest;
 
 		oldest = list_remove_head(&shl->procfs_list.pl_list);
 		ASSERT3P(oldest, !=, NULL);
 		kmem_free(oldest, sizeof (spa_txg_history_t));
 		shl->size--;
 	}
 
 	/* A full truncation must leave the list itself empty as well. */
 	if (size == 0)
 		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
 }
 
 /*
  * Clear callback for the txg history procfs list: drop every entry while
  * holding the list lock.  Always returns 0.
  */
 static int
 spa_txg_history_clear(procfs_list_t *procfs_list)
 {
 	spa_history_list_t *history = procfs_list->pl_private;
 
 	mutex_enter(&procfs_list->pl_lock);
 	spa_txg_history_truncate(history, 0);
 	mutex_exit(&procfs_list->pl_lock);
 
 	return (0);
 }
 
 /*
  * Start with an empty txg history for 'spa' and install it as the procfs
  * list "txgs" (mode 0644) for module "zfs", submodule spa_name(spa).
  */
 static void
 spa_txg_history_init(spa_t *spa)
 {
 	spa_history_list_t *history = &spa->spa_stats.txg_history;
 
 	history->size = 0;
 	/* Lets the clear callback get back to the history list. */
 	history->procfs_list.pl_private = history;
 
 	procfs_list_install("zfs", spa_name(spa), "txgs", 0644,
 	    &history->procfs_list,
 	    spa_txg_history_show,
 	    spa_txg_history_show_header,
 	    spa_txg_history_clear,
 	    offsetof(spa_txg_history_t, sth_node));
 }
 
 /*
  * Uninstall the txg history procfs list, free all of its entries and
  * destroy the list.
  */
 static void
 spa_txg_history_destroy(spa_t *spa)
 {
 	spa_history_list_t *history = &spa->spa_stats.txg_history;
 
 	procfs_list_uninstall(&history->procfs_list);
 	spa_txg_history_truncate(history, 0);
 	procfs_list_destroy(&history->procfs_list);
 }
 
 /*
  * Append a record for 'txg' to the pool's txg history.  The record starts
  * in TXG_STATE_OPEN with 'birth_time' stored as its birth time, and the
  * list is then trimmed to zfs_txg_history entries.  Nothing happens when
  * the history is disabled and already empty.
  */
 void
 spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time)
 {
 	spa_history_list_t *history = &spa->spa_stats.txg_history;
 	spa_txg_history_t *entry;
 
 	/*
 	 * With the history disabled, only continue while old entries
 	 * remain so the truncation below can drain them.
 	 */
 	if (zfs_txg_history == 0 && history->size == 0)
 		return;
 
 	entry = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP);
 	entry->txg = txg;
 	entry->state = TXG_STATE_OPEN;
 	entry->times[TXG_STATE_BIRTH] = birth_time;
 
 	mutex_enter(&history->procfs_list.pl_lock);
 
 	procfs_list_add(&history->procfs_list, entry);
 	history->size++;
 
 	spa_txg_history_truncate(history, zfs_txg_history);
 
 	mutex_exit(&history->procfs_list.pl_lock);
 }
 
 /*
  * Record 'completed_time' as the completion time of 'completed_state' for
  * 'txg' and advance that record's state by one.  The list is searched from
  * its tail.
  *
  * Returns 0 when the record was updated or the history is disabled, and
  * ENOENT when no record for 'txg' exists.
  */
 int
 spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state,
     hrtime_t completed_time)
 {
 	spa_history_list_t *shl = &spa->spa_stats.txg_history;
 	spa_txg_history_t *sth;
 	int error = ENOENT;
 
 	if (zfs_txg_history == 0)
 		return (0);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	sth = list_tail(&shl->procfs_list.pl_list);
 	while (sth != NULL) {
 		if (sth->txg == txg) {
 			sth->times[completed_state] = completed_time;
 			sth->state++;
 			error = 0;
 			break;
 		}
 		sth = list_prev(&shl->procfs_list.pl_list, sth);
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	return (error);
 }
 
 /*
  * Store the I/O statistics for 'txg' in its history record.  The list is
  * searched from its tail.
  *
  * Returns 0 when the record was updated or the history is disabled, and
  * ENOENT when no record for 'txg' exists.
  */
 static int
 spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread,
     uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty)
 {
 	spa_history_list_t *shl = &spa->spa_stats.txg_history;
 	spa_txg_history_t *sth;
 	int error = ENOENT;
 
 	if (zfs_txg_history == 0)
 		return (0);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	sth = list_tail(&shl->procfs_list.pl_list);
 	while (sth != NULL) {
 		if (sth->txg == txg) {
 			sth->ndirty = ndirty;
 			sth->nread = nread;
 			sth->nwritten = nwritten;
 			sth->reads = reads;
 			sth->writes = writes;
 			error = 0;
 			break;
 		}
 		sth = list_prev(&shl->procfs_list.pl_list, sth);
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	return (error);
 }
 
 /*
  * Begin collecting I/O statistics for 'txg': snapshot the root vdev stats
  * and the txg's dirty byte count into a new txg_stat_t, and record
  * completion of TXG_STATE_WAIT_FOR_SYNC in the history.
  *
  * Returns the snapshot, to be passed to spa_txg_history_fini_io(), or NULL
  * when the txg history is disabled.
  */
 txg_stat_t *
 spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp)
 {
 	txg_stat_t *snap;
 
 	if (zfs_txg_history == 0)
 		return (NULL);
 
 	snap = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP);
 
 	/* Starting vdev counters, read under the config lock. */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	vdev_get_stats(spa->spa_root_vdev, &snap->vs1);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	snap->txg = txg;
 	snap->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK];
 
 	spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime());
 
 	return (snap);
 }
 
 /*
  * Finish collecting I/O statistics for the txg described by 'ts': take a
  * second snapshot of the root vdev stats, record completion of
  * TXG_STATE_SYNCED and store the deltas between the two snapshots in the
  * history record.  'ts' is always freed; a NULL 'ts' is ignored.
  */
 void
 spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts)
 {
 	uint64_t nread, nwritten, reads, writes;
 
 	if (ts == NULL)
 		return;
 
 	/* History was disabled in the meantime: just release the snapshot. */
 	if (zfs_txg_history == 0) {
 		kmem_free(ts, sizeof (txg_stat_t));
 		return;
 	}
 
 	/* Ending vdev counters, read under the config lock. */
 	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
 	vdev_get_stats(spa->spa_root_vdev, &ts->vs2);
 	spa_config_exit(spa, SCL_CONFIG, FTAG);
 
 	spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime());
 
 	nread = ts->vs2.vs_bytes[ZIO_TYPE_READ] -
 	    ts->vs1.vs_bytes[ZIO_TYPE_READ];
 	nwritten = ts->vs2.vs_bytes[ZIO_TYPE_WRITE] -
 	    ts->vs1.vs_bytes[ZIO_TYPE_WRITE];
 	reads = ts->vs2.vs_ops[ZIO_TYPE_READ] -
 	    ts->vs1.vs_ops[ZIO_TYPE_READ];
 	writes = ts->vs2.vs_ops[ZIO_TYPE_WRITE] -
 	    ts->vs1.vs_ops[ZIO_TYPE_WRITE];
 
 	spa_txg_history_set_io(spa, ts->txg, nread, nwritten, reads, writes,
 	    ts->ndirty);
 
 	kmem_free(ts, sizeof (txg_stat_t));
 }
 
 /*
  * ==========================================================================
  * SPA TX Assign Histogram Routines
  * ==========================================================================
  */
 
 /*
  * Tx statistics - Information exported regarding dmu_tx_assign time.
  */
 
 /*
  * ks_update handler for the dmu_tx_assign histogram.  A write zeroes every
  * bucket.  In all cases ks_ndata and ks_data_size are then shrunk so that
  * the trailing run of zero buckets is not output.  Always returns 0.
  */
 static int
 spa_tx_assign_update(kstat_t *ksp, int rw)
 {
 	spa_t *spa = ksp->ks_private;
 	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
 	kstat_named_t *buckets = (kstat_named_t *)shk->priv;
 	int i;
 
 	if (rw == KSTAT_WRITE) {
 		for (i = 0; i < shk->count; i++)
 			buckets[i].value.ui64 = 0;
 	}
 
 	/* Walk back from the end past every bucket that is still zero. */
 	i = shk->count;
 	while (i > 0 && buckets[i-1].value.ui64 == 0)
 		i--;
 
 	ksp->ks_ndata = i;
 	ksp->ks_data_size = i * sizeof (kstat_named_t);
 
 	return (0);
 }
 
 /*
  * Allocate the dmu_tx_assign histogram for 'spa' and publish it as the
  * virtual named kstat "dmu_tx_assign" under module "zfs/<pool name>".
  * Bucket i is labelled "<2^i> ns".  If kstat_create() fails the buckets
  * remain allocated but no kstat is installed.
  */
 static void
 spa_tx_assign_init(spa_t *spa)
 {
 	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
 	kstat_named_t *buckets;
 	kstat_t *ksp;
 	char *module;
 	int b;
 
 	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
 
 	shk->count = 42; /* power of two buckets for 1ns to 2,199s */
 	shk->size = shk->count * sizeof (kstat_named_t);
 	shk->priv = kmem_alloc(shk->size, KM_SLEEP);
 	buckets = shk->priv;
 
 	module = kmem_asprintf("zfs/%s", spa_name(spa));
 
 	for (b = 0; b < shk->count; b++) {
 		buckets[b].data_type = KSTAT_DATA_UINT64;
 		buckets[b].value.ui64 = 0;
 		(void) snprintf(buckets[b].name, KSTAT_STRLEN, "%llu ns",
 		    (u_longlong_t)1 << b);
 	}
 
 	ksp = kstat_create(module, 0, "dmu_tx_assign", "misc",
 	    KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
 	shk->kstat = ksp;
 
 	if (ksp != NULL) {
 		/* Virtual kstat: the data area is the bucket array itself. */
 		ksp->ks_lock = &shk->lock;
 		ksp->ks_data = shk->priv;
 		ksp->ks_ndata = shk->count;
 		ksp->ks_data_size = shk->size;
 		ksp->ks_private = spa;
 		ksp->ks_update = spa_tx_assign_update;
 		kstat_install(ksp);
 	}
 
 	kmem_strfree(module);
 }
 
 /*
  * Tear down the dmu_tx_assign histogram: delete the kstat if one was
  * created, then free the bucket array and destroy the lock.
  */
 static void
 spa_tx_assign_destroy(spa_t *spa)
 {
 	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
 
 	if (shk->kstat != NULL)
 		kstat_delete(shk->kstat);
 
 	kmem_free(shk->priv, shk->size);
 	mutex_destroy(&shk->lock);
 }
 
 /*
  * Account one dmu_tx_assign delay of 'nsecs' nanoseconds in the
  * power-of-two histogram.  The bucket chosen is the smallest index idx with
  * 2^idx >= nsecs; delays larger than the last bucket's bound are counted in
  * the last bucket.  The counter is bumped atomically.
  */
 void
 spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
 {
 	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
 	uint64_t idx = 0;
 
 	/*
 	 * The search must be bounded by the number of buckets (shk->count).
 	 * shk->size is the byte size of the bucket array, which is far
 	 * larger than the bucket count, so using it as the bound would allow
 	 * idx to run past the end of the array (and the shift count to reach
 	 * the width of the type) for very large delays.
 	 */
 	while (((1ULL << idx) < nsecs) && (idx < shk->count - 1))
 		idx++;
 
 	atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64);
 }
 
-/*
- * ==========================================================================
- * SPA IO History Routines
- * ==========================================================================
- */
-static int
-spa_io_history_update(kstat_t *ksp, int rw)
-{
-	if (rw == KSTAT_WRITE)
-		memset(ksp->ks_data, 0, ksp->ks_data_size);
-
-	return (0);
-}
-
-static void
-spa_io_history_init(spa_t *spa)
-{
-	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
-	char *name;
-	kstat_t *ksp;
-
-	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
-
-	name = kmem_asprintf("zfs/%s", spa_name(spa));
-
-	ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
-	shk->kstat = ksp;
-
-	if (ksp) {
-		ksp->ks_lock = &shk->lock;
-		ksp->ks_private = spa;
-		ksp->ks_update = spa_io_history_update;
-		kstat_install(ksp);
-	}
-	kmem_strfree(name);
-}
-
-static void
-spa_io_history_destroy(spa_t *spa)
-{
-	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
-
-	if (shk->kstat)
-		kstat_delete(shk->kstat);
-
-	mutex_destroy(&shk->lock);
-}
-
 /*
  * ==========================================================================
  * SPA MMP History Routines
  * ==========================================================================
  */
 
 /*
  * MMP statistics - Information exported regarding attempted MMP writes
  *   For MMP writes issued, fields used as per comments below.
  *   For MMP writes skipped, an entry represents a span of time when
  *      writes were skipped for same reason (error from mmp_random_leaf).
  *      Differences are:
  *      timestamp	time first write skipped, if >1 skipped in a row
  *      mmp_delay	delay value at timestamp
  *      vdev_guid	number of writes skipped
  *      io_error	one of enum mmp_error
  *      duration	time span (ns) of skipped writes
  */
 
 /*
  * One record in a pool's MMP history list.  Records are created by
  * spa_mmp_history_add() and freed by spa_mmp_history_truncate().
  */
 typedef struct spa_mmp_history {
 	uint64_t	mmp_node_id;	/* unique # for updates */
 	uint64_t	txg;		/* txg of last sync */
 	uint64_t	timestamp;	/* UTC time MMP write issued */
 	uint64_t	mmp_delay;	/* mmp_thread.mmp_delay at timestamp */
 	uint64_t	vdev_guid;	/* unique ID of leaf vdev */
 	char		*vdev_path;	/* private copy of vdev path, or NULL */
 	int		vdev_label;	/* vdev label */
 	int		io_error;	/* error status of MMP write */
 	hrtime_t	error_start;	/* hrtime of start of error period */
 	hrtime_t	duration;	/* time from submission to completion */
 	procfs_list_node_t	smh_node;	/* linkage in the procfs list */
 } spa_mmp_history_t;
 
 /*
  * Header callback for the MMP history list: prints the column titles in the
  * same order as the values printed by spa_mmp_history_show().  Always
  * returns 0.
  */
 static int
 spa_mmp_history_show_header(struct seq_file *f)
 {
 	seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s "
 	    "%-10s %s\n", "id", "txg", "timestamp", "error", "duration",
 	    "mmp_delay", "vdev_guid", "vdev_label", "vdev_path");
 	return (0);
 }
 
 static int
 spa_mmp_history_show(struct seq_file *f, void *data)
 {
 	spa_mmp_history_t *smh = (spa_mmp_history_t *)data;
 	char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu "
 	    "%-10lld %s\n";
 	char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu "
 	    "%-10lld %s\n";
 
 	seq_printf(f, (smh->error_start ? skip_fmt : write_fmt),
 	    (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg,
 	    (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error,
 	    (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay,
 	    (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label,
 	    (smh->vdev_path ? smh->vdev_path : "-"));
 
 	return (0);
 }
 
 /*
  * Drop records from the head of the list (the oldest ones) until at most
  * 'size' remain, releasing each record's path copy and the record itself.
  * The body takes no lock itself.
  */
 static void
 spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size)
 {
 	while (shl->size > size) {
 		spa_mmp_history_t *oldest =
 		    list_remove_head(&shl->procfs_list.pl_list);
 
 		if (oldest->vdev_path != NULL)
 			kmem_strfree(oldest->vdev_path);
 		kmem_free(oldest, sizeof (spa_mmp_history_t));
 		shl->size--;
 	}
 
 	if (size == 0)
 		ASSERT(list_is_empty(&shl->procfs_list.pl_list));
 }
 
 /*
  * Clear callback for the MMP history list: discards every record while
  * holding the list lock.  Always returns 0.
  */
 static int
 spa_mmp_history_clear(procfs_list_t *procfs_list)
 {
 	spa_history_list_t *history = procfs_list->pl_private;
 
 	mutex_enter(&procfs_list->pl_lock);
 	spa_mmp_history_truncate(history, 0);
 	mutex_exit(&procfs_list->pl_lock);
 
 	return (0);
 }
 
 /*
  * Start with an empty MMP history and install it as the procfs list
  * "multihost" under "zfs"/<pool name> with mode 0644, wiring up the show,
  * header and clear callbacks defined above.
  */
 static void
 spa_mmp_history_init(spa_t *spa)
 {
 	spa_history_list_t *history = &spa->spa_stats.mmp_history;
 
 	history->size = 0;
 	history->procfs_list.pl_private = history;
 
 	procfs_list_install("zfs", spa_name(spa), "multihost", 0644,
 	    &history->procfs_list,
 	    spa_mmp_history_show, spa_mmp_history_show_header,
 	    spa_mmp_history_clear, offsetof(spa_mmp_history_t, smh_node));
 }
 
 /*
  * Tear down the MMP history list: uninstall the procfs entry first, then
  * free every remaining record, and finally destroy the list itself.
  * NOTE(review): the truncate call here is made without taking pl_lock;
  * presumably safe because the list has already been uninstalled -- confirm.
  */
 static void
 spa_mmp_history_destroy(spa_t *spa)
 {
 	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
 	procfs_list_uninstall(&shl->procfs_list);
 	spa_mmp_history_truncate(shl, 0);
 	procfs_list_destroy(&shl->procfs_list);
 }
 
 /*
  * Update an existing "skip" record: set its duration to the time elapsed
  * since the record's error_start and bump its skipped-write counter (kept
  * in vdev_guid for skip records).
  *
  * The search runs from the tail of the list, where new records are
  * inserted, so the lookup is normally O(1).
  *
  * Returns 0 if the record was found (or if history is disabled and empty),
  * ENOENT otherwise.
  */
 int
 spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
 {
 	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
 	spa_mmp_history_t *rec;
 	int rc = ENOENT;
 
 	if (zfs_multihost_history == 0 && shl->size == 0)
 		return (0);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	rec = list_tail(&shl->procfs_list.pl_list);
 	while (rec != NULL && rec->mmp_node_id != mmp_node_id)
 		rec = list_prev(&shl->procfs_list.pl_list, rec);
 
 	if (rec != NULL) {
 		ASSERT3U(rec->io_error, !=, 0);
 		rec->duration = gethrtime() - rec->error_start;
 		rec->vdev_guid++;
 		rc = 0;
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	return (rc);
 }
 
 /*
  * Store the completion status and duration of an MMP write in its existing
  * record.  The list is searched from the tail, as in
  * spa_mmp_history_set_skip().
  *
  * Returns 0 if the record was found (or if history is disabled and empty),
  * ENOENT otherwise.
  */
 int
 spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
     hrtime_t duration)
 {
 	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
 	spa_mmp_history_t *rec;
 	int rc = ENOENT;
 
 	if (zfs_multihost_history == 0 && shl->size == 0)
 		return (0);
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 
 	rec = list_tail(&shl->procfs_list.pl_list);
 	while (rec != NULL && rec->mmp_node_id != mmp_node_id)
 		rec = list_prev(&shl->procfs_list.pl_list, rec);
 
 	if (rec != NULL) {
 		ASSERT(rec->io_error == 0);
 		rec->io_error = io_error;
 		rec->duration = duration;
 		rc = 0;
 	}
 
 	mutex_exit(&shl->procfs_list.pl_lock);
 
 	return (rc);
 }
 
 /*
  * Append a new MMP history record and trim the list back to
  * zfs_multihost_history entries.
  *   error == 0 : a write was issued.
  *   error != 0 : a write was not issued because no leaves were found; the
  *                record becomes a "skip" record with its skip count
  *                (vdev_guid) starting at 1.
  * Does nothing when history is disabled and the list is already empty.
  */
 void
 spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
     uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
     int error)
 {
 	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
 	spa_mmp_history_t *rec;
 
 	if (zfs_multihost_history == 0 && shl->size == 0)
 		return;
 
 	rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
 	rec->mmp_node_id = mmp_node_id;
 	rec->txg = txg;
 	rec->timestamp = timestamp;
 	rec->mmp_delay = mmp_delay;
 	rec->vdev_label = label;
 
 	if (vd != NULL) {
 		rec->vdev_guid = vd->vdev_guid;
 		/* Keep a private copy; the record may outlive the vdev path. */
 		if (vd->vdev_path != NULL)
 			rec->vdev_path = kmem_strdup(vd->vdev_path);
 	}
 
 	if (error != 0) {
 		rec->io_error = error;
 		rec->error_start = gethrtime();
 		rec->vdev_guid = 1;
 	}
 
 	mutex_enter(&shl->procfs_list.pl_lock);
 	procfs_list_add(&shl->procfs_list, rec);
 	shl->size++;
 	spa_mmp_history_truncate(shl, zfs_multihost_history);
 	mutex_exit(&shl->procfs_list.pl_lock);
 }
 
 /*
  * Raw-kstat address callback for the "state" kstat: the only record (n == 0)
  * is the spa_t stored in ks_private; any other index yields NULL.
  */
 static void *
 spa_state_addr(kstat_t *ksp, loff_t n)
 {
 	return (n == 0 ? ksp->ks_private : NULL);
 }
 
 /*
  * Raw-kstat data callback for the "state" kstat: formats the pool's state
  * name followed by a newline into 'buf' (at most 'size' bytes).  'data' is
  * the spa_t handed out by spa_state_addr().  Always returns 0.
  */
 static int
 spa_state_data(char *buf, size_t size, void *data)
 {
 	spa_t *pool = data;
 
 	(void) snprintf(buf, size, "%s\n", spa_state_to_name(pool));
 
 	return (0);
 }
 
 /*
  * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
  *
  * This is a lock-less read of the pool's state (unlike using 'zpool', which
  * can potentially block for seconds).  Because it doesn't block, it can be
  * useful as a pool heartbeat value.
  *
  * The kstat is a virtual raw kstat without headers whose single record is
  * produced by spa_state_addr() / spa_state_data().
  */
 static void
 spa_state_init(spa_t *spa)
 {
 	spa_history_kstat_t *shk = &spa->spa_stats.state;
 	kstat_t *ksp;
 	char *module;
 
 	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
 
 	module = kmem_asprintf("zfs/%s", spa_name(spa));
 	ksp = kstat_create(module, 0, "state", "misc",
 	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 	shk->kstat = ksp;
 
 	if (ksp != NULL) {
 		ksp->ks_lock = &shk->lock;
 		ksp->ks_data = NULL;
 		ksp->ks_private = spa;
 		ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS;
 		kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr);
 		kstat_install(ksp);
 	}
 
 	kmem_strfree(module);
 }
 
 /*
  * Counterpart of spa_state_init(): delete the "state" kstat if it was
  * created and destroy its lock.
  */
 static void
 spa_health_destroy(spa_t *spa)
 {
 	spa_history_kstat_t *shk = &spa->spa_stats.state;
 
 	if (shk->kstat != NULL)
 		kstat_delete(shk->kstat);
 
 	mutex_destroy(&shk->lock);
 }
 
 /*
  * Pristine image of the per-pool "iostats" kstat: the name and data type of
  * every counter, with all values zero.  It is copied into the kstat's data
  * area when the kstat is created (spa_iostats_init) and again whenever the
  * kstat is written (spa_iostats_update), which resets the counters.
  * The initializers are positional, so their order must follow the field
  * order of spa_iostats_t.
  */
 static spa_iostats_t spa_iostats_template = {
 	{ "trim_extents_written",		KSTAT_DATA_UINT64 },
 	{ "trim_bytes_written",			KSTAT_DATA_UINT64 },
 	{ "trim_extents_skipped",		KSTAT_DATA_UINT64 },
 	{ "trim_bytes_skipped",			KSTAT_DATA_UINT64 },
 	{ "trim_extents_failed",		KSTAT_DATA_UINT64 },
 	{ "trim_bytes_failed",			KSTAT_DATA_UINT64 },
 	{ "autotrim_extents_written",		KSTAT_DATA_UINT64 },
 	{ "autotrim_bytes_written",		KSTAT_DATA_UINT64 },
 	{ "autotrim_extents_skipped",		KSTAT_DATA_UINT64 },
 	{ "autotrim_bytes_skipped",		KSTAT_DATA_UINT64 },
 	{ "autotrim_extents_failed",		KSTAT_DATA_UINT64 },
 	{ "autotrim_bytes_failed",		KSTAT_DATA_UINT64 },
 	{ "simple_trim_extents_written",	KSTAT_DATA_UINT64 },
 	{ "simple_trim_bytes_written",		KSTAT_DATA_UINT64 },
 	{ "simple_trim_extents_skipped",	KSTAT_DATA_UINT64 },
 	{ "simple_trim_bytes_skipped",		KSTAT_DATA_UINT64 },
 	{ "simple_trim_extents_failed",		KSTAT_DATA_UINT64 },
 	{ "simple_trim_bytes_failed",		KSTAT_DATA_UINT64 },
 };
 
 /*
  * Atomically add 'val' to counter 'stat' of the spa_iostats_t pointed to by
  * the local variable 'iostats'.  The macro expands to a single expression
  * with no trailing semicolon, so each use is exactly one statement.
  */
 #define	SPA_IOSTATS_ADD(stat, val) \
     atomic_add_64(&iostats->stat.value.ui64, (val))
 
 /*
  * Add the results of one TRIM pass to the pool's iostats kstat.
  * TRIM_TYPE_MANUAL updates the "trim_*" counters, TRIM_TYPE_AUTO the
  * "autotrim_*" counters, and any other type the "simple_trim_*" counters.
  * Does nothing if the iostats kstat was never created.
  */
 void
 spa_iostats_trim_add(spa_t *spa, trim_type_t type,
     uint64_t extents_written, uint64_t bytes_written,
     uint64_t extents_skipped, uint64_t bytes_skipped,
     uint64_t extents_failed, uint64_t bytes_failed)
 {
 	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
 	kstat_t *ksp = shk->kstat;
 	spa_iostats_t *iostats;
 
 	if (ksp == NULL)
 		return;
 
 	iostats = ksp->ks_data;
 	if (type == TRIM_TYPE_MANUAL) {
 		SPA_IOSTATS_ADD(trim_extents_written, extents_written);
 		SPA_IOSTATS_ADD(trim_bytes_written, bytes_written);
 		SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped);
 		SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped);
 		SPA_IOSTATS_ADD(trim_extents_failed, extents_failed);
 		SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed);
 	} else if (type == TRIM_TYPE_AUTO) {
 		SPA_IOSTATS_ADD(autotrim_extents_written, extents_written);
 		SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written);
 		SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped);
 		SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped);
 		SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed);
 		SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed);
 	} else {
 		SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written);
 		SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written);
 		SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped);
 		SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped);
 		SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed);
 		SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed);
 	}
 }
 
 /*
  * kstat update callback for "iostats": a write resets every counter by
  * copying the zero-valued template over the data area; reads leave the data
  * untouched.  Always returns 0.
  */
 static int
 spa_iostats_update(kstat_t *ksp, int rw)
 {
 	if (rw != KSTAT_WRITE)
 		return (0);
 
 	memcpy(ksp->ks_data, &spa_iostats_template, sizeof (spa_iostats_t));
 
 	return (0);
 }
 
 /*
  * Create and install the virtual named kstat "iostats" under module
  * "zfs/<pool name>".  Its data area is a heap copy of
  * spa_iostats_template.  If kstat_create() fails, no data area is
  * allocated and shk->kstat stays NULL.
  */
 static void
 spa_iostats_init(spa_t *spa)
 {
 	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
 	kstat_t *ksp;
 	char *module;
 
 	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
 
 	module = kmem_asprintf("zfs/%s", spa_name(spa));
 	ksp = kstat_create(module, 0, "iostats", "misc", KSTAT_TYPE_NAMED,
 	    sizeof (spa_iostats_t) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	shk->kstat = ksp;
 
 	if (ksp != NULL) {
 		ksp->ks_lock = &shk->lock;
 		ksp->ks_private = spa;
 		ksp->ks_update = spa_iostats_update;
 		ksp->ks_data = kmem_alloc(sizeof (spa_iostats_t), KM_SLEEP);
 		memcpy(ksp->ks_data, &spa_iostats_template,
 		    sizeof (spa_iostats_t));
 		kstat_install(ksp);
 	}
 
 	kmem_strfree(module);
 }
 
 /*
  * Tear down the "iostats" kstat created by spa_iostats_init().
  *
  * The kstat is deleted before its data area is freed, matching
  * spa_tx_assign_destroy(); freeing the data first would leave an installed
  * kstat pointing at freed memory until kstat_delete() ran.
  */
 static void
 spa_iostats_destroy(spa_t *spa)
 {
 	spa_history_kstat_t *shk = &spa->spa_stats.iostats;
 	kstat_t *ksp = shk->kstat;
 
 	if (ksp != NULL) {
 		/*
 		 * Remember the data pointer first: ksp is presumed to be
 		 * invalid once kstat_delete() returns.
 		 */
 		void *data = ksp->ks_data;
 
 		kstat_delete(ksp);
 		kmem_free(data, sizeof (spa_iostats_t));
 	}
 
 	mutex_destroy(&shk->lock);
 }
 
 /*
  * Set up all per-pool statistics for 'spa' by calling each facility's
  * init routine in turn: read history, txg history, tx-assign histogram,
  * MMP history, pool state and iostats.
  */
 void
 spa_stats_init(spa_t *spa)
 {
 	spa_read_history_init(spa);
 	spa_txg_history_init(spa);
 	spa_tx_assign_init(spa);
-	spa_io_history_init(spa);
 	spa_mmp_history_init(spa);
 	spa_state_init(spa);
 	spa_iostats_init(spa);
 }
 
 /*
  * Tear down everything set up by spa_stats_init().  Note that the pool
  * state kstat is destroyed by spa_health_destroy().
  */
 void
 spa_stats_destroy(spa_t *spa)
 {
 	spa_iostats_destroy(spa);
 	spa_health_destroy(spa);
 	spa_tx_assign_destroy(spa);
 	spa_txg_history_destroy(spa);
 	spa_read_history_destroy(spa);
-	spa_io_history_destroy(spa);
 	spa_mmp_history_destroy(spa);
 }
 
 /*
  * Module parameters for the history facilities in this file.  Each is
  * declared read-write (ZMOD_RW); the description strings give the meaning.
  * NOTE(review): presumably these map to zfs_read_history,
  * zfs_read_history_hits, zfs_txg_history and zfs_multihost_history --
  * confirm against the ZFS_MODULE_PARAM definition.
  */
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs, zfs_, read_history, INT, ZMOD_RW,
     "Historical statistics for the last N reads");
 
 ZFS_MODULE_PARAM(zfs, zfs_, read_history_hits, INT, ZMOD_RW,
     "Include cache hits in read history");
 
 ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, history, INT, ZMOD_RW,
     "Historical statistics for the last N txgs");
 
 ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, history, INT, ZMOD_RW,
     "Historical statistics for last N multihost writes");
 /* END CSTYLED */
diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c
index 25a4bc69cc23..198861edb816 100644
--- a/module/zfs/vdev_queue.c
+++ b/module/zfs/vdev_queue.c
@@ -1,1164 +1,1117 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
 /*
  * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/vdev_impl.h>
 #include <sys/spa_impl.h>
 #include <sys/zio.h>
 #include <sys/avl.h>
 #include <sys/dsl_pool.h>
 #include <sys/metaslab_impl.h>
 #include <sys/spa.h>
-#include <sys/spa_impl.h>
-#include <sys/kstat.h>
 #include <sys/abd.h>
 
 /*
  * ZFS I/O Scheduler
  * ---------------
  *
  * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios.  The
  * I/O scheduler determines when and in what order those operations are
  * issued.  The I/O scheduler divides operations into five I/O classes
  * prioritized in the following order: sync read, sync write, async read,
  * async write, and scrub/resilver.  Each queue defines the minimum and
  * maximum number of concurrent operations that may be issued to the device.
  * In addition, the device has an aggregate maximum. Note that the sum of the
  * per-queue minimums must not exceed the aggregate maximum. If the
  * sum of the per-queue maximums exceeds the aggregate maximum, then the
  * number of active i/os may reach zfs_vdev_max_active, in which case no
  * further i/os will be issued regardless of whether all per-queue
  * minimums have been met.
  *
  * For many physical devices, throughput increases with the number of
  * concurrent operations, but latency typically suffers. Further, physical
  * devices typically have a limit at which more concurrent operations have no
  * effect on throughput or can actually cause it to decrease.
  *
  * The scheduler selects the next operation to issue by first looking for an
  * I/O class whose minimum has not been satisfied. Once all are satisfied and
  * the aggregate maximum has not been hit, the scheduler looks for classes
  * whose maximum has not been satisfied. Iteration through the I/O classes is
  * done in the order specified above. No further operations are issued if the
  * aggregate maximum number of concurrent operations has been hit or if there
  * are no operations queued for an I/O class that has not hit its maximum.
  * Every time an i/o is queued or an operation completes, the I/O scheduler
  * looks for new operations to issue.
  *
  * All I/O classes have a fixed maximum number of outstanding operations
  * except for the async write class. Asynchronous writes represent the data
  * that is committed to stable storage during the syncing stage for
  * transaction groups (see txg.c). Transaction groups enter the syncing state
  * periodically so the number of queued async writes will quickly burst up and
  * then bleed down to zero. Rather than servicing them as quickly as possible,
  * the I/O scheduler changes the maximum number of active async write i/os
  * according to the amount of dirty data in the pool (see dsl_pool.c). Since
  * both throughput and latency typically increase with the number of
  * concurrent operations issued to physical devices, reducing the burstiness
  * in the number of concurrent operations also stabilizes the response time of
  * operations from other -- and in particular synchronous -- queues. In broad
  * strokes, the I/O scheduler will issue more concurrent operations from the
  * async write queue as there's more dirty data in the pool.
  *
  * Async Writes
  *
  * The number of concurrent operations issued for the async write I/O class
  * follows a piece-wise linear function defined by a few adjustable points.
  *
  *        |                   o---------| <-- zfs_vdev_async_write_max_active
  *   ^    |                  /^         |
  *   |    |                 / |         |
  * active |                /  |         |
  *  I/O   |               /   |         |
  * count  |              /    |         |
  *        |             /     |         |
  *        |------------o      |         | <-- zfs_vdev_async_write_min_active
  *       0|____________^______|_________|
  *        0%           |      |       100% of zfs_dirty_data_max
  *                     |      |
  *                     |      `-- zfs_vdev_async_write_active_max_dirty_percent
  *                     `--------- zfs_vdev_async_write_active_min_dirty_percent
  *
  * Until the amount of dirty data exceeds a minimum percentage of the dirty
  * data allowed in the pool, the I/O scheduler will limit the number of
  * concurrent operations to the minimum. As that threshold is crossed, the
  * number of concurrent operations issued increases linearly to the maximum at
  * the specified maximum percentage of the dirty data allowed in the pool.
  *
  * Ideally, the amount of dirty data on a busy pool will stay in the sloped
  * part of the function between zfs_vdev_async_write_active_min_dirty_percent
  * and zfs_vdev_async_write_active_max_dirty_percent. If it exceeds the
  * maximum percentage, this indicates that the rate of incoming data is
  * greater than the rate that the backend storage can handle. In this case, we
  * must further throttle incoming writes (see dmu_tx_delay() for details).
  */
 
 /*
  * The maximum number of i/os active to each device.  Ideally, this will be >=
  * the sum of each queue's max_active.
  */
 uint32_t zfs_vdev_max_active = 1000;
 
 /*
  * Per-queue limits on the number of i/os active to each device.  If the
  * number of active i/os is < zfs_vdev_max_active, then the min_active comes
  * into play.  We will send min_active from each queue round-robin, and then
  * send from queues in the order defined by zio_priority_t up to max_active.
  * Some queues have additional mechanisms to limit number of active I/Os in
  * addition to min_active and max_active, see below.
  *
  * In general, smaller max_active's will lead to lower latency of synchronous
  * operations.  Larger max_active's may lead to higher overall throughput,
  * depending on underlying storage.
  *
  * The ratio of the queues' max_actives determines the balance of performance
  * between reads, writes, and scrubs.  E.g., increasing
  * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete
  * more quickly, but reads and writes to have higher latency and lower
  * throughput.
  */
 uint32_t zfs_vdev_sync_read_min_active = 10;
 uint32_t zfs_vdev_sync_read_max_active = 10;
 uint32_t zfs_vdev_sync_write_min_active = 10;
 uint32_t zfs_vdev_sync_write_max_active = 10;
 uint32_t zfs_vdev_async_read_min_active = 1;
 uint32_t zfs_vdev_async_read_max_active = 3;
 uint32_t zfs_vdev_async_write_min_active = 2;
 uint32_t zfs_vdev_async_write_max_active = 10;
 uint32_t zfs_vdev_scrub_min_active = 1;
 uint32_t zfs_vdev_scrub_max_active = 3;
 uint32_t zfs_vdev_removal_min_active = 1;
 uint32_t zfs_vdev_removal_max_active = 2;
 uint32_t zfs_vdev_initializing_min_active = 1;
 uint32_t zfs_vdev_initializing_max_active = 1;
 uint32_t zfs_vdev_trim_min_active = 1;
 uint32_t zfs_vdev_trim_max_active = 2;
 uint32_t zfs_vdev_rebuild_min_active = 1;
 uint32_t zfs_vdev_rebuild_max_active = 3;
 
 /*
  * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent
  * dirty data, use zfs_vdev_async_write_min_active.  When it has more than
  * zfs_vdev_async_write_active_max_dirty_percent, use
  * zfs_vdev_async_write_max_active. The value is linearly interpolated
  * between min and max.
  */
 int zfs_vdev_async_write_active_min_dirty_percent = 30;
 int zfs_vdev_async_write_active_max_dirty_percent = 60;
 
 /*
  * For non-interactive I/O (scrub, resilver, removal, initialize and rebuild),
  * the number of concurrently-active I/O's is limited to *_min_active, unless
  * the vdev is "idle".  When there are no interactive I/Os active (sync or
  * async), and zfs_vdev_nia_delay I/Os have completed since the last
  * interactive I/O, then the vdev is considered to be "idle", and the number
  * of concurrently-active non-interactive I/O's is increased to *_max_active.
  */
 uint_t zfs_vdev_nia_delay = 5;
 
 /*
  * Some HDDs tend to prioritize sequential I/O so high that concurrent
  * random I/O latency reaches several seconds.  On some HDDs it happens
  * even if sequential I/Os are submitted one at a time, and so setting
  * *_max_active to 1 does not help.  To prevent non-interactive I/Os, like
  * scrub, from monopolizing the device no more than zfs_vdev_nia_credit
  * I/Os can be sent while there are outstanding incomplete interactive
  * I/Os.  This enforced wait ensures the HDD services the interactive I/O
  * within a reasonable amount of time.
  */
 uint_t zfs_vdev_nia_credit = 5;
 
 /*
  * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O.
  * For read I/Os, we also aggregate across small adjacency gaps; for writes
  * we include spans of optional I/Os to aid aggregation at the disk even when
  * they aren't able to help us aggregate at this level.
  */
 int zfs_vdev_aggregation_limit = 1 << 20;
 int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE;
 int zfs_vdev_read_gap_limit = 32 << 10;
 int zfs_vdev_write_gap_limit = 4 << 10;
 
 /*
  * Define the queue depth percentage for each top-level. This percentage is
  * used in conjunction with zfs_vdev_async_write_max_active to determine how many
  * allocations a specific top-level vdev should handle. Once the queue depth
  * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100
  * then allocator will stop allocating blocks on that top-level device.
  * The default kernel setting is 1000% which will yield 100 allocations per
  * device. For userland testing, the default setting is 300% which equates
  * to 30 allocations per device.
  */
 #ifdef _KERNEL
 int zfs_vdev_queue_depth_pct = 1000;
 #else
 int zfs_vdev_queue_depth_pct = 300;
 #endif
 
 /*
  * When performing allocations for a given metaslab, we want to make sure that
  * there are enough IOs to aggregate together to improve throughput. We want to
  * ensure that there are at least 128k worth of IOs that can be aggregated, and
  * we assume that the average allocation size is 4k, so we need the queue depth
  * to be 32 per allocator to get good aggregation of sequential writes.
  */
 int zfs_vdev_def_queue_depth = 32;
 
 /*
  * Allow TRIM I/Os to be aggregated.  This should normally not be needed since
  * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted
  * by the TRIM code in zfs_trim.c.
  */
 int zfs_vdev_aggregate_trim = 0;
 
 /*
  * Tree comparator ordering zios by io_offset; zios at the same offset are
  * ordered by their addresses so that no two distinct zios compare equal.
  */
 static int
 vdev_queue_offset_compare(const void *x1, const void *x2)
 {
 	const zio_t *lhs = x1;
 	const zio_t *rhs = x2;
 	int by_offset = TREE_CMP(lhs->io_offset, rhs->io_offset);
 
 	return (likely(by_offset) ? by_offset : TREE_PCMP(lhs, rhs));
 }
 
 /*
  * Return the tree of queued (not yet issued) zios for priority class 'p'
  * of queue 'vq'.  'p' is used directly as an index into vq_class[].
  */
 static inline avl_tree_t *
 vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
 {
 	return (&vq->vq_class[p].vqc_queued_tree);
 }
 
 /*
  * Return the per-type offset tree of 'vq' for zio type 't', which must be
  * READ, WRITE or TRIM (asserted).
  */
 static inline avl_tree_t *
 vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
 {
 	ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM);
 
 	if (t == ZIO_TYPE_READ)
 		return (&vq->vq_read_offset_tree);
 	if (t == ZIO_TYPE_WRITE)
 		return (&vq->vq_write_offset_tree);
 
 	return (&vq->vq_trim_offset_tree);
 }
 
 /*
  * Tree comparator ordering zios by io_timestamp; zios with the same
  * timestamp are ordered by their addresses so that no two distinct zios
  * compare equal.
  */
 static int
 vdev_queue_timestamp_compare(const void *x1, const void *x2)
 {
 	const zio_t *lhs = x1;
 	const zio_t *rhs = x2;
 	int by_time = TREE_CMP(lhs->io_timestamp, rhs->io_timestamp);
 
 	return (likely(by_time) ? by_time : TREE_PCMP(lhs, rhs));
 }
 
 /*
  * Return the minimum number of concurrently active i/os for class 'p'.
  * Sync/async reads and writes and TRIM use their *_min_active tunable
  * directly.  For scrub, removal, initializing and rebuild the tunable is
  * additionally capped by vq_nia_credit while any interactive i/o is
  * active (vq_ia_active != 0).  Panics on an unknown priority.
  */
 static int
 vdev_queue_class_min_active(vdev_queue_t *vq, zio_priority_t p)
 {
 	uint32_t floor_active;
 
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
 		return (zfs_vdev_sync_read_min_active);
 	case ZIO_PRIORITY_SYNC_WRITE:
 		return (zfs_vdev_sync_write_min_active);
 	case ZIO_PRIORITY_ASYNC_READ:
 		return (zfs_vdev_async_read_min_active);
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		return (zfs_vdev_async_write_min_active);
 	case ZIO_PRIORITY_TRIM:
 		return (zfs_vdev_trim_min_active);
 	/* The remaining classes are subject to the credit cap below. */
 	case ZIO_PRIORITY_SCRUB:
 		floor_active = zfs_vdev_scrub_min_active;
 		break;
 	case ZIO_PRIORITY_REMOVAL:
 		floor_active = zfs_vdev_removal_min_active;
 		break;
 	case ZIO_PRIORITY_INITIALIZING:
 		floor_active = zfs_vdev_initializing_min_active;
 		break;
 	case ZIO_PRIORITY_REBUILD:
 		floor_active = zfs_vdev_rebuild_min_active;
 		break;
 	default:
 		panic("invalid priority %u", p);
 		return (0);
 	}
 
 	if (vq->vq_ia_active == 0)
 		return (floor_active);
 
 	return (MIN(vq->vq_nia_credit, floor_active));
 }
 
 /*
  * Return the current limit on concurrently active async writes.  The limit
  * is zfs_vdev_async_write_min_active while the pool's dirty data is below
  * the min dirty percentage of zfs_dirty_data_max, the max_active tunable
  * above the max dirty percentage (or whenever a sync task is pending or
  * the dsl_pool is not yet set up), and is linearly interpolated in between.
  */
 static int
 vdev_queue_max_async_writes(spa_t *spa)
 {
 	int writes;
 	uint64_t dirty = 0;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	uint64_t min_bytes = zfs_dirty_data_max *
 	    zfs_vdev_async_write_active_min_dirty_percent / 100;
 	uint64_t max_bytes = zfs_dirty_data_max *
 	    zfs_vdev_async_write_active_max_dirty_percent / 100;
 
 	/*
 	 * Async writes may occur before the assignment of the spa's
 	 * dsl_pool_t if a self-healing zio is issued prior to the
 	 * completion of dmu_objset_open_impl().
 	 */
 	if (dp == NULL)
 		return (zfs_vdev_async_write_max_active);
 
 	/*
 	 * Sync tasks correspond to interactive user actions. To reduce the
 	 * execution time of those actions we push data out as fast as possible.
 	 */
 	dirty = dp->dp_dirty_total;
 	if (dirty > max_bytes || spa_has_pending_synctask(spa))
 		return (zfs_vdev_async_write_max_active);
 
 	if (dirty < min_bytes)
 		return (zfs_vdev_async_write_min_active);
 
 	/*
 	 * If both dirty thresholds coincide (the two percentage tunables
 	 * were set to the same value) there is no slope to interpolate
 	 * along and the division below would be by zero.  At this point
 	 * dirty equals that single threshold, so use the maximum.
 	 */
 	if (max_bytes == min_bytes)
 		return (zfs_vdev_async_write_max_active);
 
 	/*
 	 * linear interpolation:
 	 * slope = (max_writes - min_writes) / (max_bytes - min_bytes)
 	 * move right by min_bytes
 	 * move up by min_writes
 	 */
 	writes = (dirty - min_bytes) *
 	    (zfs_vdev_async_write_max_active -
 	    zfs_vdev_async_write_min_active) /
 	    (max_bytes - min_bytes) +
 	    zfs_vdev_async_write_min_active;
 	ASSERT3U(writes, >=, zfs_vdev_async_write_min_active);
 	ASSERT3U(writes, <=, zfs_vdev_async_write_max_active);
 	return (writes);
 }
 
 /*
  * Return the maximum number of concurrently active i/os for class 'p'.
  * Sync/async reads, sync writes and TRIM use their *_max_active tunable;
  * async writes use vdev_queue_max_async_writes().  For scrub, removal,
  * initializing and rebuild:
  *   - while interactive i/o is active: MIN(vq_nia_credit, *_min_active);
  *   - else while vq_nia_credit < zfs_vdev_nia_delay: MAX(1, *_min_active);
  *   - otherwise: *_max_active.
  * Panics on an unknown priority.
  */
 static int
 vdev_queue_class_max_active(spa_t *spa, vdev_queue_t *vq, zio_priority_t p)
 {
 	uint32_t floor_active, ceil_active;
 
 	switch (p) {
 	case ZIO_PRIORITY_SYNC_READ:
 		return (zfs_vdev_sync_read_max_active);
 	case ZIO_PRIORITY_SYNC_WRITE:
 		return (zfs_vdev_sync_write_max_active);
 	case ZIO_PRIORITY_ASYNC_READ:
 		return (zfs_vdev_async_read_max_active);
 	case ZIO_PRIORITY_ASYNC_WRITE:
 		return (vdev_queue_max_async_writes(spa));
 	case ZIO_PRIORITY_TRIM:
 		return (zfs_vdev_trim_max_active);
 	/* The remaining classes share the three-tier rule below. */
 	case ZIO_PRIORITY_SCRUB:
 		floor_active = zfs_vdev_scrub_min_active;
 		ceil_active = zfs_vdev_scrub_max_active;
 		break;
 	case ZIO_PRIORITY_REMOVAL:
 		floor_active = zfs_vdev_removal_min_active;
 		ceil_active = zfs_vdev_removal_max_active;
 		break;
 	case ZIO_PRIORITY_INITIALIZING:
 		floor_active = zfs_vdev_initializing_min_active;
 		ceil_active = zfs_vdev_initializing_max_active;
 		break;
 	case ZIO_PRIORITY_REBUILD:
 		floor_active = zfs_vdev_rebuild_min_active;
 		ceil_active = zfs_vdev_rebuild_max_active;
 		break;
 	default:
 		panic("invalid priority %u", p);
 		return (0);
 	}
 
 	if (vq->vq_ia_active > 0)
 		return (MIN(vq->vq_nia_credit, floor_active));
 	if (vq->vq_nia_credit < zfs_vdev_nia_delay)
 		return (MAX(1, floor_active));
 
 	return (ceil_active);
 }
 
 /*
  * Choose the priority class from which the next i/o should be issued.
  *
  * Returns ZIO_PRIORITY_NUM_QUEUEABLE when no class is eligible: either the
  * active tree already holds zfs_vdev_max_active i/os, or every class that
  * has queued i/os is at its maximum.  When a class is chosen, vq_last_prio
  * is updated to it.
  */
 static zio_priority_t
 vdev_queue_class_to_issue(vdev_queue_t *vq)
 {
 	spa_t *spa = vq->vq_vdev->vdev_spa;
 	zio_priority_t prio, step;
 
 	if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active)
 		return (ZIO_PRIORITY_NUM_QUEUEABLE);
 
 	/*
 	 * First pass: serve a class that is still below its minimum number
 	 * of outstanding i/os.  The scan starts just after the class chosen
 	 * last time (round-robin) to reduce starvation due to
 	 * zfs_vdev_max_active and vq_nia_credit limits.
 	 */
 	for (step = 0; step < ZIO_PRIORITY_NUM_QUEUEABLE; step++) {
 		prio = (vq->vq_last_prio + step + 1) %
 		    ZIO_PRIORITY_NUM_QUEUEABLE;
 
 		if (avl_numnodes(vdev_queue_class_tree(vq, prio)) == 0)
 			continue;
 		if (vq->vq_class[prio].vqc_active >=
 		    vdev_queue_class_min_active(vq, prio))
 			continue;
 
 		vq->vq_last_prio = prio;
 		return (prio);
 	}
 
 	/*
 	 * Second pass: no class is below its minimum, so take the first
 	 * class (in priority order) that is still below its maximum.
 	 */
 	for (prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE; prio++) {
 		if (avl_numnodes(vdev_queue_class_tree(vq, prio)) == 0)
 			continue;
 		if (vq->vq_class[prio].vqc_active >=
 		    vdev_queue_class_max_active(spa, vq, prio))
 			continue;
 
 		vq->vq_last_prio = prio;
 		return (prio);
 	}
 
 	/* Nothing queued is eligible to be issued. */
 	return (ZIO_PRIORITY_NUM_QUEUEABLE);
 }
 
 /*
  * Initialize the i/o queue embedded in vd: its lock, the search zio's
  * taskq entry, the active tree, one offset-ordered tree per i/o type
  * (read, write, trim) and one tree per queueable priority class.
  */
 void
 vdev_queue_init(vdev_t *vd)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL);
 	vq->vq_vdev = vd;
 	vq->vq_last_offset = 0;
 	taskq_init_ent(&vq->vq_io_search.io_tqent);
 
 	/* Active i/os, linked through io_queue_node. */
 	avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
 	    sizeof (zio_t), offsetof(struct zio, io_queue_node));
 
 	/* Per-type trees, linked through io_offset_node. */
 	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
 	    vdev_queue_offset_compare, sizeof (zio_t),
 	    offsetof(struct zio, io_offset_node));
 	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
 	    vdev_queue_offset_compare, sizeof (zio_t),
 	    offsetof(struct zio, io_offset_node));
 	avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM),
 	    vdev_queue_offset_compare, sizeof (zio_t),
 	    offsetof(struct zio, io_offset_node));
 
 	for (zio_priority_t prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE;
 	    prio++) {
 		int (*cmp) (const void *, const void *);
 
 		/*
 		 * The synchronous and trim classes are kept in FIFO
 		 * (timestamp) order rather than LBA order, which gives
 		 * these i/os more consistent latency.
 		 */
 		switch (prio) {
 		case ZIO_PRIORITY_SYNC_READ:
 		case ZIO_PRIORITY_SYNC_WRITE:
 		case ZIO_PRIORITY_TRIM:
 			cmp = vdev_queue_timestamp_compare;
 			break;
 		default:
 			cmp = vdev_queue_offset_compare;
 			break;
 		}
 
 		avl_create(vdev_queue_class_tree(vq, prio), cmp,
 		    sizeof (zio_t), offsetof(struct zio, io_queue_node));
 	}
 }
 
 /*
  * Tear down the queue embedded in vd: destroy every per-class tree, the
  * active tree and the per-type trees, then the queue lock.
  */
 void
 vdev_queue_fini(vdev_t *vd)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 	zio_priority_t prio;
 
 	for (prio = 0; prio < ZIO_PRIORITY_NUM_QUEUEABLE; prio++) {
 		avl_tree_t *class_tree = vdev_queue_class_tree(vq, prio);
 
 		avl_destroy(class_tree);
 	}
 
 	avl_destroy(&vq->vq_active_tree);
 
 	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
 	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));
 	avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM));
 
 	mutex_destroy(&vq->vq_lock);
 }
 
 /*
  * Insert zio into the queued trees of vq: the tree for its priority class
  * and the tree for its i/o type.  The priority must be a queueable one.
  */
 static void
 vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
-	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
-
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
 	avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);
-
-	if (shk->kstat != NULL) {
-		mutex_enter(&shk->lock);
-		kstat_waitq_enter(shk->kstat->ks_data);
-		mutex_exit(&shk->lock);
-	}
 }
 
 /*
  * Remove zio from the queued trees of vq: the tree for its priority class
  * and the tree for its i/o type.  The priority must be a queueable one.
  */
 static void
 vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
-	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
-
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
 	avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);
-
-	if (shk->kstat != NULL) {
-		mutex_enter(&shk->lock);
-		kstat_waitq_exit(shk->kstat->ks_data);
-		mutex_exit(&shk->lock);
-	}
 }
 
 /*
  * Classify a priority class: scrub, removal, initializing and rebuild
  * i/os are non-interactive (B_FALSE); every other priority is treated as
  * interactive (B_TRUE).
  */
 static boolean_t
 vdev_queue_is_interactive(zio_priority_t p)
 {
 	if (p == ZIO_PRIORITY_SCRUB ||
 	    p == ZIO_PRIORITY_REMOVAL ||
 	    p == ZIO_PRIORITY_INITIALIZING ||
 	    p == ZIO_PRIORITY_REBUILD)
 		return (B_FALSE);
 
 	return (B_TRUE);
 }
 
 /*
  * Account for zio becoming active on vq and insert it into the active
  * tree.  Must be called with vq_lock held.
  */
 static void
 vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
-	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
-
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active++;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		/* First interactive i/o to become active: credit starts at 1. */
 		if (++vq->vq_ia_active == 1)
 			vq->vq_nia_credit = 1;
 	} else if (vq->vq_ia_active > 0) {
 		/*
 		 * A non-interactive i/o issued while interactive i/os are
 		 * active consumes one credit.
 		 */
 		vq->vq_nia_credit--;
 	}
 	avl_add(&vq->vq_active_tree, zio);
-
-	if (shk->kstat != NULL) {
-		mutex_enter(&shk->lock);
-		kstat_runq_enter(shk->kstat->ks_data);
-		mutex_exit(&shk->lock);
-	}
 }
 
 /*
  * Account for zio no longer being active on vq and remove it from the
  * active tree.  Must be called with vq_lock held.
  */
 static void
 vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio)
 {
-	spa_t *spa = zio->io_spa;
-	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
-
 	ASSERT(MUTEX_HELD(&vq->vq_lock));
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	vq->vq_class[zio->io_priority].vqc_active--;
 	if (vdev_queue_is_interactive(zio->io_priority)) {
 		/*
 		 * When the last interactive i/o leaves, the credit is reset
 		 * to 0; while others remain it is set to zfs_vdev_nia_credit.
 		 */
 		if (--vq->vq_ia_active == 0)
 			vq->vq_nia_credit = 0;
 		else
 			vq->vq_nia_credit = zfs_vdev_nia_credit;
 	} else if (vq->vq_ia_active == 0)
 		/* Non-interactive completion with no interactive i/o active. */
 		vq->vq_nia_credit++;
 	avl_remove(&vq->vq_active_tree, zio);
-
-	if (shk->kstat != NULL) {
-		kstat_io_t *ksio = shk->kstat->ks_data;
-
-		mutex_enter(&shk->lock);
-		kstat_runq_exit(ksio);
-		if (zio->io_type == ZIO_TYPE_READ) {
-			ksio->reads++;
-			ksio->nread += zio->io_size;
-		} else if (zio->io_type == ZIO_TYPE_WRITE) {
-			ksio->writes++;
-			ksio->nwritten += zio->io_size;
-		}
-		mutex_exit(&shk->lock);
-	}
 }
 
 /*
  * Completion callback installed on aggregate i/os by
  * vdev_queue_aggregate(): releases the ABD attached to the aggregate.
  */
 static void
 vdev_queue_agg_io_done(zio_t *aio)
 {
 	abd_t *agg_abd = aio->io_abd;
 
 	abd_free(agg_abd);
 }
 
 /*
  * Compute the range spanned by two i/os, which is the endpoint of the last
  * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset).
  * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio);
  * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0.
  *
  * Expanded, IO_GAP(fio, lio) is
  * lio->io_offset - (fio->io_offset + fio->io_size), i.e. the distance from
  * the end of fio to the start of lio.  Both macros evaluate their
  * arguments more than once, so the arguments must be free of side effects.
  */
 #define	IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
 #define	IO_GAP(fio, lio) (-IO_SPAN(lio, fio))
 
 /*
  * Sufficiently adjacent io_offset's in ZIOs will be aggregated. We do this
  * by creating a gang ABD from the adjacent ZIOs io_abd's. By using
  * a gang ABD we avoid doing memory copies to and from the parent,
  * child ZIOs. The gang ABD also accounts for gaps between adjacent
  * io_offsets by simply getting the zero ABD for writes or allocating
  * a new ABD for reads and placing them in the gang ABD as well.
  *
  * Returns the aggregate zio, or NULL when nothing is aggregated: the
  * i/o is flagged ZIO_FLAG_DONT_AGGREGATE, the effective limit is 0, TRIM
  * aggregation is disabled, no neighbor qualifies, or the gang ABD could
  * not be allocated.  When an aggregate is returned, every constituent i/o
  * (including zio itself) has been removed from the queue, and vq_lock has
  * been dropped and re-taken while the constituents were executed, so the
  * caller must hold vq_lock on entry and must not assume queue state is
  * unchanged across the call.
  */
 static zio_t *
 vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
 {
 	zio_t *first, *last, *aio, *dio, *mandatory, *nio;
 	zio_link_t *zl = NULL;
 	uint64_t maxgap = 0;
 	uint64_t size;
 	uint64_t limit;
 	int maxblocksize;
 	boolean_t stretch = B_FALSE;
 	avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
 	enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;
 	uint64_t next_offset;
 	abd_t *abd;
 
 	/* The aggregation limit is capped at the pool's maximum block size. */
 	maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa);
 	if (vq->vq_vdev->vdev_nonrot)
 		limit = zfs_vdev_aggregation_limit_non_rotating;
 	else
 		limit = zfs_vdev_aggregation_limit;
 	limit = MAX(MIN(limit, maxblocksize), 0);
 
 	if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0)
 		return (NULL);
 
 	/*
 	 * While TRIM commands could be aggregated based on offset this
 	 * behavior is disabled until it's determined to be beneficial.
 	 */
 	if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim)
 		return (NULL);
 
 	/*
 	 * I/Os to distributed spares are directly dispatched to the dRAID
 	 * leaf vdevs for aggregation.  See the comment at the end of the
 	 * zio_vdev_io_start() function.
 	 */
 	ASSERT(vq->vq_vdev->vdev_ops != &vdev_draid_spare_ops);
 
 	first = last = zio;
 
 	/* Only reads may bridge a gap here; maxgap stays 0 for other types. */
 	if (zio->io_type == ZIO_TYPE_READ)
 		maxgap = zfs_vdev_read_gap_limit;
 
 	/*
 	 * We can aggregate I/Os that are sufficiently adjacent and of
 	 * the same flavor, as expressed by the AGG_INHERIT flags.
 	 * The latter requirement is necessary so that certain
 	 * attributes of the I/O, such as whether it's a normal I/O
 	 * or a scrub/resilver, can be preserved in the aggregate.
 	 * We can include optional I/Os, but don't allow them
 	 * to begin a range as they add no benefit in that situation.
 	 */
 
 	/*
 	 * We keep track of the last non-optional I/O.
 	 */
 	mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first;
 
 	/*
 	 * Walk backwards through sufficiently contiguous I/Os
 	 * recording the last non-optional I/O.
 	 */
 	while ((dio = AVL_PREV(t, first)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    IO_SPAN(dio, last) <= limit &&
 	    IO_GAP(dio, first) <= maxgap &&
 	    dio->io_type == zio->io_type) {
 		first = dio;
 		if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL))
 			mandatory = first;
 	}
 
 	/*
 	 * Skip any initial optional I/Os.
 	 */
 	while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) {
 		first = AVL_NEXT(t, first);
 		ASSERT(first != NULL);
 	}
 
 
 	/*
 	 * Walk forward through sufficiently contiguous I/Os.
 	 * The aggregation limit does not apply to optional i/os, so that
 	 * we can issue contiguous writes even if they are larger than the
 	 * aggregation limit.
 	 */
 	while ((dio = AVL_NEXT(t, last)) != NULL &&
 	    (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags &&
 	    (IO_SPAN(first, dio) <= limit ||
 	    (dio->io_flags & ZIO_FLAG_OPTIONAL)) &&
 	    IO_SPAN(first, dio) <= maxblocksize &&
 	    IO_GAP(last, dio) <= maxgap &&
 	    dio->io_type == zio->io_type) {
 		last = dio;
 		if (!(last->io_flags & ZIO_FLAG_OPTIONAL))
 			mandatory = last;
 	}
 
 	/*
 	 * Now that we've established the range of the I/O aggregation
 	 * we must decide what to do with trailing optional I/Os.
 	 * For reads, there's nothing to do. While we are unable to
 	 * aggregate further, it's possible that a trailing optional
 	 * I/O would allow the underlying device to aggregate with
 	 * subsequent I/Os. We must therefore determine if the next
 	 * non-optional I/O is close enough to make aggregation
 	 * worthwhile.
 	 */
 	if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) {
 		/* NOTE(review): this nio shadows the function-scope nio. */
 		zio_t *nio = last;
 		while ((dio = AVL_NEXT(t, nio)) != NULL &&
 		    IO_GAP(nio, dio) == 0 &&
 		    IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) {
 			nio = dio;
 			if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) {
 				stretch = B_TRUE;
 				break;
 			}
 		}
 	}
 
 	if (stretch) {
 		/*
 		 * We are going to include an optional io in our aggregated
 		 * span, thus closing the write gap.  Only mandatory i/os can
 		 * start aggregated spans, so make sure that the next i/o
 		 * after our span is mandatory.
 		 */
 		dio = AVL_NEXT(t, last);
 		dio->io_flags &= ~ZIO_FLAG_OPTIONAL;
 	} else {
 		/* do not include the optional i/o */
 		while (last != mandatory && last != first) {
 			ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL);
 			last = AVL_PREV(t, last);
 			ASSERT(last != NULL);
 		}
 	}
 
 	/* A range of one i/o is not worth aggregating. */
 	if (first == last)
 		return (NULL);
 
 	size = IO_SPAN(first, last);
 	ASSERT3U(size, <=, maxblocksize);
 
 	abd = abd_alloc_gang();
 	if (abd == NULL)
 		return (NULL);
 
 	/*
 	 * The aggregate covers [first->io_offset, first->io_offset + size)
 	 * and is flagged DONT_QUEUE, which vdev_queue_io() checks before
 	 * queueing an i/o.
 	 */
 	aio = zio_vdev_delegated_io(first->io_vd, first->io_offset,
 	    abd, size, first->io_type, zio->io_priority,
 	    flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE,
 	    vdev_queue_agg_io_done, NULL);
 	aio->io_timestamp = first->io_timestamp;
 
 	/*
 	 * Walk first..last in tree order.  Each i/o is linked to the
 	 * aggregate, taken off the queue, and has its data (or a gap filler)
 	 * appended to the gang ABD.  nio is fetched before dio is removed
 	 * from the tree.
 	 */
 	nio = first;
 	next_offset = first->io_offset;
 	do {
 		dio = nio;
 		nio = AVL_NEXT(t, dio);
 		zio_add_child(dio, aio);
 		vdev_queue_io_remove(vq, dio);
 
 		if (dio->io_offset != next_offset) {
 			/* allocate a buffer for a read gap */
 			ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ);
 			ASSERT3U(dio->io_offset, >, next_offset);
 			abd = abd_alloc_for_io(
 			    dio->io_offset - next_offset, B_TRUE);
 			abd_gang_add(aio->io_abd, abd, B_TRUE);
 		}
 		if (dio->io_abd &&
 		    (dio->io_size != abd_get_size(dio->io_abd))) {
 			/* abd size not the same as IO size */
 			ASSERT3U(abd_get_size(dio->io_abd), >, dio->io_size);
 			abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size);
 			abd_gang_add(aio->io_abd, abd, B_TRUE);
 		} else {
 			if (dio->io_flags & ZIO_FLAG_NODATA) {
 				/* allocate a buffer for a write gap */
 				ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE);
 				ASSERT3P(dio->io_abd, ==, NULL);
 				abd_gang_add(aio->io_abd,
 				    abd_get_zeros(dio->io_size), B_TRUE);
 			} else {
 				/*
 				 * We pass B_FALSE to abd_gang_add()
 				 * because we did not allocate a new
 				 * ABD, so it is assumed the caller
 				 * will free this ABD.
 				 */
 				abd_gang_add(aio->io_abd, dio->io_abd,
 				    B_FALSE);
 			}
 		}
 		next_offset = dio->io_offset + dio->io_size;
 	} while (dio != last);
 	ASSERT3U(abd_get_size(aio->io_abd), ==, aio->io_size);
 
 	/*
 	 * We need to drop the vdev queue's lock during zio_execute() to
 	 * avoid a deadlock that we could encounter due to lock order
 	 * reversal between vq_lock and io_lock in zio_change_priority().
 	 */
 	mutex_exit(&vq->vq_lock);
 	while ((dio = zio_walk_parents(aio, &zl)) != NULL) {
 		ASSERT3U(dio->io_type, ==, aio->io_type);
 
 		zio_vdev_io_bypass(dio);
 		zio_execute(dio);
 	}
 	mutex_enter(&vq->vq_lock);
 
 	return (aio);
 }
 
 /*
  * Select the next i/o to issue from vq, mark it active and return it.
  * Returns NULL when no class is eligible.  The returned i/o may be an
  * aggregate built by vdev_queue_aggregate().
  *
  * Must be called with vq_lock held.  The lock may be dropped and re-taken
  * internally (while aggregating and while discarding NODATA i/os), and it
  * is held again on return.
  */
 static zio_t *
 vdev_queue_io_to_issue(vdev_queue_t *vq)
 {
 	zio_t *zio, *agg;
 	zio_priority_t p;
 	avl_index_t idx;
 	avl_tree_t *tree;
 
 	for (;;) {
 		ASSERT(MUTEX_HELD(&vq->vq_lock));
 
 		p = vdev_queue_class_to_issue(vq);
 		if (p == ZIO_PRIORITY_NUM_QUEUEABLE) {
 			/* No eligible queued i/os */
 			return (NULL);
 		}
 
 		/*
 		 * LBA-ordered classes (async / scrub / initializing): pick
 		 * the i/o that follows the most recently issued one in
 		 * offset order, wrapping to the first i/o in the tree.
 		 *
 		 * FIFO classes (sync / trim): the search key has timestamp
 		 * 0, so this picks the i/o with the lowest timestamp.
 		 */
 		tree = vdev_queue_class_tree(vq, p);
 		vq->vq_io_search.io_timestamp = 0;
 		vq->vq_io_search.io_offset = vq->vq_last_offset - 1;
 		VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL);
 		zio = avl_nearest(tree, idx, AVL_AFTER);
 		if (zio == NULL)
 			zio = avl_first(tree);
 		ASSERT3U(zio->io_priority, ==, p);
 
 		/*
 		 * Try to merge with neighbors.  On success the aggregate has
 		 * already dequeued its constituents; otherwise dequeue the
 		 * chosen i/o ourselves.
 		 */
 		agg = vdev_queue_aggregate(vq, zio);
 		if (agg == NULL)
 			vdev_queue_io_remove(vq, zio);
 		else
 			zio = agg;
 
 		if (!(zio->io_flags & ZIO_FLAG_NODATA))
 			break;
 
 		/*
 		 * The i/o is or was optional and therefore carries no data,
 		 * so it is simply discarded.  It completes immediately, so
 		 * the queue lock is dropped around it to avoid a deadlock,
 		 * and then another i/o is selected.
 		 */
 		mutex_exit(&vq->vq_lock);
 		zio_vdev_io_bypass(zio);
 		zio_execute(zio);
 		mutex_enter(&vq->vq_lock);
 	}
 
 	vdev_queue_pending_add(vq, zio);
 	vq->vq_last_offset = zio->io_offset + zio->io_size;
 
 	return (zio);
 }
 
 /*
  * Queue zio on its vdev and select the next i/o to issue.
  *
  * An i/o already flagged ZIO_FLAG_DONT_QUEUE is returned untouched.
  * Otherwise the i/o is normalized, timestamped and queued, and the return
  * value is the i/o the caller should issue now, or NULL when there is
  * nothing for the caller to issue (no i/o is eligible, or the chosen i/o
  * was an aggregate that has been dispatched here with zio_nowait()).
  */
 zio_t *
 vdev_queue_io(zio_t *zio)
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 	zio_priority_t prio = zio->io_priority;
 	zio_t *next;
 
 	if (zio->io_flags & ZIO_FLAG_DONT_QUEUE)
 		return (zio);
 
 	/*
 	 * Child i/os inherit their parent's priority, which might not
 	 * match the child's i/o type.  Fix it up here.
 	 */
 	if (zio->io_type == ZIO_TYPE_READ) {
 		ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
 
 		if (!(prio == ZIO_PRIORITY_SYNC_READ ||
 		    prio == ZIO_PRIORITY_ASYNC_READ ||
 		    prio == ZIO_PRIORITY_SCRUB ||
 		    prio == ZIO_PRIORITY_REMOVAL ||
 		    prio == ZIO_PRIORITY_INITIALIZING ||
 		    prio == ZIO_PRIORITY_REBUILD))
 			zio->io_priority = ZIO_PRIORITY_ASYNC_READ;
 	} else if (zio->io_type == ZIO_TYPE_WRITE) {
 		ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM);
 
 		if (!(prio == ZIO_PRIORITY_SYNC_WRITE ||
 		    prio == ZIO_PRIORITY_ASYNC_WRITE ||
 		    prio == ZIO_PRIORITY_REMOVAL ||
 		    prio == ZIO_PRIORITY_INITIALIZING ||
 		    prio == ZIO_PRIORITY_REBUILD))
 			zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE;
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_TRIM);
 		ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM);
 	}
 
 	zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE;
 
 	mutex_enter(&vq->vq_lock);
 	zio->io_timestamp = gethrtime();
 	vdev_queue_io_add(vq, zio);
 	next = vdev_queue_io_to_issue(vq);
 	mutex_exit(&vq->vq_lock);
 
 	/* Aggregates are dispatched here rather than handed to the caller. */
 	if (next != NULL && next->io_done == vdev_queue_agg_io_done) {
 		zio_nowait(next);
 		next = NULL;
 	}
 
 	return (next);
 }
 
 /*
  * Completion hook for an i/o that was active on its vdev queue: retire it
  * from the active set, record timing, then issue as many further queued
  * i/os as the queue will hand out.
  */
 void
 vdev_queue_io_done(zio_t *zio)
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 
 	mutex_enter(&vq->vq_lock);
 
 	vdev_queue_pending_remove(vq, zio);
 
 	zio->io_delta = gethrtime() - zio->io_timestamp;
 	vq->vq_io_complete_ts = gethrtime();
 	vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp;
 
 	for (;;) {
 		zio_t *next = vdev_queue_io_to_issue(vq);
 
 		if (next == NULL)
 			break;
 
 		/* Issue with the queue lock dropped. */
 		mutex_exit(&vq->vq_lock);
 		if (next->io_done != vdev_queue_agg_io_done) {
 			zio_vdev_io_reissue(next);
 			zio_execute(next);
 		} else {
 			zio_nowait(next);
 		}
 		mutex_enter(&vq->vq_lock);
 	}
 
 	mutex_exit(&vq->vq_lock);
 }
 
 /*
  * Change the priority of zio to priority, if that is still possible.
  *
  * The requested priority is first coerced to one valid for the i/o type
  * (anything unexpected becomes the async class for that type).  A queued
  * i/o is moved to the class tree of the new priority, an i/o that is in
  * neither the queued nor the active tree simply has its priority field
  * updated, and an active i/o is left alone.
  */
 void
 vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority)
 {
 	vdev_queue_t *vq = &zio->io_vd->vdev_queue;
 	avl_tree_t *queued;
 
 	/*
 	 * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate
 	 * zio code to issue IOs without adding them to the vdev queue.  Such
 	 * a zio is already going to be issued as quickly as possible, so
 	 * reprioritizing it would not help.
 	 */
 	if (zio->io_priority == ZIO_PRIORITY_NOW)
 		return;
 
 	ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 	ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
 
 	if (zio->io_type == ZIO_TYPE_READ) {
 		if (!(priority == ZIO_PRIORITY_SYNC_READ ||
 		    priority == ZIO_PRIORITY_ASYNC_READ ||
 		    priority == ZIO_PRIORITY_SCRUB))
 			priority = ZIO_PRIORITY_ASYNC_READ;
 	} else {
 		ASSERT(zio->io_type == ZIO_TYPE_WRITE);
 		if (!(priority == ZIO_PRIORITY_SYNC_WRITE ||
 		    priority == ZIO_PRIORITY_ASYNC_WRITE))
 			priority = ZIO_PRIORITY_ASYNC_WRITE;
 	}
 
 	mutex_enter(&vq->vq_lock);
 
 	queued = vdev_queue_class_tree(vq, zio->io_priority);
 	if (avl_find(queued, zio, NULL) == zio) {
 		/* Waiting to be issued: re-insert under the new class. */
 		avl_remove(queued, zio);
 		zio->io_priority = priority;
 		avl_add(vdev_queue_class_tree(vq, priority), zio);
 	} else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) {
 		/* In none of the queues: just record the new priority. */
 		zio->io_priority = priority;
 	}
 	/* Otherwise the zio is active and its priority cannot change. */
 
 	mutex_exit(&vq->vq_lock);
 }
 
 /*
  * vdev_queue_length() and vdev_queue_last_offset() are only used for load
  * calculations, so they deliberately read the queue without taking
  * vq_lock.  An occasional incorrect value on 32-bit platforms is
  * acceptable in exchange for keeping these paths lock free.
  *
  * Returns the number of i/os currently in the active tree of vd's queue.
  */
 int
 vdev_queue_length(vdev_t *vd)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	return (avl_numnodes(&vq->vq_active_tree));
 }
 
 /*
  * Return vq_last_offset of vd's queue.  The value is read without taking
  * vq_lock.
  */
 uint64_t
 vdev_queue_last_offset(vdev_t *vd)
 {
 	vdev_queue_t *vq = &vd->vdev_queue;
 
 	return (vq->vq_last_offset);
 }
 
 /*
  * Module parameters for the vdev queue.  Every entry below is declared
  * with type INT and permission ZMOD_RW; the trailing string is its
  * description.
  * NOTE(review): presumably each entry binds the variable named
  * zfs_vdev_<name> -- confirm against the ZFS_MODULE_PARAM definition.
  */
 /* BEGIN CSTYLED */
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW,
 	"Max vdev I/O aggregation size");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT, ZMOD_RW,
 	"Max vdev I/O aggregation size for non-rotating media");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW,
 	"Allow TRIM I/O to be aggregated");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW,
 	"Aggregate read I/O over gap");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW,
 	"Aggregate write I/O over gap");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW,
 	"Maximum number of active I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT, ZMOD_RW,
 	"Async write concurrency max threshold");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT, ZMOD_RW,
 	"Async write concurrency min threshold");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW,
 	"Max active async read I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW,
 	"Min active async read I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, INT, ZMOD_RW,
 	"Max active async write I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW,
 	"Min active async write I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW,
 	"Max active initializing I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW,
 	"Min active initializing I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW,
 	"Max active removal I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW,
 	"Min active removal I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW,
 	"Max active scrub I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW,
 	"Min active scrub I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW,
 	"Max active sync read I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW,
 	"Min active sync read I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW,
 	"Max active sync write I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW,
 	"Min active sync write I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW,
 	"Max active trim/discard I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW,
 	"Min active trim/discard I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW,
 	"Max active rebuild I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW,
 	"Min active rebuild I/Os per vdev");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_credit, INT, ZMOD_RW,
 	"Number of non-interactive I/Os to allow in sequence");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, nia_delay, INT, ZMOD_RW,
 	"Number of non-interactive I/Os before _max_active");
 
 ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW,
 	"Queue depth percentage for each top-level vdev");
 /* END CSTYLED */