Index: vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	(revision 336957)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dbuf.c	(revision 336958)
@@ -1,3743 +1,3853 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
 #include <sys/zfs_context.h>
 #include <sys/dmu.h>
 #include <sys/dmu_send.h>
 #include <sys/dmu_impl.h>
 #include <sys/dbuf.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dmu_tx.h>
 #include <sys/spa.h>
 #include <sys/zio.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/zfeature.h>
 #include <sys/blkptr.h>
 #include <sys/range_tree.h>
 #include <sys/callb.h>
 #include <sys/abd.h>
 #include <sys/vdev.h>
 #include <sys/cityhash.h>
+#include <sys/spa_impl.h>
 
 uint_t zfs_dbuf_evict_key;
 
 static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
 
 #ifndef __lint
 extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu,
     dmu_buf_evict_func_t *evict_func_sync,
     dmu_buf_evict_func_t *evict_func_async,
     dmu_buf_t **clear_on_evict_dbufp);
 #endif /* ! __lint */
 
 /*
  * Global data structures and functions for the dbuf cache.
  */
 static kmem_cache_t *dbuf_kmem_cache;
 static taskq_t *dbu_evict_taskq;
 
 static kthread_t *dbuf_cache_evict_thread;
 static kmutex_t dbuf_evict_lock;
 static kcondvar_t dbuf_evict_cv;
 static boolean_t dbuf_evict_thread_exit;
 
 /*
- * LRU cache of dbufs. The dbuf cache maintains a list of dbufs that
- * are not currently held but have been recently released. These dbufs
- * are not eligible for arc eviction until they are aged out of the cache.
- * Dbufs are added to the dbuf cache once the last hold is released. If a
- * dbuf is later accessed and still exists in the dbuf cache, then it will
- * be removed from the cache and later re-added to the head of the cache.
- * Dbufs that are aged out of the cache will be immediately destroyed and
- * become eligible for arc eviction.
+ * There are two dbuf caches; each dbuf can only be in one of them at a time.
+ *
+ * 1. Cache of metadata dbufs, to help make read-heavy administrative commands
+ *    from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs
+ *    that represent the metadata that describes filesystems/snapshots/
+ *    bookmarks/properties/etc. We only evict from this cache when we export a
+ *    pool, to short-circuit as much I/O as possible for all administrative
+ *    commands that need the metadata. There is no eviction policy for this
+ *    cache, because we try to only include types in it which would occupy a
+ *    very small amount of space per object but create a large impact on the
+ *    performance of these commands. Instead, after it reaches a maximum size
+ *    (which should only happen on very small memory systems with a very large
+ *    number of filesystem objects), we stop taking new dbufs into the
+ *    metadata cache, instead putting them in the normal dbuf cache.
+ *
+ * 2. LRU cache of dbufs. The "dbuf cache" maintains a list of dbufs that
+ *    are not currently held but have been recently released. These dbufs
+ *    are not eligible for arc eviction until they are aged out of the cache.
+ *    Dbufs that are aged out of the cache will be immediately destroyed and
+ *    become eligible for arc eviction.
+ *
+ * Dbufs are added to these caches once the last hold is released. If a dbuf is
+ * later accessed and still exists in the dbuf cache, then it will be removed
+ * from the cache and later re-added to the head of the cache.
+ *
+ * If a given dbuf meets the requirements for the metadata cache, it will go
+ * there, otherwise it will be considered for the generic LRU dbuf cache. The
+ * caches and the refcounts tracking their sizes are stored in an array indexed
+ * by those caches' matching enum values (from dbuf_cached_state_t).
  */
-static multilist_t *dbuf_cache;
-static refcount_t dbuf_cache_size;
-uint64_t dbuf_cache_max_bytes = 0;
+typedef struct dbuf_cache {
+	multilist_t *cache;
+	refcount_t size;
+} dbuf_cache_t;
+dbuf_cache_t dbuf_caches[DB_CACHE_MAX];
 
-/* Set the default size of the dbuf cache to log2 fraction of arc size. */
+/* Size limits for the caches */
+uint64_t dbuf_cache_max_bytes = 0;
+uint64_t dbuf_metadata_cache_max_bytes = 0;
+/* Set the default sizes of the caches to log2 fraction of arc size */
 int dbuf_cache_shift = 5;
+int dbuf_metadata_cache_shift = 6;
 
 /*
- * The dbuf cache uses a three-stage eviction policy:
+ * For diagnostic purposes, this is incremented whenever we can't add
+ * something to the metadata cache because it's full, and instead put
+ * the data in the regular dbuf cache.
+ */
+uint64_t dbuf_metadata_cache_overflow;
+
+/*
+ * The LRU dbuf cache uses a three-stage eviction policy:
  *	- A low water marker designates when the dbuf eviction thread
  *	should stop evicting from the dbuf cache.
  *	- When we reach the maximum size (aka mid water mark), we
  *	signal the eviction thread to run.
  *	- The high water mark indicates when the eviction thread
  *	is unable to keep up with the incoming load and eviction must
  *	happen in the context of the calling thread.
  *
  * The dbuf cache:
  *                                                 (max size)
  *                                      low water   mid water   hi water
  * +----------------------------------------+----------+----------+
  * |                                        |          |          |
  * |                                        |          |          |
  * |                                        |          |          |
  * |                                        |          |          |
  * +----------------------------------------+----------+----------+
  *                                        stop        signal     evict
  *                                      evicting     eviction   directly
  *                                                    thread
  *
  * The high and low water marks indicate the operating range for the eviction
  * thread. The low water mark is, by default, 90% of the total size of the
  * cache and the high water mark is at 110% (both of these percentages can be
  * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct,
  * respectively). The eviction thread will try to ensure that the cache remains
  * within this range by waking up every second and checking if the cache is
  * above the low water mark. The thread can also be woken up by callers adding
  * elements into the cache if the cache is larger than the mid water (i.e max
  * cache size). Once the eviction thread is woken up and eviction is required,
  * it will continue evicting buffers until it's able to reduce the cache size
  * to the low water mark. If the cache size continues to grow and hits the high
  * water mark, then callers adding elments to the cache will begin to evict
  * directly from the cache until the cache is no longer above the high water
  * mark.
  */
 
 /*
  * The percentage above and below the maximum cache size.
  */
 uint_t dbuf_cache_hiwater_pct = 10;
 uint_t dbuf_cache_lowater_pct = 10;
 
 /* ARGSUSED */
 static int
 dbuf_cons(void *vdb, void *unused, int kmflag)
 {
 	dmu_buf_impl_t *db = vdb;
 	bzero(db, sizeof (dmu_buf_impl_t));
 
 	mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
 	multilist_link_init(&db->db_cache_link);
 	refcount_create(&db->db_holds);
 
 	return (0);
 }
 
 /* ARGSUSED */
 static void
 dbuf_dest(void *vdb, void *unused)
 {
 	dmu_buf_impl_t *db = vdb;
 	mutex_destroy(&db->db_mtx);
 	cv_destroy(&db->db_changed);
 	ASSERT(!multilist_link_active(&db->db_cache_link));
 	refcount_destroy(&db->db_holds);
 }
 
 /*
  * dbuf hash table routines
  */
 static dbuf_hash_table_t dbuf_hash_table;
 
 static uint64_t dbuf_hash_count;
 
 /*
  * We use Cityhash for this. It's fast, and has good hash properties without
  * requiring any large static buffers.
  */
 static uint64_t
 dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid)
 {
 	return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid));
 }
 
 #define	DBUF_EQUAL(dbuf, os, obj, level, blkid)		\
 	((dbuf)->db.db_object == (obj) &&		\
 	(dbuf)->db_objset == (os) &&			\
 	(dbuf)->db_level == (level) &&			\
 	(dbuf)->db_blkid == (blkid))
 
 dmu_buf_impl_t *
 dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t hv = dbuf_hash(os, obj, level, blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *db;
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) {
 		if (DBUF_EQUAL(db, os, obj, level, blkid)) {
 			mutex_enter(&db->db_mtx);
 			if (db->db_state != DB_EVICTING) {
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (db);
 			}
 			mutex_exit(&db->db_mtx);
 		}
 	}
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	return (NULL);
 }
 
 static dmu_buf_impl_t *
 dbuf_find_bonus(objset_t *os, uint64_t object)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *db = NULL;
 
 	if (dnode_hold(os, object, FTAG, &dn) == 0) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		if (dn->dn_bonus != NULL) {
 			db = dn->dn_bonus;
 			mutex_enter(&db->db_mtx);
 		}
 		rw_exit(&dn->dn_struct_rwlock);
 		dnode_rele(dn, FTAG);
 	}
 	return (db);
 }
 
 /*
  * Insert an entry into the hash table.  If there is already an element
  * equal to elem in the hash table, then the already existing element
  * will be returned and the new element will not be inserted.
  * Otherwise returns NULL.
  */
 static dmu_buf_impl_t *
 dbuf_hash_insert(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	objset_t *os = db->db_objset;
 	uint64_t obj = db->db.db_object;
 	int level = db->db_level;
 	uint64_t blkid = db->db_blkid;
 	uint64_t hv = dbuf_hash(os, obj, level, blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *dbf;
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) {
 		if (DBUF_EQUAL(dbf, os, obj, level, blkid)) {
 			mutex_enter(&dbf->db_mtx);
 			if (dbf->db_state != DB_EVICTING) {
 				mutex_exit(DBUF_HASH_MUTEX(h, idx));
 				return (dbf);
 			}
 			mutex_exit(&dbf->db_mtx);
 		}
 	}
 
 	mutex_enter(&db->db_mtx);
 	db->db_hash_next = h->hash_table[idx];
 	h->hash_table[idx] = db;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	atomic_inc_64(&dbuf_hash_count);
 
 	return (NULL);
 }
 
 /*
  * Remove an entry from the hash table.  It must be in the EVICTING state.
  */
 static void
 dbuf_hash_remove(dmu_buf_impl_t *db)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	uint64_t hv = dbuf_hash(db->db_objset, db->db.db_object,
 	    db->db_level, db->db_blkid);
 	uint64_t idx = hv & h->hash_table_mask;
 	dmu_buf_impl_t *dbf, **dbp;
 
 	/*
 	 * We musn't hold db_mtx to maintain lock ordering:
 	 * DBUF_HASH_MUTEX > db_mtx.
 	 */
 	ASSERT(refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_state == DB_EVICTING);
 	ASSERT(!MUTEX_HELD(&db->db_mtx));
 
 	mutex_enter(DBUF_HASH_MUTEX(h, idx));
 	dbp = &h->hash_table[idx];
 	while ((dbf = *dbp) != db) {
 		dbp = &dbf->db_hash_next;
 		ASSERT(dbf != NULL);
 	}
 	*dbp = db->db_hash_next;
 	db->db_hash_next = NULL;
 	mutex_exit(DBUF_HASH_MUTEX(h, idx));
 	atomic_dec_64(&dbuf_hash_count);
 }
 
 typedef enum {
 	DBVU_EVICTING,
 	DBVU_NOT_EVICTING
 } dbvu_verify_type_t;
 
 static void
 dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type)
 {
 #ifdef ZFS_DEBUG
 	int64_t holds;
 
 	if (db->db_user == NULL)
 		return;
 
 	/* Only data blocks support the attachment of user data. */
 	ASSERT(db->db_level == 0);
 
 	/* Clients must resolve a dbuf before attaching user data. */
 	ASSERT(db->db.db_data != NULL);
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 
 	holds = refcount_count(&db->db_holds);
 	if (verify_type == DBVU_EVICTING) {
 		/*
 		 * Immediate eviction occurs when holds == dirtycnt.
 		 * For normal eviction buffers, holds is zero on
 		 * eviction, except when dbuf_fix_old_data() calls
 		 * dbuf_clear_data().  However, the hold count can grow
 		 * during eviction even though db_mtx is held (see
 		 * dmu_bonus_hold() for an example), so we can only
 		 * test the generic invariant that holds >= dirtycnt.
 		 */
 		ASSERT3U(holds, >=, db->db_dirtycnt);
 	} else {
 		if (db->db_user_immediate_evict == TRUE)
 			ASSERT3U(holds, >=, db->db_dirtycnt);
 		else
 			ASSERT3U(holds, >, 0);
 	}
 #endif
 }
 
 static void
 dbuf_evict_user(dmu_buf_impl_t *db)
 {
 	dmu_buf_user_t *dbu = db->db_user;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (dbu == NULL)
 		return;
 
 	dbuf_verify_user(db, DBVU_EVICTING);
 	db->db_user = NULL;
 
 #ifdef ZFS_DEBUG
 	if (dbu->dbu_clear_on_evict_dbufp != NULL)
 		*dbu->dbu_clear_on_evict_dbufp = NULL;
 #endif
 
 	/*
 	 * There are two eviction callbacks - one that we call synchronously
 	 * and one that we invoke via a taskq.  The async one is useful for
 	 * avoiding lock order reversals and limiting stack depth.
 	 *
 	 * Note that if we have a sync callback but no async callback,
 	 * it's likely that the sync callback will free the structure
 	 * containing the dbu.  In that case we need to take care to not
 	 * dereference dbu after calling the sync evict func.
 	 */
 	boolean_t has_async = (dbu->dbu_evict_func_async != NULL);
 
 	if (dbu->dbu_evict_func_sync != NULL)
 		dbu->dbu_evict_func_sync(dbu);
 
 	if (has_async) {
 		taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async,
 		    dbu, 0, &dbu->dbu_tqent);
 	}
 }
 
 boolean_t
 dbuf_is_metadata(dmu_buf_impl_t *db)
 {
 	if (db->db_level > 0) {
 		return (B_TRUE);
 	} else {
 		boolean_t is_metadata;
 
 		DB_DNODE_ENTER(db);
 		is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type);
 		DB_DNODE_EXIT(db);
 
 		return (is_metadata);
 	}
 }
 
 /*
+ * This returns whether this dbuf should be stored in the metadata cache, which
+ * is based on whether it's from one of the dnode types that store data related
+ * to traversing dataset hierarchies.
+ */
+static boolean_t
+dbuf_include_in_metadata_cache(dmu_buf_impl_t *db)
+{
+	DB_DNODE_ENTER(db);
+	dmu_object_type_t type = DB_DNODE(db)->dn_type;
+	DB_DNODE_EXIT(db);
+
+	/* Check if this dbuf is one of the types we care about */
+	if (DMU_OT_IS_METADATA_CACHED(type)) {
+		/* If we hit this, then we set something up wrong in dmu_ot */
+		ASSERT(DMU_OT_IS_METADATA(type));
+
+		/*
+		 * Sanity check for small-memory systems: don't allocate too
+		 * much memory for this purpose.
+		 */
+		if (refcount_count(&dbuf_caches[DB_DBUF_METADATA_CACHE].size) >
+		    dbuf_metadata_cache_max_bytes) {
+			dbuf_metadata_cache_overflow++;
+			DTRACE_PROBE1(dbuf__metadata__cache__overflow,
+			    dmu_buf_impl_t *, db);
+			return (B_FALSE);
+		}
+
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+/*
  * This function *must* return indices evenly distributed between all
  * sublists of the multilist. This is needed due to how the dbuf eviction
  * code is laid out; dbuf_evict_thread() assumes dbufs are evenly
  * distributed between all sublists and uses this assumption when
  * deciding which sublist to evict from and how much to evict from it.
  */
 unsigned int
 dbuf_cache_multilist_index_func(multilist_t *ml, void *obj)
 {
 	dmu_buf_impl_t *db = obj;
 
 	/*
 	 * The assumption here, is the hash value for a given
 	 * dmu_buf_impl_t will remain constant throughout it's lifetime
 	 * (i.e. it's objset, object, level and blkid fields don't change).
 	 * Thus, we don't need to store the dbuf's sublist index
 	 * on insertion, as this index can be recalculated on removal.
 	 *
 	 * Also, the low order bits of the hash value are thought to be
 	 * distributed evenly. Otherwise, in the case that the multilist
 	 * has a power of two number of sublists, each sublists' usage
 	 * would not be evenly distributed.
 	 */
 	return (dbuf_hash(db->db_objset, db->db.db_object,
 	    db->db_level, db->db_blkid) %
 	    multilist_get_num_sublists(ml));
 }
 
 static inline boolean_t
 dbuf_cache_above_hiwater(void)
 {
 	uint64_t dbuf_cache_hiwater_bytes =
 	    (dbuf_cache_max_bytes * dbuf_cache_hiwater_pct) / 100;
 
-	return (refcount_count(&dbuf_cache_size) >
+	return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
 	    dbuf_cache_max_bytes + dbuf_cache_hiwater_bytes);
 }
 
 static inline boolean_t
 dbuf_cache_above_lowater(void)
 {
 	uint64_t dbuf_cache_lowater_bytes =
 	    (dbuf_cache_max_bytes * dbuf_cache_lowater_pct) / 100;
 
-	return (refcount_count(&dbuf_cache_size) >
+	return (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
 	    dbuf_cache_max_bytes - dbuf_cache_lowater_bytes);
 }
 
 /*
  * Evict the oldest eligible dbuf from the dbuf cache.
  */
 static void
 dbuf_evict_one(void)
 {
-	int idx = multilist_get_random_index(dbuf_cache);
-	multilist_sublist_t *mls = multilist_sublist_lock(dbuf_cache, idx);
+	int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache);
+	multilist_sublist_t *mls = multilist_sublist_lock(
+	    dbuf_caches[DB_DBUF_CACHE].cache, idx);
 
 	ASSERT(!MUTEX_HELD(&dbuf_evict_lock));
 
 	/*
 	 * Set the thread's tsd to indicate that it's processing evictions.
 	 * Once a thread stops evicting from the dbuf cache it will
 	 * reset its tsd to NULL.
 	 */
 	ASSERT3P(tsd_get(zfs_dbuf_evict_key), ==, NULL);
 	(void) tsd_set(zfs_dbuf_evict_key, (void *)B_TRUE);
 
 	dmu_buf_impl_t *db = multilist_sublist_tail(mls);
 	while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) {
 		db = multilist_sublist_prev(mls, db);
 	}
 
 	DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db,
 	    multilist_sublist_t *, mls);
 
 	if (db != NULL) {
 		multilist_sublist_remove(mls, db);
 		multilist_sublist_unlock(mls);
-		(void) refcount_remove_many(&dbuf_cache_size,
+		(void) refcount_remove_many(&dbuf_caches[DB_DBUF_CACHE].size,
 		    db->db.db_size, db);
+		ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE);
+		db->db_caching_status = DB_NO_CACHE;
 		dbuf_destroy(db);
 	} else {
 		multilist_sublist_unlock(mls);
 	}
 	(void) tsd_set(zfs_dbuf_evict_key, NULL);
 }
 
 /*
  * The dbuf evict thread is responsible for aging out dbufs from the
  * cache. Once the cache has reached it's maximum size, dbufs are removed
  * and destroyed. The eviction thread will continue running until the size
  * of the dbuf cache is at or below the maximum size. Once the dbuf is aged
  * out of the cache it is destroyed and becomes eligible for arc eviction.
  */
 /* ARGSUSED */
 static void
 dbuf_evict_thread(void *unused)
 {
 	callb_cpr_t cpr;
 
 	CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG);
 
 	mutex_enter(&dbuf_evict_lock);
 	while (!dbuf_evict_thread_exit) {
 		while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
 			CALLB_CPR_SAFE_BEGIN(&cpr);
 			(void) cv_timedwait_hires(&dbuf_evict_cv,
 			    &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0);
 			CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock);
 		}
 		mutex_exit(&dbuf_evict_lock);
 
 		/*
 		 * Keep evicting as long as we're above the low water mark
 		 * for the cache. We do this without holding the locks to
 		 * minimize lock contention.
 		 */
 		while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) {
 			dbuf_evict_one();
 		}
 
 		mutex_enter(&dbuf_evict_lock);
 	}
 
 	dbuf_evict_thread_exit = B_FALSE;
 	cv_broadcast(&dbuf_evict_cv);
 	CALLB_CPR_EXIT(&cpr);	/* drops dbuf_evict_lock */
 	thread_exit();
 }
 
 /*
  * Wake up the dbuf eviction thread if the dbuf cache is at its max size.
  * If the dbuf cache is at its high water mark, then evict a dbuf from the
  * dbuf cache using the callers context.
  */
 static void
 dbuf_evict_notify(void)
 {
 
 	/*
 	 * We use thread specific data to track when a thread has
 	 * started processing evictions. This allows us to avoid deeply
 	 * nested stacks that would have a call flow similar to this:
 	 *
 	 * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify()
 	 *	^						|
 	 *	|						|
 	 *	+-----dbuf_destroy()<--dbuf_evict_one()<--------+
 	 *
 	 * The dbuf_eviction_thread will always have its tsd set until
 	 * that thread exits. All other threads will only set their tsd
 	 * if they are participating in the eviction process. This only
 	 * happens if the eviction thread is unable to process evictions
 	 * fast enough. To keep the dbuf cache size in check, other threads
 	 * can evict from the dbuf cache directly. Those threads will set
 	 * their tsd values so that we ensure that they only evict one dbuf
 	 * from the dbuf cache.
 	 */
 	if (tsd_get(zfs_dbuf_evict_key) != NULL)
 		return;
 
 	/*
 	 * We check if we should evict without holding the dbuf_evict_lock,
 	 * because it's OK to occasionally make the wrong decision here,
 	 * and grabbing the lock results in massive lock contention.
 	 */
-	if (refcount_count(&dbuf_cache_size) > dbuf_cache_max_bytes) {
+	if (refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) >
+	    dbuf_cache_max_bytes) {
 		if (dbuf_cache_above_hiwater())
 			dbuf_evict_one();
 		cv_signal(&dbuf_evict_cv);
 	}
 }
 
 void
 dbuf_init(void)
 {
 	uint64_t hsize = 1ULL << 16;
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	int i;
 
 	/*
 	 * The hash table is big enough to fill all of physical memory
 	 * with an average 4K block size.  The table will take up
 	 * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers).
 	 */
 	while (hsize * 4096 < physmem * PAGESIZE)
 		hsize <<= 1;
 
 retry:
 	h->hash_table_mask = hsize - 1;
 	h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP);
 	if (h->hash_table == NULL) {
 		/* XXX - we should really return an error instead of assert */
 		ASSERT(hsize > (1ULL << 10));
 		hsize >>= 1;
 		goto retry;
 	}
 
 	dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t",
 	    sizeof (dmu_buf_impl_t),
 	    0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0);
 
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL);
 
 	/*
-	 * Setup the parameters for the dbuf cache. We set the size of the
-	 * dbuf cache to 1/32nd (default) of the size of the ARC. If the value
-	 * has been set in /etc/system and it's not greater than the size of
-	 * the ARC, then we honor that value.
+	 * Setup the parameters for the dbuf caches. We set the sizes of the
+	 * dbuf cache and the metadata cache to 1/32nd and 1/16th (default)
+	 * of the size of the ARC, respectively. If the values are set in
+	 * /etc/system and they're not greater than the size of the ARC, then
+	 * we honor that value.
 	 */
 	if (dbuf_cache_max_bytes == 0 ||
 	    dbuf_cache_max_bytes >= arc_max_bytes())  {
 		dbuf_cache_max_bytes = arc_max_bytes() >> dbuf_cache_shift;
 	}
+	if (dbuf_metadata_cache_max_bytes == 0 ||
+	    dbuf_metadata_cache_max_bytes >= arc_max_bytes()) {
+		dbuf_metadata_cache_max_bytes =
+		    arc_max_bytes() >> dbuf_metadata_cache_shift;
+	}
 
 	/*
 	 * All entries are queued via taskq_dispatch_ent(), so min/maxalloc
 	 * configuration is not required.
 	 */
 	dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0);
 
-	dbuf_cache = multilist_create(sizeof (dmu_buf_impl_t),
-	    offsetof(dmu_buf_impl_t, db_cache_link),
-	    dbuf_cache_multilist_index_func);
-	refcount_create(&dbuf_cache_size);
+	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+		dbuf_caches[dcs].cache =
+		    multilist_create(sizeof (dmu_buf_impl_t),
+		    offsetof(dmu_buf_impl_t, db_cache_link),
+		    dbuf_cache_multilist_index_func);
+		refcount_create(&dbuf_caches[dcs].size);
+	}
 
 	tsd_create(&zfs_dbuf_evict_key, NULL);
 	dbuf_evict_thread_exit = B_FALSE;
 	mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL);
 	cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL);
 	dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread,
 	    NULL, 0, &p0, TS_RUN, minclsyspri);
 }
 
 void
 dbuf_fini(void)
 {
 	dbuf_hash_table_t *h = &dbuf_hash_table;
 	int i;
 
 	for (i = 0; i < DBUF_MUTEXES; i++)
 		mutex_destroy(&h->hash_mutexes[i]);
 	kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *));
 	kmem_cache_destroy(dbuf_kmem_cache);
 	taskq_destroy(dbu_evict_taskq);
 
 	mutex_enter(&dbuf_evict_lock);
 	dbuf_evict_thread_exit = B_TRUE;
 	while (dbuf_evict_thread_exit) {
 		cv_signal(&dbuf_evict_cv);
 		cv_wait(&dbuf_evict_cv, &dbuf_evict_lock);
 	}
 	mutex_exit(&dbuf_evict_lock);
 	tsd_destroy(&zfs_dbuf_evict_key);
 
 	mutex_destroy(&dbuf_evict_lock);
 	cv_destroy(&dbuf_evict_cv);
 
-	refcount_destroy(&dbuf_cache_size);
-	multilist_destroy(dbuf_cache);
+	for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) {
+		refcount_destroy(&dbuf_caches[dcs].size);
+		multilist_destroy(dbuf_caches[dcs].cache);
+	}
 }
 
 /*
  * Other stuff.
  */
 
 #ifdef ZFS_DEBUG
 static void
 dbuf_verify(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dbuf_dirty_record_t *dr;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY))
 		return;
 
 	ASSERT(db->db_objset != NULL);
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dn == NULL) {
 		ASSERT(db->db_parent == NULL);
 		ASSERT(db->db_blkptr == NULL);
 	} else {
 		ASSERT3U(db->db.db_object, ==, dn->dn_object);
 		ASSERT3P(db->db_objset, ==, dn->dn_objset);
 		ASSERT3U(db->db_level, <, dn->dn_nlevels);
 		ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
 		    db->db_blkid == DMU_SPILL_BLKID ||
 		    !avl_is_empty(&dn->dn_dbufs));
 	}
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID);
 	} else if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn != NULL);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		ASSERT0(db->db.db_offset);
 	} else {
 		ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
 	}
 
 	for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next)
 		ASSERT(dr->dr_dbuf == db);
 
 	for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next)
 		ASSERT(dr->dr_dbuf == db);
 
 	/*
 	 * We can't assert that db_size matches dn_datablksz because it
 	 * can be momentarily different when another thread is doing
 	 * dnode_set_blksz().
 	 */
 	if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) {
 		dr = db->db_data_pending;
 		/*
 		 * It should only be modified in syncing context, so
 		 * make sure we only have one copy of the data.
 		 */
 		ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf);
 	}
 
 	/* verify db->db_blkptr */
 	if (db->db_blkptr) {
 		if (db->db_parent == dn->dn_dbuf) {
 			/* db is pointed to by the dnode */
 			/* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */
 			if (DMU_OBJECT_IS_SPECIAL(db->db.db_object))
 				ASSERT(db->db_parent == NULL);
 			else
 				ASSERT(db->db_parent != NULL);
 			if (db->db_blkid != DMU_SPILL_BLKID)
 				ASSERT3P(db->db_blkptr, ==,
 				    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		} else {
 			/* db is pointed to by an indirect block */
 			int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_parent->db_level, ==, db->db_level+1);
 			ASSERT3U(db->db_parent->db.db_object, ==,
 			    db->db.db_object);
 			/*
 			 * dnode_grow_indblksz() can make this fail if we don't
 			 * have the struct_rwlock.  XXX indblksz no longer
 			 * grows.  safe to do this now?
 			 */
 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 				ASSERT3P(db->db_blkptr, ==,
 				    ((blkptr_t *)db->db_parent->db.db_data +
 				    db->db_blkid % epb));
 			}
 		}
 	}
 	if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) &&
 	    (db->db_buf == NULL || db->db_buf->b_data) &&
 	    db->db.db_data && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_state != DB_FILL && !dn->dn_free_txg) {
 		/*
 		 * If the blkptr isn't set but they have nonzero data,
 		 * it had better be dirty, otherwise we'll lose that
 		 * data when we evict this buffer.
 		 *
 		 * There is an exception to this rule for indirect blocks; in
 		 * this case, if the indirect block is a hole, we fill in a few
 		 * fields on each of the child blocks (importantly, birth time)
 		 * to prevent hole birth times from being lost when you
 		 * partially fill in a hole.
 		 */
 		if (db->db_dirtycnt == 0) {
 			if (db->db_level == 0) {
 				uint64_t *buf = db->db.db_data;
 				int i;
 
 				for (i = 0; i < db->db.db_size >> 3; i++) {
 					ASSERT(buf[i] == 0);
 				}
 			} else {
 				blkptr_t *bps = db->db.db_data;
 				ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==,
 				    db->db.db_size);
 				/*
 				 * We want to verify that all the blkptrs in the
 				 * indirect block are holes, but we may have
 				 * automatically set up a few fields for them.
 				 * We iterate through each blkptr and verify
 				 * they only have those fields set.
 				 */
 				for (int i = 0;
 				    i < db->db.db_size / sizeof (blkptr_t);
 				    i++) {
 					blkptr_t *bp = &bps[i];
 					ASSERT(ZIO_CHECKSUM_IS_ZERO(
 					    &bp->blk_cksum));
 					ASSERT(
 					    DVA_IS_EMPTY(&bp->blk_dva[0]) &&
 					    DVA_IS_EMPTY(&bp->blk_dva[1]) &&
 					    DVA_IS_EMPTY(&bp->blk_dva[2]));
 					ASSERT0(bp->blk_fill);
 					ASSERT0(bp->blk_pad[0]);
 					ASSERT0(bp->blk_pad[1]);
 					ASSERT(!BP_IS_EMBEDDED(bp));
 					ASSERT(BP_IS_HOLE(bp));
 					ASSERT0(bp->blk_phys_birth);
 				}
 			}
 		}
 	}
 	DB_DNODE_EXIT(db);
 }
 #endif
 
 static void
 dbuf_clear_data(dmu_buf_impl_t *db)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	dbuf_evict_user(db);
 	ASSERT3P(db->db_buf, ==, NULL);
 	db->db.db_data = NULL;
 	if (db->db_state != DB_NOFILL)
 		db->db_state = DB_UNCACHED;
 }
 
 static void
 dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf)
 {
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(buf != NULL);
 
 	db->db_buf = buf;
 	ASSERT(buf->b_data != NULL);
 	db->db.db_data = buf->b_data;
 }
 
 /*
  * Loan out an arc_buf for read.  Return the loaned arc_buf.
  */
 arc_buf_t *
 dbuf_loan_arcbuf(dmu_buf_impl_t *db)
 {
 	arc_buf_t *abuf;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);
 	if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) {
 		int blksz = db->db.db_size;
 		spa_t *spa = db->db_objset->os_spa;
 
 		mutex_exit(&db->db_mtx);
 		abuf = arc_loan_buf(spa, B_FALSE, blksz);
 		bcopy(db->db.db_data, abuf->b_data, blksz);
 	} else {
 		abuf = db->db_buf;
 		arc_loan_inuse_buf(abuf, db);
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
 		mutex_exit(&db->db_mtx);
 	}
 	return (abuf);
 }
 
 /*
  * Calculate which level n block references the data at the level 0 offset
  * provided.
  */
 uint64_t
 dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset)
 {
 	if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) {
 		/*
 		 * The level n blkid is equal to the level 0 blkid divided by
 		 * the number of level 0s in a level n block.
 		 *
 		 * The level 0 blkid is offset >> datablkshift =
 		 * offset / 2^datablkshift.
 		 *
 		 * The number of level 0s in a level n is the number of block
 		 * pointers in an indirect block, raised to the power of level.
 		 * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level =
 		 * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)).
 		 *
 		 * Thus, the level n blkid is: offset /
 		 * ((2^datablkshift)*(2^(level*(indblkshift - SPA_BLKPTRSHIFT)))
 		 * = offset / 2^(datablkshift + level *
 		 *   (indblkshift - SPA_BLKPTRSHIFT))
 		 * = offset >> (datablkshift + level *
 		 *   (indblkshift - SPA_BLKPTRSHIFT))
 		 */
 		return (offset >> (dn->dn_datablkshift + level *
 		    (dn->dn_indblkshift - SPA_BLKPTRSHIFT)));
 	} else {
 		ASSERT3U(offset, <, dn->dn_datablksz);
 		return (0);
 	}
 }
 
 static void
 dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT3U(db->db_state, ==, DB_READ);
 	/*
 	 * All reads are synchronous, so we must have a hold on the dbuf
 	 */
 	ASSERT(refcount_count(&db->db_holds) > 0);
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 	if (db->db_level == 0 && db->db_freed_in_flight) {
 		/* we were freed in flight; disregard any error */
 		arc_release(buf, db);
 		bzero(buf->b_data, db->db.db_size);
 		arc_buf_freeze(buf);
 		db->db_freed_in_flight = FALSE;
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else if (zio == NULL || zio->io_error == 0) {
 		dbuf_set_data(db, buf);
 		db->db_state = DB_CACHED;
 	} else {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT3P(db->db_buf, ==, NULL);
 		arc_buf_destroy(buf, db);
 		db->db_state = DB_UNCACHED;
 	}
 	cv_broadcast(&db->db_changed);
 	dbuf_rele_and_unlock(db, NULL);
 }
 
 static void
 dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	dnode_t *dn;
 	zbookmark_phys_t zb;
 	arc_flags_t aflags = ARC_FLAG_NOWAIT;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	/* We need the struct_rwlock to prevent db_blkptr from changing. */
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db_state == DB_UNCACHED);
 	ASSERT(db->db_buf == NULL);
 
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen);
 
 		ASSERT3U(bonuslen, <=, db->db.db_size);
 		db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		if (bonuslen < DN_MAX_BONUSLEN)
 			bzero(db->db.db_data, DN_MAX_BONUSLEN);
 		if (bonuslen)
 			bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen);
 		DB_DNODE_EXIT(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	/*
 	 * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
 	 * processes the delete record and clears the bp while we are waiting
 	 * for the dn_mtx (resulting in a "no" from block_freed).
 	 */
 	if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
 	    (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
 	    BP_IS_HOLE(db->db_blkptr)))) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 		dbuf_set_data(db, arc_alloc_buf(db->db_objset->os_spa, db, type,
 		    db->db.db_size));
 		bzero(db->db.db_data, db->db.db_size);
 
 		if (db->db_blkptr != NULL && db->db_level > 0 &&
 		    BP_IS_HOLE(db->db_blkptr) &&
 		    db->db_blkptr->blk_birth != 0) {
 			blkptr_t *bps = db->db.db_data;
 			for (int i = 0; i < ((1 <<
 			    DB_DNODE(db)->dn_indblkshift) / sizeof (blkptr_t));
 			    i++) {
 				blkptr_t *bp = &bps[i];
 				ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
 				    1 << dn->dn_indblkshift);
 				BP_SET_LSIZE(bp,
 				    BP_GET_LEVEL(db->db_blkptr) == 1 ?
 				    dn->dn_datablksz :
 				    BP_GET_LSIZE(db->db_blkptr));
 				BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr));
 				BP_SET_LEVEL(bp,
 				    BP_GET_LEVEL(db->db_blkptr) - 1);
 				BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0);
 			}
 		}
 		DB_DNODE_EXIT(db);
 		db->db_state = DB_CACHED;
 		mutex_exit(&db->db_mtx);
 		return;
 	}
 
 	DB_DNODE_EXIT(db);
 
 	db->db_state = DB_READ;
 	mutex_exit(&db->db_mtx);
 
 	if (DBUF_IS_L2CACHEABLE(db))
 		aflags |= ARC_FLAG_L2CACHE;
 
 	SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ?
 	    db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	dbuf_add_ref(db, NULL);
 
 	(void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr,
 	    dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
 	    (flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
 	    &aflags, &zb);
 }
 
 /*
  * This is our just-in-time copy function.  It makes a copy of buffers that
  * have been modified in a previous transaction group before we access them in
  * the current active group.
  *
  * This function is used in three places: when we are dirtying a buffer for the
  * first time in a txg, when we are freeing a range in a dnode that includes
  * this buffer, and when we are accessing a buffer which was received compressed
  * and later referenced in a WRITE_BYREF record.
  *
  * Note that when we are called from dbuf_free_range() we do not put a hold on
  * the buffer, we just traverse the active dbuf list for the dnode.
  */
 static void
 dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
 {
 	dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(db->db.db_data != NULL);
 	ASSERT(db->db_level == 0);
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT);
 
 	if (dr == NULL ||
 	    (dr->dt.dl.dr_data !=
 	    ((db->db_blkid  == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf)))
 		return;
 
 	/*
 	 * If the last dirty record for this dbuf has not yet synced
 	 * and its referencing the dbuf data, either:
 	 *	reset the reference to point to a new copy,
 	 * or (if there a no active holders)
 	 *	just null out the current db_data pointer.
 	 */
 	ASSERT(dr->dr_txg >= txg - 2);
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		/* Note that the data bufs here are zio_bufs */
 		dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
 		arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
 	} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
 		int size = arc_buf_size(db->db_buf);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 		enum zio_compress compress_type =
 		    arc_get_compression(db->db_buf);
 
 		if (compress_type == ZIO_COMPRESS_OFF) {
 			dr->dt.dl.dr_data = arc_alloc_buf(spa, db, type, size);
 		} else {
 			ASSERT3U(type, ==, ARC_BUFC_DATA);
 			dr->dt.dl.dr_data = arc_alloc_compressed_buf(spa, db,
 			    size, arc_buf_lsize(db->db_buf), compress_type);
 		}
 		bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size);
 	} else {
 		db->db_buf = NULL;
 		dbuf_clear_data(db);
 	}
 }
 
 int
 dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
 {
 	int err = 0;
 	boolean_t prefetch;
 	dnode_t *dn;
 
 	/*
 	 * We don't have to hold the mutex to check db_state because it
 	 * can't be freed while we have a hold on the buffer.
 	 */
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	if (db->db_state == DB_NOFILL)
 		return (SET_ERROR(EIO));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL &&
 	    DBUF_IS_CACHEABLE(db);
 
 	mutex_enter(&db->db_mtx);
 	if (db->db_state == DB_CACHED) {
 		/*
 		 * If the arc buf is compressed, we need to decompress it to
 		 * read the data. This could happen during the "zfs receive" of
 		 * a stream which is compressed and deduplicated.
 		 */
 		if (db->db_buf != NULL &&
 		    arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF) {
 			dbuf_fix_old_data(db,
 			    spa_syncing_txg(dmu_objset_spa(db->db_objset)));
 			err = arc_decompress(db->db_buf);
 			dbuf_set_data(db, db->db_buf);
 		}
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 	} else if (db->db_state == DB_UNCACHED) {
 		spa_t *spa = dn->dn_objset->os_spa;
 		boolean_t need_wait = B_FALSE;
 
 		if (zio == NULL &&
 		    db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) {
 			zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 			need_wait = B_TRUE;
 		}
 		dbuf_read_impl(db, zio, flags);
 
 		/* dbuf_read_impl has dropped db_mtx for us */
 
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
 
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 
 		if (need_wait)
 			err = zio_wait(zio);
 	} else {
 		/*
 		 * Another reader came in while the dbuf was in flight
 		 * between UNCACHED and CACHED.  Either a writer will finish
 		 * writing the buffer (sending the dbuf to CACHED) or the
 		 * first reader's request will reach the read_done callback
 		 * and send the dbuf to CACHED.  Otherwise, a failure
 		 * occurred and the dbuf went to UNCACHED.
 		 */
 		mutex_exit(&db->db_mtx);
 		if (prefetch)
 			dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
 		if ((flags & DB_RF_HAVESTRUCT) == 0)
 			rw_exit(&dn->dn_struct_rwlock);
 		DB_DNODE_EXIT(db);
 
 		/* Skip the wait per the caller's request. */
 		mutex_enter(&db->db_mtx);
 		if ((flags & DB_RF_NEVERWAIT) == 0) {
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL) {
 				ASSERT(db->db_state == DB_READ ||
 				    (flags & DB_RF_HAVESTRUCT) == 0);
 				DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *,
 				    db, zio_t *, zio);
 				cv_wait(&db->db_changed, &db->db_mtx);
 			}
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 		}
 		mutex_exit(&db->db_mtx);
 	}
 
 	return (err);
 }
 
 static void
 dbuf_noread(dmu_buf_impl_t *db)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	mutex_enter(&db->db_mtx);
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 	if (db->db_state == DB_UNCACHED) {
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		spa_t *spa = db->db_objset->os_spa;
 
 		ASSERT(db->db_buf == NULL);
 		ASSERT(db->db.db_data == NULL);
 		dbuf_set_data(db, arc_alloc_buf(spa, db, type, db->db.db_size));
 		db->db_state = DB_FILL;
 	} else if (db->db_state == DB_NOFILL) {
 		dbuf_clear_data(db);
 	} else {
 		ASSERT3U(db->db_state, ==, DB_CACHED);
 	}
 	mutex_exit(&db->db_mtx);
 }
 
 void
 dbuf_unoverride(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	blkptr_t *bp = &dr->dt.dl.dr_overridden_by;
 	uint64_t txg = dr->dr_txg;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	/*
 	 * This assert is valid because dmu_sync() expects to be called by
 	 * a zilog's get_data while holding a range lock.  This call only
 	 * comes from dbuf_dirty() callers who must also hold a range lock.
 	 */
 	ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC);
 	ASSERT(db->db_level == 0);
 
 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN)
 		return;
 
 	ASSERT(db->db_data_pending != dr);
 
 	/* free this block */
 	if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite)
 		zio_free(db->db_objset->os_spa, txg, bp);
 
 	dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	dr->dt.dl.dr_nopwrite = B_FALSE;
 
 	/*
 	 * Release the already-written buffer, so we leave it in
 	 * a consistent dirty state.  Note that all callers are
 	 * modifying the buffer, so they will immediately do
 	 * another (redundant) arc_release().  Therefore, leave
 	 * the buf thawed to save the effort of freezing &
 	 * immediately re-thawing it.
 	 */
 	arc_release(dr->dt.dl.dr_data, db);
 }
 
 /*
  * Evict (if its unreferenced) or clear (if its referenced) any level-0
  * data blocks in the free range, so that any future readers will find
  * empty blocks.
  */
 void
 dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t db_search;
 	dmu_buf_impl_t *db, *db_next;
 	uint64_t txg = tx->tx_txg;
 	avl_index_t where;
 
 	if (end_blkid > dn->dn_maxblkid &&
 	    !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID))
 		end_blkid = dn->dn_maxblkid;
 	dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);
 
 	db_search.db_level = 0;
 	db_search.db_blkid = start_blkid;
 	db_search.db_state = DB_SEARCH;
 
 	mutex_enter(&dn->dn_dbufs_mtx);
 	db = avl_find(&dn->dn_dbufs, &db_search, &where);
 	ASSERT3P(db, ==, NULL);
 
 	db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
 
 	for (; db != NULL; db = db_next) {
 		db_next = AVL_NEXT(&dn->dn_dbufs, db);
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 		if (db->db_level != 0 || db->db_blkid > end_blkid) {
 			break;
 		}
 		ASSERT3U(db->db_blkid, >=, start_blkid);
 
 		/* found a level 0 buffer in the range */
 		mutex_enter(&db->db_mtx);
 		if (dbuf_undirty(db, tx)) {
 			/* mutex has been dropped and dbuf destroyed */
 			continue;
 		}
 
 		if (db->db_state == DB_UNCACHED ||
 		    db->db_state == DB_NOFILL ||
 		    db->db_state == DB_EVICTING) {
 			ASSERT(db->db.db_data == NULL);
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (db->db_state == DB_READ || db->db_state == DB_FILL) {
 			/* will be handled in dbuf_read_done or dbuf_rele */
 			db->db_freed_in_flight = TRUE;
 			mutex_exit(&db->db_mtx);
 			continue;
 		}
 		if (refcount_count(&db->db_holds) == 0) {
 			ASSERT(db->db_buf);
 			dbuf_destroy(db);
 			continue;
 		}
 		/* The dbuf is referenced */
 
 		if (db->db_last_dirty != NULL) {
 			dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 			if (dr->dr_txg == txg) {
 				/*
 				 * This buffer is "in-use", re-adjust the file
 				 * size to reflect that this buffer may
 				 * contain new data when we sync.
 				 */
 				if (db->db_blkid != DMU_SPILL_BLKID &&
 				    db->db_blkid > dn->dn_maxblkid)
 					dn->dn_maxblkid = db->db_blkid;
 				dbuf_unoverride(dr);
 			} else {
 				/*
 				 * This dbuf is not dirty in the open context.
 				 * Either uncache it (if its not referenced in
 				 * the open context) or reset its contents to
 				 * empty.
 				 */
 				dbuf_fix_old_data(db, txg);
 			}
 		}
 		/* clear the contents if its cached */
 		if (db->db_state == DB_CACHED) {
 			ASSERT(db->db.db_data != NULL);
 			arc_release(db->db_buf, db);
 			bzero(db->db.db_data, db->db.db_size);
 			arc_buf_freeze(db->db_buf);
 		}
 
 		mutex_exit(&db->db_mtx);
 	}
 	mutex_exit(&dn->dn_dbufs_mtx);
 }
 
 void
 dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx)
 {
 	arc_buf_t *buf, *obuf;
 	int osize = db->db.db_size;
 	arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 	dnode_t *dn;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	/* XXX does *this* func really need the lock? */
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	/*
 	 * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held
 	 * is OK, because there can be no other references to the db
 	 * when we are changing its size, so no concurrent DB_FILL can
 	 * be happening.
 	 */
 	/*
 	 * XXX we should be doing a dbuf_read, checking the return
 	 * value and returning that up to our callers
 	 */
 	dmu_buf_will_dirty(&db->db, tx);
 
 	/* create the data buffer for the new block */
 	buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size);
 
 	/* copy old block data to the new block */
 	obuf = db->db_buf;
 	bcopy(obuf->b_data, buf->b_data, MIN(osize, size));
 	/* zero the remainder */
 	if (size > osize)
 		bzero((uint8_t *)buf->b_data + osize, size - osize);
 
 	mutex_enter(&db->db_mtx);
 	dbuf_set_data(db, buf);
 	arc_buf_destroy(obuf, db);
 	db->db.db_size = size;
 
 	if (db->db_level == 0) {
 		ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
 		db->db_last_dirty->dt.dl.dr_data = buf;
 	}
 	mutex_exit(&db->db_mtx);
 
 	dmu_objset_willuse_space(dn->dn_objset, size - osize, tx);
 	DB_DNODE_EXIT(db);
 }
 
 void
 dbuf_release_bp(dmu_buf_impl_t *db)
 {
 	objset_t *os = db->db_objset;
 
 	ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
 	ASSERT(arc_released(os->os_phys_buf) ||
 	    list_link_active(&os->os_dsl_dataset->ds_synced_link));
 	ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf));
 
 	(void) arc_release(db->db_buf, db);
 }
 
 /*
  * We already have a dirty record for this TXG, and we are being
  * dirtied again.
  */
 static void
 dbuf_redirty(dbuf_dirty_record_t *dr)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) {
 		/*
 		 * If this buffer has already been written out,
 		 * we now need to reset its state.
 		 */
 		dbuf_unoverride(dr);
 		if (db->db.db_object != DMU_META_DNODE_OBJECT &&
 		    db->db_state != DB_NOFILL) {
 			/* Already released on initial dirty, so just thaw. */
 			ASSERT(arc_released(db->db_buf));
 			arc_buf_thaw(db->db_buf);
 		}
 	}
 }
 
 dbuf_dirty_record_t *
 dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	objset_t *os;
 	dbuf_dirty_record_t **drp, *dr;
 	int drop_struct_lock = FALSE;
 	int txgoff = tx->tx_txg & TXG_MASK;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	DMU_TX_DIRTY_BUF(tx, db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/*
 	 * Shouldn't dirty a regular buffer in syncing context.  Private
 	 * objects may be dirtied in syncing context, but only if they
 	 * were already pre-dirtied in open context.
 	 */
 #ifdef DEBUG
 	if (dn->dn_objset->os_dsl_dataset != NULL) {
 		rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
 		    RW_READER, FTAG);
 	}
 	ASSERT(!dmu_tx_is_syncing(tx) ||
 	    BP_IS_HOLE(dn->dn_objset->os_rootbp) ||
 	    DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    dn->dn_objset->os_dsl_dataset == NULL);
 	if (dn->dn_objset->os_dsl_dataset != NULL)
 		rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG);
 #endif
 	/*
 	 * We make this assert for private objects as well, but after we
 	 * check if we're already dirty.  They are allowed to re-dirty
 	 * in syncing context.
 	 */
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * XXX make this true for indirects too?  The problem is that
 	 * transactions created with dmu_tx_create_assigned() from
 	 * syncing context don't bother holding ahead.
 	 */
 	ASSERT(db->db_level != 0 ||
 	    db->db_state == DB_CACHED || db->db_state == DB_FILL ||
 	    db->db_state == DB_NOFILL);
 
 	mutex_enter(&dn->dn_mtx);
 	/*
 	 * Don't set dirtyctx to SYNC if we're just modifying this as we
 	 * initialize the objset.
 	 */
 	if (dn->dn_dirtyctx == DN_UNDIRTIED) {
 		if (dn->dn_objset->os_dsl_dataset != NULL) {
 			rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
 			    RW_READER, FTAG);
 		}
 		if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) {
 			dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ?
 			    DN_DIRTY_SYNC : DN_DIRTY_OPEN);
 			ASSERT(dn->dn_dirtyctx_firstset == NULL);
 			dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP);
 		}
 		if (dn->dn_objset->os_dsl_dataset != NULL) {
 			rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock,
 			    FTAG);
 		}
 	}
 	mutex_exit(&dn->dn_mtx);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		dn->dn_have_spill = B_TRUE;
 
 	/*
 	 * If this buffer is already dirty, we're done.
 	 */
 	drp = &db->db_last_dirty;
 	ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
 	    db->db.db_object == DMU_META_DNODE_OBJECT);
 	while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
 		drp = &dr->dr_next;
 	if (dr && dr->dr_txg == tx->tx_txg) {
 		DB_DNODE_EXIT(db);
 
 		dbuf_redirty(dr);
 		mutex_exit(&db->db_mtx);
 		return (dr);
 	}
 
 	/*
 	 * Only valid if not already dirty.
 	 */
 	ASSERT(dn->dn_object == 0 ||
 	    dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx ==
 	    (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN));
 
 	ASSERT3U(dn->dn_nlevels, >, db->db_level);
 
 	/*
 	 * We should only be dirtying in syncing context if it's the
 	 * mos or we're initializing the os or it's a special object.
 	 * However, we are allowed to dirty in syncing context provided
 	 * we already dirtied it in open context.  Hence we must make
 	 * this assertion only if we're not already dirty.
 	 */
 	os = dn->dn_objset;
 	VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa));
 #ifdef DEBUG
 	if (dn->dn_objset->os_dsl_dataset != NULL)
 		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG);
 	ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) ||
 	    os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp));
 	if (dn->dn_objset->os_dsl_dataset != NULL)
 		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
 #endif
 	ASSERT(db->db.db_size != 0);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		dmu_objset_willuse_space(os, db->db.db_size, tx);
 	}
 
 	/*
 	 * If this buffer is dirty in an old transaction group we need
 	 * to make a copy of it so that the changes we make in this
 	 * transaction group won't leak out when we sync the older txg.
 	 */
 	dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP);
 	if (db->db_level == 0) {
 		void *data_old = db->db_buf;
 
 		if (db->db_state != DB_NOFILL) {
 			if (db->db_blkid == DMU_BONUS_BLKID) {
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db.db_data;
 			} else if (db->db.db_object != DMU_META_DNODE_OBJECT) {
 				/*
 				 * Release the data buffer from the cache so
 				 * that we can modify it without impacting
 				 * possible other users of this cached data
 				 * block.  Note that indirect blocks and
 				 * private objects are not released until the
 				 * syncing state (since they are only modified
 				 * then).
 				 */
 				arc_release(db->db_buf, db);
 				dbuf_fix_old_data(db, tx->tx_txg);
 				data_old = db->db_buf;
 			}
 			ASSERT(data_old != NULL);
 		}
 		dr->dt.dl.dr_data = data_old;
 	} else {
 		mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL);
 		list_create(&dr->dt.di.dr_children,
 		    sizeof (dbuf_dirty_record_t),
 		    offsetof(dbuf_dirty_record_t, dr_dirty_node));
 	}
 	if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL)
 		dr->dr_accounted = db->db.db_size;
 	dr->dr_dbuf = db;
 	dr->dr_txg = tx->tx_txg;
 	dr->dr_next = *drp;
 	*drp = dr;
 
 	/*
 	 * We could have been freed_in_flight between the dbuf_noread
 	 * and dbuf_dirty.  We win, as though the dbuf_noread() had
 	 * happened after the free.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    db->db_blkid != DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_free_ranges[txgoff] != NULL) {
 			range_tree_clear(dn->dn_free_ranges[txgoff],
 			    db->db_blkid, 1);
 		}
 		mutex_exit(&dn->dn_mtx);
 		db->db_freed_in_flight = FALSE;
 	}
 
 	/*
 	 * This buffer is now part of this txg
 	 */
 	dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg);
 	db->db_dirtycnt += 1;
 	ASSERT3U(db->db_dirtycnt, <=, 3);
 
 	mutex_exit(&db->db_mtx);
 
 	if (db->db_blkid == DMU_BONUS_BLKID ||
 	    db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		dnode_setdirty(dn, tx);
 		DB_DNODE_EXIT(db);
 		return (dr);
 	}
 
 	/*
 	 * The dn_struct_rwlock prevents db_blkptr from changing
 	 * due to a write from syncing context completing
 	 * while we are running, so we want to acquire it before
 	 * looking at db_blkptr.
 	 */
 	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		drop_struct_lock = TRUE;
 	}
 
 	/*
 	 * We need to hold the dn_struct_rwlock to make this assertion,
 	 * because it protects dn_phys / dn_next_nlevels from changing.
 	 */
 	ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) ||
 	    dn->dn_phys->dn_nlevels > db->db_level ||
 	    dn->dn_next_nlevels[txgoff] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level ||
 	    dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level);
 
 	/*
 	 * If we are overwriting a dedup BP, then unless it is snapshotted,
 	 * when we get to syncing context we will need to decrement its
 	 * refcount in the DDT.  Prefetch the relevant DDT block so that
 	 * syncing context won't have to wait for the i/o.
 	 */
 	ddt_prefetch(os->os_spa, db->db_blkptr);
 
 	if (db->db_level == 0) {
 		dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
 		ASSERT(dn->dn_maxblkid >= db->db_blkid);
 	}
 
 	if (db->db_level+1 < dn->dn_nlevels) {
 		dmu_buf_impl_t *parent = db->db_parent;
 		dbuf_dirty_record_t *di;
 		int parent_held = FALSE;
 
 		if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) {
 			int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 			parent = dbuf_hold_level(dn, db->db_level+1,
 			    db->db_blkid >> epbs, FTAG);
 			ASSERT(parent != NULL);
 			parent_held = TRUE;
 		}
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 		ASSERT3U(db->db_level+1, ==, parent->db_level);
 		di = dbuf_dirty(parent, tx);
 		if (parent_held)
 			dbuf_rele(parent, FTAG);
 
 		mutex_enter(&db->db_mtx);
 		/*
 		 * Since we've dropped the mutex, it's possible that
 		 * dbuf_undirty() might have changed this out from under us.
 		 */
 		if (db->db_last_dirty == dr ||
 		    dn->dn_object == DMU_META_DNODE_OBJECT) {
 			mutex_enter(&di->dt.di.dr_mtx);
 			ASSERT3U(di->dr_txg, ==, tx->tx_txg);
 			ASSERT(!list_link_active(&dr->dr_dirty_node));
 			list_insert_tail(&di->dt.di.dr_children, dr);
 			mutex_exit(&di->dt.di.dr_mtx);
 			dr->dr_parent = di;
 		}
 		mutex_exit(&db->db_mtx);
 	} else {
 		ASSERT(db->db_level+1 == dn->dn_nlevels);
 		ASSERT(db->db_blkid < dn->dn_nblkptr);
 		ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		ASSERT(!list_link_active(&dr->dr_dirty_node));
 		list_insert_tail(&dn->dn_dirty_records[txgoff], dr);
 		mutex_exit(&dn->dn_mtx);
 		if (drop_struct_lock)
 			rw_exit(&dn->dn_struct_rwlock);
 	}
 
 	dnode_setdirty(dn, tx);
 	DB_DNODE_EXIT(db);
 	return (dr);
 }
 
 /*
  * Undirty a buffer in the transaction group referenced by the given
  * transaction.  Return whether this evicted the dbuf.
  */
 static boolean_t
 dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	uint64_t txg = tx->tx_txg;
 	dbuf_dirty_record_t *dr, **drp;
 
 	ASSERT(txg != 0);
 
 	/*
 	 * Due to our use of dn_nlevels below, this can only be called
 	 * in open context, unless we are operating on the MOS.
 	 * From syncing context, dn_nlevels may be different from the
 	 * dn_nlevels used when dbuf was dirtied.
 	 */
 	ASSERT(db->db_objset ==
 	    dmu_objset_pool(db->db_objset)->dp_meta_objset ||
 	    txg != spa_syncing_txg(dmu_objset_spa(db->db_objset)));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT0(db->db_level);
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	/*
 	 * If this buffer is not dirty, we're done.
 	 */
 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
 		if (dr->dr_txg <= txg)
 			break;
 	if (dr == NULL || dr->dr_txg < txg)
 		return (B_FALSE);
 	ASSERT(dr->dr_txg == txg);
 	ASSERT(dr->dr_dbuf == db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
 
 	ASSERT(db->db.db_size != 0);
 
 	dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset),
 	    dr->dr_accounted, txg);
 
 	*drp = dr->dr_next;
 
 	/*
 	 * Note that there are three places in dbuf_dirty()
 	 * where this dirty record may be put on a list.
 	 * Make sure to do a list_remove corresponding to
 	 * every one of those list_insert calls.
 	 */
 	if (dr->dr_parent) {
 		mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
 		list_remove(&dr->dr_parent->dt.di.dr_children, dr);
 		mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
 	} else if (db->db_blkid == DMU_SPILL_BLKID ||
 	    db->db_level + 1 == dn->dn_nlevels) {
 		ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
 		mutex_enter(&dn->dn_mtx);
 		list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
 		mutex_exit(&dn->dn_mtx);
 	}
 	DB_DNODE_EXIT(db);
 
 	if (db->db_state != DB_NOFILL) {
 		dbuf_unoverride(dr);
 
 		ASSERT(db->db_buf != NULL);
 		ASSERT(dr->dt.dl.dr_data != NULL);
 		if (dr->dt.dl.dr_data != db->db_buf)
 			arc_buf_destroy(dr->dt.dl.dr_data, db);
 	}
 
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 
 	if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) {
 		ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf));
 		dbuf_destroy(db);
 		return (B_TRUE);
 	}
 
 	return (B_FALSE);
 }
 
 void
 dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
 
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	/*
 	 * Quick check for dirtyness.  For already dirty blocks, this
 	 * reduces runtime of this function by >90%, and overall performance
 	 * by 50% for some workloads (e.g. file deletion with indirect blocks
 	 * cached).
 	 */
 	mutex_enter(&db->db_mtx);
 	dbuf_dirty_record_t *dr;
 	for (dr = db->db_last_dirty;
 	    dr != NULL && dr->dr_txg >= tx->tx_txg; dr = dr->dr_next) {
 		/*
 		 * It's possible that it is already dirty but not cached,
 		 * because there are some calls to dbuf_dirty() that don't
 		 * go through dmu_buf_will_dirty().
 		 */
 		if (dr->dr_txg == tx->tx_txg && db->db_state == DB_CACHED) {
 			/* This dbuf is already dirty and cached. */
 			dbuf_redirty(dr);
 			mutex_exit(&db->db_mtx);
 			return;
 		}
 	}
 	mutex_exit(&db->db_mtx);
 
 	DB_DNODE_ENTER(db);
 	if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock))
 		rf |= DB_RF_HAVESTRUCT;
 	DB_DNODE_EXIT(db);
 	(void) dbuf_read(db, NULL, rf);
 	(void) dbuf_dirty(db, tx);
 }
 
 void
 dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	db->db_state = DB_NOFILL;
 
 	dmu_buf_will_fill(db_fake, tx);
 }
 
 void
 dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(tx->tx_txg != 0);
 	ASSERT(db->db_level == 0);
 	ASSERT(!refcount_is_zero(&db->db_holds));
 
 	ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT ||
 	    dmu_tx_private_ok(tx));
 
 	dbuf_noread(db);
 	(void) dbuf_dirty(db, tx);
 }
 
 #pragma weak dmu_buf_fill_done = dbuf_fill_done
 /* ARGSUSED */
 void
 dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	mutex_enter(&db->db_mtx);
 	DBUF_VERIFY(db);
 
 	if (db->db_state == DB_FILL) {
 		if (db->db_level == 0 && db->db_freed_in_flight) {
 			ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 			/* we were freed while filling */
 			/* XXX dbuf_undirty? */
 			bzero(db->db.db_data, db->db.db_size);
 			db->db_freed_in_flight = FALSE;
 		}
 		db->db_state = DB_CACHED;
 		cv_broadcast(&db->db_changed);
 	}
 	mutex_exit(&db->db_mtx);
 }
 
 void
 dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf;
 	struct dirty_leaf *dl;
 	dmu_object_type_t type;
 
 	if (etype == BP_EMBEDDED_TYPE_DATA) {
 		ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset),
 		    SPA_FEATURE_EMBEDDED_DATA));
 	}
 
 	DB_DNODE_ENTER(db);
 	type = DB_DNODE(db)->dn_type;
 	DB_DNODE_EXIT(db);
 
 	ASSERT0(db->db_level);
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 
 	dmu_buf_will_not_fill(dbuf, tx);
 
 	ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg);
 	dl = &db->db_last_dirty->dt.dl;
 	encode_embedded_bp_compressed(&dl->dr_overridden_by,
 	    data, comp, uncompressed_size, compressed_size);
 	BPE_SET_ETYPE(&dl->dr_overridden_by, etype);
 	BP_SET_TYPE(&dl->dr_overridden_by, type);
 	BP_SET_LEVEL(&dl->dr_overridden_by, 0);
 	BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder);
 
 	dl->dr_override_state = DR_OVERRIDDEN;
 	dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg;
 }
 
 /*
  * Directly assign a provided arc buf to a given dbuf if it's not referenced
  * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf.
  */
 void
 dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx)
 {
 	ASSERT(!refcount_is_zero(&db->db_holds));
 	ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 	ASSERT(db->db_level == 0);
 	ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf));
 	ASSERT(buf != NULL);
 	ASSERT(arc_buf_lsize(buf) == db->db.db_size);
 	ASSERT(tx->tx_txg != 0);
 
 	arc_return_buf(buf, db);
 	ASSERT(arc_released(buf));
 
 	mutex_enter(&db->db_mtx);
 
 	while (db->db_state == DB_READ || db->db_state == DB_FILL)
 		cv_wait(&db->db_changed, &db->db_mtx);
 
 	ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED);
 
 	if (db->db_state == DB_CACHED &&
 	    refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) {
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_dirty(db, tx);
 		bcopy(buf->b_data, db->db.db_data, db->db.db_size);
 		arc_buf_destroy(buf, db);
 		xuio_stat_wbuf_copied();
 		return;
 	}
 
 	xuio_stat_wbuf_nocopy();
 	if (db->db_state == DB_CACHED) {
 		dbuf_dirty_record_t *dr = db->db_last_dirty;
 
 		ASSERT(db->db_buf != NULL);
 		if (dr != NULL && dr->dr_txg == tx->tx_txg) {
 			ASSERT(dr->dt.dl.dr_data == db->db_buf);
 			if (!arc_released(db->db_buf)) {
 				ASSERT(dr->dt.dl.dr_override_state ==
 				    DR_OVERRIDDEN);
 				arc_release(db->db_buf, db);
 			}
 			dr->dt.dl.dr_data = buf;
 			arc_buf_destroy(db->db_buf, db);
 		} else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) {
 			arc_release(db->db_buf, db);
 			arc_buf_destroy(db->db_buf, db);
 		}
 		db->db_buf = NULL;
 	}
 	ASSERT(db->db_buf == NULL);
 	dbuf_set_data(db, buf);
 	db->db_state = DB_FILL;
 	mutex_exit(&db->db_mtx);
 	(void) dbuf_dirty(db, tx);
 	dmu_buf_fill_done(&db->db, tx);
 }
 
 void
 dbuf_destroy(dmu_buf_impl_t *db)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *parent = db->db_parent;
 	dmu_buf_impl_t *dndb;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	ASSERT(refcount_is_zero(&db->db_holds));
 
 	if (db->db_buf != NULL) {
 		arc_buf_destroy(db->db_buf, db);
 		db->db_buf = NULL;
 	}
 
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		ASSERT(db->db.db_data != NULL);
 		zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
 		arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		db->db_state = DB_UNCACHED;
 	}
 
 	dbuf_clear_data(db);
 
 	if (multilist_link_active(&db->db_cache_link)) {
-		multilist_remove(dbuf_cache, db);
-		(void) refcount_remove_many(&dbuf_cache_size,
+		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+		multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+		(void) refcount_remove_many(
+		    &dbuf_caches[db->db_caching_status].size,
 		    db->db.db_size, db);
+
+		db->db_caching_status = DB_NO_CACHE;
 	}
 
 	ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL);
 	ASSERT(db->db_data_pending == NULL);
 
 	db->db_state = DB_EVICTING;
 	db->db_blkptr = NULL;
 
 	/*
 	 * Now that db_state is DB_EVICTING, nobody else can find this via
 	 * the hash table.  We can now drop db_mtx, which allows us to
 	 * acquire the dn_dbufs_mtx.
 	 */
 	mutex_exit(&db->db_mtx);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dndb = dn->dn_dbuf;
 	if (db->db_blkid != DMU_BONUS_BLKID) {
 		boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx);
 		if (needlock)
 			mutex_enter(&dn->dn_dbufs_mtx);
 		avl_remove(&dn->dn_dbufs, db);
 		atomic_dec_32(&dn->dn_dbufs_count);
 		membar_producer();
 		DB_DNODE_EXIT(db);
 		if (needlock)
 			mutex_exit(&dn->dn_dbufs_mtx);
 		/*
 		 * Decrementing the dbuf count means that the hold corresponding
 		 * to the removed dbuf is no longer discounted in dnode_move(),
 		 * so the dnode cannot be moved until after we release the hold.
 		 * The membar_producer() ensures visibility of the decremented
 		 * value in dnode_move(), since DB_DNODE_EXIT doesn't actually
 		 * release any lock.
 		 */
 		dnode_rele(dn, db);
 		db->db_dnode_handle = NULL;
 
 		dbuf_hash_remove(db);
 	} else {
 		DB_DNODE_EXIT(db);
 	}
 
 	ASSERT(refcount_is_zero(&db->db_holds));
 
 	db->db_parent = NULL;
 
 	ASSERT(db->db_buf == NULL);
 	ASSERT(db->db.db_data == NULL);
 	ASSERT(db->db_hash_next == NULL);
 	ASSERT(db->db_blkptr == NULL);
 	ASSERT(db->db_data_pending == NULL);
+	ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE);
 	ASSERT(!multilist_link_active(&db->db_cache_link));
 
 	kmem_cache_free(dbuf_kmem_cache, db);
 	arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 
 	/*
 	 * If this dbuf is referenced from an indirect dbuf,
 	 * decrement the ref count on the indirect dbuf.
 	 */
 	if (parent && parent != dndb)
 		dbuf_rele(parent, db);
 }
 
 /*
  * Note: While bpp will always be updated if the function returns success,
  * parentp will not be updated if the dnode does not have dn_dbuf filled in;
  * this happens when the dnode is the meta-dnode, or a userused or groupused
  * object.
  */
 static int
 dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse,
     dmu_buf_impl_t **parentp, blkptr_t **bpp)
 {
 	*parentp = NULL;
 	*bpp = NULL;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 
 	if (blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		if (dn->dn_have_spill &&
 		    (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR))
 			*bpp = &dn->dn_phys->dn_spill;
 		else
 			*bpp = NULL;
 		dbuf_add_ref(dn->dn_dbuf, NULL);
 		*parentp = dn->dn_dbuf;
 		mutex_exit(&dn->dn_mtx);
 		return (0);
 	}
 
 	int nlevels =
 	    (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels;
 	int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 	ASSERT3U(level * epbs, <, 64);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	/*
 	 * This assertion shouldn't trip as long as the max indirect block size
 	 * is less than 1M.  The reason for this is that up to that point,
 	 * the number of levels required to address an entire object with blocks
 	 * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64.  In
 	 * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55
 	 * (i.e. we can address the entire object), objects will all use at most
 	 * N-1 levels and the assertion won't overflow.  However, once epbs is
 	 * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66.  Then, 4 levels will not be
 	 * enough to address an entire object, so objects will have 5 levels,
 	 * but then this assertion will overflow.
 	 *
 	 * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we
 	 * need to redo this logic to handle overflows.
 	 */
 	ASSERT(level >= nlevels ||
 	    ((nlevels - level - 1) * epbs) +
 	    highbit64(dn->dn_phys->dn_nblkptr) <= 64);
 	if (level >= nlevels ||
 	    blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr <<
 	    ((nlevels - level - 1) * epbs)) ||
 	    (fail_sparse &&
 	    blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) {
 		/* the buffer has no parent yet */
 		return (SET_ERROR(ENOENT));
 	} else if (level < nlevels-1) {
 		/* this block is referenced from an indirect block */
 		int err = dbuf_hold_impl(dn, level+1,
 		    blkid >> epbs, fail_sparse, FALSE, NULL, parentp);
 		if (err)
 			return (err);
 		err = dbuf_read(*parentp, NULL,
 		    (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL));
 		if (err) {
 			dbuf_rele(*parentp, NULL);
 			*parentp = NULL;
 			return (err);
 		}
 		*bpp = ((blkptr_t *)(*parentp)->db.db_data) +
 		    (blkid & ((1ULL << epbs) - 1));
 		if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))
 			ASSERT(BP_IS_HOLE(*bpp));
 		return (0);
 	} else {
 		/* the block is referenced from the dnode */
 		ASSERT3U(level, ==, nlevels-1);
 		ASSERT(dn->dn_phys->dn_nblkptr == 0 ||
 		    blkid < dn->dn_phys->dn_nblkptr);
 		if (dn->dn_dbuf) {
 			dbuf_add_ref(dn->dn_dbuf, NULL);
 			*parentp = dn->dn_dbuf;
 		}
 		*bpp = &dn->dn_phys->dn_blkptr[blkid];
 		return (0);
 	}
 }
 
 static dmu_buf_impl_t *
 dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
     dmu_buf_impl_t *parent, blkptr_t *blkptr)
 {
 	objset_t *os = dn->dn_objset;
 	dmu_buf_impl_t *db, *odb;
 
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT(dn->dn_type != DMU_OT_NONE);
 
 	db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP);
 
 	db->db_objset = os;
 	db->db.db_object = dn->dn_object;
 	db->db_level = level;
 	db->db_blkid = blkid;
 	db->db_last_dirty = NULL;
 	db->db_dirtycnt = 0;
 	db->db_dnode_handle = dn->dn_handle;
 	db->db_parent = parent;
 	db->db_blkptr = blkptr;
 
 	db->db_user = NULL;
 	db->db_user_immediate_evict = FALSE;
 	db->db_freed_in_flight = FALSE;
 	db->db_pending_evict = FALSE;
 
 	if (blkid == DMU_BONUS_BLKID) {
 		ASSERT3P(parent, ==, dn->dn_dbuf);
 		db->db.db_size = DN_MAX_BONUSLEN -
 		    (dn->dn_nblkptr-1) * sizeof (blkptr_t);
 		ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
 		db->db.db_offset = DMU_BONUS_BLKID;
 		db->db_state = DB_UNCACHED;
+		db->db_caching_status = DB_NO_CACHE;
 		/* the bonus dbuf is not placed in the hash table */
 		arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 		return (db);
 	} else if (blkid == DMU_SPILL_BLKID) {
 		db->db.db_size = (blkptr != NULL) ?
 		    BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE;
 		db->db.db_offset = 0;
 	} else {
 		int blocksize =
 		    db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz;
 		db->db.db_size = blocksize;
 		db->db.db_offset = db->db_blkid * blocksize;
 	}
 
 	/*
 	 * Hold the dn_dbufs_mtx while we get the new dbuf
 	 * in the hash table *and* added to the dbufs list.
 	 * This prevents a possible deadlock with someone
 	 * trying to look up this dbuf before its added to the
 	 * dn_dbufs list.
 	 */
 	mutex_enter(&dn->dn_dbufs_mtx);
 	db->db_state = DB_EVICTING;
 	if ((odb = dbuf_hash_insert(db)) != NULL) {
 		/* someone else inserted it first */
 		kmem_cache_free(dbuf_kmem_cache, db);
 		mutex_exit(&dn->dn_dbufs_mtx);
 		return (odb);
 	}
 	avl_add(&dn->dn_dbufs, db);
 
 	db->db_state = DB_UNCACHED;
+	db->db_caching_status = DB_NO_CACHE;
 	mutex_exit(&dn->dn_dbufs_mtx);
 	arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER);
 
 	if (parent && parent != dn->dn_dbuf)
 		dbuf_add_ref(parent, db);
 
 	ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT ||
 	    refcount_count(&dn->dn_holds) > 0);
 	(void) refcount_add(&dn->dn_holds, db);
 	atomic_inc_32(&dn->dn_dbufs_count);
 
 	dprintf_dbuf(db, "db=%p\n", db);
 
 	return (db);
 }
 
 typedef struct dbuf_prefetch_arg {
 	spa_t *dpa_spa;	/* The spa to issue the prefetch in. */
 	zbookmark_phys_t dpa_zb; /* The target block to prefetch. */
 	int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */
 	int dpa_curlevel; /* The current level that we're reading */
 	dnode_t *dpa_dnode; /* The dnode associated with the prefetch */
 	zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */
 	zio_t *dpa_zio; /* The parent zio_t for all prefetches. */
 	arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */
 } dbuf_prefetch_arg_t;
 
 /*
  * Actually issue the prefetch read for the block given.
  */
 static void
 dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp)
 {
 	if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp))
 		return;
 
 	arc_flags_t aflags =
 	    dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH;
 
 	ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
 	ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level);
 	ASSERT(dpa->dpa_zio != NULL);
 	(void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL,
 	    dpa->dpa_prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 	    &aflags, &dpa->dpa_zb);
 }
 
 /*
  * Called when an indirect block above our prefetch target is read in.  This
  * will either read in the next indirect block down the tree or issue the actual
  * prefetch if the next block down is our target.
  */
 static void
 dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private)
 {
 	dbuf_prefetch_arg_t *dpa = private;
 
 	ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel);
 	ASSERT3S(dpa->dpa_curlevel, >, 0);
 
 	/*
 	 * The dpa_dnode is only valid if we are called with a NULL
 	 * zio. This indicates that the arc_read() returned without
 	 * first calling zio_read() to issue a physical read. Once
 	 * a physical read is made the dpa_dnode must be invalidated
 	 * as the locks guarding it may have been dropped. If the
 	 * dpa_dnode is still valid, then we want to add it to the dbuf
 	 * cache. To do so, we must hold the dbuf associated with the block
 	 * we just prefetched, read its contents so that we associate it
 	 * with an arc_buf_t, and then release it.
 	 */
 	if (zio != NULL) {
 		ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel);
 		if (zio->io_flags & ZIO_FLAG_RAW) {
 			ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size);
 		} else {
 			ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size);
 		}
 		ASSERT3P(zio->io_spa, ==, dpa->dpa_spa);
 
 		dpa->dpa_dnode = NULL;
 	} else if (dpa->dpa_dnode != NULL) {
 		uint64_t curblkid = dpa->dpa_zb.zb_blkid >>
 		    (dpa->dpa_epbs * (dpa->dpa_curlevel -
 		    dpa->dpa_zb.zb_level));
 		dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode,
 		    dpa->dpa_curlevel, curblkid, FTAG);
 		(void) dbuf_read(db, NULL,
 		    DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT);
 		dbuf_rele(db, FTAG);
 	}
 
 	dpa->dpa_curlevel--;
 
 	uint64_t nextblkid = dpa->dpa_zb.zb_blkid >>
 	    (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level));
 	blkptr_t *bp = ((blkptr_t *)abuf->b_data) +
 	    P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs);
 	if (BP_IS_HOLE(bp) || (zio != NULL && zio->io_error != 0)) {
 		kmem_free(dpa, sizeof (*dpa));
 	} else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) {
 		ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid);
 		dbuf_issue_final_prefetch(dpa, bp);
 		kmem_free(dpa, sizeof (*dpa));
 	} else {
 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
 		zbookmark_phys_t zb;
 
 		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
 		if (dpa->dpa_aflags & ARC_FLAG_L2CACHE)
 			iter_aflags |= ARC_FLAG_L2CACHE;
 
 		ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp));
 
 		SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset,
 		    dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid);
 
 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
 		    bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 		    &iter_aflags, &zb);
 	}
 
 	arc_buf_destroy(abuf, private);
 }
 
 /*
  * Issue prefetch reads for the given block on the given level.  If the indirect
  * blocks above that block are not in memory, we will read them in
  * asynchronously.  As a result, this call never blocks waiting for a read to
  * complete.
  */
 void
 dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio,
     arc_flags_t aflags)
 {
 	blkptr_t bp;
 	int epbs, nlevels, curlevel;
 	uint64_t curblkid;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 
 	if (blkid > dn->dn_maxblkid)
 		return;
 
 	if (dnode_block_freed(dn, blkid))
 		return;
 
 	/*
 	 * This dnode hasn't been written to disk yet, so there's nothing to
 	 * prefetch.
 	 */
 	nlevels = dn->dn_phys->dn_nlevels;
 	if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0)
 		return;
 
 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level))
 		return;
 
 	dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object,
 	    level, blkid);
 	if (db != NULL) {
 		mutex_exit(&db->db_mtx);
 		/*
 		 * This dbuf already exists.  It is either CACHED, or
 		 * (we assume) about to be read or filled.
 		 */
 		return;
 	}
 
 	/*
 	 * Find the closest ancestor (indirect block) of the target block
 	 * that is present in the cache.  In this indirect block, we will
 	 * find the bp that is at curlevel, curblkid.
 	 */
 	curlevel = level;
 	curblkid = blkid;
 	while (curlevel < nlevels - 1) {
 		int parent_level = curlevel + 1;
 		uint64_t parent_blkid = curblkid >> epbs;
 		dmu_buf_impl_t *db;
 
 		if (dbuf_hold_impl(dn, parent_level, parent_blkid,
 		    FALSE, TRUE, FTAG, &db) == 0) {
 			blkptr_t *bpp = db->db_buf->b_data;
 			bp = bpp[P2PHASE(curblkid, 1 << epbs)];
 			dbuf_rele(db, FTAG);
 			break;
 		}
 
 		curlevel = parent_level;
 		curblkid = parent_blkid;
 	}
 
 	if (curlevel == nlevels - 1) {
 		/* No cached indirect blocks found. */
 		ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr);
 		bp = dn->dn_phys->dn_blkptr[curblkid];
 	}
 	if (BP_IS_HOLE(&bp))
 		return;
 
 	ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp));
 
 	zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL,
 	    ZIO_FLAG_CANFAIL);
 
 	dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP);
 	dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset;
 	SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
 	    dn->dn_object, level, blkid);
 	dpa->dpa_curlevel = curlevel;
 	dpa->dpa_prio = prio;
 	dpa->dpa_aflags = aflags;
 	dpa->dpa_spa = dn->dn_objset->os_spa;
 	dpa->dpa_dnode = dn;
 	dpa->dpa_epbs = epbs;
 	dpa->dpa_zio = pio;
 
 	/* flag if L2ARC eligible, l2arc_noprefetch then decides */
 	if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
 		dpa->dpa_aflags |= ARC_FLAG_L2CACHE;
 
 	/*
 	 * If we have the indirect just above us, no need to do the asynchronous
 	 * prefetch chain; we'll just run the last step ourselves.  If we're at
 	 * a higher level, though, we want to issue the prefetches for all the
 	 * indirect blocks asynchronously, so we can go on with whatever we were
 	 * doing.
 	 */
 	if (curlevel == level) {
 		ASSERT3U(curblkid, ==, blkid);
 		dbuf_issue_final_prefetch(dpa, &bp);
 		kmem_free(dpa, sizeof (*dpa));
 	} else {
 		arc_flags_t iter_aflags = ARC_FLAG_NOWAIT;
 		zbookmark_phys_t zb;
 
 		/* flag if L2ARC eligible, l2arc_noprefetch then decides */
 		if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level))
 			iter_aflags |= ARC_FLAG_L2CACHE;
 
 		SET_BOOKMARK(&zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET,
 		    dn->dn_object, curlevel, curblkid);
 		(void) arc_read(dpa->dpa_zio, dpa->dpa_spa,
 		    &bp, dbuf_prefetch_indirect_done, dpa, prio,
 		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
 		    &iter_aflags, &zb);
 	}
 	/*
 	 * We use pio here instead of dpa_zio since it's possible that
 	 * dpa may have already been freed.
 	 */
 	zio_nowait(pio);
 }
 
 /*
  * Returns with db_holds incremented, and db_mtx not held.
  * Note: dn_struct_rwlock must be held.
  */
 int
 dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid,
     boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp)
 {
 	dmu_buf_impl_t *db, *parent = NULL;
 
 	ASSERT(blkid != DMU_BONUS_BLKID);
 	ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
 	ASSERT3U(dn->dn_nlevels, >, level);
 
 	*dbp = NULL;
 top:
 	/* dbuf_find() returns with db_mtx held */
 	db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid);
 
 	if (db == NULL) {
 		blkptr_t *bp = NULL;
 		int err;
 
 		if (fail_uncached)
 			return (SET_ERROR(ENOENT));
 
 		ASSERT3P(parent, ==, NULL);
 		err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp);
 		if (fail_sparse) {
 			if (err == 0 && bp && BP_IS_HOLE(bp))
 				err = SET_ERROR(ENOENT);
 			if (err) {
 				if (parent)
 					dbuf_rele(parent, NULL);
 				return (err);
 			}
 		}
 		if (err && err != ENOENT)
 			return (err);
 		db = dbuf_create(dn, level, blkid, parent, bp);
 	}
 
 	if (fail_uncached && db->db_state != DB_CACHED) {
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (db->db_buf != NULL)
 		ASSERT3P(db->db.db_data, ==, db->db_buf->b_data);
 
 	ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf));
 
 	/*
 	 * If this buffer is currently syncing out, and we are are
 	 * still referencing it from db_data, we need to make a copy
 	 * of it in case we decide we want to dirty it again in this txg.
 	 */
 	if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    db->db_state == DB_CACHED && db->db_data_pending) {
 		dbuf_dirty_record_t *dr = db->db_data_pending;
 
 		if (dr->dt.dl.dr_data == db->db_buf) {
 			arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 
 			dbuf_set_data(db,
 			    arc_alloc_buf(dn->dn_objset->os_spa, db, type,
 			    db->db.db_size));
 			bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data,
 			    db->db.db_size);
 		}
 	}
 
 	if (multilist_link_active(&db->db_cache_link)) {
 		ASSERT(refcount_is_zero(&db->db_holds));
-		multilist_remove(dbuf_cache, db);
-		(void) refcount_remove_many(&dbuf_cache_size,
+		ASSERT(db->db_caching_status == DB_DBUF_CACHE ||
+		    db->db_caching_status == DB_DBUF_METADATA_CACHE);
+
+		multilist_remove(dbuf_caches[db->db_caching_status].cache, db);
+		(void) refcount_remove_many(
+		    &dbuf_caches[db->db_caching_status].size,
 		    db->db.db_size, db);
+
+		db->db_caching_status = DB_NO_CACHE;
 	}
 	(void) refcount_add(&db->db_holds, tag);
 	DBUF_VERIFY(db);
 	mutex_exit(&db->db_mtx);
 
 	/* NOTE: we can't rele the parent until after we drop the db_mtx */
 	if (parent)
 		dbuf_rele(parent, NULL);
 
 	ASSERT3P(DB_DNODE(db), ==, dn);
 	ASSERT3U(db->db_blkid, ==, blkid);
 	ASSERT3U(db->db_level, ==, level);
 	*dbp = db;
 
 	return (0);
 }
 
 dmu_buf_impl_t *
 dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag)
 {
 	return (dbuf_hold_level(dn, 0, blkid, tag));
 }
 
 dmu_buf_impl_t *
 dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
 {
 	dmu_buf_impl_t *db;
 	int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db);
 	return (err ? NULL : db);
 }
 
 void
 dbuf_create_bonus(dnode_t *dn)
 {
 	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
 
 	ASSERT(dn->dn_bonus == NULL);
 	dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL);
 }
 
 int
 dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 
 	if (db->db_blkid != DMU_SPILL_BLKID)
 		return (SET_ERROR(ENOTSUP));
 	if (blksz == 0)
 		blksz = SPA_MINBLOCKSIZE;
 	ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset)));
 	blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dbuf_new_size(db, blksz, tx);
 	rw_exit(&dn->dn_struct_rwlock);
 	DB_DNODE_EXIT(db);
 
 	return (0);
 }
 
 void
 dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx)
 {
 	dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx);
 }
 
 #pragma weak dmu_buf_add_ref = dbuf_add_ref
 void
 dbuf_add_ref(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds = refcount_add(&db->db_holds, tag);
 	ASSERT3S(holds, >, 1);
 }
 
 #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref
 boolean_t
 dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid,
     void *tag)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dmu_buf_impl_t *found_db;
 	boolean_t result = B_FALSE;
 
 	if (db->db_blkid == DMU_BONUS_BLKID)
 		found_db = dbuf_find_bonus(os, obj);
 	else
 		found_db = dbuf_find(os, obj, 0, blkid);
 
 	if (found_db != NULL) {
 		if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) {
 			(void) refcount_add(&db->db_holds, tag);
 			result = B_TRUE;
 		}
 		mutex_exit(&db->db_mtx);
 	}
 	return (result);
 }
 
 /*
  * If you call dbuf_rele() you had better not be referencing the dnode handle
  * unless you have some other direct or indirect hold on the dnode. (An indirect
  * hold is a hold on one of the dnode's dbufs, including the bonus buffer.)
  * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the
  * dnode's parent dbuf evicting its dnode handles.
  */
 void
 dbuf_rele(dmu_buf_impl_t *db, void *tag)
 {
 	mutex_enter(&db->db_mtx);
 	dbuf_rele_and_unlock(db, tag);
 }
 
 void
 dmu_buf_rele(dmu_buf_t *db, void *tag)
 {
 	dbuf_rele((dmu_buf_impl_t *)db, tag);
 }
 
 /*
  * dbuf_rele() for an already-locked dbuf.  This is necessary to allow
  * db_dirtycnt and db_holds to be updated atomically.
  */
 void
 dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag)
 {
 	int64_t holds;
 
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 	DBUF_VERIFY(db);
 
 	/*
 	 * Remove the reference to the dbuf before removing its hold on the
 	 * dnode so we can guarantee in dnode_move() that a referenced bonus
 	 * buffer has a corresponding dnode hold.
 	 */
 	holds = refcount_remove(&db->db_holds, tag);
 	ASSERT(holds >= 0);
 
 	/*
 	 * We can't freeze indirects if there is a possibility that they
 	 * may be modified in the current syncing context.
 	 */
 	if (db->db_buf != NULL &&
 	    holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) {
 		arc_buf_freeze(db->db_buf);
 	}
 
 	if (holds == db->db_dirtycnt &&
 	    db->db_level == 0 && db->db_user_immediate_evict)
 		dbuf_evict_user(db);
 
 	if (holds == 0) {
 		if (db->db_blkid == DMU_BONUS_BLKID) {
 			dnode_t *dn;
 			boolean_t evict_dbuf = db->db_pending_evict;
 
 			/*
 			 * If the dnode moves here, we cannot cross this
 			 * barrier until the move completes.
 			 */
 			DB_DNODE_ENTER(db);
 
 			dn = DB_DNODE(db);
 			atomic_dec_32(&dn->dn_dbufs_count);
 
 			/*
 			 * Decrementing the dbuf count means that the bonus
 			 * buffer's dnode hold is no longer discounted in
 			 * dnode_move(). The dnode cannot move until after
 			 * the dnode_rele() below.
 			 */
 			DB_DNODE_EXIT(db);
 
 			/*
 			 * Do not reference db after its lock is dropped.
 			 * Another thread may evict it.
 			 */
 			mutex_exit(&db->db_mtx);
 
 			if (evict_dbuf)
 				dnode_evict_bonus(dn);
 
 			dnode_rele(dn, db);
 		} else if (db->db_buf == NULL) {
 			/*
 			 * This is a special case: we never associated this
 			 * dbuf with any data allocated from the ARC.
 			 */
 			ASSERT(db->db_state == DB_UNCACHED ||
 			    db->db_state == DB_NOFILL);
 			dbuf_destroy(db);
 		} else if (arc_released(db->db_buf)) {
 			/*
 			 * This dbuf has anonymous data associated with it.
 			 */
 			dbuf_destroy(db);
 		} else {
 			boolean_t do_arc_evict = B_FALSE;
 			blkptr_t bp;
 			spa_t *spa = dmu_objset_spa(db->db_objset);
 
 			if (!DBUF_IS_CACHEABLE(db) &&
 			    db->db_blkptr != NULL &&
 			    !BP_IS_HOLE(db->db_blkptr) &&
 			    !BP_IS_EMBEDDED(db->db_blkptr)) {
 				do_arc_evict = B_TRUE;
 				bp = *db->db_blkptr;
 			}
 
 			if (!DBUF_IS_CACHEABLE(db) ||
 			    db->db_pending_evict) {
 				dbuf_destroy(db);
 			} else if (!multilist_link_active(&db->db_cache_link)) {
-				multilist_insert(dbuf_cache, db);
-				(void) refcount_add_many(&dbuf_cache_size,
+				ASSERT3U(db->db_caching_status, ==,
+				    DB_NO_CACHE);
+
+				dbuf_cached_state_t dcs =
+				    dbuf_include_in_metadata_cache(db) ?
+				    DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE;
+				db->db_caching_status = dcs;
+
+				multilist_insert(dbuf_caches[dcs].cache, db);
+				(void) refcount_add_many(&dbuf_caches[dcs].size,
 				    db->db.db_size, db);
 				mutex_exit(&db->db_mtx);
 
-				dbuf_evict_notify();
+				if (db->db_caching_status == DB_DBUF_CACHE) {
+					dbuf_evict_notify();
+				}
 			}
 
 			if (do_arc_evict)
 				arc_freed(spa, &bp);
 		}
 	} else {
 		mutex_exit(&db->db_mtx);
 	}
 
 }
 
 #pragma weak dmu_buf_refcount = dbuf_refcount
 uint64_t
 dbuf_refcount(dmu_buf_impl_t *db)
 {
 	return (refcount_count(&db->db_holds));
 }
 
 void *
 dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user,
     dmu_buf_user_t *new_user)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	mutex_enter(&db->db_mtx);
 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
 	if (db->db_user == old_user)
 		db->db_user = new_user;
 	else
 		old_user = db->db_user;
 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
 	mutex_exit(&db->db_mtx);
 
 	return (old_user);
 }
 
 void *
 dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	return (dmu_buf_replace_user(db_fake, NULL, user));
 }
 
 void *
 dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	db->db_user_immediate_evict = TRUE;
 	return (dmu_buf_set_user(db_fake, user));
 }
 
 void *
 dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user)
 {
 	return (dmu_buf_replace_user(db_fake, user, NULL));
 }
 
 void *
 dmu_buf_get_user(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	dbuf_verify_user(db, DBVU_NOT_EVICTING);
 	return (db->db_user);
 }
 
 void
 dmu_buf_user_evict_wait()
 {
 	taskq_wait(dbu_evict_taskq);
 }
 
 blkptr_t *
 dmu_buf_get_blkptr(dmu_buf_t *db)
 {
 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 	return (dbi->db_blkptr);
 }
 
 objset_t *
 dmu_buf_get_objset(dmu_buf_t *db)
 {
 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 	return (dbi->db_objset);
 }
 
 dnode_t *
 dmu_buf_dnode_enter(dmu_buf_t *db)
 {
 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 	DB_DNODE_ENTER(dbi);
 	return (DB_DNODE(dbi));
 }
 
 void
 dmu_buf_dnode_exit(dmu_buf_t *db)
 {
 	dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 	DB_DNODE_EXIT(dbi);
 }
 
 static void
 dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db)
 {
 	/* ASSERT(dmu_tx_is_syncing(tx) */
 	ASSERT(MUTEX_HELD(&db->db_mtx));
 
 	if (db->db_blkptr != NULL)
 		return;
 
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		db->db_blkptr = &dn->dn_phys->dn_spill;
 		BP_ZERO(db->db_blkptr);
 		return;
 	}
 	if (db->db_level == dn->dn_phys->dn_nlevels-1) {
 		/*
 		 * This buffer was allocated at a time when there was
 		 * no available blkptrs from the dnode, or it was
 		 * inappropriate to hook it in (i.e., nlevels mis-match).
 		 */
 		ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr);
 		ASSERT(db->db_parent == NULL);
 		db->db_parent = dn->dn_dbuf;
 		db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid];
 		DBUF_VERIFY(db);
 	} else {
 		dmu_buf_impl_t *parent = db->db_parent;
 		int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 
 		ASSERT(dn->dn_phys->dn_nlevels > 1);
 		if (parent == NULL) {
 			mutex_exit(&db->db_mtx);
 			rw_enter(&dn->dn_struct_rwlock, RW_READER);
 			parent = dbuf_hold_level(dn, db->db_level + 1,
 			    db->db_blkid >> epbs, db);
 			rw_exit(&dn->dn_struct_rwlock);
 			mutex_enter(&db->db_mtx);
 			db->db_parent = parent;
 		}
 		db->db_blkptr = (blkptr_t *)parent->db.db_data +
 		    (db->db_blkid & ((1ULL << epbs) - 1));
 		DBUF_VERIFY(db);
 	}
 }
 
 static void
 dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	zio_t *zio;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 
 	ASSERT(db->db_level > 0);
 	DBUF_VERIFY(db);
 
 	/* Read the block if it hasn't been read yet. */
 	if (db->db_buf == NULL) {
 		mutex_exit(&db->db_mtx);
 		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
 		mutex_enter(&db->db_mtx);
 	}
 	ASSERT3U(db->db_state, ==, DB_CACHED);
 	ASSERT(db->db_buf != NULL);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	/* Indirect block size must match what the dnode thinks it is. */
 	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 	dbuf_check_blkptr(dn, db);
 	DB_DNODE_EXIT(db);
 
 	/* Provide the pending dirty record to child dbufs */
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 
 	dbuf_write(dr, db->db_buf, tx);
 
 	zio = dr->dr_zio;
 	mutex_enter(&dr->dt.di.dr_mtx);
 	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
 	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 	mutex_exit(&dr->dt.di.dr_mtx);
 	zio_nowait(zio);
 }
 
 static void
 dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
 {
 	arc_buf_t **datap = &dr->dt.dl.dr_data;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	objset_t *os;
 	uint64_t txg = tx->tx_txg;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
 
 	mutex_enter(&db->db_mtx);
 	/*
 	 * To be synced, we must be dirtied.  But we
 	 * might have been freed after the dirty.
 	 */
 	if (db->db_state == DB_UNCACHED) {
 		/* This buffer has been freed since it was dirtied */
 		ASSERT(db->db.db_data == NULL);
 	} else if (db->db_state == DB_FILL) {
 		/* This buffer was freed and is now being re-filled */
 		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
 	} else {
 		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
 	}
 	DBUF_VERIFY(db);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
 		mutex_exit(&dn->dn_mtx);
 	}
 
 	/*
 	 * If this is a bonus buffer, simply copy the bonus data into the
 	 * dnode.  It will be written out when the dnode is synced (and it
 	 * will be synced, since it must have been dirty for dbuf_sync to
 	 * be called).
 	 */
 	if (db->db_blkid == DMU_BONUS_BLKID) {
 		dbuf_dirty_record_t **drp;
 
 		ASSERT(*datap != NULL);
 		ASSERT0(db->db_level);
 		ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
 		bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
 		DB_DNODE_EXIT(db);
 
 		if (*datap != db->db.db_data) {
 			zio_buf_free(*datap, DN_MAX_BONUSLEN);
 			arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER);
 		}
 		db->db_data_pending = NULL;
 		drp = &db->db_last_dirty;
 		while (*drp != dr)
 			drp = &(*drp)->dr_next;
 		ASSERT(dr->dr_next == NULL);
 		ASSERT(dr->dr_dbuf == db);
 		*drp = dr->dr_next;
 		kmem_free(dr, sizeof (dbuf_dirty_record_t));
 		ASSERT(db->db_dirtycnt > 0);
 		db->db_dirtycnt -= 1;
 		dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg);
 		return;
 	}
 
 	os = dn->dn_objset;
 
 	/*
 	 * This function may have dropped the db_mtx lock allowing a dmu_sync
 	 * operation to sneak in. As a result, we need to ensure that we
 	 * don't check the dr_override_state until we have returned from
 	 * dbuf_check_blkptr.
 	 */
 	dbuf_check_blkptr(dn, db);
 
 	/*
 	 * If this buffer is in the middle of an immediate write,
 	 * wait for the synchronous IO to complete.
 	 */
 	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		cv_wait(&db->db_changed, &db->db_mtx);
 		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
 	}
 
 	if (db->db_state != DB_NOFILL &&
 	    dn->dn_object != DMU_META_DNODE_OBJECT &&
 	    refcount_count(&db->db_holds) > 1 &&
 	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
 	    *datap == db->db_buf) {
 		/*
 		 * If this buffer is currently "in use" (i.e., there
 		 * are active holds and db_data still references it),
 		 * then make a copy before we start the write so that
 		 * any modifications from the open txg will not leak
 		 * into this write.
 		 *
 		 * NOTE: this copy does not need to be made for
 		 * objects only modified in the syncing context (e.g.
 		 * DNONE_DNODE blocks).
 		 */
 		int psize = arc_buf_size(*datap);
 		arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
 		enum zio_compress compress_type = arc_get_compression(*datap);
 
 		if (compress_type == ZIO_COMPRESS_OFF) {
 			*datap = arc_alloc_buf(os->os_spa, db, type, psize);
 		} else {
 			ASSERT3U(type, ==, ARC_BUFC_DATA);
 			int lsize = arc_buf_lsize(*datap);
 			*datap = arc_alloc_compressed_buf(os->os_spa, db,
 			    psize, lsize, compress_type);
 		}
 		bcopy(db->db.db_data, (*datap)->b_data, psize);
 	}
 	db->db_data_pending = dr;
 
 	mutex_exit(&db->db_mtx);
 
 	dbuf_write(dr, *datap, tx);
 
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
 		list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr);
 		DB_DNODE_EXIT(db);
 	} else {
 		/*
 		 * Although zio_nowait() does not "wait for an IO", it does
 		 * initiate the IO. If this is an empty write it seems plausible
 		 * that the IO could actually be completed before the nowait
 		 * returns. We need to DB_DNODE_EXIT() first in case
 		 * zio_nowait() invalidates the dbuf.
 		 */
 		DB_DNODE_EXIT(db);
 		zio_nowait(dr->dr_zio);
 	}
 }
 
 void
 dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr;
 
 	while (dr = list_head(list)) {
 		if (dr->dr_zio != NULL) {
 			/*
 			 * If we find an already initialized zio then we
 			 * are processing the meta-dnode, and we have finished.
 			 * The dbufs for all dnodes are put back on the list
 			 * during processing, so that we can zio_wait()
 			 * these IOs after initiating all child IOs.
 			 */
 			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
 			    DMU_META_DNODE_OBJECT);
 			break;
 		}
 		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
 		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
 			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
 		}
 		list_remove(list, dr);
 		if (dr->dr_dbuf->db_level > 0)
 			dbuf_sync_indirect(dr, tx);
 		else
 			dbuf_sync_leaf(dr, tx);
 	}
 }
 
 /* ARGSUSED */
 static void
 dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	dnode_t *dn;
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	spa_t *spa = zio->io_spa;
 	int64_t delta;
 	uint64_t fill = 0;
 	int i;
 
 	ASSERT3P(db->db_blkptr, !=, NULL);
 	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
 	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
 	zio->io_prev_space_delta = delta;
 
 	if (bp->blk_birth != 0) {
 		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_type) ||
 		    (db->db_blkid == DMU_SPILL_BLKID &&
 		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
 		    BP_IS_EMBEDDED(bp));
 		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(bp)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 	}
 #endif
 
 	if (db->db_level == 0) {
 		mutex_enter(&dn->dn_mtx);
 		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
 		    db->db_blkid != DMU_SPILL_BLKID)
 			dn->dn_phys->dn_maxblkid = db->db_blkid;
 		mutex_exit(&dn->dn_mtx);
 
 		if (dn->dn_type == DMU_OT_DNODE) {
 			dnode_phys_t *dnp = db->db.db_data;
 			for (i = db->db.db_size >> DNODE_SHIFT; i > 0;
 			    i--, dnp++) {
 				if (dnp->dn_type != DMU_OT_NONE)
 					fill++;
 			}
 		} else {
 			if (BP_IS_HOLE(bp)) {
 				fill = 0;
 			} else {
 				fill = 1;
 			}
 		}
 	} else {
 		blkptr_t *ibp = db->db.db_data;
 		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
 		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
 			if (BP_IS_HOLE(ibp))
 				continue;
 			fill += BP_GET_FILL(ibp);
 		}
 	}
 	DB_DNODE_EXIT(db);
 
 	if (!BP_IS_EMBEDDED(bp))
 		bp->blk_fill = fill;
 
 	mutex_exit(&db->db_mtx);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	*db->db_blkptr = *bp;
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /* ARGSUSED */
 /*
  * This function gets called just prior to running through the compression
  * stage of the zio pipeline. If we're an indirect block comprised of only
  * holes, then we want this indirect to be compressed away to a hole. In
  * order to do that we must zero out any information about the holes that
  * this indirect points to prior to before we try to compress it.
  */
 static void
 dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	dnode_t *dn;
 	blkptr_t *bp;
 	unsigned int epbs, i;
 
 	ASSERT3U(db->db_level, >, 0);
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 	ASSERT3U(epbs, <, 31);
 
 	/* Determine if all our children are holes */
 	for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) {
 		if (!BP_IS_HOLE(bp))
 			break;
 	}
 
 	/*
 	 * If all the children are holes, then zero them all out so that
 	 * we may get compressed away.
 	 */
 	if (i == 1 << epbs) {
 		/*
 		 * We only found holes. Grab the rwlock to prevent
 		 * anybody from reading the blocks we're about to
 		 * zero out.
 		 */
 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		bzero(db->db.db_data, db->db.db_size);
 		rw_exit(&dn->dn_struct_rwlock);
 	}
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * The SPA will call this callback several times for each zio - once
  * for every physical child i/o (zio->io_phys_children times).  This
  * allows the DMU to monitor the progress of each logical i/o.  For example,
  * there may be 2 copies of an indirect block, or many fragments of a RAID-Z
  * block.  There may be a long delay before all copies/fragments are completed,
  * so this callback allows us to retire dirty space gradually, as the physical
  * i/os complete.
  */
 /* ARGSUSED */
 static void
 dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg)
 {
 	dmu_buf_impl_t *db = arg;
 	objset_t *os = db->db_objset;
 	dsl_pool_t *dp = dmu_objset_pool(os);
 	dbuf_dirty_record_t *dr;
 	int delta = 0;
 
 	dr = db->db_data_pending;
 	ASSERT3U(dr->dr_txg, ==, zio->io_txg);
 
 	/*
 	 * The callback will be called io_phys_children times.  Retire one
 	 * portion of our dirty space each time we are called.  Any rounding
 	 * error will be cleaned up by dsl_pool_sync()'s call to
 	 * dsl_pool_undirty_space().
 	 */
 	delta = dr->dr_accounted / zio->io_phys_children;
 	dsl_pool_undirty_space(dp, delta, zio->io_txg);
 }
 
 /* ARGSUSED */
 static void
 dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
 {
 	dmu_buf_impl_t *db = vdb;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	blkptr_t *bp = db->db_blkptr;
 	objset_t *os = db->db_objset;
 	dmu_tx_t *tx = os->os_synctx;
 	dbuf_dirty_record_t **drp, *dr;
 
 	ASSERT0(zio->io_error);
 	ASSERT(db->db_blkptr == bp);
 
 	/*
 	 * For nopwrites and rewrites we ensure that the bp matches our
 	 * original and bypass all the accounting.
 	 */
 	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
 
 	mutex_enter(&db->db_mtx);
 
 	DBUF_VERIFY(db);
 
 	drp = &db->db_last_dirty;
 	while ((dr = *drp) != db->db_data_pending)
 		drp = &dr->dr_next;
 	ASSERT(!list_link_active(&dr->dr_dirty_node));
 	ASSERT(dr->dr_dbuf == db);
 	ASSERT(dr->dr_next == NULL);
 	*drp = dr->dr_next;
 
 #ifdef ZFS_DEBUG
 	if (db->db_blkid == DMU_SPILL_BLKID) {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
 		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
 		    db->db_blkptr == &dn->dn_phys->dn_spill);
 		DB_DNODE_EXIT(db);
 	}
 #endif
 
 	if (db->db_level == 0) {
 		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
 		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 		if (db->db_state != DB_NOFILL) {
 			if (dr->dt.dl.dr_data != db->db_buf)
 				arc_buf_destroy(dr->dt.dl.dr_data, db);
 		}
 	} else {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(db);
 		dn = DB_DNODE(db);
 		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
 		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
 		if (!BP_IS_HOLE(db->db_blkptr)) {
 			int epbs =
 			    dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
 			ASSERT3U(db->db_blkid, <=,
 			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
 			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
 			    db->db.db_size);
 		}
 		DB_DNODE_EXIT(db);
 		mutex_destroy(&dr->dt.di.dr_mtx);
 		list_destroy(&dr->dt.di.dr_children);
 	}
 	kmem_free(dr, sizeof (dbuf_dirty_record_t));
 
 	cv_broadcast(&db->db_changed);
 	ASSERT(db->db_dirtycnt > 0);
 	db->db_dirtycnt -= 1;
 	db->db_data_pending = NULL;
 	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg);
 }
 
 static void
 dbuf_write_nofill_ready(zio_t *zio)
 {
 	dbuf_write_ready(zio, NULL, zio->io_private);
 }
 
 static void
 dbuf_write_nofill_done(zio_t *zio)
 {
 	dbuf_write_done(zio, NULL, zio->io_private);
 }
 
 static void
 dbuf_write_override_ready(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	dbuf_write_ready(zio, NULL, db);
 }
 
 static void
 dbuf_write_override_done(zio_t *zio)
 {
 	dbuf_dirty_record_t *dr = zio->io_private;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	blkptr_t *obp = &dr->dt.dl.dr_overridden_by;
 
 	mutex_enter(&db->db_mtx);
 	if (!BP_EQUAL(zio->io_bp, obp)) {
 		if (!BP_IS_HOLE(obp))
 			dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp);
 		arc_release(dr->dt.dl.dr_data, db);
 	}
 	mutex_exit(&db->db_mtx);
 	dbuf_write_done(zio, NULL, db);
 
 	if (zio->io_abd != NULL)
 		abd_put(zio->io_abd);
 }
 
 typedef struct dbuf_remap_impl_callback_arg {
 	objset_t	*drica_os;
 	uint64_t	drica_blk_birth;
 	dmu_tx_t	*drica_tx;
 } dbuf_remap_impl_callback_arg_t;
 
 static void
 dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size,
     void *arg)
 {
 	dbuf_remap_impl_callback_arg_t *drica = arg;
 	objset_t *os = drica->drica_os;
 	spa_t *spa = dmu_objset_spa(os);
 	dmu_tx_t *tx = drica->drica_tx;
 
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	if (os == spa_meta_objset(spa)) {
 		spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx);
 	} else {
 		dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset,
 		    size, drica->drica_blk_birth, tx);
 	}
 }
 
 static void
 dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, dmu_tx_t *tx)
 {
 	blkptr_t bp_copy = *bp;
 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
 	dbuf_remap_impl_callback_arg_t drica;
 
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	drica.drica_os = dn->dn_objset;
 	drica.drica_blk_birth = bp->blk_birth;
 	drica.drica_tx = tx;
 	if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback,
 	    &drica)) {
 		/*
 		 * The struct_rwlock prevents dbuf_read_impl() from
 		 * dereferencing the BP while we are changing it.  To
 		 * avoid lock contention, only grab it when we are actually
 		 * changing the BP.
 		 */
 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		*bp = bp_copy;
 		rw_exit(&dn->dn_struct_rwlock);
 	}
 }
 
 /*
  * Returns true if a dbuf_remap would modify the dbuf. We do this by attempting
  * to remap a copy of every bp in the dbuf.
  */
 boolean_t
 dbuf_can_remap(const dmu_buf_impl_t *db)
 {
 	spa_t *spa = dmu_objset_spa(db->db_objset);
 	blkptr_t *bp = db->db.db_data;
 	boolean_t ret = B_FALSE;
 
 	ASSERT3U(db->db_level, >, 0);
 	ASSERT3S(db->db_state, ==, DB_CACHED);
 
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
 		blkptr_t bp_copy = bp[i];
 		if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
 			ret = B_TRUE;
 			break;
 		}
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (ret);
 }
 
 boolean_t
 dnode_needs_remap(const dnode_t *dn)
 {
 	spa_t *spa = dmu_objset_spa(dn->dn_objset);
 	boolean_t ret = B_FALSE;
 
 	if (dn->dn_phys->dn_nlevels == 0) {
 		return (B_FALSE);
 	}
 
 	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));
 
 	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
 	for (int j = 0; j < dn->dn_phys->dn_nblkptr; j++) {
 		blkptr_t bp_copy = dn->dn_phys->dn_blkptr[j];
 		if (spa_remap_blkptr(spa, &bp_copy, NULL, NULL)) {
 			ret = B_TRUE;
 			break;
 		}
 	}
 	spa_config_exit(spa, SCL_VDEV, FTAG);
 
 	return (ret);
 }
 
 /*
  * Remap any existing BP's to concrete vdevs, if possible.
  */
 static void
 dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	spa_t *spa = dmu_objset_spa(db->db_objset);
 	ASSERT(dsl_pool_sync_context(spa_get_dsl(spa)));
 
 	if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL))
 		return;
 
 	if (db->db_level > 0) {
 		blkptr_t *bp = db->db.db_data;
 		for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) {
 			dbuf_remap_impl(dn, &bp[i], tx);
 		}
 	} else if (db->db.db_object == DMU_META_DNODE_OBJECT) {
 		dnode_phys_t *dnp = db->db.db_data;
 		ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==,
 		    DMU_OT_DNODE);
 		for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; i++) {
 			for (int j = 0; j < dnp[i].dn_nblkptr; j++) {
 				dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], tx);
 			}
 		}
 	}
 }
 
 
 /* Issue I/O to commit a dirty buffer to disk. */
 static void
 dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 	dnode_t *dn;
 	objset_t *os;
 	dmu_buf_impl_t *parent = db->db_parent;
 	uint64_t txg = tx->tx_txg;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	int wp_flag = 0;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	os = dn->dn_objset;
 
 	if (db->db_state != DB_NOFILL) {
 		if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) {
 			/*
 			 * Private object buffers are released here rather
 			 * than in dbuf_dirty() since they are only modified
 			 * in the syncing context and we don't want the
 			 * overhead of making multiple copies of the data.
 			 */
 			if (BP_IS_HOLE(db->db_blkptr)) {
 				arc_buf_thaw(data);
 			} else {
 				dbuf_release_bp(db);
 			}
 			dbuf_remap(dn, db, tx);
 		}
 	}
 
 	if (parent != dn->dn_dbuf) {
 		/* Our parent is an indirect block. */
 		/* We have a dirty parent that has been scheduled for write. */
 		ASSERT(parent && parent->db_data_pending);
 		/* Our parent's buffer is one level closer to the dnode. */
 		ASSERT(db->db_level == parent->db_level-1);
 		/*
 		 * We're about to modify our parent's db_data by modifying
 		 * our block pointer, so the parent must be released.
 		 */
 		ASSERT(arc_released(parent->db_buf));
 		zio = parent->db_data_pending->dr_zio;
 	} else {
 		/* Our parent is the dnode itself. */
 		ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
 		    db->db_blkid != DMU_SPILL_BLKID) ||
 		    (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
 		if (db->db_blkid != DMU_SPILL_BLKID)
 			ASSERT3P(db->db_blkptr, ==,
 			    &dn->dn_phys->dn_blkptr[db->db_blkid]);
 		zio = dn->dn_zio;
 	}
 
 	ASSERT(db->db_level == 0 || data == db->db_buf);
 	ASSERT3U(db->db_blkptr->blk_birth, <=, txg);
 	ASSERT(zio);
 
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	if (db->db_blkid == DMU_SPILL_BLKID)
 		wp_flag = WP_SPILL;
 	wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0;
 
 	dmu_write_policy(os, dn, db->db_level, wp_flag, &zp);
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * We copy the blkptr now (rather than when we instantiate the dirty
 	 * record), because its value can change between open context and
 	 * syncing context. We do not need to hold dn_struct_rwlock to read
 	 * db_blkptr because we are in syncing context.
 	 */
 	dr->dr_bp_copy = *db->db_blkptr;
 
 	if (db->db_level == 0 &&
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * The BP for this block has been provided by open context
 		 * (by dmu_sync() or dmu_buf_write_embedded()).
 		 */
 		abd_t *contents = (data != NULL) ?
 		    abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL;
 
 		dr->dr_zio = zio_write(zio, os->os_spa, txg, &dr->dr_bp_copy,
 		    contents, db->db.db_size, db->db.db_size, &zp,
 		    dbuf_write_override_ready, NULL, NULL,
 		    dbuf_write_override_done,
 		    dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 		mutex_enter(&db->db_mtx);
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 		zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by,
 		    dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite);
 		mutex_exit(&db->db_mtx);
 	} else if (db->db_state == DB_NOFILL) {
 		ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF ||
 		    zp.zp_checksum == ZIO_CHECKSUM_NOPARITY);
 		dr->dr_zio = zio_write(zio, os->os_spa, txg,
 		    &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp,
 		    dbuf_write_nofill_ready, NULL, NULL,
 		    dbuf_write_nofill_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE,
 		    ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb);
 	} else {
 		ASSERT(arc_released(data));
 
 		/*
 		 * For indirect blocks, we want to setup the children
 		 * ready callback so that we can properly handle an indirect
 		 * block that only contains holes.
 		 */
 		arc_done_func_t *children_ready_cb = NULL;
 		if (db->db_level != 0)
 			children_ready_cb = dbuf_write_children_ready;
 
 		dr->dr_zio = arc_write(zio, os->os_spa, txg,
 		    &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db),
 		    &zp, dbuf_write_ready, children_ready_cb,
 		    dbuf_write_physdone, dbuf_write_done, db,
 		    ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 	}
 }
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c	(revision 336957)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c	(revision 336958)
@@ -1,2371 +1,2371 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  */
 /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
 /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
 /* Copyright 2016 Nexenta Systems, Inc. All rights reserved. */
 
 #include <sys/dmu.h>
 #include <sys/dmu_impl.h>
 #include <sys/dmu_tx.h>
 #include <sys/dbuf.h>
 #include <sys/dnode.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dmu_traverse.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_prop.h>
 #include <sys/dmu_zfetch.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zap.h>
 #include <sys/zio_checksum.h>
 #include <sys/zio_compress.h>
 #include <sys/sa.h>
 #include <sys/zfeature.h>
 #include <sys/abd.h>
 #ifdef _KERNEL
 #include <sys/vmsystm.h>
 #include <sys/zfs_znode.h>
 #endif
 
 /*
  * Enable/disable nopwrite feature.
  */
 int zfs_nopwrite_enabled = 1;
 
 /*
  * Tunable to control percentage of dirtied blocks from frees in one TXG.
  * After this threshold is crossed, additional dirty blocks from frees
  * wait until the next TXG.
  * A value of zero will disable this throttle.
  */
 uint32_t zfs_per_txg_dirty_frees_percent = 30;
 
 /*
  * This can be used for testing, to ensure that certain actions happen
  * while in the middle of a remap (which might otherwise complete too
  * quickly).
  */
 int zfs_object_remap_one_indirect_delay_ticks = 0;
 
 const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
-	{	DMU_BSWAP_UINT8,	TRUE,	"unallocated"		},
-	{	DMU_BSWAP_ZAP,		TRUE,	"object directory"	},
-	{	DMU_BSWAP_UINT64,	TRUE,	"object array"		},
-	{	DMU_BSWAP_UINT8,	TRUE,	"packed nvlist"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"packed nvlist size"	},
-	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj"			},
-	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj header"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map header"	},
-	{	DMU_BSWAP_UINT64,	TRUE,	"SPA space map"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"ZIL intent log"	},
-	{	DMU_BSWAP_DNODE,	TRUE,	"DMU dnode"		},
-	{	DMU_BSWAP_OBJSET,	TRUE,	"DMU objset"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"DSL directory"		},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL directory child map"},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset snap map"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL props"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"DSL dataset"		},
-	{	DMU_BSWAP_ZNODE,	TRUE,	"ZFS znode"		},
-	{	DMU_BSWAP_OLDACL,	TRUE,	"ZFS V0 ACL"		},
-	{	DMU_BSWAP_UINT8,	FALSE,	"ZFS plain file"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS directory"		},
-	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS master node"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS delete queue"	},
-	{	DMU_BSWAP_UINT8,	FALSE,	"zvol object"		},
-	{	DMU_BSWAP_ZAP,		TRUE,	"zvol prop"		},
-	{	DMU_BSWAP_UINT8,	FALSE,	"other uint8[]"		},
-	{	DMU_BSWAP_UINT64,	FALSE,	"other uint64[]"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"other ZAP"		},
-	{	DMU_BSWAP_ZAP,		TRUE,	"persistent error log"	},
-	{	DMU_BSWAP_UINT8,	TRUE,	"SPA history"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"SPA history offsets"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"Pool properties"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL permissions"	},
-	{	DMU_BSWAP_ACL,		TRUE,	"ZFS ACL"		},
-	{	DMU_BSWAP_UINT8,	TRUE,	"ZFS SYSACL"		},
-	{	DMU_BSWAP_UINT8,	TRUE,	"FUID table"		},
-	{	DMU_BSWAP_UINT64,	TRUE,	"FUID table size"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dataset next clones"},
-	{	DMU_BSWAP_ZAP,		TRUE,	"scan work queue"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group used"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"ZFS user/group quota"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"snapshot refcount tags"},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DDT ZAP algorithm"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DDT statistics"	},
-	{	DMU_BSWAP_UINT8,	TRUE,	"System attributes"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"SA master node"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr registration"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"SA attr layouts"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"scan translations"	},
-	{	DMU_BSWAP_UINT8,	FALSE,	"deduplicated block"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL deadlist map"	},
-	{	DMU_BSWAP_UINT64,	TRUE,	"DSL deadlist map hdr"	},
-	{	DMU_BSWAP_ZAP,		TRUE,	"DSL dir clones"	},
-	{	DMU_BSWAP_UINT64,	TRUE,	"bpobj subobj"		}
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "unallocated"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "object directory"		},
+	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "object array"		},
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "packed nvlist"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "packed nvlist size"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj"			},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj header"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map header"	},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA space map"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "ZIL intent log"		},
+	{ DMU_BSWAP_DNODE,  TRUE,  FALSE,  "DMU dnode"			},
+	{ DMU_BSWAP_OBJSET, TRUE,  TRUE,   "DMU objset"			},
+	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL directory"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL directory child map"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset snap map"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL props"			},
+	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL dataset"		},
+	{ DMU_BSWAP_ZNODE,  TRUE,  FALSE,  "ZFS znode"			},
+	{ DMU_BSWAP_OLDACL, TRUE,  FALSE,  "ZFS V0 ACL"			},
+	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "ZFS plain file"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS directory"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS master node"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS delete queue"		},
+	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "zvol object"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "zvol prop"			},
+	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "other uint8[]"		},
+	{ DMU_BSWAP_UINT64, FALSE, FALSE,  "other uint64[]"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "other ZAP"			},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "persistent error log"	},
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "SPA history"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "SPA history offsets"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "Pool properties"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL permissions"		},
+	{ DMU_BSWAP_ACL,    TRUE,  FALSE,  "ZFS ACL"			},
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "ZFS SYSACL"			},
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "FUID table"			},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "FUID table size"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dataset next clones"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan work queue"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group used"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "ZFS user/group quota"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "snapshot refcount tags"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT ZAP algorithm"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "DDT statistics"		},
+	{ DMU_BSWAP_UINT8,  TRUE,  FALSE,  "System attributes"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA master node"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr registration"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "SA attr layouts"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  FALSE,  "scan translations"		},
+	{ DMU_BSWAP_UINT8,  FALSE, FALSE,  "deduplicated block"		},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL deadlist map"		},
+	{ DMU_BSWAP_UINT64, TRUE,  TRUE,   "DSL deadlist map hdr"	},
+	{ DMU_BSWAP_ZAP,    TRUE,  TRUE,   "DSL dir clones"		},
+	{ DMU_BSWAP_UINT64, TRUE,  FALSE,  "bpobj subobj"		}
 };
 
 const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = {
 	{	byteswap_uint8_array,	"uint8"		},
 	{	byteswap_uint16_array,	"uint16"	},
 	{	byteswap_uint32_array,	"uint32"	},
 	{	byteswap_uint64_array,	"uint64"	},
 	{	zap_byteswap,		"zap"		},
 	{	dnode_buf_byteswap,	"dnode"		},
 	{	dmu_objset_byteswap,	"objset"	},
 	{	zfs_znode_byteswap,	"znode"		},
 	{	zfs_oldacl_byteswap,	"oldacl"	},
 	{	zfs_acl_byteswap,	"acl"		}
 };
 
 int
 dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset,
     void *tag, dmu_buf_t **dbp)
 {
 	uint64_t blkid;
 	dmu_buf_impl_t *db;
 
 	blkid = dbuf_whichblock(dn, 0, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
 
 	if (db == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 
 	*dbp = &db->db;
 	return (0);
 }
 int
 dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	uint64_t blkid;
 	dmu_buf_impl_t *db;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	db = dbuf_hold(dn, blkid, tag);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 
 	if (db == NULL) {
 		*dbp = NULL;
 		return (SET_ERROR(EIO));
 	}
 
 	*dbp = &db->db;
 	return (err);
 }
 
 int
 dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
     void *tag, dmu_buf_t **dbp, int flags)
 {
 	int err;
 	int db_flags = DB_RF_CANFAIL;
 
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 
 	err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp);
 	if (err == 0) {
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 		err = dbuf_read(db, NULL, db_flags);
 		if (err != 0) {
 			dbuf_rele(db, tag);
 			*dbp = NULL;
 		}
 	}
 
 	return (err);
 }
 
 int
 dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **dbp, int flags)
 {
 	int err;
 	int db_flags = DB_RF_CANFAIL;
 
 	if (flags & DMU_READ_NO_PREFETCH)
 		db_flags |= DB_RF_NOPREFETCH;
 
 	err = dmu_buf_hold_noread(os, object, offset, tag, dbp);
 	if (err == 0) {
 		dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp);
 		err = dbuf_read(db, NULL, db_flags);
 		if (err != 0) {
 			dbuf_rele(db, tag);
 			*dbp = NULL;
 		}
 	}
 
 	return (err);
 }
 
 int
 dmu_bonus_max(void)
 {
 	return (DN_MAX_BONUSLEN);
 }
 
 int
 dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (dn->dn_bonus != db) {
 		error = SET_ERROR(EINVAL);
 	} else if (newsize < 0 || newsize > db_fake->db_size) {
 		error = SET_ERROR(EINVAL);
 	} else {
 		dnode_setbonuslen(dn, newsize, tx);
 		error = 0;
 	}
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 int
 dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int error;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (!DMU_OT_IS_VALID(type)) {
 		error = SET_ERROR(EINVAL);
 	} else if (dn->dn_bonus != db) {
 		error = SET_ERROR(EINVAL);
 	} else {
 		dnode_setbonus_type(dn, type, tx);
 		error = 0;
 	}
 
 	DB_DNODE_EXIT(db);
 	return (error);
 }
 
 dmu_object_type_t
 dmu_get_bonustype(dmu_buf_t *db_fake)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	dmu_object_type_t type;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	type = dn->dn_bonustype;
 	DB_DNODE_EXIT(db);
 
 	return (type);
 }
 
 int
 dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	dbuf_rm_spill(dn, tx);
 	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 	dnode_rm_spill(dn, tx);
 	rw_exit(&dn->dn_struct_rwlock);
 	dnode_rele(dn, FTAG);
 	return (error);
 }
 
 /*
  * returns ENOENT, EIO, or 0.
  */
 int
 dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
 {
 	dnode_t *dn;
 	dmu_buf_impl_t *db;
 	int error;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error)
 		return (error);
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_bonus == NULL) {
 		rw_exit(&dn->dn_struct_rwlock);
 		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
 		if (dn->dn_bonus == NULL)
 			dbuf_create_bonus(dn);
 	}
 	db = dn->dn_bonus;
 
 	/* as long as the bonus buf is held, the dnode will be held */
 	if (refcount_add(&db->db_holds, tag) == 1) {
 		VERIFY(dnode_add_ref(dn, db));
 		atomic_inc_32(&dn->dn_dbufs_count);
 	}
 
 	/*
 	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
 	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
 	 * a dnode hold for every dbuf.
 	 */
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 
 	VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH));
 
 	*dbp = &db->db;
 	return (0);
 }
 
 /*
  * returns ENOENT, EIO, or 0.
  *
  * This interface will allocate a blank spill dbuf when a spill blk
  * doesn't already exist on the dnode.
  *
  * if you only want to find an already existing spill db, then
  * dmu_spill_hold_existing() should be used.
  */
 int
 dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = NULL;
 	int err;
 
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
 
 	if ((flags & DB_RF_HAVESTRUCT) == 0)
 		rw_exit(&dn->dn_struct_rwlock);
 
 	ASSERT(db != NULL);
 	err = dbuf_read(db, NULL, flags);
 	if (err == 0)
 		*dbp = &db->db;
 	else
 		dbuf_rele(db, tag);
 	return (err);
 }
 
 int
 dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	dnode_t *dn;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
 		err = SET_ERROR(EINVAL);
 	} else {
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 
 		if (!dn->dn_have_spill) {
 			err = SET_ERROR(ENOENT);
 		} else {
 			err = dmu_spill_hold_by_dnode(dn,
 			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
 		}
 
 		rw_exit(&dn->dn_struct_rwlock);
 	}
 
 	DB_DNODE_EXIT(db);
 	return (err);
 }
 
 int
 dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
 	dnode_t *dn;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
  * to take a held dnode rather than <os, object> -- the lookup is wasteful,
  * and can induce severe lock contention when writing to several files
  * whose dnodes are in the same block.
  */
 static int
 dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
     boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	uint64_t blkid, nblks, i;
 	uint32_t dbuf_flags;
 	int err;
 	zio_t *zio;
 
 	ASSERT(length <= DMU_MAX_ACCESS);
 
 	/*
 	 * Note: We directly notify the prefetch code of this read, so that
 	 * we can tell it about the multi-block read.  dbuf_read() only knows
 	 * about the one block it is accessing.
 	 */
 	dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT |
 	    DB_RF_NOPREFETCH;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	if (dn->dn_datablkshift) {
 		int blkshift = dn->dn_datablkshift;
 		nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) -
 		    P2ALIGN(offset, 1ULL << blkshift)) >> blkshift;
 	} else {
 		if (offset + length > dn->dn_datablksz) {
 			zfs_panic_recover("zfs: accessing past end of object "
 			    "%llx/%llx (size=%u access=%llu+%llu)",
 			    (longlong_t)dn->dn_objset->
 			    os_dsl_dataset->ds_object,
 			    (longlong_t)dn->dn_object, dn->dn_datablksz,
 			    (longlong_t)offset, (longlong_t)length);
 			rw_exit(&dn->dn_struct_rwlock);
 			return (SET_ERROR(EIO));
 		}
 		nblks = 1;
 	}
 	dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
 
 	zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	for (i = 0; i < nblks; i++) {
 		dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag);
 		if (db == NULL) {
 			rw_exit(&dn->dn_struct_rwlock);
 			dmu_buf_rele_array(dbp, nblks, tag);
 			zio_nowait(zio);
 			return (SET_ERROR(EIO));
 		}
 
 		/* initiate async i/o */
 		if (read)
 			(void) dbuf_read(db, zio, dbuf_flags);
 		dbp[i] = &db->db;
 	}
 
 	if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
 	    DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
 		dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
 		    read && DNODE_IS_CACHEABLE(dn));
 	}
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/* wait for async i/o */
 	err = zio_wait(zio);
 	if (err) {
 		dmu_buf_rele_array(dbp, nblks, tag);
 		return (err);
 	}
 
 	/* wait for other io to complete */
 	if (read) {
 		for (i = 0; i < nblks; i++) {
 			dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i];
 			mutex_enter(&db->db_mtx);
 			while (db->db_state == DB_READ ||
 			    db->db_state == DB_FILL)
 				cv_wait(&db->db_changed, &db->db_mtx);
 			if (db->db_state == DB_UNCACHED)
 				err = SET_ERROR(EIO);
 			mutex_exit(&db->db_mtx);
 			if (err) {
 				dmu_buf_rele_array(dbp, nblks, tag);
 				return (err);
 			}
 		}
 	}
 
 	*numbufsp = nblks;
 	*dbpp = dbp;
 	return (0);
 }
 
 static int
 dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 int
 dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset,
     uint64_t length, boolean_t read, void *tag, int *numbufsp,
     dmu_buf_t ***dbpp)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 	int err;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag,
 	    numbufsp, dbpp, DMU_READ_PREFETCH);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 void
 dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag)
 {
 	int i;
 	dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake;
 
 	if (numbufs == 0)
 		return;
 
 	for (i = 0; i < numbufs; i++) {
 		if (dbp[i])
 			dbuf_rele(dbp[i], tag);
 	}
 
 	kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs);
 }
 
 /*
  * Issue prefetch i/os for the given blocks.  If level is greater than 0, the
  * indirect blocks prefeteched will be those that point to the blocks containing
  * the data starting at offset, and continuing to offset + len.
  *
  * Note that if the indirect blocks above the blocks being prefetched are not in
  * cache, they will be asychronously read in.
  */
 void
 dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, zio_priority_t pri)
 {
 	dnode_t *dn;
 	uint64_t blkid;
 	int nblks, err;
 
 	if (len == 0) {  /* they're interested in the bonus buffer */
 		dn = DMU_META_DNODE(os);
 
 		if (object == 0 || object >= DN_MAX_OBJECT)
 			return;
 
 		rw_enter(&dn->dn_struct_rwlock, RW_READER);
 		blkid = dbuf_whichblock(dn, level,
 		    object * sizeof (dnode_phys_t));
 		dbuf_prefetch(dn, level, blkid, pri, 0);
 		rw_exit(&dn->dn_struct_rwlock);
 		return;
 	}
 
 	/*
 	 * XXX - Note, if the dnode for the requested object is not
 	 * already cached, we will do a *synchronous* read in the
 	 * dnode_hold() call.  The same is true for any indirects.
 	 */
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	/*
 	 * offset + len - 1 is the last byte we want to prefetch for, and offset
 	 * is the first.  Then dbuf_whichblk(dn, level, off + len - 1) is the
 	 * last block we want to prefetch, and dbuf_whichblock(dn, level,
 	 * offset)  is the first.  Then the number we need to prefetch is the
 	 * last - first + 1.
 	 */
 	if (level > 0 || dn->dn_datablkshift != 0) {
 		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
 		    dbuf_whichblock(dn, level, offset) + 1;
 	} else {
 		nblks = (offset < dn->dn_datablksz);
 	}
 
 	if (nblks != 0) {
 		blkid = dbuf_whichblock(dn, level, offset);
 		for (int i = 0; i < nblks; i++)
 			dbuf_prefetch(dn, level, blkid + i, pri, 0);
 	}
 
 	rw_exit(&dn->dn_struct_rwlock);
 
 	dnode_rele(dn, FTAG);
 }
 
 /*
  * Get the next "chunk" of file data to free.  We traverse the file from
  * the end so that the file gets shorter over time (if we crashes in the
  * middle, this will leave us in a better state).  We find allocated file
  * data by simply searching the allocated level 1 indirects.
  *
  * On input, *start should be the first offset that does not need to be
  * freed (e.g. "offset + length").  On return, *start will be the first
  * offset that should be freed.
  */
 static int
 get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
 {
 	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
 	/* bytes of data covered by a level-1 indirect block */
 	uint64_t iblkrange =
 	    dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
 
 	ASSERT3U(minimum, <=, *start);
 
 	if (*start - minimum <= iblkrange * maxblks) {
 		*start = minimum;
 		return (0);
 	}
 	ASSERT(ISP2(iblkrange));
 
 	for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) {
 		int err;
 
 		/*
 		 * dnode_next_offset(BACKWARDS) will find an allocated L1
 		 * indirect block at or before the input offset.  We must
 		 * decrement *start so that it is at the end of the region
 		 * to search.
 		 */
 		(*start)--;
 		err = dnode_next_offset(dn,
 		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);
 
 		/* if there are no indirect blocks before start, we are done */
 		if (err == ESRCH) {
 			*start = minimum;
 			break;
 		} else if (err != 0) {
 			return (err);
 		}
 
 		/* set start to the beginning of this L1 indirect */
 		*start = P2ALIGN(*start, iblkrange);
 	}
 	if (*start < minimum)
 		*start = minimum;
 	return (0);
 }
 
 /*
  * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set,
  * otherwise return false.
  * Used below in dmu_free_long_range_impl() to enable abort when unmounting
  */
 /*ARGSUSED*/
 static boolean_t
 dmu_objset_zfs_unmounting(objset_t *os)
 {
 #ifdef _KERNEL
 	if (dmu_objset_type(os) == DMU_OST_ZFS)
 		return (zfs_get_vfs_flag_unmounted(os));
 #endif
 	return (B_FALSE);
 }
 
 static int
 dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
     uint64_t length)
 {
 	uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	int err;
 	uint64_t dirty_frees_threshold;
 	dsl_pool_t *dp = dmu_objset_pool(os);
 
 	if (offset >= object_size)
 		return (0);
 
 	if (zfs_per_txg_dirty_frees_percent <= 100)
 		dirty_frees_threshold =
 		    zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100;
 	else
 		dirty_frees_threshold = zfs_dirty_data_max / 4;
 
 	if (length == DMU_OBJECT_END || offset + length > object_size)
 		length = object_size - offset;
 
 	while (length != 0) {
 		uint64_t chunk_end, chunk_begin, chunk_len;
 		uint64_t long_free_dirty_all_txgs = 0;
 		dmu_tx_t *tx;
 
 		if (dmu_objset_zfs_unmounting(dn->dn_objset))
 			return (SET_ERROR(EINTR));
 
 		chunk_end = chunk_begin = offset + length;
 
 		/* move chunk_begin backwards to the beginning of this chunk */
 		err = get_next_chunk(dn, &chunk_begin, offset);
 		if (err)
 			return (err);
 		ASSERT3U(chunk_begin, >=, offset);
 		ASSERT3U(chunk_begin, <=, chunk_end);
 
 		chunk_len = chunk_end - chunk_begin;
 
 		mutex_enter(&dp->dp_lock);
 		for (int t = 0; t < TXG_SIZE; t++) {
 			long_free_dirty_all_txgs +=
 			    dp->dp_long_free_dirty_pertxg[t];
 		}
 		mutex_exit(&dp->dp_lock);
 
 		/*
 		 * To avoid filling up a TXG with just frees wait for
 		 * the next TXG to open before freeing more chunks if
 		 * we have reached the threshold of frees
 		 */
 		if (dirty_frees_threshold != 0 &&
 		    long_free_dirty_all_txgs >= dirty_frees_threshold) {
 			txg_wait_open(dp, 0);
 			continue;
 		}
 
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len);
 
 		/*
 		 * Mark this transaction as typically resulting in a net
 		 * reduction in space used.
 		 */
 		dmu_tx_mark_netfree(tx);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err) {
 			dmu_tx_abort(tx);
 			return (err);
 		}
 
 		mutex_enter(&dp->dp_lock);
 		dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] +=
 		    chunk_len;
 		mutex_exit(&dp->dp_lock);
 		DTRACE_PROBE3(free__long__range,
 		    uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len,
 		    uint64_t, dmu_tx_get_txg(tx));
 		dnode_free_range(dn, chunk_begin, chunk_len, tx);
 		dmu_tx_commit(tx);
 
 		length -= chunk_len;
 	}
 	return (0);
 }
 
 int
 dmu_free_long_range(objset_t *os, uint64_t object,
     uint64_t offset, uint64_t length)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 	err = dmu_free_long_range_impl(os, dn, offset, length);
 
 	/*
 	 * It is important to zero out the maxblkid when freeing the entire
 	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
 	 * will take the fast path, and (b) dnode_reallocate() can verify
 	 * that the entire file has been freed.
 	 */
 	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
 		dn->dn_maxblkid = 0;
 
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int
 dmu_free_long_object(objset_t *os, uint64_t object)
 {
 	dmu_tx_t *tx;
 	int err;
 
 	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
 	if (err != 0)
 		return (err);
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_bonus(tx, object);
 	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
 	dmu_tx_mark_netfree(tx);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err == 0) {
 		err = dmu_object_free(os, object, tx);
 		dmu_tx_commit(tx);
 	} else {
 		dmu_tx_abort(tx);
 	}
 
 	return (err);
 }
 
 int
 dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	ASSERT(offset < UINT64_MAX);
 	ASSERT(size == -1ULL || size <= UINT64_MAX - offset);
 	dnode_free_range(dn, offset, size, tx);
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 static int
 dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	dmu_buf_t **dbp;
 	int numbufs, err = 0;
 
 	/*
 	 * Deal with odd block sizes, where there can't be data past the first
 	 * block.  If we ever do the tail block optimization, we will need to
 	 * handle that here as well.
 	 */
 	if (dn->dn_maxblkid == 0) {
 		int newsz = offset > dn->dn_datablksz ? 0 :
 		    MIN(size, dn->dn_datablksz - offset);
 		bzero((char *)buf + newsz, size - newsz);
 		size = newsz;
 	}
 
 	while (size > 0) {
 		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
 		int i;
 
 		/*
 		 * NB: we could do this block-at-a-time, but it's nice
 		 * to be reading in parallel.
 		 */
 		err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
 		    TRUE, FTAG, &numbufs, &dbp, flags);
 		if (err)
 			break;
 
 		for (i = 0; i < numbufs; i++) {
 			int tocpy;
 			int bufoff;
 			dmu_buf_t *db = dbp[i];
 
 			ASSERT(size > 0);
 
 			bufoff = offset - db->db_offset;
 			tocpy = (int)MIN(db->db_size - bufoff, size);
 
 			bcopy((char *)db->db_data + bufoff, buf, tocpy);
 
 			offset += tocpy;
 			size -= tocpy;
 			buf = (char *)buf + tocpy;
 		}
 		dmu_buf_rele_array(dbp, numbufs, FTAG);
 	}
 	return (err);
 }
 
 int
 dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     void *buf, uint32_t flags)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0)
 		return (err);
 
 	err = dmu_read_impl(dn, offset, size, buf, flags);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 int
 dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
     uint32_t flags)
 {
 	return (dmu_read_impl(dn, offset, size, buf, flags));
 }
 
 static void
 dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	int i;
 
 	for (i = 0; i < numbufs; i++) {
 		int tocpy;
 		int bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		bufoff = offset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		bcopy(buf, (char *)db->db_data + bufoff, tocpy);
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
 
 		offset += tocpy;
 		size -= tocpy;
 		buf = (char *)buf + tocpy;
 	}
 }
 
 void
 dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return;
 
 	VERIFY0(dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 void
 dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 
 	if (size == 0)
 		return;
 
 	VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH));
 	dmu_write_impl(dbp, numbufs, offset, size, buf, tx);
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 static int
 dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn,
     uint64_t last_removal_txg, uint64_t offset)
 {
 	uint64_t l1blkid = dbuf_whichblock(dn, 1, offset);
 	int err = 0;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG);
 	ASSERT3P(dbuf, !=, NULL);
 
 	/*
 	 * If the block hasn't been written yet, this default will ensure
 	 * we don't try to remap it.
 	 */
 	uint64_t birth = UINT64_MAX;
 	ASSERT3U(last_removal_txg, !=, UINT64_MAX);
 	if (dbuf->db_blkptr != NULL)
 		birth = dbuf->db_blkptr->blk_birth;
 	rw_exit(&dn->dn_struct_rwlock);
 
 	/*
 	 * If this L1 was already written after the last removal, then we've
 	 * already tried to remap it.
 	 */
 	if (birth <= last_removal_txg &&
 	    dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 &&
 	    dbuf_can_remap(dbuf)) {
 		dmu_tx_t *tx = dmu_tx_create(os);
 		dmu_tx_hold_remap_l1indirect(tx, dn->dn_object);
 		err = dmu_tx_assign(tx, TXG_WAIT);
 		if (err == 0) {
 			(void) dbuf_dirty(dbuf, tx);
 			dmu_tx_commit(tx);
 		} else {
 			dmu_tx_abort(tx);
 		}
 	}
 
 	dbuf_rele(dbuf, FTAG);
 
 	delay(zfs_object_remap_one_indirect_delay_ticks);
 
 	return (err);
 }
 
 /*
  * Remap all blockpointers in the object, if possible, so that they reference
  * only concrete vdevs.
  *
  * To do this, iterate over the L0 blockpointers and remap any that reference
  * an indirect vdev. Note that we only examine L0 blockpointers; since we
  * cannot guarantee that we can remap all blockpointer anyways (due to split
  * blocks), we do not want to make the code unnecessarily complicated to
  * catch the unlikely case that there is an L1 block on an indirect vdev that
  * contains no indirect blockpointers.
  */
 int
 dmu_object_remap_indirects(objset_t *os, uint64_t object,
     uint64_t last_removal_txg)
 {
 	uint64_t offset, l1span;
 	int err;
 	dnode_t *dn;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err != 0) {
 		return (err);
 	}
 
 	if (dn->dn_nlevels <= 1) {
 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
 			err = SET_ERROR(EINTR);
 		}
 
 		/*
 		 * If the dnode has no indirect blocks, we cannot dirty them.
 		 * We still want to remap the blkptr(s) in the dnode if
 		 * appropriate, so mark it as dirty.
 		 */
 		if (err == 0 && dnode_needs_remap(dn)) {
 			dmu_tx_t *tx = dmu_tx_create(os);
 			dmu_tx_hold_bonus(tx, dn->dn_object);
 			if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
 				dnode_setdirty(dn, tx);
 				dmu_tx_commit(tx);
 			} else {
 				dmu_tx_abort(tx);
 			}
 		}
 
 		dnode_rele(dn, FTAG);
 		return (err);
 	}
 
 	offset = 0;
 	l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
 	    dn->dn_datablkshift);
 	/*
 	 * Find the next L1 indirect that is not a hole.
 	 */
 	while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
 		if (issig(JUSTLOOKING) && issig(FORREAL)) {
 			err = SET_ERROR(EINTR);
 			break;
 		}
 		if ((err = dmu_object_remap_one_indirect(os, dn,
 		    last_removal_txg, offset)) != 0) {
 			break;
 		}
 		offset += l1span;
 	}
 
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 void
 dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i;
 
 	if (size == 0)
 		return;
 
 	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp));
 
 	for (i = 0; i < numbufs; i++) {
 		dmu_buf_t *db = dbp[i];
 
 		dmu_buf_will_not_fill(db, tx);
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 }
 
 void
 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx)
 {
 	dmu_buf_t *db;
 
 	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
 	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
 	VERIFY0(dmu_buf_hold_noread(os, object, offset,
 	    FTAG, &db));
 
 	dmu_buf_write_embedded(db,
 	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
 	    uncompressed_size, compressed_size, byteorder, tx);
 
 	dmu_buf_rele(db, FTAG);
 }
 
 /*
  * DMU support for xuio
  */
 kstat_t *xuio_ksp = NULL;
 
 int
 dmu_xuio_init(xuio_t *xuio, int nblk)
 {
 	dmu_xuio_t *priv;
 	uio_t *uio = &xuio->xu_uio;
 
 	uio->uio_iovcnt = nblk;
 	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);
 
 	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
 	priv->cnt = nblk;
 	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
 	priv->iovp = uio->uio_iov;
 	XUIO_XUZC_PRIV(xuio) = priv;
 
 	if (XUIO_XUZC_RW(xuio) == UIO_READ)
 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
 	else
 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);
 
 	return (0);
 }
 
 void
 dmu_xuio_fini(xuio_t *xuio)
 {
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 	int nblk = priv->cnt;
 
 	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
 	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
 	kmem_free(priv, sizeof (dmu_xuio_t));
 
 	if (XUIO_XUZC_RW(xuio) == UIO_READ)
 		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
 	else
 		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
 }
 
 /*
  * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
  * and increase priv->next by 1.
  */
 int
 dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
 {
 	struct iovec *iov;
 	uio_t *uio = &xuio->xu_uio;
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 	int i = priv->next++;
 
 	ASSERT(i < priv->cnt);
 	ASSERT(off + n <= arc_buf_lsize(abuf));
 	iov = uio->uio_iov + i;
 	iov->iov_base = (char *)abuf->b_data + off;
 	iov->iov_len = n;
 	priv->bufs[i] = abuf;
 	return (0);
 }
 
 int
 dmu_xuio_cnt(xuio_t *xuio)
 {
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 	return (priv->cnt);
 }
 
 arc_buf_t *
 dmu_xuio_arcbuf(xuio_t *xuio, int i)
 {
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 
 	ASSERT(i < priv->cnt);
 	return (priv->bufs[i]);
 }
 
 void
 dmu_xuio_clear(xuio_t *xuio, int i)
 {
 	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
 
 	ASSERT(i < priv->cnt);
 	priv->bufs[i] = NULL;
 }
 
 static void
 xuio_stat_init(void)
 {
 	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
 	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
 	    KSTAT_FLAG_VIRTUAL);
 	if (xuio_ksp != NULL) {
 		xuio_ksp->ks_data = &xuio_stats;
 		kstat_install(xuio_ksp);
 	}
 }
 
 static void
 xuio_stat_fini(void)
 {
 	if (xuio_ksp != NULL) {
 		kstat_delete(xuio_ksp);
 		xuio_ksp = NULL;
 	}
 }
 
 void
 xuio_stat_wbuf_copied(void)
 {
 	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
 }
 
 void
 xuio_stat_wbuf_nocopy(void)
 {
 	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
 }
 
 #ifdef _KERNEL
 static int
 dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i, err;
 	xuio_t *xuio = NULL;
 
 	/*
 	 * NB: we could do this block-at-a-time, but it's nice
 	 * to be reading in parallel.
 	 */
 	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
 	    TRUE, FTAG, &numbufs, &dbp, 0);
 	if (err)
 		return (err);
 
 	if (uio->uio_extflg == UIO_XUIO)
 		xuio = (xuio_t *)uio;
 
 	for (i = 0; i < numbufs; i++) {
 		int tocpy;
 		int bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 
 		if (xuio) {
 			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
 			arc_buf_t *dbuf_abuf = dbi->db_buf;
 			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
 			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
 			if (!err) {
 				uio->uio_resid -= tocpy;
 				uio->uio_loffset += tocpy;
 			}
 
 			if (abuf == dbuf_abuf)
 				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
 			else
 				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
 		} else {
 			err = uiomove((char *)db->db_data + bufoff, tocpy,
 			    UIO_READ, uio);
 		}
 		if (err)
 			break;
 
 		size -= tocpy;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 
 	return (err);
 }
 
 /*
  * Read 'size' bytes into the uio buffer.
  * From object zdb->db_object.
  * Starting at offset uio->uio_loffset.
  *
  * If the caller already has a dbuf in the target object
  * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
  * because we don't have to find the dnode_t for the object.
  */
 int
 dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_read_uio_dnode(dn, uio, size);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Read 'size' bytes into the uio buffer.
  * From the specified object
  * Starting at offset uio->uio_loffset.
  */
 int
 dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size)
 {
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	err = dmu_read_uio_dnode(dn, uio, size);
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 static int
 dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs;
 	int err = 0;
 	int i;
 
 	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
 	    FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
 	if (err)
 		return (err);
 
 	for (i = 0; i < numbufs; i++) {
 		int tocpy;
 		int bufoff;
 		dmu_buf_t *db = dbp[i];
 
 		ASSERT(size > 0);
 
 		bufoff = uio->uio_loffset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		/*
 		 * XXX uiomove could block forever (eg. nfs-backed
 		 * pages).  There needs to be a uiolockdown() function
 		 * to lock the pages in memory, so that uiomove won't
 		 * block.
 		 */
 		err = uiomove((char *)db->db_data + bufoff, tocpy,
 		    UIO_WRITE, uio);
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
 
 		if (err)
 			break;
 
 		size -= tocpy;
 	}
 
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (err);
 }
 
 /*
  * Write 'size' bytes from the uio buffer.
  * To object zdb->db_object.
  * Starting at offset uio->uio_loffset.
  *
  * If the caller already has a dbuf in the target object
  * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(),
  * because we don't have to find the dnode_t for the object.
  */
 int
 dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	err = dmu_write_uio_dnode(dn, uio, size, tx);
 	DB_DNODE_EXIT(db);
 
 	return (err);
 }
 
 /*
  * Write 'size' bytes from the uio buffer.
  * To the specified object.
  * Starting at offset uio->uio_loffset.
  */
 int
 dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 
 	err = dmu_write_uio_dnode(dn, uio, size, tx);
 
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 int
 dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
     page_t *pp, dmu_tx_t *tx)
 {
 	dmu_buf_t **dbp;
 	int numbufs, i;
 	int err;
 
 	if (size == 0)
 		return (0);
 
 	err = dmu_buf_hold_array(os, object, offset, size,
 	    FALSE, FTAG, &numbufs, &dbp);
 	if (err)
 		return (err);
 
 	for (i = 0; i < numbufs; i++) {
 		int tocpy, copied, thiscpy;
 		int bufoff;
 		dmu_buf_t *db = dbp[i];
 		caddr_t va;
 
 		ASSERT(size > 0);
 		ASSERT3U(db->db_size, >=, PAGESIZE);
 
 		bufoff = offset - db->db_offset;
 		tocpy = (int)MIN(db->db_size - bufoff, size);
 
 		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);
 
 		if (tocpy == db->db_size)
 			dmu_buf_will_fill(db, tx);
 		else
 			dmu_buf_will_dirty(db, tx);
 
 		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
 			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
 			thiscpy = MIN(PAGESIZE, tocpy - copied);
 			va = zfs_map_page(pp, S_READ);
 			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
 			zfs_unmap_page(pp, va);
 			pp = pp->p_next;
 			bufoff += PAGESIZE;
 		}
 
 		if (tocpy == db->db_size)
 			dmu_buf_fill_done(db, tx);
 
 		offset += tocpy;
 		size -= tocpy;
 	}
 	dmu_buf_rele_array(dbp, numbufs, FTAG);
 	return (err);
 }
 #endif
 
 /*
  * Allocate a loaned anonymous arc buffer.
  */
 arc_buf_t *
 dmu_request_arcbuf(dmu_buf_t *handle, int size)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;
 
 	return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
 }
 
 /*
  * Free a loaned arc buffer.
  */
 void
 dmu_return_arcbuf(arc_buf_t *buf)
 {
 	arc_return_buf(buf, FTAG);
 	arc_buf_destroy(buf, FTAG);
 }
 
 /*
  * When possible directly assign passed loaned arc buffer to a dbuf.
  * If this is not possible copy the contents of passed arc buf via
  * dmu_write().
  */
 void
 dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
     dmu_tx_t *tx)
 {
 	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;
 	dnode_t *dn;
 	dmu_buf_impl_t *db;
 	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
 	uint64_t blkid;
 
 	DB_DNODE_ENTER(dbuf);
 	dn = DB_DNODE(dbuf);
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	blkid = dbuf_whichblock(dn, 0, offset);
 	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
 	rw_exit(&dn->dn_struct_rwlock);
 	DB_DNODE_EXIT(dbuf);
 
 	/*
 	 * We can only assign if the offset is aligned, the arc buf is the
 	 * same size as the dbuf, and the dbuf is not metadata.
 	 */
 	if (offset == db->db.db_offset && blksz == db->db.db_size) {
 		dbuf_assign_arcbuf(db, buf, tx);
 		dbuf_rele(db, FTAG);
 	} else {
 		objset_t *os;
 		uint64_t object;
 
 		/* compressed bufs must always be assignable to their dbuf */
 		ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
 		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));
 
 		DB_DNODE_ENTER(dbuf);
 		dn = DB_DNODE(dbuf);
 		os = dn->dn_objset;
 		object = dn->dn_object;
 		DB_DNODE_EXIT(dbuf);
 
 		dbuf_rele(db, FTAG);
 		dmu_write(os, object, offset, blksz, buf->b_data, tx);
 		dmu_return_arcbuf(buf);
 		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
 	}
 }
 
 typedef struct {
 	dbuf_dirty_record_t	*dsa_dr;
 	dmu_sync_cb_t		*dsa_done;
 	zgd_t			*dsa_zgd;
 	dmu_tx_t		*dsa_tx;
 } dmu_sync_arg_t;
 
 /* ARGSUSED */
 static void
 dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	dmu_sync_arg_t *dsa = varg;
 	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
 	blkptr_t *bp = zio->io_bp;
 
 	if (zio->io_error == 0) {
 		if (BP_IS_HOLE(bp)) {
 			/*
 			 * A block of zeros may compress to a hole, but the
 			 * block size still needs to be known for replay.
 			 */
 			BP_SET_LSIZE(bp, db->db_size);
 		} else if (!BP_IS_EMBEDDED(bp)) {
 			ASSERT(BP_GET_LEVEL(bp) == 0);
 			bp->blk_fill = 1;
 		}
 	}
 }
 
 static void
 dmu_sync_late_arrival_ready(zio_t *zio)
 {
 	dmu_sync_ready(zio, NULL, zio->io_private);
 }
 
 /* ARGSUSED */
 static void
 dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
 {
 	dmu_sync_arg_t *dsa = varg;
 	dbuf_dirty_record_t *dr = dsa->dsa_dr;
 	dmu_buf_impl_t *db = dr->dr_dbuf;
 
 	mutex_enter(&db->db_mtx);
 	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
 	if (zio->io_error == 0) {
 		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
 		if (dr->dt.dl.dr_nopwrite) {
 			blkptr_t *bp = zio->io_bp;
 			blkptr_t *bp_orig = &zio->io_bp_orig;
 			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);
 
 			ASSERT(BP_EQUAL(bp, bp_orig));
 			VERIFY(BP_EQUAL(bp, db->db_blkptr));
 			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
 			ASSERT(zio_checksum_table[chksum].ci_flags &
 			    ZCHECKSUM_FLAG_NOPWRITE);
 		}
 		dr->dt.dl.dr_overridden_by = *zio->io_bp;
 		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
 		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;
 
 		/*
 		 * Old style holes are filled with all zeros, whereas
 		 * new-style holes maintain their lsize, type, level,
 		 * and birth time (see zio_write_compress). While we
 		 * need to reset the BP_SET_LSIZE() call that happened
 		 * in dmu_sync_ready for old style holes, we do *not*
 		 * want to wipe out the information contained in new
 		 * style holes. Thus, only zero out the block pointer if
 		 * it's an old style hole.
 		 */
 		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
 		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
 			BP_ZERO(&dr->dt.dl.dr_overridden_by);
 	} else {
 		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
 	}
 	cv_broadcast(&db->db_changed);
 	mutex_exit(&db->db_mtx);
 
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 static void
 dmu_sync_late_arrival_done(zio_t *zio)
 {
 	blkptr_t *bp = zio->io_bp;
 	dmu_sync_arg_t *dsa = zio->io_private;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 
 	if (zio->io_error == 0 && !BP_IS_HOLE(bp)) {
 		ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
 		ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
 		ASSERT(zio->io_bp->blk_birth == zio->io_txg);
 		ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
 		zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
 	}
 
 	dmu_tx_commit(dsa->dsa_tx);
 
 	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);
 
 	abd_put(zio->io_abd);
 	kmem_free(dsa, sizeof (*dsa));
 }
 
 static int
 dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
     zio_prop_t *zp, zbookmark_phys_t *zb)
 {
 	dmu_sync_arg_t *dsa;
 	dmu_tx_t *tx;
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
 	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 		dmu_tx_abort(tx);
 		/* Make zl_get_data do txg_waited_synced() */
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * In order to prevent the zgd's lwb from being free'd prior to
 	 * dmu_sync_late_arrival_done() being called, we have to ensure
 	 * the lwb's "max txg" takes this tx's txg into account.
 	 */
 	zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));
 
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = NULL;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = tx;
 
 	/*
 	 * Since we are currently syncing this txg, it's nontrivial to
 	 * determine what BP to nopwrite against, so we disable nopwrite.
 	 *
 	 * When syncing, the db_blkptr is initially the BP of the previous
 	 * txg.  We can not nopwrite against it because it will be changed
 	 * (this is similar to the non-late-arrival case where the dbuf is
 	 * dirty in a future txg).
 	 *
 	 * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
 	 * We can not nopwrite against it because although the BP will not
 	 * (typically) be changed, the data has not yet been persisted to this
 	 * location.
 	 *
 	 * Finally, when dbuf_write_done() is called, it is theoretically
 	 * possible to always nopwrite, because the data that was written in
 	 * this txg is the same data that we are trying to write.  However we
 	 * would need to check that this dbuf is not dirty in any future
 	 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
 	 * don't nopwrite in this case.
 	 */
 	zp->zp_nopwrite = B_FALSE;
 
 	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
 	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
 	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
 	    dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
 	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));
 
 	return (0);
 }
 
 /*
  * Intent log support: sync the block associated with db to disk.
  * N.B. and XXX: the caller is responsible for making sure that the
  * data isn't changing while dmu_sync() is writing it.
  *
  * Return values:
  *
  *	EEXIST: this txg has already been synced, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
  *		The caller should not log the write.
  *
  *	EALREADY: this block is already in the process of being synced.
  *		The caller should track its progress (somehow).
  *
  *	EIO: could not do the I/O.
  *		The caller should do a txg_wait_synced().
  *
  *	0: the I/O has been initiated.
  *		The caller should log this blkptr in the done callback.
  *		It is possible that the I/O will fail, in which case
  *		the error will be reported to the done callback and
  *		propagated to pio from zio_done().
  */
 int
 dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
 	objset_t *os = db->db_objset;
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	dbuf_dirty_record_t *dr;
 	dmu_sync_arg_t *dsa;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	dnode_t *dn;
 
 	ASSERT(pio != NULL);
 	ASSERT(txg != 0);
 
 	SET_BOOKMARK(&zb, ds->ds_object,
 	    db->db.db_object, db->db_level, db->db_blkid);
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
 	DB_DNODE_EXIT(db);
 
 	/*
 	 * If we're frozen (running ziltest), we always need to generate a bp.
 	 */
 	if (txg > spa_freeze_txg(os->os_spa))
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 
 	/*
 	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
 	 * and us.  If we determine that this txg is not yet syncing,
 	 * but it begins to sync a moment later, that's OK because the
 	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
 	 */
 	mutex_enter(&db->db_mtx);
 
 	if (txg <= spa_last_synced_txg(os->os_spa)) {
 		/*
 		 * This txg has already synced.  There's nothing to do.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EEXIST));
 	}
 
 	if (txg <= spa_syncing_txg(os->os_spa)) {
 		/*
 		 * This txg is currently syncing, so we can't mess with
 		 * the dirty record anymore; just write a new log block.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
 	}
 
 	dr = db->db_last_dirty;
 	while (dr && dr->dr_txg != txg)
 		dr = dr->dr_next;
 
 	if (dr == NULL) {
 		/*
 		 * There's no dr for this dbuf, so it must have been freed.
 		 * There's no need to log writes to freed blocks, so we're done.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(ENOENT));
 	}
 
 	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);
 
 	if (db->db_blkptr != NULL) {
 		/*
 		 * We need to fill in zgd_bp with the current blkptr so that
 		 * the nopwrite code can check if we're writing the same
 		 * data that's already on disk.  We can only nopwrite if we
 		 * are sure that after making the copy, db_blkptr will not
 		 * change until our i/o completes.  We ensure this by
 		 * holding the db_mtx, and only allowing nopwrite if the
 		 * block is not already dirty (see below).  This is verified
 		 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
 		 * not changed.
 		 */
 		*zgd->zgd_bp = *db->db_blkptr;
 	}
 
 	/*
 	 * Assume the on-disk data is X, the current syncing data (in
 	 * txg - 1) is Y, and the current in-memory data is Z (currently
 	 * in dmu_sync).
 	 *
 	 * We usually want to perform a nopwrite if X and Z are the
 	 * same.  However, if Y is different (i.e. the BP is going to
 	 * change before this write takes effect), then a nopwrite will
 	 * be incorrect - we would override with X, which could have
 	 * been freed when Y was written.
 	 *
 	 * (Note that this is not a concern when we are nop-writing from
 	 * syncing context, because X and Y must be identical, because
 	 * all previous txgs have been synced.)
 	 *
 	 * Therefore, we disable nopwrite if the current BP could change
 	 * before this TXG.  There are two ways it could change: by
 	 * being dirty (dr_next is non-NULL), or by being freed
 	 * (dnode_block_freed()).  This behavior is verified by
 	 * zio_done(), which VERIFYs that the override BP is identical
 	 * to the on-disk BP.
 	 */
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 	if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
 		zp.zp_nopwrite = B_FALSE;
 	DB_DNODE_EXIT(db);
 
 	ASSERT(dr->dr_txg == txg);
 	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
 	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
 		/*
 		 * We have already issued a sync write for this buffer,
 		 * or this buffer has already been synced.  It could not
 		 * have been dirtied since, or we would have cleared the state.
 		 */
 		mutex_exit(&db->db_mtx);
 		return (SET_ERROR(EALREADY));
 	}
 
 	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
 	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
 	mutex_exit(&db->db_mtx);
 
 	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
 	dsa->dsa_dr = dr;
 	dsa->dsa_done = done;
 	dsa->dsa_zgd = zgd;
 	dsa->dsa_tx = NULL;
 
 	zio_nowait(arc_write(pio, os->os_spa, txg,
 	    zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
 	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
 	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));
 
 	return (0);
 }
 
 int
 dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 	int err;
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err)
 		return (err);
 	err = dnode_set_blksz(dn, size, ibs, tx);
 	dnode_rele(dn, FTAG);
 	return (err);
 }
 
 void
 dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Send streams include each object's checksum function.  This
 	 * check ensures that the receiving system can understand the
 	 * checksum function transmitted.
 	 */
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
 	dn->dn_checksum = checksum;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 void
 dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	/*
 	 * Send streams include each object's compression function.  This
 	 * check ensures that the receiving system can understand the
 	 * compression function transmitted.
 	 */
 	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);
 
 	VERIFY0(dnode_hold(os, object, FTAG, &dn));
 	dn->dn_compress = compress;
 	dnode_setdirty(dn, tx);
 	dnode_rele(dn, FTAG);
 }
 
 int zfs_mdcomp_disable = 0;
 
 /*
  * When the "redundant_metadata" property is set to "most", only indirect
  * blocks of this level and higher will have an additional ditto block.
  */
 int zfs_redundant_metadata_most_ditto_level = 2;
 
 void
 dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
 {
 	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
 	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
 	    (wp & WP_SPILL));
 	enum zio_checksum checksum = os->os_checksum;
 	enum zio_compress compress = os->os_compress;
 	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
 	boolean_t dedup = B_FALSE;
 	boolean_t nopwrite = B_FALSE;
 	boolean_t dedup_verify = os->os_dedup_verify;
 	int copies = os->os_copies;
 
 	/*
 	 * We maintain different write policies for each of the following
 	 * types of data:
 	 *	 1. metadata
 	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
 	 *	 3. all other level 0 blocks
 	 */
 	if (ismd) {
 		if (zfs_mdcomp_disable) {
 			compress = ZIO_COMPRESS_EMPTY;
 		} else {
 			/*
 			 * XXX -- we should design a compression algorithm
 			 * that specializes in arrays of bps.
 			 */
 			compress = zio_compress_select(os->os_spa,
 			    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
 		}
 
 		/*
 		 * Metadata always gets checksummed.  If the data
 		 * checksum is multi-bit correctable, and it's not a
 		 * ZBT-style checksum, then it's suitable for metadata
 		 * as well.  Otherwise, the metadata checksum defaults
 		 * to fletcher4.
 		 */
 		if (!(zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_METADATA) ||
 		    (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_EMBEDDED))
 			checksum = ZIO_CHECKSUM_FLETCHER_4;
 
 		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
 		    (os->os_redundant_metadata ==
 		    ZFS_REDUNDANT_METADATA_MOST &&
 		    (level >= zfs_redundant_metadata_most_ditto_level ||
 		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
 			copies++;
 	} else if (wp & WP_NOFILL) {
 		ASSERT(level == 0);
 
 		/*
 		 * If we're writing preallocated blocks, we aren't actually
 		 * writing them so don't set any policy properties.  These
 		 * blocks are currently only used by an external subsystem
 		 * outside of zfs (i.e. dump) and not written by the zio
 		 * pipeline.
 		 */
 		compress = ZIO_COMPRESS_OFF;
 		checksum = ZIO_CHECKSUM_NOPARITY;
 	} else {
 		compress = zio_compress_select(os->os_spa, dn->dn_compress,
 		    compress);
 
 		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
 		    zio_checksum_select(dn->dn_checksum, checksum) :
 		    dedup_checksum;
 
 		/*
 		 * Determine dedup setting.  If we are in dmu_sync(),
 		 * we won't actually dedup now because that's all
 		 * done in syncing context; but we do want to use the
 		 * dedup checkum.  If the checksum is not strong
 		 * enough to ensure unique signatures, force
 		 * dedup_verify.
 		 */
 		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
 			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
 			if (!(zio_checksum_table[checksum].ci_flags &
 			    ZCHECKSUM_FLAG_DEDUP))
 				dedup_verify = B_TRUE;
 		}
 
 		/*
 		 * Enable nopwrite if we have secure enough checksum
 		 * algorithm (see comment in zio_nop_write) and
 		 * compression is enabled.  We don't enable nopwrite if
 		 * dedup is enabled as the two features are mutually
 		 * exclusive.
 		 */
 		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
 		    ZCHECKSUM_FLAG_NOPWRITE) &&
 		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
 	}
 
 	zp->zp_checksum = checksum;
 	zp->zp_compress = compress;
 	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);
 
 	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
 	zp->zp_level = level;
 	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
 	zp->zp_dedup = dedup;
 	zp->zp_dedup_verify = dedup && dedup_verify;
 	zp->zp_nopwrite = nopwrite;
 }
 
 int
 dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
 {
 	dnode_t *dn;
 	int err;
 
 	/*
 	 * Sync any current changes before
 	 * we go trundling through the block pointers.
 	 */
 	err = dmu_object_wait_synced(os, object);
 	if (err) {
 		return (err);
 	}
 
 	err = dnode_hold(os, object, FTAG, &dn);
 	if (err) {
 		return (err);
 	}
 
 	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
 	dnode_rele(dn, FTAG);
 
 	return (err);
 }
 
 /*
  * Given the ZFS object, if it contains any dirty nodes
  * this function flushes all dirty blocks to disk. This
  * ensures the DMU object info is updated. A more efficient
  * future version might just find the TXG with the maximum
  * ID and wait for that to be synced.
  */
 int
 dmu_object_wait_synced(objset_t *os, uint64_t object)
 {
 	dnode_t *dn;
 	int error, i;
 
 	error = dnode_hold(os, object, FTAG, &dn);
 	if (error) {
 		return (error);
 	}
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		if (list_link_active(&dn->dn_dirty_link[i])) {
 			break;
 		}
 	}
 	dnode_rele(dn, FTAG);
 	if (i != TXG_SIZE) {
 		txg_wait_synced(dmu_objset_pool(os), 0);
 	}
 
 	return (0);
 }
 
 void
 dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi)
 {
 	dnode_phys_t *dnp;
 
 	rw_enter(&dn->dn_struct_rwlock, RW_READER);
 	mutex_enter(&dn->dn_mtx);
 
 	dnp = dn->dn_phys;
 
 	doi->doi_data_block_size = dn->dn_datablksz;
 	doi->doi_metadata_block_size = dn->dn_indblkshift ?
 	    1ULL << dn->dn_indblkshift : 0;
 	doi->doi_type = dn->dn_type;
 	doi->doi_bonus_type = dn->dn_bonustype;
 	doi->doi_bonus_size = dn->dn_bonuslen;
 	doi->doi_indirection = dn->dn_nlevels;
 	doi->doi_checksum = dn->dn_checksum;
 	doi->doi_compress = dn->dn_compress;
 	doi->doi_nblkptr = dn->dn_nblkptr;
 	doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9;
 	doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz;
 	doi->doi_fill_count = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
 		doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]);
 
 	mutex_exit(&dn->dn_mtx);
 	rw_exit(&dn->dn_struct_rwlock);
 }
 
 /*
  * Get information on a DMU object.
  * If doi is NULL, just indicates whether the object exists.
  */
 int
 dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi)
 {
 	dnode_t *dn;
 	int err = dnode_hold(os, object, FTAG, &dn);
 
 	if (err)
 		return (err);
 
 	if (doi != NULL)
 		dmu_object_info_from_dnode(dn, doi);
 
 	dnode_rele(dn, FTAG);
 	return (0);
 }
 
 /*
  * As above, but faster; can be used when you have a held dbuf in hand.
  */
 void
 dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 
 	DB_DNODE_ENTER(db);
 	dmu_object_info_from_dnode(DB_DNODE(db), doi);
 	DB_DNODE_EXIT(db);
 }
 
 /*
  * Faster still when you only care about the size.
  * This is specifically optimized for zfs_getattr().
  */
 void
 dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
     u_longlong_t *nblk512)
 {
 	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
 	dnode_t *dn;
 
 	DB_DNODE_ENTER(db);
 	dn = DB_DNODE(db);
 
 	*blksize = dn->dn_datablksz;
 	/* add 1 for dnode space */
 	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
 	    SPA_MINBLOCKSHIFT) + 1;
 	DB_DNODE_EXIT(db);
 }
 
 void
 byteswap_uint64_array(void *vbuf, size_t size)
 {
 	uint64_t *buf = vbuf;
 	size_t count = size >> 3;
 	int i;
 
 	ASSERT((size & 7) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_64(buf[i]);
 }
 
 void
 byteswap_uint32_array(void *vbuf, size_t size)
 {
 	uint32_t *buf = vbuf;
 	size_t count = size >> 2;
 	int i;
 
 	ASSERT((size & 3) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_32(buf[i]);
 }
 
 void
 byteswap_uint16_array(void *vbuf, size_t size)
 {
 	uint16_t *buf = vbuf;
 	size_t count = size >> 1;
 	int i;
 
 	ASSERT((size & 1) == 0);
 
 	for (i = 0; i < count; i++)
 		buf[i] = BSWAP_16(buf[i]);
 }
 
 /* ARGSUSED */
 void
 byteswap_uint8_array(void *vbuf, size_t size)
 {
 }
 
 void
 dmu_init(void)
 {
 	abd_init();
 	zfs_dbgmsg_init();
 	sa_cache_init();
 	xuio_stat_init();
 	dmu_objset_init();
 	dnode_init();
 	zfetch_init();
 	l2arc_init();
 	arc_init();
 	dbuf_init();
 }
 
 void
 dmu_fini(void)
 {
 	arc_fini(); /* arc depends on l2arc, so arc must go first */
 	l2arc_fini();
 	zfetch_fini();
 	dbuf_fini();
 	dnode_fini();
 	dmu_objset_fini();
 	xuio_stat_fini();
 	sa_cache_fini();
 	zfs_dbgmsg_fini();
 	abd_fini();
 }
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c	(revision 336957)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu_objset.c	(revision 336958)
@@ -1,2347 +1,2355 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2013, Joyent, Inc. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2015, STRATO AG, Inc. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2017 Nexenta Systems, Inc.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/cred.h>
 #include <sys/zfs_context.h>
 #include <sys/dmu_objset.h>
 #include <sys/dsl_dir.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_pool.h>
 #include <sys/dsl_synctask.h>
 #include <sys/dsl_deleg.h>
 #include <sys/dnode.h>
 #include <sys/dbuf.h>
 #include <sys/zvol.h>
 #include <sys/dmu_tx.h>
 #include <sys/zap.h>
 #include <sys/zil.h>
 #include <sys/dmu_impl.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/sa.h>
 #include <sys/zfs_onexit.h>
 #include <sys/dsl_destroy.h>
 #include <sys/vdev.h>
 #include <sys/zfeature.h>
 
 /*
  * Needed to close a window in dnode_move() that allows the objset to be freed
  * before it can be safely accessed.
  */
 krwlock_t os_lock;
 
 /*
  * Tunable to overwrite the maximum number of threads for the parallization
  * of dmu_objset_find_dp, needed to speed up the import of pools with many
  * datasets.
  * Default is 4 times the number of leaf vdevs.
  */
 int dmu_find_threads = 0;
 
 /*
  * Backfill lower metadnode objects after this many have been freed.
  * Backfilling negatively impacts object creation rates, so only do it
  * if there are enough holes to fill.
  */
 int dmu_rescan_dnode_threshold = 131072;
 
 static void dmu_objset_find_dp_cb(void *arg);
 
 void
 dmu_objset_init(void)
 {
 	rw_init(&os_lock, NULL, RW_DEFAULT, NULL);
 }
 
 void
 dmu_objset_fini(void)
 {
 	rw_destroy(&os_lock);
 }
 
 spa_t *
 dmu_objset_spa(objset_t *os)
 {
 	return (os->os_spa);
 }
 
 zilog_t *
 dmu_objset_zil(objset_t *os)
 {
 	return (os->os_zil);
 }
 
 dsl_pool_t *
 dmu_objset_pool(objset_t *os)
 {
 	dsl_dataset_t *ds;
 
 	if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir)
 		return (ds->ds_dir->dd_pool);
 	else
 		return (spa_get_dsl(os->os_spa));
 }
 
 dsl_dataset_t *
 dmu_objset_ds(objset_t *os)
 {
 	return (os->os_dsl_dataset);
 }
 
 dmu_objset_type_t
 dmu_objset_type(objset_t *os)
 {
 	return (os->os_phys->os_type);
 }
 
 void
 dmu_objset_name(objset_t *os, char *buf)
 {
 	dsl_dataset_name(os->os_dsl_dataset, buf);
 }
 
 uint64_t
 dmu_objset_id(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	return (ds ? ds->ds_object : 0);
 }
 
 zfs_sync_type_t
 dmu_objset_syncprop(objset_t *os)
 {
 	return (os->os_sync);
 }
 
 zfs_logbias_op_t
 dmu_objset_logbias(objset_t *os)
 {
 	return (os->os_logbias);
 }
 
 static void
 checksum_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance should have been done by now.
 	 */
 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
 	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
 }
 
 static void
 compression_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval != ZIO_COMPRESS_INHERIT);
 
 	os->os_compress = zio_compress_select(os->os_spa, newval,
 	    ZIO_COMPRESS_ON);
 }
 
 static void
 copies_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval > 0);
 	ASSERT(newval <= spa_max_replication(os->os_spa));
 
 	os->os_copies = newval;
 }
 
 static void
 dedup_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 	spa_t *spa = os->os_spa;
 	enum zio_checksum checksum;
 
 	/*
 	 * Inheritance should have been done by now.
 	 */
 	ASSERT(newval != ZIO_CHECKSUM_INHERIT);
 
 	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);
 
 	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
 	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
 }
 
 static void
 primary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 	    newval == ZFS_CACHE_METADATA);
 
 	os->os_primary_cache = newval;
 }
 
 static void
 secondary_cache_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
 	    newval == ZFS_CACHE_METADATA);
 
 	os->os_secondary_cache = newval;
 }
 
 static void
 sync_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
 	    newval == ZFS_SYNC_DISABLED);
 
 	os->os_sync = newval;
 	if (os->os_zil)
 		zil_set_sync(os->os_zil, newval);
 }
 
 static void
 redundant_metadata_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	/*
 	 * Inheritance and range checking should have been done by now.
 	 */
 	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
 	    newval == ZFS_REDUNDANT_METADATA_MOST);
 
 	os->os_redundant_metadata = newval;
 }
 
 static void
 logbias_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
 	    newval == ZFS_LOGBIAS_THROUGHPUT);
 	os->os_logbias = newval;
 	if (os->os_zil)
 		zil_set_logbias(os->os_zil, newval);
 }
 
 static void
 recordsize_changed_cb(void *arg, uint64_t newval)
 {
 	objset_t *os = arg;
 
 	os->os_recordsize = newval;
 }
 
 void
 dmu_objset_byteswap(void *buf, size_t size)
 {
 	objset_phys_t *osp = buf;
 
 	ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t));
 	dnode_byteswap(&osp->os_meta_dnode);
 	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
 	osp->os_type = BSWAP_64(osp->os_type);
 	osp->os_flags = BSWAP_64(osp->os_flags);
 	if (size == sizeof (objset_phys_t)) {
 		dnode_byteswap(&osp->os_userused_dnode);
 		dnode_byteswap(&osp->os_groupused_dnode);
 	}
 }
 
 /*
  * The hash is a CRC-based hash of the objset_t pointer and the object number.
  */
 static uint64_t
 dnode_hash(const objset_t *os, uint64_t obj)
 {
 	uintptr_t osv = (uintptr_t)os;
 	uint64_t crc = -1ULL;
 
 	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
 	/*
 	 * The low 6 bits of the pointer don't have much entropy, because
 	 * the objset_t is larger than 2^6 bytes long.
 	 */
 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
 	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];
 
 	crc ^= (osv>>14) ^ (obj>>24);
 
 	return (crc);
 }
 
 unsigned int
 dnode_multilist_index_func(multilist_t *ml, void *obj)
 {
 	dnode_t *dn = obj;
 	return (dnode_hash(dn->dn_objset, dn->dn_object) %
 	    multilist_get_num_sublists(ml));
 }
 
 /*
  * Instantiates the objset_t in-memory structure corresponding to the
  * objset_phys_t that's pointed to by the specified blkptr_t.
  */
 int
 dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     objset_t **osp)
 {
 	objset_t *os;
 	int i, err;
 
 	ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock));
 
 	/*
 	 * The $ORIGIN dataset (if it exists) doesn't have an associated
 	 * objset, so there's no reason to open it. The $ORIGIN dataset
 	 * will not exist on pools older than SPA_VERSION_ORIGIN.
 	 */
 	if (ds != NULL && spa_get_dsl(spa) != NULL &&
 	    spa_get_dsl(spa)->dp_origin_snap != NULL) {
 		ASSERT3P(ds->ds_dir, !=,
 		    spa_get_dsl(spa)->dp_origin_snap->ds_dir);
 	}
 
 	os = kmem_zalloc(sizeof (objset_t), KM_SLEEP);
 	os->os_dsl_dataset = ds;
 	os->os_spa = spa;
 	os->os_rootbp = bp;
 	if (!BP_IS_HOLE(os->os_rootbp)) {
 		arc_flags_t aflags = ARC_FLAG_WAIT;
 		zbookmark_phys_t zb;
 		SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET,
 		    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 
 		if (DMU_OS_IS_L2CACHEABLE(os))
 			aflags |= ARC_FLAG_L2CACHE;
 
 		dprintf_bp(os->os_rootbp, "reading %s", "");
 		err = arc_read(NULL, spa, os->os_rootbp,
 		    arc_getbuf_func, &os->os_phys_buf,
 		    ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb);
 		if (err != 0) {
 			kmem_free(os, sizeof (objset_t));
 			/* convert checksum errors into IO errors */
 			if (err == ECKSUM)
 				err = SET_ERROR(EIO);
 			return (err);
 		}
 
 		/* Increase the blocksize if we are permitted. */
 		if (spa_version(spa) >= SPA_VERSION_USERSPACE &&
 		    arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) {
 			arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf,
 			    ARC_BUFC_METADATA, sizeof (objset_phys_t));
 			bzero(buf->b_data, sizeof (objset_phys_t));
 			bcopy(os->os_phys_buf->b_data, buf->b_data,
 			    arc_buf_size(os->os_phys_buf));
 			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 			os->os_phys_buf = buf;
 		}
 
 		os->os_phys = os->os_phys_buf->b_data;
 		os->os_flags = os->os_phys->os_flags;
 	} else {
 		int size = spa_version(spa) >= SPA_VERSION_USERSPACE ?
 		    sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE;
 		os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf,
 		    ARC_BUFC_METADATA, size);
 		os->os_phys = os->os_phys_buf->b_data;
 		bzero(os->os_phys, size);
 	}
 
 	/*
 	 * Note: the changed_cb will be called once before the register
 	 * func returns, thus changing the checksum/compression from the
 	 * default (fletcher2/off).  Snapshots don't need to know about
 	 * checksum/compression/copies.
 	 */
 	if (ds != NULL) {
 		boolean_t needlock = B_FALSE;
 
 		/*
 		 * Note: it's valid to open the objset if the dataset is
 		 * long-held, in which case the pool_config lock will not
 		 * be held.
 		 */
 		if (!dsl_pool_config_held(dmu_objset_pool(os))) {
 			needlock = B_TRUE;
 			dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 		}
 		err = dsl_prop_register(ds,
 		    zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
 		    primary_cache_changed_cb, os);
 		if (err == 0) {
 			err = dsl_prop_register(ds,
 			    zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
 			    secondary_cache_changed_cb, os);
 		}
 		if (!ds->ds_is_snapshot) {
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_CHECKSUM),
 				    checksum_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_COMPRESSION),
 				    compression_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_COPIES),
 				    copies_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_DEDUP),
 				    dedup_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_LOGBIAS),
 				    logbias_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_SYNC),
 				    sync_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(
 				    ZFS_PROP_REDUNDANT_METADATA),
 				    redundant_metadata_changed_cb, os);
 			}
 			if (err == 0) {
 				err = dsl_prop_register(ds,
 				    zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
 				    recordsize_changed_cb, os);
 			}
 		}
 		if (needlock)
 			dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 		if (err != 0) {
 			arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 			kmem_free(os, sizeof (objset_t));
 			return (err);
 		}
 	} else {
 		/* It's the meta-objset. */
 		os->os_checksum = ZIO_CHECKSUM_FLETCHER_4;
 		os->os_compress = ZIO_COMPRESS_ON;
 		os->os_copies = spa_max_replication(spa);
 		os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
 		os->os_dedup_verify = B_FALSE;
 		os->os_logbias = ZFS_LOGBIAS_LATENCY;
 		os->os_sync = ZFS_SYNC_STANDARD;
 		os->os_primary_cache = ZFS_CACHE_ALL;
 		os->os_secondary_cache = ZFS_CACHE_ALL;
 	}
+	/*
+	 * These properties will be filled in by the logic in zfs_get_zplprop()
+	 * when they are queried for the first time.
+	 */
+	os->os_version = OBJSET_PROP_UNINITIALIZED;
+	os->os_normalization = OBJSET_PROP_UNINITIALIZED;
+	os->os_utf8only = OBJSET_PROP_UNINITIALIZED;
+	os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED;
 
 	if (ds == NULL || !ds->ds_is_snapshot)
 		os->os_zil_header = os->os_phys->os_zil_header;
 	os->os_zil = zil_alloc(os, &os->os_zil_header);
 
 	for (i = 0; i < TXG_SIZE; i++) {
 		os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t),
 		    offsetof(dnode_t, dn_dirty_link[i]),
 		    dnode_multilist_index_func);
 	}
 	list_create(&os->os_dnodes, sizeof (dnode_t),
 	    offsetof(dnode_t, dn_link));
 	list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t),
 	    offsetof(dmu_buf_impl_t, db_link));
 
 	mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	dnode_special_open(os, &os->os_phys->os_meta_dnode,
 	    DMU_META_DNODE_OBJECT, &os->os_meta_dnode);
 	if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) {
 		dnode_special_open(os, &os->os_phys->os_userused_dnode,
 		    DMU_USERUSED_OBJECT, &os->os_userused_dnode);
 		dnode_special_open(os, &os->os_phys->os_groupused_dnode,
 		    DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode);
 	}
 
 	*osp = os;
 	return (0);
 }
 
 int
 dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp)
 {
 	int err = 0;
 
 	/*
 	 * We shouldn't be doing anything with dsl_dataset_t's unless the
 	 * pool_config lock is held, or the dataset is long-held.
 	 */
 	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool) ||
 	    dsl_dataset_long_held(ds));
 
 	mutex_enter(&ds->ds_opening_lock);
 	if (ds->ds_objset == NULL) {
 		objset_t *os;
 		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 		err = dmu_objset_open_impl(dsl_dataset_get_spa(ds),
 		    ds, dsl_dataset_get_blkptr(ds), &os);
 		rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 		if (err == 0) {
 			mutex_enter(&ds->ds_lock);
 			ASSERT(ds->ds_objset == NULL);
 			ds->ds_objset = os;
 			mutex_exit(&ds->ds_lock);
 		}
 	}
 	*osp = ds->ds_objset;
 	mutex_exit(&ds->ds_opening_lock);
 	return (err);
 }
 
 /*
  * Holds the pool while the objset is held.  Therefore only one objset
  * can be held at a time.
  */
 int
 dmu_objset_hold(const char *name, void *tag, objset_t **osp)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 
 	err = dsl_pool_hold(name, tag, &dp);
 	if (err != 0)
 		return (err);
 	err = dsl_dataset_hold(dp, name, tag, &ds);
 	if (err != 0) {
 		dsl_pool_rele(dp, tag);
 		return (err);
 	}
 
 	err = dmu_objset_from_ds(ds, osp);
 	if (err != 0) {
 		dsl_dataset_rele(ds, tag);
 		dsl_pool_rele(dp, tag);
 	}
 
 	return (err);
 }
 
 static int
 dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
 	int err;
 
 	err = dmu_objset_from_ds(ds, osp);
 	if (err != 0) {
 		dsl_dataset_disown(ds, tag);
 	} else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) {
 		dsl_dataset_disown(ds, tag);
 		return (SET_ERROR(EINVAL));
 	} else if (!readonly && dsl_dataset_is_snapshot(ds)) {
 		dsl_dataset_disown(ds, tag);
 		return (SET_ERROR(EROFS));
 	}
 	return (err);
 }
 
 /*
  * dsl_pool must not be held when this is called.
  * Upon successful return, there will be a longhold on the dataset,
  * and the dsl_pool will not be held.
  */
 int
 dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
 	dsl_pool_t *dp;
 	dsl_dataset_t *ds;
 	int err;
 
 	err = dsl_pool_hold(name, FTAG, &dp);
 	if (err != 0)
 		return (err);
 	err = dsl_dataset_own(dp, name, tag, &ds);
 	if (err != 0) {
 		dsl_pool_rele(dp, FTAG);
 		return (err);
 	}
 	err = dmu_objset_own_impl(ds, type, readonly, tag, osp);
 	dsl_pool_rele(dp, FTAG);
 
 	return (err);
 }
 
 int
 dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp)
 {
 	dsl_dataset_t *ds;
 	int err;
 
 	err = dsl_dataset_own_obj(dp, obj, tag, &ds);
 	if (err != 0)
 		return (err);
 
 	return (dmu_objset_own_impl(ds, type, readonly, tag, osp));
 }
 
 void
 dmu_objset_rele(objset_t *os, void *tag)
 {
 	dsl_pool_t *dp = dmu_objset_pool(os);
 	dsl_dataset_rele(os->os_dsl_dataset, tag);
 	dsl_pool_rele(dp, tag);
 }
 
 /*
  * When we are called, os MUST refer to an objset associated with a dataset
  * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner
  * == tag.  We will then release and reacquire ownership of the dataset while
  * holding the pool config_rwlock to avoid intervening namespace or ownership
  * changes may occur.
  *
  * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
  * release the hold on its dataset and acquire a new one on the dataset of the
  * same name so that it can be partially torn down and reconstructed.
  */
 void
 dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
     void *tag)
 {
 	dsl_pool_t *dp;
 	char name[ZFS_MAX_DATASET_NAME_LEN];
 
 	VERIFY3P(ds, !=, NULL);
 	VERIFY3P(ds->ds_owner, ==, tag);
 	VERIFY(dsl_dataset_long_held(ds));
 
 	dsl_dataset_name(ds, name);
 	dp = ds->ds_dir->dd_pool;
 	dsl_pool_config_enter(dp, FTAG);
 	dsl_dataset_disown(ds, tag);
 	VERIFY0(dsl_dataset_own(dp, name, tag, newds));
 	dsl_pool_config_exit(dp, FTAG);
 }
 
 void
 dmu_objset_disown(objset_t *os, void *tag)
 {
 	dsl_dataset_disown(os->os_dsl_dataset, tag);
 }
 
 void
 dmu_objset_evict_dbufs(objset_t *os)
 {
 	dnode_t dn_marker;
 	dnode_t *dn;
 
 	mutex_enter(&os->os_lock);
 	dn = list_head(&os->os_dnodes);
 	while (dn != NULL) {
 		/*
 		 * Skip dnodes without holds.  We have to do this dance
 		 * because dnode_add_ref() only works if there is already a
 		 * hold.  If the dnode has no holds, then it has no dbufs.
 		 */
 		if (dnode_add_ref(dn, FTAG)) {
 			list_insert_after(&os->os_dnodes, dn, &dn_marker);
 			mutex_exit(&os->os_lock);
 
 			dnode_evict_dbufs(dn);
 			dnode_rele(dn, FTAG);
 
 			mutex_enter(&os->os_lock);
 			dn = list_next(&os->os_dnodes, &dn_marker);
 			list_remove(&os->os_dnodes, &dn_marker);
 		} else {
 			dn = list_next(&os->os_dnodes, dn);
 		}
 	}
 	mutex_exit(&os->os_lock);
 
 	if (DMU_USERUSED_DNODE(os) != NULL) {
 		dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
 		dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
 	}
 	dnode_evict_dbufs(DMU_META_DNODE(os));
 }
 
 /*
  * Objset eviction processing is split into into two pieces.
  * The first marks the objset as evicting, evicts any dbufs that
  * have a refcount of zero, and then queues up the objset for the
  * second phase of eviction.  Once os->os_dnodes has been cleared by
  * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
  * The second phase closes the special dnodes, dequeues the objset from
  * the list of those undergoing eviction, and finally frees the objset.
  *
  * NOTE: Due to asynchronous eviction processing (invocation of
  *       dnode_buf_pageout()), it is possible for the meta dnode for the
  *       objset to have no holds even though os->os_dnodes is not empty.
  */
 void
 dmu_objset_evict(objset_t *os)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 
 	for (int t = 0; t < TXG_SIZE; t++)
 		ASSERT(!dmu_objset_is_dirty(os, t));
 
 	if (ds)
 		dsl_prop_unregister_all(ds, os);
 
 	if (os->os_sa)
 		sa_tear_down(os);
 
 	dmu_objset_evict_dbufs(os);
 
 	mutex_enter(&os->os_lock);
 	spa_evicting_os_register(os->os_spa, os);
 	if (list_is_empty(&os->os_dnodes)) {
 		mutex_exit(&os->os_lock);
 		dmu_objset_evict_done(os);
 	} else {
 		mutex_exit(&os->os_lock);
 	}
 }
 
 void
 dmu_objset_evict_done(objset_t *os)
 {
 	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);
 
 	dnode_special_close(&os->os_meta_dnode);
 	if (DMU_USERUSED_DNODE(os)) {
 		dnode_special_close(&os->os_userused_dnode);
 		dnode_special_close(&os->os_groupused_dnode);
 	}
 	zil_free(os->os_zil);
 
 	arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);
 
 	/*
 	 * This is a barrier to prevent the objset from going away in
 	 * dnode_move() until we can safely ensure that the objset is still in
 	 * use. We consider the objset valid before the barrier and invalid
 	 * after the barrier.
 	 */
 	rw_enter(&os_lock, RW_READER);
 	rw_exit(&os_lock);
 
 	mutex_destroy(&os->os_lock);
 	mutex_destroy(&os->os_userused_lock);
 	mutex_destroy(&os->os_obj_lock);
 	mutex_destroy(&os->os_user_ptr_lock);
 	for (int i = 0; i < TXG_SIZE; i++) {
 		multilist_destroy(os->os_dirty_dnodes[i]);
 	}
 	spa_evicting_os_deregister(os->os_spa, os);
 	kmem_free(os, sizeof (objset_t));
 }
 
 timestruc_t
 dmu_objset_snap_cmtime(objset_t *os)
 {
 	return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir));
 }
 
 /* called from dsl for meta-objset */
 objset_t *
 dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
     dmu_objset_type_t type, dmu_tx_t *tx)
 {
 	objset_t *os;
 	dnode_t *mdn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	if (ds != NULL)
 		VERIFY0(dmu_objset_from_ds(ds, &os));
 	else
 		VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os));
 
 	mdn = DMU_META_DNODE(os);
 
 	dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT,
 	    DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx);
 
 	/*
 	 * We don't want to have to increase the meta-dnode's nlevels
 	 * later, because then we could do it in quescing context while
 	 * we are also accessing it in open context.
 	 *
 	 * This precaution is not necessary for the MOS (ds == NULL),
 	 * because the MOS is only updated in syncing context.
 	 * This is most fortunate: the MOS is the only objset that
 	 * needs to be synced multiple times as spa_sync() iterates
 	 * to convergence, so minimizing its dn_nlevels matters.
 	 */
 	if (ds != NULL) {
 		int levels = 1;
 
 		/*
 		 * Determine the number of levels necessary for the meta-dnode
 		 * to contain DN_MAX_OBJECT dnodes.  Note that in order to
 		 * ensure that we do not overflow 64 bits, there has to be
 		 * a nlevels that gives us a number of blocks > DN_MAX_OBJECT
 		 * but < 2^64.  Therefore,
 		 * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) (10) must be
 		 * less than (64 - log2(DN_MAX_OBJECT)) (16).
 		 */
 		while ((uint64_t)mdn->dn_nblkptr <<
 		    (mdn->dn_datablkshift - DNODE_SHIFT +
 		    (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) <
 		    DN_MAX_OBJECT)
 			levels++;
 
 		mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] =
 		    mdn->dn_nlevels = levels;
 	}
 
 	ASSERT(type != DMU_OST_NONE);
 	ASSERT(type != DMU_OST_ANY);
 	ASSERT(type < DMU_OST_NUMTYPES);
 	os->os_phys->os_type = type;
 	if (dmu_objset_userused_enabled(os)) {
 		os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 		os->os_flags = os->os_phys->os_flags;
 	}
 
 	dsl_dataset_dirty(ds, tx);
 
 	return (os);
 }
 
 typedef struct dmu_objset_create_arg {
 	const char *doca_name;
 	cred_t *doca_cred;
 	void (*doca_userfunc)(objset_t *os, void *arg,
 	    cred_t *cr, dmu_tx_t *tx);
 	void *doca_userarg;
 	dmu_objset_type_t doca_type;
 	uint64_t doca_flags;
 } dmu_objset_create_arg_t;
 
 /*ARGSUSED*/
 static int
 dmu_objset_create_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_create_arg_t *doca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *pdd;
 	const char *tail;
 	int error;
 
 	if (strchr(doca->doca_name, '@') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail);
 	if (error != 0)
 		return (error);
 	if (tail == NULL) {
 		dsl_dir_rele(pdd, FTAG);
 		return (SET_ERROR(EEXIST));
 	}
 	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 	    doca->doca_cred);
 	dsl_dir_rele(pdd, FTAG);
 
 	return (error);
 }
 
 static void
 dmu_objset_create_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_create_arg_t *doca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *pdd;
 	const char *tail;
 	dsl_dataset_t *ds;
 	uint64_t obj;
 	blkptr_t *bp;
 	objset_t *os;
 
 	VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail));
 
 	obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags,
 	    doca->doca_cred, tx);
 
 	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
 	bp = dsl_dataset_get_blkptr(ds);
 	os = dmu_objset_create_impl(pdd->dd_pool->dp_spa,
 	    ds, bp, doca->doca_type, tx);
 	rrw_exit(&ds->ds_bp_rwlock, FTAG);
 
 	if (doca->doca_userfunc != NULL) {
 		doca->doca_userfunc(os, doca->doca_userarg,
 		    doca->doca_cred, tx);
 	}
 
 	spa_history_log_internal_ds(ds, "create", tx, "");
 	dsl_dataset_rele(ds, FTAG);
 	dsl_dir_rele(pdd, FTAG);
 }
 
 int
 dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg)
 {
 	dmu_objset_create_arg_t doca;
 
 	doca.doca_name = name;
 	doca.doca_cred = CRED();
 	doca.doca_flags = flags;
 	doca.doca_userfunc = func;
 	doca.doca_userarg = arg;
 	doca.doca_type = type;
 
 	return (dsl_sync_task(name,
 	    dmu_objset_create_check, dmu_objset_create_sync, &doca,
 	    5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 typedef struct dmu_objset_clone_arg {
 	const char *doca_clone;
 	const char *doca_origin;
 	cred_t *doca_cred;
 } dmu_objset_clone_arg_t;
 
 /*ARGSUSED*/
 static int
 dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_clone_arg_t *doca = arg;
 	dsl_dir_t *pdd;
 	const char *tail;
 	int error;
 	dsl_dataset_t *origin;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 
 	if (strchr(doca->doca_clone, '@') != NULL)
 		return (SET_ERROR(EINVAL));
 
 	if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
 		return (SET_ERROR(ENAMETOOLONG));
 
 	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
 	if (error != 0)
 		return (error);
 	if (tail == NULL) {
 		dsl_dir_rele(pdd, FTAG);
 		return (SET_ERROR(EEXIST));
 	}
 
 	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
 	    doca->doca_cred);
 	if (error != 0) {
 		dsl_dir_rele(pdd, FTAG);
 		return (SET_ERROR(EDQUOT));
 	}
 	dsl_dir_rele(pdd, FTAG);
 
 	error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
 	if (error != 0)
 		return (error);
 
 	/* You can only clone snapshots, not the head datasets. */
 	if (!origin->ds_is_snapshot) {
 		dsl_dataset_rele(origin, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 	dsl_dataset_rele(origin, FTAG);
 
 	return (0);
 }
 
 static void
 dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
 {
 	dmu_objset_clone_arg_t *doca = arg;
 	dsl_pool_t *dp = dmu_tx_pool(tx);
 	dsl_dir_t *pdd;
 	const char *tail;
 	dsl_dataset_t *origin, *ds;
 	uint64_t obj;
 	char namebuf[ZFS_MAX_DATASET_NAME_LEN];
 
 	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
 	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));
 
 	obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
 	    doca->doca_cred, tx);
 
 	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
 	dsl_dataset_name(origin, namebuf);
 	spa_history_log_internal_ds(ds, "clone", tx,
 	    "origin=%s (%llu)", namebuf, origin->ds_object);
 	dsl_dataset_rele(ds, FTAG);
 	dsl_dataset_rele(origin, FTAG);
 	dsl_dir_rele(pdd, FTAG);
 }
 
 int
 dmu_objset_clone(const char *clone, const char *origin)
 {
 	dmu_objset_clone_arg_t doca;
 
 	doca.doca_clone = clone;
 	doca.doca_origin = origin;
 	doca.doca_cred = CRED();
 
 	return (dsl_sync_task(clone,
 	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
 	    5, ZFS_SPACE_CHECK_NORMAL));
 }
 
 static int
 dmu_objset_remap_indirects_impl(objset_t *os, uint64_t last_removed_txg)
 {
 	int error = 0;
 	uint64_t object = 0;
 	while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) {
 		error = dmu_object_remap_indirects(os, object,
 		    last_removed_txg);
 		/*
 		 * If the ZPL removed the object before we managed to dnode_hold
 		 * it, we would get an ENOENT. If the ZPL declares its intent
 		 * to remove the object (dnode_free) before we manage to
 		 * dnode_hold it, we would get an EEXIST. In either case, we
 		 * want to continue remapping the other objects in the objset;
 		 * in all other cases, we want to break early.
 		 */
 		if (error != 0 && error != ENOENT && error != EEXIST) {
 			break;
 		}
 	}
 	if (error == ESRCH) {
 		error = 0;
 	}
 	return (error);
 }
 
 int
 dmu_objset_remap_indirects(const char *fsname)
 {
 	int error = 0;
 	objset_t *os = NULL;
 	uint64_t last_removed_txg;
 	uint64_t remap_start_txg;
 	dsl_dir_t *dd;
 
 	error = dmu_objset_hold(fsname, FTAG, &os);
 	if (error != 0) {
 		return (error);
 	}
 	dd = dmu_objset_ds(os)->ds_dir;
 
 	if (!spa_feature_is_enabled(dmu_objset_spa(os),
 	    SPA_FEATURE_OBSOLETE_COUNTS)) {
 		dmu_objset_rele(os, FTAG);
 		return (SET_ERROR(ENOTSUP));
 	}
 
 	if (dsl_dataset_is_snapshot(dmu_objset_ds(os))) {
 		dmu_objset_rele(os, FTAG);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/*
 	 * If there has not been a removal, we're done.
 	 */
 	last_removed_txg = spa_get_last_removal_txg(dmu_objset_spa(os));
 	if (last_removed_txg == -1ULL) {
 		dmu_objset_rele(os, FTAG);
 		return (0);
 	}
 
 	/*
 	 * If we have remapped since the last removal, we're done.
 	 */
 	if (dsl_dir_is_zapified(dd)) {
 		uint64_t last_remap_txg;
 		if (zap_lookup(spa_meta_objset(dmu_objset_spa(os)),
 		    dd->dd_object, DD_FIELD_LAST_REMAP_TXG,
 		    sizeof (last_remap_txg), 1, &last_remap_txg) == 0 &&
 		    last_remap_txg > last_removed_txg) {
 			dmu_objset_rele(os, FTAG);
 			return (0);
 		}
 	}
 
 	dsl_dataset_long_hold(dmu_objset_ds(os), FTAG);
 	dsl_pool_rele(dmu_objset_pool(os), FTAG);
 
 	remap_start_txg = spa_last_synced_txg(dmu_objset_spa(os));
 	error = dmu_objset_remap_indirects_impl(os, last_removed_txg);
 	if (error == 0) {
 		/*
 		 * We update the last_remap_txg to be the start txg so that
 		 * we can guarantee that every block older than last_remap_txg
 		 * that can be remapped has been remapped.
 		 */
 		error = dsl_dir_update_last_remap_txg(dd, remap_start_txg);
 	}
 
 	dsl_dataset_long_rele(dmu_objset_ds(os), FTAG);
 	dsl_dataset_rele(dmu_objset_ds(os), FTAG);
 
 	return (error);
 }
 
 int
 dmu_objset_snapshot_one(const char *fsname, const char *snapname)
 {
 	int err;
 	char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
 	nvlist_t *snaps = fnvlist_alloc();
 
 	fnvlist_add_boolean(snaps, longsnap);
 	strfree(longsnap);
 	err = dsl_dataset_snapshot(snaps, NULL, NULL);
 	fnvlist_free(snaps);
 	return (err);
 }
 
 static void
 dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
 {
 	dnode_t *dn;
 
 	while ((dn = multilist_sublist_head(list)) != NULL) {
 		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
 		ASSERT(dn->dn_dbuf->db_data_pending);
 		/*
 		 * Initialize dn_zio outside dnode_sync() because the
 		 * meta-dnode needs to set it ouside dnode_sync().
 		 */
 		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
 		ASSERT(dn->dn_zio);
 
 		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
 		multilist_sublist_remove(list, dn);
 
 		multilist_t *newlist = dn->dn_objset->os_synced_dnodes;
 		if (newlist != NULL) {
 			(void) dnode_add_ref(dn, newlist);
 			multilist_insert(newlist, dn);
 		}
 
 		dnode_sync(dn, tx);
 	}
 }
 
 /* ARGSUSED */
 static void
 dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	blkptr_t *bp = zio->io_bp;
 	objset_t *os = arg;
 	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
 
 	ASSERT(!BP_IS_EMBEDDED(bp));
 	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
 	ASSERT0(BP_GET_LEVEL(bp));
 
 	/*
 	 * Update rootbp fill count: it should be the number of objects
 	 * allocated in the object set (not counting the "special"
 	 * objects that are stored in the objset_phys_t -- the meta
 	 * dnode and user/group accounting objects).
 	 */
 	bp->blk_fill = 0;
 	for (int i = 0; i < dnp->dn_nblkptr; i++)
 		bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]);
 	if (os->os_dsl_dataset != NULL)
 		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
 	*os->os_rootbp = *bp;
 	if (os->os_dsl_dataset != NULL)
 		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
 }
 
 /* ARGSUSED */
 static void
 dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
 {
 	blkptr_t *bp = zio->io_bp;
 	blkptr_t *bp_orig = &zio->io_bp_orig;
 	objset_t *os = arg;
 
 	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
 		ASSERT(BP_EQUAL(bp, bp_orig));
 	} else {
 		dsl_dataset_t *ds = os->os_dsl_dataset;
 		dmu_tx_t *tx = os->os_synctx;
 
 		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
 		dsl_dataset_block_born(ds, bp, tx);
 	}
 	kmem_free(bp, sizeof (*bp));
 }
 
 typedef struct sync_dnodes_arg {
 	multilist_t *sda_list;
 	int sda_sublist_idx;
 	multilist_t *sda_newlist;
 	dmu_tx_t *sda_tx;
 } sync_dnodes_arg_t;
 
 static void
 sync_dnodes_task(void *arg)
 {
 	sync_dnodes_arg_t *sda = arg;
 
 	multilist_sublist_t *ms =
 	    multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);
 
 	dmu_objset_sync_dnodes(ms, sda->sda_tx);
 
 	multilist_sublist_unlock(ms);
 
 	kmem_free(sda, sizeof (*sda));
 }
 
 
 /* called from dsl */
 void
 dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
 {
 	int txgoff;
 	zbookmark_phys_t zb;
 	zio_prop_t zp;
 	zio_t *zio;
 	list_t *list;
 	dbuf_dirty_record_t *dr;
 	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
 	*blkptr_copy = *os->os_rootbp;
 
 	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg);
 
 	ASSERT(dmu_tx_is_syncing(tx));
 	/* XXX the write_done callback should really give us the tx... */
 	os->os_synctx = tx;
 
 	if (os->os_dsl_dataset == NULL) {
 		/*
 		 * This is the MOS.  If we have upgraded,
 		 * spa_max_replication() could change, so reset
 		 * os_copies here.
 		 */
 		os->os_copies = spa_max_replication(os->os_spa);
 	}
 
 	/*
 	 * Create the root block IO
 	 */
 	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
 	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
 	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
 	arc_release(os->os_phys_buf, &os->os_phys_buf);
 
 	dmu_write_policy(os, NULL, 0, 0, &zp);
 
 	zio = arc_write(pio, os->os_spa, tx->tx_txg,
 	    blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os),
 	    &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
 	    os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
 
 	/*
 	 * Sync special dnodes - the parent IO for the sync is the root block
 	 */
 	DMU_META_DNODE(os)->dn_zio = zio;
 	dnode_sync(DMU_META_DNODE(os), tx);
 
 	os->os_phys->os_flags = os->os_flags;
 
 	if (DMU_USERUSED_DNODE(os) &&
 	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
 		DMU_USERUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_USERUSED_DNODE(os), tx);
 		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
 		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
 	}
 
 	txgoff = tx->tx_txg & TXG_MASK;
 
 	if (dmu_objset_userused_enabled(os)) {
 		/*
 		 * We must create the list here because it uses the
 		 * dn_dirty_link[] of this txg.  But it may already
 		 * exist because we call dsl_dataset_sync() twice per txg.
 		 */
 		if (os->os_synced_dnodes == NULL) {
 			os->os_synced_dnodes =
 			    multilist_create(sizeof (dnode_t),
 			    offsetof(dnode_t, dn_dirty_link[txgoff]),
 			    dnode_multilist_index_func);
 		} else {
 			ASSERT3U(os->os_synced_dnodes->ml_offset, ==,
 			    offsetof(dnode_t, dn_dirty_link[txgoff]));
 		}
 	}
 
 	for (int i = 0;
 	    i < multilist_get_num_sublists(os->os_dirty_dnodes[txgoff]); i++) {
 		sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
 		sda->sda_list = os->os_dirty_dnodes[txgoff];
 		sda->sda_sublist_idx = i;
 		sda->sda_tx = tx;
 		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
 		    sync_dnodes_task, sda, 0);
 		/* callback frees sda */
 	}
 	taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);
 
 	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
 	while ((dr = list_head(list)) != NULL) {
 		ASSERT0(dr->dr_dbuf->db_level);
 		list_remove(list, dr);
 		if (dr->dr_zio)
 			zio_nowait(dr->dr_zio);
 	}
 
 	/* Enable dnode backfill if enough objects have been freed. */
 	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
 		os->os_rescan_dnodes = B_TRUE;
 		os->os_freed_dnodes = 0;
 	}
 
 	/*
 	 * Free intent log blocks up to this tx.
 	 */
 	zil_sync(os->os_zil, tx);
 	os->os_phys->os_zil_header = os->os_zil_header;
 	zio_nowait(zio);
 }
 
 boolean_t
 dmu_objset_is_dirty(objset_t *os, uint64_t txg)
 {
 	return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK]));
 }
 
 static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES];
 
 void
 dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb)
 {
 	used_cbs[ost] = cb;
 }
 
 boolean_t
 dmu_objset_userused_enabled(objset_t *os)
 {
 	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
 	    used_cbs[os->os_phys->os_type] != NULL &&
 	    DMU_USERUSED_DNODE(os) != NULL);
 }
 
 typedef struct userquota_node {
 	uint64_t uqn_id;
 	int64_t uqn_delta;
 	avl_node_t uqn_node;
 } userquota_node_t;
 
 typedef struct userquota_cache {
 	avl_tree_t uqc_user_deltas;
 	avl_tree_t uqc_group_deltas;
 } userquota_cache_t;
 
 static int
 userquota_compare(const void *l, const void *r)
 {
 	const userquota_node_t *luqn = l;
 	const userquota_node_t *ruqn = r;
 
 	if (luqn->uqn_id < ruqn->uqn_id)
 		return (-1);
 	if (luqn->uqn_id > ruqn->uqn_id)
 		return (1);
 	return (0);
 }
 
 static void
 do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
 {
 	void *cookie;
 	userquota_node_t *uqn;
 
 	ASSERT(dmu_tx_is_syncing(tx));
 
 	cookie = NULL;
 	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
 	    &cookie)) != NULL) {
 		/*
 		 * os_userused_lock protects against concurrent calls to
 		 * zap_increment_int().  It's needed because zap_increment_int()
 		 * is not thread-safe (i.e. not atomic).
 		 */
 		mutex_enter(&os->os_userused_lock);
 		VERIFY0(zap_increment_int(os, DMU_USERUSED_OBJECT,
 		    uqn->uqn_id, uqn->uqn_delta, tx));
 		mutex_exit(&os->os_userused_lock);
 		kmem_free(uqn, sizeof (*uqn));
 	}
 	avl_destroy(&cache->uqc_user_deltas);
 
 	cookie = NULL;
 	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
 	    &cookie)) != NULL) {
 		mutex_enter(&os->os_userused_lock);
 		VERIFY0(zap_increment_int(os, DMU_GROUPUSED_OBJECT,
 		    uqn->uqn_id, uqn->uqn_delta, tx));
 		mutex_exit(&os->os_userused_lock);
 		kmem_free(uqn, sizeof (*uqn));
 	}
 	avl_destroy(&cache->uqc_group_deltas);
 }
 
 static void
 userquota_update_cache(avl_tree_t *avl, uint64_t id, int64_t delta)
 {
 	userquota_node_t search = { .uqn_id = id };
 	avl_index_t idx;
 
 	userquota_node_t *uqn = avl_find(avl, &search, &idx);
 	if (uqn == NULL) {
 		uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
 		uqn->uqn_id = id;
 		avl_insert(avl, uqn, idx);
 	}
 	uqn->uqn_delta += delta;
 }
 
 static void
 do_userquota_update(userquota_cache_t *cache, uint64_t used, uint64_t flags,
     uint64_t user, uint64_t group, boolean_t subtract)
 {
 	if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) {
 		int64_t delta = DNODE_SIZE + used;
 		if (subtract)
 			delta = -delta;
 
 		userquota_update_cache(&cache->uqc_user_deltas, user, delta);
 		userquota_update_cache(&cache->uqc_group_deltas, group, delta);
 	}
 }
 
 typedef struct userquota_updates_arg {
 	objset_t *uua_os;
 	int uua_sublist_idx;
 	dmu_tx_t *uua_tx;
 } userquota_updates_arg_t;
 
 static void
 userquota_updates_task(void *arg)
 {
 	userquota_updates_arg_t *uua = arg;
 	objset_t *os = uua->uua_os;
 	dmu_tx_t *tx = uua->uua_tx;
 	dnode_t *dn;
 	userquota_cache_t cache = { 0 };
 
 	multilist_sublist_t *list =
 	    multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx);
 
 	ASSERT(multilist_sublist_head(list) == NULL ||
 	    dmu_objset_userused_enabled(os));
 	avl_create(&cache.uqc_user_deltas, userquota_compare,
 	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
 	avl_create(&cache.uqc_group_deltas, userquota_compare,
 	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
 
 	while ((dn = multilist_sublist_head(list)) != NULL) {
 		int flags;
 		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
 		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
 		    dn->dn_phys->dn_flags &
 		    DNODE_FLAG_USERUSED_ACCOUNTED);
 
 		flags = dn->dn_id_flags;
 		ASSERT(flags);
 		if (flags & DN_ID_OLD_EXIST)  {
 			do_userquota_update(&cache,
 			    dn->dn_oldused, dn->dn_oldflags,
 			    dn->dn_olduid, dn->dn_oldgid, B_TRUE);
 		}
 		if (flags & DN_ID_NEW_EXIST) {
 			do_userquota_update(&cache,
 			    DN_USED_BYTES(dn->dn_phys),
 			    dn->dn_phys->dn_flags,  dn->dn_newuid,
 			    dn->dn_newgid, B_FALSE);
 		}
 
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_oldused = 0;
 		dn->dn_oldflags = 0;
 		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
 			dn->dn_olduid = dn->dn_newuid;
 			dn->dn_oldgid = dn->dn_newgid;
 			dn->dn_id_flags |= DN_ID_OLD_EXIST;
 			if (dn->dn_bonuslen == 0)
 				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
 			else
 				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 		}
 		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
 		mutex_exit(&dn->dn_mtx);
 
 		multilist_sublist_remove(list, dn);
 		dnode_rele(dn, os->os_synced_dnodes);
 	}
 	do_userquota_cacheflush(os, &cache, tx);
 	multilist_sublist_unlock(list);
 	kmem_free(uua, sizeof (*uua));
 }
 
 void
 dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx)
 {
 	if (!dmu_objset_userused_enabled(os))
 		return;
 
 	/* Allocate the user/groupused objects if necessary. */
 	if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
 		VERIFY0(zap_create_claim(os,
 		    DMU_USERUSED_OBJECT,
 		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 		VERIFY0(zap_create_claim(os,
 		    DMU_GROUPUSED_OBJECT,
 		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
 	}
 
 	for (int i = 0;
 	    i < multilist_get_num_sublists(os->os_synced_dnodes); i++) {
 		userquota_updates_arg_t *uua =
 		    kmem_alloc(sizeof (*uua), KM_SLEEP);
 		uua->uua_os = os;
 		uua->uua_sublist_idx = i;
 		uua->uua_tx = tx;
 		/* note: caller does taskq_wait() */
 		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
 		    userquota_updates_task, uua, 0);
 		/* callback frees uua */
 	}
 }
 
 /*
  * Returns a pointer to data to find uid/gid from
  *
  * If a dirty record for transaction group that is syncing can't
  * be found then NULL is returned.  In the NULL case it is assumed
  * the uid/gid aren't changing.
  */
 static void *
 dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
 {
 	dbuf_dirty_record_t *dr, **drp;
 	void *data;
 
 	if (db->db_dirtycnt == 0)
 		return (db->db.db_data);  /* Nothing is changing */
 
 	for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
 		if (dr->dr_txg == tx->tx_txg)
 			break;
 
 	if (dr == NULL) {
 		data = NULL;
 	} else {
 		dnode_t *dn;
 
 		DB_DNODE_ENTER(dr->dr_dbuf);
 		dn = DB_DNODE(dr->dr_dbuf);
 
 		if (dn->dn_bonuslen == 0 &&
 		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
 			data = dr->dt.dl.dr_data->b_data;
 		else
 			data = dr->dt.dl.dr_data;
 
 		DB_DNODE_EXIT(dr->dr_dbuf);
 	}
 
 	return (data);
 }
 
 void
 dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
 {
 	objset_t *os = dn->dn_objset;
 	void *data = NULL;
 	dmu_buf_impl_t *db = NULL;
 	uint64_t *user = NULL;
 	uint64_t *group = NULL;
 	int flags = dn->dn_id_flags;
 	int error;
 	boolean_t have_spill = B_FALSE;
 
 	if (!dmu_objset_userused_enabled(dn->dn_objset))
 		return;
 
 	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
 	    DN_ID_CHKED_SPILL)))
 		return;
 
 	if (before && dn->dn_bonuslen != 0)
 		data = DN_BONUS(dn->dn_phys);
 	else if (!before && dn->dn_bonuslen != 0) {
 		if (dn->dn_bonus) {
 			db = dn->dn_bonus;
 			mutex_enter(&db->db_mtx);
 			data = dmu_objset_userquota_find_data(db, tx);
 		} else {
 			data = DN_BONUS(dn->dn_phys);
 		}
 	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
 			int rf = 0;
 
 			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
 				rf |= DB_RF_HAVESTRUCT;
 			error = dmu_spill_hold_by_dnode(dn,
 			    rf | DB_RF_MUST_SUCCEED,
 			    FTAG, (dmu_buf_t **)&db);
 			ASSERT(error == 0);
 			mutex_enter(&db->db_mtx);
 			data = (before) ? db->db.db_data :
 			    dmu_objset_userquota_find_data(db, tx);
 			have_spill = B_TRUE;
 	} else {
 		mutex_enter(&dn->dn_mtx);
 		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 		mutex_exit(&dn->dn_mtx);
 		return;
 	}
 
 	if (before) {
 		ASSERT(data);
 		user = &dn->dn_olduid;
 		group = &dn->dn_oldgid;
 	} else if (data) {
 		user = &dn->dn_newuid;
 		group = &dn->dn_newgid;
 	}
 
 	/*
 	 * Must always call the callback in case the object
 	 * type has changed and that type isn't an object type to track
 	 */
 	error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data,
 	    user, group);
 
 	/*
 	 * Preserve existing uid/gid when the callback can't determine
 	 * what the new uid/gid are and the callback returned EEXIST.
 	 * The EEXIST error tells us to just use the existing uid/gid.
 	 * If we don't know what the old values are then just assign
 	 * them to 0, since that is a new file  being created.
 	 */
 	if (!before && data == NULL && error == EEXIST) {
 		if (flags & DN_ID_OLD_EXIST) {
 			dn->dn_newuid = dn->dn_olduid;
 			dn->dn_newgid = dn->dn_oldgid;
 		} else {
 			dn->dn_newuid = 0;
 			dn->dn_newgid = 0;
 		}
 		error = 0;
 	}
 
 	if (db)
 		mutex_exit(&db->db_mtx);
 
 	mutex_enter(&dn->dn_mtx);
 	if (error == 0 && before)
 		dn->dn_id_flags |= DN_ID_OLD_EXIST;
 	if (error == 0 && !before)
 		dn->dn_id_flags |= DN_ID_NEW_EXIST;
 
 	if (have_spill) {
 		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
 	} else {
 		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
 	}
 	mutex_exit(&dn->dn_mtx);
 	if (have_spill)
 		dmu_buf_rele((dmu_buf_t *)db, FTAG);
 }
 
 boolean_t
 dmu_objset_userspace_present(objset_t *os)
 {
 	return (os->os_phys->os_flags &
 	    OBJSET_FLAG_USERACCOUNTING_COMPLETE);
 }
 
 int
 dmu_objset_userspace_upgrade(objset_t *os)
 {
 	uint64_t obj;
 	int err = 0;
 
 	if (dmu_objset_userspace_present(os))
 		return (0);
 	if (!dmu_objset_userused_enabled(os))
 		return (SET_ERROR(ENOTSUP));
 	if (dmu_objset_is_snapshot(os))
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * We simply need to mark every object dirty, so that it will be
 	 * synced out and now accounted.  If this is called
 	 * concurrently, or if we already did some work before crashing,
 	 * that's fine, since we track each object's accounted state
 	 * independently.
 	 */
 
 	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
 		dmu_tx_t *tx;
 		dmu_buf_t *db;
 		int objerr;
 
 		if (issig(JUSTLOOKING) && issig(FORREAL))
 			return (SET_ERROR(EINTR));
 
 		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
 		if (objerr != 0)
 			continue;
 		tx = dmu_tx_create(os);
 		dmu_tx_hold_bonus(tx, obj);
 		objerr = dmu_tx_assign(tx, TXG_WAIT);
 		if (objerr != 0) {
 			dmu_tx_abort(tx);
 			continue;
 		}
 		dmu_buf_will_dirty(db, tx);
 		dmu_buf_rele(db, FTAG);
 		dmu_tx_commit(tx);
 	}
 
 	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
 	txg_wait_synced(dmu_objset_pool(os), 0);
 	return (0);
 }
 
 void
 dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp)
 {
 	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
 	    usedobjsp, availobjsp);
 }
 
 uint64_t
 dmu_objset_fsid_guid(objset_t *os)
 {
 	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
 }
 
 void
 dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
 {
 	stat->dds_type = os->os_phys->os_type;
 	if (os->os_dsl_dataset)
 		dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
 }
 
 void
 dmu_objset_stats(objset_t *os, nvlist_t *nv)
 {
 	ASSERT(os->os_dsl_dataset ||
 	    os->os_phys->os_type == DMU_OST_META);
 
 	if (os->os_dsl_dataset != NULL)
 		dsl_dataset_stats(os->os_dsl_dataset, nv);
 
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
 	    os->os_phys->os_type);
 	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
 	    dmu_objset_userspace_present(os));
 }
 
 int
 dmu_objset_is_snapshot(objset_t *os)
 {
 	if (os->os_dsl_dataset != NULL)
 		return (os->os_dsl_dataset->ds_is_snapshot);
 	else
 		return (B_FALSE);
 }
 
 int
 dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen,
     boolean_t *conflict)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	uint64_t ignored;
 
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
 		return (SET_ERROR(ENOENT));
 
 	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
 	    MT_NORMALIZE, real, maxlen, conflict));
 }
 
 int
 dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 
 	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
 
 	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
 		return (SET_ERROR(ENOENT));
 
 	zap_cursor_init_serialized(&cursor,
 	    ds->ds_dir->dd_pool->dp_meta_objset,
 	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
 		zap_cursor_fini(&cursor);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (strlen(attr.za_name) + 1 > namelen) {
 		zap_cursor_fini(&cursor);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	(void) strcpy(name, attr.za_name);
 	if (idp)
 		*idp = attr.za_first_integer;
 	if (case_conflict)
 		*case_conflict = attr.za_normalization_conflict;
 	zap_cursor_advance(&cursor);
 	*offp = zap_cursor_serialize(&cursor);
 	zap_cursor_fini(&cursor);
 
 	return (0);
 }
 
 int
 dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp)
 {
 	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
 	zap_cursor_t cursor;
 	zap_attribute_t attr;
 
 	/* there is no next dir on a snapshot! */
 	if (os->os_dsl_dataset->ds_object !=
 	    dsl_dir_phys(dd)->dd_head_dataset_obj)
 		return (SET_ERROR(ENOENT));
 
 	zap_cursor_init_serialized(&cursor,
 	    dd->dd_pool->dp_meta_objset,
 	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);
 
 	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
 		zap_cursor_fini(&cursor);
 		return (SET_ERROR(ENOENT));
 	}
 
 	if (strlen(attr.za_name) + 1 > namelen) {
 		zap_cursor_fini(&cursor);
 		return (SET_ERROR(ENAMETOOLONG));
 	}
 
 	(void) strcpy(name, attr.za_name);
 	if (idp)
 		*idp = attr.za_first_integer;
 	zap_cursor_advance(&cursor);
 	*offp = zap_cursor_serialize(&cursor);
 	zap_cursor_fini(&cursor);
 
 	return (0);
 }
 
 typedef struct dmu_objset_find_ctx {
 	taskq_t		*dc_tq;
 	dsl_pool_t	*dc_dp;
 	uint64_t	dc_ddobj;
 	char		*dc_ddname; /* last component of ddobj's name */
 	int		(*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
 	void		*dc_arg;
 	int		dc_flags;
 	kmutex_t	*dc_error_lock;
 	int		*dc_error;
 } dmu_objset_find_ctx_t;
 
 static void
 dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
 {
 	dsl_pool_t *dp = dcp->dc_dp;
 	dsl_dir_t *dd;
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	uint64_t thisobj;
 	int err = 0;
 
 	/* don't process if there already was an error */
 	if (*dcp->dc_error != 0)
 		goto out;
 
 	/*
 	 * Note: passing the name (dc_ddname) here is optional, but it
 	 * improves performance because we don't need to call
 	 * zap_value_search() to determine the name.
 	 */
 	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
 	if (err != 0)
 		goto out;
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_rele(dd, FTAG);
 		goto out;
 	}
 
 	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (dcp->dc_flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT3U(attr->za_integer_length, ==,
 			    sizeof (uint64_t));
 			ASSERT3U(attr->za_num_integers, ==, 1);
 
 			dmu_objset_find_ctx_t *child_dcp =
 			    kmem_alloc(sizeof (*child_dcp), KM_SLEEP);
 			*child_dcp = *dcp;
 			child_dcp->dc_ddobj = attr->za_first_integer;
 			child_dcp->dc_ddname = spa_strdup(attr->za_name);
 			if (dcp->dc_tq != NULL)
 				(void) taskq_dispatch(dcp->dc_tq,
 				    dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP);
 			else
 				dmu_objset_find_dp_impl(child_dcp);
 		}
 		zap_cursor_fini(&zc);
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
 	if (dcp->dc_flags & DS_FIND_SNAPSHOTS) {
 		dsl_dataset_t *ds;
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 		if (err == 0) {
 			uint64_t snapobj;
 
 			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				ASSERT3U(attr->za_integer_length, ==,
 				    sizeof (uint64_t));
 				ASSERT3U(attr->za_num_integers, ==, 1);
 
 				err = dsl_dataset_hold_obj(dp,
 				    attr->za_first_integer, FTAG, &ds);
 				if (err != 0)
 					break;
 				err = dcp->dc_func(dp, ds, dcp->dc_arg);
 				dsl_dataset_rele(ds, FTAG);
 				if (err != 0)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
 	kmem_free(attr, sizeof (zap_attribute_t));
 
 	if (err != 0) {
 		dsl_dir_rele(dd, FTAG);
 		goto out;
 	}
 
 	/*
 	 * Apply to self.
 	 */
 	err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 	/*
 	 * Note: we hold the dir while calling dsl_dataset_hold_obj() so
 	 * that the dir will remain cached, and we won't have to re-instantiate
 	 * it (which could be expensive due to finding its name via
 	 * zap_value_search()).
 	 */
 	dsl_dir_rele(dd, FTAG);
 	if (err != 0)
 		goto out;
 	err = dcp->dc_func(dp, ds, dcp->dc_arg);
 	dsl_dataset_rele(ds, FTAG);
 
 out:
 	if (err != 0) {
 		mutex_enter(dcp->dc_error_lock);
 		/* only keep first error */
 		if (*dcp->dc_error == 0)
 			*dcp->dc_error = err;
 		mutex_exit(dcp->dc_error_lock);
 	}
 
 	if (dcp->dc_ddname != NULL)
 		spa_strfree(dcp->dc_ddname);
 	kmem_free(dcp, sizeof (*dcp));
 }
 
 static void
 dmu_objset_find_dp_cb(void *arg)
 {
 	dmu_objset_find_ctx_t *dcp = arg;
 	dsl_pool_t *dp = dcp->dc_dp;
 
 	/*
 	 * We need to get a pool_config_lock here, as there are several
 	 * asssert(pool_config_held) down the stack. Getting a lock via
 	 * dsl_pool_config_enter is risky, as it might be stalled by a
 	 * pending writer. This would deadlock, as the write lock can
 	 * only be granted when our parent thread gives up the lock.
 	 * The _prio interface gives us priority over a pending writer.
 	 */
 	dsl_pool_config_enter_prio(dp, FTAG);
 
 	dmu_objset_find_dp_impl(dcp);
 
 	dsl_pool_config_exit(dp, FTAG);
 }
 
 /*
  * Find objsets under and including ddobj, call func(ds) on each.
  * The order for the enumeration is completely undefined.
  * func is called with dsl_pool_config held.
  */
 int
 dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj,
     int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags)
 {
 	int error = 0;
 	taskq_t *tq = NULL;
 	int ntasks;
 	dmu_objset_find_ctx_t *dcp;
 	kmutex_t err_lock;
 
 	mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL);
 	dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP);
 	dcp->dc_tq = NULL;
 	dcp->dc_dp = dp;
 	dcp->dc_ddobj = ddobj;
 	dcp->dc_ddname = NULL;
 	dcp->dc_func = func;
 	dcp->dc_arg = arg;
 	dcp->dc_flags = flags;
 	dcp->dc_error_lock = &err_lock;
 	dcp->dc_error = &error;
 
 	if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) {
 		/*
 		 * In case a write lock is held we can't make use of
 		 * parallelism, as down the stack of the worker threads
 		 * the lock is asserted via dsl_pool_config_held.
 		 * In case of a read lock this is solved by getting a read
 		 * lock in each worker thread, which isn't possible in case
 		 * of a writer lock. So we fall back to the synchronous path
 		 * here.
 		 * In the future it might be possible to get some magic into
 		 * dsl_pool_config_held in a way that it returns true for
 		 * the worker threads so that a single lock held from this
 		 * thread suffices. For now, stay single threaded.
 		 */
 		dmu_objset_find_dp_impl(dcp);
 		mutex_destroy(&err_lock);
 
 		return (error);
 	}
 
 	ntasks = dmu_find_threads;
 	if (ntasks == 0)
 		ntasks = vdev_count_leaves(dp->dp_spa) * 4;
 	tq = taskq_create("dmu_objset_find", ntasks, minclsyspri, ntasks,
 	    INT_MAX, 0);
 	if (tq == NULL) {
 		kmem_free(dcp, sizeof (*dcp));
 		mutex_destroy(&err_lock);
 
 		return (SET_ERROR(ENOMEM));
 	}
 	dcp->dc_tq = tq;
 
 	/* dcp will be freed by task */
 	(void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP);
 
 	/*
 	 * PORTING: this code relies on the property of taskq_wait to wait
 	 * until no more tasks are queued and no more tasks are active. As
 	 * we always queue new tasks from within other tasks, task_wait
 	 * reliably waits for the full recursion to finish, even though we
 	 * enqueue new tasks after taskq_wait has been called.
 	 * On platforms other than illumos, taskq_wait may not have this
 	 * property.
 	 */
 	taskq_wait(tq);
 	taskq_destroy(tq);
 	mutex_destroy(&err_lock);
 
 	return (error);
 }
 
 /*
  * Find all objsets under name, and for each, call 'func(child_name, arg)'.
  * The dp_config_rwlock must not be held when this is called, and it
  * will not be held when the callback is called.
  * Therefore this function should only be used when the pool is not changing
  * (e.g. in syncing context), or the callback can deal with the possible races.
  */
 static int
 dmu_objset_find_impl(spa_t *spa, const char *name,
     int func(const char *, void *), void *arg, int flags)
 {
 	dsl_dir_t *dd;
 	dsl_pool_t *dp = spa_get_dsl(spa);
 	dsl_dataset_t *ds;
 	zap_cursor_t zc;
 	zap_attribute_t *attr;
 	char *child;
 	uint64_t thisobj;
 	int err;
 
 	dsl_pool_config_enter(dp, FTAG);
 
 	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
 	if (err != 0) {
 		dsl_pool_config_exit(dp, FTAG);
 		return (err);
 	}
 
 	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
 	if (dd->dd_myname[0] == '$') {
 		dsl_dir_rele(dd, FTAG);
 		dsl_pool_config_exit(dp, FTAG);
 		return (0);
 	}
 
 	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
 	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
 
 	/*
 	 * Iterate over all children.
 	 */
 	if (flags & DS_FIND_CHILDREN) {
 		for (zap_cursor_init(&zc, dp->dp_meta_objset,
 		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
 		    zap_cursor_retrieve(&zc, attr) == 0;
 		    (void) zap_cursor_advance(&zc)) {
 			ASSERT3U(attr->za_integer_length, ==,
 			    sizeof (uint64_t));
 			ASSERT3U(attr->za_num_integers, ==, 1);
 
 			child = kmem_asprintf("%s/%s", name, attr->za_name);
 			dsl_pool_config_exit(dp, FTAG);
 			err = dmu_objset_find_impl(spa, child,
 			    func, arg, flags);
 			dsl_pool_config_enter(dp, FTAG);
 			strfree(child);
 			if (err != 0)
 				break;
 		}
 		zap_cursor_fini(&zc);
 
 		if (err != 0) {
 			dsl_dir_rele(dd, FTAG);
 			dsl_pool_config_exit(dp, FTAG);
 			kmem_free(attr, sizeof (zap_attribute_t));
 			return (err);
 		}
 	}
 
 	/*
 	 * Iterate over all snapshots.
 	 */
 	if (flags & DS_FIND_SNAPSHOTS) {
 		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);
 
 		if (err == 0) {
 			uint64_t snapobj;
 
 			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
 			dsl_dataset_rele(ds, FTAG);
 
 			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
 			    zap_cursor_retrieve(&zc, attr) == 0;
 			    (void) zap_cursor_advance(&zc)) {
 				ASSERT3U(attr->za_integer_length, ==,
 				    sizeof (uint64_t));
 				ASSERT3U(attr->za_num_integers, ==, 1);
 
 				child = kmem_asprintf("%s@%s",
 				    name, attr->za_name);
 				dsl_pool_config_exit(dp, FTAG);
 				err = func(child, arg);
 				dsl_pool_config_enter(dp, FTAG);
 				strfree(child);
 				if (err != 0)
 					break;
 			}
 			zap_cursor_fini(&zc);
 		}
 	}
 
 	dsl_dir_rele(dd, FTAG);
 	kmem_free(attr, sizeof (zap_attribute_t));
 	dsl_pool_config_exit(dp, FTAG);
 
 	if (err != 0)
 		return (err);
 
 	/* Apply to self. */
 	return (func(name, arg));
 }
 
 /*
  * See comment above dmu_objset_find_impl().
  */
 int
 dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags)
 {
 	spa_t *spa;
 	int error;
 
 	error = spa_open(name, &spa, FTAG);
 	if (error != 0)
 		return (error);
 	error = dmu_objset_find_impl(spa, name, func, arg, flags);
 	spa_close(spa, FTAG);
 	return (error);
 }
 
 void
 dmu_objset_set_user(objset_t *os, void *user_ptr)
 {
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	os->os_user_ptr = user_ptr;
 }
 
 void *
 dmu_objset_get_user(objset_t *os)
 {
 	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
 	return (os->os_user_ptr);
 }
 
 /*
  * Determine name of filesystem, given name of snapshot.
  * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
  */
 int
 dmu_fsname(const char *snapname, char *buf)
 {
 	char *atp = strchr(snapname, '@');
 	if (atp == NULL)
 		return (SET_ERROR(EINVAL));
 	if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
 		return (SET_ERROR(ENAMETOOLONG));
 	(void) strlcpy(buf, snapname, atp - snapname + 1);
 	return (0);
 }
 
 /*
  * Call when we think we're going to write/free space in open context to track
  * the amount of dirty data in the open txg, which is also the amount
  * of memory that can not be evicted until this txg syncs.
  */
 void
 dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
 {
 	dsl_dataset_t *ds = os->os_dsl_dataset;
 	int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);
 
 	if (ds != NULL) {
 		dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
 		dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
 	}
 }
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h	(revision 336957)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dbuf.h	(revision 336958)
@@ -1,405 +1,413 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  */
 
 #ifndef	_SYS_DBUF_H
 #define	_SYS_DBUF_H
 
 #include <sys/dmu.h>
 #include <sys/spa.h>
 #include <sys/txg.h>
 #include <sys/zio.h>
 #include <sys/arc.h>
 #include <sys/zfs_context.h>
 #include <sys/refcount.h>
 #include <sys/zrlock.h>
 #include <sys/multilist.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 #define	IN_DMU_SYNC 2
 
 /*
  * define flags for dbuf_read
  */
 
 #define	DB_RF_MUST_SUCCEED	(1 << 0)
 #define	DB_RF_CANFAIL		(1 << 1)
 #define	DB_RF_HAVESTRUCT	(1 << 2)
 #define	DB_RF_NOPREFETCH	(1 << 3)
 #define	DB_RF_NEVERWAIT		(1 << 4)
 #define	DB_RF_CACHED		(1 << 5)
 
 /*
  * The simplified state transition diagram for dbufs looks like:
  *
  *		+----> READ ----+
  *		|		|
  *		|		V
  *  (alloc)-->UNCACHED	     CACHED-->EVICTING-->(free)
  *		|		^	 ^
  *		|		|	 |
  *		+----> FILL ----+	 |
  *		|			 |
  *		|			 |
  *		+--------> NOFILL -------+
  *
  * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
  * to find all dbufs in a range of a dnode and must be less than any other
  * dbuf_states_t (see comment on dn_dbufs in dnode.h).
  */
 typedef enum dbuf_states {
 	DB_SEARCH = -1,
 	DB_UNCACHED,
 	DB_FILL,
 	DB_NOFILL,
 	DB_READ,
 	DB_CACHED,
 	DB_EVICTING
 } dbuf_states_t;
 
+typedef enum dbuf_cached_state {
+	DB_NO_CACHE = -1,
+	DB_DBUF_CACHE,
+	DB_DBUF_METADATA_CACHE,
+	DB_CACHE_MAX
+} dbuf_cached_state_t;
+
 struct dnode;
 struct dmu_tx;
 
 /*
  * level = 0 means the user data
  * level = 1 means the single indirect block
  * etc.
  */
 
 struct dmu_buf_impl;
 
 typedef enum override_states {
 	DR_NOT_OVERRIDDEN,
 	DR_IN_DMU_SYNC,
 	DR_OVERRIDDEN
 } override_states_t;
 
 typedef struct dbuf_dirty_record {
 	/* link on our parents dirty list */
 	list_node_t dr_dirty_node;
 
 	/* transaction group this data will sync in */
 	uint64_t dr_txg;
 
 	/* zio of outstanding write IO */
 	zio_t *dr_zio;
 
 	/* pointer back to our dbuf */
 	struct dmu_buf_impl *dr_dbuf;
 
 	/* pointer to next dirty record */
 	struct dbuf_dirty_record *dr_next;
 
 	/* pointer to parent dirty record */
 	struct dbuf_dirty_record *dr_parent;
 
 	/* How much space was changed to dsl_pool_dirty_space() for this? */
 	unsigned int dr_accounted;
 
 	/* A copy of the bp that points to us */
 	blkptr_t dr_bp_copy;
 
 	union dirty_types {
 		struct dirty_indirect {
 
 			/* protect access to list */
 			kmutex_t dr_mtx;
 
 			/* Our list of dirty children */
 			list_t dr_children;
 		} di;
 		struct dirty_leaf {
 
 			/*
 			 * dr_data is set when we dirty the buffer
 			 * so that we can retain the pointer even if it
 			 * gets COW'd in a subsequent transaction group.
 			 */
 			arc_buf_t *dr_data;
 			blkptr_t dr_overridden_by;
 			override_states_t dr_override_state;
 			uint8_t dr_copies;
 			boolean_t dr_nopwrite;
 		} dl;
 	} dt;
 } dbuf_dirty_record_t;
 
 typedef struct dmu_buf_impl {
 	/*
 	 * The following members are immutable, with the exception of
 	 * db.db_data, which is protected by db_mtx.
 	 */
 
 	/* the publicly visible structure */
 	dmu_buf_t db;
 
 	/* the objset we belong to */
 	struct objset *db_objset;
 
 	/*
 	 * handle to safely access the dnode we belong to (NULL when evicted)
 	 */
 	struct dnode_handle *db_dnode_handle;
 
 	/*
 	 * our parent buffer; if the dnode points to us directly,
 	 * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
 	 * only accessed by sync thread ???
 	 * (NULL when evicted)
 	 * May change from NULL to non-NULL under the protection of db_mtx
 	 * (see dbuf_check_blkptr())
 	 */
 	struct dmu_buf_impl *db_parent;
 
 	/*
 	 * link for hash table of all dmu_buf_impl_t's
 	 */
 	struct dmu_buf_impl *db_hash_next;
 
 	/* our block number */
 	uint64_t db_blkid;
 
 	/*
 	 * Pointer to the blkptr_t which points to us. May be NULL if we
 	 * don't have one yet. (NULL when evicted)
 	 */
 	blkptr_t *db_blkptr;
 
 	/*
 	 * Our indirection level.  Data buffers have db_level==0.
 	 * Indirect buffers which point to data buffers have
 	 * db_level==1. etc.  Buffers which contain dnodes have
 	 * db_level==0, since the dnodes are stored in a file.
 	 */
 	uint8_t db_level;
 
 	/* db_mtx protects the members below */
 	kmutex_t db_mtx;
 
 	/*
 	 * Current state of the buffer
 	 */
 	dbuf_states_t db_state;
 
 	/*
 	 * Refcount accessed by dmu_buf_{hold,rele}.
 	 * If nonzero, the buffer can't be destroyed.
 	 * Protected by db_mtx.
 	 */
 	refcount_t db_holds;
 
 	/* buffer holding our data */
 	arc_buf_t *db_buf;
 
 	kcondvar_t db_changed;
 	dbuf_dirty_record_t *db_data_pending;
 
 	/* pointer to most recent dirty record for this buffer */
 	dbuf_dirty_record_t *db_last_dirty;
 
 	/*
 	 * Our link on the owner dnodes's dn_dbufs list.
 	 * Protected by its dn_dbufs_mtx.
 	 */
 	avl_node_t db_link;
 
-	/*
-	 * Link in dbuf_cache.
-	 */
+	/* Link in dbuf_cache or dbuf_metadata_cache */
 	multilist_node_t db_cache_link;
+
+	/* Tells us which dbuf cache this dbuf is in, if any */
+	dbuf_cached_state_t db_caching_status;
 
 	/* Data which is unique to data (leaf) blocks: */
 
 	/* User callback information. */
 	dmu_buf_user_t *db_user;
 
 	/*
 	 * Evict user data as soon as the dirty and reference
 	 * counts are equal.
 	 */
 	uint8_t db_user_immediate_evict;
 
 	/*
 	 * This block was freed while a read or write was
 	 * active.
 	 */
 	uint8_t db_freed_in_flight;
 
 	/*
 	 * dnode_evict_dbufs() or dnode_evict_bonus() tried to
 	 * evict this dbuf, but couldn't due to outstanding
 	 * references.  Evict once the refcount drops to 0.
 	 */
 	uint8_t db_pending_evict;
 
 	uint8_t db_dirtycnt;
 } dmu_buf_impl_t;
 
 /* Note: the dbuf hash table is exposed only for the mdb module */
 #define	DBUF_MUTEXES 256
 #define	DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)])
 typedef struct dbuf_hash_table {
 	uint64_t hash_table_mask;
 	dmu_buf_impl_t **hash_table;
 	kmutex_t hash_mutexes[DBUF_MUTEXES];
 } dbuf_hash_table_t;
 
 uint64_t dbuf_whichblock(struct dnode *di, int64_t level, uint64_t offset);
 
 dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data);
 void dbuf_create_bonus(struct dnode *dn);
 int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx);
 void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag);
 
 void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx);
 
 dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag);
 dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid,
     void *tag);
 int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid,
     boolean_t fail_sparse, boolean_t fail_uncached,
     void *tag, dmu_buf_impl_t **dbp);
 
 void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid,
     zio_priority_t prio, arc_flags_t aflags);
 
 void dbuf_add_ref(dmu_buf_impl_t *db, void *tag);
 boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj,
     uint64_t blkid, void *tag);
 uint64_t dbuf_refcount(dmu_buf_impl_t *db);
 
 void dbuf_rele(dmu_buf_impl_t *db, void *tag);
 void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag);
 
 dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level,
     uint64_t blkid);
 
 int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags);
 void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx);
 void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx);
 void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx);
 dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db);
 void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data,
     bp_embedded_type_t etype, enum zio_compress comp,
     int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx);
 
 void dbuf_destroy(dmu_buf_impl_t *db);
 
 void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
 void dbuf_unoverride(dbuf_dirty_record_t *dr);
 void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx);
 void dbuf_release_bp(dmu_buf_impl_t *db);
 
 boolean_t dbuf_can_remap(const dmu_buf_impl_t *buf);
 
 void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end,
     struct dmu_tx *);
 
 void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx);
 
 #define	DB_DNODE(_db)		((_db)->db_dnode_handle->dnh_dnode)
 #define	DB_DNODE_LOCK(_db)	((_db)->db_dnode_handle->dnh_zrlock)
 #define	DB_DNODE_ENTER(_db)	(zrl_add(&DB_DNODE_LOCK(_db)))
 #define	DB_DNODE_EXIT(_db)	(zrl_remove(&DB_DNODE_LOCK(_db)))
 #define	DB_DNODE_HELD(_db)	(!zrl_is_zero(&DB_DNODE_LOCK(_db)))
 
 void dbuf_init(void);
 void dbuf_fini(void);
 
 boolean_t dbuf_is_metadata(dmu_buf_impl_t *db);
 
 #define	DBUF_GET_BUFC_TYPE(_db)	\
 	(dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
 
 #define	DBUF_IS_CACHEABLE(_db)						\
 	((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL ||		\
 	(dbuf_is_metadata(_db) &&					\
 	((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA)))
 
 #define	DBUF_IS_L2CACHEABLE(_db)					\
 	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL ||	\
 	(dbuf_is_metadata(_db) &&					\
 	((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
 #define	DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level)				\
 	((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL ||	\
 	(((_level) > 0 ||						\
 	DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) &&	\
 	((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA)))
 
 #ifdef ZFS_DEBUG
 
 /*
  * There should be a ## between the string literal and fmt, to make it
  * clear that we're joining two strings together, but gcc does not
  * support that preprocessor token.
  */
 #define	dprintf_dbuf(dbuf, fmt, ...) do { \
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) { \
 	char __db_buf[32]; \
 	uint64_t __db_obj = (dbuf)->db.db_object; \
 	if (__db_obj == DMU_META_DNODE_OBJECT) \
 		(void) strcpy(__db_buf, "mdn"); \
 	else \
 		(void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \
 		    (u_longlong_t)__db_obj); \
 	dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \
 	    "obj=%s lvl=%u blkid=%lld " fmt, \
 	    __db_buf, (dbuf)->db_level, \
 	    (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \
 	} \
 _NOTE(CONSTCOND) } while (0)
 
 #define	dprintf_dbuf_bp(db, bp, fmt, ...) do {			\
 	if (zfs_flags & ZFS_DEBUG_DPRINTF) {			\
 	char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP);	\
 	snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp);		\
 	dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf);	\
 	kmem_free(__blkbuf, BP_SPRINTF_LEN);			\
 	}							\
 _NOTE(CONSTCOND) } while (0)
 
 #define	DBUF_VERIFY(db)	dbuf_verify(db)
 
 #else
 
 #define	dprintf_dbuf(db, fmt, ...)
 #define	dprintf_dbuf_bp(db, bp, fmt, ...)
 #define	DBUF_VERIFY(db)
 
 #endif
 
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_DBUF_H */
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h	(revision 336957)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu.h	(revision 336958)
@@ -1,975 +1,980 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
  * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
  * Copyright (c) 2012, Joyent, Inc. All rights reserved.
  * Copyright 2013 DEY Storage Systems, Inc.
  * Copyright 2014 HybridCluster. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright 2013 Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_DMU_H
 #define	_SYS_DMU_H
 
 /*
  * This file describes the interface that the DMU provides for its
  * consumers.
  *
  * The DMU also interacts with the SPA.  That interface is described in
  * dmu_spa.h.
  */
 
 #include <sys/zfs_context.h>
 #include <sys/inttypes.h>
 #include <sys/cred.h>
 #include <sys/fs/zfs.h>
 #include <sys/zio_compress.h>
 #include <sys/zio_priority.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 struct uio;
 struct xuio;
 struct page;
 struct vnode;
 struct spa;
 struct zilog;
 struct zio;
 struct blkptr;
 struct zap_cursor;
 struct dsl_dataset;
 struct dsl_pool;
 struct dnode;
 struct drr_begin;
 struct drr_end;
 struct zbookmark_phys;
 struct spa;
 struct nvlist;
 struct arc_buf;
 struct zio_prop;
 struct sa_handle;
 
 typedef struct objset objset_t;
 typedef struct dmu_tx dmu_tx_t;
 typedef struct dsl_dir dsl_dir_t;
 typedef struct dnode dnode_t;
 
 typedef enum dmu_object_byteswap {
 	DMU_BSWAP_UINT8,
 	DMU_BSWAP_UINT16,
 	DMU_BSWAP_UINT32,
 	DMU_BSWAP_UINT64,
 	DMU_BSWAP_ZAP,
 	DMU_BSWAP_DNODE,
 	DMU_BSWAP_OBJSET,
 	DMU_BSWAP_ZNODE,
 	DMU_BSWAP_OLDACL,
 	DMU_BSWAP_ACL,
 	/*
 	 * Allocating a new byteswap type number makes the on-disk format
 	 * incompatible with any other format that uses the same number.
 	 *
 	 * Data can usually be structured to work with one of the
 	 * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
 	 */
 	DMU_BSWAP_NUMFUNCS
 } dmu_object_byteswap_t;
 
 #define	DMU_OT_NEWTYPE 0x80
 #define	DMU_OT_METADATA 0x40
 #define	DMU_OT_BYTESWAP_MASK 0x3f
 
 /*
  * Defines a uint8_t object type. Object types specify if the data
  * in the object is metadata (boolean) and how to byteswap the data
- * (dmu_object_byteswap_t).
+ * (dmu_object_byteswap_t). All of the types created by this method
+ * are cached in the dbuf metadata cache.
  */
 #define	DMU_OT(byteswap, metadata) \
 	(DMU_OT_NEWTYPE | \
 	((metadata) ? DMU_OT_METADATA : 0) | \
 	((byteswap) & DMU_OT_BYTESWAP_MASK))
 
 #define	DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
 	(ot) < DMU_OT_NUMTYPES)
 
 #define	DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_METADATA) : \
 	dmu_ot[(ot)].ot_metadata)
 
+#define	DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
+	B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)
+
 /*
  * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
  * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
  * is repurposed for embedded BPs.
  */
 #define	DMU_OT_HAS_FILL(ot) \
 	((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)
 
 #define	DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
 	((ot) & DMU_OT_BYTESWAP_MASK) : \
 	dmu_ot[(ot)].ot_byteswap)
 
 typedef enum dmu_object_type {
 	DMU_OT_NONE,
 	/* general: */
 	DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
 	DMU_OT_OBJECT_ARRAY,		/* UINT64 */
 	DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
 	DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
 	DMU_OT_BPOBJ,			/* UINT64 */
 	DMU_OT_BPOBJ_HDR,		/* UINT64 */
 	/* spa: */
 	DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
 	DMU_OT_SPACE_MAP,		/* UINT64 */
 	/* zil: */
 	DMU_OT_INTENT_LOG,		/* UINT64 */
 	/* dmu: */
 	DMU_OT_DNODE,			/* DNODE */
 	DMU_OT_OBJSET,			/* OBJSET */
 	/* dsl: */
 	DMU_OT_DSL_DIR,			/* UINT64 */
 	DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
 	DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
 	DMU_OT_DSL_PROPS,		/* ZAP */
 	DMU_OT_DSL_DATASET,		/* UINT64 */
 	/* zpl: */
 	DMU_OT_ZNODE,			/* ZNODE */
 	DMU_OT_OLDACL,			/* Old ACL */
 	DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
 	DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
 	DMU_OT_MASTER_NODE,		/* ZAP */
 	DMU_OT_UNLINKED_SET,		/* ZAP */
 	/* zvol: */
 	DMU_OT_ZVOL,			/* UINT8 */
 	DMU_OT_ZVOL_PROP,		/* ZAP */
 	/* other; for testing only! */
 	DMU_OT_PLAIN_OTHER,		/* UINT8 */
 	DMU_OT_UINT64_OTHER,		/* UINT64 */
 	DMU_OT_ZAP_OTHER,		/* ZAP */
 	/* new object types: */
 	DMU_OT_ERROR_LOG,		/* ZAP */
 	DMU_OT_SPA_HISTORY,		/* UINT8 */
 	DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
 	DMU_OT_POOL_PROPS,		/* ZAP */
 	DMU_OT_DSL_PERMS,		/* ZAP */
 	DMU_OT_ACL,			/* ACL */
 	DMU_OT_SYSACL,			/* SYSACL */
 	DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
 	DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
 	DMU_OT_NEXT_CLONES,		/* ZAP */
 	DMU_OT_SCAN_QUEUE,		/* ZAP */
 	DMU_OT_USERGROUP_USED,		/* ZAP */
 	DMU_OT_USERGROUP_QUOTA,		/* ZAP */
 	DMU_OT_USERREFS,		/* ZAP */
 	DMU_OT_DDT_ZAP,			/* ZAP */
 	DMU_OT_DDT_STATS,		/* ZAP */
 	DMU_OT_SA,			/* System attr */
 	DMU_OT_SA_MASTER_NODE,		/* ZAP */
 	DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
 	DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
 	DMU_OT_SCAN_XLATE,		/* ZAP */
 	DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
 	DMU_OT_DEADLIST,		/* ZAP */
 	DMU_OT_DEADLIST_HDR,		/* UINT64 */
 	DMU_OT_DSL_CLONES,		/* ZAP */
 	DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
 	/*
 	 * Do not allocate new object types here. Doing so makes the on-disk
 	 * format incompatible with any other format that uses the same object
 	 * type number.
 	 *
 	 * When creating an object which does not have one of the above types
 	 * use the DMU_OTN_* type with the correct byteswap and metadata
 	 * values.
 	 *
 	 * The DMU_OTN_* types do not have entries in the dmu_ot table,
 	 * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead
 	 * of indexing into dmu_ot directly (this works for both DMU_OT_* types
 	 * and DMU_OTN_* types).
 	 */
 	DMU_OT_NUMTYPES,
 
 	/*
 	 * Names for valid types declared with DMU_OT().
 	 */
 	DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
 	DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
 	DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
 	DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
 	DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
 	DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
 	DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
 	DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
 	DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
 	DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
 } dmu_object_type_t;
 
 /*
  * These flags are intended to be used to specify the "txg_how"
  * parameter when calling the dmu_tx_assign() function. See the comment
  * above dmu_tx_assign() for more details on the meaning of these flags.
  */
 #define	TXG_NOWAIT	(0ULL)
 #define	TXG_WAIT	(1ULL<<0)
 #define	TXG_NOTHROTTLE	(1ULL<<1)
 
 void byteswap_uint64_array(void *buf, size_t size);
 void byteswap_uint32_array(void *buf, size_t size);
 void byteswap_uint16_array(void *buf, size_t size);
 void byteswap_uint8_array(void *buf, size_t size);
 void zap_byteswap(void *buf, size_t size);
 void zfs_oldacl_byteswap(void *buf, size_t size);
 void zfs_acl_byteswap(void *buf, size_t size);
 void zfs_znode_byteswap(void *buf, size_t size);
 
 #define	DS_FIND_SNAPSHOTS	(1<<0)
 #define	DS_FIND_CHILDREN	(1<<1)
 #define	DS_FIND_SERIALIZE	(1<<2)
 
 /*
  * The maximum number of bytes that can be accessed as part of one
  * operation, including metadata.
  */
 #define	DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */
 #define	DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */
 
 #define	DMU_USERUSED_OBJECT	(-1ULL)
 #define	DMU_GROUPUSED_OBJECT	(-2ULL)
 
 /*
  * artificial blkids for bonus buffer and spill blocks
  */
 #define	DMU_BONUS_BLKID		(-1ULL)
 #define	DMU_SPILL_BLKID		(-2ULL)
 /*
  * Public routines to create, destroy, open, and close objsets.
  */
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
 int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);
 
 void dmu_objset_evict_dbufs(objset_t *os);
 int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
     void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
 int dmu_objset_clone(const char *name, const char *origin);
 int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
     struct nvlist *errlist);
 int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
 int dmu_objset_snapshot_tmp(const char *, const char *, int);
 int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
     int flags);
 void dmu_objset_byteswap(void *buf, size_t size);
 int dsl_dataset_rename_snapshot(const char *fsname,
     const char *oldsnapname, const char *newsnapname, boolean_t recursive);
 int dmu_objset_remap_indirects(const char *fsname);
 
 typedef struct dmu_buf {
 	uint64_t db_object;		/* object that this buffer is part of */
 	uint64_t db_offset;		/* byte offset in this object */
 	uint64_t db_size;		/* size of buffer in bytes */
 	void *db_data;			/* data in buffer */
 } dmu_buf_t;
 
 /*
  * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
  */
 #define	DMU_POOL_DIRECTORY_OBJECT	1
 #define	DMU_POOL_CONFIG			"config"
 #define	DMU_POOL_FEATURES_FOR_WRITE	"features_for_write"
 #define	DMU_POOL_FEATURES_FOR_READ	"features_for_read"
 #define	DMU_POOL_FEATURE_DESCRIPTIONS	"feature_descriptions"
 #define	DMU_POOL_FEATURE_ENABLED_TXG	"feature_enabled_txg"
 #define	DMU_POOL_ROOT_DATASET		"root_dataset"
 #define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
 #define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
 #define	DMU_POOL_ERRLOG_LAST		"errlog_last"
 #define	DMU_POOL_SPARES			"spares"
 #define	DMU_POOL_DEFLATE		"deflate"
 #define	DMU_POOL_HISTORY		"history"
 #define	DMU_POOL_PROPS			"pool_props"
 #define	DMU_POOL_L2CACHE		"l2cache"
 #define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
 #define	DMU_POOL_DDT			"DDT-%s-%s-%s"
 #define	DMU_POOL_DDT_STATS		"DDT-statistics"
 #define	DMU_POOL_CREATION_VERSION	"creation_version"
 #define	DMU_POOL_SCAN			"scan"
 #define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
 #define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
 #define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
 #define	DMU_POOL_CHECKSUM_SALT		"org.illumos:checksum_salt"
 #define	DMU_POOL_VDEV_ZAP_MAP		"com.delphix:vdev_zap_map"
 #define	DMU_POOL_REMOVING		"com.delphix:removing"
 #define	DMU_POOL_OBSOLETE_BPOBJ		"com.delphix:obsolete_bpobj"
 #define	DMU_POOL_CONDENSING_INDIRECT	"com.delphix:condensing_indirect"
 #define	DMU_POOL_ZPOOL_CHECKPOINT	"com.delphix:zpool_checkpoint"
 
 /*
  * Allocate an object from this objset.  The range of object numbers
  * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
  *
  * The transaction must be assigned to a txg.  The newly allocated
  * object will be "held" in the transaction (ie. you can modify the
  * newly allocated object in this transaction).
  *
  * dmu_object_alloc() chooses an object and returns it in *objectp.
  *
  * dmu_object_claim() allocates a specific object number.  If that
  * number is already allocated, it fails and returns EEXIST.
  *
  * Return 0 on success, or ENOSPC or EEXIST as specified above.
  */
 uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
 int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
     int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
 
 /*
  * Free an object from this objset.
  *
  * The object's data will be freed as well (ie. you don't need to call
  * dmu_free(object, 0, -1, tx)).
  *
  * The object need not be held in the transaction.
  *
  * If there are any holds on this object's buffers (via dmu_buf_hold()),
  * or tx holds on the object (via dmu_tx_hold_object()), you can not
  * free it; it fails and returns EBUSY.
  *
  * If the object is not allocated, it fails and returns ENOENT.
  *
  * Return 0 on success, or EBUSY or ENOENT as specified above.
  */
 int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
 
 /*
  * Find the next allocated or free object.
  *
  * The objectp parameter is in-out.  It will be updated to be the next
  * object which is allocated.  Ignore objects which have not been
  * modified since txg.
  *
  * XXX Can only be called on a objset with no dirty data.
  *
  * Returns 0 on success, or ENOENT if there are no more objects.
  */
 int dmu_object_next(objset_t *os, uint64_t *objectp,
     boolean_t hole, uint64_t txg);
 
 /*
  * Set the data blocksize for an object.
  *
  * The object cannot have any blocks allcated beyond the first.  If
  * the first block is allocated already, the new size must be greater
  * than the current block size.  If these conditions are not met,
  * ENOTSUP will be returned.
  *
  * Returns 0 on success, or EBUSY if there are any holds on the object
  * contents, or ENOTSUP as described above.
  */
 int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
     int ibs, dmu_tx_t *tx);
 
 /*
  * Set the checksum property on a dnode.  The new checksum algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
     dmu_tx_t *tx);
 
 /*
  * Set the compress property on a dnode.  The new compression algorithm will
  * apply to all newly written blocks; existing blocks will not be affected.
  */
 void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
     dmu_tx_t *tx);
 
 int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg);
 
 void
 dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
     void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
     int compressed_size, int byteorder, dmu_tx_t *tx);
 
 /*
  * Decide how to write a block: checksum, compression, number of copies, etc.
  */
 #define	WP_NOFILL	0x1
 #define	WP_DMU_SYNC	0x2
 #define	WP_SPILL	0x4
 
 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
     struct zio_prop *zp);
 /*
  * The bonus data is accessed more or less like a regular buffer.
  * You must dmu_bonus_hold() to get the buffer, which will give you a
  * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
  * data.  As with any normal buffer, you must call dmu_buf_will_dirty()
  * before modifying it, and the
  * object must be held in an assigned transaction before calling
  * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
  * buffer as well.  You must release your hold with dmu_buf_rele().
  *
  * Returns ENOENT, EIO, or 0.
  */
 int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
 int dmu_bonus_max(void);
 int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
 int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
 dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
 int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);
 
 /*
  * Special spill buffer support used by "SA" framework
  */
 
 int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
     void *tag, dmu_buf_t **dbp);
 int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
 
 /*
  * Obtain the DMU buffer from the specified object which contains the
  * specified offset.  dmu_buf_hold() puts a "hold" on the buffer, so
  * that it will remain in memory.  You must release the hold with
  * dmu_buf_rele().  You musn't access the dmu_buf_t after releasing your
  * hold.  You must have a hold on any dmu_buf_t* you pass to the DMU.
  *
  * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
  * on the returned buffer before reading or writing the buffer's
  * db_data.  The comments for those routines describe what particular
  * operations are valid after calling them.
  *
  * The object number must be a valid, allocated object number.
  */
 int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
     void *tag, dmu_buf_t **, int flags);
 int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
     void *tag, dmu_buf_t **dbp, int flags);
 
 /*
  * Add a reference to a dmu buffer that has already been held via
  * dmu_buf_hold() in the current context.
  */
 void dmu_buf_add_ref(dmu_buf_t *db, void* tag);
 
 /*
  * Attempt to add a reference to a dmu buffer that is in an unknown state,
  * using a pointer that may have been invalidated by eviction processing.
  * The request will succeed if the passed in dbuf still represents the
  * same os/object/blkid, is ineligible for eviction, and has at least
  * one hold by a user other than the syncer.
  */
 boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
     uint64_t blkid, void *tag);
 
 void dmu_buf_rele(dmu_buf_t *db, void *tag);
 uint64_t dmu_buf_refcount(dmu_buf_t *db);
 
 /*
  * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
  * range of an object.  A pointer to an array of dmu_buf_t*'s is
  * returned (in *dbpp).
  *
  * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
  * frees the array.  The hold on the array of buffers MUST be released
  * with dmu_buf_rele_array.  You can NOT release the hold on each buffer
  * individually with dmu_buf_rele.
  */
 int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
     uint64_t length, boolean_t read, void *tag,
     int *numbufsp, dmu_buf_t ***dbpp);
 void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);
 
 typedef void dmu_buf_evict_func_t(void *user_ptr);
 
 /*
  * A DMU buffer user object may be associated with a dbuf for the
  * duration of its lifetime.  This allows the user of a dbuf (client)
  * to attach private data to a dbuf (e.g. in-core only data such as a
  * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
  * when that dbuf has been evicted.  Clients typically respond to the
  * eviction notification by freeing their private data, thus ensuring
  * the same lifetime for both dbuf and private data.
  *
  * The mapping from a dmu_buf_user_t to any client private data is the
  * client's responsibility.  All current consumers of the API with private
  * data embed a dmu_buf_user_t as the first member of the structure for
  * their private data.  This allows conversions between the two types
  * with a simple cast.  Since the DMU buf user API never needs access
  * to the private data, other strategies can be employed if necessary
  * or convenient for the client (e.g. using container_of() to do the
  * conversion for private data that cannot have the dmu_buf_user_t as
  * its first member).
  *
  * Eviction callbacks are executed without the dbuf mutex held or any
  * other type of mechanism to guarantee that the dbuf is still available.
  * For this reason, users must assume the dbuf has already been freed
  * and not reference the dbuf from the callback context.
  *
  * Users requesting "immediate eviction" are notified as soon as the dbuf
  * is only referenced by dirty records (dirties == holds).  Otherwise the
  * notification occurs after eviction processing for the dbuf begins.
  */
 typedef struct dmu_buf_user {
 	/*
 	 * Asynchronous user eviction callback state.
 	 */
 	taskq_ent_t	dbu_tqent;
 
 	/*
 	 * This instance's eviction function pointers.
 	 *
 	 * dbu_evict_func_sync is called synchronously and then
 	 * dbu_evict_func_async is executed asynchronously on a taskq.
 	 */
 	dmu_buf_evict_func_t *dbu_evict_func_sync;
 	dmu_buf_evict_func_t *dbu_evict_func_async;
 #ifdef ZFS_DEBUG
 	/*
 	 * Pointer to user's dbuf pointer.  NULL for clients that do
 	 * not associate a dbuf with their user data.
 	 *
 	 * The dbuf pointer is cleared upon eviction so as to catch
 	 * use-after-evict bugs in clients.
 	 */
 	dmu_buf_t **dbu_clear_on_evict_dbufp;
 #endif
 } dmu_buf_user_t;
 
 /*
  * Initialize the given dmu_buf_user_t instance with the eviction function
  * evict_func, to be called when the user is evicted.
  *
  * NOTE: This function should only be called once on a given dmu_buf_user_t.
  *       To allow enforcement of this, dbu must already be zeroed on entry.
  */
 /*ARGSUSED*/
 inline void
 dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync,
     dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp)
 {
 	ASSERT(dbu->dbu_evict_func_sync == NULL);
 	ASSERT(dbu->dbu_evict_func_async == NULL);
 
 	/* must have at least one evict func */
 	IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
 	dbu->dbu_evict_func_sync = evict_func_sync;
 	dbu->dbu_evict_func_async = evict_func_async;
 #ifdef ZFS_DEBUG
 	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
 #endif
 }
 
 /*
  * Attach user data to a dbuf and mark it for normal (when the dbuf's
  * data is cleared or its reference count goes to zero) eviction processing.
  *
  * Returns NULL on success, or the existing user if another user currently
  * owns the buffer.
  */
 void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Attach user data to a dbuf and mark it for immediate (its dirty and
  * reference counts are equal) eviction processing.
  *
  * Returns NULL on success, or the existing user if another user currently
  * owns the buffer.
  */
 void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Replace the current user of a dbuf.
  *
  * If given the current user of a dbuf, replaces the dbuf's user with
  * "new_user" and returns the user data pointer that was replaced.
  * Otherwise returns the current, and unmodified, dbuf user pointer.
  */
 void *dmu_buf_replace_user(dmu_buf_t *db,
     dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);
 
 /*
  * Remove the specified user data for a DMU buffer.
  *
  * Returns the user that was removed on success, or the current user if
  * another user currently owns the buffer.
  */
 void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);
 
 /*
  * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
  */
 void *dmu_buf_get_user(dmu_buf_t *db);
 
 objset_t *dmu_buf_get_objset(dmu_buf_t *db);
 dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
 void dmu_buf_dnode_exit(dmu_buf_t *db);
 
 /* Block until any in-progress dmu buf user evictions complete. */
 void dmu_buf_user_evict_wait(void);
 
 /*
  * Returns the blkptr associated with this dbuf, or NULL if not set.
  */
 struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);
 
 /*
  * Indicate that you are going to modify the buffer's data (db_data).
  *
  * The transaction (tx) must be assigned to a txg (ie. you've called
  * dmu_tx_assign()).  The buffer's object must be held in the tx
  * (ie. you've called dmu_tx_hold_object(tx, db->db_object)).
  */
 void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
 
 /*
  * You must create a transaction, then hold the objects which you will
  * (or might) modify as part of this transaction.  Then you must assign
  * the transaction to a transaction group.  Once the transaction has
  * been assigned, you can modify buffers which belong to held objects as
  * part of this transaction.  You can't modify buffers before the
  * transaction has been assigned; you can't modify buffers which don't
  * belong to objects which this transaction holds; you can't hold
  * objects once the transaction has been assigned.  You may hold an
  * object which you are going to free (with dmu_object_free()), but you
  * don't have to.
  *
  * You can abort the transaction before it has been assigned.
  *
  * Note that you may hold buffers (with dmu_buf_hold) at any time,
  * regardless of transaction state.
  */
 
 #define	DMU_NEW_OBJECT	(-1ULL)
 #define	DMU_OBJECT_END	(-1ULL)
 
 dmu_tx_t *dmu_tx_create(objset_t *os);
 void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
 void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
     int len);
 void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
     uint64_t len);
 void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
 void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
     const char *name);
 void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn);
 void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
 void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
 void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
 void dmu_tx_abort(dmu_tx_t *tx);
 int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
 void dmu_tx_wait(dmu_tx_t *tx);
 void dmu_tx_commit(dmu_tx_t *tx);
 void dmu_tx_mark_netfree(dmu_tx_t *tx);
 
 /*
  * To register a commit callback, dmu_tx_callback_register() must be called.
  *
  * dcb_data is a pointer to caller private data that is passed on as a
  * callback parameter. The caller is responsible for properly allocating and
  * freeing it.
  *
  * When registering a callback, the transaction must be already created, but
  * it cannot be committed or aborted. It can be assigned to a txg or not.
  *
  * The callback will be called after the transaction has been safely written
  * to stable storage and will also be called if the dmu_tx is aborted.
  * If there is any error which prevents the transaction from being committed to
  * disk, the callback will be called with a value of error != 0.
  */
 typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
 
 void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
     void *dcb_data);
 
 /*
  * Free up the data blocks for a defined range of a file.  If size is
  * -1, the range from offset to end-of-file is freed.
  */
 int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size, dmu_tx_t *tx);
 int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
 	uint64_t size);
 int dmu_free_long_object(objset_t *os, uint64_t object);
 
 /*
  * Convenience functions.
  *
  * Canfail routines will return 0 on success, or an errno if there is a
  * nonrecoverable I/O error.
  */
 #define	DMU_READ_PREFETCH	0 /* prefetch */
 #define	DMU_READ_NO_PREFETCH	1 /* don't prefetch */
 int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	void *buf, uint32_t flags);
 int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
     uint32_t flags);
 void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	const void *buf, dmu_tx_t *tx);
 void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
     const void *buf, dmu_tx_t *tx);
 void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
 	dmu_tx_t *tx);
 int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size);
 int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
 int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
 int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
     dmu_tx_t *tx);
 int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
     uint64_t size, struct page *pp, dmu_tx_t *tx);
 struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
 void dmu_return_arcbuf(struct arc_buf *buf);
 void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf,
     dmu_tx_t *tx);
 int dmu_xuio_init(struct xuio *uio, int niov);
 void dmu_xuio_fini(struct xuio *uio);
 int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
     size_t n);
 int dmu_xuio_cnt(struct xuio *uio);
 struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
 void dmu_xuio_clear(struct xuio *uio, int i);
 void xuio_stat_wbuf_copied(void);
 void xuio_stat_wbuf_nocopy(void);
 
 extern boolean_t zfs_prefetch_disable;
 extern int zfs_max_recordsize;
 
 /*
  * Asynchronously try to read in the data.
  */
 void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
     uint64_t len, enum zio_priority pri);
 
 typedef struct dmu_object_info {
 	/* All sizes are in bytes unless otherwise indicated. */
 	uint32_t doi_data_block_size;
 	uint32_t doi_metadata_block_size;
 	dmu_object_type_t doi_type;
 	dmu_object_type_t doi_bonus_type;
 	uint64_t doi_bonus_size;
 	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
 	uint8_t doi_checksum;
 	uint8_t doi_compress;
 	uint8_t doi_nblkptr;
 	uint8_t doi_pad[4];
 	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
 	uint64_t doi_max_offset;
 	uint64_t doi_fill_count;		/* number of non-empty blocks */
 } dmu_object_info_t;
 
 typedef void arc_byteswap_func_t(void *buf, size_t size);
 
 typedef struct dmu_object_type_info {
 	dmu_object_byteswap_t	ot_byteswap;
 	boolean_t		ot_metadata;
+	boolean_t		ot_dbuf_metadata_cache;
 	char			*ot_name;
 } dmu_object_type_info_t;
 
 typedef struct dmu_object_byteswap_info {
 	arc_byteswap_func_t	*ob_func;
 	char			*ob_name;
 } dmu_object_byteswap_info_t;
 
 extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
 extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
 
 /*
  * Get information on a DMU object.
  *
  * Return 0 on success or ENOENT if object is not allocated.
  *
  * If doi is NULL, just indicates whether the object exists.
  */
 int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dnode in hand. */
 void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);
 /* Like dmu_object_info, but faster if you have a held dbuf in hand. */
 void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
 /*
  * Like dmu_object_info_from_db, but faster still when you only care about
  * the size.  This is specifically optimized for zfs_getattr().
  */
 void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
     u_longlong_t *nblk512);
 
 typedef struct dmu_objset_stats {
 	uint64_t dds_num_clones; /* number of clones of this */
 	uint64_t dds_creation_txg;
 	uint64_t dds_guid;
 	dmu_objset_type_t dds_type;
 	uint8_t dds_is_snapshot;
 	uint8_t dds_inconsistent;
 	char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
 } dmu_objset_stats_t;
 
 /*
  * Get stats on a dataset.
  */
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 
 /*
  * Add entries to the nvlist for all the objset's properties.  See
  * zfs_prop_table[] and zfs(1m) for details on the properties.
  */
 void dmu_objset_stats(objset_t *os, struct nvlist *nv);
 
 /*
  * Get the space usage statistics for statvfs().
  *
  * refdbytes is the amount of space "referenced" by this objset.
  * availbytes is the amount of space available to this objset, taking
  * into account quotas & reservations, assuming that no other objsets
  * use the space first.  These values correspond to the 'referenced' and
  * 'available' properties, described in the zfs(1m) manpage.
  *
  * usedobjs and availobjs are the number of objects currently allocated,
  * and available.
  */
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 
 /*
  * The fsid_guid is a 56-bit ID that can change to avoid collisions.
  * (Contrast with the ds_guid which is a 64-bit ID that will never
  * change, so there is a small probability that it will collide.)
  */
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 
 /*
  * Get the [cm]time for an objset's snapshot dir
  */
 timestruc_t dmu_objset_snap_cmtime(objset_t *os);
 
 int dmu_objset_is_snapshot(objset_t *os);
 
 extern struct spa *dmu_objset_spa(objset_t *os);
 extern struct zilog *dmu_objset_zil(objset_t *os);
 extern struct dsl_pool *dmu_objset_pool(objset_t *os);
 extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
 extern void dmu_objset_name(objset_t *os, char *buf);
 extern dmu_objset_type_t dmu_objset_type(objset_t *os);
 extern uint64_t dmu_objset_id(objset_t *os);
 extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
 extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
 extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
     uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
 extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
     int maxlen, boolean_t *conflict);
 extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
     uint64_t *idp, uint64_t *offp);
 
 typedef int objset_used_cb_t(dmu_object_type_t bonustype,
     void *bonus, uint64_t *userp, uint64_t *groupp);
 extern void dmu_objset_register_type(dmu_objset_type_t ost,
     objset_used_cb_t *cb);
 extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
 extern void *dmu_objset_get_user(objset_t *os);
 
 /*
  * Return the txg number for the given assigned transaction.
  */
 uint64_t dmu_tx_get_txg(dmu_tx_t *tx);
 
 /*
  * Synchronous write.
  * If a parent zio is provided this function initiates a write on the
  * provided buffer as a child of the parent zio.
  * In the absence of a parent zio, the write is completed synchronously.
  * At write completion, blk is filled with the bp of the written block.
  * Note that while the data covered by this function will be on stable
  * storage when the write completes this new data does not become a
  * permanent part of the file until the associated transaction commits.
  */
 
 /*
  * {zfs,zvol,ztest}_get_done() args
  */
 typedef struct zgd {
 	struct lwb	*zgd_lwb;
 	struct blkptr	*zgd_bp;
 	dmu_buf_t	*zgd_db;
 	struct rl	*zgd_rl;
 	void		*zgd_private;
 } zgd_t;
 
 typedef void dmu_sync_cb_t(zgd_t *arg, int error);
 int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);
 
 /*
  * Find the next hole or data block in file starting at *off
  * Return found offset in *off. Return ESRCH for end of file.
  */
 int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
     uint64_t *off);
 
 /*
  * Check if a DMU object has any dirty blocks. If so, sync out
  * all pending transaction groups. Otherwise, this function
  * does not alter DMU state. This could be improved to only sync
  * out the necessary transaction groups for this particular
  * object.
  */
 int dmu_object_wait_synced(objset_t *os, uint64_t object);
 
 /*
  * Initial setup and final teardown.
  */
 extern void dmu_init(void);
 extern void dmu_fini(void);
 
 typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
     uint64_t object, uint64_t offset, int len);
 void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
     dmu_traverse_cb_t cb, void *arg);
 
 int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
     struct vnode *vp, offset_t *offp);
 
 /* CRC64 table */
 #define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
 extern uint64_t zfs_crc64_table[256];
 
 extern int zfs_mdcomp_disable;
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif	/* _SYS_DMU_H */
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_objset.h
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_objset.h	(revision 336957)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/dmu_objset.h	(revision 336958)
@@ -1,199 +1,211 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
  * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
  * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #ifndef	_SYS_DMU_OBJSET_H
 #define	_SYS_DMU_OBJSET_H
 
 #include <sys/spa.h>
 #include <sys/arc.h>
 #include <sys/txg.h>
 #include <sys/zfs_context.h>
 #include <sys/dnode.h>
 #include <sys/zio.h>
 #include <sys/zil.h>
 #include <sys/sa.h>
+#include <sys/zfs_ioctl.h>
 
 #ifdef	__cplusplus
 extern "C" {
 #endif
 
 extern krwlock_t os_lock;
 
 struct dsl_pool;
 struct dsl_dataset;
 struct dmu_tx;
 
 #define	OBJSET_PHYS_SIZE 2048
 #define	OBJSET_OLD_PHYS_SIZE 1024
 
 #define	OBJSET_BUF_HAS_USERUSED(buf) \
 	(arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE)
 
 #define	OBJSET_FLAG_USERACCOUNTING_COMPLETE	(1ULL<<0)
 
 typedef struct objset_phys {
 	dnode_phys_t os_meta_dnode;
 	zil_header_t os_zil_header;
 	uint64_t os_type;
 	uint64_t os_flags;
 	char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 -
 	    sizeof (zil_header_t) - sizeof (uint64_t)*2];
 	dnode_phys_t os_userused_dnode;
 	dnode_phys_t os_groupused_dnode;
 } objset_phys_t;
 
+#define	OBJSET_PROP_UNINITIALIZED	((uint64_t)-1)
 struct objset {
 	/* Immutable: */
 	struct dsl_dataset *os_dsl_dataset;
 	spa_t *os_spa;
 	arc_buf_t *os_phys_buf;
 	objset_phys_t *os_phys;
 	/*
 	 * The following "special" dnodes have no parent, are exempt
 	 * from dnode_move(), and are not recorded in os_dnodes, but they
 	 * root their descendents in this objset using handles anyway, so
 	 * that all access to dnodes from dbufs consistently uses handles.
 	 */
 	dnode_handle_t os_meta_dnode;
 	dnode_handle_t os_userused_dnode;
 	dnode_handle_t os_groupused_dnode;
 	zilog_t *os_zil;
 
 	list_node_t os_evicting_node;
 
 	/* can change, under dsl_dir's locks: */
 	enum zio_checksum os_checksum;
 	enum zio_compress os_compress;
 	uint8_t os_copies;
 	enum zio_checksum os_dedup_checksum;
 	boolean_t os_dedup_verify;
 	zfs_logbias_op_t os_logbias;
 	zfs_cache_type_t os_primary_cache;
 	zfs_cache_type_t os_secondary_cache;
 	zfs_sync_type_t os_sync;
 	zfs_redundant_metadata_type_t os_redundant_metadata;
 	int os_recordsize;
+	/*
+	 * The next four values are used as a cache of whatever's on disk, and
+	 * are initialized the first time these properties are queried. Before
+	 * being initialized with their real values, their values are
+	 * OBJSET_PROP_UNINITIALIZED.
+	 */
+	uint64_t os_version;
+	uint64_t os_normalization;
+	uint64_t os_utf8only;
+	uint64_t os_casesensitivity;
 
 	/*
 	 * Pointer is constant; the blkptr it points to is protected by
 	 * os_dsl_dataset->ds_bp_rwlock
 	 */
 	blkptr_t *os_rootbp;
 
 	/* no lock needed: */
 	struct dmu_tx *os_synctx; /* XXX sketchy */
 	zil_header_t os_zil_header;
 	multilist_t *os_synced_dnodes;
 	uint64_t os_flags;
 	uint64_t os_freed_dnodes;
 	boolean_t os_rescan_dnodes;
 
 	/* Protected by os_obj_lock */
 	kmutex_t os_obj_lock;
 	uint64_t os_obj_next;
 
 	/* Protected by os_lock */
 	kmutex_t os_lock;
 	multilist_t *os_dirty_dnodes[TXG_SIZE];
 	list_t os_dnodes;
 	list_t os_downgraded_dbufs;
 
 	/* Protects changes to DMU_{USER,GROUP}USED_OBJECT */
 	kmutex_t os_userused_lock;
 
 	/* stuff we store for the user */
 	kmutex_t os_user_ptr_lock;
 	void *os_user_ptr;
 	sa_os_t *os_sa;
 };
 
 #define	DMU_META_OBJSET		0
 #define	DMU_META_DNODE_OBJECT	0
 #define	DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0)
 #define	DMU_META_DNODE(os)	((os)->os_meta_dnode.dnh_dnode)
 #define	DMU_USERUSED_DNODE(os)	((os)->os_userused_dnode.dnh_dnode)
 #define	DMU_GROUPUSED_DNODE(os)	((os)->os_groupused_dnode.dnh_dnode)
 
 #define	DMU_OS_IS_L2CACHEABLE(os)				\
 	((os)->os_secondary_cache == ZFS_CACHE_ALL ||		\
 	(os)->os_secondary_cache == ZFS_CACHE_METADATA)
 
 #define	DMU_OS_IS_L2COMPRESSIBLE(os)	(zfs_mdcomp_disable == B_FALSE)
 
 /* called from zpl */
 int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
 int dmu_objset_own(const char *name, dmu_objset_type_t type,
     boolean_t readonly, void *tag, objset_t **osp);
 int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj,
     dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp);
 void dmu_objset_refresh_ownership(struct dsl_dataset *ds,
     struct dsl_dataset **newds, void *tag);
 void dmu_objset_rele(objset_t *os, void *tag);
 void dmu_objset_disown(objset_t *os, void *tag);
 int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp);
 
 void dmu_objset_stats(objset_t *os, nvlist_t *nv);
 void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);
 void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
     uint64_t *usedobjsp, uint64_t *availobjsp);
 uint64_t dmu_objset_fsid_guid(objset_t *os);
 int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj,
     int func(struct dsl_pool *, struct dsl_dataset *, void *),
     void *arg, int flags);
 int dmu_objset_prefetch(const char *name, void *arg);
 void dmu_objset_evict_dbufs(objset_t *os);
 timestruc_t dmu_objset_snap_cmtime(objset_t *os);
 
 /* called from dsl */
 void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx);
 boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg);
 objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds,
     blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx);
 int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp,
     objset_t **osp);
 void dmu_objset_evict(objset_t *os);
 void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx);
 void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx);
 boolean_t dmu_objset_userused_enabled(objset_t *os);
 int dmu_objset_userspace_upgrade(objset_t *os);
 boolean_t dmu_objset_userspace_present(objset_t *os);
 int dmu_fsname(const char *snapname, char *buf);
 
 void dmu_objset_evict_done(objset_t *os);
 void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx);
 
 void dmu_objset_init(void);
 void dmu_objset_fini(void);
 
 #ifdef	__cplusplus
 }
 #endif
 
 #endif /* _SYS_DMU_OBJSET_H */
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vfsops.c
===================================================================
--- vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vfsops.c	(revision 336957)
+++ vendor-sys/illumos/dist/uts/common/fs/zfs/zfs_vfsops.c	(revision 336958)
@@ -1,2319 +1,2359 @@
 /*
  * CDDL HEADER START
  *
  * The contents of this file are subject to the terms of the
  * Common Development and Distribution License (the "License").
  * You may not use this file except in compliance with the License.
  *
  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
  * or http://www.opensolaris.org/os/licensing.
  * See the License for the specific language governing permissions
  * and limitations under the License.
  *
  * When distributing Covered Code, include this CDDL HEADER in each
  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  * If applicable, add the following below this CDDL HEADER, with the
  * fields enclosed by brackets "[]" replaced with your own identifying
  * information: Portions Copyright [yyyy] [name of copyright owner]
  *
  * CDDL HEADER END
  */
 /*
  * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
  * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
  * Copyright (c) 2014 Integros [integros.com]
  * Copyright 2016 Nexenta Systems, Inc. All rights reserved.
  */
 
 /* Portions Copyright 2010 Robert Milkowski */
 
 #include <sys/types.h>
 #include <sys/param.h>
 #include <sys/systm.h>
 #include <sys/sysmacros.h>
 #include <sys/kmem.h>
 #include <sys/pathname.h>
 #include <sys/vnode.h>
 #include <sys/vfs.h>
 #include <sys/vfs_opreg.h>
 #include <sys/mntent.h>
 #include <sys/mount.h>
 #include <sys/cmn_err.h>
 #include "fs/fs_subr.h"
 #include <sys/zfs_znode.h>
 #include <sys/zfs_dir.h>
 #include <sys/zil.h>
 #include <sys/fs/zfs.h>
 #include <sys/dmu.h>
 #include <sys/dsl_prop.h>
 #include <sys/dsl_dataset.h>
 #include <sys/dsl_deleg.h>
 #include <sys/spa.h>
 #include <sys/zap.h>
 #include <sys/sa.h>
 #include <sys/sa_impl.h>
 #include <sys/varargs.h>
 #include <sys/policy.h>
 #include <sys/atomic.h>
 #include <sys/mkdev.h>
 #include <sys/modctl.h>
 #include <sys/refstr.h>
 #include <sys/zfs_ioctl.h>
 #include <sys/zfs_ctldir.h>
 #include <sys/zfs_fuid.h>
 #include <sys/bootconf.h>
 #include <sys/sunddi.h>
 #include <sys/dnlc.h>
 #include <sys/dmu_objset.h>
 #include <sys/spa_boot.h>
 #include "zfs_comutil.h"
 
 int zfsfstype;
 vfsops_t *zfs_vfsops = NULL;
 static major_t zfs_major;
 static minor_t zfs_minor;
 static kmutex_t	zfs_dev_mtx;
 
 extern int sys_shutdown;
 
 static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr);
 static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr);
 static int zfs_mountroot(vfs_t *vfsp, enum whymountroot);
 static int zfs_root(vfs_t *vfsp, vnode_t **vpp);
 static int zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp);
 static int zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp);
 static void zfs_freevfs(vfs_t *vfsp);
 
 static const fs_operation_def_t zfs_vfsops_template[] = {
 	VFSNAME_MOUNT,		{ .vfs_mount = zfs_mount },
 	VFSNAME_MOUNTROOT,	{ .vfs_mountroot = zfs_mountroot },
 	VFSNAME_UNMOUNT,	{ .vfs_unmount = zfs_umount },
 	VFSNAME_ROOT,		{ .vfs_root = zfs_root },
 	VFSNAME_STATVFS,	{ .vfs_statvfs = zfs_statvfs },
 	VFSNAME_SYNC,		{ .vfs_sync = zfs_sync },
 	VFSNAME_VGET,		{ .vfs_vget = zfs_vget },
 	VFSNAME_FREEVFS,	{ .vfs_freevfs = zfs_freevfs },
 	NULL,			NULL
 };
 
 /*
  * We need to keep a count of active fs's.
  * This is necessary to prevent our module
  * from being unloaded after a umount -f
  */
 static uint32_t	zfs_active_fs_count = 0;
 
 static char *noatime_cancel[] = { MNTOPT_ATIME, NULL };
 static char *atime_cancel[] = { MNTOPT_NOATIME, NULL };
 static char *noxattr_cancel[] = { MNTOPT_XATTR, NULL };
 static char *xattr_cancel[] = { MNTOPT_NOXATTR, NULL };
 
 /*
  * MO_DEFAULT is not used since the default value is determined
  * by the equivalent property.
  */
 static mntopt_t mntopts[] = {
 	{ MNTOPT_NOXATTR, noxattr_cancel, NULL, 0, NULL },
 	{ MNTOPT_XATTR, xattr_cancel, NULL, 0, NULL },
 	{ MNTOPT_NOATIME, noatime_cancel, NULL, 0, NULL },
 	{ MNTOPT_ATIME, atime_cancel, NULL, 0, NULL }
 };
 
 static mntopts_t zfs_mntopts = {
 	sizeof (mntopts) / sizeof (mntopt_t),
 	mntopts
 };
 
 /*ARGSUSED*/
 int
 zfs_sync(vfs_t *vfsp, short flag, cred_t *cr)
 {
 	/*
 	 * Data integrity is job one.  We don't want a compromised kernel
 	 * writing to the storage pool, so we never sync during panic.
 	 */
 	if (panicstr)
 		return (0);
 
 	/*
 	 * SYNC_ATTR is used by fsflush() to force old filesystems like UFS
 	 * to sync metadata, which they would otherwise cache indefinitely.
 	 * Semantically, the only requirement is that the sync be initiated.
 	 * The DMU syncs out txgs frequently, so there's nothing to do.
 	 */
 	if (flag & SYNC_ATTR)
 		return (0);
 
 	if (vfsp != NULL) {
 		/*
 		 * Sync a specific filesystem.
 		 */
 		zfsvfs_t *zfsvfs = vfsp->vfs_data;
 		dsl_pool_t *dp;
 
 		ZFS_ENTER(zfsvfs);
 		dp = dmu_objset_pool(zfsvfs->z_os);
 
 		/*
 		 * If the system is shutting down, then skip any
 		 * filesystems which may exist on a suspended pool.
 		 */
 		if (sys_shutdown && spa_suspended(dp->dp_spa)) {
 			ZFS_EXIT(zfsvfs);
 			return (0);
 		}
 
 		if (zfsvfs->z_log != NULL)
 			zil_commit(zfsvfs->z_log, 0);
 
 		ZFS_EXIT(zfsvfs);
 	} else {
 		/*
 		 * Sync all ZFS filesystems.  This is what happens when you
 		 * run sync(1M).  Unlike other filesystems, ZFS honors the
 		 * request by waiting for all pools to commit all dirty data.
 		 */
 		spa_sync_allpools();
 	}
 
 	return (0);
 }
 
 static int
 zfs_create_unique_device(dev_t *dev)
 {
 	major_t new_major;
 
 	do {
 		ASSERT3U(zfs_minor, <=, MAXMIN32);
 		minor_t start = zfs_minor;
 		do {
 			mutex_enter(&zfs_dev_mtx);
 			if (zfs_minor >= MAXMIN32) {
 				/*
 				 * If we're still using the real major
 				 * keep out of /dev/zfs and /dev/zvol minor
 				 * number space.  If we're using a getudev()'ed
 				 * major number, we can use all of its minors.
 				 */
 				if (zfs_major == ddi_name_to_major(ZFS_DRIVER))
 					zfs_minor = ZFS_MIN_MINOR;
 				else
 					zfs_minor = 0;
 			} else {
 				zfs_minor++;
 			}
 			*dev = makedevice(zfs_major, zfs_minor);
 			mutex_exit(&zfs_dev_mtx);
 		} while (vfs_devismounted(*dev) && zfs_minor != start);
 		if (zfs_minor == start) {
 			/*
 			 * We are using all ~262,000 minor numbers for the
 			 * current major number.  Create a new major number.
 			 */
 			if ((new_major = getudev()) == (major_t)-1) {
 				cmn_err(CE_WARN,
 				    "zfs_mount: Can't get unique major "
 				    "device number.");
 				return (-1);
 			}
 			mutex_enter(&zfs_dev_mtx);
 			zfs_major = new_major;
 			zfs_minor = 0;
 
 			mutex_exit(&zfs_dev_mtx);
 		} else {
 			break;
 		}
 		/* CONSTANTCONDITION */
 	} while (1);
 
 	return (0);
 }
 
 static void
 atime_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == TRUE) {
 		zfsvfs->z_atime = TRUE;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0);
 	} else {
 		zfsvfs->z_atime = FALSE;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0);
 	}
 }
 
 static void
 xattr_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == TRUE) {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag |= VFS_XATTR;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_XATTR, NULL, 0);
 	} else {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_XATTR;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_XATTR);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOXATTR, NULL, 0);
 	}
 }
 
 static void
 blksz_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os)));
 	ASSERT3U(newval, >=, SPA_MINBLOCKSIZE);
 	ASSERT(ISP2(newval));
 
 	zfsvfs->z_max_blksz = newval;
 	zfsvfs->z_vfs->vfs_bsize = newval;
 }
 
 static void
 readonly_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval) {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0);
 	} else {
 		/* XXX locking on vfs_flag? */
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0);
 	}
 }
 
 static void
 devices_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == FALSE) {
 		zfsvfs->z_vfs->vfs_flag |= VFS_NODEVICES;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES, NULL, 0);
 	} else {
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NODEVICES;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NODEVICES);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_DEVICES, NULL, 0);
 	}
 }
 
 static void
 setuid_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == FALSE) {
 		zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0);
 	} else {
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0);
 	}
 }
 
 static void
 exec_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	if (newval == FALSE) {
 		zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0);
 	} else {
 		zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC;
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0);
 	}
 }
 
 /*
  * The nbmand mount option can be changed at mount time.
  * We can't allow it to be toggled on live file systems or incorrect
  * behavior may be seen from cifs clients
  *
  * This property isn't registered via dsl_prop_register(), but this callback
  * will be called when a file system is first mounted
  */
 static void
 nbmand_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 	if (newval == FALSE) {
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0);
 	} else {
 		vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND);
 		vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0);
 	}
 }
 
 static void
 snapdir_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_show_ctldir = newval;
 }
 
 static void
 vscan_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_vscan = newval;
 }
 
 static void
 acl_mode_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_acl_mode = newval;
 }
 
 static void
 acl_inherit_changed_cb(void *arg, uint64_t newval)
 {
 	zfsvfs_t *zfsvfs = arg;
 
 	zfsvfs->z_acl_inherit = newval;
 }
 
 static int
 zfs_register_callbacks(vfs_t *vfsp)
 {
 	struct dsl_dataset *ds = NULL;
 	objset_t *os = NULL;
 	zfsvfs_t *zfsvfs = NULL;
 	uint64_t nbmand;
 	boolean_t readonly = B_FALSE;
 	boolean_t do_readonly = B_FALSE;
 	boolean_t setuid = B_FALSE;
 	boolean_t do_setuid = B_FALSE;
 	boolean_t exec = B_FALSE;
 	boolean_t do_exec = B_FALSE;
 	boolean_t devices = B_FALSE;
 	boolean_t do_devices = B_FALSE;
 	boolean_t xattr = B_FALSE;
 	boolean_t do_xattr = B_FALSE;
 	boolean_t atime = B_FALSE;
 	boolean_t do_atime = B_FALSE;
 	int error = 0;
 
 	ASSERT(vfsp);
 	zfsvfs = vfsp->vfs_data;
 	ASSERT(zfsvfs);
 	os = zfsvfs->z_os;
 
 	/*
 	 * The act of registering our callbacks will destroy any mount
 	 * options we may have.  In order to enable temporary overrides
 	 * of mount options, we stash away the current values and
 	 * restore them after we register the callbacks.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) ||
 	    !spa_writeable(dmu_objset_spa(os))) {
 		readonly = B_TRUE;
 		do_readonly = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) {
 		readonly = B_FALSE;
 		do_readonly = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOSUID, NULL)) {
 		devices = B_FALSE;
 		setuid = B_FALSE;
 		do_devices = B_TRUE;
 		do_setuid = B_TRUE;
 	} else {
 		if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) {
 			devices = B_FALSE;
 			do_devices = B_TRUE;
 		} else if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) {
 			devices = B_TRUE;
 			do_devices = B_TRUE;
 		}
 
 		if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) {
 			setuid = B_FALSE;
 			do_setuid = B_TRUE;
 		} else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) {
 			setuid = B_TRUE;
 			do_setuid = B_TRUE;
 		}
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) {
 		exec = B_FALSE;
 		do_exec = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) {
 		exec = B_TRUE;
 		do_exec = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) {
 		xattr = B_FALSE;
 		do_xattr = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) {
 		xattr = B_TRUE;
 		do_xattr = B_TRUE;
 	}
 	if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) {
 		atime = B_FALSE;
 		do_atime = B_TRUE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) {
 		atime = B_TRUE;
 		do_atime = B_TRUE;
 	}
 
 	/*
 	 * nbmand is a special property.  It can only be changed at
 	 * mount time.
 	 *
 	 * This is weird, but it is documented to only be changeable
 	 * at mount time.
 	 */
 	if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) {
 		nbmand = B_FALSE;
 	} else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) {
 		nbmand = B_TRUE;
 	} else {
 		char osname[ZFS_MAX_DATASET_NAME_LEN];
 
 		dmu_objset_name(os, osname);
 		if (error = dsl_prop_get_integer(osname, "nbmand", &nbmand,
 		    NULL)) {
 			return (error);
 		}
 	}
 
 	/*
 	 * Register property callbacks.
 	 *
 	 * It would probably be fine to just check for i/o error from
 	 * the first prop_register(), but I guess I like to go
 	 * overboard...
 	 */
 	ds = dmu_objset_ds(os);
 	dsl_pool_config_enter(dmu_objset_pool(os), FTAG);
 	error = dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb,
 	    zfsvfs);
 	error = error ? error : dsl_prop_register(ds,
 	    zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs);
 	dsl_pool_config_exit(dmu_objset_pool(os), FTAG);
 	if (error)
 		goto unregister;
 
 	/*
 	 * Invoke our callbacks to restore temporary mount options.
 	 */
 	if (do_readonly)
 		readonly_changed_cb(zfsvfs, readonly);
 	if (do_setuid)
 		setuid_changed_cb(zfsvfs, setuid);
 	if (do_exec)
 		exec_changed_cb(zfsvfs, exec);
 	if (do_devices)
 		devices_changed_cb(zfsvfs, devices);
 	if (do_xattr)
 		xattr_changed_cb(zfsvfs, xattr);
 	if (do_atime)
 		atime_changed_cb(zfsvfs, atime);
 
 	nbmand_changed_cb(zfsvfs, nbmand);
 
 	return (0);
 
 unregister:
 	dsl_prop_unregister_all(ds, zfsvfs);
 	return (error);
 }
 
 static int
 zfs_space_delta_cb(dmu_object_type_t bonustype, void *data,
     uint64_t *userp, uint64_t *groupp)
 {
 	/*
 	 * Is it a valid type of object to track?
 	 */
 	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
 		return (SET_ERROR(ENOENT));
 
 	/*
 	 * If we have a NULL data pointer
 	 * then assume the id's aren't changing and
 	 * return EEXIST to the dmu to let it know to
 	 * use the same ids
 	 */
 	if (data == NULL)
 		return (SET_ERROR(EEXIST));
 
 	if (bonustype == DMU_OT_ZNODE) {
 		znode_phys_t *znp = data;
 		*userp = znp->zp_uid;
 		*groupp = znp->zp_gid;
 	} else {
 		int hdrsize;
 		sa_hdr_phys_t *sap = data;
 		sa_hdr_phys_t sa = *sap;
 		boolean_t swap = B_FALSE;
 
 		ASSERT(bonustype == DMU_OT_SA);
 
 		if (sa.sa_magic == 0) {
 			/*
 			 * This should only happen for newly created
 			 * files that haven't had the znode data filled
 			 * in yet.
 			 */
 			*userp = 0;
 			*groupp = 0;
 			return (0);
 		}
 		if (sa.sa_magic == BSWAP_32(SA_MAGIC)) {
 			sa.sa_magic = SA_MAGIC;
 			sa.sa_layout_info = BSWAP_16(sa.sa_layout_info);
 			swap = B_TRUE;
 		} else {
 			VERIFY3U(sa.sa_magic, ==, SA_MAGIC);
 		}
 
 		hdrsize = sa_hdrsize(&sa);
 		VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t));
 		*userp = *((uint64_t *)((uintptr_t)data + hdrsize +
 		    SA_UID_OFFSET));
 		*groupp = *((uint64_t *)((uintptr_t)data + hdrsize +
 		    SA_GID_OFFSET));
 		if (swap) {
 			*userp = BSWAP_64(*userp);
 			*groupp = BSWAP_64(*groupp);
 		}
 	}
 	return (0);
 }
 
 static void
 fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr,
     char *domainbuf, int buflen, uid_t *ridp)
 {
 	uint64_t fuid;
 	const char *domain;
 
 	fuid = zfs_strtonum(fuidstr, NULL);
 
 	domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid));
 	if (domain)
 		(void) strlcpy(domainbuf, domain, buflen);
 	else
 		domainbuf[0] = '\0';
 	*ridp = FUID_RID(fuid);
 }
 
 static uint64_t
 zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type)
 {
 	switch (type) {
 	case ZFS_PROP_USERUSED:
 		return (DMU_USERUSED_OBJECT);
 	case ZFS_PROP_GROUPUSED:
 		return (DMU_GROUPUSED_OBJECT);
 	case ZFS_PROP_USERQUOTA:
 		return (zfsvfs->z_userquota_obj);
 	case ZFS_PROP_GROUPQUOTA:
 		return (zfsvfs->z_groupquota_obj);
 	}
 	return (0);
 }
 
 int
 zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     uint64_t *cookiep, void *vbuf, uint64_t *bufsizep)
 {
 	int error;
 	zap_cursor_t zc;
 	zap_attribute_t za;
 	zfs_useracct_t *buf = vbuf;
 	uint64_t obj;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 		return (SET_ERROR(ENOTSUP));
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0) {
 		*bufsizep = 0;
 		return (0);
 	}
 
 	for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep);
 	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
 	    zap_cursor_advance(&zc)) {
 		if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) >
 		    *bufsizep)
 			break;
 
 		fuidstr_to_sid(zfsvfs, za.za_name,
 		    buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid);
 
 		buf->zu_space = za.za_first_integer;
 		buf++;
 	}
 	if (error == ENOENT)
 		error = 0;
 
 	ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep);
 	*bufsizep = (uintptr_t)buf - (uintptr_t)vbuf;
 	*cookiep = zap_cursor_serialize(&zc);
 	zap_cursor_fini(&zc);
 	return (error);
 }
 
 /*
  * buf must be big enough (eg, 32 bytes)
  */
 static int
 id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid,
     char *buf, boolean_t addok)
 {
 	uint64_t fuid;
 	int domainid = 0;
 
 	if (domain && domain[0]) {
 		domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok);
 		if (domainid == -1)
 			return (SET_ERROR(ENOENT));
 	}
 	fuid = FUID_ENCODE(domainid, rid);
 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 	return (0);
 }
 
 int
 zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t *valp)
 {
 	char buf[32];
 	int err;
 	uint64_t obj;
 
 	*valp = 0;
 
 	if (!dmu_objset_userspace_present(zfsvfs->z_os))
 		return (SET_ERROR(ENOTSUP));
 
 	obj = zfs_userquota_prop_to_obj(zfsvfs, type);
 	if (obj == 0)
 		return (0);
 
 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE);
 	if (err)
 		return (err);
 
 	err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp);
 	if (err == ENOENT)
 		err = 0;
 	return (err);
 }
 
 int
 zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
     const char *domain, uint64_t rid, uint64_t quota)
 {
 	char buf[32];
 	int err;
 	dmu_tx_t *tx;
 	uint64_t *objp;
 	boolean_t fuid_dirtied;
 
 	if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA)
 		return (SET_ERROR(EINVAL));
 
 	if (zfsvfs->z_version < ZPL_VERSION_USERSPACE)
 		return (SET_ERROR(ENOTSUP));
 
 	objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj :
 	    &zfsvfs->z_groupquota_obj;
 
 	err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE);
 	if (err)
 		return (err);
 	fuid_dirtied = zfsvfs->z_fuid_dirty;
 
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL);
 	if (*objp == 0) {
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 		    zfs_userquota_prop_prefixes[type]);
 	}
 	if (fuid_dirtied)
 		zfs_fuid_txhold(zfsvfs, tx);
 	err = dmu_tx_assign(tx, TXG_WAIT);
 	if (err) {
 		dmu_tx_abort(tx);
 		return (err);
 	}
 
 	mutex_enter(&zfsvfs->z_lock);
 	if (*objp == 0) {
 		*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
 		    DMU_OT_NONE, 0, tx);
 		VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
 		    zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
 	}
 	mutex_exit(&zfsvfs->z_lock);
 
 	if (quota == 0) {
 		err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
 		if (err == ENOENT)
 			err = 0;
 	} else {
 		err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
 	}
 	ASSERT(err == 0);
 	if (fuid_dirtied)
 		zfs_fuid_sync(zfsvfs, tx);
 	dmu_tx_commit(tx);
 	return (err);
 }
 
 boolean_t
 zfs_fuid_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid)
 {
 	char buf[32];
 	uint64_t used, quota, usedobj, quotaobj;
 	int err;
 
 	usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT;
 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
 	(void) sprintf(buf, "%llx", (longlong_t)fuid);
 	err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota);
 	if (err != 0)
 		return (B_FALSE);
 
 	err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used);
 	if (err != 0)
 		return (B_FALSE);
 	return (used >= quota);
 }
 
 boolean_t
 zfs_owner_overquota(zfsvfs_t *zfsvfs, znode_t *zp, boolean_t isgroup)
 {
 	uint64_t fuid;
 	uint64_t quotaobj;
 
 	quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj;
 
 	fuid = isgroup ? zp->z_gid : zp->z_uid;
 
 	if (quotaobj == 0 || zfsvfs->z_replay)
 		return (B_FALSE);
 
 	return (zfs_fuid_overquota(zfsvfs, isgroup, fuid));
 }
 
 /*
  * Associate this zfsvfs with the given objset, which must be owned.
  * This will cache a bunch of on-disk state from the objset in the
  * zfsvfs.
  */
 static int
 zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
 {
 	int error;
 	uint64_t val;
 
 	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
 	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
 	zfsvfs->z_os = os;
 
 	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
 	if (error != 0)
 		return (error);
 	if (zfsvfs->z_version >
 	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
 		(void) printf("Can't mount a version %lld file system "
 		    "on a version %lld pool\n. Pool must be upgraded to mount "
 		    "this file system.", (u_longlong_t)zfsvfs->z_version,
 		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
 		return (SET_ERROR(ENOTSUP));
 	}
 	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
 	if (error != 0)
 		return (error);
 	zfsvfs->z_norm = (int)val;
 
 	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
 	if (error != 0)
 		return (error);
 	zfsvfs->z_utf8 = (val != 0);
 
 	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
 	if (error != 0)
 		return (error);
 	zfsvfs->z_case = (uint_t)val;
 
 	/*
 	 * Fold case on file systems that are always or sometimes case
 	 * insensitive.
 	 */
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
 	    zfsvfs->z_case == ZFS_CASE_MIXED)
 		zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER;
 
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 
 	uint64_t sa_obj = 0;
 	if (zfsvfs->z_use_sa) {
 		/* should either have both of these objects or none */
 		error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1,
 		    &sa_obj);
 		if (error != 0)
 			return (error);
 	}
 
 	error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END,
 	    &zfsvfs->z_attr_table);
 	if (error != 0)
 		return (error);
 
 	if (zfsvfs->z_version >= ZPL_VERSION_SA)
 		sa_register_update_callback(os, zfs_sa_upgrade);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1,
 	    &zfsvfs->z_root);
 	if (error != 0)
 		return (error);
 	ASSERT(zfsvfs->z_root != 0);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1,
 	    &zfsvfs->z_unlinkedobj);
 	if (error != 0)
 		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA],
 	    8, 1, &zfsvfs->z_userquota_obj);
 	if (error == ENOENT)
 		zfsvfs->z_userquota_obj = 0;
 	else if (error != 0)
 		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ,
 	    zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA],
 	    8, 1, &zfsvfs->z_groupquota_obj);
 	if (error == ENOENT)
 		zfsvfs->z_groupquota_obj = 0;
 	else if (error != 0)
 		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1,
 	    &zfsvfs->z_fuid_obj);
 	if (error == ENOENT)
 		zfsvfs->z_fuid_obj = 0;
 	else if (error != 0)
 		return (error);
 
 	error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1,
 	    &zfsvfs->z_shares_dir);
 	if (error == ENOENT)
 		zfsvfs->z_shares_dir = 0;
 	else if (error != 0)
 		return (error);
 
 	return (0);
 }
 
 int
 zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
 {
 	objset_t *os;
 	zfsvfs_t *zfsvfs;
 	int error;
 
 	zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP);
 
 	/*
 	 * We claim to always be readonly so we can open snapshots;
 	 * other ZPL code will prevent us from writing to snapshots.
 	 */
 
 	error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os);
 	if (error != 0) {
 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 		return (error);
 	}
 
 	error = zfsvfs_create_impl(zfvp, zfsvfs, os);
 	if (error != 0) {
 		dmu_objset_disown(os, zfsvfs);
 	}
 	return (error);
 }
 
 
 int
 zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os)
 {
 	int error;
 
 	zfsvfs->z_vfs = NULL;
 	zfsvfs->z_parent = zfsvfs;
 
 	mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL);
 	mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
 	list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
 	    offsetof(znode_t, z_link_node));
 	rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
 	rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
 	rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
 	for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL);
 
 	error = zfsvfs_init(zfsvfs, os);
 	if (error != 0) {
 		*zfvp = NULL;
 		kmem_free(zfsvfs, sizeof (zfsvfs_t));
 		return (error);
 	}
 
 	*zfvp = zfsvfs;
 	return (0);
 }
 
 static int
 zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting)
 {
 	int error;
 
 	error = zfs_register_callbacks(zfsvfs->z_vfs);
 	if (error)
 		return (error);
 
 	zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data);
 
 	/*
 	 * If we are not mounting (ie: online recv), then we don't
 	 * have to worry about replaying the log as we blocked all
 	 * operations out since we closed the ZIL.
 	 */
 	if (mounting) {
 		boolean_t readonly;
 
 		/*
 		 * During replay we remove the read only flag to
 		 * allow replays to succeed.
 		 */
 		readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY;
 		if (readonly != 0)
 			zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY;
 		else
 			zfs_unlinked_drain(zfsvfs);
 
 		/*
 		 * Parse and replay the intent log.
 		 *
 		 * Because of ziltest, this must be done after
 		 * zfs_unlinked_drain().  (Further note: ziltest
 		 * doesn't use readonly mounts, where
 		 * zfs_unlinked_drain() isn't called.)  This is because
 		 * ziltest causes spa_sync() to think it's committed,
 		 * but actually it is not, so the intent log contains
 		 * many txg's worth of changes.
 		 *
 		 * In particular, if object N is in the unlinked set in
 		 * the last txg to actually sync, then it could be
 		 * actually freed in a later txg and then reallocated
 		 * in a yet later txg.  This would write a "create
 		 * object N" record to the intent log.  Normally, this
 		 * would be fine because the spa_sync() would have
 		 * written out the fact that object N is free, before
 		 * we could write the "create object N" intent log
 		 * record.
 		 *
 		 * But when we are in ziltest mode, we advance the "open
 		 * txg" without actually spa_sync()-ing the changes to
 		 * disk.  So we would see that object N is still
 		 * allocated and in the unlinked set, and there is an
 		 * intent log record saying to allocate it.
 		 */
 		if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
 			if (zil_replay_disable) {
 				zil_destroy(zfsvfs->z_log, B_FALSE);
 			} else {
 				zfsvfs->z_replay = B_TRUE;
 				zil_replay(zfsvfs->z_os, zfsvfs,
 				    zfs_replay_vector);
 				zfsvfs->z_replay = B_FALSE;
 			}
 		}
 		zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */
 	}
 
 	/*
 	 * Set the objset user_ptr to track its zfsvfs.
 	 */
 	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 
 	return (0);
 }
 
 void
 zfsvfs_free(zfsvfs_t *zfsvfs)
 {
 	int i;
 	extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */
 
 	/*
 	 * This is a barrier to prevent the filesystem from going away in
 	 * zfs_znode_move() until we can safely ensure that the filesystem is
 	 * not unmounted. We consider the filesystem valid before the barrier
 	 * and invalid after the barrier.
 	 */
 	rw_enter(&zfsvfs_lock, RW_READER);
 	rw_exit(&zfsvfs_lock);
 
 	zfs_fuid_destroy(zfsvfs);
 
 	mutex_destroy(&zfsvfs->z_znodes_lock);
 	mutex_destroy(&zfsvfs->z_lock);
 	list_destroy(&zfsvfs->z_all_znodes);
 	rrm_destroy(&zfsvfs->z_teardown_lock);
 	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
 	rw_destroy(&zfsvfs->z_fuid_lock);
 	for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
 		mutex_destroy(&zfsvfs->z_hold_mtx[i]);
 	kmem_free(zfsvfs, sizeof (zfsvfs_t));
 }
 
 static void
 zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
 {
 	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
 	if (zfsvfs->z_vfs) {
 		if (zfsvfs->z_use_fuids) {
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
 			vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
 		} else {
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR);
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS);
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS);
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE);
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER);
 			vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE);
 		}
 	}
 	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
 }
 
 static int
 zfs_domount(vfs_t *vfsp, char *osname)
 {
 	dev_t mount_dev;
 	uint64_t recordsize, fsid_guid;
 	int error = 0;
 	zfsvfs_t *zfsvfs;
 
 	ASSERT(vfsp);
 	ASSERT(osname);
 
 	error = zfsvfs_create(osname, &zfsvfs);
 	if (error)
 		return (error);
 	zfsvfs->z_vfs = vfsp;
 
 	/* Initialize the generic filesystem structure. */
 	vfsp->vfs_bcount = 0;
 	vfsp->vfs_data = NULL;
 
 	if (zfs_create_unique_device(&mount_dev) == -1) {
 		error = SET_ERROR(ENODEV);
 		goto out;
 	}
 	ASSERT(vfs_devismounted(mount_dev) == 0);
 
 	if (error = dsl_prop_get_integer(osname, "recordsize", &recordsize,
 	    NULL))
 		goto out;
 
 	vfsp->vfs_dev = mount_dev;
 	vfsp->vfs_fstype = zfsfstype;
 	vfsp->vfs_bsize = recordsize;
 	vfsp->vfs_flag |= VFS_NOTRUNC;
 	vfsp->vfs_data = zfsvfs;
 
 	/*
 	 * The fsid is 64 bits, composed of an 8-bit fs type, which
 	 * separates our fsid from any other filesystem types, and a
 	 * 56-bit objset unique ID.  The objset unique ID is unique to
 	 * all objsets open on this system, provided by unique_create().
 	 * The 8-bit fs type must be put in the low bits of fsid[1]
 	 * because that's where other Solaris filesystems put it.
 	 */
 	fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os);
 	ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0);
 	vfsp->vfs_fsid.val[0] = fsid_guid;
 	vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) |
 	    zfsfstype & 0xFF;
 
 	/*
 	 * Set features for file system.
 	 */
 	zfs_set_fuid_feature(zfsvfs);
 	if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 		vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE);
 	} else if (zfsvfs->z_case == ZFS_CASE_MIXED) {
 		vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS);
 		vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE);
 	}
 	vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED);
 
 	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
 		uint64_t pval;
 
 		atime_changed_cb(zfsvfs, B_FALSE);
 		readonly_changed_cb(zfsvfs, B_TRUE);
 		if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))
 			goto out;
 		xattr_changed_cb(zfsvfs, pval);
 		zfsvfs->z_issnap = B_TRUE;
 		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
 
 		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
 		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
 		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
 	} else {
 		error = zfsvfs_setup(zfsvfs, B_TRUE);
 	}
 
 	if (!zfsvfs->z_issnap)
 		zfsctl_create(zfsvfs);
 out:
 	if (error) {
 		dmu_objset_disown(zfsvfs->z_os, zfsvfs);
 		zfsvfs_free(zfsvfs);
 	} else {
 		atomic_inc_32(&zfs_active_fs_count);
 	}
 
 	return (error);
 }
 
 void
 zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
 {
 	objset_t *os = zfsvfs->z_os;
 
 	if (!dmu_objset_is_snapshot(os))
 		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
 }
 
 /*
  * Convert a decimal digit string to a uint64_t integer.
  */
 static int
 str_to_uint64(char *str, uint64_t *objnum)
 {
 	uint64_t num = 0;
 
 	while (*str) {
 		if (*str < '0' || *str > '9')
 			return (SET_ERROR(EINVAL));
 
 		num = num*10 + *str++ - '0';
 	}
 
 	*objnum = num;
 	return (0);
 }
 
 /*
  * The boot path passed from the boot loader is in the form of
  * "rootpool-name/root-filesystem-object-number'. Convert this
  * string to a dataset name: "rootpool-name/root-filesystem-name".
  */
 static int
 zfs_parse_bootfs(char *bpath, char *outpath)
 {
 	char *slashp;
 	uint64_t objnum;
 	int error;
 
 	if (*bpath == 0 || *bpath == '/')
 		return (SET_ERROR(EINVAL));
 
 	(void) strcpy(outpath, bpath);
 
 	slashp = strchr(bpath, '/');
 
 	/* if no '/', just return the pool name */
 	if (slashp == NULL) {
 		return (0);
 	}
 
 	/* if not a number, just return the root dataset name */
 	if (str_to_uint64(slashp+1, &objnum)) {
 		return (0);
 	}
 
 	*slashp = '\0';
 	error = dsl_dsobj_to_dsname(bpath, objnum, outpath);
 	*slashp = '/';
 
 	return (error);
 }
 
 /*
  * Check that the hex label string is appropriate for the dataset being
  * mounted into the global_zone proper.
  *
  * Return an error if the hex label string is not default or
  * admin_low/admin_high.  For admin_low labels, the corresponding
  * dataset must be readonly.
  */
 int
 zfs_check_global_label(const char *dsname, const char *hexsl)
 {
 	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
 		return (0);
 	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
 		return (0);
 	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
 		/* must be readonly */
 		uint64_t rdonly;
 
 		if (dsl_prop_get_integer(dsname,
 		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
 			return (SET_ERROR(EACCES));
 		return (rdonly ? 0 : EACCES);
 	}
 	return (SET_ERROR(EACCES));
 }
 
 /*
  * Determine whether the mount is allowed according to MAC check.
  * by comparing (where appropriate) label of the dataset against
  * the label of the zone being mounted into.  If the dataset has
  * no label, create one.
  *
  * Returns 0 if access allowed, error otherwise (e.g. EACCES)
  */
 static int
 zfs_mount_label_policy(vfs_t *vfsp, char *osname)
 {
 	int		error, retv;
 	zone_t		*mntzone = NULL;
 	ts_label_t	*mnt_tsl;
 	bslabel_t	*mnt_sl;
 	bslabel_t	ds_sl;
 	char		ds_hexsl[MAXNAMELEN];
 
 	retv = EACCES;				/* assume the worst */
 
 	/*
 	 * Start by getting the dataset label if it exists.
 	 */
 	error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 	    1, sizeof (ds_hexsl), &ds_hexsl, NULL);
 	if (error)
 		return (SET_ERROR(EACCES));
 
 	/*
 	 * If labeling is NOT enabled, then disallow the mount of datasets
 	 * which have a non-default label already.  No other label checks
 	 * are needed.
 	 */
 	if (!is_system_labeled()) {
 		if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
 			return (0);
 		return (SET_ERROR(EACCES));
 	}
 
 	/*
 	 * Get the label of the mountpoint.  If mounting into the global
 	 * zone (i.e. mountpoint is not within an active zone and the
 	 * zoned property is off), the label must be default or
 	 * admin_low/admin_high only; no other checks are needed.
 	 */
 	mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE);
 	if (mntzone->zone_id == GLOBAL_ZONEID) {
 		uint64_t zoned;
 
 		zone_rele(mntzone);
 
 		if (dsl_prop_get_integer(osname,
 		    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
 			return (SET_ERROR(EACCES));
 		if (!zoned)
 			return (zfs_check_global_label(osname, ds_hexsl));
 		else
 			/*
 			 * This is the case of a zone dataset being mounted
 			 * initially, before the zone has been fully created;
 			 * allow this mount into global zone.
 			 */
 			return (0);
 	}
 
 	mnt_tsl = mntzone->zone_slabel;
 	ASSERT(mnt_tsl != NULL);
 	label_hold(mnt_tsl);
 	mnt_sl = label2bslabel(mnt_tsl);
 
 	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) {
 		/*
 		 * The dataset doesn't have a real label, so fabricate one.
 		 */
 		char *str = NULL;
 
 		if (l_to_str_internal(mnt_sl, &str) == 0 &&
 		    dsl_prop_set_string(osname,
 		    zfs_prop_to_name(ZFS_PROP_MLSLABEL),
 		    ZPROP_SRC_LOCAL, str) == 0)
 			retv = 0;
 		if (str != NULL)
 			kmem_free(str, strlen(str) + 1);
 	} else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) {
 		/*
 		 * Now compare labels to complete the MAC check.  If the
 		 * labels are equal then allow access.  If the mountpoint
 		 * label dominates the dataset label, allow readonly access.
 		 * Otherwise, access is denied.
 		 */
 		if (blequal(mnt_sl, &ds_sl))
 			retv = 0;
 		else if (bldominates(mnt_sl, &ds_sl)) {
 			vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0);
 			retv = 0;
 		}
 	}
 
 	label_rele(mnt_tsl);
 	zone_rele(mntzone);
 	return (retv);
 }
 
 static int
 zfs_mountroot(vfs_t *vfsp, enum whymountroot why)
 {
 	int error = 0;
 	static int zfsrootdone = 0;
 	zfsvfs_t *zfsvfs = NULL;
 	znode_t *zp = NULL;
 	vnode_t *vp = NULL;
 	char *zfs_bootfs;
 	char *zfs_devid;
 
 	ASSERT(vfsp);
 
 	/*
 	 * The filesystem that we mount as root is defined in the
 	 * boot property "zfs-bootfs" with a format of
 	 * "poolname/root-dataset-objnum".
 	 */
 	if (why == ROOT_INIT) {
 		if (zfsrootdone++)
 			return (SET_ERROR(EBUSY));
 		/*
 		 * the process of doing a spa_load will require the
 		 * clock to be set before we could (for example) do
 		 * something better by looking at the timestamp on
 		 * an uberblock, so just set it to -1.
 		 */
 		clkset(-1);
 
 		if ((zfs_bootfs = spa_get_bootprop("zfs-bootfs")) == NULL) {
 			cmn_err(CE_NOTE, "spa_get_bootfs: can not get "
 			    "bootfs name");
 			return (SET_ERROR(EINVAL));
 		}
 		zfs_devid = spa_get_bootprop("diskdevid");
 		error = spa_import_rootpool(rootfs.bo_name, zfs_devid);
 		if (zfs_devid)
 			spa_free_bootprop(zfs_devid);
 		if (error) {
 			spa_free_bootprop(zfs_bootfs);
 			cmn_err(CE_NOTE, "spa_import_rootpool: error %d",
 			    error);
 			return (error);
 		}
 		if (error = zfs_parse_bootfs(zfs_bootfs, rootfs.bo_name)) {
 			spa_free_bootprop(zfs_bootfs);
 			cmn_err(CE_NOTE, "zfs_parse_bootfs: error %d",
 			    error);
 			return (error);
 		}
 
 		spa_free_bootprop(zfs_bootfs);
 
 		if (error = vfs_lock(vfsp))
 			return (error);
 
 		if (error = zfs_domount(vfsp, rootfs.bo_name)) {
 			cmn_err(CE_NOTE, "zfs_domount: error %d", error);
 			goto out;
 		}
 
 		zfsvfs = (zfsvfs_t *)vfsp->vfs_data;
 		ASSERT(zfsvfs);
 		if (error = zfs_zget(zfsvfs, zfsvfs->z_root, &zp)) {
 			cmn_err(CE_NOTE, "zfs_zget: error %d", error);
 			goto out;
 		}
 
 		vp = ZTOV(zp);
 		mutex_enter(&vp->v_lock);
 		vp->v_flag |= VROOT;
 		mutex_exit(&vp->v_lock);
 		rootvp = vp;
 
 		/*
 		 * Leave rootvp held.  The root file system is never unmounted.
 		 */
 
 		vfs_add((struct vnode *)0, vfsp,
 		    (vfsp->vfs_flag & VFS_RDONLY) ? MS_RDONLY : 0);
 out:
 		vfs_unlock(vfsp);
 		return (error);
 	} else if (why == ROOT_REMOUNT) {
 		readonly_changed_cb(vfsp->vfs_data, B_FALSE);
 		vfsp->vfs_flag |= VFS_REMOUNT;
 
 		/* refresh mount options */
 		zfs_unregister_callbacks(vfsp->vfs_data);
 		return (zfs_register_callbacks(vfsp));
 
 	} else if (why == ROOT_UNMOUNT) {
 		zfs_unregister_callbacks((zfsvfs_t *)vfsp->vfs_data);
 		(void) zfs_sync(vfsp, 0, 0);
 		return (0);
 	}
 
 	/*
 	 * if "why" is equal to anything else other than ROOT_INIT,
 	 * ROOT_REMOUNT, or ROOT_UNMOUNT, we do not support it.
 	 */
 	return (SET_ERROR(ENOTSUP));
 }
 
 /*ARGSUSED*/
 static int
 zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr)
 {
 	char		*osname;
 	pathname_t	spn;
 	int		error = 0;
 	uio_seg_t	fromspace = (uap->flags & MS_SYSSPACE) ?
 	    UIO_SYSSPACE : UIO_USERSPACE;
 	int		canwrite;
 
 	if (mvp->v_type != VDIR)
 		return (SET_ERROR(ENOTDIR));
 
 	mutex_enter(&mvp->v_lock);
 	if ((uap->flags & MS_REMOUNT) == 0 &&
 	    (uap->flags & MS_OVERLAY) == 0 &&
 	    (mvp->v_count != 1 || (mvp->v_flag & VROOT))) {
 		mutex_exit(&mvp->v_lock);
 		return (SET_ERROR(EBUSY));
 	}
 	mutex_exit(&mvp->v_lock);
 
 	/*
 	 * ZFS does not support passing unparsed data in via MS_DATA.
 	 * Users should use the MS_OPTIONSTR interface; this means
 	 * that all option parsing is already done and the options struct
 	 * can be interrogated.
 	 */
 	if ((uap->flags & MS_DATA) && uap->datalen > 0)
 		return (SET_ERROR(EINVAL));
 
 	/*
 	 * Get the objset name (the "special" mount argument).
 	 */
 	if (error = pn_get(uap->spec, fromspace, &spn))
 		return (error);
 
 	osname = spn.pn_path;
 
 	/*
 	 * Check for mount privilege?
 	 *
 	 * If we don't have privilege then see if
 	 * we have local permission to allow it
 	 */
 	error = secpolicy_fs_mount(cr, mvp, vfsp);
 	if (error) {
 		if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) == 0) {
 			vattr_t		vattr;
 
 			/*
 			 * Make sure user is the owner of the mount point
 			 * or has sufficient privileges.
 			 */
 
 			vattr.va_mask = AT_UID;
 
 			if (VOP_GETATTR(mvp, &vattr, 0, cr, NULL)) {
 				goto out;
 			}
 
 			if (secpolicy_vnode_owner(cr, vattr.va_uid) != 0 &&
 			    VOP_ACCESS(mvp, VWRITE, 0, cr, NULL) != 0) {
 				goto out;
 			}
 			secpolicy_fs_mount_clearopts(cr, vfsp);
 		} else {
 			goto out;
 		}
 	}
 
 	/*
 	 * Refuse to mount a filesystem if we are in a local zone and the
 	 * dataset is not visible.
 	 */
 	if (!INGLOBALZONE(curproc) &&
 	    (!zone_dataset_visible(osname, &canwrite) || !canwrite)) {
 		error = SET_ERROR(EPERM);
 		goto out;
 	}
 
 	error = zfs_mount_label_policy(vfsp, osname);
 	if (error)
 		goto out;
 
 	/*
 	 * When doing a remount, we simply refresh our temporary properties
 	 * according to those options set in the current VFS options.
 	 */
 	if (uap->flags & MS_REMOUNT) {
 		/* refresh mount options */
 		zfs_unregister_callbacks(vfsp->vfs_data);
 		error = zfs_register_callbacks(vfsp);
 		goto out;
 	}
 
 	error = zfs_domount(vfsp, osname);
 
 	/*
 	 * Add an extra VFS_HOLD on our parent vfs so that it can't
 	 * disappear due to a forced unmount.
 	 */
 	if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap)
 		VFS_HOLD(mvp->v_vfsp);
 
 out:
 	pn_free(&spn);
 	return (error);
 }
 
 static int
 zfs_statvfs(vfs_t *vfsp, struct statvfs64 *statp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	dev32_t d32;
 	uint64_t refdbytes, availbytes, usedobjs, availobjs;
 
 	ZFS_ENTER(zfsvfs);
 
 	dmu_objset_space(zfsvfs->z_os,
 	    &refdbytes, &availbytes, &usedobjs, &availobjs);
 
 	/*
 	 * The underlying storage pool actually uses multiple block sizes.
 	 * We report the fragsize as the smallest block size we support,
 	 * and we report our blocksize as the filesystem's maximum blocksize.
 	 */
 	statp->f_frsize = 1UL << SPA_MINBLOCKSHIFT;
 	statp->f_bsize = zfsvfs->z_max_blksz;
 
 	/*
 	 * The following report "total" blocks of various kinds in the
 	 * file system, but reported in terms of f_frsize - the
 	 * "fragment" size.
 	 */
 
 	statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT;
 	statp->f_bfree = availbytes >> SPA_MINBLOCKSHIFT;
 	statp->f_bavail = statp->f_bfree; /* no root reservation */
 
 	/*
 	 * statvfs() should really be called statufs(), because it assumes
 	 * static metadata.  ZFS doesn't preallocate files, so the best
 	 * we can do is report the max that could possibly fit in f_files,
 	 * and that minus the number actually used in f_ffree.
 	 * For f_ffree, report the smaller of the number of object available
 	 * and the number of blocks (each object will take at least a block).
 	 */
 	statp->f_ffree = MIN(availobjs, statp->f_bfree);
 	statp->f_favail = statp->f_ffree;	/* no "root reservation" */
 	statp->f_files = statp->f_ffree + usedobjs;
 
 	(void) cmpldev(&d32, vfsp->vfs_dev);
 	statp->f_fsid = d32;
 
 	/*
 	 * We're a zfs filesystem.
 	 */
 	(void) strcpy(statp->f_basetype, vfssw[vfsp->vfs_fstype].vsw_name);
 
 	statp->f_flag = vf_to_stf(vfsp->vfs_flag);
 
 	statp->f_namemax = MAXNAMELEN - 1;
 
 	/*
 	 * We have all of 32 characters to stuff a string here.
 	 * Is there anything useful we could/should provide?
 	 */
 	bzero(statp->f_fstr, sizeof (statp->f_fstr));
 
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 static int
 zfs_root(vfs_t *vfsp, vnode_t **vpp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	znode_t *rootzp;
 	int error;
 
 	ZFS_ENTER(zfsvfs);
 
 	error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp);
 	if (error == 0)
 		*vpp = ZTOV(rootzp);
 
 	ZFS_EXIT(zfsvfs);
 	return (error);
 }
 
 /*
  * Teardown the zfsvfs::z_os.
  *
  * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
  * and 'z_teardown_inactive_lock' held.
  */
 static int
 zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
 {
 	znode_t	*zp;
 
 	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
 
 	if (!unmounting) {
 		/*
 		 * We purge the parent filesystem's vfsp as the parent
 		 * filesystem and all of its snapshots have their vnode's
 		 * v_vfsp set to the parent's filesystem's vfsp.  Note,
 		 * 'z_parent' is self referential for non-snapshots.
 		 */
 		(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 	}
 
 	/*
 	 * Close the zil. NB: Can't close the zil while zfs_inactive
 	 * threads are blocked as zil_close can call zfs_inactive.
 	 */
 	if (zfsvfs->z_log) {
 		zil_close(zfsvfs->z_log);
 		zfsvfs->z_log = NULL;
 	}
 
 	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
 
 	/*
 	 * If we are not unmounting (ie: online recv) and someone already
 	 * unmounted this file system while we were doing the switcheroo,
 	 * or a reopen of z_os failed then just bail out now.
 	 */
 	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 		return (SET_ERROR(EIO));
 	}
 
 	/*
 	 * At this point there are no vops active, and any new vops will
 	 * fail with EIO since we have z_teardown_lock for writer (only
 	 * relavent for forced unmount).
 	 *
 	 * Release all holds on dbufs.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
 	    zp = list_next(&zfsvfs->z_all_znodes, zp))
 		if (zp->z_sa_hdl) {
 			ASSERT(ZTOV(zp)->v_count > 0);
 			zfs_znode_dmu_fini(zp);
 		}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 	/*
 	 * If we are unmounting, set the unmounted flag and let new vops
 	 * unblock.  zfs_inactive will have the unmounted behavior, and all
 	 * other vops will fail with EIO.
 	 */
 	if (unmounting) {
 		zfsvfs->z_unmounted = B_TRUE;
 		rw_exit(&zfsvfs->z_teardown_inactive_lock);
 		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 	}
 
 	/*
 	 * z_os will be NULL if there was an error in attempting to reopen
 	 * zfsvfs, so just return as the properties had already been
 	 * unregistered and cached data had been evicted before.
 	 */
 	if (zfsvfs->z_os == NULL)
 		return (0);
 
 	/*
 	 * Unregister properties.
 	 */
 	zfs_unregister_callbacks(zfsvfs);
 
 	/*
 	 * Evict cached data
 	 */
 	if (dsl_dataset_is_dirty(dmu_objset_ds(zfsvfs->z_os)) &&
 	    !(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY))
 		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
 	dmu_objset_evict_dbufs(zfsvfs->z_os);
 
 	return (0);
 }
 
 /*ARGSUSED*/
 static int
 zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 	objset_t *os;
 	int ret;
 
 	ret = secpolicy_fs_unmount(cr, vfsp);
 	if (ret) {
 		if (dsl_deleg_access((char *)refstr_value(vfsp->vfs_resource),
 		    ZFS_DELEG_PERM_MOUNT, cr))
 			return (ret);
 	}
 
 	/*
 	 * We purge the parent filesystem's vfsp as the parent filesystem
 	 * and all of its snapshots have their vnode's v_vfsp set to the
 	 * parent's filesystem's vfsp.  Note, 'z_parent' is self
 	 * referential for non-snapshots.
 	 */
 	(void) dnlc_purge_vfsp(zfsvfs->z_parent->z_vfs, 0);
 
 	/*
 	 * Unmount any snapshots mounted under .zfs before unmounting the
 	 * dataset itself.
 	 */
 	if (zfsvfs->z_ctldir != NULL &&
 	    (ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) {
 		return (ret);
 	}
 
 	if (!(fflag & MS_FORCE)) {
 		/*
 		 * Check the number of active vnodes in the file system.
 		 * Our count is maintained in the vfs structure, but the
 		 * number is off by 1 to indicate a hold on the vfs
 		 * structure itself.
 		 *
 		 * The '.zfs' directory maintains a reference of its
 		 * own, and any active references underneath are
 		 * reflected in the vnode count.
 		 */
 		if (zfsvfs->z_ctldir == NULL) {
 			if (vfsp->vfs_count > 1)
 				return (SET_ERROR(EBUSY));
 		} else {
 			if (vfsp->vfs_count > 2 ||
 			    zfsvfs->z_ctldir->v_count > 1)
 				return (SET_ERROR(EBUSY));
 		}
 	}
 
 	vfsp->vfs_flag |= VFS_UNMOUNTED;
 
 	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
 	os = zfsvfs->z_os;
 
 	/*
 	 * z_os will be NULL if there was an error in
 	 * attempting to reopen zfsvfs.
 	 */
 	if (os != NULL) {
 		/*
 		 * Unset the objset user_ptr.
 		 */
 		mutex_enter(&os->os_user_ptr_lock);
 		dmu_objset_set_user(os, NULL);
 		mutex_exit(&os->os_user_ptr_lock);
 
 		/*
 		 * Finally release the objset
 		 */
 		dmu_objset_disown(os, zfsvfs);
 	}
 
 	/*
 	 * We can now safely destroy the '.zfs' directory node.
 	 */
 	if (zfsvfs->z_ctldir != NULL)
 		zfsctl_destroy(zfsvfs);
 
 	return (0);
 }
 
 static int
 zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp)
 {
 	zfsvfs_t	*zfsvfs = vfsp->vfs_data;
 	znode_t		*zp;
 	uint64_t	object = 0;
 	uint64_t	fid_gen = 0;
 	uint64_t	gen_mask;
 	uint64_t	zp_gen;
 	int 		i, err;
 
 	*vpp = NULL;
 
 	ZFS_ENTER(zfsvfs);
 
 	if (fidp->fid_len == LONG_FID_LEN) {
 		zfid_long_t	*zlfid = (zfid_long_t *)fidp;
 		uint64_t	objsetid = 0;
 		uint64_t	setgen = 0;
 
 		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
 			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
 			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
 
 		ZFS_EXIT(zfsvfs);
 
 		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
 		if (err)
 			return (SET_ERROR(EINVAL));
 		ZFS_ENTER(zfsvfs);
 	}
 
 	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
 		zfid_short_t	*zfid = (zfid_short_t *)fidp;
 
 		for (i = 0; i < sizeof (zfid->zf_object); i++)
 			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
 
 		for (i = 0; i < sizeof (zfid->zf_gen); i++)
 			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
 	} else {
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	/* A zero fid_gen means we are in the .zfs control directories */
 	if (fid_gen == 0 &&
 	    (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) {
 		*vpp = zfsvfs->z_ctldir;
 		ASSERT(*vpp != NULL);
 		if (object == ZFSCTL_INO_SNAPDIR) {
 			VERIFY(zfsctl_root_lookup(*vpp, "snapshot", vpp, NULL,
 			    0, NULL, NULL, NULL, NULL, NULL) == 0);
 		} else {
 			VN_HOLD(*vpp);
 		}
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
 
 	gen_mask = -1ULL >> (64 - 8 * i);
 
 	dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask);
 	if (err = zfs_zget(zfsvfs, object, &zp)) {
 		ZFS_EXIT(zfsvfs);
 		return (err);
 	}
 	(void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen,
 	    sizeof (uint64_t));
 	zp_gen = zp_gen & gen_mask;
 	if (zp_gen == 0)
 		zp_gen = 1;
 	if (zp->z_unlinked || zp_gen != fid_gen) {
 		dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen);
 		VN_RELE(ZTOV(zp));
 		ZFS_EXIT(zfsvfs);
 		return (SET_ERROR(EINVAL));
 	}
 
 	*vpp = ZTOV(zp);
 	ZFS_EXIT(zfsvfs);
 	return (0);
 }
 
 /*
  * Block out VOPs and close zfsvfs_t::z_os
  *
  * Note, if successful, then we return with the 'z_teardown_lock' and
  * 'z_teardown_inactive_lock' write held.  We leave ownership of the underlying
  * dataset and objset intact so that they can be atomically handed off during
  * a subsequent rollback or recv operation and the resume thereafter.
  */
 int
 zfs_suspend_fs(zfsvfs_t *zfsvfs)
 {
 	int error;
 
 	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
 		return (error);
 
 	return (0);
 }
 
 /*
  * Rebuild SA and release VOPs.  Note that ownership of the underlying dataset
  * is an invariant across any of the operations that can be performed while the
  * filesystem was suspended.  Whether it succeeded or failed, the preconditions
  * are the same: the relevant objset and associated dataset are owned by
  * zfsvfs, held, and long held on entry.
  */
 int
 zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
 {
 	int err;
 	znode_t *zp;
 
 	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
 	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
 
 	/*
 	 * We already own this, so just update the objset_t, as the one we
 	 * had before may have been evicted.
 	 */
 	objset_t *os;
 	VERIFY3P(ds->ds_owner, ==, zfsvfs);
 	VERIFY(dsl_dataset_long_held(ds));
 	VERIFY0(dmu_objset_from_ds(ds, &os));
 
 	err = zfsvfs_init(zfsvfs, os);
 	if (err != 0)
 		goto bail;
 
 	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
 
 	zfs_set_fuid_feature(zfsvfs);
 
 	/*
 	 * Attempt to re-establish all the active znodes with
 	 * their dbufs.  If a zfs_rezget() fails, then we'll let
 	 * any potential callers discover that via ZFS_ENTER_VERIFY_VP
 	 * when they try to use their znode.
 	 */
 	mutex_enter(&zfsvfs->z_znodes_lock);
 	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
 	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
 		(void) zfs_rezget(zp);
 	}
 	mutex_exit(&zfsvfs->z_znodes_lock);
 
 bail:
 	/* release the VOPs */
 	rw_exit(&zfsvfs->z_teardown_inactive_lock);
 	rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
 
 	if (err) {
 		/*
 		 * Since we couldn't setup the sa framework, try to force
 		 * unmount this file system.
 		 */
 		if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0)
 			(void) dounmount(zfsvfs->z_vfs, MS_FORCE, CRED());
 	}
 	return (err);
 }
 
 static void
 zfs_freevfs(vfs_t *vfsp)
 {
 	zfsvfs_t *zfsvfs = vfsp->vfs_data;
 
 	/*
 	 * If this is a snapshot, we have an extra VFS_HOLD on our parent
 	 * from zfs_mount().  Release it here.  If we came through
 	 * zfs_mountroot() instead, we didn't grab an extra hold, so
 	 * skip the VFS_RELE for rootvfs.
 	 */
 	if (zfsvfs->z_issnap && (vfsp != rootvfs))
 		VFS_RELE(zfsvfs->z_parent->z_vfs);
 
 	zfsvfs_free(zfsvfs);
 
 	atomic_dec_32(&zfs_active_fs_count);
 }
 
 /*
  * VFS_INIT() initialization.  Note that there is no VFS_FINI(),
  * so we can't safely do any non-idempotent initialization here.
  * Leave that to zfs_init() and zfs_fini(), which are called
  * from the module's _init() and _fini() entry points.
  */
 /*ARGSUSED*/
 static int
 zfs_vfsinit(int fstype, char *name)
 {
 	int error;
 
 	zfsfstype = fstype;
 
 	/*
 	 * Setup vfsops and vnodeops tables.
 	 */
 	error = vfs_setfsops(fstype, zfs_vfsops_template, &zfs_vfsops);
 	if (error != 0) {
 		cmn_err(CE_WARN, "zfs: bad vfs ops template");
 	}
 
 	error = zfs_create_op_tables();
 	if (error) {
 		zfs_remove_op_tables();
 		cmn_err(CE_WARN, "zfs: bad vnode ops template");
 		(void) vfs_freevfsops_by_type(zfsfstype);
 		return (error);
 	}
 
 	mutex_init(&zfs_dev_mtx, NULL, MUTEX_DEFAULT, NULL);
 
 	/*
 	 * Unique major number for all zfs mounts.
 	 * If we run out of 32-bit minors, we'll getudev() another major.
 	 */
 	zfs_major = ddi_name_to_major(ZFS_DRIVER);
 	zfs_minor = ZFS_MIN_MINOR;
 
 	return (0);
 }
 
 void
 zfs_init(void)
 {
 	/*
 	 * Initialize .zfs directory structures
 	 */
 	zfsctl_init();
 
 	/*
 	 * Initialize znode cache, vnode ops, etc...
 	 */
 	zfs_znode_init();
 
 	dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb);
 }
 
 void
 zfs_fini(void)
 {
 	zfsctl_fini();
 	zfs_znode_fini();
 }
 
 int
 zfs_busy(void)
 {
 	return (zfs_active_fs_count != 0);
 }
 
 int
 zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers)
 {
 	int error;
 	objset_t *os = zfsvfs->z_os;
 	dmu_tx_t *tx;
 
 	if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION)
 		return (SET_ERROR(EINVAL));
 
 	if (newvers < zfsvfs->z_version)
 		return (SET_ERROR(EINVAL));
 
 	if (zfs_spa_version_map(newvers) >
 	    spa_version(dmu_objset_spa(zfsvfs->z_os)))
 		return (SET_ERROR(ENOTSUP));
 
 	tx = dmu_tx_create(os);
 	dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR);
 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
 		dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE,
 		    ZFS_SA_ATTRS);
 		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL);
 	}
 	error = dmu_tx_assign(tx, TXG_WAIT);
 	if (error) {
 		dmu_tx_abort(tx);
 		return (error);
 	}
 
 	error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR,
 	    8, 1, &newvers, tx);
 
 	if (error) {
 		dmu_tx_commit(tx);
 		return (error);
 	}
 
 	if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) {
 		uint64_t sa_obj;
 
 		ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=,
 		    SPA_VERSION_SA);
 		sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE,
 		    DMU_OT_NONE, 0, tx);
 
 		error = zap_add(os, MASTER_NODE_OBJ,
 		    ZFS_SA_ATTRS, 8, 1, &sa_obj, tx);
 		ASSERT0(error);
 
 		VERIFY(0 == sa_set_sa_object(os, sa_obj));
 		sa_register_update_callback(os, zfs_sa_upgrade);
 	}
 
 	spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx,
 	    "from %llu to %llu", zfsvfs->z_version, newvers);
 
 	dmu_tx_commit(tx);
 
 	zfsvfs->z_version = newvers;
+	os->os_version = newvers;
 
 	zfs_set_fuid_feature(zfsvfs);
 
 	return (0);
 }
 
 /*
  * Read a property stored within the master node.
  */
 int
 zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value)
 {
-	const char *pname;
-	int error = ENOENT;
+	uint64_t *cached_copy = NULL;
 
 	/*
-	 * Look up the file system's value for the property.  For the
-	 * version property, we look up a slightly different string.
+	 * Figure out where in the objset_t the cached copy would live, if it
+	 * is available for the requested property.
 	 */
-	if (prop == ZFS_PROP_VERSION)
+	if (os != NULL) {
+		switch (prop) {
+		case ZFS_PROP_VERSION:
+			cached_copy = &os->os_version;
+			break;
+		case ZFS_PROP_NORMALIZE:
+			cached_copy = &os->os_normalization;
+			break;
+		case ZFS_PROP_UTF8ONLY:
+			cached_copy = &os->os_utf8only;
+			break;
+		case ZFS_PROP_CASE:
+			cached_copy = &os->os_casesensitivity;
+			break;
+		default:
+			break;
+		}
+	}
+	if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) {
+		*value = *cached_copy;
+		return (0);
+	}
+
+	/*
+	 * If the property wasn't cached, look up the file system's value for
+	 * the property. For the version property, we look up a slightly
+	 * different string.
+	 */
+	const char *pname;
+	int error = ENOENT;
+	if (prop == ZFS_PROP_VERSION) {
 		pname = ZPL_VERSION_STR;
-	else
+	} else {
 		pname = zfs_prop_to_name(prop);
+	}
 
 	if (os != NULL) {
 		ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS);
 		error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value);
 	}
 
 	if (error == ENOENT) {
 		/* No value set, use the default value */
 		switch (prop) {
 		case ZFS_PROP_VERSION:
 			*value = ZPL_VERSION;
 			break;
 		case ZFS_PROP_NORMALIZE:
 		case ZFS_PROP_UTF8ONLY:
 			*value = 0;
 			break;
 		case ZFS_PROP_CASE:
 			*value = ZFS_CASE_SENSITIVE;
 			break;
 		default:
 			return (error);
 		}
 		error = 0;
 	}
+
+	/*
+	 * If one of the methods for getting the property value above worked,
+	 * copy it into the objset_t's cache.
+	 */
+	if (error == 0 && cached_copy != NULL) {
+		*cached_copy = *value;
+	}
+
 	return (error);
 }
 
 /*
  * Return true if the coresponding vfs's unmounted flag is set.
  * Otherwise return false.
  * If this function returns true we know VFS unmount has been initiated.
  */
 boolean_t
 zfs_get_vfs_flag_unmounted(objset_t *os)
 {
 	zfsvfs_t *zfvp;
 	boolean_t unmounted = B_FALSE;
 
 	ASSERT(dmu_objset_type(os) == DMU_OST_ZFS);
 
 	mutex_enter(&os->os_user_ptr_lock);
 	zfvp = dmu_objset_get_user(os);
 	if (zfvp != NULL && zfvp->z_vfs != NULL &&
 	    (zfvp->z_vfs->vfs_flag & VFS_UNMOUNTED))
 		unmounted = B_TRUE;
 	mutex_exit(&os->os_user_ptr_lock);
 
 	return (unmounted);
 }
 
 static vfsdef_t vfw = {
 	VFSDEF_VERSION,
 	MNTTYPE_ZFS,
 	zfs_vfsinit,
 	VSW_HASPROTO|VSW_CANRWRO|VSW_CANREMOUNT|VSW_VOLATILEDEV|VSW_STATS|
 	    VSW_XID|VSW_ZMOUNT,
 	&zfs_mntopts
 };
 
 struct modlfs zfs_modlfs = {
 	&mod_fsops, "ZFS filesystem version " SPA_VERSION_STRING, &vfw
 };