Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 339124) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 339125) @@ -1,1337 +1,1342 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); tx->tx_dir = dd; if (dd != NULL) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); tx->tx_start = gethrtime(); return (tx); } dmu_tx_t * dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; return (tx); } dmu_tx_t * dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) { dmu_tx_t *tx = dmu_tx_create_dd(NULL); txg_verify(dp->dp_spa, txg); tx->tx_pool = dp; tx->tx_txg = txg; tx->tx_anyobj = TRUE; return (tx); } int dmu_tx_is_syncing(dmu_tx_t *tx) { return (tx->tx_anyobj); } int dmu_tx_private_ok(dmu_tx_t *tx) { return (tx->tx_anyobj); } static dmu_tx_hold_t * dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dmu_tx_hold_t *txh; if (dn != NULL) { (void) refcount_add(&dn->dn_holds, tx); if (tx->tx_txg != 0) { mutex_enter(&dn->dn_mtx); /* * dn->dn_assigned_txg == tx->tx_txg doesn't pose a * problem, but there's no way for it to happen (for * now, at least). 
*/ ASSERT(dn->dn_assigned_txg == 0); dn->dn_assigned_txg = tx->tx_txg; (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } } txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); txh->txh_tx = tx; txh->txh_dnode = dn; refcount_create(&txh->txh_space_towrite); refcount_create(&txh->txh_memory_tohold); txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; list_insert_tail(&tx->tx_holds, txh); return (txh); } static dmu_tx_hold_t * dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dnode_t *dn = NULL; dmu_tx_hold_t *txh; int err; if (object != DMU_NEW_OBJECT) { err = dnode_hold(os, object, FTAG, &dn); if (err != 0) { tx->tx_err = err; return (NULL); } } txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); if (dn != NULL) dnode_rele(dn, FTAG); return (txh); } void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) { /* * If we're syncing, they can manipulate any object anyhow, and * the hold on the dnode_t can cause problems. */ if (!dmu_tx_is_syncing(tx)) (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); } /* * This function reads specified data from disk. The specified data will * be needed to perform the transaction -- i.e, it will be read after * we do dmu_tx_assign(). There are two reasons that we read the data now * (before dmu_tx_assign()): * * 1. Reading it now has potentially better performance. The transaction * has not yet been assigned, so the TXG is not held open, and also the * caller typically has less locks held when calling dmu_tx_hold_*() than * after the transaction has been assigned. This reduces the lock (and txg) * hold times, thus reducing lock contention. * * 2. It is easier for callers (primarily the ZPL) to handle i/o errors * that are detected before they start making changes to the DMU state * (i.e. now). Once the transaction has been assigned, and some DMU * state has been changed, it can be difficult to recover from an i/o * error (e.g. to undo the changes already made in memory at the DMU * layer). Typically code to do so does not exist in the caller -- it * assumes that the data has already been cached and thus i/o errors are * not possible. * * It has been observed that the i/o initiated here can be a performance * problem, and it appears to be optional, because we don't look at the * data which is read. However, removing this read would only serve to * move the work elsewhere (after the dmu_tx_assign()), where it may * have a greater impact on performance (in addition to the impact on * fault tolerance noted above). */ static int dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) { int err; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold_level(dn, level, blkid, FTAG); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); dbuf_rele(db, FTAG); return (err); } /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; int err = 0; if (len == 0) return; (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) err = SET_ERROR(EFBIG); if (dn == NULL) return; /* * For i/o error checking, read the blocks that will be needed * to perform the write: the first and last level-0 blocks (if * they are not aligned, i.e. if they are partial-block writes), * and all the level-1 blocks. 
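 *
 * As an illustrative worked example (assuming a 128KB data block size
 * and an object that already has indirect blocks), a write of
 * len=300KB at off=64KB partially overwrites level-0 blocks 0 and 2,
 * so both of those boundary blocks are read here; level-0 block 1 is
 * replaced in full and is therefore not read.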
*/ if (dn->dn_maxblkid == 0) { if (off < dn->dn_datablksz && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { txh->txh_tx->tx_err = err; } } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ uint64_t start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err != 0) { txh->txh_tx->tx_err = err; } } /* last level-0 block */ uint64_t end = (off + len - 1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off + len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err != 0) { txh->txh_tx->tx_err = err; } } /* level-1 blocks */ if (dn->dn_nlevels > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = (start >> shft) + 1; i < end >> shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { txh->txh_tx->tx_err = err; } } } err = zio_wait(zio); if (err != 0) { txh->txh_tx->tx_err = err; } } } static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { (void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG); } void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, off, len); if (txh != NULL) { dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } } void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, 0, 0); if (txh == NULL) return; dnode_t *dn = txh->txh_dnode; (void) refcount_add_many(&txh->txh_space_towrite, 1ULL << dn->dn_indblkshift, FTAG); dmu_tx_count_dnode(txh); } void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); if (txh != NULL) { dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } } /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and * this transaction will be able to use half of the pool space overhead * (see dsl_pool_adjustedsize()). Therefore this function should only * be called for transactions that we expect will not cause a net increase * in the amount of space used (but it's OK if that is occasionally not true). */ void dmu_tx_mark_netfree(dmu_tx_t *tx) { tx->tx_netfree = B_TRUE; } static void dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dmu_tx_t *tx; dnode_t *dn; int err; tx = txh->txh_tx; ASSERT(tx->tx_txg == 0); dn = txh->txh_dnode; dmu_tx_count_dnode(txh); if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; /* * For i/o error checking, we read the first and last level-0 * blocks if they are not aligned, and all the level-1 blocks. * * Note: dbuf_free_range() assumes that we have not instantiated * any level-0 dbufs that will be completely freed. Therefore we must * exercise care to not read or count the first and last blocks * if they are blocksize-aligned. 
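 *
 * As a worked example (again assuming 128KB data blocks), freeing
 * len=256KB at off=128KB is blocksize-aligned at both ends, so neither
 * boundary block is read or counted here; freeing len=200KB at
 * off=100KB reads and counts the level-0 blocks containing offsets
 * 100KB and 300KB, since both boundaries fall mid-block.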
*/ if (dn->dn_datablkshift == 0) { if (off != 0 || len < dn->dn_datablksz) dmu_tx_count_write(txh, 0, dn->dn_datablksz); } else { /* first block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off, 1); /* last block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off + len, 1); } /* * Check level-1 blocks. */ if (dn->dn_nlevels > 1) { int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t start = off >> shift; uint64_t end = (off + len) >> shift; ASSERT(dn->dn_indblkshift != 0); /* * dnode_reallocate() can result in an object with indirect * blocks having an odd data block size. In this case, * just check the single block. */ if (dn->dn_datablkshift == 0) start = end = 0; zio_t *zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (uint64_t i = start; i <= end; i++) { uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH || i > end) break; if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } (void) refcount_add_many(&txh->txh_memory_tohold, 1 << dn->dn_indblkshift, FTAG); err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } } err = zio_wait(zio); if (err != 0) { tx->tx_err = err; return; } } } void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, off, len); if (txh != NULL) (void) dmu_tx_hold_free_impl(txh, off, len); } void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); if (txh != NULL) (void) dmu_tx_hold_free_impl(txh, off, len); } static void dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) { dmu_tx_t *tx = txh->txh_tx; dnode_t *dn; int err; ASSERT(tx->tx_txg == 0); dn = txh->txh_dnode; dmu_tx_count_dnode(txh); /* * Modifying a almost-full microzap is around the worst case (128KB) * * If it is a fat zap, the worst case would be 7*16KB=112KB: * - 3 blocks overwritten: target leaf, ptrtbl block, header block * - 4 new blocks written if adding: * - 2 blocks for possibly split leaves, * - 2 grown ptrtbl blocks */ (void) refcount_add_many(&txh->txh_space_towrite, MZAP_MAX_BLKSZ, FTAG); if (dn == NULL) return; ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 || name == NULL) { /* * This is a microzap (only one block), or we don't know * the name. Check the first block for i/o errors. */ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { tx->tx_err = err; } } else { /* * Access the name so that we'll check for i/o errors to * the leaf blocks, etc. We ignore ENOENT, as this name * may not yet exist. 
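 *
 * A minimal caller-side sketch (the object number, name and value here
 * are hypothetical, not taken from this change) of holding a zap for a
 * single-entry add and performing it once the tx is assigned:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_zap(tx, dir_obj, B_TRUE, "newname");
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	error = zap_add(os, dir_obj, "newname", 8, 1, &value, tx);
 *	dmu_tx_commit(tx);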
*/ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); if (err == EIO || err == ECKSUM || err == ENXIO) { tx->tx_err = err; } } } void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP, add, (uintptr_t)name); if (txh != NULL) dmu_tx_hold_zap_impl(txh, name); } void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT(dn != NULL); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); if (txh != NULL) dmu_tx_hold_zap_impl(txh, name); } void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); } #ifdef ZFS_DEBUG void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { boolean_t match_object = B_FALSE; boolean_t match_offset = B_FALSE; DB_DNODE_ENTER(db); dnode_t *dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); if (tx->tx_anyobj) { DB_DNODE_EXIT(db); return; } /* XXX No checking on the meta dnode for now */ if (db->db.db_object == DMU_META_DNODE_OBJECT) { DB_DNODE_EXIT(db); return; } for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) match_object = TRUE; if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { int datablkshift = dn->dn_datablkshift ? dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; int shift = datablkshift + epbs * db->db_level; uint64_t beginblk = shift >= 64 ? 0 : (txh->txh_arg1 >> shift); uint64_t endblk = shift >= 64 ? 0 : ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); uint64_t blkid = db->db_blkid; /* XXX txh_arg2 better not be zero... */ dprintf("found txh type %x beginblk=%llx endblk=%llx\n", txh->txh_type, beginblk, endblk); switch (txh->txh_type) { case THT_WRITE: if (blkid >= beginblk && blkid <= endblk) match_offset = TRUE; /* * We will let this hold work for the bonus * or spill buffer so that we don't need to * hold it when creating a new object. */ if (blkid == DMU_BONUS_BLKID || blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, * thus dirtying the new TLIBs. Or the * might have to change the block size, * thus dirying the new lvl=0 blk=0. */ if (blkid == 0) match_offset = TRUE; break; case THT_FREE: /* * We will dirty all the level 1 blocks in * the free range and perhaps the first and * last level 0 block. 
*/ if (blkid >= beginblk && (blkid <= endblk || txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_SPILL: if (blkid == DMU_SPILL_BLKID) match_offset = TRUE; break; case THT_BONUS: if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: match_offset = TRUE; break; case THT_NEWOBJECT: match_object = TRUE; break; default: ASSERT(!"bad txh_type"); } } if (match_object && match_offset) { DB_DNODE_EXIT(db); return; } } DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); } #endif /* * If we can't do 10 iops, something is wrong. Let us go ahead * and hit zfs_dirty_data_max. */ hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ /* * We delay transactions when we've determined that the backend storage * isn't able to accommodate the rate of incoming writes. * * If there is already a transaction waiting, we delay relative to when * that transaction finishes waiting. This way the calculated min_time * is independent of the number of threads concurrently executing * transactions. * * If we are the only waiter, wait relative to when the transaction * started, rather than the current time. This credits the transaction for * "time already served", e.g. reading indirect blocks. * * The minimum time for a transaction to take is calculated as: * min_time = scale * (dirty - min) / (max - dirty) * min_time is then capped at zfs_delay_max_ns. * * The delay has two degrees of freedom that can be adjusted via tunables. * The percentage of dirty data at which we start to delay is defined by * zfs_delay_min_dirty_percent. This should typically be at or above * zfs_vdev_async_write_active_max_dirty_percent so that we only start to * delay after writing at full speed has failed to keep up with the incoming * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly * speaking, this variable determines the amount of delay at the midpoint of * the curve. * * delay * 10ms +-------------------------------------------------------------*+ * | *| * 9ms + *+ * | *| * 8ms + *+ * | * | * 7ms + * + * | * | * 6ms + * + * | * | * 5ms + * + * | * | * 4ms + * + * | * | * 3ms + * + * | * | * 2ms + (midpoint) * + * | | ** | * 1ms + v *** + * | zfs_delay_scale ----------> ******** | * 0 +-------------------------------------*********----------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note that since the delay is added to the outstanding time remaining on the * most recent transaction, the delay is effectively the inverse of IOPS. * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve * was chosen such that small changes in the amount of accumulated dirty data * in the first 3/4 of the curve yield relatively small differences in the * amount of delay. * * The effects can be easier to understand when the amount of delay is * represented on a log scale: * * delay * 100ms +-------------------------------------------------------------++ * + + * | | * + *+ * 10ms + *+ * + ** + * | (midpoint) ** | * + | ** + * 1ms + v **** + * + zfs_delay_scale ----------> ***** + * | **** | * + **** + * 100us + ** + * + * + * | * | * + * + * 10us + * + * + + * | | * + + * +--------------------------------------------------------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note here that only as the amount of dirty data approaches its limit does * the delay start to increase rapidly. 
The goal of a properly tuned system * should be to keep the amount of dirty data out of that range by first * ensuring that the appropriate limits are set for the I/O scheduler to reach * optimal throughput on the backend storage, and then by changing the value * of zfs_delay_scale to increase the steepness of the curve. */ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; hrtime_t wakeup, min_tx_time, now; if (dirty <= delay_min_bytes) return; /* * The caller has already waited until we are under the max. * We make them pass us the amount of dirty data so we don't * have to handle the case of it being >= the max, which could * cause a divide-by-zero if it's == the max. */ ASSERT3U(dirty, <, zfs_dirty_data_max); now = gethrtime(); min_tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); if (now > tx->tx_start + min_tx_time) return; min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, uint64_t, min_tx_time); mutex_enter(&dp->dp_lock); wakeup = MAX(tx->tx_start + min_tx_time, dp->dp_last_wakeup + min_tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); #ifdef _KERNEL #ifdef illumos mutex_enter(&curthread->t_delay_lock); while (cv_timedwait_hires(&curthread->t_delay_cv, &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) continue; mutex_exit(&curthread->t_delay_lock); #else pause_sbt("dmu_tx_delay", nstosbt(wakeup), nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE); #endif #else hrtime_t delta = wakeup - gethrtime(); struct timespec ts; ts.tv_sec = delta / NANOSEC; ts.tv_nsec = delta % NANOSEC; (void) nanosleep(&ts, NULL); #endif } /* * This routine attempts to assign the transaction to a transaction group. * To do so, we must determine if there is sufficient free space on disk. * * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() * on it), then it is assumed that there is sufficient free space, * unless there's insufficient slop space in the pool (see the comment * above spa_slop_shift in spa_misc.c). * * If it is not a "netfree" transaction, then if the data already on disk * is over the allowed usage (e.g. quota), this will fail with EDQUOT or * ENOSPC. Otherwise, if the current rough estimate of pending changes, * plus the rough estimate of this transaction's changes, may exceed the * allowed usage, then this will fail with ERESTART, which will cause the * caller to wait for the pending changes to be written to disk (by waiting * for the next TXG to open), and then check the space usage again. * * The rough estimate of pending changes is comprised of the sum of: * * - this transaction's holds' txh_space_towrite * * - dd_tempreserved[], which is the sum of in-flight transactions' * holds' txh_space_towrite (i.e. those transactions that have called * dmu_tx_assign() but not yet called dmu_tx_commit()). * * - dd_space_towrite[], which is the amount of dirtied dbufs. * * Note that all of these values are inflated by spa_get_worst_case_asize(), * which means that we may get ERESTART well before we are actually in danger * of running out of space, but this also mitigates any small inaccuracies * in the rough estimate (e.g. txh_space_towrite doesn't take into account * indirect blocks, and dd_space_towrite[] doesn't take into account changes * to the MOS). 
* * Note that due to this algorithm, it is possible to exceed the allowed * usage by one transaction. Also, as we approach the allowed usage, * we will allow a very limited amount of changes into each TXG, thus * decreasing performance. */ static int dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { spa_t *spa = tx->tx_pool->dp_spa; ASSERT0(tx->tx_txg); if (tx->tx_err) return (tx->tx_err); if (spa_suspended(spa)) { /* * If the user has indicated a blocking failure mode * then return ERESTART which will block in dmu_tx_wait(). * Otherwise, return EIO so that an error can get * propagated back to the VOP calls. * * Note that we always honor the txg_how flag regardless * of the failuremode setting. */ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && !(txg_how & TXG_WAIT)) return (SET_ERROR(EIO)); return (SET_ERROR(ERESTART)); } if (!tx->tx_dirty_delayed && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; return (SET_ERROR(ERESTART)); } tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; /* * NB: No error returns are allowed after txg_hold_open, but * before processing the dnode holds, due to the * dmu_tx_unassign() logic. */ uint64_t towrite = 0; uint64_t tohold = 0; for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn != NULL) { mutex_enter(&dn->dn_mtx); if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = txh; return (SET_ERROR(ERESTART)); } if (dn->dn_assigned_txg == 0) dn->dn_assigned_txg = tx->tx_txg; ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } towrite += refcount_count(&txh->txh_space_towrite); tohold += refcount_count(&txh->txh_memory_tohold); } /* needed allocation: worst-case estimate of write space */ uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); /* calculate memory footprint estimate */ uint64_t memory = towrite + tohold; if (tx->tx_dir != NULL && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); if (err != 0) return (err); } return (0); } static void dmu_tx_unassign(dmu_tx_t *tx) { if (tx->tx_txg == 0) return; txg_rele_to_quiesce(&tx->tx_txgh); /* * Walk the transaction's hold list, removing the hold on the * associated dnode, and notifying waiters if the refcount drops to 0. */ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } txg_rele_to_sync(&tx->tx_txgh); tx->tx_lasttried_txg = tx->tx_txg; tx->tx_txg = 0; } /* * Assign tx to a transaction group; txg_how is a bitmask: * * If TXG_WAIT is set and the currently open txg is full, this function * will wait until there's a new txg. This should be used when no locks * are being held. With this bit set, this function will only fail if * we're truly out of space (or over quota). * * If TXG_WAIT is *not* set and we can't assign into the currently open * txg without blocking, this function will return immediately with * ERESTART. This should be used whenever locks are being held. 
On an * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), * and try again. * * If TXG_NOTHROTTLE is set, this indicates that this tx should not be * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for * details on the throttle). This is used by the VFS operations, after * they have already called dmu_tx_wait() (though most likely on a * different tx). */ int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) { int err; ASSERT(tx->tx_txg == 0); ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); /* If we might wait, we must not hold the config lock. */ IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); if ((txg_how & TXG_NOTHROTTLE)) tx->tx_dirty_delayed = B_TRUE; while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); if (err != ERESTART || !(txg_how & TXG_WAIT)) return (err); dmu_tx_wait(tx); } txg_rele_to_quiesce(&tx->tx_txgh); return (0); } void dmu_tx_wait(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; dsl_pool_t *dp = tx->tx_pool; ASSERT(tx->tx_txg == 0); ASSERT(!dsl_pool_config_held(tx->tx_pool)); if (tx->tx_wait_dirty) { /* * dmu_tx_try_assign() has determined that we need to wait * because we've consumed much or all of the dirty buffer * space. */ mutex_enter(&dp->dp_lock); while (dp->dp_dirty_total >= zfs_dirty_data_max) cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); dmu_tx_delay(tx, dirty); tx->tx_wait_dirty = B_FALSE; /* * Note: setting tx_dirty_delayed only has effect if the * caller used TX_WAIT. Otherwise they are going to * destroy this tx and try again. The common case, * zfs_write(), uses TX_WAIT. */ tx->tx_dirty_delayed = B_TRUE; } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { /* * If the pool is suspended we need to wait until it * is resumed. Note that it's possible that the pool * has become active after this thread has tried to * obtain a tx. If that's the case then tx_lasttried_txg * would not have been set. */ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } else if (tx->tx_needassign_txh) { /* * A dnode is assigned to the quiescing txg. Wait for its * transaction to complete. */ dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) cv_wait(&dn->dn_notxholds, &dn->dn_mtx); mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = NULL; } else { - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); + /* + * If we have a lot of dirty data just wait until we sync + * out a TXG at which point we'll hopefully have synced + * a portion of the changes. + */ + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } } static void dmu_tx_destroy(dmu_tx_t *tx) { dmu_tx_hold_t *txh; while ((txh = list_head(&tx->tx_holds)) != NULL) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); refcount_destroy_many(&txh->txh_space_towrite, refcount_count(&txh->txh_space_towrite)); refcount_destroy_many(&txh->txh_memory_tohold, refcount_count(&txh->txh_memory_tohold)); kmem_free(txh, sizeof (dmu_tx_hold_t)); if (dn != NULL) dnode_rele(dn, tx); } list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); kmem_free(tx, sizeof (dmu_tx_t)); } void dmu_tx_commit(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); /* * Go through the transaction's hold list and remove holds on * associated dnodes, notifying waiters if no holds remain. 
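 *
 * For reference, a hedged sketch of the caller-side retry pattern
 * described for dmu_tx_assign() above when TXG_WAIT is not used
 * (the caller holds its own locks; identifiers other than the DMU
 * calls are hypothetical):
 *
 *	top:
 *		tx = dmu_tx_create(os);
 *		dmu_tx_hold_write(tx, object, off, len);
 *		error = dmu_tx_assign(tx, 0);
 *		if (error == ERESTART) {
 *			(drop the caller's own locks here)
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		} else if (error != 0) {
 *			dmu_tx_abort(tx);
 *			return (error);
 *		}
 *		dmu_write(os, object, off, len, buf, tx);
 *		dmu_tx_commit(tx);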
*/ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); if (!list_is_empty(&tx->tx_callbacks)) txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); dmu_tx_destroy(tx); } void dmu_tx_abort(dmu_tx_t *tx) { ASSERT(tx->tx_txg == 0); /* * Call any registered callbacks with an error code. */ if (!list_is_empty(&tx->tx_callbacks)) dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); dmu_tx_destroy(tx); } uint64_t dmu_tx_get_txg(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } dsl_pool_t * dmu_tx_pool(dmu_tx_t *tx) { ASSERT(tx->tx_pool != NULL); return (tx->tx_pool); } void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { dmu_tx_callback_t *dcb; dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); dcb->dcb_func = func; dcb->dcb_data = data; list_insert_tail(&tx->tx_callbacks, dcb); } /* * Call all the commit callbacks on a list, with a given error code. */ void dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; while ((dcb = list_head(cb_list)) != NULL) { list_remove(cb_list, dcb); dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } } /* * Interface to hold a bunch of attributes. * used for creating new files. * attrsize is the total size of all attributes * to be added during object creation * * For updating/adding a single attribute dmu_tx_hold_sa() should be used. */ /* * hold necessary attribute name for attribute registration. * should be a very rare case where this is needed. If it does * happen it would only happen on the first write to the file system. 
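 *
 * Stepping back to the commit callback interface above, a minimal
 * sketch of registering one (the callback and its argument are
 * hypothetical); the function runs after the assigned txg has synced,
 * with error 0, or with ECANCELED if the tx is aborted instead:
 *
 *	static void
 *	my_commit_cb(void *arg, int error)
 *	{
 *		(release or log "arg" here depending on "error")
 *	}
 *
 *	dmu_tx_callback_register(tx, my_commit_cb, arg);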
*/ static void dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) { if (!sa->sa_need_attr_registration) return; for (int i = 0; i != sa->sa_num_attrs; i++) { if (!sa->sa_attr_table[i].sa_registered) { if (sa->sa_reg_attr_obj) dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, B_TRUE, sa->sa_attr_table[i].sa_name); else dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, sa->sa_attr_table[i].sa_name); } } } void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_SPILL, 0, 0); (void) refcount_add_many(&txh->txh_space_towrite, SPA_OLD_MAXBLOCKSIZE, FTAG); } void dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) { sa_os_t *sa = tx->tx_objset->os_sa; dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_layout_attr_obj) { dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); } else { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) return; (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPILL, 0, 0); } /* * Hold SA attribute * * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) * * variable_size is the total size of all variable sized attributes * passed to this function. It is not the total size of all * variable size attributes that *may* exist on this object. */ void dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) { uint64_t object; sa_os_t *sa = tx->tx_objset->os_sa; ASSERT(hdl != NULL); object = sa_handle_object(hdl); dmu_tx_hold_bonus(tx, object); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); if (sa->sa_force_spill || may_grow || hdl->sa_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_have_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } DB_DNODE_EXIT(db); } } Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h (revision 339124) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h (revision 339125) @@ -1,124 +1,125 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_TXG_IMPL_H #define _SYS_TXG_IMPL_H #include #include #ifdef __cplusplus extern "C" { #endif /* * The tx_cpu structure is a per-cpu structure that is used to track * the number of active transaction holds (tc_count). As transactions * are assigned into a transaction group the appropriate tc_count is * incremented to indicate that there are pending changes that have yet * to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement * the tc_count. A transaction group is not considered quiesced until all * tx_cpu structures have reached a tc_count of zero. * * This structure is a per-cpu structure by design. Updates to this structure * are frequent and concurrent. Having a single structure would result in * heavy lock contention so a per-cpu design was implemented. With the fanned * out mutex design, consumers only need to lock the mutex associated with * thread's cpu. * * The tx_cpu contains two locks, the tc_lock and tc_open_lock. * The tc_lock is used to protect all members of the tx_cpu structure with * the exception of the tc_open_lock. This lock should only be held for a * short period of time, typically when updating the value of tc_count. * * The tc_open_lock protects the tx_open_txg member of the tx_state structure. * This lock is used to ensure that transactions are only assigned into * the current open transaction group. In order to move the current open * transaction group to the quiesce phase, the txg_quiesce thread must * grab all tc_open_locks, increment the tx_open_txg, and drop the locks. * The tc_open_lock is held until the transaction is assigned into the * transaction group. Typically, this is a short operation but if throttling * is occuring it may be held for longer periods of time. */ struct tx_cpu { kmutex_t tc_open_lock; /* protects tx_open_txg */ kmutex_t tc_lock; /* protects the rest of this struct */ kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ char tc_pad[8]; /* pad to fill 3 cache lines */ }; /* * The tx_state structure maintains the state information about the different * stages of the pool's transcation groups. A per pool tx_state structure * is used to track this information. The tx_state structure also points to * an array of tx_cpu structures (described above). Although the tx_sync_lock * is used to protect the members of this structure, it is not used to * protect the tx_open_txg. Instead a special lock in the tx_cpu structure * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock. * Any thread wishing to update tx_open_txg must grab the tc_open_lock on * every cpu (see txg_quiesce()). 
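 *
 * A hedged sketch of the hold protocol as a transaction sees it
 * (mirroring dmu_tx_assign() and dmu_tx_commit() in dmu_tx.c):
 *
 *	txg = txg_hold_open(dp, &th);	(tc_open_lock taken, tc_count bumped)
 *	txg_rele_to_quiesce(&th);	(tc_open_lock dropped, hold remains)
 *	... caller dirties in-memory state for txg ...
 *	txg_rele_to_sync(&th);		(tc_count dropped, quiesce may proceed)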
*/ typedef struct tx_state { tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */ kmutex_t tx_sync_lock; /* protects the rest of this struct */ uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiescing_txg; /* currently quiescing txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ uint64_t tx_synced_txg; /* last synced txg id */ hrtime_t tx_open_time; /* start time of tx_open_txg */ uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ kcondvar_t tx_sync_more_cv; kcondvar_t tx_sync_done_cv; kcondvar_t tx_quiesce_more_cv; kcondvar_t tx_quiesce_done_cv; kcondvar_t tx_timeout_cv; kcondvar_t tx_exit_cv; /* wait for all threads to exit */ uint8_t tx_threads; /* number of threads */ uint8_t tx_exiting; /* set when we're exiting */ kthread_t *tx_sync_thread; kthread_t *tx_quiesce_thread; taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ } tx_state_t; #ifdef __cplusplus } #endif #endif /* _SYS_TXG_IMPL_H */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c (revision 339124) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c (revision 339125) @@ -1,903 +1,932 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include /* * ZFS Transaction Groups * ---------------------- * * ZFS transaction groups are, as the name implies, groups of transactions * that act on persistent state. ZFS asserts consistency at the granularity of * these transaction groups. Each successive transaction group (txg) is * assigned a 64-bit consecutive identifier. There are three active * transaction group states: open, quiescing, or syncing. At any given time, * there may be an active txg associated with each state; each active txg may * either be processing, or blocked waiting to enter the next state. There may * be up to three active txgs, and there is always a txg in the open state * (though it may be blocked waiting to enter the quiescing state). In broad * strokes, transactions -- operations that change in-memory structures -- are * accepted into the txg in the open state, and are completed while the txg is * in the open or quiescing states. The accumulated changes are written to * disk in the syncing state. * * Open * * When a new txg becomes active, it first enters the open state. 
New * transactions -- updates to in-memory structures -- are assigned to the * currently open txg. There is always a txg in the open state so that ZFS can * accept new changes (though the txg may refuse new changes if it has hit * some limit). ZFS advances the open txg to the next state for a variety of * reasons such as it hitting a time or size threshold, or the execution of an * administrative action that must be completed in the syncing state. * * Quiescing * * After a txg exits the open state, it enters the quiescing state. The * quiescing state is intended to provide a buffer between accepting new * transactions in the open state and writing them out to stable storage in * the syncing state. While quiescing, transactions can continue their * operation without delaying either of the other states. Typically, a txg is * in the quiescing state very briefly since the operations are bounded by * software latencies rather than, say, slower I/O latencies. After all * transactions complete, the txg is ready to enter the next state. * * Syncing * * In the syncing state, the in-memory state built up during the open and (to * a lesser degree) the quiescing states is written to stable storage. The * process of writing out modified data can, in turn modify more data. For * example when we write new blocks, we need to allocate space for them; those * allocations modify metadata (space maps)... which themselves must be * written to stable storage. During the sync state, ZFS iterates, writing out * data until it converges and all in-memory changes have been written out. * The first such pass is the largest as it encompasses all the modified user * data (as opposed to filesystem metadata). Subsequent passes typically have * far less data to write as they consist exclusively of filesystem metadata. * * To ensure convergence, after a certain number of passes ZFS begins * overwriting locations on stable storage that had been allocated earlier in * the syncing state (and subsequently freed). ZFS usually allocates new * blocks to optimize for large, continuous, writes. For the syncing state to * converge however it must complete a pass where no new blocks are allocated * since each allocation requires a modification of persistent metadata. * Further, to hasten convergence, after a prescribed number of passes, ZFS * also defers frees, and stops compressing. * * In addition to writing out user data, we must also execute synctasks during * the syncing context. A synctask is the mechanism by which some * administrative activities work such as creating and destroying snapshots or * datasets. Note that when a synctask is initiated it enters the open txg, * and ZFS then pushes that txg as quickly as possible to completion of the * syncing state in order to reduce the latency of the administrative * activity. To complete the syncing state, ZFS writes out a new uberblock, * the root of the tree of blocks that comprise all state stored on the ZFS * pool. Finally, if there is a quiesced txg waiting, we signal that it can * now transition to the syncing state. */ static void txg_sync_thread(void *arg); static void txg_quiesce_thread(void *arg); int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG"); SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0, "Maximum seconds worth of delta per txg"); /* * Prepare the txg subsystem. 
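 *
 * With the tx_quiescing_txg tracking field (see txg_impl.h above), the
 * handoff between the quiesce and sync threads below proceeds roughly
 * as:
 *
 *	tx_open_txg advances (txg_quiesce() bumps it under the tc_open_locks)
 *	tx_quiescing_txg = txg    while the quiesce thread drains holds
 *	tx_quiescing_txg = 0; tx_quiesced_txg = txg    once fully quiesced
 *	tx_syncing_txg = txg      while spa_sync() runs
 *	tx_synced_txg = txg       when the txg is on stable storage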
*/ void txg_init(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; int c; bzero(tx, sizeof (tx_state_t)); tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); for (c = 0; c < max_ncpus; c++) { int i; mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT, NULL); for (i = 0; i < TXG_SIZE; i++) { cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); list_create(&tx->tx_cpu[c].tc_callbacks[i], sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); } } mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); tx->tx_open_txg = txg; } /* * Close down the txg subsystem. */ void txg_fini(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; int c; ASSERT0(tx->tx_threads); mutex_destroy(&tx->tx_sync_lock); cv_destroy(&tx->tx_sync_more_cv); cv_destroy(&tx->tx_sync_done_cv); cv_destroy(&tx->tx_quiesce_more_cv); cv_destroy(&tx->tx_quiesce_done_cv); cv_destroy(&tx->tx_exit_cv); for (c = 0; c < max_ncpus; c++) { int i; mutex_destroy(&tx->tx_cpu[c].tc_open_lock); mutex_destroy(&tx->tx_cpu[c].tc_lock); for (i = 0; i < TXG_SIZE; i++) { cv_destroy(&tx->tx_cpu[c].tc_cv[i]); list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); } } if (tx->tx_commit_cb_taskq != NULL) taskq_destroy(tx->tx_commit_cb_taskq); kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); bzero(tx, sizeof (tx_state_t)); } /* * Start syncing transaction groups. */ void txg_sync_start(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; mutex_enter(&tx->tx_sync_lock); dprintf("pool %p\n", dp); ASSERT0(tx->tx_threads); tx->tx_threads = 2; tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, dp, 0, &p0, TS_RUN, minclsyspri); /* * The sync thread can need a larger-than-default stack size on * 32-bit x86. This is due in part to nested pools and * scrub_visitbp() recursion. */ tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, dp, 0, &p0, TS_RUN, minclsyspri); mutex_exit(&tx->tx_sync_lock); } static void txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) { CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); mutex_enter(&tx->tx_sync_lock); } static void txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) { ASSERT(*tpp != NULL); *tpp = NULL; tx->tx_threads--; cv_broadcast(&tx->tx_exit_cv); CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ thread_exit(); } static void txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) { CALLB_CPR_SAFE_BEGIN(cpr); if (time) (void) cv_timedwait(cv, &tx->tx_sync_lock, time); else cv_wait(cv, &tx->tx_sync_lock); CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); } /* * Stop syncing transaction groups. */ void txg_sync_stop(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; dprintf("pool %p\n", dp); /* * Finish off any work in progress. */ ASSERT3U(tx->tx_threads, ==, 2); /* * We need to ensure that we've vacated the deferred space_maps. */ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); /* * Wake all sync threads and wait for them to die. 
*/ mutex_enter(&tx->tx_sync_lock); ASSERT3U(tx->tx_threads, ==, 2); tx->tx_exiting = 1; cv_broadcast(&tx->tx_quiesce_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); cv_broadcast(&tx->tx_sync_more_cv); while (tx->tx_threads != 0) cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); tx->tx_exiting = 0; mutex_exit(&tx->tx_sync_lock); } uint64_t txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) { tx_state_t *tx = &dp->dp_tx; tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; uint64_t txg; mutex_enter(&tc->tc_open_lock); txg = tx->tx_open_txg; mutex_enter(&tc->tc_lock); tc->tc_count[txg & TXG_MASK]++; mutex_exit(&tc->tc_lock); th->th_cpu = tc; th->th_txg = txg; return (txg); } void txg_rele_to_quiesce(txg_handle_t *th) { tx_cpu_t *tc = th->th_cpu; ASSERT(!MUTEX_HELD(&tc->tc_lock)); mutex_exit(&tc->tc_open_lock); } void txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) { tx_cpu_t *tc = th->th_cpu; int g = th->th_txg & TXG_MASK; mutex_enter(&tc->tc_lock); list_move_tail(&tc->tc_callbacks[g], tx_callbacks); mutex_exit(&tc->tc_lock); } void txg_rele_to_sync(txg_handle_t *th) { tx_cpu_t *tc = th->th_cpu; int g = th->th_txg & TXG_MASK; mutex_enter(&tc->tc_lock); ASSERT(tc->tc_count[g] != 0); if (--tc->tc_count[g] == 0) cv_broadcast(&tc->tc_cv[g]); mutex_exit(&tc->tc_lock); th->th_cpu = NULL; /* defensive */ } /* * Blocks until all transactions in the group are committed. * * On return, the transaction group has reached a stable state in which it can * then be passed off to the syncing context. */ static __noinline void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; int g = txg & TXG_MASK; int c; /* * Grab all tc_open_locks so nobody else can get into this txg. */ for (c = 0; c < max_ncpus; c++) mutex_enter(&tx->tx_cpu[c].tc_open_lock); ASSERT(txg == tx->tx_open_txg); tx->tx_open_txg++; tx->tx_open_time = gethrtime(); DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); /* * Now that we've incremented tx_open_txg, we can let threads * enter the next transaction group. */ for (c = 0; c < max_ncpus; c++) mutex_exit(&tx->tx_cpu[c].tc_open_lock); /* * Quiesce the transaction group by waiting for everyone to txg_exit(). */ for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; mutex_enter(&tc->tc_lock); while (tc->tc_count[g] != 0) cv_wait(&tc->tc_cv[g], &tc->tc_lock); mutex_exit(&tc->tc_lock); } } static void txg_do_callbacks(void *arg) { list_t *cb_list = arg; dmu_tx_do_callbacks(cb_list, 0); list_destroy(cb_list); kmem_free(cb_list, sizeof (list_t)); } /* * Dispatch the commit callbacks registered on this txg to worker threads. * * If no callbacks are registered for a given TXG, nothing happens. * This function creates a taskq for the associated pool, if needed. */ static void txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) { int c; tx_state_t *tx = &dp->dp_tx; list_t *cb_list; for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; /* * No need to lock tx_cpu_t at this point, since this can * only be called once a txg has been synced. */ int g = txg & TXG_MASK; if (list_is_empty(&tc->tc_callbacks[g])) continue; if (tx->tx_commit_cb_taskq == NULL) { /* * Commit callback taskq hasn't been created yet. 
*/ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, TASKQ_PREPOPULATE); } cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(cb_list, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); list_move_tail(cb_list, &tc->tc_callbacks[g]); (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) txg_do_callbacks, cb_list, TQ_SLEEP); } } +static boolean_t +txg_is_syncing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_syncing_txg != 0); +} + +static boolean_t +txg_is_quiescing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiescing_txg != 0); +} + +static boolean_t +txg_has_quiesced_to_sync(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiesced_txg != 0); +} + static void txg_sync_thread(void *arg) { dsl_pool_t *dp = arg; spa_t *spa = dp->dp_spa; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; uint64_t start, delta; txg_thread_enter(tx, &cpr); start = delta = 0; for (;;) { uint64_t timeout = zfs_txg_timeout * hz; uint64_t timer; uint64_t txg; /* * We sync when we're scanning, there's someone waiting * on us, or the quiesce thread has handed off a txg to * us, or we have reached our timeout. */ timer = (delta >= timeout ? 0 : timeout - delta); while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - tx->tx_quiesced_txg == 0 && + !txg_has_quiesced_to_sync(dp) && dp->dp_dirty_total < zfs_dirty_data_sync) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); delta = ddi_get_lbolt() - start; timer = (delta > timeout ? 0 : timeout - delta); } /* * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. */ - while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { + while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); } if (tx->tx_exiting) txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); /* * Consume the quiesced txg which has been handed off to * us. This may cause the quiescing thread to now be * able to quiesce another txg, so we must signal it. */ + ASSERT(tx->tx_quiesced_txg != 0); txg = tx->tx_quiesced_txg; tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_quiesce_more_cv); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; mutex_enter(&tx->tx_sync_lock); tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_done_cv); /* * Dispatch commit callbacks to worker threads. */ txg_dispatch_callbacks(dp, txg); } } static void txg_quiesce_thread(void *arg) { dsl_pool_t *dp = arg; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; txg_thread_enter(tx, &cpr); for (;;) { uint64_t txg; /* * We quiesce when there's someone waiting on us. * However, we can only have one txg in "quiescing" or * "quiesced, waiting to sync" state. 
So we wait until * the "quiesced, waiting to sync" txg has been consumed * by the sync thread. */ while (!tx->tx_exiting && (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || - tx->tx_quiesced_txg != 0)) + txg_has_quiesced_to_sync(dp))) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); if (tx->tx_exiting) txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); txg = tx->tx_open_txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + tx->tx_quiescing_txg = txg; + mutex_exit(&tx->tx_sync_lock); txg_quiesce(dp, txg); mutex_enter(&tx->tx_sync_lock); /* * Hand this txg off to the sync thread. */ dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiescing_txg = 0; tx->tx_quiesced_txg = txg; DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); } } /* * Delay this thread by delay nanoseconds if we are still in the open * transaction group and there is already a waiting txg quiesing or quiesced. * Abort the delay if this txg stalls or enters the quiesing state. */ void txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) { tx_state_t *tx = &dp->dp_tx; hrtime_t start = gethrtime(); /* don't delay if this txg could transition to quiescing immediately */ if (tx->tx_open_txg > txg || tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) return; mutex_enter(&tx->tx_sync_lock); if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { mutex_exit(&tx->tx_sync_lock); return; } while (gethrtime() - start < delay && tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, delay, resolution, 0); } mutex_exit(&tx->tx_sync_lock); } void txg_wait_synced(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); ASSERT3U(tx->tx_threads, ==, 2); if (txg == 0) txg = tx->tx_open_txg + TXG_DEFER_SIZE; if (tx->tx_sync_txg_waiting < txg) tx->tx_sync_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); while (tx->tx_synced_txg < txg) { dprintf("broadcasting sync more " "tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); cv_broadcast(&tx->tx_sync_more_cv); cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); } mutex_exit(&tx->tx_sync_lock); } void txg_wait_open(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); ASSERT3U(tx->tx_threads, ==, 2); if (txg == 0) txg = tx->tx_open_txg + 1; if (tx->tx_quiesce_txg_waiting < txg) tx->tx_quiesce_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); while (tx->tx_open_txg < txg) { cv_broadcast(&tx->tx_quiesce_more_cv); cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); } mutex_exit(&tx->tx_sync_lock); } /* * If there isn't a txg syncing or in the pipeline, push another txg through * the pipeline by queiscing the open txg. 
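 *
 * For reference, a minimal durability sketch from a consumer's point of
 * view, using txg_wait_synced() above ("os" and "tx" are hypothetical):
 *
 *	txg = dmu_tx_get_txg(tx);	(must be called while still assigned)
 *	dmu_tx_commit(tx);
 *	txg_wait_synced(dmu_objset_pool(os), txg);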
*/ void txg_kick(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); - if (tx->tx_syncing_txg == 0 && + if (!txg_is_syncing(dp) && + !txg_is_quiescing(dp) && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && tx->tx_sync_txg_waiting <= tx->tx_synced_txg && tx->tx_quiesced_txg <= tx->tx_synced_txg) { tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; cv_broadcast(&tx->tx_quiesce_more_cv); } mutex_exit(&tx->tx_sync_lock); } boolean_t txg_stalled(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); } boolean_t txg_sync_waiting(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || tx->tx_quiesced_txg != 0); } /* * Verify that this txg is active (open, quiescing, syncing). Non-active * txg's should not be manipulated. */ void txg_verify(spa_t *spa, uint64_t txg) { dsl_pool_t *dp = spa_get_dsl(spa); if (txg <= TXG_INITIAL || txg == ZILTEST_TXG) return; ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg); ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES); } /* * Per-txg object lists. */ void txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset) { int t; mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); tl->tl_offset = offset; tl->tl_spa = spa; for (t = 0; t < TXG_SIZE; t++) tl->tl_head[t] = NULL; } void txg_list_destroy(txg_list_t *tl) { int t; for (t = 0; t < TXG_SIZE; t++) ASSERT(txg_list_empty(tl, t)); mutex_destroy(&tl->tl_lock); } boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg) { txg_verify(tl->tl_spa, txg); return (tl->tl_head[txg & TXG_MASK] == NULL); } /* * Returns true if all txg lists are empty. * * Warning: this is inherently racy (an item could be added immediately * after this function returns). We don't bother with the lock because * it wouldn't change the semantics. */ boolean_t txg_all_lists_empty(txg_list_t *tl) { for (int i = 0; i < TXG_SIZE; i++) { if (!txg_list_empty(tl, i)) { return (B_FALSE); } } return (B_TRUE); } /* * Add an entry to the list (unless it's already on the list). * Returns B_TRUE if it was actually added. */ boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); boolean_t add; txg_verify(tl->tl_spa, txg); mutex_enter(&tl->tl_lock); add = (tn->tn_member[t] == 0); if (add) { tn->tn_member[t] = 1; tn->tn_next[t] = tl->tl_head[t]; tl->tl_head[t] = tn; } mutex_exit(&tl->tl_lock); return (add); } /* * Add an entry to the end of the list, unless it's already on the list. * (walks list to find end) * Returns B_TRUE if it was actually added. */ boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); boolean_t add; txg_verify(tl->tl_spa, txg); mutex_enter(&tl->tl_lock); add = (tn->tn_member[t] == 0); if (add) { txg_node_t **tp; for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) continue; tn->tn_member[t] = 1; tn->tn_next[t] = NULL; *tp = tn; } mutex_exit(&tl->tl_lock); return (add); } /* * Remove the head of the list and return it. 
*/ void * txg_list_remove(txg_list_t *tl, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn; void *p = NULL; txg_verify(tl->tl_spa, txg); mutex_enter(&tl->tl_lock); if ((tn = tl->tl_head[t]) != NULL) { ASSERT(tn->tn_member[t]); ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]); p = (char *)tn - tl->tl_offset; tl->tl_head[t] = tn->tn_next[t]; tn->tn_next[t] = NULL; tn->tn_member[t] = 0; } mutex_exit(&tl->tl_lock); return (p); } /* * Remove a specific item from the list and return it. */ void * txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn, **tp; txg_verify(tl->tl_spa, txg); mutex_enter(&tl->tl_lock); for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { if ((char *)tn - tl->tl_offset == p) { *tp = tn->tn_next[t]; tn->tn_next[t] = NULL; tn->tn_member[t] = 0; mutex_exit(&tl->tl_lock); return (p); } } mutex_exit(&tl->tl_lock); return (NULL); } boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); txg_verify(tl->tl_spa, txg); return (tn->tn_member[t] != 0); } /* * Walk a txg list -- only safe if you know it's not changing. */ void * txg_list_head(txg_list_t *tl, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = tl->tl_head[t]; txg_verify(tl->tl_spa, txg); return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); } void * txg_list_next(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); txg_verify(tl->tl_spa, txg); tn = tn->tn_next[t]; return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); } Index: stable/11 =================================================================== --- stable/11 (revision 339124) +++ stable/11 (revision 339125) Property changes on: stable/11 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r337172
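
Editor's note: the stand-alone sketch below models the pipeline state that this merge makes explicit; it is not part of the patch. The model_* names and the simplified struct are hypothetical stand-ins for tx_state_t, used only to illustrate, under those assumptions, why the reworked txg_kick() test must also refuse to kick while a transaction group is still quiescing.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for tx_state_t: one counter per pipeline stage. */
typedef struct model_tx_state {
	uint64_t open_txg;		/* txg currently accepting assignments */
	uint64_t quiescing_txg;		/* txg being quiesced; 0 if none (new state) */
	uint64_t quiesced_txg;		/* txg handed off to sync; 0 if none */
	uint64_t syncing_txg;		/* txg being written out; 0 if none */
	uint64_t synced_txg;		/* last txg fully on disk */
	uint64_t quiesce_txg_waiting;
	uint64_t sync_txg_waiting;
} model_tx_state_t;

/* Analogues of the predicates added by the patch. */
static bool
model_is_syncing(const model_tx_state_t *tx)
{
	return (tx->syncing_txg != 0);
}

static bool
model_is_quiescing(const model_tx_state_t *tx)
{
	return (tx->quiescing_txg != 0);
}

/* Mirror of the txg_kick() condition after the change. */
static bool
model_kick_would_fire(const model_tx_state_t *tx)
{
	return (!model_is_syncing(tx) &&
	    !model_is_quiescing(tx) &&
	    tx->quiesce_txg_waiting <= tx->open_txg &&
	    tx->sync_txg_waiting <= tx->synced_txg &&
	    tx->quiesced_txg <= tx->synced_txg);
}

int
main(void)
{
	/* txg 5 is open; txg 4 is still being quiesced; nothing is syncing. */
	model_tx_state_t tx = {
		.open_txg = 5,
		.quiescing_txg = 4,
		.quiesced_txg = 0,
		.syncing_txg = 0,
		.synced_txg = 3,
		.quiesce_txg_waiting = 5,
		.sync_txg_waiting = 3,
	};

	/*
	 * Without the quiescing check, the remaining conditions are all
	 * true here, so a kick would bump quiesce_txg_waiting and push
	 * txg 5 into the pipeline while txg 4 is still quiescing; with
	 * the extra check the kick is suppressed.
	 */
	printf("txg_kick would fire: %s\n",
	    model_kick_would_fire(&tx) ? "yes" : "no");
	return (0);
}

Compiled on its own, the model prints "no" for the state above, matching the intent of the merged change: the kick is deferred until the in-flight transaction group has moved on, instead of needlessly churning another one through the pipeline.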