Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 339124) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 339125) @@ -1,1337 +1,1342 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); tx->tx_dir = dd; if (dd != NULL) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); tx->tx_start = gethrtime(); return (tx); } dmu_tx_t * dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; return (tx); } dmu_tx_t * dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) { dmu_tx_t *tx = dmu_tx_create_dd(NULL); txg_verify(dp->dp_spa, txg); tx->tx_pool = dp; tx->tx_txg = txg; tx->tx_anyobj = TRUE; return (tx); } int dmu_tx_is_syncing(dmu_tx_t *tx) { return (tx->tx_anyobj); } int dmu_tx_private_ok(dmu_tx_t *tx) { return (tx->tx_anyobj); } static dmu_tx_hold_t * dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dmu_tx_hold_t *txh; if (dn != NULL) { (void) refcount_add(&dn->dn_holds, tx); if (tx->tx_txg != 0) { mutex_enter(&dn->dn_mtx); /* * dn->dn_assigned_txg == tx->tx_txg doesn't pose a * problem, but there's no way for it to happen (for * now, at least). 
*/ ASSERT(dn->dn_assigned_txg == 0); dn->dn_assigned_txg = tx->tx_txg; (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } } txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); txh->txh_tx = tx; txh->txh_dnode = dn; refcount_create(&txh->txh_space_towrite); refcount_create(&txh->txh_memory_tohold); txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; list_insert_tail(&tx->tx_holds, txh); return (txh); } static dmu_tx_hold_t * dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dnode_t *dn = NULL; dmu_tx_hold_t *txh; int err; if (object != DMU_NEW_OBJECT) { err = dnode_hold(os, object, FTAG, &dn); if (err != 0) { tx->tx_err = err; return (NULL); } } txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); if (dn != NULL) dnode_rele(dn, FTAG); return (txh); } void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) { /* * If we're syncing, they can manipulate any object anyhow, and * the hold on the dnode_t can cause problems. */ if (!dmu_tx_is_syncing(tx)) (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); } /* * This function reads specified data from disk. The specified data will * be needed to perform the transaction -- i.e, it will be read after * we do dmu_tx_assign(). There are two reasons that we read the data now * (before dmu_tx_assign()): * * 1. Reading it now has potentially better performance. The transaction * has not yet been assigned, so the TXG is not held open, and also the * caller typically has less locks held when calling dmu_tx_hold_*() than * after the transaction has been assigned. This reduces the lock (and txg) * hold times, thus reducing lock contention. * * 2. It is easier for callers (primarily the ZPL) to handle i/o errors * that are detected before they start making changes to the DMU state * (i.e. now). Once the transaction has been assigned, and some DMU * state has been changed, it can be difficult to recover from an i/o * error (e.g. to undo the changes already made in memory at the DMU * layer). Typically code to do so does not exist in the caller -- it * assumes that the data has already been cached and thus i/o errors are * not possible. * * It has been observed that the i/o initiated here can be a performance * problem, and it appears to be optional, because we don't look at the * data which is read. However, removing this read would only serve to * move the work elsewhere (after the dmu_tx_assign()), where it may * have a greater impact on performance (in addition to the impact on * fault tolerance noted above). */ static int dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) { int err; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold_level(dn, level, blkid, FTAG); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); dbuf_rele(db, FTAG); return (err); } /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; int err = 0; if (len == 0) return; (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) err = SET_ERROR(EFBIG); if (dn == NULL) return; /* * For i/o error checking, read the blocks that will be needed * to perform the write: the first and last level-0 blocks (if * they are not aligned, i.e. if they are partial-block writes), * and all the level-1 blocks. 
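 *
 * As an illustrative worked example (assuming a 128KB data block size
 * and an object that already has indirect blocks), a write of
 * len=300KB at off=64KB partially overwrites level-0 blocks 0 and 2,
 * so both of those boundary blocks are read here; level-0 block 1 is
 * replaced in full and is therefore not read.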
*/ if (dn->dn_maxblkid == 0) { if (off < dn->dn_datablksz && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { txh->txh_tx->tx_err = err; } } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ uint64_t start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err != 0) { txh->txh_tx->tx_err = err; } } /* last level-0 block */ uint64_t end = (off + len - 1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off + len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err != 0) { txh->txh_tx->tx_err = err; } } /* level-1 blocks */ if (dn->dn_nlevels > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = (start >> shft) + 1; i < end >> shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { txh->txh_tx->tx_err = err; } } } err = zio_wait(zio); if (err != 0) { txh->txh_tx->tx_err = err; } } } static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { (void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG); } void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, off, len); if (txh != NULL) { dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } } void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, 0, 0); if (txh == NULL) return; dnode_t *dn = txh->txh_dnode; (void) refcount_add_many(&txh->txh_space_towrite, 1ULL << dn->dn_indblkshift, FTAG); dmu_tx_count_dnode(txh); } void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); if (txh != NULL) { dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } } /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and * this transaction will be able to use half of the pool space overhead * (see dsl_pool_adjustedsize()). Therefore this function should only * be called for transactions that we expect will not cause a net increase * in the amount of space used (but it's OK if that is occasionally not true). */ void dmu_tx_mark_netfree(dmu_tx_t *tx) { tx->tx_netfree = B_TRUE; } static void dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dmu_tx_t *tx; dnode_t *dn; int err; tx = txh->txh_tx; ASSERT(tx->tx_txg == 0); dn = txh->txh_dnode; dmu_tx_count_dnode(txh); if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; /* * For i/o error checking, we read the first and last level-0 * blocks if they are not aligned, and all the level-1 blocks. * * Note: dbuf_free_range() assumes that we have not instantiated * any level-0 dbufs that will be completely freed. Therefore we must * exercise care to not read or count the first and last blocks * if they are blocksize-aligned. 
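 *
 * As a worked example (again assuming 128KB data blocks), freeing
 * len=256KB at off=128KB is blocksize-aligned at both ends, so neither
 * boundary block is read or counted here; freeing len=200KB at
 * off=100KB reads and counts the level-0 blocks containing offsets
 * 100KB and 300KB, since both boundaries fall mid-block.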
*/ if (dn->dn_datablkshift == 0) { if (off != 0 || len < dn->dn_datablksz) dmu_tx_count_write(txh, 0, dn->dn_datablksz); } else { /* first block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off, 1); /* last block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off + len, 1); } /* * Check level-1 blocks. */ if (dn->dn_nlevels > 1) { int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t start = off >> shift; uint64_t end = (off + len) >> shift; ASSERT(dn->dn_indblkshift != 0); /* * dnode_reallocate() can result in an object with indirect * blocks having an odd data block size. In this case, * just check the single block. */ if (dn->dn_datablkshift == 0) start = end = 0; zio_t *zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (uint64_t i = start; i <= end; i++) { uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH || i > end) break; if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } (void) refcount_add_many(&txh->txh_memory_tohold, 1 << dn->dn_indblkshift, FTAG); err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } } err = zio_wait(zio); if (err != 0) { tx->tx_err = err; return; } } } void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, off, len); if (txh != NULL) (void) dmu_tx_hold_free_impl(txh, off, len); } void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); if (txh != NULL) (void) dmu_tx_hold_free_impl(txh, off, len); } static void dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name) { dmu_tx_t *tx = txh->txh_tx; dnode_t *dn; int err; ASSERT(tx->tx_txg == 0); dn = txh->txh_dnode; dmu_tx_count_dnode(txh); /* * Modifying a almost-full microzap is around the worst case (128KB) * * If it is a fat zap, the worst case would be 7*16KB=112KB: * - 3 blocks overwritten: target leaf, ptrtbl block, header block * - 4 new blocks written if adding: * - 2 blocks for possibly split leaves, * - 2 grown ptrtbl blocks */ (void) refcount_add_many(&txh->txh_space_towrite, MZAP_MAX_BLKSZ, FTAG); if (dn == NULL) return; ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 || name == NULL) { /* * This is a microzap (only one block), or we don't know * the name. Check the first block for i/o errors. */ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { tx->tx_err = err; } } else { /* * Access the name so that we'll check for i/o errors to * the leaf blocks, etc. We ignore ENOENT, as this name * may not yet exist. 
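 *
 * A minimal caller-side sketch (the object number, name and value here
 * are hypothetical, not taken from this change) of holding a zap for a
 * single-entry add and performing it once the tx is assigned:
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_zap(tx, dir_obj, B_TRUE, "newname");
 *	error = dmu_tx_assign(tx, TXG_WAIT);
 *	if (error != 0) {
 *		dmu_tx_abort(tx);
 *		return (error);
 *	}
 *	error = zap_add(os, dir_obj, "newname", 8, 1, &value, tx);
 *	dmu_tx_commit(tx);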
*/ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); if (err == EIO || err == ECKSUM || err == ENXIO) { tx->tx_err = err; } } } void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP, add, (uintptr_t)name); if (txh != NULL) dmu_tx_hold_zap_impl(txh, name); } void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT(dn != NULL); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); if (txh != NULL) dmu_tx_hold_zap_impl(txh, name); } void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); } #ifdef ZFS_DEBUG void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { boolean_t match_object = B_FALSE; boolean_t match_offset = B_FALSE; DB_DNODE_ENTER(db); dnode_t *dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); if (tx->tx_anyobj) { DB_DNODE_EXIT(db); return; } /* XXX No checking on the meta dnode for now */ if (db->db.db_object == DMU_META_DNODE_OBJECT) { DB_DNODE_EXIT(db); return; } for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) match_object = TRUE; if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { int datablkshift = dn->dn_datablkshift ? dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; int shift = datablkshift + epbs * db->db_level; uint64_t beginblk = shift >= 64 ? 0 : (txh->txh_arg1 >> shift); uint64_t endblk = shift >= 64 ? 0 : ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); uint64_t blkid = db->db_blkid; /* XXX txh_arg2 better not be zero... */ dprintf("found txh type %x beginblk=%llx endblk=%llx\n", txh->txh_type, beginblk, endblk); switch (txh->txh_type) { case THT_WRITE: if (blkid >= beginblk && blkid <= endblk) match_offset = TRUE; /* * We will let this hold work for the bonus * or spill buffer so that we don't need to * hold it when creating a new object. */ if (blkid == DMU_BONUS_BLKID || blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, * thus dirtying the new TLIBs. Or the * might have to change the block size, * thus dirying the new lvl=0 blk=0. */ if (blkid == 0) match_offset = TRUE; break; case THT_FREE: /* * We will dirty all the level 1 blocks in * the free range and perhaps the first and * last level 0 block. 
*/ if (blkid >= beginblk && (blkid <= endblk || txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_SPILL: if (blkid == DMU_SPILL_BLKID) match_offset = TRUE; break; case THT_BONUS: if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: match_offset = TRUE; break; case THT_NEWOBJECT: match_object = TRUE; break; default: ASSERT(!"bad txh_type"); } } if (match_object && match_offset) { DB_DNODE_EXIT(db); return; } } DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); } #endif /* * If we can't do 10 iops, something is wrong. Let us go ahead * and hit zfs_dirty_data_max. */ hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ /* * We delay transactions when we've determined that the backend storage * isn't able to accommodate the rate of incoming writes. * * If there is already a transaction waiting, we delay relative to when * that transaction finishes waiting. This way the calculated min_time * is independent of the number of threads concurrently executing * transactions. * * If we are the only waiter, wait relative to when the transaction * started, rather than the current time. This credits the transaction for * "time already served", e.g. reading indirect blocks. * * The minimum time for a transaction to take is calculated as: * min_time = scale * (dirty - min) / (max - dirty) * min_time is then capped at zfs_delay_max_ns. * * The delay has two degrees of freedom that can be adjusted via tunables. * The percentage of dirty data at which we start to delay is defined by * zfs_delay_min_dirty_percent. This should typically be at or above * zfs_vdev_async_write_active_max_dirty_percent so that we only start to * delay after writing at full speed has failed to keep up with the incoming * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly * speaking, this variable determines the amount of delay at the midpoint of * the curve. * * delay * 10ms +-------------------------------------------------------------*+ * | *| * 9ms + *+ * | *| * 8ms + *+ * | * | * 7ms + * + * | * | * 6ms + * + * | * | * 5ms + * + * | * | * 4ms + * + * | * | * 3ms + * + * | * | * 2ms + (midpoint) * + * | | ** | * 1ms + v *** + * | zfs_delay_scale ----------> ******** | * 0 +-------------------------------------*********----------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note that since the delay is added to the outstanding time remaining on the * most recent transaction, the delay is effectively the inverse of IOPS. * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve * was chosen such that small changes in the amount of accumulated dirty data * in the first 3/4 of the curve yield relatively small differences in the * amount of delay. * * The effects can be easier to understand when the amount of delay is * represented on a log scale: * * delay * 100ms +-------------------------------------------------------------++ * + + * | | * + *+ * 10ms + *+ * + ** + * | (midpoint) ** | * + | ** + * 1ms + v **** + * + zfs_delay_scale ----------> ***** + * | **** | * + **** + * 100us + ** + * + * + * | * | * + * + * 10us + * + * + + * | | * + + * +--------------------------------------------------------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note here that only as the amount of dirty data approaches its limit does * the delay start to increase rapidly. 
The goal of a properly tuned system * should be to keep the amount of dirty data out of that range by first * ensuring that the appropriate limits are set for the I/O scheduler to reach * optimal throughput on the backend storage, and then by changing the value * of zfs_delay_scale to increase the steepness of the curve. */ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; hrtime_t wakeup, min_tx_time, now; if (dirty <= delay_min_bytes) return; /* * The caller has already waited until we are under the max. * We make them pass us the amount of dirty data so we don't * have to handle the case of it being >= the max, which could * cause a divide-by-zero if it's == the max. */ ASSERT3U(dirty, <, zfs_dirty_data_max); now = gethrtime(); min_tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); if (now > tx->tx_start + min_tx_time) return; min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, uint64_t, min_tx_time); mutex_enter(&dp->dp_lock); wakeup = MAX(tx->tx_start + min_tx_time, dp->dp_last_wakeup + min_tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); #ifdef _KERNEL #ifdef illumos mutex_enter(&curthread->t_delay_lock); while (cv_timedwait_hires(&curthread->t_delay_cv, &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) continue; mutex_exit(&curthread->t_delay_lock); #else pause_sbt("dmu_tx_delay", nstosbt(wakeup), nstosbt(zfs_delay_resolution_ns), C_ABSOLUTE); #endif #else hrtime_t delta = wakeup - gethrtime(); struct timespec ts; ts.tv_sec = delta / NANOSEC; ts.tv_nsec = delta % NANOSEC; (void) nanosleep(&ts, NULL); #endif } /* * This routine attempts to assign the transaction to a transaction group. * To do so, we must determine if there is sufficient free space on disk. * * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() * on it), then it is assumed that there is sufficient free space, * unless there's insufficient slop space in the pool (see the comment * above spa_slop_shift in spa_misc.c). * * If it is not a "netfree" transaction, then if the data already on disk * is over the allowed usage (e.g. quota), this will fail with EDQUOT or * ENOSPC. Otherwise, if the current rough estimate of pending changes, * plus the rough estimate of this transaction's changes, may exceed the * allowed usage, then this will fail with ERESTART, which will cause the * caller to wait for the pending changes to be written to disk (by waiting * for the next TXG to open), and then check the space usage again. * * The rough estimate of pending changes is comprised of the sum of: * * - this transaction's holds' txh_space_towrite * * - dd_tempreserved[], which is the sum of in-flight transactions' * holds' txh_space_towrite (i.e. those transactions that have called * dmu_tx_assign() but not yet called dmu_tx_commit()). * * - dd_space_towrite[], which is the amount of dirtied dbufs. * * Note that all of these values are inflated by spa_get_worst_case_asize(), * which means that we may get ERESTART well before we are actually in danger * of running out of space, but this also mitigates any small inaccuracies * in the rough estimate (e.g. txh_space_towrite doesn't take into account * indirect blocks, and dd_space_towrite[] doesn't take into account changes * to the MOS). 
* * Note that due to this algorithm, it is possible to exceed the allowed * usage by one transaction. Also, as we approach the allowed usage, * we will allow a very limited amount of changes into each TXG, thus * decreasing performance. */ static int dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) { spa_t *spa = tx->tx_pool->dp_spa; ASSERT0(tx->tx_txg); if (tx->tx_err) return (tx->tx_err); if (spa_suspended(spa)) { /* * If the user has indicated a blocking failure mode * then return ERESTART which will block in dmu_tx_wait(). * Otherwise, return EIO so that an error can get * propagated back to the VOP calls. * * Note that we always honor the txg_how flag regardless * of the failuremode setting. */ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && !(txg_how & TXG_WAIT)) return (SET_ERROR(EIO)); return (SET_ERROR(ERESTART)); } if (!tx->tx_dirty_delayed && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; return (SET_ERROR(ERESTART)); } tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; /* * NB: No error returns are allowed after txg_hold_open, but * before processing the dnode holds, due to the * dmu_tx_unassign() logic. */ uint64_t towrite = 0; uint64_t tohold = 0; for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn != NULL) { mutex_enter(&dn->dn_mtx); if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = txh; return (SET_ERROR(ERESTART)); } if (dn->dn_assigned_txg == 0) dn->dn_assigned_txg = tx->tx_txg; ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } towrite += refcount_count(&txh->txh_space_towrite); tohold += refcount_count(&txh->txh_memory_tohold); } /* needed allocation: worst-case estimate of write space */ uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); /* calculate memory footprint estimate */ uint64_t memory = towrite + tohold; if (tx->tx_dir != NULL && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); if (err != 0) return (err); } return (0); } static void dmu_tx_unassign(dmu_tx_t *tx) { if (tx->tx_txg == 0) return; txg_rele_to_quiesce(&tx->tx_txgh); /* * Walk the transaction's hold list, removing the hold on the * associated dnode, and notifying waiters if the refcount drops to 0. */ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } txg_rele_to_sync(&tx->tx_txgh); tx->tx_lasttried_txg = tx->tx_txg; tx->tx_txg = 0; } /* * Assign tx to a transaction group; txg_how is a bitmask: * * If TXG_WAIT is set and the currently open txg is full, this function * will wait until there's a new txg. This should be used when no locks * are being held. With this bit set, this function will only fail if * we're truly out of space (or over quota). * * If TXG_WAIT is *not* set and we can't assign into the currently open * txg without blocking, this function will return immediately with * ERESTART. This should be used whenever locks are being held. 
On an * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), * and try again. * * If TXG_NOTHROTTLE is set, this indicates that this tx should not be * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for * details on the throttle). This is used by the VFS operations, after * they have already called dmu_tx_wait() (though most likely on a * different tx). */ int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) { int err; ASSERT(tx->tx_txg == 0); ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); /* If we might wait, we must not hold the config lock. */ IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool)); if ((txg_how & TXG_NOTHROTTLE)) tx->tx_dirty_delayed = B_TRUE; while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); if (err != ERESTART || !(txg_how & TXG_WAIT)) return (err); dmu_tx_wait(tx); } txg_rele_to_quiesce(&tx->tx_txgh); return (0); } void dmu_tx_wait(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; dsl_pool_t *dp = tx->tx_pool; ASSERT(tx->tx_txg == 0); ASSERT(!dsl_pool_config_held(tx->tx_pool)); if (tx->tx_wait_dirty) { /* * dmu_tx_try_assign() has determined that we need to wait * because we've consumed much or all of the dirty buffer * space. */ mutex_enter(&dp->dp_lock); while (dp->dp_dirty_total >= zfs_dirty_data_max) cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); uint64_t dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); dmu_tx_delay(tx, dirty); tx->tx_wait_dirty = B_FALSE; /* * Note: setting tx_dirty_delayed only has effect if the * caller used TX_WAIT. Otherwise they are going to * destroy this tx and try again. The common case, * zfs_write(), uses TX_WAIT. */ tx->tx_dirty_delayed = B_TRUE; } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { /* * If the pool is suspended we need to wait until it * is resumed. Note that it's possible that the pool * has become active after this thread has tried to * obtain a tx. If that's the case then tx_lasttried_txg * would not have been set. */ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } else if (tx->tx_needassign_txh) { /* * A dnode is assigned to the quiescing txg. Wait for its * transaction to complete. */ dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) cv_wait(&dn->dn_notxholds, &dn->dn_mtx); mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = NULL; } else { - txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); + /* + * If we have a lot of dirty data just wait until we sync + * out a TXG at which point we'll hopefully have synced + * a portion of the changes. + */ + txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } } static void dmu_tx_destroy(dmu_tx_t *tx) { dmu_tx_hold_t *txh; while ((txh = list_head(&tx->tx_holds)) != NULL) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); refcount_destroy_many(&txh->txh_space_towrite, refcount_count(&txh->txh_space_towrite)); refcount_destroy_many(&txh->txh_memory_tohold, refcount_count(&txh->txh_memory_tohold)); kmem_free(txh, sizeof (dmu_tx_hold_t)); if (dn != NULL) dnode_rele(dn, tx); } list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); kmem_free(tx, sizeof (dmu_tx_t)); } void dmu_tx_commit(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); /* * Go through the transaction's hold list and remove holds on * associated dnodes, notifying waiters if no holds remain. 
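 *
 * For reference, a hedged sketch of the caller-side retry pattern
 * described for dmu_tx_assign() above when TXG_WAIT is not used
 * (the caller holds its own locks; identifiers other than the DMU
 * calls are hypothetical):
 *
 *	top:
 *		tx = dmu_tx_create(os);
 *		dmu_tx_hold_write(tx, object, off, len);
 *		error = dmu_tx_assign(tx, 0);
 *		if (error == ERESTART) {
 *			(drop the caller's own locks here)
 *			dmu_tx_wait(tx);
 *			dmu_tx_abort(tx);
 *			goto top;
 *		} else if (error != 0) {
 *			dmu_tx_abort(tx);
 *			return (error);
 *		}
 *		dmu_write(os, object, off, len, buf, tx);
 *		dmu_tx_commit(tx);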
*/ for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); if (!list_is_empty(&tx->tx_callbacks)) txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); dmu_tx_destroy(tx); } void dmu_tx_abort(dmu_tx_t *tx) { ASSERT(tx->tx_txg == 0); /* * Call any registered callbacks with an error code. */ if (!list_is_empty(&tx->tx_callbacks)) dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); dmu_tx_destroy(tx); } uint64_t dmu_tx_get_txg(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } dsl_pool_t * dmu_tx_pool(dmu_tx_t *tx) { ASSERT(tx->tx_pool != NULL); return (tx->tx_pool); } void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { dmu_tx_callback_t *dcb; dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); dcb->dcb_func = func; dcb->dcb_data = data; list_insert_tail(&tx->tx_callbacks, dcb); } /* * Call all the commit callbacks on a list, with a given error code. */ void dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; while ((dcb = list_head(cb_list)) != NULL) { list_remove(cb_list, dcb); dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } } /* * Interface to hold a bunch of attributes. * used for creating new files. * attrsize is the total size of all attributes * to be added during object creation * * For updating/adding a single attribute dmu_tx_hold_sa() should be used. */ /* * hold necessary attribute name for attribute registration. * should be a very rare case where this is needed. If it does * happen it would only happen on the first write to the file system. 
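 *
 * Stepping back to the commit callback interface above, a minimal
 * sketch of registering one (the callback and its argument are
 * hypothetical); the function runs after the assigned txg has synced,
 * with error 0, or with ECANCELED if the tx is aborted instead:
 *
 *	static void
 *	my_commit_cb(void *arg, int error)
 *	{
 *		(release or log "arg" here depending on "error")
 *	}
 *
 *	dmu_tx_callback_register(tx, my_commit_cb, arg);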
*/ static void dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) { if (!sa->sa_need_attr_registration) return; for (int i = 0; i != sa->sa_num_attrs; i++) { if (!sa->sa_attr_table[i].sa_registered) { if (sa->sa_reg_attr_obj) dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, B_TRUE, sa->sa_attr_table[i].sa_name); else dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, sa->sa_attr_table[i].sa_name); } } } void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_SPILL, 0, 0); (void) refcount_add_many(&txh->txh_space_towrite, SPA_OLD_MAXBLOCKSIZE, FTAG); } void dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) { sa_os_t *sa = tx->tx_objset->os_sa; dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_layout_attr_obj) { dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); } else { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) return; (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPILL, 0, 0); } /* * Hold SA attribute * * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) * * variable_size is the total size of all variable sized attributes * passed to this function. It is not the total size of all * variable size attributes that *may* exist on this object. */ void dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) { uint64_t object; sa_os_t *sa = tx->tx_objset->os_sa; ASSERT(hdl != NULL); object = sa_handle_object(hdl); dmu_tx_hold_bonus(tx, object); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); if (sa->sa_force_spill || may_grow || hdl->sa_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_have_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } DB_DNODE_EXIT(db); } } Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h (revision 339124) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/txg_impl.h (revision 339125) @@ -1,124 +1,125 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. 
* * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. */ #ifndef _SYS_TXG_IMPL_H #define _SYS_TXG_IMPL_H #include #include #ifdef __cplusplus extern "C" { #endif /* * The tx_cpu structure is a per-cpu structure that is used to track * the number of active transaction holds (tc_count). As transactions * are assigned into a transaction group the appropriate tc_count is * incremented to indicate that there are pending changes that have yet * to quiesce. Consumers evenutally call txg_rele_to_sync() to decrement * the tc_count. A transaction group is not considered quiesced until all * tx_cpu structures have reached a tc_count of zero. * * This structure is a per-cpu structure by design. Updates to this structure * are frequent and concurrent. Having a single structure would result in * heavy lock contention so a per-cpu design was implemented. With the fanned * out mutex design, consumers only need to lock the mutex associated with * thread's cpu. * * The tx_cpu contains two locks, the tc_lock and tc_open_lock. * The tc_lock is used to protect all members of the tx_cpu structure with * the exception of the tc_open_lock. This lock should only be held for a * short period of time, typically when updating the value of tc_count. * * The tc_open_lock protects the tx_open_txg member of the tx_state structure. * This lock is used to ensure that transactions are only assigned into * the current open transaction group. In order to move the current open * transaction group to the quiesce phase, the txg_quiesce thread must * grab all tc_open_locks, increment the tx_open_txg, and drop the locks. * The tc_open_lock is held until the transaction is assigned into the * transaction group. Typically, this is a short operation but if throttling * is occuring it may be held for longer periods of time. */ struct tx_cpu { kmutex_t tc_open_lock; /* protects tx_open_txg */ kmutex_t tc_lock; /* protects the rest of this struct */ kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ char tc_pad[8]; /* pad to fill 3 cache lines */ }; /* * The tx_state structure maintains the state information about the different * stages of the pool's transcation groups. A per pool tx_state structure * is used to track this information. The tx_state structure also points to * an array of tx_cpu structures (described above). Although the tx_sync_lock * is used to protect the members of this structure, it is not used to * protect the tx_open_txg. Instead a special lock in the tx_cpu structure * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock. * Any thread wishing to update tx_open_txg must grab the tc_open_lock on * every cpu (see txg_quiesce()). 
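 *
 * A hedged sketch of the hold protocol as a transaction sees it
 * (mirroring dmu_tx_assign() and dmu_tx_commit() in dmu_tx.c):
 *
 *	txg = txg_hold_open(dp, &th);	(tc_open_lock taken, tc_count bumped)
 *	txg_rele_to_quiesce(&th);	(tc_open_lock dropped, hold remains)
 *	... caller dirties in-memory state for txg ...
 *	txg_rele_to_sync(&th);		(tc_count dropped, quiesce may proceed)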
*/ typedef struct tx_state { tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */ kmutex_t tx_sync_lock; /* protects the rest of this struct */ uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiescing_txg; /* currently quiescing txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ uint64_t tx_synced_txg; /* last synced txg id */ hrtime_t tx_open_time; /* start time of tx_open_txg */ uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ kcondvar_t tx_sync_more_cv; kcondvar_t tx_sync_done_cv; kcondvar_t tx_quiesce_more_cv; kcondvar_t tx_quiesce_done_cv; kcondvar_t tx_timeout_cv; kcondvar_t tx_exit_cv; /* wait for all threads to exit */ uint8_t tx_threads; /* number of threads */ uint8_t tx_exiting; /* set when we're exiting */ kthread_t *tx_sync_thread; kthread_t *tx_quiesce_thread; taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ } tx_state_t; #ifdef __cplusplus } #endif #endif /* _SYS_TXG_IMPL_H */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c (revision 339124) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c (revision 339125) @@ -1,903 +1,932 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright (c) 2012, 2017 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include /* * ZFS Transaction Groups * ---------------------- * * ZFS transaction groups are, as the name implies, groups of transactions * that act on persistent state. ZFS asserts consistency at the granularity of * these transaction groups. Each successive transaction group (txg) is * assigned a 64-bit consecutive identifier. There are three active * transaction group states: open, quiescing, or syncing. At any given time, * there may be an active txg associated with each state; each active txg may * either be processing, or blocked waiting to enter the next state. There may * be up to three active txgs, and there is always a txg in the open state * (though it may be blocked waiting to enter the quiescing state). In broad * strokes, transactions -- operations that change in-memory structures -- are * accepted into the txg in the open state, and are completed while the txg is * in the open or quiescing states. The accumulated changes are written to * disk in the syncing state. * * Open * * When a new txg becomes active, it first enters the open state. 
New * transactions -- updates to in-memory structures -- are assigned to the * currently open txg. There is always a txg in the open state so that ZFS can * accept new changes (though the txg may refuse new changes if it has hit * some limit). ZFS advances the open txg to the next state for a variety of * reasons such as it hitting a time or size threshold, or the execution of an * administrative action that must be completed in the syncing state. * * Quiescing * * After a txg exits the open state, it enters the quiescing state. The * quiescing state is intended to provide a buffer between accepting new * transactions in the open state and writing them out to stable storage in * the syncing state. While quiescing, transactions can continue their * operation without delaying either of the other states. Typically, a txg is * in the quiescing state very briefly since the operations are bounded by * software latencies rather than, say, slower I/O latencies. After all * transactions complete, the txg is ready to enter the next state. * * Syncing * * In the syncing state, the in-memory state built up during the open and (to * a lesser degree) the quiescing states is written to stable storage. The * process of writing out modified data can, in turn modify more data. For * example when we write new blocks, we need to allocate space for them; those * allocations modify metadata (space maps)... which themselves must be * written to stable storage. During the sync state, ZFS iterates, writing out * data until it converges and all in-memory changes have been written out. * The first such pass is the largest as it encompasses all the modified user * data (as opposed to filesystem metadata). Subsequent passes typically have * far less data to write as they consist exclusively of filesystem metadata. * * To ensure convergence, after a certain number of passes ZFS begins * overwriting locations on stable storage that had been allocated earlier in * the syncing state (and subsequently freed). ZFS usually allocates new * blocks to optimize for large, continuous, writes. For the syncing state to * converge however it must complete a pass where no new blocks are allocated * since each allocation requires a modification of persistent metadata. * Further, to hasten convergence, after a prescribed number of passes, ZFS * also defers frees, and stops compressing. * * In addition to writing out user data, we must also execute synctasks during * the syncing context. A synctask is the mechanism by which some * administrative activities work such as creating and destroying snapshots or * datasets. Note that when a synctask is initiated it enters the open txg, * and ZFS then pushes that txg as quickly as possible to completion of the * syncing state in order to reduce the latency of the administrative * activity. To complete the syncing state, ZFS writes out a new uberblock, * the root of the tree of blocks that comprise all state stored on the ZFS * pool. Finally, if there is a quiesced txg waiting, we signal that it can * now transition to the syncing state. */ static void txg_sync_thread(void *arg); static void txg_quiesce_thread(void *arg); int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS TXG"); SYSCTL_INT(_vfs_zfs_txg, OID_AUTO, timeout, CTLFLAG_RWTUN, &zfs_txg_timeout, 0, "Maximum seconds worth of delta per txg"); /* * Prepare the txg subsystem. 
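 *
 * With the tx_quiescing_txg tracking field (see txg_impl.h above), the
 * handoff between the quiesce and sync threads below proceeds roughly
 * as:
 *
 *	tx_open_txg advances (txg_quiesce() bumps it under the tc_open_locks)
 *	tx_quiescing_txg = txg    while the quiesce thread drains holds
 *	tx_quiescing_txg = 0; tx_quiesced_txg = txg    once fully quiesced
 *	tx_syncing_txg = txg      while spa_sync() runs
 *	tx_synced_txg = txg       when the txg is on stable storage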
*/ void txg_init(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; int c; bzero(tx, sizeof (tx_state_t)); tx->tx_cpu = kmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); for (c = 0; c < max_ncpus; c++) { int i; mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT, NULL); for (i = 0; i < TXG_SIZE; i++) { cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); list_create(&tx->tx_cpu[c].tc_callbacks[i], sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); } } mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); tx->tx_open_txg = txg; } /* * Close down the txg subsystem. */ void txg_fini(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; int c; ASSERT0(tx->tx_threads); mutex_destroy(&tx->tx_sync_lock); cv_destroy(&tx->tx_sync_more_cv); cv_destroy(&tx->tx_sync_done_cv); cv_destroy(&tx->tx_quiesce_more_cv); cv_destroy(&tx->tx_quiesce_done_cv); cv_destroy(&tx->tx_exit_cv); for (c = 0; c < max_ncpus; c++) { int i; mutex_destroy(&tx->tx_cpu[c].tc_open_lock); mutex_destroy(&tx->tx_cpu[c].tc_lock); for (i = 0; i < TXG_SIZE; i++) { cv_destroy(&tx->tx_cpu[c].tc_cv[i]); list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); } } if (tx->tx_commit_cb_taskq != NULL) taskq_destroy(tx->tx_commit_cb_taskq); kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); bzero(tx, sizeof (tx_state_t)); } /* * Start syncing transaction groups. */ void txg_sync_start(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; mutex_enter(&tx->tx_sync_lock); dprintf("pool %p\n", dp); ASSERT0(tx->tx_threads); tx->tx_threads = 2; tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, dp, 0, &p0, TS_RUN, minclsyspri); /* * The sync thread can need a larger-than-default stack size on * 32-bit x86. This is due in part to nested pools and * scrub_visitbp() recursion. */ tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, dp, 0, &p0, TS_RUN, minclsyspri); mutex_exit(&tx->tx_sync_lock); } static void txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) { CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); mutex_enter(&tx->tx_sync_lock); } static void txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) { ASSERT(*tpp != NULL); *tpp = NULL; tx->tx_threads--; cv_broadcast(&tx->tx_exit_cv); CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ thread_exit(); } static void txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) { CALLB_CPR_SAFE_BEGIN(cpr); if (time) (void) cv_timedwait(cv, &tx->tx_sync_lock, time); else cv_wait(cv, &tx->tx_sync_lock); CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); } /* * Stop syncing transaction groups. */ void txg_sync_stop(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; dprintf("pool %p\n", dp); /* * Finish off any work in progress. */ ASSERT3U(tx->tx_threads, ==, 2); /* * We need to ensure that we've vacated the deferred space_maps. */ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); /* * Wake all sync threads and wait for them to die. 
*/ mutex_enter(&tx->tx_sync_lock); ASSERT3U(tx->tx_threads, ==, 2); tx->tx_exiting = 1; cv_broadcast(&tx->tx_quiesce_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); cv_broadcast(&tx->tx_sync_more_cv); while (tx->tx_threads != 0) cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); tx->tx_exiting = 0; mutex_exit(&tx->tx_sync_lock); } uint64_t txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) { tx_state_t *tx = &dp->dp_tx; tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID]; uint64_t txg; mutex_enter(&tc->tc_open_lock); txg = tx->tx_open_txg; mutex_enter(&tc->tc_lock); tc->tc_count[txg & TXG_MASK]++; mutex_exit(&tc->tc_lock); th->th_cpu = tc; th->th_txg = txg; return (txg); } void txg_rele_to_quiesce(txg_handle_t *th) { tx_cpu_t *tc = th->th_cpu; ASSERT(!MUTEX_HELD(&tc->tc_lock)); mutex_exit(&tc->tc_open_lock); } void txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) { tx_cpu_t *tc = th->th_cpu; int g = th->th_txg & TXG_MASK; mutex_enter(&tc->tc_lock); list_move_tail(&tc->tc_callbacks[g], tx_callbacks); mutex_exit(&tc->tc_lock); } void txg_rele_to_sync(txg_handle_t *th) { tx_cpu_t *tc = th->th_cpu; int g = th->th_txg & TXG_MASK; mutex_enter(&tc->tc_lock); ASSERT(tc->tc_count[g] != 0); if (--tc->tc_count[g] == 0) cv_broadcast(&tc->tc_cv[g]); mutex_exit(&tc->tc_lock); th->th_cpu = NULL; /* defensive */ } /* * Blocks until all transactions in the group are committed. * * On return, the transaction group has reached a stable state in which it can * then be passed off to the syncing context. */ static __noinline void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; int g = txg & TXG_MASK; int c; /* * Grab all tc_open_locks so nobody else can get into this txg. */ for (c = 0; c < max_ncpus; c++) mutex_enter(&tx->tx_cpu[c].tc_open_lock); ASSERT(txg == tx->tx_open_txg); tx->tx_open_txg++; tx->tx_open_time = gethrtime(); DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); /* * Now that we've incremented tx_open_txg, we can let threads * enter the next transaction group. */ for (c = 0; c < max_ncpus; c++) mutex_exit(&tx->tx_cpu[c].tc_open_lock); /* * Quiesce the transaction group by waiting for everyone to txg_exit(). */ for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; mutex_enter(&tc->tc_lock); while (tc->tc_count[g] != 0) cv_wait(&tc->tc_cv[g], &tc->tc_lock); mutex_exit(&tc->tc_lock); } } static void txg_do_callbacks(void *arg) { list_t *cb_list = arg; dmu_tx_do_callbacks(cb_list, 0); list_destroy(cb_list); kmem_free(cb_list, sizeof (list_t)); } /* * Dispatch the commit callbacks registered on this txg to worker threads. * * If no callbacks are registered for a given TXG, nothing happens. * This function creates a taskq for the associated pool, if needed. */ static void txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) { int c; tx_state_t *tx = &dp->dp_tx; list_t *cb_list; for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; /* * No need to lock tx_cpu_t at this point, since this can * only be called once a txg has been synced. */ int g = txg & TXG_MASK; if (list_is_empty(&tc->tc_callbacks[g])) continue; if (tx->tx_commit_cb_taskq == NULL) { /* * Commit callback taskq hasn't been created yet. 
*/ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, TASKQ_PREPOPULATE); } cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(cb_list, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); list_move_tail(cb_list, &tc->tc_callbacks[g]); (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) txg_do_callbacks, cb_list, TQ_SLEEP); } } +static boolean_t +txg_is_syncing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_syncing_txg != 0); +} + +static boolean_t +txg_is_quiescing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiescing_txg != 0); +} + +static boolean_t +txg_has_quiesced_to_sync(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiesced_txg != 0); +} + static void txg_sync_thread(void *arg) { dsl_pool_t *dp = arg; spa_t *spa = dp->dp_spa; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; uint64_t start, delta; txg_thread_enter(tx, &cpr); start = delta = 0; for (;;) { uint64_t timeout = zfs_txg_timeout * hz; uint64_t timer; uint64_t txg; /* * We sync when we're scanning, there's someone waiting * on us, or the quiesce thread has handed off a txg to * us, or we have reached our timeout. */ timer = (delta >= timeout ? 0 : timeout - delta); while (!dsl_scan_active(dp->dp_scan) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && - tx->tx_quiesced_txg == 0 && + !txg_has_quiesced_to_sync(dp) && dp->dp_dirty_total < zfs_dirty_data_sync) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); delta = ddi_get_lbolt() - start; timer = (delta > timeout ? 0 : timeout - delta); } /* * Wait until the quiesce thread hands off a txg to us, * prompting it to do so if necessary. */ - while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) { + while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; cv_broadcast(&tx->tx_quiesce_more_cv); txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); } if (tx->tx_exiting) txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); /* * Consume the quiesced txg which has been handed off to * us. This may cause the quiescing thread to now be * able to quiesce another txg, so we must signal it. */ + ASSERT(tx->tx_quiesced_txg != 0); txg = tx->tx_quiesced_txg; tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_quiesce_more_cv); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); start = ddi_get_lbolt(); spa_sync(spa, txg); delta = ddi_get_lbolt() - start; mutex_enter(&tx->tx_sync_lock); tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_done_cv); /* * Dispatch commit callbacks to worker threads. */ txg_dispatch_callbacks(dp, txg); } } static void txg_quiesce_thread(void *arg) { dsl_pool_t *dp = arg; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; txg_thread_enter(tx, &cpr); for (;;) { uint64_t txg; /* * We quiesce when there's someone waiting on us. * However, we can only have one txg in "quiescing" or * "quiesced, waiting to sync" state. 
So we wait until * the "quiesced, waiting to sync" txg has been consumed * by the sync thread. */ while (!tx->tx_exiting && (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || - tx->tx_quiesced_txg != 0)) + txg_has_quiesced_to_sync(dp))) txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); if (tx->tx_exiting) txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); txg = tx->tx_open_txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + tx->tx_quiescing_txg = txg; + mutex_exit(&tx->tx_sync_lock); txg_quiesce(dp, txg); mutex_enter(&tx->tx_sync_lock); /* * Hand this txg off to the sync thread. */ dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiescing_txg = 0; tx->tx_quiesced_txg = txg; DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); cv_broadcast(&tx->tx_sync_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); } } /* * Delay this thread by delay nanoseconds if we are still in the open * transaction group and there is already a waiting txg quiesing or quiesced. * Abort the delay if this txg stalls or enters the quiesing state. */ void txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) { tx_state_t *tx = &dp->dp_tx; hrtime_t start = gethrtime(); /* don't delay if this txg could transition to quiescing immediately */ if (tx->tx_open_txg > txg || tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) return; mutex_enter(&tx->tx_sync_lock); if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { mutex_exit(&tx->tx_sync_lock); return; } while (gethrtime() - start < delay && tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, delay, resolution, 0); } mutex_exit(&tx->tx_sync_lock); } void txg_wait_synced(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); ASSERT3U(tx->tx_threads, ==, 2); if (txg == 0) txg = tx->tx_open_txg + TXG_DEFER_SIZE; if (tx->tx_sync_txg_waiting < txg) tx->tx_sync_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); while (tx->tx_synced_txg < txg) { dprintf("broadcasting sync more " "tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); cv_broadcast(&tx->tx_sync_more_cv); cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock); } mutex_exit(&tx->tx_sync_lock); } void txg_wait_open(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); ASSERT3U(tx->tx_threads, ==, 2); if (txg == 0) txg = tx->tx_open_txg + 1; if (tx->tx_quiesce_txg_waiting < txg) tx->tx_quiesce_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); while (tx->tx_open_txg < txg) { cv_broadcast(&tx->tx_quiesce_more_cv); cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); } mutex_exit(&tx->tx_sync_lock); } /* * If there isn't a txg syncing or in the pipeline, push another txg through * the pipeline by queiscing the open txg. 
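 *
 * For reference, a minimal durability sketch from a consumer's point of
 * view, using txg_wait_synced() above ("os" and "tx" are hypothetical):
 *
 *	txg = dmu_tx_get_txg(tx);	(must be called while still assigned)
 *	dmu_tx_commit(tx);
 *	txg_wait_synced(dmu_objset_pool(os), txg);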
*/ void txg_kick(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; ASSERT(!dsl_pool_config_held(dp)); mutex_enter(&tx->tx_sync_lock); - if (tx->tx_syncing_txg == 0 && + if (!txg_is_syncing(dp) && + !txg_is_quiescing(dp) && tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && tx->tx_sync_txg_waiting <= tx->tx_synced_txg && tx->tx_quiesced_txg <= tx->tx_synced_txg) { tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; cv_broadcast(&tx->tx_quiesce_more_cv); } mutex_exit(&tx->tx_sync_lock); } boolean_t txg_stalled(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); } boolean_t txg_sync_waiting(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || tx->tx_quiesced_txg != 0); } /* * Verify that this txg is active (open, quiescing, syncing). Non-active * txg's should not be manipulated. */ void txg_verify(spa_t *spa, uint64_t txg) { dsl_pool_t *dp = spa_get_dsl(spa); if (txg <= TXG_INITIAL || txg == ZILTEST_TXG) return; ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg); ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES); } /* * Per-txg object lists. */ void txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset) { int t; mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); tl->tl_offset = offset; tl->tl_spa = spa; for (t = 0; t < TXG_SIZE; t++) tl->tl_head[t] = NULL; } void txg_list_destroy(txg_list_t *tl) { int t; for (t = 0; t < TXG_SIZE; t++) ASSERT(txg_list_empty(tl, t)); mutex_destroy(&tl->tl_lock); } boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg) { txg_verify(tl->tl_spa, txg); return (tl->tl_head[txg & TXG_MASK] == NULL); } /* * Returns true if all txg lists are empty. * * Warning: this is inherently racy (an item could be added immediately * after this function returns). We don't bother with the lock because * it wouldn't change the semantics. */ boolean_t txg_all_lists_empty(txg_list_t *tl) { for (int i = 0; i < TXG_SIZE; i++) { if (!txg_list_empty(tl, i)) { return (B_FALSE); } } return (B_TRUE); } /* * Add an entry to the list (unless it's already on the list). * Returns B_TRUE if it was actually added. */ boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); boolean_t add; txg_verify(tl->tl_spa, txg); mutex_enter(&tl->tl_lock); add = (tn->tn_member[t] == 0); if (add) { tn->tn_member[t] = 1; tn->tn_next[t] = tl->tl_head[t]; tl->tl_head[t] = tn; } mutex_exit(&tl->tl_lock); return (add); } /* * Add an entry to the end of the list, unless it's already on the list. * (walks list to find end) * Returns B_TRUE if it was actually added. */ boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); boolean_t add; txg_verify(tl->tl_spa, txg); mutex_enter(&tl->tl_lock); add = (tn->tn_member[t] == 0); if (add) { txg_node_t **tp; for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) continue; tn->tn_member[t] = 1; tn->tn_next[t] = NULL; *tp = tn; } mutex_exit(&tl->tl_lock); return (add); } /* * Remove the head of the list and return it. 
*/ void * txg_list_remove(txg_list_t *tl, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn; void *p = NULL; txg_verify(tl->tl_spa, txg); mutex_enter(&tl->tl_lock); if ((tn = tl->tl_head[t]) != NULL) { ASSERT(tn->tn_member[t]); ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]); p = (char *)tn - tl->tl_offset; tl->tl_head[t] = tn->tn_next[t]; tn->tn_next[t] = NULL; tn->tn_member[t] = 0; } mutex_exit(&tl->tl_lock); return (p); } /* * Remove a specific item from the list and return it. */ void * txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn, **tp; txg_verify(tl->tl_spa, txg); mutex_enter(&tl->tl_lock); for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) { if ((char *)tn - tl->tl_offset == p) { *tp = tn->tn_next[t]; tn->tn_next[t] = NULL; tn->tn_member[t] = 0; mutex_exit(&tl->tl_lock); return (p); } } mutex_exit(&tl->tl_lock); return (NULL); } boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); txg_verify(tl->tl_spa, txg); return (tn->tn_member[t] != 0); } /* * Walk a txg list -- only safe if you know it's not changing. */ void * txg_list_head(txg_list_t *tl, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = tl->tl_head[t]; txg_verify(tl->tl_spa, txg); return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); } void * txg_list_next(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); txg_verify(tl->tl_spa, txg); tn = tn->tn_next[t]; return (tn == NULL ? NULL : (char *)tn - tl->tl_offset); } Index: stable/11 =================================================================== --- stable/11 (revision 339124) +++ stable/11 (revision 339125) Property changes on: stable/11 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r337172
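
Editor's note: the stand-alone sketch below models the pipeline state that this merge makes explicit; it is not part of the patch. The model_* names and the simplified struct are hypothetical stand-ins for tx_state_t, used only to illustrate, under those assumptions, why the reworked txg_kick() test must also refuse to kick while a transaction group is still quiescing.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Simplified stand-in for tx_state_t: one counter per pipeline stage. */
typedef struct model_tx_state {
	uint64_t open_txg;		/* txg currently accepting assignments */
	uint64_t quiescing_txg;		/* txg being quiesced; 0 if none (new state) */
	uint64_t quiesced_txg;		/* txg handed off to sync; 0 if none */
	uint64_t syncing_txg;		/* txg being written out; 0 if none */
	uint64_t synced_txg;		/* last txg fully on disk */
	uint64_t quiesce_txg_waiting;
	uint64_t sync_txg_waiting;
} model_tx_state_t;

/* Analogues of the predicates added by the patch. */
static bool
model_is_syncing(const model_tx_state_t *tx)
{
	return (tx->syncing_txg != 0);
}

static bool
model_is_quiescing(const model_tx_state_t *tx)
{
	return (tx->quiescing_txg != 0);
}

/* Mirror of the txg_kick() condition after the change. */
static bool
model_kick_would_fire(const model_tx_state_t *tx)
{
	return (!model_is_syncing(tx) &&
	    !model_is_quiescing(tx) &&
	    tx->quiesce_txg_waiting <= tx->open_txg &&
	    tx->sync_txg_waiting <= tx->synced_txg &&
	    tx->quiesced_txg <= tx->synced_txg);
}

int
main(void)
{
	/* txg 5 is open; txg 4 is still being quiesced; nothing is syncing. */
	model_tx_state_t tx = {
		.open_txg = 5,
		.quiescing_txg = 4,
		.quiesced_txg = 0,
		.syncing_txg = 0,
		.synced_txg = 3,
		.quiesce_txg_waiting = 5,
		.sync_txg_waiting = 3,
	};

	/*
	 * Without the quiescing check, the remaining conditions are all
	 * true here, so a kick would bump quiesce_txg_waiting and push
	 * txg 5 into the pipeline while txg 4 is still quiescing; with
	 * the extra check the kick is suppressed.
	 */
	printf("txg_kick would fire: %s\n",
	    model_kick_would_fire(&tx) ? "yes" : "no");
	return (0);
}

Compiled on its own, the model prints "no" for the state above, matching the intent of the merged change: the kick is deferred until the in-flight transaction group has moved on, instead of needlessly churning another one through the pipeline.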