diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h index 59a1474020e2..c70c97da03f2 100644 --- a/include/sys/dmu_tx.h +++ b/include/sys/dmu_tx.h @@ -1,195 +1,193 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2013 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_TX_H #define _SYS_DMU_TX_H #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct dmu_buf_impl; struct dmu_tx_hold; struct dnode_link; struct dsl_pool; struct dnode; struct dsl_dir; struct dmu_tx { /* * No synchronization is needed because a tx can only be handled * by one thread. */ list_t tx_holds; /* list of dmu_tx_hold_t */ objset_t *tx_objset; struct dsl_dir *tx_dir; struct dsl_pool *tx_pool; uint64_t tx_txg; uint64_t tx_lastsnap_txg; uint64_t tx_lasttried_txg; txg_handle_t tx_txgh; void *tx_tempreserve_cookie; struct dmu_tx_hold *tx_needassign_txh; /* list of dmu_tx_callback_t on this dmu_tx */ list_t tx_callbacks; /* placeholder for syncing context, doesn't need specific holds */ boolean_t tx_anyobj; /* has this transaction already been delayed? */ boolean_t tx_waited; /* time this transaction was created */ hrtime_t tx_start; /* need to wait for sufficient dirty space */ boolean_t tx_wait_dirty; int tx_err; #ifdef DEBUG_DMU_TX uint64_t tx_space_towrite; uint64_t tx_space_tofree; uint64_t tx_space_tooverwrite; uint64_t tx_space_tounref; refcount_t tx_space_written; refcount_t tx_space_freed; #endif }; enum dmu_tx_hold_type { THT_NEWOBJECT, THT_WRITE, THT_BONUS, THT_FREE, THT_ZAP, THT_SPACE, THT_SPILL, THT_NUMTYPES }; typedef struct dmu_tx_hold { dmu_tx_t *txh_tx; list_node_t txh_node; struct dnode *txh_dnode; uint64_t txh_space_towrite; uint64_t txh_space_tofree; uint64_t txh_space_tooverwrite; uint64_t txh_space_tounref; uint64_t txh_memory_tohold; uint64_t txh_fudge; #ifdef DEBUG_DMU_TX enum dmu_tx_hold_type txh_type; uint64_t txh_arg1; uint64_t txh_arg2; #endif } dmu_tx_hold_t; typedef struct dmu_tx_callback { list_node_t dcb_node; /* linked to tx_callbacks list */ dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ void *dcb_data; /* caller private data */ } dmu_tx_callback_t; /* * Used for dmu tx kstat. */ typedef struct dmu_tx_stats { kstat_named_t dmu_tx_assigned; kstat_named_t dmu_tx_delay; kstat_named_t dmu_tx_error; kstat_named_t dmu_tx_suspended; kstat_named_t dmu_tx_group; - kstat_named_t dmu_tx_how; kstat_named_t dmu_tx_memory_reserve; kstat_named_t dmu_tx_memory_reclaim; - kstat_named_t dmu_tx_memory_inflight; kstat_named_t dmu_tx_dirty_throttle; kstat_named_t dmu_tx_dirty_delay; kstat_named_t dmu_tx_dirty_over_max; kstat_named_t dmu_tx_quota; } dmu_tx_stats_t; extern dmu_tx_stats_t dmu_tx_stats; #define DMU_TX_STAT_INCR(stat, val) \ atomic_add_64(&dmu_tx_stats.stat.value.ui64, (val)); #define DMU_TX_STAT_BUMP(stat) \ DMU_TX_STAT_INCR(stat, 1); /* * These routines are defined in dmu.h, and are called by the user. */ dmu_tx_t *dmu_tx_create(objset_t *dd); int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_abort(dmu_tx_t *tx); uint64_t dmu_tx_get_txg(dmu_tx_t *tx); struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); void dmu_tx_do_callbacks(list_t *cb_list, int error); /* * These routines are defined in dmu_spa.h, and are called by the SPA. */ extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); /* * These routines are only called by the DMU. */ dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); int dmu_tx_is_syncing(dmu_tx_t *tx); int dmu_tx_private_ok(dmu_tx_t *tx); void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta); void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); int dmu_tx_holds(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); #ifdef DEBUG_DMU_TX #define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) #else #define DMU_TX_DIRTY_BUF(tx, db) #endif void dmu_tx_init(void); void dmu_tx_fini(void); #ifdef __cplusplus } #endif #endif /* _SYS_DMU_TX_H */ diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c index 3e04793cf3fb..4159ba74094a 100644 --- a/module/zfs/dmu_tx.c +++ b/module/zfs/dmu_tx.c @@ -1,1682 +1,1680 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. */ #include #include #include #include #include #include /* for dsl_dataset_block_freeable() */ #include /* for dsl_dir_tempreserve_*() */ #include #include /* for fzap_default_block_shift */ #include #include #include #include #include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); dmu_tx_stats_t dmu_tx_stats = { { "dmu_tx_assigned", KSTAT_DATA_UINT64 }, { "dmu_tx_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_error", KSTAT_DATA_UINT64 }, { "dmu_tx_suspended", KSTAT_DATA_UINT64 }, { "dmu_tx_group", KSTAT_DATA_UINT64 }, - { "dmu_tx_how", KSTAT_DATA_UINT64 }, { "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 }, { "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 }, - { "dmu_tx_memory_inflight", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 }, { "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 }, { "dmu_tx_quota", KSTAT_DATA_UINT64 }, }; static kstat_t *dmu_tx_ksp; dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_PUSHPAGE); tx->tx_dir = dd; if (dd != NULL) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); tx->tx_start = gethrtime(); #ifdef DEBUG_DMU_TX refcount_create(&tx->tx_space_written); refcount_create(&tx->tx_space_freed); #endif return (tx); } dmu_tx_t * dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); return (tx); } dmu_tx_t * dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) { dmu_tx_t *tx = dmu_tx_create_dd(NULL); ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); tx->tx_pool = dp; tx->tx_txg = txg; tx->tx_anyobj = TRUE; return (tx); } int dmu_tx_is_syncing(dmu_tx_t *tx) { return (tx->tx_anyobj); } int dmu_tx_private_ok(dmu_tx_t *tx) { return (tx->tx_anyobj); } static dmu_tx_hold_t * dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) { dmu_tx_hold_t *txh; dnode_t *dn = NULL; int err; if (object != DMU_NEW_OBJECT) { err = dnode_hold(os, object, tx, &dn); if (err) { tx->tx_err = err; return (NULL); } if (err == 0 && tx->tx_txg != 0) { mutex_enter(&dn->dn_mtx); /* * dn->dn_assigned_txg == tx->tx_txg doesn't pose a * problem, but there's no way for it to happen (for * now, at least). */ ASSERT(dn->dn_assigned_txg == 0); dn->dn_assigned_txg = tx->tx_txg; (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } } txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_PUSHPAGE); txh->txh_tx = tx; txh->txh_dnode = dn; #ifdef DEBUG_DMU_TX txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; #endif list_insert_tail(&tx->tx_holds, txh); return (txh); } void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) { /* * If we're syncing, they can manipulate any object anyhow, and * the hold on the dnode_t can cause problems. */ if (!dmu_tx_is_syncing(tx)) { (void) dmu_tx_hold_object_impl(tx, os, object, THT_NEWOBJECT, 0, 0); } } static int dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) { int err; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold_level(dn, level, blkid, FTAG); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); dbuf_rele(db, FTAG); return (err); } static void dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, int level, uint64_t blkid, boolean_t freeable, uint64_t *history) { objset_t *os = dn->dn_objset; dsl_dataset_t *ds = os->os_dsl_dataset; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; dmu_buf_impl_t *parent = NULL; blkptr_t *bp = NULL; uint64_t space; if (level >= dn->dn_nlevels || history[level] == blkid) return; history[level] = blkid; space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift); if (db == NULL || db == dn->dn_dbuf) { ASSERT(level != 0); db = NULL; } else { ASSERT(DB_DNODE(db) == dn); ASSERT(db->db_level == level); ASSERT(db->db.db_size == space); ASSERT(db->db_blkid == blkid); bp = db->db_blkptr; parent = db->db_parent; } freeable = (bp && (freeable || dsl_dataset_block_freeable(ds, bp, bp->blk_birth))); if (freeable) txh->txh_space_tooverwrite += space; else txh->txh_space_towrite += space; if (bp) txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); dmu_tx_count_twig(txh, dn, parent, level + 1, blkid >> epbs, freeable, history); } /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; uint64_t start, end, i; int min_bs, max_bs, min_ibs, max_ibs, epbs, bits; int err = 0; int l; if (len == 0) return; min_bs = SPA_MINBLOCKSHIFT; max_bs = SPA_MAXBLOCKSHIFT; min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; if (dn) { uint64_t history[DN_MAX_LEVELS]; int nlvls = dn->dn_nlevels; int delta; /* * For i/o error checking, read the first and last level-0 * blocks (if they are not aligned), and all the level-1 blocks. */ if (dn->dn_maxblkid == 0) { delta = dn->dn_datablksz; start = (off < dn->dn_datablksz) ? 0 : 1; end = (off+len <= dn->dn_datablksz) ? 0 : 1; if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err) goto out; delta -= off; } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err) goto out; } /* last level-0 block */ end = (off+len-1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off+len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err) goto out; } /* level-1 blocks */ if (nlvls > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (i = (start>>shft)+1; i < end>>shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) goto out; } } err = zio_wait(zio); if (err) goto out; delta = P2NPHASE(off, dn->dn_datablksz); } min_ibs = max_ibs = dn->dn_indblkshift; if (dn->dn_maxblkid > 0) { /* * The blocksize can't change, * so we can make a more precise estimate. */ ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; } /* * If this write is not off the end of the file * we need to account for overwrites/unref. */ if (start <= dn->dn_maxblkid) { for (l = 0; l < DN_MAX_LEVELS; l++) history[l] = -1ULL; } while (start <= dn->dn_maxblkid) { dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, 0, start, FALSE, FTAG, &db); rw_exit(&dn->dn_struct_rwlock); if (err) { txh->txh_tx->tx_err = err; return; } dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, history); dbuf_rele(db, FTAG); if (++start > end) { /* * Account for new indirects appearing * before this IO gets assigned into a txg. */ bits = 64 - min_bs; epbs = min_ibs - SPA_BLKPTRSHIFT; for (bits -= epbs * (nlvls - 1); bits >= 0; bits -= epbs) txh->txh_fudge += 1ULL << max_ibs; goto out; } off += delta; if (len >= delta) len -= delta; delta = dn->dn_datablksz; } } /* * 'end' is the last thing we will access, not one past. * This way we won't overflow when accessing the last byte. */ start = P2ALIGN(off, 1ULL << max_bs); end = P2ROUNDUP(off + len, 1ULL << max_bs) - 1; txh->txh_space_towrite += end - start + 1; start >>= min_bs; end >>= min_bs; epbs = min_ibs - SPA_BLKPTRSHIFT; /* * The object contains at most 2^(64 - min_bs) blocks, * and each indirect level maps 2^epbs. */ for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { start >>= epbs; end >>= epbs; ASSERT3U(end, >=, start); txh->txh_space_towrite += (end - start + 1) << max_ibs; if (start != 0) { /* * We also need a new blkid=0 indirect block * to reference any existing file data. */ txh->txh_space_towrite += 1ULL << max_ibs; } } out: if (txh->txh_space_towrite + txh->txh_space_tooverwrite > 2 * DMU_MAX_ACCESS) err = SET_ERROR(EFBIG); if (err) txh->txh_tx->tx_err = err; } static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { dnode_t *dn = txh->txh_dnode; dnode_t *mdn = DMU_META_DNODE(txh->txh_tx->tx_objset); uint64_t space = mdn->dn_datablksz + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); if (dn && dn->dn_dbuf->db_blkptr && dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, dn->dn_dbuf->db_blkptr, dn->dn_dbuf->db_blkptr->blk_birth)) { txh->txh_space_tooverwrite += space; txh->txh_space_tounref += space; } else { txh->txh_space_towrite += space; if (dn && dn->dn_dbuf->db_blkptr) txh->txh_space_tounref += space; } } void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); ASSERT(len < DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, off, len); if (txh == NULL) return; dmu_tx_count_write(txh, off, len); dmu_tx_count_dnode(txh); } static void dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { uint64_t blkid, nblks, lastblk; uint64_t space = 0, unref = 0, skipped = 0; dnode_t *dn = txh->txh_dnode; dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; spa_t *spa = txh->txh_tx->tx_pool->dp_spa; int epbs; uint64_t l0span = 0, nl1blks = 0; if (dn->dn_nlevels == 0) return; /* * The struct_rwlock protects us against dn_nlevels * changing, in case (against all odds) we manage to dirty & * sync out the changes after we check for being dirty. * Also, dbuf_hold_impl() wants us to have the struct_rwlock. */ rw_enter(&dn->dn_struct_rwlock, RW_READER); epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; if (dn->dn_maxblkid == 0) { if (off == 0 && len >= dn->dn_datablksz) { blkid = 0; nblks = 1; } else { rw_exit(&dn->dn_struct_rwlock); return; } } else { blkid = off >> dn->dn_datablkshift; nblks = (len + dn->dn_datablksz - 1) >> dn->dn_datablkshift; if (blkid > dn->dn_maxblkid) { rw_exit(&dn->dn_struct_rwlock); return; } if (blkid + nblks > dn->dn_maxblkid) nblks = dn->dn_maxblkid - blkid + 1; } l0span = nblks; /* save for later use to calc level > 1 overhead */ if (dn->dn_nlevels == 1) { int i; for (i = 0; i < nblks; i++) { blkptr_t *bp = dn->dn_phys->dn_blkptr; ASSERT3U(blkid + i, <, dn->dn_nblkptr); bp += blkid + i; if (dsl_dataset_block_freeable(ds, bp, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); space += bp_get_dsize(spa, bp); } unref += BP_GET_ASIZE(bp); } nl1blks = 1; nblks = 0; } lastblk = blkid + nblks - 1; while (nblks) { dmu_buf_impl_t *dbuf; uint64_t ibyte, new_blkid; int epb = 1 << epbs; int err, i, blkoff, tochk; blkptr_t *bp; ibyte = blkid << dn->dn_datablkshift; err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); new_blkid = ibyte >> dn->dn_datablkshift; if (err == ESRCH) { skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; break; } if (err) { txh->txh_tx->tx_err = err; break; } if (new_blkid > lastblk) { skipped += (lastblk >> epbs) - (blkid >> epbs) + 1; break; } if (new_blkid > blkid) { ASSERT((new_blkid >> epbs) > (blkid >> epbs)); skipped += (new_blkid >> epbs) - (blkid >> epbs) - 1; nblks -= new_blkid - blkid; blkid = new_blkid; } blkoff = P2PHASE(blkid, epb); tochk = MIN(epb - blkoff, nblks); err = dbuf_hold_impl(dn, 1, blkid >> epbs, FALSE, FTAG, &dbuf); if (err) { txh->txh_tx->tx_err = err; break; } txh->txh_memory_tohold += dbuf->db.db_size; /* * We don't check memory_tohold against DMU_MAX_ACCESS because * memory_tohold is an over-estimation (especially the >L1 * indirect blocks), so it could fail. Callers should have * already verified that they will not be holding too much * memory. */ err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); if (err != 0) { txh->txh_tx->tx_err = err; dbuf_rele(dbuf, FTAG); break; } bp = dbuf->db.db_data; bp += blkoff; for (i = 0; i < tochk; i++) { if (dsl_dataset_block_freeable(ds, &bp[i], bp[i].blk_birth)) { dprintf_bp(&bp[i], "can free old%s", ""); space += bp_get_dsize(spa, &bp[i]); } unref += BP_GET_ASIZE(bp); } dbuf_rele(dbuf, FTAG); ++nl1blks; blkid += tochk; nblks -= tochk; } rw_exit(&dn->dn_struct_rwlock); /* * Add in memory requirements of higher-level indirects. * This assumes a worst-possible scenario for dn_nlevels and a * worst-possible distribution of l1-blocks over the region to free. */ { uint64_t blkcnt = 1 + ((l0span >> epbs) >> epbs); int level = 2; /* * Here we don't use DN_MAX_LEVEL, but calculate it with the * given datablkshift and indblkshift. This makes the * difference between 19 and 8 on large files. */ int maxlevel = 2 + (DN_MAX_OFFSET_SHIFT - dn->dn_datablkshift) / (dn->dn_indblkshift - SPA_BLKPTRSHIFT); while (level++ < maxlevel) { txh->txh_memory_tohold += MAX(MIN(blkcnt, nl1blks), 1) << dn->dn_indblkshift; blkcnt = 1 + (blkcnt >> epbs); } } /* account for new level 1 indirect blocks that might show up */ if (skipped > 0) { txh->txh_fudge += skipped << dn->dn_indblkshift; skipped = MIN(skipped, DMU_MAX_DELETEBLKCNT >> epbs); txh->txh_memory_tohold += skipped << dn->dn_indblkshift; } txh->txh_space_tofree += space; txh->txh_space_tounref += unref; } void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { dmu_tx_hold_t *txh; dnode_t *dn; int err; zio_t *zio; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_FREE, off, len); if (txh == NULL) return; dn = txh->txh_dnode; dmu_tx_count_dnode(txh); if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid+1) * dn->dn_datablksz - off; dmu_tx_count_dnode(txh); /* * For i/o error checking, we read the first and last level-0 * blocks if they are not aligned, and all the level-1 blocks. * * Note: dbuf_free_range() assumes that we have not instantiated * any level-0 dbufs that will be completely freed. Therefore we must * exercise care to not read or count the first and last blocks * if they are blocksize-aligned. */ if (dn->dn_datablkshift == 0) { if (off != 0 || len < dn->dn_datablksz) dmu_tx_count_write(txh, 0, dn->dn_datablksz); } else { /* first block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off, 1); /* last block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off+len, 1); } /* * Check level-1 blocks. */ if (dn->dn_nlevels > 1) { int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t start = off >> shift; uint64_t end = (off + len) >> shift; uint64_t i; ASSERT(dn->dn_indblkshift != 0); /* * dnode_reallocate() can result in an object with indirect * blocks having an odd data block size. In this case, * just check the single block. */ if (dn->dn_datablkshift == 0) start = end = 0; zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (i = start; i <= end; i++) { uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH) break; if (err) { tx->tx_err = err; return; } err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) { tx->tx_err = err; return; } } err = zio_wait(zio); if (err) { tx->tx_err = err; return; } } dmu_tx_count_free(txh, off, len); } void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; dnode_t *dn; uint64_t nblocks; int epbs, err; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_ZAP, add, (uintptr_t)name); if (txh == NULL) return; dn = txh->txh_dnode; dmu_tx_count_dnode(txh); if (dn == NULL) { /* * We will be able to fit a new object's entries into one leaf * block. So there will be at most 2 blocks total, * including the header block. */ dmu_tx_count_write(txh, 0, 2 << fzap_default_block_shift); return; } ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 && !add) { blkptr_t *bp; /* * If there is only one block (i.e. this is a micro-zap) * and we are not adding anything, the accounting is simple. */ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err) { tx->tx_err = err; return; } /* * Use max block size here, since we don't know how much * the size will change between now and the dbuf dirty call. */ bp = &dn->dn_phys->dn_blkptr[0]; if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, bp, bp->blk_birth)) txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; else txh->txh_space_towrite += SPA_MAXBLOCKSIZE; if (!BP_IS_HOLE(bp)) txh->txh_space_tounref += SPA_MAXBLOCKSIZE; return; } if (dn->dn_maxblkid > 0 && name) { /* * access the name in this fat-zap so that we'll check * for i/o errors to the leaf blocks, etc. */ err = zap_lookup(dn->dn_objset, dn->dn_object, name, 8, 0, NULL); if (err == EIO) { tx->tx_err = err; return; } } err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, &txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* * If the modified blocks are scattered to the four winds, * we'll have to modify an indirect twig for each. */ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) txh->txh_space_towrite += 3 << dn->dn_indblkshift; else txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; } void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); if (txh) txh->txh_space_towrite += space; } int dmu_tx_holds(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; int holds = 0; /* * By asserting that the tx is assigned, we're counting the * number of dn_tx_holds, which is the same as the number of * dn_holds. Otherwise, we'd be counting dn_holds, but * dn_tx_holds could be 0. */ ASSERT(tx->tx_txg != 0); /* if (tx->tx_anyobj == TRUE) */ /* return (0); */ for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { if (txh->txh_dnode && txh->txh_dnode->dn_object == object) holds++; } return (holds); } #ifdef DEBUG_DMU_TX void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { dmu_tx_hold_t *txh; int match_object = FALSE, match_offset = FALSE; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(dn != NULL); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); if (tx->tx_anyobj) { DB_DNODE_EXIT(db); return; } /* XXX No checking on the meta dnode for now */ if (db->db.db_object == DMU_META_DNODE_OBJECT) { DB_DNODE_EXIT(db); return; } for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) match_object = TRUE; if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { int datablkshift = dn->dn_datablkshift ? dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; int shift = datablkshift + epbs * db->db_level; uint64_t beginblk = shift >= 64 ? 0 : (txh->txh_arg1 >> shift); uint64_t endblk = shift >= 64 ? 0 : ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); uint64_t blkid = db->db_blkid; /* XXX txh_arg2 better not be zero... */ dprintf("found txh type %x beginblk=%llx endblk=%llx\n", txh->txh_type, beginblk, endblk); switch (txh->txh_type) { case THT_WRITE: if (blkid >= beginblk && blkid <= endblk) match_offset = TRUE; /* * We will let this hold work for the bonus * or spill buffer so that we don't need to * hold it when creating a new object. */ if (blkid == DMU_BONUS_BLKID || blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, * thus dirtying the new TLIBs. Or the * might have to change the block size, * thus dirying the new lvl=0 blk=0. */ if (blkid == 0) match_offset = TRUE; break; case THT_FREE: /* * We will dirty all the level 1 blocks in * the free range and perhaps the first and * last level 0 block. */ if (blkid >= beginblk && (blkid <= endblk || txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_SPILL: if (blkid == DMU_SPILL_BLKID) match_offset = TRUE; break; case THT_BONUS: if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: match_offset = TRUE; break; case THT_NEWOBJECT: match_object = TRUE; break; default: ASSERT(!"bad txh_type"); } } if (match_object && match_offset) { DB_DNODE_EXIT(db); return; } } DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); } #endif /* * If we can't do 10 iops, something is wrong. Let us go ahead * and hit zfs_dirty_data_max. */ hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */ int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ /* * We delay transactions when we've determined that the backend storage * isn't able to accommodate the rate of incoming writes. * * If there is already a transaction waiting, we delay relative to when * that transaction finishes waiting. This way the calculated min_time * is independent of the number of threads concurrently executing * transactions. * * If we are the only waiter, wait relative to when the transaction * started, rather than the current time. This credits the transaction for * "time already served", e.g. reading indirect blocks. * * The minimum time for a transaction to take is calculated as: * min_time = scale * (dirty - min) / (max - dirty) * min_time is then capped at zfs_delay_max_ns. * * The delay has two degrees of freedom that can be adjusted via tunables. * The percentage of dirty data at which we start to delay is defined by * zfs_delay_min_dirty_percent. This should typically be at or above * zfs_vdev_async_write_active_max_dirty_percent so that we only start to * delay after writing at full speed has failed to keep up with the incoming * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly * speaking, this variable determines the amount of delay at the midpoint of * the curve. * * delay * 10ms +-------------------------------------------------------------*+ * | *| * 9ms + *+ * | *| * 8ms + *+ * | * | * 7ms + * + * | * | * 6ms + * + * | * | * 5ms + * + * | * | * 4ms + * + * | * | * 3ms + * + * | * | * 2ms + (midpoint) * + * | | ** | * 1ms + v *** + * | zfs_delay_scale ----------> ******** | * 0 +-------------------------------------*********----------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note that since the delay is added to the outstanding time remaining on the * most recent transaction, the delay is effectively the inverse of IOPS. * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve * was chosen such that small changes in the amount of accumulated dirty data * in the first 3/4 of the curve yield relatively small differences in the * amount of delay. * * The effects can be easier to understand when the amount of delay is * represented on a log scale: * * delay * 100ms +-------------------------------------------------------------++ * + + * | | * + *+ * 10ms + *+ * + ** + * | (midpoint) ** | * + | ** + * 1ms + v **** + * + zfs_delay_scale ----------> ***** + * | **** | * + **** + * 100us + ** + * + * + * | * | * + * + * 10us + * + * + + * | | * + + * +--------------------------------------------------------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note here that only as the amount of dirty data approaches its limit does * the delay start to increase rapidly. The goal of a properly tuned system * should be to keep the amount of dirty data out of that range by first * ensuring that the appropriate limits are set for the I/O scheduler to reach * optimal throughput on the backend storage, and then by changing the value * of zfs_delay_scale to increase the steepness of the curve. */ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; hrtime_t wakeup, min_tx_time, now; if (dirty <= delay_min_bytes) return; /* * The caller has already waited until we are under the max. * We make them pass us the amount of dirty data so we don't * have to handle the case of it being >= the max, which could * cause a divide-by-zero if it's == the max. */ ASSERT3U(dirty, <, zfs_dirty_data_max); now = gethrtime(); min_tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); if (now > tx->tx_start + min_tx_time) return; DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, uint64_t, min_tx_time); mutex_enter(&dp->dp_lock); wakeup = MAX(tx->tx_start + min_tx_time, dp->dp_last_wakeup + min_tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); zfs_sleep_until(wakeup); } static int dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how) { dmu_tx_hold_t *txh; spa_t *spa = tx->tx_pool->dp_spa; uint64_t memory, asize, fsize, usize; uint64_t towrite, tofree, tooverwrite, tounref, tohold, fudge; ASSERT0(tx->tx_txg); if (tx->tx_err) { DMU_TX_STAT_BUMP(dmu_tx_error); return (tx->tx_err); } if (spa_suspended(spa)) { DMU_TX_STAT_BUMP(dmu_tx_suspended); /* * If the user has indicated a blocking failure mode * then return ERESTART which will block in dmu_tx_wait(). * Otherwise, return EIO so that an error can get * propagated back to the VOP calls. * * Note that we always honor the txg_how flag regardless * of the failuremode setting. */ if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && txg_how != TXG_WAIT) return (SET_ERROR(EIO)); return (SET_ERROR(ERESTART)); } if (!tx->tx_waited && dsl_pool_need_dirty_delay(tx->tx_pool)) { tx->tx_wait_dirty = B_TRUE; DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); return (ERESTART); } tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); tx->tx_needassign_txh = NULL; /* * NB: No error returns are allowed after txg_hold_open, but * before processing the dnode holds, due to the * dmu_tx_unassign() logic. */ towrite = tofree = tooverwrite = tounref = tohold = fudge = 0; for (txh = list_head(&tx->tx_holds); txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn != NULL) { mutex_enter(&dn->dn_mtx); if (dn->dn_assigned_txg == tx->tx_txg - 1) { mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = txh; DMU_TX_STAT_BUMP(dmu_tx_group); return (SET_ERROR(ERESTART)); } if (dn->dn_assigned_txg == 0) dn->dn_assigned_txg = tx->tx_txg; ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } towrite += txh->txh_space_towrite; tofree += txh->txh_space_tofree; tooverwrite += txh->txh_space_tooverwrite; tounref += txh->txh_space_tounref; tohold += txh->txh_memory_tohold; fudge += txh->txh_fudge; } /* * If a snapshot has been taken since we made our estimates, * assume that we won't be able to free or overwrite anything. */ if (tx->tx_objset && dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > tx->tx_lastsnap_txg) { towrite += tooverwrite; tooverwrite = tofree = 0; } /* needed allocation: worst-case estimate of write space */ asize = spa_get_asize(tx->tx_pool->dp_spa, towrite + tooverwrite); /* freed space estimate: worst-case overwrite + free estimate */ fsize = spa_get_asize(tx->tx_pool->dp_spa, tooverwrite) + tofree; /* convert unrefd space to worst-case estimate */ usize = spa_get_asize(tx->tx_pool->dp_spa, tounref); /* calculate memory footprint estimate */ memory = towrite + tooverwrite + tohold; #ifdef DEBUG_DMU_TX /* * Add in 'tohold' to account for our dirty holds on this memory * XXX - the "fudge" factor is to account for skipped blocks that * we missed because dnode_next_offset() misses in-core-only blocks. */ tx->tx_space_towrite = asize + spa_get_asize(tx->tx_pool->dp_spa, tohold + fudge); tx->tx_space_tofree = tofree; tx->tx_space_tooverwrite = tooverwrite; tx->tx_space_tounref = tounref; #endif if (tx->tx_dir && asize != 0) { int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, asize, fsize, usize, &tx->tx_tempreserve_cookie, tx); if (err) return (err); } DMU_TX_STAT_BUMP(dmu_tx_assigned); return (0); } static void dmu_tx_unassign(dmu_tx_t *tx) { dmu_tx_hold_t *txh; if (tx->tx_txg == 0) return; txg_rele_to_quiesce(&tx->tx_txgh); /* * Walk the transaction's hold list, removing the hold on the * associated dnode, and notifying waiters if the refcount drops to 0. */ for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh; txh = list_next(&tx->tx_holds, txh)) { dnode_t *dn = txh->txh_dnode; if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); } txg_rele_to_sync(&tx->tx_txgh); tx->tx_lasttried_txg = tx->tx_txg; tx->tx_txg = 0; } /* * Assign tx to a transaction group. txg_how can be one of: * * (1) TXG_WAIT. If the current open txg is full, waits until there's * a new one. This should be used when you're not holding locks. * It will only fail if we're truly out of space (or over quota). * * (2) TXG_NOWAIT. If we can't assign into the current open txg without * blocking, returns immediately with ERESTART. This should be used * whenever you're holding locks. On an ERESTART error, the caller * should drop locks, do a dmu_tx_wait(tx), and try again. * * (3) TXG_WAITED. Like TXG_NOWAIT, but indicates that dmu_tx_wait() * has already been called on behalf of this operation (though * most likely on a different tx). */ int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how) { hrtime_t before; int err; ASSERT(tx->tx_txg == 0); ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT || txg_how == TXG_WAITED); ASSERT(!dsl_pool_sync_context(tx->tx_pool)); before = gethrtime(); if (txg_how == TXG_WAITED) tx->tx_waited = B_TRUE; /* If we might wait, we must not hold the config lock. */ ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool)); while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) { dmu_tx_unassign(tx); if (err != ERESTART || txg_how != TXG_WAIT) return (err); dmu_tx_wait(tx); } txg_rele_to_quiesce(&tx->tx_txgh); spa_tx_assign_add_nsecs(tx->tx_pool->dp_spa, gethrtime() - before); return (0); } void dmu_tx_wait(dmu_tx_t *tx) { spa_t *spa = tx->tx_pool->dp_spa; dsl_pool_t *dp = tx->tx_pool; ASSERT(tx->tx_txg == 0); ASSERT(!dsl_pool_config_held(tx->tx_pool)); if (tx->tx_wait_dirty) { uint64_t dirty; /* * dmu_tx_try_assign() has determined that we need to wait * because we've consumed much or all of the dirty buffer * space. */ mutex_enter(&dp->dp_lock); if (dp->dp_dirty_total >= zfs_dirty_data_max) DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max); while (dp->dp_dirty_total >= zfs_dirty_data_max) cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock); dirty = dp->dp_dirty_total; mutex_exit(&dp->dp_lock); dmu_tx_delay(tx, dirty); tx->tx_wait_dirty = B_FALSE; /* * Note: setting tx_waited only has effect if the caller * used TX_WAIT. Otherwise they are going to destroy * this tx and try again. The common case, zfs_write(), * uses TX_WAIT. */ tx->tx_waited = B_TRUE; } else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) { /* * If the pool is suspended we need to wait until it * is resumed. Note that it's possible that the pool * has become active after this thread has tried to * obtain a tx. If that's the case then tx_lasttried_txg * would not have been set. */ txg_wait_synced(dp, spa_last_synced_txg(spa) + 1); } else if (tx->tx_needassign_txh) { dnode_t *dn = tx->tx_needassign_txh->txh_dnode; mutex_enter(&dn->dn_mtx); while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1) cv_wait(&dn->dn_notxholds, &dn->dn_mtx); mutex_exit(&dn->dn_mtx); tx->tx_needassign_txh = NULL; } else { /* * A dnode is assigned to the quiescing txg. Wait for its * transaction to complete. */ txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1); } } void dmu_tx_willuse_space(dmu_tx_t *tx, int64_t delta) { #ifdef DEBUG_DMU_TX if (tx->tx_dir == NULL || delta == 0) return; if (delta > 0) { ASSERT3U(refcount_count(&tx->tx_space_written) + delta, <=, tx->tx_space_towrite); (void) refcount_add_many(&tx->tx_space_written, delta, NULL); } else { (void) refcount_add_many(&tx->tx_space_freed, -delta, NULL); } #endif } void dmu_tx_commit(dmu_tx_t *tx) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg != 0); /* * Go through the transaction's hold list and remove holds on * associated dnodes, notifying waiters if no holds remain. */ while ((txh = list_head(&tx->tx_holds))) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); kmem_free(txh, sizeof (dmu_tx_hold_t)); if (dn == NULL) continue; mutex_enter(&dn->dn_mtx); ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); if (refcount_remove(&dn->dn_tx_holds, tx) == 0) { dn->dn_assigned_txg = 0; cv_broadcast(&dn->dn_notxholds); } mutex_exit(&dn->dn_mtx); dnode_rele(dn, tx); } if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); if (!list_is_empty(&tx->tx_callbacks)) txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); #ifdef DEBUG_DMU_TX dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", tx->tx_space_towrite, refcount_count(&tx->tx_space_written), tx->tx_space_tofree, refcount_count(&tx->tx_space_freed)); refcount_destroy_many(&tx->tx_space_written, refcount_count(&tx->tx_space_written)); refcount_destroy_many(&tx->tx_space_freed, refcount_count(&tx->tx_space_freed)); #endif kmem_free(tx, sizeof (dmu_tx_t)); } void dmu_tx_abort(dmu_tx_t *tx) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); while ((txh = list_head(&tx->tx_holds))) { dnode_t *dn = txh->txh_dnode; list_remove(&tx->tx_holds, txh); kmem_free(txh, sizeof (dmu_tx_hold_t)); if (dn != NULL) dnode_rele(dn, tx); } /* * Call any registered callbacks with an error code. */ if (!list_is_empty(&tx->tx_callbacks)) dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); #ifdef DEBUG_DMU_TX refcount_destroy_many(&tx->tx_space_written, refcount_count(&tx->tx_space_written)); refcount_destroy_many(&tx->tx_space_freed, refcount_count(&tx->tx_space_freed)); #endif kmem_free(tx, sizeof (dmu_tx_t)); } uint64_t dmu_tx_get_txg(dmu_tx_t *tx) { ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } dsl_pool_t * dmu_tx_pool(dmu_tx_t *tx) { ASSERT(tx->tx_pool != NULL); return (tx->tx_pool); } void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) { dmu_tx_callback_t *dcb; dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_PUSHPAGE); dcb->dcb_func = func; dcb->dcb_data = data; list_insert_tail(&tx->tx_callbacks, dcb); } /* * Call all the commit callbacks on a list, with a given error code. */ void dmu_tx_do_callbacks(list_t *cb_list, int error) { dmu_tx_callback_t *dcb; while ((dcb = list_head(cb_list))) { list_remove(cb_list, dcb); dcb->dcb_func(dcb->dcb_data, error); kmem_free(dcb, sizeof (dmu_tx_callback_t)); } } /* * Interface to hold a bunch of attributes. * used for creating new files. * attrsize is the total size of all attributes * to be added during object creation * * For updating/adding a single attribute dmu_tx_hold_sa() should be used. */ /* * hold necessary attribute name for attribute registration. * should be a very rare case where this is needed. If it does * happen it would only happen on the first write to the file system. */ static void dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx) { int i; if (!sa->sa_need_attr_registration) return; for (i = 0; i != sa->sa_num_attrs; i++) { if (!sa->sa_attr_table[i].sa_registered) { if (sa->sa_reg_attr_obj) dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj, B_TRUE, sa->sa_attr_table[i].sa_name); else dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, sa->sa_attr_table[i].sa_name); } } } void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object) { dnode_t *dn; dmu_tx_hold_t *txh; txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_SPILL, 0, 0); if (txh == NULL) return; dn = txh->txh_dnode; if (dn == NULL) return; /* If blkptr doesn't exist then add space to towrite */ if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { txh->txh_space_towrite += SPA_MAXBLOCKSIZE; } else { blkptr_t *bp; bp = &dn->dn_phys->dn_spill; if (dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, bp, bp->blk_birth)) txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; else txh->txh_space_towrite += SPA_MAXBLOCKSIZE; if (!BP_IS_HOLE(bp)) txh->txh_space_tounref += SPA_MAXBLOCKSIZE; } } void dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize) { sa_os_t *sa = tx->tx_objset->os_sa; dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_layout_attr_obj) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); else { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill) return; (void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPILL, 0, 0); } /* * Hold SA attribute * * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size) * * variable_size is the total size of all variable sized attributes * passed to this function. It is not the total size of all * variable size attributes that *may* exist on this object. */ void dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) { uint64_t object; sa_os_t *sa = tx->tx_objset->os_sa; ASSERT(hdl != NULL); object = sa_handle_object(hdl); dmu_tx_hold_bonus(tx, object); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); if (sa->sa_force_spill || may_grow || hdl->sa_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_have_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } DB_DNODE_EXIT(db); } } void dmu_tx_init(void) { dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc", KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (dmu_tx_ksp != NULL) { dmu_tx_ksp->ks_data = &dmu_tx_stats; kstat_install(dmu_tx_ksp); } } void dmu_tx_fini(void) { if (dmu_tx_ksp != NULL) { kstat_delete(dmu_tx_ksp); dmu_tx_ksp = NULL; } } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dmu_tx_create); EXPORT_SYMBOL(dmu_tx_hold_write); EXPORT_SYMBOL(dmu_tx_hold_free); EXPORT_SYMBOL(dmu_tx_hold_zap); EXPORT_SYMBOL(dmu_tx_hold_bonus); EXPORT_SYMBOL(dmu_tx_abort); EXPORT_SYMBOL(dmu_tx_assign); EXPORT_SYMBOL(dmu_tx_wait); EXPORT_SYMBOL(dmu_tx_commit); EXPORT_SYMBOL(dmu_tx_get_txg); EXPORT_SYMBOL(dmu_tx_callback_register); EXPORT_SYMBOL(dmu_tx_do_callbacks); EXPORT_SYMBOL(dmu_tx_hold_spill); EXPORT_SYMBOL(dmu_tx_hold_sa_create); EXPORT_SYMBOL(dmu_tx_hold_sa); #endif diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c index f649bba4c880..fb7cd2cd653c 100644 --- a/module/zfs/dsl_dir.c +++ b/module/zfs/dsl_dir.c @@ -1,1376 +1,1378 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013 Martin Matuska. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); /* ARGSUSED */ static void dsl_dir_evict(dmu_buf_t *db, void *arg) { dsl_dir_t *dd = arg; int t; ASSERTV(dsl_pool_t *dp = dd->dd_pool); for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); ASSERT(dd->dd_space_towrite[t] == 0); } if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); spa_close(dd->dd_pool->dp_spa, dd); /* * The props callback list should have been cleaned up by * objset_evict(). */ list_destroy(&dd->dd_prop_cbs); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; int err; ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_type, ==, DMU_OT_DSL_DIR); ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); } #endif if (dd == NULL) { dsl_dir_t *winner; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_PUSHPAGE); dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; dd->dd_phys = dbuf->db_data; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_node)); dsl_dir_snap_cmtime_update(dd); if (dd->dd_phys->dd_parent_obj) { err = dsl_dir_hold_obj(dp, dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); if (err != 0) goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, dd->dd_parent->dd_phys->dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { err = zap_value_search(dp->dp_meta_objset, dd->dd_parent->dd_phys->dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } if (err != 0) goto errout; } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } if (dsl_dir_is_clone(dd)) { dmu_buf_t *origin_bonus; dsl_dataset_phys_t *origin_phys; /* * We can't open the origin dataset, because * that would require opening this dsl_dir. * Just look at its phys directly instead. */ err = dmu_bonus_hold(dp->dp_meta_objset, dd->dd_phys->dd_origin_obj, FTAG, &origin_bonus); if (err != 0) goto errout; origin_phys = origin_bonus->db_data; dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); } winner = dmu_buf_set_user_ie(dbuf, dd, &dd->dd_phys, dsl_dir_evict); if (winner) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; } else { spa_open_ref(dp->dp_spa, dd); } } /* * The dsl_dir_t has both open-to-close and instantiate-to-evict * holds on the spa. We need the open-to-close holds because * otherwise the spa_refcnt wouldn't change when we open a * dir which the spa also has open, so we could incorrectly * think it was OK to unload/export/destroy the pool. We need * the instantiate-to-evict hold because the dsl_dir_t has a * pointer to the dd_pool, which has a pointer to the spa_t. */ spa_open_ref(dp->dp_spa, tag); ASSERT3P(dd->dd_pool, ==, dp); ASSERT3U(dd->dd_object, ==, ddobj); ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); errout: if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); } void dsl_dir_rele(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ void dsl_dir_name(dsl_dir_t *dd, char *buf) { if (dd->dd_parent) { dsl_dir_name(dd->dd_parent, buf); (void) strcat(buf, "/"); } else { buf[0] = '\0'; } if (!MUTEX_HELD(&dd->dd_lock)) { /* * recursive mutex so that we can use * dprintf_dd() with dd_lock held */ mutex_enter(&dd->dd_lock); (void) strcat(buf, dd->dd_myname); mutex_exit(&dd->dd_lock); } else { (void) strcat(buf, dd->dd_myname); } } /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ int dsl_dir_namelen(dsl_dir_t *dd) { int result = 0; if (dd->dd_parent) { /* parent's name + 1 for the "/" */ result = dsl_dir_namelen(dd->dd_parent) + 1; } if (!MUTEX_HELD(&dd->dd_lock)) { /* see dsl_dir_name */ mutex_enter(&dd->dd_lock); result += strlen(dd->dd_myname); mutex_exit(&dd->dd_lock); } else { result += strlen(dd->dd_myname); } return (result); } static int getcomponent(const char *path, char *component, const char **nextp) { char *p; if ((path == NULL) || (path[0] == '\0')) return (SET_ERROR(ENOENT)); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); if (p && (p[1] == '/' || p[1] == '@')) { /* two separators in a row */ return (SET_ERROR(EINVAL)); } if (p == NULL || p == path) { /* * if the first thing is an @ or /, it had better be an * @ and it had better not have any more ats or slashes, * and it had better have something after the @. */ if (p != NULL && (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) return (SET_ERROR(EINVAL)); if (strlen(path) >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strcpy(component, path); p = NULL; } else if (p[0] == '/') { if (p - path >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; p++; } else if (p[0] == '@') { /* * if the next separator is an @, there better not be * any more slashes. */ if (strchr(path, '/')) return (SET_ERROR(EINVAL)); if (p - path >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; } else { panic("invalid p=%p", (void *)p); } *nextp = p; return (0); } /* * Return the dsl_dir_t, and possibly the last component which couldn't * be found in *tail. The name must be in the specified dsl_pool_t. This * thread must hold the dp_config_rwlock for the pool. Returns NULL if the * path is bogus, or if tail==NULL and we couldn't parse the whole name. * (*tail)[0] == '@' means that the last component is a snapshot. */ int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { char *buf; const char *spaname, *next, *nextnext = NULL; int err; dsl_dir_t *dd; uint64_t ddobj; buf = kmem_alloc(MAXNAMELEN, KM_PUSHPAGE); err = getcomponent(name, buf, &next); if (err != 0) goto error; /* Make sure the name is in the specified pool. */ spaname = spa_name(dp->dp_spa); if (strcmp(buf, spaname) != 0) { err = SET_ERROR(EINVAL); goto error; } ASSERT(dsl_pool_config_held(dp)); err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); if (err != 0) { goto error; } while (next != NULL) { dsl_dir_t *child_ds; err = getcomponent(next, buf, &nextnext); if (err != 0) break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", buf, dd->dd_phys->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, dd->dd_phys->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); if (err != 0) { if (err == ENOENT) err = 0; break; } err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds); if (err != 0) break; dsl_dir_rele(dd, tag); dd = child_ds; next = nextnext; } if (err != 0) { dsl_dir_rele(dd, tag); goto error; } /* * It's an error if there's more than one component left, or * tailp==NULL and there's any component left. */ if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ dsl_dir_rele(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); err = SET_ERROR(ENOENT); } if (tailp != NULL) *tailp = next; *ddp = dd; error: kmem_free(buf, MAXNAMELEN); return (err); } uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { VERIFY(0 == zap_add(mos, pds->dd_phys->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); ddphys = dbuf->db_data; ddphys->dd_creation_time = gethrestime_sec(); if (pds) ddphys->dd_parent_obj = pds->dd_object; ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); } boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { return (dd->dd_phys->dd_origin_obj && (dd->dd_pool->dp_origin_snap == NULL || dd->dd_phys->dd_origin_obj != dd->dd_pool->dp_origin_snap->ds_object)); } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dd->dd_phys->dd_used_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dd->dd_phys->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, dd->dd_phys->dd_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, dd->dd_phys->dd_compressed_bytes == 0 ? 100 : (dd->dd_phys->dd_uncompressed_bytes * 100 / dd->dd_phys->dd_compressed_bytes)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, dd->dd_phys->dd_uncompressed_bytes); if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, dd->dd_phys->dd_used_breakdown[DD_USED_SNAP]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, dd->dd_phys->dd_used_breakdown[DD_USED_HEAD]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, dd->dd_phys->dd_used_breakdown[DD_USED_REFRSRV]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, dd->dd_phys->dd_used_breakdown[DD_USED_CHILD] + dd->dd_phys->dd_used_breakdown[DD_USED_CHILD_RSRV]); } mutex_exit(&dd->dd_lock); if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } } void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; ASSERT(dd->dd_phys); if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } } static int64_t parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) { uint64_t old_accounted = MAX(used, dd->dd_phys->dd_reserved); uint64_t new_accounted = MAX(used + delta, dd->dd_phys->dd_reserved); return (new_accounted - old_accounted); } void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); mutex_enter(&dd->dd_lock); ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ dmu_buf_rele(dd->dd_dbuf, dd); } static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd) { uint64_t space = 0; int i; ASSERT(MUTEX_HELD(&dd->dd_lock)); for (i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i&TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); } return (space); } /* * How much space would dd have available if ancestor had delta applied * to it? If ondiskonly is set, we're only interested in what's * on-disk, not estimated pending changes. */ uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly) { uint64_t parentspace, myspace, quota, used; /* * If there are no restrictions otherwise, assume we have * unlimited space available. */ quota = UINT64_MAX; parentspace = UINT64_MAX; if (dd->dd_parent != NULL) { parentspace = dsl_dir_space_available(dd->dd_parent, ancestor, delta, ondiskonly); } mutex_enter(&dd->dd_lock); if (dd->dd_phys->dd_quota != 0) quota = dd->dd_phys->dd_quota; used = dd->dd_phys->dd_used_bytes; if (!ondiskonly) used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); quota = MIN(quota, poolsize); } if (dd->dd_phys->dd_reserved > used && parentspace != UINT64_MAX) { /* * We have some space reserved, in addition to what our * parent gave us. */ parentspace += dd->dd_phys->dd_reserved - used; } if (dd == ancestor) { ASSERT(delta <= 0); ASSERT(used >= -delta); used += delta; if (parentspace != UINT64_MAX) parentspace -= delta; } if (used > quota) { /* over quota */ myspace = 0; } else { /* * the lesser of the space provided by our parent and * the space left in our quota */ myspace = MIN(parentspace, quota - used); } mutex_exit(&dd->dd_lock); return (myspace); } struct tempreserve { list_node_t tr_node; dsl_dir_t *tr_ds; uint64_t tr_size; }; static int dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, dmu_tx_t *tx, boolean_t first) { uint64_t txg = tx->tx_txg; uint64_t est_inflight, used_on_disk, quota, parent_rsrv; uint64_t deferred = 0; struct tempreserve *tr; int retval = EDQUOT; int txgidx = txg & TXG_MASK; int i; uint64_t ref_rsrv = 0; ASSERT3U(txg, !=, 0); ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; used_on_disk = dd->dd_phys->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and * refreservation values. Also, if checkrefquota is set, test if * allocating this space would exceed the dataset's refquota. */ if (first && tx->tx_objset) { int error; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, checkrefquota, asize, est_inflight, &used_on_disk, &ref_rsrv); if (error) { mutex_exit(&dd->dd_lock); + DMU_TX_STAT_BUMP(dmu_tx_quota); return (error); } } /* * If this transaction will result in a net free of space, * we want to let it through. */ if (ignorequota || netfree || dd->dd_phys->dd_quota == 0) quota = UINT64_MAX; else quota = dd->dd_phys->dd_quota; /* * Adjust the quota against the actual pool size at the root * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. In cases where * we're very close to full, this will allow a steady trickle of * removes to get through. */ if (dd->dd_parent == NULL) { spa_t *spa = dd->dd_pool->dp_spa; uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); deferred = metaslab_class_get_deferred(spa_normal_class(spa)); if (poolsize - deferred < quota) { quota = poolsize - deferred; retval = ENOSPC; } } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). */ if (used_on_disk + est_inflight >= quota) { if (est_inflight > 0 || used_on_disk < quota || (retval == ENOSPC && used_on_disk < quota + deferred)) retval = ERESTART; dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", used_on_disk>>10, est_inflight>>10, quota>>10, asize>>10, retval); mutex_exit(&dd->dd_lock); + DMU_TX_STAT_BUMP(dmu_tx_quota); return (SET_ERROR(retval)); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txgidx] += asize; parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize - ref_rsrv); mutex_exit(&dd->dd_lock); tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE); tr->tr_ds = dd; tr->tr_size = asize; list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ if (dd->dd_parent && parent_rsrv) { boolean_t ismos = (dd->dd_phys->dd_head_dataset_obj == 0); return (dsl_dir_tempreserve_impl(dd->dd_parent, parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); } else { return (0); } } /* * Reserve space in this dsl_dir, to be used in this tx's txg. * After the space has been dirtied (and dsl_dir_willuse_space() * has been called), the reservation should be canceled, using * dsl_dir_tempreserve_clear(). */ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) { int err; list_t *tr_list; if (asize == 0) { *tr_cookiep = NULL; return (0); } tr_list = kmem_alloc(sizeof (list_t), KM_PUSHPAGE); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); ASSERT3S(fsize, >=, 0); err = arc_tempreserve_space(lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; tr = kmem_zalloc(sizeof (struct tempreserve), KM_PUSHPAGE); tr->tr_size = lsize; list_insert_tail(tr_list, tr); } else { if (err == EAGAIN) { /* * If arc_memory_throttle() detected that pageout * is running and we are low on memory, we delay new * non-pageout transactions to give pageout an * advantage. * * It is unfortunate to be delaying while the caller's * locks are held. */ txg_delay(dd->dd_pool, tx->tx_txg, MSEC2NSEC(10), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } if (err == 0) { err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE, asize > usize, tr_list, tx, TRUE); } if (err != 0) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; return (err); } /* * Clear a temporary reservation that we previously made with * dsl_dir_tempreserve_space(). */ void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) { int txgidx = tx->tx_txg & TXG_MASK; list_t *tr_list = tr_cookie; struct tempreserve *tr; ASSERT3U(tx->tx_txg, !=, 0); if (tr_cookie == NULL) return; while ((tr = list_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; mutex_exit(&tr->tr_ds->dd_lock); } else { arc_tempreserve_clear(tr->tr_size); } list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } kmem_free(tr_list, sizeof (list_t)); } /* * This should be called from open context when we think we're going to write * or free space, for example when dirtying data. Be conservative; it's okay * to write less space or free more, but we don't want to write more or free * less than the amount specified. * * NOTE: The behavior of this function is identical to the Illumos / FreeBSD * version however it has been adjusted to use an iterative rather then * recursive algorithm to minimize stack usage. */ void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; do { mutex_enter(&dd->dd_lock); if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; est_used = dsl_dir_space_towrite(dd) + dd->dd_phys->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); /* Make sure that we clean up dd_space_to* */ dsl_dir_dirty(dd, tx); dd = dd->dd_parent; space = parent_space; } while (space && dd); } /* call from syncing context when we actually write/free space for this dd */ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; /* * dsl_dataset_set_refreservation_sync_impl() calls this with * dd_lock held, so that it can atomically update * ds->ds_reserved and the dsl_dir accounting, so that * dsl_dataset_check_quota() can see dataset and dir accounting * consistently. */ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); dmu_buf_will_dirty(dd->dd_dbuf, tx); if (needlock) mutex_enter(&dd->dd_lock); accounted_delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, used); ASSERT(used >= 0 || dd->dd_phys->dd_used_bytes >= -used); ASSERT(compressed >= 0 || dd->dd_phys->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dd->dd_phys->dd_uncompressed_bytes >= -uncompressed); dd->dd_phys->dd_used_bytes += used; dd->dd_phys->dd_uncompressed_bytes += uncompressed; dd->dd_phys->dd_compressed_bytes += compressed; if (dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(used > 0 || dd->dd_phys->dd_used_breakdown[type] >= -used); dd->dd_phys->dd_used_breakdown[type] += used; #ifdef DEBUG { dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) u += dd->dd_phys->dd_used_breakdown[t]; ASSERT3U(u, ==, dd->dd_phys->dd_used_bytes); } #endif } if (needlock) mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, accounted_delta, compressed, uncompressed, tx); dsl_dir_transfer_space(dd->dd_parent, used - accounted_delta, DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); } } void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); if (delta == 0 || !(dd->dd_phys->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? dd->dd_phys->dd_used_breakdown[oldtype] >= delta : dd->dd_phys->dd_used_breakdown[newtype] >= -delta); ASSERT(dd->dd_phys->dd_used_bytes >= ABS(delta)); dd->dd_phys->dd_used_breakdown[oldtype] -= delta; dd->dd_phys->dd_used_breakdown[newtype] += delta; mutex_exit(&dd->dd_lock); } typedef struct dsl_dir_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; uint64_t ddsqra_value; } dsl_dir_set_qr_arg_t; static int dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t towrite, newval; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); error = dsl_prop_predict(ds->ds_dir, "quota", ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (newval == 0) { dsl_dataset_rele(ds, FTAG); return (0); } mutex_enter(&ds->ds_dir->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. */ towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && (newval < ds->ds_dir->dd_phys->dd_reserved || newval < ds->ds_dir->dd_phys->dd_used_bytes + towrite)) { error = SET_ERROR(ENOSPC); } mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); return (error); } static void dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); } dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); ds->ds_dir->dd_phys->dd_quota = newval; mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = quota; return (dsl_sync_task(ddname, dsl_dir_set_quota_check, dsl_dir_set_quota_sync, &ddsqra, 0)); } int dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; uint64_t newval, used, avail; int error; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); dd = ds->ds_dir; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. */ if (!dmu_tx_is_syncing(tx)) { dsl_dataset_rele(ds, FTAG); return (0); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; mutex_exit(&dd->dd_lock); if (dd->dd_parent) { avail = dsl_dir_space_available(dd->dd_parent, NULL, 0, FALSE); } else { avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } if (MAX(used, newval) > MAX(used, dd->dd_phys->dd_reserved)) { uint64_t delta = MAX(used, newval) - MAX(used, dd->dd_phys->dd_reserved); if (delta > avail || (dd->dd_phys->dd_quota > 0 && newval > dd->dd_phys->dd_quota)) error = SET_ERROR(ENOSPC); } dsl_dataset_rele(ds, FTAG); return (error); } void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { uint64_t used; int64_t delta; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; delta = MAX(used, value) - MAX(used, dd->dd_phys->dd_reserved); dd->dd_phys->dd_reserved = value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, delta, 0, 0, tx); } mutex_exit(&dd->dd_lock); } static void dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_RESERVATION), (longlong_t)newval); } dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = reservation; return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, dsl_dir_set_reservation_sync, &ddsqra, 0)); } static dsl_dir_t * closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) { for (; ds1; ds1 = ds1->dd_parent) { dsl_dir_t *dd; for (dd = ds2; dd; dd = dd->dd_parent) { if (ds1 == dd) return (dd); } } return (NULL); } /* * If delta is applied to dd, how much of that delta would be applied to * ancestor? Syncing context only. */ static int64_t would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) { if (dd == ancestor) return (delta); mutex_enter(&dd->dd_lock); delta = parent_delta(dd, dd->dd_phys->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } typedef struct dsl_dir_rename_arg { const char *ddra_oldname; const char *ddra_newname; } dsl_dir_rename_arg_t; /* ARGSUSED */ static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { int *deltap = arg; char namebuf[MAXNAMELEN]; dsl_dataset_name(ds, namebuf); if (strlen(namebuf) + *deltap >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int dsl_dir_rename_check(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; int error; int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); /* target dir should exist */ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); if (error != 0) return (error); /* new parent should exist */ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname); if (error != 0) { dsl_dir_rele(dd, FTAG); return (error); } /* can't rename to different pool */ if (dd->dd_pool != newparent->dd_pool) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ENXIO)); } /* new name should not already exist */ if (mynewname == NULL) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EEXIST)); } /* if the name length is growing, validate child name lengths */ if (delta > 0) { error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } if (newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = MAX(dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_reserved); /* no rename into our descendant */ if (closest_common_ancestor(dd, newparent) == dd) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dir_transfer_possible(dd->dd_parent, newparent, myspace); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (0); } static void dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; int error; objset_t *mos = dp->dp_meta_objset; VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname)); /* Log this before we change the name. */ spa_history_log_internal_dd(dd, "rename", tx, "-> %s", ddra->ddra_newname); if (newparent != dd->dd_parent) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dd->dd_phys->dd_used_bytes, -dd->dd_phys->dd_compressed_bytes, -dd->dd_phys->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD, dd->dd_phys->dd_used_bytes, dd->dd_phys->dd_compressed_bytes, dd->dd_phys->dd_uncompressed_bytes, tx); if (dd->dd_phys->dd_reserved > dd->dd_phys->dd_used_bytes) { uint64_t unused_rsrv = dd->dd_phys->dd_reserved - dd->dd_phys->dd_used_bytes; dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ error = zap_remove(mos, dd->dd_parent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, tx); ASSERT0(error); (void) strcpy(dd->dd_myname, mynewname); dsl_dir_rele(dd->dd_parent, dd); dd->dd_phys->dd_parent_obj = newparent->dd_object; VERIFY0(dsl_dir_hold_obj(dp, newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ VERIFY0(zap_add(mos, newparent->dd_phys->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx)); #ifdef _KERNEL zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); #endif dsl_prop_notify_all(dd); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); } int dsl_dir_rename(const char *oldname, const char *newname) { dsl_dir_rename_arg_t ddra; ddra.ddra_oldname = oldname; ddra.ddra_newname = newname; return (dsl_sync_task(oldname, dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3)); } int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); if (avail < space) return (SET_ERROR(ENOSPC)); return (0); } timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd) { timestruc_t t; mutex_enter(&dd->dd_lock); t = dd->dd_snap_cmtime; mutex_exit(&dd->dd_lock); return (t); } void dsl_dir_snap_cmtime_update(dsl_dir_t *dd) { timestruc_t t; gethrestime(&t); mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; mutex_exit(&dd->dd_lock); } #if defined(_KERNEL) && defined(HAVE_SPL) EXPORT_SYMBOL(dsl_dir_set_quota); EXPORT_SYMBOL(dsl_dir_set_reservation); #endif