Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 321548) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c (revision 321549) @@ -1,2265 +1,2306 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. */ /* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ /* Copyright (c) 2013, Joyent, Inc. All rights reserved. */ /* Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #endif /* * Enable/disable nopwrite feature. */ int zfs_nopwrite_enabled = 1; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, nopwrite_enabled, CTLFLAG_RDTUN, &zfs_nopwrite_enabled, 0, "Enable nopwrite feature"); /* * Tunable to control percentage of dirtied blocks from frees in one TXG. 
* After this threshold is crossed, additional dirty blocks from frees * wait until the next TXG. * A value of zero will disable this throttle. */ uint32_t zfs_per_txg_dirty_frees_percent = 30; SYSCTL_INT(_vfs_zfs, OID_AUTO, per_txg_dirty_frees_percent, CTLFLAG_RWTUN, &zfs_per_txg_dirty_frees_percent, 0, "Percentage of dirtied blocks from frees in one txg"); const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, "object directory" }, { DMU_BSWAP_UINT64, TRUE, "object array" }, { DMU_BSWAP_UINT8, TRUE, "packed nvlist" }, { DMU_BSWAP_UINT64, TRUE, "packed nvlist size" }, { DMU_BSWAP_UINT64, TRUE, "bpobj" }, { DMU_BSWAP_UINT64, TRUE, "bpobj header" }, { DMU_BSWAP_UINT64, TRUE, "SPA space map header" }, { DMU_BSWAP_UINT64, TRUE, "SPA space map" }, { DMU_BSWAP_UINT64, TRUE, "ZIL intent log" }, { DMU_BSWAP_DNODE, TRUE, "DMU dnode" }, { DMU_BSWAP_OBJSET, TRUE, "DMU objset" }, { DMU_BSWAP_UINT64, TRUE, "DSL directory" }, { DMU_BSWAP_ZAP, TRUE, "DSL directory child map"}, { DMU_BSWAP_ZAP, TRUE, "DSL dataset snap map" }, { DMU_BSWAP_ZAP, TRUE, "DSL props" }, { DMU_BSWAP_UINT64, TRUE, "DSL dataset" }, { DMU_BSWAP_ZNODE, TRUE, "ZFS znode" }, { DMU_BSWAP_OLDACL, TRUE, "ZFS V0 ACL" }, { DMU_BSWAP_UINT8, FALSE, "ZFS plain file" }, { DMU_BSWAP_ZAP, TRUE, "ZFS directory" }, { DMU_BSWAP_ZAP, TRUE, "ZFS master node" }, { DMU_BSWAP_ZAP, TRUE, "ZFS delete queue" }, { DMU_BSWAP_UINT8, FALSE, "zvol object" }, { DMU_BSWAP_ZAP, TRUE, "zvol prop" }, { DMU_BSWAP_UINT8, FALSE, "other uint8[]" }, { DMU_BSWAP_UINT64, FALSE, "other uint64[]" }, { DMU_BSWAP_ZAP, TRUE, "other ZAP" }, { DMU_BSWAP_ZAP, TRUE, "persistent error log" }, { DMU_BSWAP_UINT8, TRUE, "SPA history" }, { DMU_BSWAP_UINT64, TRUE, "SPA history offsets" }, { DMU_BSWAP_ZAP, TRUE, "Pool properties" }, { DMU_BSWAP_ZAP, TRUE, "DSL permissions" }, { DMU_BSWAP_ACL, TRUE, "ZFS ACL" }, { DMU_BSWAP_UINT8, TRUE, "ZFS SYSACL" }, { DMU_BSWAP_UINT8, TRUE, "FUID table" }, { 
DMU_BSWAP_UINT64, TRUE, "FUID table size" }, { DMU_BSWAP_ZAP, TRUE, "DSL dataset next clones"}, { DMU_BSWAP_ZAP, TRUE, "scan work queue" }, { DMU_BSWAP_ZAP, TRUE, "ZFS user/group used" }, { DMU_BSWAP_ZAP, TRUE, "ZFS user/group quota" }, { DMU_BSWAP_ZAP, TRUE, "snapshot refcount tags"}, { DMU_BSWAP_ZAP, TRUE, "DDT ZAP algorithm" }, { DMU_BSWAP_ZAP, TRUE, "DDT statistics" }, { DMU_BSWAP_UINT8, TRUE, "System attributes" }, { DMU_BSWAP_ZAP, TRUE, "SA master node" }, { DMU_BSWAP_ZAP, TRUE, "SA attr registration" }, { DMU_BSWAP_ZAP, TRUE, "SA attr layouts" }, { DMU_BSWAP_ZAP, TRUE, "scan translations" }, { DMU_BSWAP_UINT8, FALSE, "deduplicated block" }, { DMU_BSWAP_ZAP, TRUE, "DSL deadlist map" }, { DMU_BSWAP_UINT64, TRUE, "DSL deadlist map hdr" }, { DMU_BSWAP_ZAP, TRUE, "DSL dir clones" }, { DMU_BSWAP_UINT64, TRUE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); 
dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; dmu_object_type_t type; 
DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; dmu_buf_impl_t *db; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); *dbp = &db->db; return (0); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. 
*/ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; /* dn_struct_rwlock is taken as reader around dbuf_hold() unless DB_RF_HAVESTRUCT is set in flags */ if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); ASSERT(db != NULL); err = dbuf_read(db, NULL, flags); /* *dbp is only written on success; on a read error the hold is dropped again */ if (err == 0) *dbp = &db->db; else dbuf_rele(db, tag); return (err); } /* * Hold the spill dbuf only if the dnode already has one. * Returns EINVAL when the pool's SPA version is below SPA_VERSION_SA, * ENOENT when dn_have_spill is clear, and otherwise the result of * dmu_spill_hold_by_dnode(). The dnode is reached through the given * bonus dbuf with DB_DNODE(). */ int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { /* DB_RF_HAVESTRUCT: dn_struct_rwlock is already held here */ err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } /* * Hold the spill dbuf of the dnode reached through the given bonus dbuf * (DB_DNODE()), by calling dmu_spill_hold_by_dnode() with DB_RF_CANFAIL. */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than <os, object> -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ static int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio; ASSERT(length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing. 
*/ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); #if defined(_KERNEL) && defined(RACCT) if (racct_enable && !read) { PROC_LOCK(curproc); racct_add_force(curproc, RACCT_WRITEBPS, length); racct_add_force(curproc, RACCT_WRITEIOPS, nblks); PROC_UNLOCK(curproc); } #endif zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); return (SET_ERROR(EIO)); } /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); #ifdef _KERNEL else curthread->td_ru.ru_oublock++; #endif dbp[i] = &db->db; } if ((flags & DMU_READ_NO_PREFETCH) == 0 && DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { dmu_zfetch(&dn->dn_zfetch, blkid, nblks, read && DNODE_IS_CACHEABLE(dn)); } rw_exit(&dn->dn_struct_rwlock); /* wait for async i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state 
== DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } /* * Hold the object's dnode, delegate to dmu_buf_hold_array_by_dnode() with * DMU_READ_PREFETCH, then drop the dnode hold. Returns the dnode_hold() * error if the dnode cannot be held, otherwise the delegate's result. */ static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } /* * Same as dmu_buf_hold_array(), except that the dnode is reached through * the given dbuf (DB_DNODE()) instead of a dnode_hold() lookup. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } /* * Release every non-NULL dbuf in the array with the given tag and free * the array itself. A numbufs of 0 returns immediately without freeing * anything. */ void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch i/os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not in * cache, they will be asynchronously read in. 
*/ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; int nblks, err; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, level, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is the * last block we want to prefetch, and dbuf_whichblock(dn, level, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; } else { /* level 0 with dn_datablkshift == 0: one block if offset is below dn_datablksz, else none */ nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { blkid = dbuf_whichblock(dn, level, offset); for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crash in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed. 
*/ static int get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum) { uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1); /* bytes of data covered by a level-1 indirect block */ uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); ASSERT3U(minimum, <=, *start); if (*start - minimum <= iblkrange * maxblks) { *start = minimum; return (0); } ASSERT(ISP2(iblkrange)); for (uint64_t blks = 0; *start > minimum && blks < maxblks; blks++) { int err; /* * dnode_next_offset(BACKWARDS) will find an allocated L1 * indirect block at or before the input offset. We must * decrement *start so that it is at the end of the region * to search. */ (*start)--; err = dnode_next_offset(dn, DNODE_FIND_BACKWARDS, start, 2, 1, 0); /* if there are no indirect blocks before start, we are done */ if (err == ESRCH) { *start = minimum; break; } else if (err != 0) { return (err); } /* set start to the beginning of this L1 indirect */ *start = P2ALIGN(*start, iblkrange); } if (*start < minimum) *start = minimum; return (0); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; int err; uint64_t dirty_frees_threshold; dsl_pool_t *dp = dmu_objset_pool(os); if (offset >= object_size) return (0); if (zfs_per_txg_dirty_frees_percent <= 100) dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else dirty_frees_threshold = zfs_dirty_data_max / 4; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; uint64_t long_free_dirty_all_txgs = 0; dmu_tx_t *tx; chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); chunk_len = 
chunk_end - chunk_begin; mutex_enter(&dp->dp_lock); for (int t = 0; t < TXG_SIZE; t++) { long_free_dirty_all_txgs += dp->dp_long_free_dirty_pertxg[t]; } mutex_exit(&dp->dp_lock); /* * To avoid filling up a TXG with just frees wait for * the next TXG to open before freeing more chunks if * we have reached the threshold of frees */ if (dirty_frees_threshold != 0 && long_free_dirty_all_txgs >= dirty_frees_threshold) { txg_wait_open(dp, 0); continue; } tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net * reduction in space used. */ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } mutex_enter(&dp->dp_lock); dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] += chunk_len; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, uint64_t, dmu_tx_get_txg(tx)); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); length -= chunk_len; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. 
*/ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == -1ULL || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } -int -dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, +static int +dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { - dnode_t *dn; dmu_buf_t **dbp; - int numbufs, err; + int numbufs, err = 0; - err = dnode_hold(os, object, FTAG, &dn); - if (err) - return (err); - /* * Deal with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. 
*/ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); bcopy((char *)db->db_data + bufoff, buf, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } - dnode_rele(dn, FTAG); return (err); } -void -dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - const void *buf, dmu_tx_t *tx) +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags) { - dmu_buf_t **dbp; - int numbufs, i; + dnode_t *dn; + int err; - if (size == 0) - return; + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) + return (err); - VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, - FALSE, FTAG, &numbufs, &dbp)); + err = dmu_read_impl(dn, offset, size, buf, flags); + dnode_rele(dn, FTAG); + return (err); +} +int +dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, + uint32_t flags) +{ + return (dmu_read_impl(dn, offset, size, buf, flags)); +} + +static void +dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + int i; + for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); bcopy(buf, (char *)db->db_data + bufoff, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } +} + +void +dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + + if 
(size == 0) + return; + + VERIFY0(dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +void +dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + + if (size == 0) + return; + + VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; if (size == 0) return; VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); for (i = 0; i < numbufs; i++) { dmu_buf_t *db = dbp[i]; dmu_buf_will_not_fill(db, tx); } dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_t *db; ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); VERIFY0(dmu_buf_hold_noread(os, object, offset, FTAG, &db)); dmu_buf_write_embedded(db, data, (bp_embedded_type_t)etype, (enum zio_compress)comp, uncompressed_size, compressed_size, byteorder, tx); dmu_buf_rele(db, FTAG); } /* * DMU support for xuio */ kstat_t *xuio_ksp = NULL; int dmu_xuio_init(xuio_t *xuio, int nblk) { dmu_xuio_t *priv; uio_t *uio = &xuio->xu_uio; uio->uio_iovcnt = nblk; uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); priv->cnt = nblk; priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); priv->iovp = uio->uio_iov; XUIO_XUZC_PRIV(xuio) = priv; if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); else 
XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); return (0); } void dmu_xuio_fini(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int nblk = priv->cnt; kmem_free(priv->iovp, nblk * sizeof (iovec_t)); kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); kmem_free(priv, sizeof (dmu_xuio_t)); if (XUIO_XUZC_RW(xuio) == UIO_READ) XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); else XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); } /* * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } * and increase priv->next by 1. */ int dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) { struct iovec *iov; uio_t *uio = &xuio->xu_uio; dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); int i = priv->next++; ASSERT(i < priv->cnt); ASSERT(off + n <= arc_buf_lsize(abuf)); iov = uio->uio_iov + i; iov->iov_base = (char *)abuf->b_data + off; iov->iov_len = n; priv->bufs[i] = abuf; return (0); } int dmu_xuio_cnt(xuio_t *xuio) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); return (priv->cnt); } arc_buf_t * dmu_xuio_arcbuf(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); return (priv->bufs[i]); } void dmu_xuio_clear(xuio_t *xuio, int i) { dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); ASSERT(i < priv->cnt); priv->bufs[i] = NULL; } static void xuio_stat_init(void) { xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (xuio_ksp != NULL) { xuio_ksp->ks_data = &xuio_stats; kstat_install(xuio_ksp); } } static void xuio_stat_fini(void) { if (xuio_ksp != NULL) { kstat_delete(xuio_ksp); xuio_ksp = NULL; } } void xuio_stat_wbuf_copied(void) { XUIOSTAT_BUMP(xuiostat_wbuf_copied); } void xuio_stat_wbuf_nocopy(void) { XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); } #ifdef _KERNEL static int dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; xuio_t *xuio = NULL; /* * NB: we could do this block-at-a-time, but it's nice * to 
be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, TRUE, FTAG, &numbufs, &dbp, 0); if (err) return (err); #ifdef UIO_XUIO if (uio->uio_extflg == UIO_XUIO) xuio = (xuio_t *)uio; #endif for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); if (xuio) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; arc_buf_t *dbuf_abuf = dbi->db_buf; arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); if (!err) { uio->uio_resid -= tocpy; uio->uio_loffset += tocpy; } if (abuf == dbuf_abuf) XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); else XUIOSTAT_BUMP(xuiostat_rbuf_copied); } else { #ifdef illumos err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_READ, uio); #else err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, uio); #endif } if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Read 'size' bytes into the uio buffer. * From object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_read_uio_dnode(dn, uio, size); DB_DNODE_EXIT(db); return (err); } /* * Read 'size' bytes into the uio buffer. * From the specified object * Starting at offset uio->uio_loffset. 
*/ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } static int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); #ifdef illumos /* * XXX uiomove could block forever (eg. nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that uiomove won't * block. */ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); #else err = vn_io_fault_uiomove((char *)db->db_data + bufoff, tocpy, uio); #endif if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. 
* To the specified object. * Starting at offset uio->uio_loffset. */ int dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_write_uio_dnode(dn, uio, size, tx); dnode_rele(dn, FTAG); return (err); } #ifdef illumos int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, page_t *pp, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT(size > 0); ASSERT3U(db->db_size, >=, PAGESIZE); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(pp, S_READ); bcopy(va, (char *)db->db_data + bufoff, thiscpy); zfs_unmap_page(pp, va); pp = pp->p_next; bufoff += PAGESIZE; } if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } #else /* !illumos */ int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, vm_page_t *ma, dmu_tx_t *tx) { dmu_buf_t **dbp; struct sf_buf *sf; int numbufs, i; int err; if (size == 0) return (0); err = dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy, copied, thiscpy; int bufoff; dmu_buf_t *db = dbp[i]; caddr_t va; ASSERT(size > 0); ASSERT3U(db->db_size, >=, PAGESIZE); bufoff = offset - 
db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); for (copied = 0; copied < tocpy; copied += PAGESIZE) { ASSERT3U(ptoa((*ma)->pindex), ==, db->db_offset + bufoff); thiscpy = MIN(PAGESIZE, tocpy - copied); va = zfs_map_page(*ma, &sf); bcopy(va, (char *)db->db_data + bufoff, thiscpy); zfs_unmap_page(sf); ma += 1; bufoff += PAGESIZE; } if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } #endif /* illumos */ #endif /* _KERNEL */ /* * Allocate a loaned anonymous arc buffer. */ arc_buf_t * dmu_request_arcbuf(dmu_buf_t *handle, int size) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); } /* * Free a loaned arc buffer. */ void dmu_return_arcbuf(arc_buf_t *buf) { arc_return_buf(buf, FTAG); arc_buf_destroy(buf, FTAG); } /* * When possible directly assign passed loaned arc buffer to a dbuf. * If this is not possible copy the contents of passed arc buf via * dmu_write(). */ void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, dmu_tx_t *tx) { dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; dnode_t *dn; dmu_buf_impl_t *db; uint32_t blksz = (uint32_t)arc_buf_lsize(buf); uint64_t blkid; DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(dbuf); /* * We can only assign if the offset is aligned, the arc buf is the * same size as the dbuf, and the dbuf is not metadata. 
*/ if (offset == db->db.db_offset && blksz == db->db.db_size) { #ifdef _KERNEL curthread->td_ru.ru_oublock++; #ifdef RACCT if (racct_enable) { PROC_LOCK(curproc); racct_add_force(curproc, RACCT_WRITEBPS, blksz); racct_add_force(curproc, RACCT_WRITEIOPS, 1); PROC_UNLOCK(curproc); } #endif /* RACCT */ #endif /* _KERNEL */ dbuf_assign_arcbuf(db, buf, tx); dbuf_rele(db, FTAG); } else { objset_t *os; uint64_t object; /* compressed bufs must always be assignable to their dbuf */ ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); DB_DNODE_ENTER(dbuf); dn = DB_DNODE(dbuf); os = dn->dn_objset; object = dn->dn_object; DB_DNODE_EXIT(dbuf); dbuf_rele(db, FTAG); dmu_write(os, object, offset, blksz, buf->b_data, tx); dmu_return_arcbuf(buf); XUIOSTAT_BUMP(xuiostat_wbuf_copied); } } typedef struct { dbuf_dirty_record_t *dsa_dr; dmu_sync_cb_t *dsa_done; zgd_t *dsa_zgd; dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dmu_buf_t *db = dsa->dsa_zgd->zgd_db; blkptr_t *bp = zio->io_bp; if (zio->io_error == 0) { if (BP_IS_HOLE(bp)) { /* * A block of zeros may compress to a hole, but the * block size still needs to be known for replay. 
*/ BP_SET_LSIZE(bp, db->db_size); } else if (!BP_IS_EMBEDDED(bp)) { ASSERT(BP_GET_LEVEL(bp) == 0); bp->blk_fill = 1; } } } static void dmu_sync_late_arrival_ready(zio_t *zio) { dmu_sync_ready(zio, NULL, zio->io_private); } /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { dmu_sync_arg_t *dsa = varg; dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); if (zio->io_error == 0) { dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); if (dr->dt.dl.dr_nopwrite) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint8_t chksum = BP_GET_CHECKSUM(bp_orig); ASSERT(BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); ASSERT(zio_checksum_table[chksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE); } dr->dt.dl.dr_overridden_by = *zio->io_bp; dr->dt.dl.dr_override_state = DR_OVERRIDDEN; dr->dt.dl.dr_copies = zio->io_prop.zp_copies; /* * Old style holes are filled with all zeros, whereas * new-style holes maintain their lsize, type, level, * and birth time (see zio_write_compress). While we * need to reset the BP_SET_LSIZE() call that happened * in dmu_sync_ready for old style holes, we do *not* * want to wipe out the information contained in new * style holes. Thus, only zero out the block pointer if * it's an old style hole. */ if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && dr->dt.dl.dr_overridden_by.blk_birth == 0) BP_ZERO(&dr->dt.dl.dr_overridden_by); } else { dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } static void dmu_sync_late_arrival_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; dmu_sync_arg_t *dsa = zio->io_private; blkptr_t *bp_orig = &zio->io_bp_orig; if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { /* * If we didn't allocate a new block (i.e. 
ZIO_FLAG_NOPWRITE) * then there is nothing to do here. Otherwise, free the * newly allocated block in this txg. */ if (zio->io_flags & ZIO_FLAG_NOPWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); ASSERT(zio->io_bp->blk_birth == zio->io_txg); ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); zio_free(zio->io_spa, zio->io_txg, zio->io_bp); } } dmu_tx_commit(dsa->dsa_tx); dsa->dsa_done(dsa->dsa_zgd, zio->io_error); kmem_free(dsa, sizeof (*dsa)); } /* * Issue the sync write for zgd->zgd_db in a newly created and assigned tx * rather than through a dirty record (dsa_dr is left NULL). Returns EIO * if the tx cannot be assigned; otherwise the write is issued with * zio_nowait() and 0 is returned, with completion handled by * dmu_sync_late_arrival_done(). */ static int dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, zio_prop_t *zp, zbookmark_phys_t *zb) { dmu_sync_arg_t *dsa; dmu_tx_t *tx; tx = dmu_tx_create(os); dmu_tx_hold_space(tx, zgd->zgd_db->db_size); if (dmu_tx_assign(tx, TXG_WAIT) != 0) { dmu_tx_abort(tx); /* Make zl_get_data do txg_wait_synced() */ return (SET_ERROR(EIO)); } dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = NULL; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = tx; zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, zgd->zgd_db->db_data, zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); return (0); } /* * Intent log support: sync the block associated with db to disk. * N.B. and XXX: the caller is responsible for making sure that the * data isn't changing while dmu_sync() is writing it. * * Return values: * * EEXIST: this txg has already been synced, so there's nothing to do. * The caller should not log the write. * * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. * The caller should not log the write. * * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * * EIO: could not do the I/O. * The caller should do a txg_wait_synced(). * * 0: the I/O has been initiated. 
* The caller should log this blkptr in the done callback. * It is possible that the I/O will fail, in which case * the error will be reported to the done callback and * propagated to pio from zio_done(). */ int dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { blkptr_t *bp = zgd->zgd_bp; dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; objset_t *os = db->db_objset; dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; dmu_sync_arg_t *dsa; zbookmark_phys_t zb; zio_prop_t zp; dnode_t *dn; ASSERT(pio != NULL); ASSERT(txg != 0); SET_BOOKMARK(&zb, ds->ds_object, db->db.db_object, db->db_level, db->db_blkid); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, ZIO_COMPRESS_INHERIT, &zp); DB_DNODE_EXIT(db); /* * If we're frozen (running ziltest), we always need to generate a bp. */ if (txg > spa_freeze_txg(os->os_spa)) return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() * and us. If we determine that this txg is not yet syncing, * but it begins to sync a moment later, that's OK because the * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ mutex_enter(&db->db_mtx); if (txg <= spa_last_synced_txg(os->os_spa)) { /* * This txg has already synced. There's nothing to do. */ mutex_exit(&db->db_mtx); return (SET_ERROR(EEXIST)); } if (txg <= spa_syncing_txg(os->os_spa)) { /* * This txg is currently syncing, so we can't mess with * the dirty record anymore; just write a new log block. */ mutex_exit(&db->db_mtx); return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = db->db_last_dirty; while (dr && dr->dr_txg != txg) dr = dr->dr_next; if (dr == NULL) { /* * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. 
*/ mutex_exit(&db->db_mtx); return (SET_ERROR(ENOENT)); } ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg); /* * Assume the on-disk data is X, the current syncing data (in * txg - 1) is Y, and the current in-memory data is Z (currently * in dmu_sync). * * We usually want to perform a nopwrite if X and Z are the * same. However, if Y is different (i.e. the BP is going to * change before this write takes effect), then a nopwrite will * be incorrect - we would override with X, which could have * been freed when Y was written. * * (Note that this is not a concern when we are nop-writing from * syncing context, because X and Y must be identical, because * all previous txgs have been synced.) * * Therefore, we disable nopwrite if the current BP could change * before this TXG. There are two ways it could change: by * being dirty (dr_next is non-NULL), or by being freed * (dnode_block_freed()). This behavior is verified by * zio_done(), which VERIFYs that the override BP is identical * to the on-disk BP. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) zp.zp_nopwrite = B_FALSE; DB_DNODE_EXIT(db); ASSERT(dr->dr_txg == txg); if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * We have already issued a sync write for this buffer, * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. 
*/ mutex_exit(&db->db_mtx); return (SET_ERROR(EALREADY)); } ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; mutex_exit(&db->db_mtx); dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); dsa->dsa_dr = dr; dsa->dsa_done = done; dsa->dsa_zgd = zgd; dsa->dsa_tx = NULL; zio_nowait(arc_write(pio, os->os_spa, txg, bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); return (0); } int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); dnode_rele(dn, FTAG); return (err); } void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's checksum function. This * check ensures that the receiving system can understand the * checksum function transmitted. */ ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx) { dnode_t *dn; /* * Send streams include each object's compression function. This * check ensures that the receiving system can understand the * compression function transmitted. 
*/ ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); VERIFY0(dnode_hold(os, object, FTAG, &dn)); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } int zfs_mdcomp_disable = 0; SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RWTUN, &zfs_mdcomp_disable, 0, "Disable metadata compression"); /* * When the "redundant_metadata" property is set to "most", only indirect * blocks of this level and higher will have an additional ditto block. */ int zfs_redundant_metadata_most_ditto_level = 2; void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, enum zio_compress override_compress, zio_prop_t *zp) { dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)); enum zio_checksum checksum = os->os_checksum; enum zio_compress compress = os->os_compress; enum zio_checksum dedup_checksum = os->os_dedup_checksum; boolean_t dedup = B_FALSE; boolean_t nopwrite = B_FALSE; boolean_t dedup_verify = os->os_dedup_verify; int copies = os->os_copies; boolean_t lz4_ac = spa_feature_is_active(os->os_spa, SPA_FEATURE_LZ4_COMPRESS); IMPLY(override_compress == ZIO_COMPRESS_LZ4, lz4_ac); /* * We maintain different write policies for each of the following * types of data: * 1. metadata * 2. preallocated blocks (i.e. level-0 blocks of a dump device) * 3. all other level 0 blocks */ if (ismd) { if (zfs_mdcomp_disable) { compress = ZIO_COMPRESS_EMPTY; } else { /* * XXX -- we should design a compression algorithm * that specializes in arrays of bps. */ compress = zio_compress_select(os->os_spa, ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); } /* * Metadata always gets checksummed. If the data * checksum is multi-bit correctable, and it's not a * ZBT-style checksum, then it's suitable for metadata * as well. Otherwise, the metadata checksum defaults * to fletcher4. 
*/ if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_METADATA) || (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED)) checksum = ZIO_CHECKSUM_FLETCHER_4; if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_MOST && (level >= zfs_redundant_metadata_most_ditto_level || DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) copies++; } else if (wp & WP_NOFILL) { ASSERT(level == 0); /* * If we're writing preallocated blocks, we aren't actually * writing them so don't set any policy properties. These * blocks are currently only used by an external subsystem * outside of zfs (i.e. dump) and not written by the zio * pipeline. */ compress = ZIO_COMPRESS_OFF; checksum = ZIO_CHECKSUM_NOPARITY; } else { compress = zio_compress_select(os->os_spa, dn->dn_compress, compress); checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? zio_checksum_select(dn->dn_checksum, checksum) : dedup_checksum; /* * Determine dedup setting. If we are in dmu_sync(), * we won't actually dedup now because that's all * done in syncing context; but we do want to use the * dedup checksum. If the checksum is not strong * enough to ensure unique signatures, force * dedup_verify. */ if (dedup_checksum != ZIO_CHECKSUM_OFF) { dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; if (!(zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_DEDUP)) dedup_verify = B_TRUE; } /* * Enable nopwrite if we have secure enough checksum * algorithm (see comment in zio_nop_write) and * compression is enabled. We don't enable nopwrite if * dedup is enabled as the two features are mutually * exclusive. */ nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_NOPWRITE) && compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); } zp->zp_checksum = checksum; /* * If we're writing a pre-compressed buffer, the compression type we use * must match the data. 
If it hasn't been compressed yet, then we should * use the value dictated by the policies above. */ zp->zp_compress = override_compress != ZIO_COMPRESS_INHERIT ? override_compress : compress; ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; zp->zp_level = level; zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); zp->zp_dedup = dedup; zp->zp_dedup_verify = dedup && dedup_verify; zp->zp_nopwrite = nopwrite; } int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int err; /* * Sync any current changes before * we go trundling through the block pointers. */ err = dmu_object_wait_synced(os, object); if (err) { return (err); } err = dnode_hold(os, object, FTAG, &dn); if (err) { return (err); } err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0); dnode_rele(dn, FTAG); return (err); } /* * Given the ZFS object, if it contains any dirty nodes * this function flushes all dirty blocks to disk. This * ensures the DMU object info is updated. A more efficient * future version might just find the TXG with the maximum * ID and wait for that to be synced. */ int dmu_object_wait_synced(objset_t *os, uint64_t object) { dnode_t *dn; int error, i; error = dnode_hold(os, object, FTAG, &dn); if (error) { return (error); } for (i = 0; i < TXG_SIZE; i++) { if (list_link_active(&dn->dn_dirty_link[i])) { break; } } dnode_rele(dn, FTAG); if (i != TXG_SIZE) { txg_wait_synced(dmu_objset_pool(os), 0); } return (0); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp; rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 
1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; DB_DNODE_ENTER(db); dmu_object_info_from_dnode(DB_DNODE(db), doi); DB_DNODE_EXIT(db); } /* * Faster still when you only care about the size. * This is specifically optimized for zfs_getattr(). 
*/ void dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, u_longlong_t *nblk512) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); *blksize = dn->dn_datablksz; /* add 1 for dnode space */ *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT) + 1; DB_DNODE_EXIT(db); } void byteswap_uint64_array(void *vbuf, size_t size) { uint64_t *buf = vbuf; size_t count = size >> 3; int i; ASSERT((size & 7) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_64(buf[i]); } void byteswap_uint32_array(void *vbuf, size_t size) { uint32_t *buf = vbuf; size_t count = size >> 2; int i; ASSERT((size & 3) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_32(buf[i]); } void byteswap_uint16_array(void *vbuf, size_t size) { uint16_t *buf = vbuf; size_t count = size >> 1; int i; ASSERT((size & 1) == 0); for (i = 0; i < count; i++) buf[i] = BSWAP_16(buf[i]); } /* ARGSUSED */ void byteswap_uint8_array(void *vbuf, size_t size) { } void dmu_init(void) { zfs_dbgmsg_init(); sa_cache_init(); xuio_stat_init(); dmu_objset_init(); dnode_init(); zfetch_init(); zio_compress_init(); l2arc_init(); arc_init(); dbuf_init(); } void dmu_fini(void) { arc_fini(); /* arc depends on l2arc, so arc must go first */ l2arc_fini(); zfetch_fini(); zio_compress_fini(); dbuf_fini(); dnode_fini(); dmu_objset_fini(); xuio_stat_fini(); sa_cache_fini(); zfs_dbgmsg_fini(); } Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c (revision 321548) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c (revision 321549) @@ -1,234 +1,235 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. 
* * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2015 by Delphix. All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. */ #include #include #include #include #include #include uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { uint64_t object; uint64_t L1_dnode_count = DNODES_PER_BLOCK << (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; mutex_enter(&os->os_obj_lock); for (;;) { object = os->os_obj_next; /* * Each time we polish off a L1 bp worth of dnodes (2^12 * objects), move to another L1 bp that's still reasonably * sparse (at most 1/4 full). Look from the beginning at most * once per txg, but after that keep looking from here. * os_scan_dnodes is set during txg sync if enough objects * have been freed since the previous rescan to justify * backfilling again. If we can't find a suitable block, just * keep going from here. * * Note that dmu_traverse depends on the behavior that we use * multiple blocks of the dnode object before going back to * reuse objects. Any change to this algorithm should preserve * that property or find another solution to the issues * described in traverse_visitbp. 
*/ if (P2PHASE(object, L1_dnode_count) == 0) { uint64_t offset; int error; if (os->os_rescan_dnodes) { offset = 0; os->os_rescan_dnodes = B_FALSE; } else { offset = object << DNODE_SHIFT; } error = dnode_next_offset(DMU_META_DNODE(os), DNODE_FIND_HOLE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); if (error == 0) object = offset >> DNODE_SHIFT; } os->os_obj_next = ++object; /* * XXX We should check for an i/o error here and return * up to our caller. Actually we should pre-read it in * dmu_tx_assign(), but there is currently no mechanism * to do so. */ (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); if (dn) break; if (dmu_object_next(os, &object, B_TRUE, 0) == 0) os->os_obj_next = object - 1; } dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); - dnode_rele(dn, FTAG); - mutex_exit(&os->os_obj_lock); - dmu_tx_add_new_object(tx, os, object); + dmu_tx_add_new_object(tx, dn); + dnode_rele(dn, FTAG); + return (object); } int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { dnode_t *dn; int err; if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (SET_ERROR(EBADF)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); if (err) return (err); dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); + dmu_tx_add_new_object(tx, dn); + dnode_rele(dn, FTAG); - dmu_tx_add_new_object(tx, os, object); return (0); } int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { dnode_t *dn; int err; if (object == DMU_META_DNODE_OBJECT) return (SET_ERROR(EBADF)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err) return (err); dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); dnode_rele(dn, FTAG); return (err); } int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int err; 
ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err) return (err); ASSERT(dn->dn_type != DMU_OT_NONE); dnode_free_range(dn, 0, DMU_OBJECT_END, tx); dnode_free(dn, tx); dnode_rele(dn, FTAG); return (0); } /* * Return (in *objectp) the next object which is allocated (or a hole) * after *object, taking into account only objects that may have been modified * after the specified txg. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) { uint64_t offset = (*objectp + 1) << DNODE_SHIFT; int error; error = dnode_next_offset(DMU_META_DNODE(os), (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; return (error); } /* * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the * refcount on SPA_FEATURE_EXTENSIBLE_DATASET. * * Only for use from syncing context, on MOS objects. */ void dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, dmu_tx_t *tx) { dnode_t *dn; ASSERT(dmu_tx_is_syncing(tx)); VERIFY0(dnode_hold(mos, object, FTAG, &dn)); if (dn->dn_type == DMU_OTN_ZAP_METADATA) { dnode_rele(dn, FTAG); return; } ASSERT3U(dn->dn_type, ==, old_type); ASSERT0(dn->dn_maxblkid); dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = DMU_OTN_ZAP_METADATA; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); mzap_create_impl(mos, object, 0, 0, tx); spa_feature_incr(dmu_objset_spa(mos), SPA_FEATURE_EXTENSIBLE_DATASET, tx); } void dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; dmu_object_type_t t; ASSERT(dmu_tx_is_syncing(tx)); VERIFY0(dnode_hold(mos, object, FTAG, &dn)); t = dn->dn_type; dnode_rele(dn, FTAG); if (t == DMU_OTN_ZAP_METADATA) { spa_feature_decr(dmu_objset_spa(mos), SPA_FEATURE_EXTENSIBLE_DATASET, tx); } VERIFY0(dmu_object_free(mos, object, tx)); } Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c 
=================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 321548) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c (revision 321549) @@ -1,1234 +1,1318 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn, uint64_t arg1, uint64_t arg2); dmu_tx_t * dmu_tx_create_dd(dsl_dir_t *dd) { dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP); tx->tx_dir = dd; if (dd != NULL) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); tx->tx_start = gethrtime(); return (tx); } dmu_tx_t * dmu_tx_create(objset_t *os) { dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; return (tx); } dmu_tx_t * dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg) { dmu_tx_t *tx = dmu_tx_create_dd(NULL); ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); tx->tx_pool = dp; tx->tx_txg = txg; tx->tx_anyobj = TRUE; return (tx); } int dmu_tx_is_syncing(dmu_tx_t *tx) { return (tx->tx_anyobj); } int dmu_tx_private_ok(dmu_tx_t *tx) { return (tx->tx_anyobj); } static dmu_tx_hold_t * -dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, - enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) +dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type, + uint64_t arg1, uint64_t arg2) { dmu_tx_hold_t *txh; - dnode_t *dn = NULL; - int err; - if (object != DMU_NEW_OBJECT) { - err = dnode_hold(os, object, tx, &dn); - if (err) { - tx->tx_err = err; - return (NULL); - } - - if (err == 0 && tx->tx_txg != 0) { + if (dn != NULL) { + (void) refcount_add(&dn->dn_holds, tx); + if (tx->tx_txg != 0) { mutex_enter(&dn->dn_mtx); /* * dn->dn_assigned_txg == tx->tx_txg doesn't pose a * problem, but there's no way for it to happen (for * now, at least). 
*/ ASSERT(dn->dn_assigned_txg == 0); dn->dn_assigned_txg = tx->tx_txg; (void) refcount_add(&dn->dn_tx_holds, tx); mutex_exit(&dn->dn_mtx); } } txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); txh->txh_tx = tx; txh->txh_dnode = dn; refcount_create(&txh->txh_space_towrite); refcount_create(&txh->txh_memory_tohold); txh->txh_type = type; txh->txh_arg1 = arg1; txh->txh_arg2 = arg2; list_insert_tail(&tx->tx_holds, txh); return (txh); } +static dmu_tx_hold_t * +dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, + enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) +{ + dnode_t *dn = NULL; + dmu_tx_hold_t *txh; + int err; + + if (object != DMU_NEW_OBJECT) { + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) { + tx->tx_err = err; + return (NULL); + } + } + txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); + if (dn != NULL) + dnode_rele(dn, FTAG); + return (txh); +} + void -dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object) +dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) { /* * If we're syncing, they can manipulate any object anyhow, and * the hold on the dnode_t can cause problems. */ - if (!dmu_tx_is_syncing(tx)) { - (void) dmu_tx_hold_object_impl(tx, os, - object, THT_NEWOBJECT, 0, 0); - } + if (!dmu_tx_is_syncing(tx)) + (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); } /* * This function reads specified data from disk. The specified data will * be needed to perform the transaction -- i.e, it will be read after * we do dmu_tx_assign(). There are two reasons that we read the data now * (before dmu_tx_assign()): * * 1. Reading it now has potentially better performance. The transaction * has not yet been assigned, so the TXG is not held open, and also the * caller typically has less locks held when calling dmu_tx_hold_*() than * after the transaction has been assigned. This reduces the lock (and txg) * hold times, thus reducing lock contention. * * 2. 
It is easier for callers (primarily the ZPL) to handle i/o errors * that are detected before they start making changes to the DMU state * (i.e. now). Once the transaction has been assigned, and some DMU * state has been changed, it can be difficult to recover from an i/o * error (e.g. to undo the changes already made in memory at the DMU * layer). Typically code to do so does not exist in the caller -- it * assumes that the data has already been cached and thus i/o errors are * not possible. * * It has been observed that the i/o initiated here can be a performance * problem, and it appears to be optional, because we don't look at the * data which is read. However, removing this read would only serve to * move the work elsewhere (after the dmu_tx_assign()), where it may * have a greater impact on performance (in addition to the impact on * fault tolerance noted above). */ static int dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) { int err; dmu_buf_impl_t *db; rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold_level(dn, level, blkid, FTAG); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); dbuf_rele(db, FTAG); return (err); } /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { dnode_t *dn = txh->txh_dnode; int err = 0; if (len == 0) return; (void) refcount_add_many(&txh->txh_space_towrite, len, FTAG); if (refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) err = SET_ERROR(EFBIG); if (dn == NULL) return; /* * For i/o error checking, read the blocks that will be needed * to perform the write: the first and last level-0 blocks (if * they are not aligned, i.e. if they are partial-block writes), * and all the level-1 blocks. 
*/ if (dn->dn_maxblkid == 0) { if (off < dn->dn_datablksz && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { txh->txh_tx->tx_err = err; } } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); /* first level-0 block */ uint64_t start = off >> dn->dn_datablkshift; if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { err = dmu_tx_check_ioerr(zio, dn, 0, start); if (err != 0) { txh->txh_tx->tx_err = err; } } /* last level-0 block */ uint64_t end = (off + len - 1) >> dn->dn_datablkshift; if (end != start && end <= dn->dn_maxblkid && P2PHASE(off + len, dn->dn_datablksz)) { err = dmu_tx_check_ioerr(zio, dn, 0, end); if (err != 0) { txh->txh_tx->tx_err = err; } } /* level-1 blocks */ if (dn->dn_nlevels > 1) { int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = (start >> shft) + 1; i < end >> shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { txh->txh_tx->tx_err = err; } } } err = zio_wait(zio); if (err != 0) { txh->txh_tx->tx_err = err; } } } static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { (void) refcount_add_many(&txh->txh_space_towrite, DNODE_SIZE, FTAG); } void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) { dmu_tx_hold_t *txh; ASSERT0(tx->tx_txg); ASSERT3U(len, <=, DMU_MAX_ACCESS); ASSERT(len == 0 || UINT64_MAX - off >= len - 1); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_WRITE, off, len); - if (txh == NULL) - return; + if (txh != NULL) { + dmu_tx_count_write(txh, off, len); + dmu_tx_count_dnode(txh); + } +} - dmu_tx_count_write(txh, off, len); - dmu_tx_count_dnode(txh); +void +dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); + if (txh != NULL) { + dmu_tx_count_write(txh, 
off, len); + dmu_tx_count_dnode(txh); + } } /* * This function marks the transaction as being a "net free". The end * result is that refquotas will be disabled for this transaction, and * this transaction will be able to use half of the pool space overhead * (see dsl_pool_adjustedsize()). Therefore this function should only * be called for transactions that we expect will not cause a net increase * in the amount of space used (but it's OK if that is occasionally not true). */ void dmu_tx_mark_netfree(dmu_tx_t *tx) { tx->tx_netfree = B_TRUE; } -void -dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) +static void +dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) { + dmu_tx_t *tx; + dnode_t *dn; int err; + zio_t *zio; + tx = txh->txh_tx; ASSERT(tx->tx_txg == 0); - dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_FREE, off, len); - if (txh == NULL) - return; - dnode_t *dn = txh->txh_dnode; + dn = txh->txh_dnode; dmu_tx_count_dnode(txh); if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off; /* * For i/o error checking, we read the first and last level-0 * blocks if they are not aligned, and all the level-1 blocks. * * Note: dbuf_free_range() assumes that we have not instantiated * any level-0 dbufs that will be completely freed. Therefore we must * exercise care to not read or count the first and last blocks * if they are blocksize-aligned. */ if (dn->dn_datablkshift == 0) { if (off != 0 || len < dn->dn_datablksz) dmu_tx_count_write(txh, 0, dn->dn_datablksz); } else { /* first block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off, 1); /* last block will be modified if it is not aligned */ if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift)) dmu_tx_count_write(txh, off + len, 1); } /* * Check level-1 blocks. 
*/ if (dn->dn_nlevels > 1) { int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t start = off >> shift; uint64_t end = (off + len) >> shift; ASSERT(dn->dn_indblkshift != 0); /* * dnode_reallocate() can result in an object with indirect * blocks having an odd data block size. In this case, * just check the single block. */ if (dn->dn_datablkshift == 0) start = end = 0; zio_t *zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); for (uint64_t i = start; i <= end; i++) { uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; if (err == ESRCH || i > end) break; if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } (void) refcount_add_many(&txh->txh_memory_tohold, 1 << dn->dn_indblkshift, FTAG); err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err != 0) { tx->tx_err = err; (void) zio_wait(zio); return; } } err = zio_wait(zio); if (err != 0) { tx->tx_err = err; return; } } } void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) +dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) { + dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_FREE, off, len); + if (txh != NULL) + (void) dmu_tx_hold_free_impl(txh, off, len); +} + +void +dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, uint64_t len) +{ + dmu_tx_hold_t *txh; + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len); + if (txh != NULL) + (void) dmu_tx_hold_free_impl(txh, off, len); +} + +static void +dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, int add, const char *name) +{ + dmu_tx_t *tx = txh->txh_tx; + dnode_t *dn; int err; ASSERT(tx->tx_txg == 0); - dmu_tx_hold_t *txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, - object, THT_ZAP, add, (uintptr_t)name); - if (txh == NULL) - return; - dnode_t *dn = txh->txh_dnode; + dn = txh->txh_dnode; dmu_tx_count_dnode(txh); /* * Modifying a almost-full microzap is around the worst 
case (128KB) * * If it is a fat zap, the worst case would be 7*16KB=112KB: * - 3 blocks overwritten: target leaf, ptrtbl block, header block * - 4 new blocks written if adding: * - 2 blocks for possibly split leaves, * - 2 grown ptrtbl blocks */ (void) refcount_add_many(&txh->txh_space_towrite, MZAP_MAX_BLKSZ, FTAG); if (dn == NULL) return; ASSERT3P(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); if (dn->dn_maxblkid == 0 || name == NULL) { /* * This is a microzap (only one block), or we don't know * the name. Check the first block for i/o errors. */ err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err != 0) { tx->tx_err = err; } } else { /* * Access the name so that we'll check for i/o errors to * the leaf blocks, etc. We ignore ENOENT, as this name * may not yet exist. */ err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); if (err == EIO || err == ECKSUM || err == ENXIO) { tx->tx_err = err; } } } void +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_ZAP, add, (uintptr_t)name); + if (txh != NULL) + dmu_tx_hold_zap_impl(txh, add, name); +} + +void +dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT(dn != NULL); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); + if (txh != NULL) + dmu_tx_hold_zap_impl(txh, add, name); +} + +void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) { dmu_tx_hold_t *txh; ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object, THT_BONUS, 0, 0); + if (txh) + dmu_tx_count_dnode(txh); +} + +void +dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); if (txh) dmu_tx_count_dnode(txh); } void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) { dmu_tx_hold_t *txh; 
ASSERT(tx->tx_txg == 0); txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT, THT_SPACE, space, 0); (void) refcount_add_many(&txh->txh_space_towrite, space, FTAG); } #ifdef ZFS_DEBUG void dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) { boolean_t match_object = B_FALSE; boolean_t match_offset = B_FALSE; DB_DNODE_ENTER(db); dnode_t *dn = DB_DNODE(db); ASSERT(tx->tx_txg != 0); ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); if (tx->tx_anyobj) { DB_DNODE_EXIT(db); return; } /* XXX No checking on the meta dnode for now */ if (db->db.db_object == DMU_META_DNODE_OBJECT) { DB_DNODE_EXIT(db); return; } for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; txh = list_next(&tx->tx_holds, txh)) { ASSERT(dn == NULL || dn->dn_assigned_txg == tx->tx_txg); if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) match_object = TRUE; if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { int datablkshift = dn->dn_datablkshift ? dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; int shift = datablkshift + epbs * db->db_level; uint64_t beginblk = shift >= 64 ? 0 : (txh->txh_arg1 >> shift); uint64_t endblk = shift >= 64 ? 0 : ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); uint64_t blkid = db->db_blkid; /* XXX txh_arg2 better not be zero... */ dprintf("found txh type %x beginblk=%llx endblk=%llx\n", txh->txh_type, beginblk, endblk); switch (txh->txh_type) { case THT_WRITE: if (blkid >= beginblk && blkid <= endblk) match_offset = TRUE; /* * We will let this hold work for the bonus * or spill buffer so that we don't need to * hold it when creating a new object. */ if (blkid == DMU_BONUS_BLKID || blkid == DMU_SPILL_BLKID) match_offset = TRUE; /* * They might have to increase nlevels, * thus dirtying the new TLIBs. Or the * might have to change the block size, * thus dirying the new lvl=0 blk=0. 
*/ if (blkid == 0) match_offset = TRUE; break; case THT_FREE: /* * We will dirty all the level 1 blocks in * the free range and perhaps the first and * last level 0 block. */ if (blkid >= beginblk && (blkid <= endblk || txh->txh_arg2 == DMU_OBJECT_END)) match_offset = TRUE; break; case THT_SPILL: if (blkid == DMU_SPILL_BLKID) match_offset = TRUE; break; case THT_BONUS: if (blkid == DMU_BONUS_BLKID) match_offset = TRUE; break; case THT_ZAP: match_offset = TRUE; break; case THT_NEWOBJECT: match_object = TRUE; break; default: ASSERT(!"bad txh_type"); } } if (match_object && match_offset) { DB_DNODE_EXIT(db); return; } } DB_DNODE_EXIT(db); panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n", (u_longlong_t)db->db.db_object, db->db_level, (u_longlong_t)db->db_blkid); } #endif /* * If we can't do 10 iops, something is wrong. Let us go ahead * and hit zfs_dirty_data_max. */ hrtime_t zfs_delay_max_ns = MSEC2NSEC(100); int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */ /* * We delay transactions when we've determined that the backend storage * isn't able to accommodate the rate of incoming writes. * * If there is already a transaction waiting, we delay relative to when * that transaction finishes waiting. This way the calculated min_time * is independent of the number of threads concurrently executing * transactions. * * If we are the only waiter, wait relative to when the transaction * started, rather than the current time. This credits the transaction for * "time already served", e.g. reading indirect blocks. * * The minimum time for a transaction to take is calculated as: * min_time = scale * (dirty - min) / (max - dirty) * min_time is then capped at zfs_delay_max_ns. * * The delay has two degrees of freedom that can be adjusted via tunables. * The percentage of dirty data at which we start to delay is defined by * zfs_delay_min_dirty_percent. 
This should typically be at or above * zfs_vdev_async_write_active_max_dirty_percent so that we only start to * delay after writing at full speed has failed to keep up with the incoming * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly * speaking, this variable determines the amount of delay at the midpoint of * the curve. * * delay * 10ms +-------------------------------------------------------------*+ * | *| * 9ms + *+ * | *| * 8ms + *+ * | * | * 7ms + * + * | * | * 6ms + * + * | * | * 5ms + * + * | * | * 4ms + * + * | * | * 3ms + * + * | * | * 2ms + (midpoint) * + * | | ** | * 1ms + v *** + * | zfs_delay_scale ----------> ******** | * 0 +-------------------------------------*********----------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note that since the delay is added to the outstanding time remaining on the * most recent transaction, the delay is effectively the inverse of IOPS. * Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve * was chosen such that small changes in the amount of accumulated dirty data * in the first 3/4 of the curve yield relatively small differences in the * amount of delay. * * The effects can be easier to understand when the amount of delay is * represented on a log scale: * * delay * 100ms +-------------------------------------------------------------++ * + + * | | * + *+ * 10ms + *+ * + ** + * | (midpoint) ** | * + | ** + * 1ms + v **** + * + zfs_delay_scale ----------> ***** + * | **** | * + **** + * 100us + ** + * + * + * | * | * + * + * 10us + * + * + + * | | * + + * +--------------------------------------------------------------+ * 0% <- zfs_dirty_data_max -> 100% * * Note here that only as the amount of dirty data approaches its limit does * the delay start to increase rapidly. 
The goal of a properly tuned system * should be to keep the amount of dirty data out of that range by first * ensuring that the appropriate limits are set for the I/O scheduler to reach * optimal throughput on the backend storage, and then by changing the value * of zfs_delay_scale to increase the steepness of the curve. */ static void dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) { dsl_pool_t *dp = tx->tx_pool; uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; hrtime_t wakeup, min_tx_time, now; if (dirty <= delay_min_bytes) return; /* * The caller has already waited until we are under the max. * We make them pass us the amount of dirty data so we don't * have to handle the case of it being >= the max, which could * cause a divide-by-zero if it's == the max. */ ASSERT3U(dirty, <, zfs_dirty_data_max); now = gethrtime(); min_tx_time = zfs_delay_scale * (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); if (now > tx->tx_start + min_tx_time) return; min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, uint64_t, min_tx_time); mutex_enter(&dp->dp_lock); wakeup = MAX(tx->tx_start + min_tx_time, dp->dp_last_wakeup + min_tx_time); dp->dp_last_wakeup = wakeup; mutex_exit(&dp->dp_lock); #ifdef _KERNEL #ifdef illumos mutex_enter(&curthread->t_delay_lock); while (cv_timedwait_hires(&curthread->t_delay_cv, &curthread->t_delay_lock, wakeup, zfs_delay_resolution_ns, CALLOUT_FLAG_ABSOLUTE | CALLOUT_FLAG_ROUNDUP) > 0) continue; mutex_exit(&curthread->t_delay_lock); #else pause_sbt("dmu_tx_delay", wakeup * SBT_1NS, zfs_delay_resolution_ns * SBT_1NS, C_ABSOLUTE); #endif #else hrtime_t delta = wakeup - gethrtime(); struct timespec ts; ts.tv_sec = delta / NANOSEC; ts.tv_nsec = delta % NANOSEC; (void) nanosleep(&ts, NULL); #endif } /* * This routine attempts to assign the transaction to a transaction group. * To do so, we must determine if there is sufficient free space on disk. 
* * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() * on it), then it is assumed that there is sufficient free space, * unless there's insufficient slop space in the pool (see the comment * above spa_slop_shift in spa_misc.c). * * If it is not a "netfree" transaction, then if the data already on disk * is over the allowed usage (e.g. quota), this will fail with EDQUOT or * ENOSPC. Otherwise, if the current rough estimate of pending changes, * plus the rough estimate of this transaction's changes, may exceed the * allowed usage, then this will fail with ERESTART, which will cause the * caller to wait for the pending changes to be written to disk (by waiting * for the next TXG to open), and then check the space usage again. * * The rough estimate of pending changes is comprised of the sum of: * * - this transaction's holds' txh_space_towrite * * - dd_tempreserved[], which is the sum of in-flight transactions' * holds' txh_space_towrite (i.e. those transactions that have called * dmu_tx_assign() but not yet called dmu_tx_commit()). * * - dd_space_towrite[], which is the amount of dirtied dbufs. * * Note that all of these values are inflated by spa_get_worst_case_asize(), * which means that we may get ERESTART well before we are actually in danger * of running out of space, but this also mitigates any small inaccuracies * in the rough estimate (e.g. txh_space_towrite doesn't take into account * indirect blocks, and dd_space_towrite[] doesn't take into account changes * to the MOS). * * Note that due to this algorithm, it is possible to exceed the allowed * usage by one transaction. Also, as we approach the allowed usage, * we will allow a very limited amount of changes into each TXG, thus * decreasing performance. 
*/
static int
dmu_tx_try_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
	spa_t *spa = tx->tx_pool->dp_spa;

	ASSERT0(tx->tx_txg);

	/* An error recorded on the tx before assignment fails it outright. */
	if (tx->tx_err)
		return (tx->tx_err);

	if (spa_suspended(spa)) {
		/*
		 * If the user has indicated a blocking failure mode
		 * then return ERESTART which will block in dmu_tx_wait().
		 * Otherwise, return EIO so that an error can get
		 * propagated back to the VOP calls.
		 *
		 * Note that we always honor the txg_how flag regardless
		 * of the failuremode setting.
		 */
		if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE &&
		    txg_how != TXG_WAIT)
			return (SET_ERROR(EIO));

		return (SET_ERROR(ERESTART));
	}

	/*
	 * Unless this operation has already waited, ask the caller to go
	 * through the dirty-data branch of dmu_tx_wait() (selected by
	 * tx_wait_dirty) when the pool reports that a delay is needed.
	 */
	if (!tx->tx_waited &&
	    dsl_pool_need_dirty_delay(tx->tx_pool)) {
		tx->tx_wait_dirty = B_TRUE;
		return (SET_ERROR(ERESTART));
	}

	tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh);
	tx->tx_needassign_txh = NULL;

	/*
	 * NB: No error returns are allowed after txg_hold_open, but
	 * before processing the dnode holds, due to the
	 * dmu_tx_unassign() logic.
	 */

	uint64_t towrite = 0;
	uint64_t tohold = 0;
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;
		if (dn != NULL) {
			mutex_enter(&dn->dn_mtx);
			/*
			 * The dnode is still assigned to the previous txg.
			 * Remember which hold stopped us so that
			 * dmu_tx_unassign() only undoes the holds before it
			 * and dmu_tx_wait() knows which dnode to wait on.
			 */
			if (dn->dn_assigned_txg == tx->tx_txg - 1) {
				mutex_exit(&dn->dn_mtx);
				tx->tx_needassign_txh = txh;
				return (SET_ERROR(ERESTART));
			}
			if (dn->dn_assigned_txg == 0)
				dn->dn_assigned_txg = tx->tx_txg;
			ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);
			(void) refcount_add(&dn->dn_tx_holds, tx);
			mutex_exit(&dn->dn_mtx);
		}
		/* Accumulate the per-hold space and memory estimates. */
		towrite += refcount_count(&txh->txh_space_towrite);
		tohold += refcount_count(&txh->txh_memory_tohold);
	}

	/* needed allocation: worst-case estimate of write space */
	uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite);
	/* calculate memory footprint estimate */
	uint64_t memory = towrite + tohold;

	/* Only reserve when the tx has a dsl_dir and will write something. */
	if (tx->tx_dir != NULL && asize != 0) {
		int err = dsl_dir_tempreserve_space(tx->tx_dir, memory,
		    asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx);
		if (err != 0)
			return (err);
	}

	return (0);
}

/*
 * Undo a failed dmu_tx_try_assign(): drop the dn_tx_holds references that
 * were taken, release the txg hold, record the txg that was tried in
 * tx_lasttried_txg, and reset tx_txg to 0.  Does nothing if the tx never
 * obtained a txg.
 */
static void
dmu_tx_unassign(dmu_tx_t *tx)
{
	if (tx->tx_txg == 0)
		return;

	txg_rele_to_quiesce(&tx->tx_txgh);

	/*
	 * Walk the transaction's hold list, removing the hold on the
	 * associated dnode, and notifying waiters if the refcount drops to 0.
	 * The walk stops at tx_needassign_txh (NULL means walk the whole
	 * list), i.e. at the hold where dmu_tx_try_assign() gave up.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds);
	    txh != tx->tx_needassign_txh;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;
		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	txg_rele_to_sync(&tx->tx_txgh);

	tx->tx_lasttried_txg = tx->tx_txg;
	tx->tx_txg = 0;
}

/*
 * Assign tx to a transaction group.  txg_how can be one of:
 *
 * (1)	TXG_WAIT.  If the current open txg is full, waits until there's
 *	a new one.  This should be used when you're not holding locks.
 *	It will only fail if we're truly out of space (or over quota).
 *
 * (2)	TXG_NOWAIT.  If we can't assign into the current open txg without
 *	blocking, returns immediately with ERESTART.  This should be used
 *	whenever you're holding locks.  On an ERESTART error, the caller
 *	should drop locks, do a dmu_tx_wait(tx), and try again.
 *
 * (3)	TXG_WAITED.  Like TXG_NOWAIT, but indicates that dmu_tx_wait()
 *	has already been called on behalf of this operation (though
 *	most likely on a different tx).
 *
 * Returns 0 on success, otherwise the error from dmu_tx_try_assign().
 */
int
dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how)
{
	int err;

	ASSERT(tx->tx_txg == 0);
	ASSERT(txg_how == TXG_WAIT || txg_how == TXG_NOWAIT ||
	    txg_how == TXG_WAITED);
	ASSERT(!dsl_pool_sync_context(tx->tx_pool));

	/* If we might wait, we must not hold the config lock. */
	ASSERT(txg_how != TXG_WAIT || !dsl_pool_config_held(tx->tx_pool));

	if (txg_how == TXG_WAITED)
		tx->tx_waited = B_TRUE;

	/*
	 * Retry only for ERESTART under TXG_WAIT; any other combination is
	 * returned to the caller after the partial assignment is undone.
	 */
	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
		dmu_tx_unassign(tx);

		if (err != ERESTART || txg_how != TXG_WAIT)
			return (err);

		dmu_tx_wait(tx);
	}

	txg_rele_to_quiesce(&tx->tx_txgh);

	return (0);
}

/*
 * Block until the condition that made the last assignment attempt return
 * ERESTART has (probably) cleared.  The tx must be unassigned, and the
 * caller must not hold the pool config lock.
 */
void
dmu_tx_wait(dmu_tx_t *tx)
{
	spa_t *spa = tx->tx_pool->dp_spa;
	dsl_pool_t *dp = tx->tx_pool;

	ASSERT(tx->tx_txg == 0);
	ASSERT(!dsl_pool_config_held(tx->tx_pool));

	if (tx->tx_wait_dirty) {
		/*
		 * dmu_tx_try_assign() has determined that we need to wait
		 * because we've consumed much or all of the dirty buffer
		 * space.
		 */
		mutex_enter(&dp->dp_lock);
		while (dp->dp_dirty_total >= zfs_dirty_data_max)
			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
		uint64_t dirty = dp->dp_dirty_total;
		mutex_exit(&dp->dp_lock);

		dmu_tx_delay(tx, dirty);

		tx->tx_wait_dirty = B_FALSE;

		/*
		 * Note: setting tx_waited only has effect if the caller
		 * used TXG_WAIT.  Otherwise they are going to destroy
		 * this tx and try again.  The common case, zfs_write(),
		 * uses TXG_WAIT.
		 */
		tx->tx_waited = B_TRUE;
	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
		/*
		 * If the pool is suspended we need to wait until it
		 * is resumed.  Note that it's possible that the pool
		 * has become active after this thread has tried to
		 * obtain a tx.  If that's the case then tx_lasttried_txg
		 * would not have been set.
		 */
		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
	} else if (tx->tx_needassign_txh) {
		/*
		 * A dnode is assigned to the quiescing txg.  Wait for its
		 * transaction to complete.
		 */
		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;

		mutex_enter(&dn->dn_mtx);
		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
		mutex_exit(&dn->dn_mtx);
		tx->tx_needassign_txh = NULL;
	} else {
		/* Otherwise wait for the txg after the one we last tried. */
		txg_wait_open(tx->tx_pool, tx->tx_lasttried_txg + 1);
	}
}

/*
 * Free a transaction: tear down every hold (releasing the dnode reference
 * tagged with tx, if the hold has a dnode), destroy both lists, and free
 * the dmu_tx_t itself.
 */
static void
dmu_tx_destroy(dmu_tx_t *tx)
{
	dmu_tx_hold_t *txh;

	while ((txh = list_head(&tx->tx_holds)) != NULL) {
		/* Save the dnode pointer; txh is freed before it is used. */
		dnode_t *dn = txh->txh_dnode;

		list_remove(&tx->tx_holds, txh);
		refcount_destroy_many(&txh->txh_space_towrite,
		    refcount_count(&txh->txh_space_towrite));
		refcount_destroy_many(&txh->txh_memory_tohold,
		    refcount_count(&txh->txh_memory_tohold));
		kmem_free(txh, sizeof (dmu_tx_hold_t));
		if (dn != NULL)
			dnode_rele(dn, tx);
	}

	list_destroy(&tx->tx_callbacks);
	list_destroy(&tx->tx_holds);
	kmem_free(tx, sizeof (dmu_tx_t));
}

/*
 * Commit an assigned transaction: drop its dnode tx-holds, clear any
 * temporary space reservation, hand registered callbacks to the txg, and
 * destroy the tx.  The tx must not be used after this call.
 */
void
dmu_tx_commit(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);

	/*
	 * Go through the transaction's hold list and remove holds on
	 * associated dnodes, notifying waiters if no holds remain.
	 */
	for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL;
	    txh = list_next(&tx->tx_holds, txh)) {
		dnode_t *dn = txh->txh_dnode;

		if (dn == NULL)
			continue;

		mutex_enter(&dn->dn_mtx);
		ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg);

		if (refcount_remove(&dn->dn_tx_holds, tx) == 0) {
			dn->dn_assigned_txg = 0;
			cv_broadcast(&dn->dn_notxholds);
		}
		mutex_exit(&dn->dn_mtx);
	}

	if (tx->tx_tempreserve_cookie)
		dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx);

	if (!list_is_empty(&tx->tx_callbacks))
		txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks);

	/*
	 * NOTE(review): tx_anyobj is set by dmu_tx_create_assigned(), which
	 * fills in tx_txg directly rather than via txg_hold_open(), so such
	 * a tx presumably has no txg hold to release here -- confirm.
	 */
	if (tx->tx_anyobj == FALSE)
		txg_rele_to_sync(&tx->tx_txgh);

	dmu_tx_destroy(tx);
}

/*
 * Abandon a transaction that has not been assigned to a txg.  Registered
 * callbacks are invoked with ECANCELED and the tx is destroyed.
 */
void
dmu_tx_abort(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg == 0);

	/*
	 * Call any registered callbacks with an error code.
	 */
	if (!list_is_empty(&tx->tx_callbacks))
		dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED);

	dmu_tx_destroy(tx);
}

/* Return the txg this transaction is assigned to; it must be assigned. */
uint64_t
dmu_tx_get_txg(dmu_tx_t *tx)
{
	ASSERT(tx->tx_txg != 0);
	return (tx->tx_txg);
}

/* Return the dsl_pool_t this transaction belongs to. */
dsl_pool_t *
dmu_tx_pool(dmu_tx_t *tx)
{
	ASSERT(tx->tx_pool != NULL);
	return (tx->tx_pool);
}

/*
 * Append a callback (func, data) to the transaction's callback list.
 * The list is consumed by dmu_tx_commit() or dmu_tx_abort().
 */
void
dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data)
{
	dmu_tx_callback_t *dcb;

	dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP);

	dcb->dcb_func = func;
	dcb->dcb_data = data;

	list_insert_tail(&tx->tx_callbacks, dcb);
}

/*
 * Call all the commit callbacks on a list, with a given error code.
 * Each entry is removed from the list and freed after its function runs,
 * so the list is empty on return.
 */
void
dmu_tx_do_callbacks(list_t *cb_list, int error)
{
	dmu_tx_callback_t *dcb;

	while ((dcb = list_head(cb_list)) != NULL) {
		list_remove(cb_list, dcb);
		dcb->dcb_func(dcb->dcb_data, error);
		kmem_free(dcb, sizeof (dmu_tx_callback_t));
	}
}

/*
 * Interface to hold a bunch of attributes.
 * used for creating new files.
 * attrsize is the total size of all attributes
 * to be added during object creation
 *
 * For updating/adding a single attribute dmu_tx_hold_sa() should be used.
 */

/*
 * hold necessary attribute name for attribute registration.
 * should be a very rare case where this is needed.  If it does
 * happen it would only happen on the first write to the file system.
*/
static void
dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
{
	/* Nothing to hold unless attribute registration is still pending. */
	if (!sa->sa_need_attr_registration)
		return;

	for (int i = 0; i != sa->sa_num_attrs; i++) {
		if (!sa->sa_attr_table[i].sa_registered) {
			/*
			 * Hold the registry ZAP for this attribute's name;
			 * if no registry object exists yet, hold a ZAP add
			 * against a to-be-created object instead.
			 */
			if (sa->sa_reg_attr_obj)
				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
			else
				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
				    B_TRUE, sa->sa_attr_table[i].sa_name);
		}
	}
}

/*
 * Add a THT_SPILL hold on `object' and charge SPA_OLD_MAXBLOCKSIZE bytes
 * of write space to the transaction for it.
 *
 * dmu_tx_hold_object_impl() returns NULL, after recording the error in
 * tx->tx_err, when it cannot hold the object's dnode.  In that case there
 * is no hold to charge and the error surfaces when the tx is assigned, so
 * simply return instead of dereferencing a NULL hold.
 */
void
dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
{
	dmu_tx_hold_t *txh;

	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
	    THT_SPILL, 0, 0);
	if (txh != NULL) {
		(void) refcount_add_many(&txh->txh_space_towrite,
		    SPA_OLD_MAXBLOCKSIZE, FTAG);
	}
}

/*
 * Take the holds needed to create an object carrying `attrsize' bytes of
 * system attributes: a bonus hold for the new object, the SA layout /
 * registry ZAP holds, any pending attribute-registration holds, and, when
 * the attributes cannot fit in the bonus buffer (or spill is forced), a
 * spill hold for the new object.
 */
void
dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
{
	sa_os_t *sa = tx->tx_objset->os_sa;

	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);

	/* No SA master object: none of the SA bookkeeping holds are taken. */
	if (sa->sa_master_obj == 0)
		return;

	if (sa->sa_layout_attr_obj) {
		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
	} else {
		/*
		 * The layout object does not exist yet: hold the master
		 * node entries plus two ZAP adds against new objects.
		 */
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
	}

	dmu_tx_sa_registration_hold(sa, tx);

	/* Attributes fit in the bonus buffer: no spill block is needed. */
	if (attrsize <= DN_MAX_BONUSLEN && !sa->sa_force_spill)
		return;

	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
	    THT_SPILL, 0, 0);
}

/*
 * Hold SA attribute
 *
 * dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *, attribute, add, size)
 *
 * variable_size is the total size of all variable sized attributes
 * passed to this function.  It is not the total size of all
 * variable size attributes that *may* exist on this object.
*/ void dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow) { uint64_t object; sa_os_t *sa = tx->tx_objset->os_sa; ASSERT(hdl != NULL); object = sa_handle_object(hdl); dmu_tx_hold_bonus(tx, object); if (tx->tx_objset->os_sa->sa_master_obj == 0) return; if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 || tx->tx_objset->os_sa->sa_layout_attr_obj == 0) { dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS); dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); } dmu_tx_sa_registration_hold(sa, tx); if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj) dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL); if (sa->sa_force_spill || may_grow || hdl->sa_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } else { dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_have_spill) { ASSERT(tx->tx_txg == 0); dmu_tx_hold_spill(tx, object); } DB_DNODE_EXIT(db); } } Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 321548) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 321549) @@ -1,961 +1,972 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2013 DEY Storage Systems, Inc. * Copyright 2014 HybridCluster. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2013 Saso Kiselkov. All rights reserved. * Copyright (c) 2014 Integros [integros.com] */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct uio; struct xuio; struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; struct file; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef struct dnode dnode_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. 
*/ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_BYTESWAP_MASK 0x3f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). */ #define DMU_OT(byteswap, metadata) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_METADATA) : \ dmu_ot[(ot)].ot_metadata) /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill * is repurposed for embedded BPs. */ #define DMU_OT_HAS_FILL(ot) \ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? 
\ ((ot) & DMU_OT_BYTESWAP_MASK) : \ dmu_ot[(ot)].ot_byteswap) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! 
*/ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). 
*/ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), } dmu_object_type_t; typedef enum txg_how { TXG_WAIT = 1, TXG_NOWAIT, TXG_WAITED, } txg_how_t; void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) #define DS_FIND_SERIALIZE (1<<2) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. 
*/ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, struct nvlist *snaps); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_snapshot_tmp(const char *, const char *, int); int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. 
*/ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" #define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" #define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. * * Return 0 on success, or ENOSPC or EEXIST as specified above. 
*/ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on an objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the data blocksize for an object. * * The object cannot have any blocks allocated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Set the checksum property on a dnode. 
The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, enum zio_compress compress_override, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_read() to * read db_data, dmu_buf_will_dirty() before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release your hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. 
*/ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You mustn't access the dmu_buf_t after releasing your * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. */ void dmu_buf_add_ref(dmu_buf_t *db, void* tag); /* * Attempt to add a reference to a dmu buffer that is in an unknown state, * using a pointer that may have been invalidated by eviction processing. * The request will succeed if the passed in dbuf still represents the * same os/object/blkid, is ineligible for eviction, and has at least * one hold by a user other than the syncer. 
*/ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, uint64_t blkid, void *tag); void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); typedef void dmu_buf_evict_func_t(void *user_ptr); /* * A DMU buffer user object may be associated with a dbuf for the * duration of its lifetime. This allows the user of a dbuf (client) * to attach private data to a dbuf (e.g. in-core only data such as a * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified * when that dbuf has been evicted. Clients typically respond to the * eviction notification by freeing their private data, thus ensuring * the same lifetime for both dbuf and private data. * * The mapping from a dmu_buf_user_t to any client private data is the * client's responsibility. All current consumers of the API with private * data embed a dmu_buf_user_t as the first member of the structure for * their private data. This allows conversions between the two types * with a simple cast. Since the DMU buf user API never needs access * to the private data, other strategies can be employed if necessary * or convenient for the client (e.g. using container_of() to do the * conversion for private data that cannot have the dmu_buf_user_t as * its first member). 
* * Eviction callbacks are executed without the dbuf mutex held or any * other type of mechanism to guarantee that the dbuf is still available. * For this reason, users must assume the dbuf has already been freed * and not reference the dbuf from the callback context. * * Users requesting "immediate eviction" are notified as soon as the dbuf * is only referenced by dirty records (dirties == holds). Otherwise the * notification occurs after eviction processing for the dbuf begins. */ typedef struct dmu_buf_user { /* * Asynchronous user eviction callback state. */ taskq_ent_t dbu_tqent; /* * This instance's eviction function pointers. * * dbu_evict_func_sync is called synchronously and then * dbu_evict_func_async is executed asynchronously on a taskq. */ dmu_buf_evict_func_t *dbu_evict_func_sync; dmu_buf_evict_func_t *dbu_evict_func_async; #ifdef ZFS_DEBUG /* * Pointer to user's dbuf pointer. NULL for clients that do * not associate a dbuf with their user data. * * The dbuf pointer is cleared upon eviction so as to catch * use-after-evict bugs in clients. */ dmu_buf_t **dbu_clear_on_evict_dbufp; #endif } dmu_buf_user_t; /* * Initialize the given dmu_buf_user_t instance with the eviction function * evict_func, to be called when the user is evicted. * * NOTE: This function should only be called once on a given dmu_buf_user_t. * To allow enforcement of this, dbu must already be zeroed on entry. 
*/
/*ARGSUSED*/
inline void
dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync,
    dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp)
{
	/*
	 * Both callback slots must still be NULL on entry; a non-NULL slot
	 * means dbu was not zeroed or has already been initialized.
	 */
	ASSERT(dbu->dbu_evict_func_sync == NULL);
	ASSERT(dbu->dbu_evict_func_async == NULL);

	/* must have at least one evict func */
	IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
	dbu->dbu_evict_func_sync = evict_func_sync;
	dbu->dbu_evict_func_async = evict_func_async;
#ifdef ZFS_DEBUG
	/*
	 * Stored only when ZFS_DEBUG is defined; otherwise
	 * clear_on_evict_dbufp is not referenced in this function.
	 */
	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
#endif
}

/*
 * Attach user data to a dbuf and mark it for normal (when the dbuf's
 * data is cleared or its reference count goes to zero) eviction processing.
 *
 * Returns NULL on success, or the existing user if another user currently
 * owns the buffer.
 */
void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);

/*
 * Attach user data to a dbuf and mark it for immediate (its dirty and
 * reference counts are equal) eviction processing.
 *
 * Returns NULL on success, or the existing user if another user currently
 * owns the buffer.
 */
void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);

/*
 * Replace the current user of a dbuf.
 *
 * If given the current user of a dbuf, replaces the dbuf's user with
 * "new_user" and returns the user data pointer that was replaced.
 * Otherwise returns the current, and unmodified, dbuf user pointer.
 */
void *dmu_buf_replace_user(dmu_buf_t *db, dmu_buf_user_t *old_user, dmu_buf_user_t *new_user);

/*
 * Remove the specified user data for a DMU buffer.
 *
 * Returns the user that was removed on success, or the current user if
 * another user currently owns the buffer.
 */
void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);

/*
 * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
*/ void *dmu_buf_get_user(dmu_buf_t *db); objset_t *dmu_buf_get_objset(dmu_buf_t *db); dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); void dmu_buf_dnode_exit(dmu_buf_t *db); /* Block until any in-progress dmu buf user evictions complete. */ void dmu_buf_user_evict_wait(void); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. 
*/ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); +void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); +void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, + uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); +void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, + const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); +void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. * * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. 
*/ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); +int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, + uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); +void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL #ifdef illumos int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); #else int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); #endif #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); 
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); void dmu_xuio_fini(struct xuio *uio); int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, size_t n); int dmu_xuio_cnt(struct xuio *uio); struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); void dmu_xuio_clear(struct xuio *uio, int i); void xuio_stat_wbuf_copied(void); void xuio_stat_wbuf_nocopy(void); extern boolean_t zfs_prefetch_disable; extern int zfs_max_recordsize; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, enum zio_priority pri); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; uint8_t doi_pad[4]; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; char *ot_name; } dmu_object_type_info_t; typedef struct dmu_object_byteswap_info { arc_byteswap_func_t *ob_func; char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. 
*/ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) 
*/ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ timestruc_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef int objset_used_cb_t(dmu_object_type_t bonustype, void *bonus, uint64_t *userp, uint64_t *groupp); extern void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. 
*/ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct zilog *zgd_zilog; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct rl *zgd_rl; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); /* * Check if a DMU object has any dirty blocks. If so, sync out * all pending transaction groups. Otherwise, this function * does not alter DMU state. This could be improved to only sync * out the necessary transaction groups for this particular * object. */ int dmu_object_wait_synced(objset_t *os, uint64_t object); /* * Initial setup and final teardown. */ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, struct file *fp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; extern int zfs_mdcomp_disable; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h (revision 321548) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_tx.h (revision 321549) @@ -1,152 +1,152 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). 
* You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 2012, 2016 by Delphix. All rights reserved. */ #ifndef _SYS_DMU_TX_H #define _SYS_DMU_TX_H #include #include #include #ifdef __cplusplus extern "C" { #endif struct dmu_buf_impl; struct dmu_tx_hold; struct dnode_link; struct dsl_pool; struct dnode; struct dsl_dir; struct dmu_tx { /* * No synchronization is needed because a tx can only be handled * by one thread. */ list_t tx_holds; /* list of dmu_tx_hold_t */ objset_t *tx_objset; struct dsl_dir *tx_dir; struct dsl_pool *tx_pool; uint64_t tx_txg; uint64_t tx_lastsnap_txg; uint64_t tx_lasttried_txg; txg_handle_t tx_txgh; void *tx_tempreserve_cookie; struct dmu_tx_hold *tx_needassign_txh; /* list of dmu_tx_callback_t on this dmu_tx */ list_t tx_callbacks; /* placeholder for syncing context, doesn't need specific holds */ boolean_t tx_anyobj; /* has this transaction already been delayed? 
*/ boolean_t tx_waited; /* transaction is marked as being a "net free" of space */ boolean_t tx_netfree; /* time this transaction was created */ hrtime_t tx_start; /* need to wait for sufficient dirty space */ boolean_t tx_wait_dirty; int tx_err; }; enum dmu_tx_hold_type { THT_NEWOBJECT, THT_WRITE, THT_BONUS, THT_FREE, THT_ZAP, THT_SPACE, THT_SPILL, THT_NUMTYPES }; typedef struct dmu_tx_hold { dmu_tx_t *txh_tx; list_node_t txh_node; struct dnode *txh_dnode; refcount_t txh_space_towrite; refcount_t txh_memory_tohold; enum dmu_tx_hold_type txh_type; uint64_t txh_arg1; uint64_t txh_arg2; } dmu_tx_hold_t; typedef struct dmu_tx_callback { list_node_t dcb_node; /* linked to tx_callbacks list */ dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ void *dcb_data; /* caller private data */ } dmu_tx_callback_t; /* * These routines are defined in dmu.h, and are called by the user. */ dmu_tx_t *dmu_tx_create(objset_t *dd); int dmu_tx_assign(dmu_tx_t *tx, txg_how_t txg_how); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_abort(dmu_tx_t *tx); uint64_t dmu_tx_get_txg(dmu_tx_t *tx); struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); void dmu_tx_do_callbacks(list_t *cb_list, int error); /* * These routines are defined in dmu_spa.h, and are called by the SPA. */ extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); /* * These routines are only called by the DMU. 
*/ dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); int dmu_tx_is_syncing(dmu_tx_t *tx); int dmu_tx_private_ok(dmu_tx_t *tx); -void dmu_tx_add_new_object(dmu_tx_t *tx, objset_t *os, uint64_t object); +void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn); void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); #ifdef ZFS_DEBUG #define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) #else #define DMU_TX_DIRTY_BUF(tx, db) #endif #ifdef __cplusplus } #endif #endif /* _SYS_DMU_TX_H */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h (revision 321548) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap.h (revision 321549) @@ -1,486 +1,490 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. 
*/ #ifndef _SYS_ZAP_H #define _SYS_ZAP_H /* * ZAP - ZFS Attribute Processor * * The ZAP is a module which sits on top of the DMU (Data Management * Unit) and implements a higher-level storage primitive using DMU * objects. Its primary consumer is the ZPL (ZFS Posix Layer). * * A "zapobj" is a DMU object which the ZAP uses to store attributes. * Users should use only zap routines to access a zapobj - they should * not access the DMU object directly using DMU routines. * * The attributes stored in a zapobj are name-value pairs. The name is * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including * terminating NULL). The value is an array of integers, which may be * 1, 2, 4, or 8 bytes long. The total space used by the array (number * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. * Note that an 8-byte integer value can be used to store the location * (object number) of another dmu object (which may itself be a zapobj). * Note that you can use a zero-length attribute to store a single bit * of information - the attribute is present or not. * * The ZAP routines are thread-safe. However, you must observe the * DMU's restriction that a transaction may not be operated on * concurrently. * * Any of the routines that return an int may return an I/O error (EIO * or ECHECKSUM). * * * Implementation / Performance Notes: * * The ZAP is intended to operate most efficiently on attributes with * short (49 bytes or less) names and single 8-byte values, for which * the microzap will be used. The ZAP should be efficient enough so * that the user does not need to cache these attributes. * * The ZAP's locking scheme makes its routines thread-safe. Operations * on different zapobjs will be processed concurrently. Operations on * the same zapobj which only read data will be processed concurrently.
* Operations on the same zapobj which modify data will be processed * concurrently when there are many attributes in the zapobj (because * the ZAP uses per-block locking - more than 128 * (number of cpus) * small attributes will suffice). */ /* * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C * strings) for the names of attributes, rather than a byte string * bounded by an explicit length. If some day we want to support names * in character sets which have embedded zeros (eg. UTF-16, UTF-32), * we'll have to add routines for using length-bounded strings. */ #include #include #ifdef __cplusplus extern "C" { #endif /* * Specifies matching criteria for ZAP lookups. * MT_NORMALIZE Use ZAP normalization flags, which can include both * unicode normalization and case-insensitivity. * MT_MATCH_CASE Do case-sensitive lookups even if MT_NORMALIZE is * specified and ZAP normalization flags include * U8_TEXTPREP_TOUPPER. */ typedef enum matchtype { MT_NORMALIZE = 1 << 0, MT_MATCH_CASE = 1 << 1, } matchtype_t; typedef enum zap_flags { /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ ZAP_FLAG_HASH64 = 1 << 0, /* Key is binary, not string (zap_add_uint64() can be used) */ ZAP_FLAG_UINT64_KEY = 1 << 1, /* * First word of key (which must be an array of uint64) is * already randomly distributed. */ ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, } zap_flags_t; /* * Create a new zapobj with no attributes and return its object number. 
*/ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx); /* * Initialize an already-allocated object. */ void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) * object number. */ int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); int zap_create_claim_norm(objset_t *ds, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); /* * The zapobj passed in must be a valid ZAP object for all of the * following routines. */ /* * Destroy this zapobj and all its attributes. * * Frees the object number using dmu_object_free. */ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); /* * Manipulate attributes. * * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. */ /* * Retrieve the contents of the attribute with the given name. * * If the requested attribute does not exist, the call will fail and * return ENOENT. * * If 'integer_size' is smaller than the attribute's integer size, the * call will fail and return EINVAL. * * If 'integer_size' is equal to or larger than the attribute's integer * size, the call will succeed and return 0. * * When converting to a larger integer size, the integers will be treated as * unsigned (ie. no sign-extension will be performed). * * 'num_integers' is the length (in integers) of 'buf'. 
* * If the attribute is longer than the buffer, as many integers as will * fit will be transferred to 'buf'. If the entire attribute was not * transferred, the call will return EOVERFLOW. */ int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); /* * If rn_len is nonzero, realname will be set to the name of the found * entry (which may be different from the requested name if matchtype * includes MT_NORMALIZE). * * If normalization_conflictp is not NULL, it will be set if there is * another name with the same case/unicode normalized form. */ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *normalization_conflictp); int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints); int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp); int zap_count_write_by_dnode(dnode_t *dn, const char *name, int add, refcount_t *towrite, refcount_t *tooverwrite); /* * Create an attribute with the given name and value. * * If an attribute with the given name already exists, the call will * fail and return EEXIST.
*/ int zap_add(objset_t *ds, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_by_dnode(dnode_t *dn, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. If an * attribute with the given name does not exist, it will be created. If * an attribute with the given name already exists, the previous value * will be overwritten. The integer_size may be different from the * existing attribute's integer size, in which case the attribute's * integer size will be updated to the new value. */ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified * attribute. * * If the requested attribute does not exist, the call will fail and * return ENOENT. */ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers); int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers); /* * Remove the specified attribute. * * If the specified attribute does not exist, the call will fail and * return ENOENT. 
*/ int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx); +int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap * object. */ int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); /* * Returns (in name) the name of the entry whose (value & mask) * (za_first_integer) is value, or ENOENT if not found. The string * pointed to by name must be at least 256 bytes long. If mask==0, the * match must be exact (ie, same as mask=-1ULL). */ int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name); /* * Transfer all the entries from fromobj into intoobj. Only works on * int_size=8 num_integers=1 values. Fails if there are any duplicated * entries. */ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); /* Same as zap_join, but set the values to 'value'. */ int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, uint64_t value, dmu_tx_t *tx); /* Same as zap_join, but add together any duplicated entries. */ int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); /* * Manipulate entries where the name + value are the "same" (the name is * a stringified version of the value). */ int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, dmu_tx_t *tx); /* Here the key is an int and the value is a different int. 
*/ int zap_add_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx); int zap_update_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx); int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep); int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx); struct zap; struct zap_leaf; typedef struct zap_cursor { /* This structure is opaque! */ objset_t *zc_objset; struct zap *zc_zap; struct zap_leaf *zc_leaf; uint64_t zc_zapobj; uint64_t zc_serialized; uint64_t zc_hash; uint32_t zc_cd; } zap_cursor_t; typedef struct { int za_integer_length; /* * za_normalization_conflict will be set if there are additional * entries with this normalized form (eg, "foo" and "Foo"). */ boolean_t za_normalization_conflict; uint64_t za_num_integers; uint64_t za_first_integer; /* no sign extension for <8byte ints */ char za_name[ZAP_MAXNAMELEN]; } zap_attribute_t; /* * The interface for listing all the attributes of a zapobj can be * thought of as a cursor moving down a list of the attributes one by * one. The cookie returned by the zap_cursor_serialize routine is * persistent across system calls (and across reboot, even). */ /* * Initialize a zap cursor, pointing to the "first" attribute of the * zapobj. You must _fini the cursor when you are done with it. */ void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); void zap_cursor_fini(zap_cursor_t *zc); /* * Get the attribute currently pointed to by the cursor. Returns * ENOENT if at the end of the attributes. */ int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); /* * Advance the cursor to the next attribute. */ void zap_cursor_advance(zap_cursor_t *zc); /* * Get a persistent cookie pointing to the current position of the zap * cursor. The low 4 bits in the cookie are always zero, and thus can * be used to differentiate a serialized cookie from a different type * of value.
The cookie will be less than 2^32 as long as there are * fewer than 2^22 (4.2 million) entries in the zap object. */ uint64_t zap_cursor_serialize(zap_cursor_t *zc); /* * Advance the cursor to the attribute having the given key. */ int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt); /* * Initialize a zap cursor pointing to the position recorded by * zap_cursor_serialize (in the "serialized" argument). You can also * use a "serialized" argument of 0 to start at the beginning of the * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to * zap_cursor_init(...).) */ void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj, uint64_t serialized); #define ZAP_HISTOGRAM_SIZE 10 typedef struct zap_stats { /* * Size of the pointer table (in number of entries). * This is always a power of 2, or zero if it's a microzap. * In general, it should be considerably greater than zs_num_leafs. */ uint64_t zs_ptrtbl_len; uint64_t zs_blocksize; /* size of zap blocks */ /* * The number of blocks used. Note that some blocks may be * wasted because old ptrtbl's and large name/value blocks are * not reused. (Although their space is reclaimed, we don't * reuse those offsets in the object.) */ uint64_t zs_num_blocks; /* * Pointer table values from zap_ptrtbl in the zap_phys_t */ uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */ uint64_t zs_ptrtbl_zt_blk; /* starting block number */ uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ /* * Values of the other members of the zap_phys_t */ uint64_t zs_block_type; /* ZBT_HEADER */ uint64_t zs_magic; /* ZAP_MAGIC */ uint64_t zs_num_leafs; /* The number of leaf blocks */ uint64_t zs_num_entries; /* The number of zap entries */ uint64_t zs_salt; /* salt to stir into hash function */ /* * Histograms. 
For all histograms, the last index * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater * than what can be represented. For example * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number * of leafs with 45 or more entries. */ /* * zs_leafs_with_2n_pointers[n] is the number of leafs with * 2^n pointers to them. */ uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; /* * zs_blocks_with_n5_entries[n] is the number of leafs with * [n*5, (n+1)*5) entries. In the current implementation, there * can be at most 55 entries in any block, but there may be * fewer if the name or value is large, or the block is not * completely full. */ uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; /* * zs_blocks_n_tenths_full[n] is the number of leafs whose * fullness is in the range [n/10, (n+1)/10). */ uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; /* * zs_entries_using_n_chunks[n] is the number of entries which * consume n 24-byte chunks. (Note, large names/values only use * one chunk, but contribute to zs_num_blocks_large.) */ uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; /* * zs_buckets_with_n_entries[n] is the number of buckets (each * leaf has 64 buckets) with n entries. * zs_buckets_with_n_entries[1] should be very close to * zs_num_entries. */ uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; } zap_stats_t; /* * Get statistics about a ZAP object. Note: you need to be aware of the * internal implementation of the ZAP to correctly interpret some of the * statistics. This interface shouldn't be relied on unless you really * know what you're doing.
*/ int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); #ifdef __cplusplus } #endif #endif /* _SYS_ZAP_H */ Index: stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c =================================================================== --- stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c (revision 321548) +++ stable/11/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c (revision 321549) @@ -1,1533 +1,1583 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2016 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. 
*/ #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif extern inline mzap_phys_t *zap_m_phys(zap_t *zap); static int mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) { if (zap->zap_ismicro) return (0); return (zap_f_phys(zap)->zap_flags); } int zap_hashbits(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return (48); else return (28); } uint32_t zap_maxcd(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return ((1<<16)-1); else return (-1U); } static uint64_t zap_hash(zap_name_t *zn) { zap_t *zap = zn->zn_zap; uint64_t h = 0; if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); h = *(uint64_t *)zn->zn_key_orig; } else { h = zap->zap_salt; ASSERT(h != 0); ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { int i; const uint64_t *wp = zn->zn_key_norm; ASSERT(zn->zn_key_intlen == 8); for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) { int j; uint64_t word = *wp; for (j = 0; j < zn->zn_key_intlen; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; } } } else { int i, len; const uint8_t *cp = zn->zn_key_norm; /* * We previously stored the terminating null on * disk, but didn't hash it, so we need to * continue to not hash it. (The * zn_key_*_numints includes the terminating * null for non-binary keys.) */ len = zn->zn_key_norm_numints - 1; ASSERT(zn->zn_key_intlen == 1); for (i = 0; i < len; cp++, i++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF]; } } } /* * Don't use all 64 bits, since we need some in the cookie for * the collision differentiator. We MUST use the high bits, * since those are the ones that we first pay attention to when * chosing the bucket. 
*/ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); return (h); } static int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) { size_t inlen, outlen; int err; ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); inlen = strlen(name) + 1; outlen = ZAP_MAXNAMELEN; err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } boolean_t zap_match(zap_name_t *zn, const char *matchname) { ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); if (zn->zn_matchtype & MT_NORMALIZE) { char norm[ZAP_MAXNAMELEN]; if (zap_normalize(zn->zn_zap, matchname, norm, zn->zn_normflags) != 0) return (B_FALSE); return (strcmp(zn->zn_key_norm, norm) == 0); } else { return (strcmp(zn->zn_key_orig, matchname) == 0); } } void zap_name_free(zap_name_t *zn) { kmem_free(zn, sizeof (zap_name_t)); } zap_name_t * zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = key; zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; zn->zn_matchtype = mt; zn->zn_normflags = zap->zap_normflags; /* * If we're dealing with a case sensitive lookup on a mixed or * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup * will fold case to all caps overriding the lookup request. */ if (mt & MT_MATCH_CASE) zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; if (zap->zap_normflags) { /* * We *must* use zap_normflags because this normalization is * what the hash is computed from. 
*/ if (zap_normalize(zap, key, zn->zn_normbuf, zap->zap_normflags) != 0) { zap_name_free(zn); return (NULL); } zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { if (mt != 0) { zap_name_free(zn); return (NULL); } zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm_numints = zn->zn_key_orig_numints; } zn->zn_hash = zap_hash(zn); if (zap->zap_normflags != zn->zn_normflags) { /* * We *must* use zn_normflags because this normalization is * what the matching is based on. (Not the hash!) */ if (zap_normalize(zap, key, zn->zn_normbuf, zn->zn_normflags) != 0) { zap_name_free(zn); return (NULL); } zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } return (zn); } zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); ASSERT(zap->zap_normflags == 0); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = zn->zn_key_norm = key; zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; zn->zn_matchtype = 0; zn->zn_hash = zap_hash(zn); return (zn); } static void mzap_byteswap(mzap_phys_t *buf, size_t size) { int i, max; buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); buf->mz_normflags = BSWAP_64(buf->mz_normflags); max = (size / MZAP_ENT_LEN) - 1; for (i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = BSWAP_64(buf->mz_chunk[i].mze_value); buf->mz_chunk[i].mze_cd = BSWAP_32(buf->mz_chunk[i].mze_cd); } } void zap_byteswap(void *buf, size_t size) { uint64_t block_type; block_type = *(uint64_t *)buf; if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { /* ASSERT(magic == ZAP_LEAF_MAGIC); */ mzap_byteswap(buf, size); } else { fzap_byteswap(buf, size); } } static int mze_compare(const void *arg1, const void *arg2) { const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; if (mze1->mze_hash > mze2->mze_hash) return (+1); if (mze1->mze_hash < mze2->mze_hash) return 
(-1); if (mze1->mze_cd > mze2->mze_cd) return (+1); if (mze1->mze_cd < mze2->mze_cd) return (-1); return (0); } static int mze_insert(zap_t *zap, int chunkid, uint64_t hash) { mzap_ent_t *mze; avl_index_t idx; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) { kmem_free(mze, sizeof (mzap_ent_t)); return (EEXIST); } avl_insert(&zap->zap_m.zap_avl, mze, idx); return (0); } static mzap_ent_t * mze_find(zap_name_t *zn) { mzap_ent_t mze_tofind; mzap_ent_t *mze; avl_index_t idx; avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); mze_tofind.mze_hash = zn->zn_hash; mze_tofind.mze_cd = 0; mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } return (NULL); } static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; mzap_ent_t *mze; avl_index_t idx; avl_tree_t *avl = &zap->zap_m.zap_avl; uint32_t cd; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; cd = 0; for (mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { if (mze->mze_cd != cd) break; cd++; } return (cd); } static void mze_remove(zap_t *zap, mzap_ent_t *mze) { ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); avl_remove(&zap->zap_m.zap_avl, mze); kmem_free(mze, sizeof (mzap_ent_t)); } static void mze_destroy(zap_t *zap) { mzap_ent_t *mze; void *avlcookie = NULL; while (mze = 
avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) kmem_free(mze, sizeof (mzap_ent_t)); avl_destroy(&zap->zap_m.zap_avl); } static zap_t * mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) { zap_t *winner; zap_t *zap; int i; uint64_t *zap_hdr = (uint64_t *)db->db_data; uint64_t zap_block_type = zap_hdr[0]; uint64_t zap_magic = zap_hdr[1]; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, 0, 0, 0); rw_enter(&zap->zap_rwlock, RW_WRITER); zap->zap_objset = os; zap->zap_object = obj; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { winner = NULL; /* No actual winner here... */ goto handle_winner; } } else { zap->zap_ismicro = TRUE; } /* * Make sure that zap_ismicro is set before we let others see * it, because zap_lockdir() checks zap_ismicro without the lock * held. 
*/ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); winner = dmu_buf_set_user(db, &zap->zap_dbu); if (winner != NULL) goto handle_winner; if (zap->zap_ismicro) { zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { zap_name_t *zn; zn = zap_name_alloc(zap, mze->mze_name, 0); if (mze_insert(zap, i, zn->zn_hash) == 0) zap->zap_m.zap_num_entries++; else { printf("ZFS WARNING: Duplicated ZAP " "entry detected (%s).\n", mze->mze_name); } zap_name_free(zn); } } } else { zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_normflags = zap_f_phys(zap)->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); /* * The embedded pointer table should not overlap the * other members. 
*/ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, &zap_f_phys(zap)->zap_salt); /* * The embedded pointer table should end at the end of * the block */ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 1<zap_dbuf->db_size); } rw_exit(&zap->zap_rwlock); return (zap); handle_winner: rw_exit(&zap->zap_rwlock); rw_destroy(&zap->zap_rwlock); if (!zap->zap_ismicro) mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); return (winner); } static int zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { zap_t *zap; krw_t lt; ASSERT0(db->db_offset); objset_t *os = dmu_buf_get_objset(db); uint64_t obj = db->db_object; *zapp = NULL; #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif zap = dmu_buf_get_user(db); if (zap == NULL) { zap = mzap_open(os, obj, db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. * Check for corruption! */ return (SET_ERROR(EIO)); } } /* * We're checking zap_ismicro without the lock held, in order to * tell what type of lock we want. Once we have some sort of * lock, see if it really is the right type. In practice this * can only be different if it was upgraded from micro to fat, * and micro wanted WRITER but fat only needs READER. */ lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; rw_enter(&zap->zap_rwlock, lt); if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { /* it was upgraded, now we only need reader */ ASSERT(lt == RW_WRITER); ASSERT(RW_READER == (!zap->zap_ismicro && fatreader) ? 
RW_READER : lti); rw_downgrade(&zap->zap_rwlock); lt = RW_READER; } zap->zap_objset = os; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3P(zap->zap_dbuf, ==, db); ASSERT(!zap->zap_ismicro || zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; if (newsz > MZAP_MAX_BLKSZ) { dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; int err = mzap_upgrade(zapp, tag, tx, 0); if (err != 0) rw_exit(&zap->zap_rwlock); return (err); } VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; } *zapp = zap; return (0); } static int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; int err; err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) { return (err); } err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) { dmu_buf_rele(db, tag); } return (err); } int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; int err; err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) dmu_buf_rele(db, tag); return (err); } void zap_unlockdir(zap_t *zap, void *tag) { rw_exit(&zap->zap_rwlock); dmu_buf_rele(zap->zap_dbuf, tag); } static int mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) { mzap_phys_t *mzp; int i, sz, nchunks; int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); sz = zap->zap_dbuf->db_size; mzp = zio_buf_alloc(sz); bcopy(zap->zap_dbuf->db_data, mzp, sz); nchunks = zap->zap_m.zap_num_chunks; if (!flags) { err = 
dmu_object_set_blocksize(zap->zap_objset, zap->zap_object,
		    1ULL << fzap_default_block_shift, 0, tx);
		if (err) {
			/* Nothing was converted yet; just drop the copy. */
			zio_buf_free(mzp, sz);
			return (err);
		}
	}

	dprintf("upgrading obj=%llu with %u chunks\n",
	    zap->zap_object, nchunks);
	/* XXX destroy the avl later, so we can use the stored hash value */
	mze_destroy(zap);

	fzap_upgrade(zap, tx, flags);

	/* Re-add every in-use entry from the saved micro block. */
	for (i = 0; i < nchunks; i++) {
		mzap_ent_phys_t *mze = &mzp->mz_chunk[i];
		zap_name_t *zn;
		if (mze->mze_name[0] == 0)
			continue;	/* empty chunk */
		dprintf("adding %s=%llu\n",
		    mze->mze_name, mze->mze_value);
		zn = zap_name_alloc(zap, mze->mze_name, 0);
		/* One 8-byte integer per entry, keeping its original cd. */
		err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd,
		    tag, tx);
		zap = zn->zn_zap;	/* fzap_add_cd() may change zap */
		zap_name_free(zn);
		if (err)
			break;
	}
	zio_buf_free(mzp, sz);
	*zapp = zap;
	return (err);
}

/*
 * The "normflags" determine the behavior of the matchtype_t which is
 * passed to zap_lookup_norm().  Names which have the same normalized
 * version will be stored with the same hash value, and therefore we can
 * perform normalization-insensitive lookups.  We can be Unicode form-
 * insensitive and/or case-insensitive.  The following flags are valid for
 * "normflags":
 *
 * U8_TEXTPREP_NFC
 * U8_TEXTPREP_NFD
 * U8_TEXTPREP_NFKC
 * U8_TEXTPREP_NFKD
 * U8_TEXTPREP_TOUPPER
 *
 * The *_NF* (Normalization Form) flags are mutually exclusive; at most one
 * of them may be supplied.
 */
void
mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags,
    dmu_tx_t *tx)
{
	dmu_buf_t *db;
	mzap_phys_t *zp;

	VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH));

#ifdef ZFS_DEBUG
	{
		dmu_object_info_t doi;
		dmu_object_info_from_db(db, &doi);
		ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP);
	}
#endif

	/* Write the microzap header fields into the object's first block. */
	dmu_buf_will_dirty(db, tx);
	zp = db->db_data;
	zp->mz_block_type = ZBT_MICRO;
	/* OR-ing in 1 guarantees the salt is never zero. */
	zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL;
	zp->mz_normflags = normflags;
	dmu_buf_rele(db, FTAG);

	if (flags != 0) {
		zap_t *zap;
		/* Only fat zap supports flags; upgrade immediately.
*/
		VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER,
		    B_FALSE, B_FALSE, FTAG, &zap));
		VERIFY3U(0, ==, mzap_upgrade(&zap, FTAG, tx, flags));
		zap_unlockdir(zap, FTAG);
	}
}

/*
 * Claim object number "obj" as a ZAP with normflags of 0; see
 * zap_create_claim_norm().
 */
int
zap_create_claim(objset_t *os, uint64_t obj,
    dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_claim_norm(os, obj,
	    0, ot, bonustype, bonuslen, tx));
}

/*
 * Claim object number "obj" with dmu_object_claim() and initialize it as
 * a microzap with the given normflags.  Returns the dmu_object_claim()
 * error, or 0 on success.
 */
int
zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags,
    dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	int err;

	err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx);
	if (err != 0)
		return (err);
	mzap_create_impl(os, obj, normflags, 0, tx);
	return (0);
}

/*
 * Allocate a new ZAP object with normflags of 0 and return its object
 * number; see zap_create_norm().
 */
uint64_t
zap_create(objset_t *os, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx));
}

/*
 * Allocate a new object with dmu_object_alloc(), initialize it as a
 * microzap with the given normflags, and return its object number.
 */
uint64_t
zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);

	mzap_create_impl(os, obj, normflags, 0, tx);
	return (obj);
}

/*
 * Allocate a new ZAP object with explicit leaf and indirect block shifts
 * and the given zap flags, and return its object number.  Both shifts are
 * asserted to lie within [SPA_MINBLOCKSHIFT, SPA_OLD_MAXBLOCKSHIFT].
 * A non-zero "flags" makes mzap_create_impl() upgrade the new object to
 * a fat zap right away.
 */
uint64_t
zap_create_flags(objset_t *os, int normflags, zap_flags_t flags,
    dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
	uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx);

	ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT &&
	    leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT &&
	    indirect_blockshift >= SPA_MINBLOCKSHIFT &&
	    indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT);

	VERIFY(dmu_object_set_blocksize(os, obj,
	    1ULL << leaf_blockshift, indirect_blockshift, tx) == 0);

	mzap_create_impl(os, obj, normflags, flags, tx);
	return (obj);
}

/* Destroy the ZAP object "zapobj"; returns the dmu_object_free() result. */
int
zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx)
{
	/*
	 * dmu_object_free will free the object number and free the
	 * data.  Freeing the data will cause our pageout function to be
	 * called, which will destroy our data (zap_leaf_t's and zap_t).
*/

	return (dmu_object_free(os, zapobj, tx));
}

/*
 * Eviction callback (registered with dmu_buf_init_user() in mzap_open()):
 * destroy the in-core zap_t.  A microzap additionally drops its entry
 * tree via mze_destroy(); a fat zap destroys its entry-count mutex.
 */
void
zap_evict_sync(void *dbu)
{
	zap_t *zap = dbu;

	rw_destroy(&zap->zap_rwlock);

	if (zap->zap_ismicro)
		mze_destroy(zap);
	else
		mutex_destroy(&zap->zap_f.zap_num_entries_mtx);

	kmem_free(zap, sizeof (zap_t));
}

/*
 * Store the number of entries of ZAP object "zapobj" in *count.  Returns
 * 0 on success or the error from zap_lockdir()/fzap_count().
 */
int
zap_count(objset_t *os, uint64_t zapobj, uint64_t *count)
{
	zap_t *zap;
	int err;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	if (!zap->zap_ismicro) {
		err = fzap_count(zap, count);
	} else {
		/* The microzap keeps its entry count in core. */
		*count = zap->zap_m.zap_num_entries;
	}
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * zn may be NULL; if not specified, it will be computed if needed.
 * See also the comment above zap_entry_normalization_conflict().
 *
 * Returns B_TRUE if another microzap entry with the same hash as "mze"
 * matches the (normalized) name, B_FALSE otherwise or when the zap has
 * no normalization flags.
 */
static boolean_t
mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze)
{
	mzap_ent_t *other;
	int direction = AVL_BEFORE;
	boolean_t allocdzn = B_FALSE;

	if (zap->zap_normflags == 0)
		return (B_FALSE);

again:
	/* Walk the neighbours of mze that share its hash value. */
	for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction);
	    other && other->mze_hash == mze->mze_hash;
	    other = avl_walk(&zap->zap_m.zap_avl, other, direction)) {

		if (zn == NULL) {
			/* Build the normalized name lazily, on first use. */
			zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name,
			    MT_NORMALIZE);
			allocdzn = B_TRUE;
		}
		if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) {
			if (allocdzn)
				zap_name_free(zn);
			return (B_TRUE);
		}
	}

	/* First pass looked before mze; now look after it. */
	if (direction == AVL_BEFORE) {
		direction = AVL_AFTER;
		goto again;
	}

	/* Only free the name if it was allocated here. */
	if (allocdzn)
		zap_name_free(zn);
	return (B_FALSE);
}

/*
 * Routines for manipulating attributes.
*/ int zap_lookup(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm(os, zapobj, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } static int zap_lookup_impl(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { int err = 0; mzap_ent_t *mze; zap_name_t *zn; zn = zap_name_alloc(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (num_integers < 1) { err = SET_ERROR(EOVERFLOW); } else if (integer_size != 8) { err = SET_ERROR(EINVAL); } else { *(uint64_t *)buf = MZE_PHYS(zap, mze)->mze_value; (void) strlcpy(realname, MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze); } } } } zap_name_free(zn); return (err); } int zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm_by_dnode(dn, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) 
return (err);
	err = zap_lookup_impl(zap, name, integer_size,
	    num_integers, buf, mt, realname, rn_len, ncp);
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * Issue a prefetch for the entry with the given uint64-array key.
 * Returns the zap_lockdir() error, ENOTSUP if the key cannot be turned
 * into a zap name, and 0 otherwise.
 *
 * NOTE(review): fzap_prefetch() is called without checking zap_ismicro;
 * presumably uint64-keyed ZAPs are always fat -- confirm.
 */
int
zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints)
{
	zap_t *zap;
	int err;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	fzap_prefetch(zn);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * Look up the entry with the given uint64-array key through
 * fzap_lookup(); no matched name or conflict flag is requested.
 * Returns the zap_lockdir() error, ENOTSUP if the key cannot be turned
 * into a zap name, or the fzap_lookup() result.
 */
int
zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf)
{
	zap_t *zap;
	int err;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}

	err = fzap_lookup(zn, integer_size, num_integers, buf,
	    NULL, 0, NULL);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * Test whether "name" exists.  The lookup is issued with a zero-sized
 * value request, so EOVERFLOW/EINVAL from it mean "entry exists" and are
 * mapped to 0; any other error (e.g. ENOENT) is returned unchanged.
 */
int
zap_contains(objset_t *os, uint64_t zapobj, const char *name)
{
	int err = zap_lookup_norm(os, zapobj, name, 0,
	    0, NULL, 0, NULL, 0, NULL);
	if (err == EOVERFLOW || err == EINVAL)
		err = 0; /* found, but skipped reading the value */
	return (err);
}

/*
 * Report the integer size and count of the value stored under "name".
 * Either output pointer may be NULL in the microzap case.  Returns the
 * zap_lockdir() error, ENOTSUP if the name cannot be allocated, ENOENT
 * if a microzap has no such entry, or the fzap_length() result.
 */
int
zap_length(objset_t *os, uint64_t zapobj, const char *name,
    uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;
	int err;
	mzap_ent_t *mze;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_length(zn, integer_size, num_integers);
	} else {
		mze = mze_find(zn);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			/* Microzap values are always one 8-byte integer. */
			if (integer_size)
				*integer_size = 8;
			if (num_integers)
				*num_integers = 1;
		}
	}
zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * uint64-array-key variant of zap_length(); always goes through
 * fzap_length().  Returns the zap_lockdir() error, ENOTSUP if the key
 * cannot be turned into a zap name, or the fzap_length() result.
 */
int
zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, uint64_t *integer_size, uint64_t *num_integers)
{
	zap_t *zap;
	int err;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_length(zn, integer_size, num_integers);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * Store a new (name, value) pair in the first free chunk of a microzap
 * and insert it into the in-core tree.  zap_rwlock must be held as
 * writer.  The search starts at zap_alloc_next and wraps around to chunk
 * 0 once; finding no free chunk trips the final assertion.
 */
static void
mzap_addent(zap_name_t *zn, uint64_t value)
{
	int i;
	zap_t *zap = zn->zn_zap;
	int start = zap->zap_m.zap_alloc_next;
	uint32_t cd;

	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));

#ifdef ZFS_DEBUG
	/* Debug builds verify that the name is not already present. */
	for (i = 0; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0);
	}
#endif

	cd = mze_find_unused_cd(zap, zn->zn_hash);
	/* given the limited size of the microzap, this can't happen */
	ASSERT(cd < zap_maxcd(zap));

again:
	for (i = start; i < zap->zap_m.zap_num_chunks; i++) {
		mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i];
		if (mze->mze_name[0] == 0) {
			/* An empty name marks a free chunk; claim it. */
			mze->mze_value = value;
			mze->mze_cd = cd;
			(void) strcpy(mze->mze_name, zn->zn_key_orig);
			zap->zap_m.zap_num_entries++;
			/* Remember where to start the next search. */
			zap->zap_m.zap_alloc_next = i+1;
			if (zap->zap_m.zap_alloc_next ==
			    zap->zap_m.zap_num_chunks)
				zap->zap_m.zap_alloc_next = 0;
			VERIFY(0 == mze_insert(zap, i, zn->zn_hash));
			return;
		}
	}
	/* Nothing free from "start" onwards: retry once from chunk 0. */
	if (start != 0) {
		start = 0;
		goto again;
	}
	ASSERT(!"out of entries!");
}

/*
 * Diff hunk: the old zap_add() body becomes zap_add_impl(), which takes
 * an already locked zap plus the caller's hold tag instead of locking
 * the ZAP itself.
 */
-int
-zap_add(objset_t *os, uint64_t zapobj, const char *key,
+static int
+zap_add_impl(zap_t *zap, const char *key,
    int integer_size, uint64_t num_integers,
-    const void *val, dmu_tx_t *tx)
+    const void *val, dmu_tx_t *tx, void *tag)
{
-	zap_t *zap;
-	int err;
+	int err = 0;
	mzap_ent_t *mze;
	const uint64_t *intval = val;
	zap_name_t *zn;

-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG,
&zap);
-	if (err)
-		return (err);
	zn = zap_name_alloc(zap, key, 0);
	if (zn == NULL) {
		/* The impl owns the unlock, even on this early failure. */
-		zap_unlockdir(zap, FTAG);
+		zap_unlockdir(zap, tag);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
-		err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
+		err = fzap_add(zn, integer_size, num_integers, val, tag, tx);
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(key) >= MZAP_NAME_LEN) {
		/*
		 * The entry does not fit the microzap format (single 8-byte
		 * value, name shorter than MZAP_NAME_LEN): upgrade to a fat
		 * zap first, then add.
		 */
-		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
+		err = mzap_upgrade(&zn->zn_zap, tag, tx, 0);
		if (err == 0) {
			err = fzap_add(zn, integer_size, num_integers, val,
-			    FTAG, tx);
+			    tag, tx);
		}
		zap = zn->zn_zap;	/* fzap_add() may change zap */
	} else {
		mze = mze_find(zn);
		if (mze != NULL) {
			err = SET_ERROR(EEXIST);
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
-		zap_unlockdir(zap, FTAG);
+		zap_unlockdir(zap, tag);
	return (err);
}

/*
 * Diff hunk: zap_add() is re-added as a thin wrapper that locks by
 * objset/object number, and zap_add_by_dnode() is a new wrapper that
 * locks through a dnode.  Neither unlocks; zap_add_impl() does.
 */
int
+zap_add(objset_t *os, uint64_t zapobj, const char *key,
+    int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
+	/* zap_add_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
+zap_add_by_dnode(dnode_t *dn, const char *key,
+    int integer_size, uint64_t num_integers,
+    const void *val, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
+	if (err != 0)
+		return (err);
+	err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG);
+	/* zap_add_impl() calls zap_unlockdir() */
+	return (err);
+}
+
+int
zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, int integer_size, uint64_t num_integers,
    const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;
	zap_name_t
*zn;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	/* uint64-keyed entries always go through the fat-zap code. */
	err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx);
	zap = zn->zn_zap;	/* fzap_add() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_add() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * Set the value stored under "name", creating the entry if it does not
 * exist.  A microzap is upgraded to a fat zap first when the value is
 * not a single 8-byte integer or the name is MZAP_NAME_LEN or longer.
 * Returns the zap_lockdir() error, ENOTSUP if the name cannot be
 * allocated, or the result of the upgrade/update.
 */
int
zap_update(objset_t *os, uint64_t zapobj, const char *name,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	mzap_ent_t *mze;
	uint64_t oldval;	/* only set and used under ZFS_DEBUG */
	const uint64_t *intval = val;
	zap_name_t *zn;
	int err;

#ifdef ZFS_DEBUG
	/*
	 * If there is an old value, it shouldn't change across the
	 * lockdir (eg, due to bprewrite's xlation).
	 */
	if (integer_size == 8 && num_integers == 1)
		(void) zap_lookup(os, zapobj, name, 8, 1, &oldval);
#endif

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc(zap, name, 0);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	if (!zap->zap_ismicro) {
		err = fzap_update(zn, integer_size, num_integers, val,
		    FTAG, tx);
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else if (integer_size != 8 || num_integers != 1 ||
	    strlen(name) >= MZAP_NAME_LEN) {
		dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n",
		    zapobj, integer_size, num_integers, name);
		err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0);
		if (err == 0) {
			err = fzap_update(zn, integer_size, num_integers,
			    val, FTAG, tx);
		}
		zap = zn->zn_zap;	/* fzap_update() may change zap */
	} else {
		/* Microzap: overwrite in place, or add a new entry. */
		mze = mze_find(zn);
		if (mze != NULL) {
			ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval);
			MZE_PHYS(zap, mze)->mze_value = *intval;
		} else {
			mzap_addent(zn, *intval);
		}
	}
	ASSERT(zap == zn->zn_zap);
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}

int
zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints,
    int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx)
{
	zap_t *zap;
	zap_name_t *zn;
	int err;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	/* uint64-keyed entries always go through the fat-zap code. */
	err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx);
	zap = zn->zn_zap;	/* fzap_update() may change zap */
	zap_name_free(zn);
	if (zap != NULL)	/* may be NULL if fzap_upgrade() failed */
		zap_unlockdir(zap, FTAG);
	return (err);
}

/* Remove "name" using match type 0; see zap_remove_norm(). */
int
zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx)
{
	return (zap_remove_norm(os, zapobj, name, 0, tx));
}

/*
 * Diff hunk: the old zap_remove_norm() body becomes zap_remove_impl(),
 * which works on an already locked zap and leaves locking and unlocking
 * to its callers.
 */
-int
-zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
+static int
+zap_remove_impl(zap_t *zap, const char *name,
    matchtype_t mt, dmu_tx_t *tx)
{
-	zap_t *zap;
-	int err;
	mzap_ent_t *mze;
	zap_name_t *zn;
+	int err = 0;

-	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
-	if (err)
-		return (err);
	zn = zap_name_alloc(zap, name, mt);
-	if (zn == NULL) {
-		zap_unlockdir(zap, FTAG);
+	if (zn == NULL)
		return (SET_ERROR(ENOTSUP));
-	}
	if (!zap->zap_ismicro) {
		err = fzap_remove(zn, tx);
	} else {
		mze = mze_find(zn);
		if (mze == NULL) {
			err = SET_ERROR(ENOENT);
		} else {
			/* Clear the on-disk chunk and drop the tree node. */
			zap->zap_m.zap_num_entries--;
			bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid],
			    sizeof (mzap_ent_phys_t));
			mze_remove(zap, mze);
		}
	}
	zap_name_free(zn);
+	return (err);
+}
+
+int
+zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name,
+    matchtype_t mt, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+
+	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	if (err)
+		return (err);
+	err = zap_remove_impl(zap, name, mt, tx);
+	zap_unlockdir(zap, FTAG);
+	return (err);
+}
+
+int
+zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx)
+{
+	zap_t *zap;
+	int err;
+
+
err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
+	if (err)
+		return (err);
+	err = zap_remove_impl(zap, name, 0, tx);
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * Remove the entry with the given uint64-array key through
 * fzap_remove().  Returns the zap_lockdir() error, ENOTSUP if the key
 * cannot be turned into a zap name, or the fzap_remove() result.
 */
int
zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key,
    int key_numints, dmu_tx_t *tx)
{
	zap_t *zap;
	int err;
	zap_name_t *zn;

	err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap);
	if (err)
		return (err);
	zn = zap_name_alloc_uint64(zap, key, key_numints);
	if (zn == NULL) {
		zap_unlockdir(zap, FTAG);
		return (SET_ERROR(ENOTSUP));
	}
	err = fzap_remove(zn, tx);
	zap_name_free(zn);
	zap_unlockdir(zap, FTAG);
	return (err);
}

/*
 * Routines for iterating over the attributes.
 */

/*
 * Initialize a cursor over ZAP object "zapobj" at a previously
 * serialized position.  No lock or hold is taken here; zc_zap and
 * zc_leaf start out NULL.
 */
void
zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj,
    uint64_t serialized)
{
	zc->zc_objset = os;
	zc->zc_zap = NULL;
	zc->zc_leaf = NULL;
	zc->zc_zapobj = zapobj;
	zc->zc_serialized = serialized;
	zc->zc_hash = 0;
	zc->zc_cd = 0;
}

/* Initialize a cursor with a serialized position of 0. */
void
zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj)
{
	zap_cursor_init_serialized(zc, os, zapobj, 0);
}

/*
 * Release whatever the cursor still references.  The zap (and leaf) lock
 * is re-acquired as reader first because zap_unlockdir() and
 * zap_put_leaf() are called on it next; zap_unlockdir() drops zap_rwlock
 * itself.  The zap hold is released with a NULL tag.
 */
void
zap_cursor_fini(zap_cursor_t *zc)
{
	if (zc->zc_zap) {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
		zap_unlockdir(zc->zc_zap, NULL);
		zc->zc_zap = NULL;
	}
	if (zc->zc_leaf) {
		rw_enter(&zc->zc_leaf->l_rwlock, RW_READER);
		zap_put_leaf(zc->zc_leaf);
		zc->zc_leaf = NULL;
	}
	zc->zc_objset = NULL;
}

/*
 * Pack the cursor position into a single uint64_t.  A cursor whose hash
 * is -1ULL serializes to -1ULL; a cursor that has not opened its zap yet
 * returns the value it was initialized with.
 */
uint64_t
zap_cursor_serialize(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return (-1ULL);
	if (zc->zc_zap == NULL)
		return (zc->zc_serialized);
	ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0);
	ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap));

	/*
	 * We want to keep the high 32 bits of the cursor zero if we can, so
	 * that 32-bit programs can access this.  So usually use a small
	 * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits
	 * of the cursor.
*
	 * [ collision differentiator | zap_hashbits()-bit hash value ]
	 */
	return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) |
	    ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap)));
}

/*
 * Fill in *za with the entry at (or next after) the cursor position.
 * On first use the zap is held (NULL tag) and locked, and the position
 * is decoded from zc_serialized; later calls only re-take the lock.  The
 * lock is dropped before returning, the hold is kept.  Returns ENOENT
 * when the cursor is exhausted, or the zap_lockdir() or
 * fzap_cursor_retrieve() error.
 */
int
zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za)
{
	int err;
	avl_index_t idx;
	mzap_ent_t mze_tofind;
	mzap_ent_t *mze;

	/* A hash of -1ULL marks an exhausted cursor. */
	if (zc->zc_hash == -1ULL)
		return (SET_ERROR(ENOENT));

	if (zc->zc_zap == NULL) {
		int hb;
		err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL,
		    RW_READER, TRUE, FALSE, NULL, &zc->zc_zap);
		if (err)
			return (err);

		/*
		 * To support zap_cursor_init_serialized, advance, retrieve,
		 * we must add to the existing zc_cd, which may already
		 * be 1 due to the zap_cursor_advance.
		 */
		ASSERT(zc->zc_hash == 0);
		/* Inverse of the packing done by zap_cursor_serialize(). */
		hb = zap_hashbits(zc->zc_zap);
		zc->zc_hash = zc->zc_serialized << (64 - hb);
		zc->zc_cd += zc->zc_serialized >> hb;
		if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */
			zc->zc_cd = 0;
	} else {
		rw_enter(&zc->zc_zap->zap_rwlock, RW_READER);
	}
	if (!zc->zc_zap->zap_ismicro) {
		err = fzap_cursor_retrieve(zc->zc_zap, zc, za);
	} else {
		/* Find the entry at (hash, cd), or else the next one. */
		mze_tofind.mze_hash = zc->zc_hash;
		mze_tofind.mze_cd = zc->zc_cd;

		mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx);
		if (mze == NULL) {
			mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl,
			    idx, AVL_AFTER);
		}
		if (mze) {
			mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze);
			ASSERT3U(mze->mze_cd, ==, mzep->mze_cd);
			za->za_normalization_conflict =
			    mzap_normalization_conflict(zc->zc_zap, NULL, mze);
			/* Microzap values are always one 8-byte integer. */
			za->za_integer_length = 8;
			za->za_num_integers = 1;
			za->za_first_integer = mzep->mze_value;
			(void) strcpy(za->za_name, mzep->mze_name);
			/* Move the cursor onto the entry just returned. */
			zc->zc_hash = mze->mze_hash;
			zc->zc_cd = mze->mze_cd;
			err = 0;
		} else {
			zc->zc_hash = -1ULL;
			err = SET_ERROR(ENOENT);
		}
	}
	rw_exit(&zc->zc_zap->zap_rwlock);
	return (err);
}

/*
 * Step past the current entry by bumping the collision differentiator;
 * an exhausted cursor (hash of -1ULL) is left untouched.
 */
void
zap_cursor_advance(zap_cursor_t *zc)
{
	if (zc->zc_hash == -1ULL)
		return;
	zc->zc_cd++;
}

/*
 * Position the cursor on the entry named "name".  Returns 0 on success,
 * the zap_lockdir() error, ENOTSUP if the name cannot be allocated for
 * match type "mt", ENOENT if a microzap has no such entry, or the
 * fzap_cursor_move_to_key() result.
 */
int
zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt)
{
	int err = 0;
	mzap_ent_t *mze;
	zap_name_t *zn;

	if (zc->zc_zap == NULL)
{ err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zc->zc_zap); if (err) return (err); } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } zn = zap_name_alloc(zc->zc_zap, name, mt); if (zn == NULL) { rw_exit(&zc->zc_zap->zap_rwlock); return (SET_ERROR(ENOTSUP)); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_move_to_key(zc, zn); } else { mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); goto out; } zc->zc_hash = mze->mze_hash; zc->zc_cd = mze->mze_cd; } out: zap_name_free(zn); rw_exit(&zc->zc_zap->zap_rwlock); return (err); } int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) { int err; zap_t *zap; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err) return (err); bzero(zs, sizeof (zap_stats_t)); if (zap->zap_ismicro) { zs->zs_blocksize = zap->zap_dbuf->db_size; zs->zs_num_entries = zap->zap_m.zap_num_entries; zs->zs_num_blocks = 1; } else { fzap_get_stats(zap, zs); } zap_unlockdir(zap, FTAG); return (0); } Index: stable/11 =================================================================== --- stable/11 (revision 321548) +++ stable/11 (revision 321549) Property changes on: stable/11 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r318823