Index: vendor-sys/illumos/dist/uts/common/fs/zfs/ddt_zap.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/ddt_zap.c (revision 353618) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/ddt_zap.c (revision 353619) @@ -1,157 +1,169 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include int ddt_zap_leaf_blockshift = 12; int ddt_zap_indirect_blockshift = 12; static int ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) { zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; if (prehash) flags |= ZAP_FLAG_PRE_HASHED_KEY; *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, DMU_OT_NONE, 0, tx); return (*objectp == 0 ? 
ENOTSUP : 0); } static int ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) { return (zap_destroy(os, object, tx)); } static int ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) { uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uint64_t one, csize; int error; error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, DDT_KEY_WORDS, &one, &csize); if (error) return (error); ASSERT(one == 1); ASSERT(csize <= sizeof (cbuf)); error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, DDT_KEY_WORDS, 1, csize, cbuf); if (error) return (error); ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); return (0); } static void ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) { (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, DDT_KEY_WORDS); } static int ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) { uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uint64_t csize; csize = ddt_compress(dde->dde_phys, cbuf, sizeof (dde->dde_phys), sizeof (cbuf)); return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, DDT_KEY_WORDS, 1, csize, cbuf, tx)); } static int ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) { return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, DDT_KEY_WORDS, tx)); } static int ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) { zap_cursor_t zc; zap_attribute_t za; int error; - zap_cursor_init_serialized(&zc, os, object, *walk); + if (*walk == 0) { + /* + * We don't want to prefetch the entire ZAP object, because + * it can be enormous. Also the primary use of DDT iteration + * is for scrubbing, in which case we will be issuing many + * scrub i/os for each ZAP block that we read in, so + * reading the ZAP is unlikely to be the bottleneck. 
+ */ + zap_cursor_init_noprefetch(&zc, os, object); + } else { + zap_cursor_init_serialized(&zc, os, object, *walk); + } if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uint64_t csize = za.za_num_integers; ASSERT(za.za_integer_length == 1); error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, DDT_KEY_WORDS, 1, csize, cbuf); ASSERT(error == 0); if (error == 0) { ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); dde->dde_key = *(ddt_key_t *)za.za_name; } zap_cursor_advance(&zc); *walk = zap_cursor_serialize(&zc); } zap_cursor_fini(&zc); return (error); } static uint64_t ddt_zap_count(objset_t *os, uint64_t object) { uint64_t count = 0; VERIFY(zap_count(os, object, &count) == 0); return (count); } const ddt_ops_t ddt_zap_ops = { "zap", ddt_zap_create, ddt_zap_destroy, ddt_zap_lookup, ddt_zap_prefetch, ddt_zap_update, ddt_zap_remove, ddt_zap_walk, ddt_zap_count, }; Index: vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c (revision 353618) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/dmu.c (revision 353619) @@ -1,2421 +1,2433 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. */ /* * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2017 by Delphix. All rights reserved. * Copyright (c) 2018 DilOS */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #endif static xuio_stats_t xuio_stats = { { "onloan_read_buf", KSTAT_DATA_UINT64 }, { "onloan_write_buf", KSTAT_DATA_UINT64 }, { "read_buf_copied", KSTAT_DATA_UINT64 }, { "read_buf_nocopy", KSTAT_DATA_UINT64 }, { "write_buf_copied", KSTAT_DATA_UINT64 }, { "write_buf_nocopy", KSTAT_DATA_UINT64 } }; #define XUIOSTAT_INCR(stat, val) \ atomic_add_64(&xuio_stats.stat.value.ui64, (val)) #define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) /* * Enable/disable nopwrite feature. */ int zfs_nopwrite_enabled = 1; /* * Tunable to control percentage of dirtied blocks from frees in one TXG. * After this threshold is crossed, additional dirty blocks from frees * wait until the next TXG. * A value of zero will disable this throttle. */ uint32_t zfs_per_txg_dirty_frees_percent = 30; /* * This can be used for testing, to ensure that certain actions happen * while in the middle of a remap (which might otherwise complete too * quickly). */ int zfs_object_remap_one_indirect_delay_ticks = 0; +/* + * Limit the amount we can prefetch with one call to this amount. This + * helps to limit the amount of memory that can be used by prefetching. + * Larger objects should be prefetched a bit at a time. 
+ */ +uint64_t dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; + const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { DMU_BSWAP_UINT8, TRUE, FALSE, "unallocated" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "object directory" }, { DMU_BSWAP_UINT64, TRUE, TRUE, "object array" }, { DMU_BSWAP_UINT8, TRUE, FALSE, "packed nvlist" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "packed nvlist size" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj header" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map header" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA space map" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "ZIL intent log" }, { DMU_BSWAP_DNODE, TRUE, FALSE, "DMU dnode" }, { DMU_BSWAP_OBJSET, TRUE, TRUE, "DMU objset" }, { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL directory" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL directory child map" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset snap map" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL props" }, { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL dataset" }, { DMU_BSWAP_ZNODE, TRUE, FALSE, "ZFS znode" }, { DMU_BSWAP_OLDACL, TRUE, FALSE, "ZFS V0 ACL" }, { DMU_BSWAP_UINT8, FALSE, FALSE, "ZFS plain file" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS directory" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS master node" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS delete queue" }, { DMU_BSWAP_UINT8, FALSE, FALSE, "zvol object" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "zvol prop" }, { DMU_BSWAP_UINT8, FALSE, FALSE, "other uint8[]" }, { DMU_BSWAP_UINT64, FALSE, FALSE, "other uint64[]" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "other ZAP" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "persistent error log" }, { DMU_BSWAP_UINT8, TRUE, FALSE, "SPA history" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "SPA history offsets" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "Pool properties" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL permissions" }, { DMU_BSWAP_ACL, TRUE, FALSE, "ZFS ACL" }, { DMU_BSWAP_UINT8, TRUE, FALSE, "ZFS SYSACL" }, { DMU_BSWAP_UINT8, TRUE, FALSE, "FUID table" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "FUID table size" }, { 
DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dataset next clones" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "scan work queue" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group used" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "ZFS user/group quota" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "snapshot refcount tags" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT ZAP algorithm" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "DDT statistics" }, { DMU_BSWAP_UINT8, TRUE, FALSE, "System attributes" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "SA master node" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr registration" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "SA attr layouts" }, { DMU_BSWAP_ZAP, TRUE, FALSE, "scan translations" }, { DMU_BSWAP_UINT8, FALSE, FALSE, "deduplicated block" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL deadlist map" }, { DMU_BSWAP_UINT64, TRUE, TRUE, "DSL deadlist map hdr" }, { DMU_BSWAP_ZAP, TRUE, TRUE, "DSL dir clones" }, { DMU_BSWAP_UINT64, TRUE, FALSE, "bpobj subobj" } }; const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { { byteswap_uint8_array, "uint8" }, { byteswap_uint16_array, "uint16" }, { byteswap_uint32_array, "uint32" }, { byteswap_uint64_array, "uint64" }, { zap_byteswap, "zap" }, { dnode_buf_byteswap, "dnode" }, { dmu_objset_byteswap, "objset" }, { zfs_znode_byteswap, "znode" }, { zfs_oldacl_byteswap, "oldacl" }, { zfs_acl_byteswap, "acl" } }; int dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp) { uint64_t blkid; dmu_buf_impl_t *db; blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (0); } int dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp) { dnode_t *dn; uint64_t blkid; dmu_buf_impl_t *db; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); blkid = dbuf_whichblock(dn, 0, offset); rw_enter(&dn->dn_struct_rwlock, RW_READER); db = 
dbuf_hold(dn, blkid, tag); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); if (db == NULL) { *dbp = NULL; return (SET_ERROR(EIO)); } *dbp = &db->db; return (err); } int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **dbp, int flags) { int err; int db_flags = DB_RF_CANFAIL; if (flags & DMU_READ_NO_PREFETCH) db_flags |= DB_RF_NOPREFETCH; err = dmu_buf_hold_noread(os, object, offset, tag, dbp); if (err == 0) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); err = dbuf_read(db, NULL, db_flags); if (err != 0) { dbuf_rele(db, tag); *dbp = NULL; } } return (err); } int dmu_bonus_max(void) { return (DN_OLD_MAX_BONUSLEN); } int dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else if (newsize < 0 || newsize > db_fake->db_size) { error = SET_ERROR(EINVAL); } else { dnode_setbonuslen(dn, newsize, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } int dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int error; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (!DMU_OT_IS_VALID(type)) { error = SET_ERROR(EINVAL); } else if (dn->dn_bonus != db) { error = SET_ERROR(EINVAL); } else { dnode_setbonus_type(dn, type, tx); error = 0; } DB_DNODE_EXIT(db); return (error); } dmu_object_type_t dmu_get_bonustype(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t 
*)db_fake; dnode_t *dn; dmu_object_type_t type; DB_DNODE_ENTER(db); dn = DB_DNODE(db); type = dn->dn_bonustype; DB_DNODE_EXIT(db); return (type); } int dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) { dnode_t *dn; int error; error = dnode_hold(os, object, FTAG, &dn); dbuf_rm_spill(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_rm_spill(dn, tx); rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); return (error); } /* * returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) { dnode_t *dn; dmu_buf_impl_t *db; int error; error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus == NULL) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus == NULL) dbuf_create_bonus(dn); } db = dn->dn_bonus; /* as long as the bonus buf is held, the dnode will be held */ if (zfs_refcount_add(&db->db_holds, tag) == 1) { VERIFY(dnode_add_ref(dn, db)); atomic_inc_32(&dn->dn_dbufs_count); } /* * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's * hold and incrementing the dbuf count to ensure that dnode_move() sees * a dnode hold for every dbuf. */ rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH)); *dbp = &db->db; return (0); } /* * returns ENOENT, EIO, or 0. * * This interface will allocate a blank spill dbuf when a spill blk * doesn't already exist on the dnode. * * if you only want to find an already existing spill db, then * dmu_spill_hold_existing() should be used. 
*/ int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = NULL; int err; if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); db = dbuf_hold(dn, DMU_SPILL_BLKID, tag); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); ASSERT(db != NULL); err = dbuf_read(db, NULL, flags); if (err == 0) *dbp = &db->db; else dbuf_rele(db, tag); return (err); } int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) { err = SET_ERROR(EINVAL); } else { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (!dn->dn_have_spill) { err = SET_ERROR(ENOENT); } else { err = dmu_spill_hold_by_dnode(dn, DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp); } rw_exit(&dn->dn_struct_rwlock); } DB_DNODE_EXIT(db); return (err); } int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_spill_hold_by_dnode(dn, DB_RF_CANFAIL, tag, dbp); DB_DNODE_EXIT(db); return (err); } /* * Note: longer-term, we should modify all of the dmu_buf_*() interfaces * to take a held dnode rather than <os, object> -- the lookup is wasteful, * and can induce severe lock contention when writing to several files * whose dnodes are in the same block. */ int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dmu_buf_t **dbp; uint64_t blkid, nblks, i; uint32_t dbuf_flags; int err; zio_t *zio; ASSERT(length <= DMU_MAX_ACCESS); /* * Note: We directly notify the prefetch code of this read, so that * we can tell it about the multi-block read. dbuf_read() only knows * about the one block it is accessing. 
*/ dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { int blkshift = dn->dn_datablkshift; nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; } else { if (offset + length > dn->dn_datablksz) { zfs_panic_recover("zfs: accessing past end of object " "%llx/%llx (size=%u access=%llu+%llu)", (longlong_t)dn->dn_objset-> os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(EIO)); } nblks = 1; } dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); blkid = dbuf_whichblock(dn, 0, offset); for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); if (db == NULL) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_rele_array(dbp, nblks, tag); zio_nowait(zio); return (SET_ERROR(EIO)); } /* initiate async i/o */ if (read) (void) dbuf_read(db, zio, dbuf_flags); dbp[i] = &db->db; } if ((flags & DMU_READ_NO_PREFETCH) == 0 && DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { dmu_zfetch(&dn->dn_zfetch, blkid, nblks, read && DNODE_IS_CACHEABLE(dn)); } rw_exit(&dn->dn_struct_rwlock); /* wait for async i/o */ err = zio_wait(zio); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } /* wait for other io to complete */ if (read) { for (i = 0; i < nblks; i++) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); mutex_exit(&db->db_mtx); if (err) { dmu_buf_rele_array(dbp, nblks, tag); return (err); } } } *numbufsp = nblks; *dbpp = dbp; return (0); } static int dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, int 
read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); return (err); } int dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, uint64_t length, boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; int err; DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, numbufsp, dbpp, DMU_READ_PREFETCH); DB_DNODE_EXIT(db); return (err); } void dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) { int i; dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; if (numbufs == 0) return; for (i = 0; i < numbufs; i++) { if (dbp[i]) dbuf_rele(dbp[i], tag); } kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); } /* * Issue prefetch i/os for the given blocks. If level is greater than 0, the * indirect blocks prefetched will be those that point to the blocks containing * the data starting at offset, and continuing to offset + len. * * Note that if the indirect blocks above the blocks being prefetched are not in * cache, they will be asynchronously read in. */ void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, uint64_t len, zio_priority_t pri) { dnode_t *dn; uint64_t blkid; int nblks, err; if (len == 0) { /* they're interested in the bonus buffer */ dn = DMU_META_DNODE(os); if (object == 0 || object >= DN_MAX_OBJECT) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, level, object * sizeof (dnode_phys_t)); dbuf_prefetch(dn, level, blkid, pri, 0); rw_exit(&dn->dn_struct_rwlock); return; } + + /* + * See comment before the definition of dmu_prefetch_max. 
+ */ + len = MIN(len, dmu_prefetch_max); /* * XXX - Note, if the dnode for the requested object is not * already cached, we will do a *synchronous* read in the * dnode_hold() call. The same is true for any indirects. */ err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; rw_enter(&dn->dn_struct_rwlock, RW_READER); /* * offset + len - 1 is the last byte we want to prefetch for, and offset * is the first. Then dbuf_whichblock(dn, level, offset + len - 1) is the * last block we want to prefetch, and dbuf_whichblock(dn, level, * offset) is the first. Then the number we need to prefetch is the * last - first + 1. */ if (level > 0 || dn->dn_datablkshift != 0) { nblks = dbuf_whichblock(dn, level, offset + len - 1) - dbuf_whichblock(dn, level, offset) + 1; } else { nblks = (offset < dn->dn_datablksz); } if (nblks != 0) { blkid = dbuf_whichblock(dn, level, offset); for (int i = 0; i < nblks; i++) dbuf_prefetch(dn, level, blkid + i, pri, 0); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } /* * Get the next "chunk" of file data to free. We traverse the file from * the end so that the file gets shorter over time (if we crash in the * middle, this will leave us in a better state). We find allocated file * data by simply searching the allocated level 1 indirects. * * On input, *start should be the first offset that does not need to be * freed (e.g. "offset + length"). On return, *start will be the first * offset that should be freed. 
*/
static int
get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum)
{
	/* upper bound on the number of L1 indirects covered by one chunk */
	uint64_t l1_limit = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
	/* bytes of data covered by a level-1 indirect block */
	uint64_t l1_span = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
	uint64_t visited = 0;

	ASSERT3U(minimum, <=, *start);

	/*
	 * The whole remaining range fits in a single chunk, so there is no
	 * need to search the indirect blocks at all.
	 */
	if (*start - minimum <= l1_span * l1_limit) {
		*start = minimum;
		return (0);
	}
	ASSERT(ISP2(l1_span));

	while (*start > minimum && visited < l1_limit) {
		int err;

		/*
		 * dnode_next_offset(BACKWARDS) will find an allocated L1
		 * indirect block at or before the input offset.  We must
		 * decrement *start so that it is at the end of the region
		 * to search.
		 */
		(*start)--;
		err = dnode_next_offset(dn,
		    DNODE_FIND_BACKWARDS, start, 2, 1, 0);

		if (err == ESRCH) {
			/* no indirect blocks before start: we are done */
			*start = minimum;
			break;
		}
		if (err != 0)
			return (err);

		/* set start to the beginning of this L1 indirect */
		*start = P2ALIGN(*start, l1_span);
		visited++;
	}

	/* never hand back an offset below the caller's lower bound */
	if (*start < minimum)
		*start = minimum;
	return (0);
}

/*
 * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set,
 * otherwise return false.
* Used below in dmu_free_long_range_impl() to enable abort when unmounting */ /*ARGSUSED*/ static boolean_t dmu_objset_zfs_unmounting(objset_t *os) { #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) return (zfs_get_vfs_flag_unmounted(os)); #endif return (B_FALSE); } static int dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, uint64_t length) { uint64_t object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; int err; uint64_t dirty_frees_threshold; dsl_pool_t *dp = dmu_objset_pool(os); if (offset >= object_size) return (0); if (zfs_per_txg_dirty_frees_percent <= 100) dirty_frees_threshold = zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; else dirty_frees_threshold = zfs_dirty_data_max / 4; if (length == DMU_OBJECT_END || offset + length > object_size) length = object_size - offset; while (length != 0) { uint64_t chunk_end, chunk_begin, chunk_len; uint64_t long_free_dirty_all_txgs = 0; dmu_tx_t *tx; if (dmu_objset_zfs_unmounting(dn->dn_objset)) return (SET_ERROR(EINTR)); chunk_end = chunk_begin = offset + length; /* move chunk_begin backwards to the beginning of this chunk */ err = get_next_chunk(dn, &chunk_begin, offset); if (err) return (err); ASSERT3U(chunk_begin, >=, offset); ASSERT3U(chunk_begin, <=, chunk_end); chunk_len = chunk_end - chunk_begin; mutex_enter(&dp->dp_lock); for (int t = 0; t < TXG_SIZE; t++) { long_free_dirty_all_txgs += dp->dp_long_free_dirty_pertxg[t]; } mutex_exit(&dp->dp_lock); /* * To avoid filling up a TXG with just frees wait for * the next TXG to open before freeing more chunks if * we have reached the threshold of frees */ if (dirty_frees_threshold != 0 && long_free_dirty_all_txgs >= dirty_frees_threshold) { txg_wait_open(dp, 0); continue; } tx = dmu_tx_create(os); dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); /* * Mark this transaction as typically resulting in a net * reduction in space used. 
*/ dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); return (err); } mutex_enter(&dp->dp_lock); dp->dp_long_free_dirty_pertxg[dmu_tx_get_txg(tx) & TXG_MASK] += chunk_len; mutex_exit(&dp->dp_lock); DTRACE_PROBE3(free__long__range, uint64_t, long_free_dirty_all_txgs, uint64_t, chunk_len, uint64_t, dmu_tx_get_txg(tx)); dnode_free_range(dn, chunk_begin, chunk_len, tx); dmu_tx_commit(tx); length -= chunk_len; } return (0); } int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t length) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length); /* * It is important to zero out the maxblkid when freeing the entire * file, so that (a) subsequent calls to dmu_free_long_range_impl() * will take the fast path, and (b) dnode_reallocate() can verify * that the entire file has been freed. */ if (err == 0 && offset == 0 && length == DMU_OBJECT_END) dn->dn_maxblkid = 0; dnode_rele(dn, FTAG); return (err); } int dmu_free_long_object(objset_t *os, uint64_t object) { dmu_tx_t *tx; int err; err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); dmu_tx_mark_netfree(tx); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { err = dmu_object_free(os, object, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } return (err); } int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); ASSERT(size == -1ULL || size <= UINT64_MAX - offset); dnode_free_range(dn, offset, size, tx); dnode_rele(dn, FTAG); return (0); } static int dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dmu_buf_t **dbp; int numbufs, err = 0; /* * Deal 
with odd block sizes, where there can't be data past the first * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ if (dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); size = newsz; } while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. */ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); bcopy((char *)db->db_data + bufoff, buf, tocpy); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); } return (err); } int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { dnode_t *dn; int err; err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_read_impl(dn, offset, size, buf, flags); dnode_rele(dn, FTAG); return (err); } int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, uint32_t flags) { return (dmu_read_impl(dn, offset, size, buf, flags)); } static void dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { int i; for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = offset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); bcopy(buf, (char *)db->db_data + bufoff, tocpy); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); offset += tocpy; size -= tocpy; buf = (char *)buf + tocpy; } } void 
dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, &numbufs, &dbp)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; if (size == 0) return; VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); dmu_write_impl(dbp, numbufs, offset, size, buf, tx); dmu_buf_rele_array(dbp, numbufs, FTAG); } static int dmu_object_remap_one_indirect(objset_t *os, dnode_t *dn, uint64_t last_removal_txg, uint64_t offset) { uint64_t l1blkid = dbuf_whichblock(dn, 1, offset); int err = 0; rw_enter(&dn->dn_struct_rwlock, RW_READER); dmu_buf_impl_t *dbuf = dbuf_hold_level(dn, 1, l1blkid, FTAG); ASSERT3P(dbuf, !=, NULL); /* * If the block hasn't been written yet, this default will ensure * we don't try to remap it. */ uint64_t birth = UINT64_MAX; ASSERT3U(last_removal_txg, !=, UINT64_MAX); if (dbuf->db_blkptr != NULL) birth = dbuf->db_blkptr->blk_birth; rw_exit(&dn->dn_struct_rwlock); /* * If this L1 was already written after the last removal, then we've * already tried to remap it. */ if (birth <= last_removal_txg && dbuf_read(dbuf, NULL, DB_RF_MUST_SUCCEED) == 0 && dbuf_can_remap(dbuf)) { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_remap_l1indirect(tx, dn->dn_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err == 0) { (void) dbuf_dirty(dbuf, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } } dbuf_rele(dbuf, FTAG); delay(zfs_object_remap_one_indirect_delay_ticks); return (err); } /* * Remap all blockpointers in the object, if possible, so that they reference * only concrete vdevs. * * To do this, iterate over the L0 blockpointers and remap any that reference * an indirect vdev. 
Note that we only examine L0 blockpointers; since we
 * cannot guarantee that we can remap all blockpointers anyway (due to split
 * blocks), we do not want to make the code unnecessarily complicated to
 * catch the unlikely case that there is an L1 block on an indirect vdev that
 * contains no indirect blockpointers.
 *
 * Returns 0 on success, the error from dnode_hold(), dmu_tx_assign() or
 * dmu_object_remap_one_indirect(), or EINTR when the issig() checks report
 * a pending signal.
 */
int
dmu_object_remap_indirects(objset_t *os, uint64_t object,
    uint64_t last_removal_txg)
{
	uint64_t offset, l1span;
	int err;
	dnode_t *dn;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err != 0) {
		return (err);
	}

	if (dn->dn_nlevels <= 1) {
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
		}

		/*
		 * If the dnode has no indirect blocks, we cannot dirty them.
		 * We still want to remap the blkptr(s) in the dnode if
		 * appropriate, so mark it as dirty.
		 */
		if (err == 0 && dnode_needs_remap(dn)) {
			dmu_tx_t *tx = dmu_tx_create(os);
			dmu_tx_hold_bonus(tx, dn->dn_object);
			if ((err = dmu_tx_assign(tx, TXG_WAIT)) == 0) {
				dnode_setdirty(dn, tx);
				dmu_tx_commit(tx);
			} else {
				dmu_tx_abort(tx);
			}
		}

		dnode_rele(dn, FTAG);
		return (err);
	}

	offset = 0;
	/* Number of bytes of object data covered by one L1 indirect block. */
	l1span = 1ULL << (dn->dn_indblkshift - SPA_BLKPTRSHIFT +
	    dn->dn_datablkshift);
	/*
	 * Find the next L1 indirect that is not a hole.
	 */
	while (dnode_next_offset(dn, 0, &offset, 2, 1, 0) == 0) {
		/* Let a pending signal interrupt the walk. */
		if (issig(JUSTLOOKING) && issig(FORREAL)) {
			err = SET_ERROR(EINTR);
			break;
		}
		if ((err = dmu_object_remap_one_indirect(os, dn,
		    last_removal_txg, offset)) != 0) {
			break;
		}
		offset += l1span;
	}

	dnode_rele(dn, FTAG);
	return (err);
}

/*
 * Hold the dbufs covering [offset, offset + size) of 'object' and call
 * dmu_buf_will_not_fill() on each of them in transaction 'tx'.  A zero
 * size is a no-op; a failed hold trips VERIFY.
 */
void
dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;

	if (size == 0)
		return;

	VERIFY(0 == dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp));

	for (i = 0; i < numbufs; i++) {
		dmu_buf_t *db = dbp[i];

		dmu_buf_will_not_fill(db, tx);
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
}

/*
 * Hold (without reading) the dbuf at 'offset' of 'object' and pass 'data'
 * and the embedded-block parameters on to dmu_buf_write_embedded().
 * 'etype' and 'comp' are range-checked by assertion only.
 */
void
dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
    int compressed_size, int byteorder, dmu_tx_t *tx)
{
	dmu_buf_t *db;

	ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES);
	ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS);
	VERIFY0(dmu_buf_hold_noread(os, object, offset,
	    FTAG, &db));

	dmu_buf_write_embedded(db,
	    data, (bp_embedded_type_t)etype, (enum zio_compress)comp,
	    uncompressed_size, compressed_size, byteorder, tx);

	dmu_buf_rele(db, FTAG);
}

/*
 * DMU support for xuio
 */
kstat_t *xuio_ksp = NULL;

/*
 * Allocate the iovec array, the arc_buf_t pointer array and the private
 * dmu_xuio_t for an xuio with room for 'nblk' blocks, and account the
 * loaned buffers in the read or write kstat.  Always returns 0.
 */
int
dmu_xuio_init(xuio_t *xuio, int nblk)
{
	dmu_xuio_t *priv;
	uio_t *uio = &xuio->xu_uio;

	uio->uio_iovcnt = nblk;
	uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP);

	priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP);
	priv->cnt = nblk;
	priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP);
	/* Remember the iovec array so dmu_xuio_fini() can free it. */
	priv->iovp = uio->uio_iov;
	XUIO_XUZC_PRIV(xuio) = priv;

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
		XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk);
	else
		XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk);

	return (0);
}

/*
 * Free everything dmu_xuio_init() allocated and undo its kstat accounting.
 */
void
dmu_xuio_fini(xuio_t *xuio)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	int nblk = priv->cnt;

	kmem_free(priv->iovp, nblk * sizeof (iovec_t));
	kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *));
	kmem_free(priv, sizeof (dmu_xuio_t));

	if (XUIO_XUZC_RW(xuio) == UIO_READ)
		XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk);
	else
		XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk);
}

/*
 * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf }
 * and increase priv->next by 1.
 *
 * The bounds are checked by assertion only.  Always returns 0.
 */
int
dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n)
{
	struct iovec *iov;
	uio_t *uio = &xuio->xu_uio;
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	int i = priv->next++;

	ASSERT(i < priv->cnt);
	ASSERT(off + n <= arc_buf_lsize(abuf));
	iov = uio->uio_iov + i;
	iov->iov_base = (char *)abuf->b_data + off;
	iov->iov_len = n;
	priv->bufs[i] = abuf;
	return (0);
}

/*
 * Return the number of block slots the xuio was initialized with.
 */
int
dmu_xuio_cnt(xuio_t *xuio)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);
	return (priv->cnt);
}

/*
 * Return the arc buffer recorded in slot 'i' (index checked by assertion).
 */
arc_buf_t *
dmu_xuio_arcbuf(xuio_t *xuio, int i)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	return (priv->bufs[i]);
}

/*
 * Forget the arc buffer recorded in slot 'i'; the buffer itself is not
 * released here.
 */
void
dmu_xuio_clear(xuio_t *xuio, int i)
{
	dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio);

	ASSERT(i < priv->cnt);
	priv->bufs[i] = NULL;
}

/*
 * Create and install the "xuio_stats" kstat backed by xuio_stats.  If
 * kstat_create() returns NULL, xuio_ksp stays NULL and nothing is installed.
 */
static void
xuio_stat_init(void)
{
	xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (xuio_ksp != NULL) {
		xuio_ksp->ks_data = &xuio_stats;
		kstat_install(xuio_ksp);
	}
}

/*
 * Delete the kstat created by xuio_stat_init(), if there is one.
 */
static void
xuio_stat_fini(void)
{
	if (xuio_ksp != NULL) {
		kstat_delete(xuio_ksp);
		xuio_ksp = NULL;
	}
}

/* Bump the xuiostat_wbuf_copied counter. */
void
xuio_stat_wbuf_copied(void)
{
	XUIOSTAT_BUMP(xuiostat_wbuf_copied);
}

/* Bump the xuiostat_wbuf_nocopy counter. */
void
xuio_stat_wbuf_nocopy(void)
{
	XUIOSTAT_BUMP(xuiostat_wbuf_nocopy);
}

#ifdef _KERNEL
/*
 * Read 'size' bytes starting at uio->uio_loffset from the object behind
 * 'dn' into 'uio'.  For a UIO_XUIO uio the arc buffers are loaned to the
 * xuio instead of being copied with uiomove().
 *
 * Returns 0, the error from holding the dbufs, or the first error from
 * dmu_xuio_add() / uiomove().
 */
int
dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size)
{
	dmu_buf_t **dbp;
	int numbufs, i, err;
	xuio_t *xuio = NULL;

	/*
	 * NB: we could do this block-at-a-time, but it's nice
	 * to be reading in parallel.
	 */
	err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size,
	    TRUE, FTAG, &numbufs, &dbp, 0);
	if (err)
		return (err);

	if (uio->uio_extflg == UIO_XUIO)
		xuio = (xuio_t *)uio;

	for (i = 0; i < numbufs; i++) {
		int tocpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];

		ASSERT(size > 0);

		/* Offset within this dbuf and the byte count it supplies. */
		bufoff = uio->uio_loffset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		if (xuio) {
			dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db;
			arc_buf_t *dbuf_abuf = dbi->db_buf;
			arc_buf_t *abuf = dbuf_loan_arcbuf(dbi);
			err = dmu_xuio_add(xuio, abuf, bufoff, tocpy);
			/* No uiomove() here, so advance the uio by hand. */
			if (!err) {
				uio->uio_resid -= tocpy;
				uio->uio_loffset += tocpy;
			}

			/* Same buffer back means it was loaned, not copied. */
			if (abuf == dbuf_abuf)
				XUIOSTAT_BUMP(xuiostat_rbuf_nocopy);
			else
				XUIOSTAT_BUMP(xuiostat_rbuf_copied);
		} else {
			err = uiomove((char *)db->db_data + bufoff, tocpy,
			    UIO_READ, uio);
		}
		if (err)
			break;

		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);

	return (err);
}

/*
 * Read 'size' bytes into the uio buffer.
 * From object zdb->db_object.
 * Starting at offset uio->uio_loffset.
 *
 * If the caller already has a dbuf in the target object
 * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(),
 * because we don't have to find the dnode_t for the object.
 */
int
dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb;
	dnode_t *dn;
	int err;

	if (size == 0)
		return (0);

	/* The dnode is only used between DB_DNODE_ENTER and DB_DNODE_EXIT. */
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	err = dmu_read_uio_dnode(dn, uio, size);
	DB_DNODE_EXIT(db);

	return (err);
}

/*
 * Read 'size' bytes into the uio buffer.
 * From the specified object
 * Starting at offset uio->uio_loffset.
*/ int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dnode_t *dn; int err; if (size == 0) return (0); err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_read_uio_dnode(dn, uio, size); dnode_rele(dn, FTAG); return (err); } int dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_t **dbp; int numbufs; int err = 0; int i; err = dmu_buf_hold_array_by_dnode(dn, uio->uio_loffset, size, FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); if (err) return (err); for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; dmu_buf_t *db = dbp[i]; ASSERT(size > 0); bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) dmu_buf_will_fill(db, tx); else dmu_buf_will_dirty(db, tx); /* * XXX uiomove could block forever (eg. nfs-backed * pages). There needs to be a uiolockdown() function * to lock the pages in memory, so that uiomove won't * block. */ err = uiomove((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); if (err) break; size -= tocpy; } dmu_buf_rele_array(dbp, numbufs, FTAG); return (err); } /* * Write 'size' bytes from the uio buffer. * To object zdb->db_object. * Starting at offset uio->uio_loffset. * * If the caller already has a dbuf in the target object * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), * because we don't have to find the dnode_t for the object. */ int dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; dnode_t *dn; int err; if (size == 0) return (0); DB_DNODE_ENTER(db); dn = DB_DNODE(db); err = dmu_write_uio_dnode(dn, uio, size, tx); DB_DNODE_EXIT(db); return (err); } /* * Write 'size' bytes from the uio buffer. * To the specified object. * Starting at offset uio->uio_loffset. 
*/
int
dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	/* An empty write succeeds without looking up the object. */
	if (size == 0)
		return (0);

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);

	err = dmu_write_uio_dnode(dn, uio, size, tx);

	dnode_rele(dn, FTAG);

	return (err);
}

/*
 * Copy 'size' bytes from the page list starting at 'pp' into 'object' at
 * 'offset', as part of transaction 'tx'.
 *
 * The pages are consumed in p_next order, PAGESIZE bytes at a time; each
 * page's p_offset is asserted to match the position being written.
 *
 * Returns 0, or the error from holding the dbufs.
 */
int
dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    page_t *pp, dmu_tx_t *tx)
{
	dmu_buf_t **dbp;
	int numbufs, i;
	int err;

	if (size == 0)
		return (0);

	err = dmu_buf_hold_array(os, object, offset, size,
	    FALSE, FTAG, &numbufs, &dbp);
	if (err)
		return (err);

	for (i = 0; i < numbufs; i++) {
		int tocpy, copied, thiscpy;
		int bufoff;
		dmu_buf_t *db = dbp[i];
		caddr_t va;

		ASSERT(size > 0);
		ASSERT3U(db->db_size, >=, PAGESIZE);

		/* Offset within this dbuf and the byte count it receives. */
		bufoff = offset - db->db_offset;
		tocpy = (int)MIN(db->db_size - bufoff, size);

		/* Only the first and last dbuf may be partially written. */
		ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size);

		/* Fully overwritten dbufs are filled, others just dirtied. */
		if (tocpy == db->db_size)
			dmu_buf_will_fill(db, tx);
		else
			dmu_buf_will_dirty(db, tx);

		/* Map each page in turn and copy it into the dbuf. */
		for (copied = 0; copied < tocpy; copied += PAGESIZE) {
			ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
			thiscpy = MIN(PAGESIZE, tocpy - copied);
			va = zfs_map_page(pp, S_READ);
			bcopy(va, (char *)db->db_data + bufoff, thiscpy);
			zfs_unmap_page(pp, va);
			pp = pp->p_next;
			bufoff += PAGESIZE;
		}

		if (tocpy == db->db_size)
			dmu_buf_fill_done(db, tx);

		offset += tocpy;
		size -= tocpy;
	}
	dmu_buf_rele_array(dbp, numbufs, FTAG);
	return (err);
}
#endif

/*
 * Allocate a loaned anonymous arc buffer.
 *
 * The buffer is 'size' bytes and is loaned from the spa of the objset that
 * 'handle' belongs to.
 */
arc_buf_t *
dmu_request_arcbuf(dmu_buf_t *handle, int size)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle;

	return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size));
}

/*
 * Free a loaned arc buffer.
 *
 * The loan is returned first, then the buffer is destroyed.
 */
void
dmu_return_arcbuf(arc_buf_t *buf)
{
	arc_return_buf(buf, FTAG);
	arc_buf_destroy(buf, FTAG);
}

/*
 * When possible directly assign passed loaned arc buffer to a dbuf.
 * If this is not possible copy the contents of passed arc buf via
 * dmu_write().
*/
void
dmu_assign_arcbuf_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *db;
	uint32_t blksz = (uint32_t)arc_buf_lsize(buf);
	uint64_t blkid;

	/* Find and hold the level-0 dbuf covering 'offset'. */
	rw_enter(&dn->dn_struct_rwlock, RW_READER);
	blkid = dbuf_whichblock(dn, 0, offset);
	VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL);
	rw_exit(&dn->dn_struct_rwlock);

	/*
	 * We can only assign if the offset is aligned, the arc buf is the
	 * same size as the dbuf, and the dbuf is not metadata.
	 *
	 * NOTE(review): only alignment and size are tested by the condition
	 * below; no metadata check is visible here.
	 */
	if (offset == db->db.db_offset && blksz == db->db.db_size) {
		dbuf_assign_arcbuf(db, buf, tx);
		dbuf_rele(db, FTAG);
	} else {
		objset_t *os;
		uint64_t object;

		/* compressed bufs must always be assignable to their dbuf */
		ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF);
		ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED));

		os = dn->dn_objset;
		object = dn->dn_object;

		/* Fall back to copying, then hand the loaned buffer back. */
		dbuf_rele(db, FTAG);
		dmu_write(os, object, offset, blksz, buf->b_data, tx);
		dmu_return_arcbuf(buf);
		XUIOSTAT_BUMP(xuiostat_wbuf_copied);
	}
}

/*
 * Same as dmu_assign_arcbuf_dnode(), for callers that hold a dbuf of the
 * target object rather than its dnode.
 */
void
dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf,
    dmu_tx_t *tx)
{
	dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle;

	DB_DNODE_ENTER(dbuf);
	dmu_assign_arcbuf_dnode(DB_DNODE(dbuf), offset, buf, tx);
	DB_DNODE_EXIT(dbuf);
}

/*
 * State carried from dmu_sync() / dmu_sync_late_arrival() to the zio
 * ready and done callbacks.
 */
typedef struct {
	dbuf_dirty_record_t	*dsa_dr;	/* NULL for late arrivals */
	dmu_sync_cb_t		*dsa_done;	/* caller's done callback */
	zgd_t			*dsa_zgd;
	dmu_tx_t		*dsa_tx;	/* set only for late arrivals */
} dmu_sync_arg_t;

/*
 * Ready callback for a dmu_sync() write: on success, fix up the block
 * pointer (lsize for holes, fill count for regular level-0 blocks).
 */
/* ARGSUSED */
static void
dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
{
	dmu_sync_arg_t *dsa = varg;
	dmu_buf_t *db = dsa->dsa_zgd->zgd_db;
	blkptr_t *bp = zio->io_bp;

	if (zio->io_error == 0) {
		if (BP_IS_HOLE(bp)) {
			/*
			 * A block of zeros may compress to a hole, but the
			 * block size still needs to be known for replay.
			 */
			BP_SET_LSIZE(bp, db->db_size);
		} else if (!BP_IS_EMBEDDED(bp)) {
			ASSERT(BP_GET_LEVEL(bp) == 0);
			bp->blk_fill = 1;
		}
	}
}

/*
 * Ready callback for the late-arrival write; adapts the zio_write()
 * callback signature to dmu_sync_ready().
 */
static void
dmu_sync_late_arrival_ready(zio_t *zio)
{
	dmu_sync_ready(zio, NULL, zio->io_private);
}

/*
 * Done callback for a dmu_sync() write issued through arc_write().
 * Records the result in the dirty record's override state, wakes waiters
 * on db_changed, invokes the caller's done callback and frees the argument.
 */
/* ARGSUSED */
static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
	dmu_sync_arg_t *dsa = varg;
	dbuf_dirty_record_t *dr = dsa->dsa_dr;
	dmu_buf_impl_t *db = dr->dr_dbuf;
	zgd_t *zgd = dsa->dsa_zgd;

	/*
	 * Record the vdev(s) backing this blkptr so they can be flushed after
	 * the writes for the lwb have completed.
	 */
	if (zio->io_error == 0) {
		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);
	}

	mutex_enter(&db->db_mtx);
	ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
	if (zio->io_error == 0) {
		dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE);
		if (dr->dt.dl.dr_nopwrite) {
			blkptr_t *bp = zio->io_bp;
			blkptr_t *bp_orig = &zio->io_bp_orig;
			uint8_t chksum = BP_GET_CHECKSUM(bp_orig);

			ASSERT(BP_EQUAL(bp, bp_orig));
			VERIFY(BP_EQUAL(bp, db->db_blkptr));
			ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF);
			ASSERT(zio_checksum_table[chksum].ci_flags &
			    ZCHECKSUM_FLAG_NOPWRITE);
		}
		dr->dt.dl.dr_overridden_by = *zio->io_bp;
		dr->dt.dl.dr_override_state = DR_OVERRIDDEN;
		dr->dt.dl.dr_copies = zio->io_prop.zp_copies;

		/*
		 * Old style holes are filled with all zeros, whereas
		 * new-style holes maintain their lsize, type, level,
		 * and birth time (see zio_write_compress). While we
		 * need to reset the BP_SET_LSIZE() call that happened
		 * in dmu_sync_ready for old style holes, we do *not*
		 * want to wipe out the information contained in new
		 * style holes. Thus, only zero out the block pointer if
		 * it's an old style hole.
		 */
		if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) &&
		    dr->dt.dl.dr_overridden_by.blk_birth == 0)
			BP_ZERO(&dr->dt.dl.dr_overridden_by);
	} else {
		/* The write failed: leave the dirty record un-overridden. */
		dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
	}
	cv_broadcast(&db->db_changed);
	mutex_exit(&db->db_mtx);

	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);

	kmem_free(dsa, sizeof (*dsa));
}

/*
 * Done callback for the late-arrival write.  On success a non-hole block
 * is freed again through zio_free(); then the transaction is committed,
 * the caller's done callback runs, and the abd and argument are released.
 */
static void
dmu_sync_late_arrival_done(zio_t *zio)
{
	blkptr_t *bp = zio->io_bp;
	dmu_sync_arg_t *dsa = zio->io_private;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	zgd_t *zgd = dsa->dsa_zgd;

	if (zio->io_error == 0) {
		/*
		 * Record the vdev(s) backing this blkptr so they can be
		 * flushed after the writes for the lwb have completed.
		 */
		zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp);

		if (!BP_IS_HOLE(bp)) {
			ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE));
			ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig));
			ASSERT(zio->io_bp->blk_birth == zio->io_txg);
			ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa));
			zio_free(zio->io_spa, zio->io_txg, zio->io_bp);
		}
	}

	dmu_tx_commit(dsa->dsa_tx);

	dsa->dsa_done(dsa->dsa_zgd, zio->io_error);

	/* Drop the abd that was wrapped around the dbuf's data. */
	abd_put(zio->io_abd);
	kmem_free(dsa, sizeof (*dsa));
}

/*
 * Issue the dmu_sync() write as a separate zio_write() in its own
 * transaction, with nopwrite disabled.  dmu_sync() takes this path when
 * the pool is frozen or the requested txg is already syncing.
 *
 * Returns 0 once the zio has been issued, or EIO if the transaction could
 * not be assigned.
 */
static int
dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd,
    zio_prop_t *zp, zbookmark_phys_t *zb)
{
	dmu_sync_arg_t *dsa;
	dmu_tx_t *tx;

	tx = dmu_tx_create(os);
	dmu_tx_hold_space(tx, zgd->zgd_db->db_size);
	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
		dmu_tx_abort(tx);
		/* Make zl_get_data do txg_wait_synced() */
		return (SET_ERROR(EIO));
	}

	/*
	 * In order to prevent the zgd's lwb from being free'd prior to
	 * dmu_sync_late_arrival_done() being called, we have to ensure
	 * the lwb's "max txg" takes this tx's txg into account.
	 */
	zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx));

	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = NULL;
	dsa->dsa_done = done;
	dsa->dsa_zgd = zgd;
	dsa->dsa_tx = tx;

	/*
	 * Since we are currently syncing this txg, it's nontrivial to
	 * determine what BP to nopwrite against, so we disable nopwrite.
	 *
	 * When syncing, the db_blkptr is initially the BP of the previous
	 * txg.  We can not nopwrite against it because it will be changed
	 * (this is similar to the non-late-arrival case where the dbuf is
	 * dirty in a future txg).
	 *
	 * Then dbuf_write_ready() sets bp_blkptr to the location we will write.
	 * We can not nopwrite against it because although the BP will not
	 * (typically) be changed, the data has not yet been persisted to this
	 * location.
	 *
	 * Finally, when dbuf_write_done() is called, it is theoretically
	 * possible to always nopwrite, because the data that was written in
	 * this txg is the same data that we are trying to write.  However we
	 * would need to check that this dbuf is not dirty in any future
	 * txg's (as we do in the normal dmu_sync() path). For simplicity, we
	 * don't nopwrite in this case.
	 */
	zp->zp_nopwrite = B_FALSE;

	zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp,
	    abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size),
	    zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp,
	    dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done,
	    dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb));

	return (0);
}

/*
 * Intent log support: sync the block associated with db to disk.
 * N.B. and XXX: the caller is responsible for making sure that the
 * data isn't changing while dmu_sync() is writing it.
 *
 * Return values:
 *
 *	EEXIST: this txg has already been synced, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	ENOENT: the block was dbuf_free_range()'d, so there's nothing to do.
 *		The caller should not log the write.
 *
 *	EALREADY: this block is already in the process of being synced.
 *		The caller should track its progress (somehow).
 *
 *	EIO: could not do the I/O.
 *		The caller should do a txg_wait_synced().
 *
 *	0: the I/O has been initiated.
 *		The caller should log this blkptr in the done callback.
*		It is possible that the I/O will fail, in which case
 *		the error will be reported to the done callback and
 *		propagated to pio from zio_done().
 */
int
dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db;
	objset_t *os = db->db_objset;
	dsl_dataset_t *ds = os->os_dsl_dataset;
	dbuf_dirty_record_t *dr;
	dmu_sync_arg_t *dsa;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	dnode_t *dn;

	ASSERT(pio != NULL);
	ASSERT(txg != 0);

	SET_BOOKMARK(&zb, ds->ds_object,
	    db->db.db_object, db->db_level, db->db_blkid);

	/* Compute the write policy for this block with WP_DMU_SYNC. */
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp);
	DB_DNODE_EXIT(db);

	/*
	 * If we're frozen (running ziltest), we always need to generate a bp.
	 */
	if (txg > spa_freeze_txg(os->os_spa))
		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));

	/*
	 * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf()
	 * and us.  If we determine that this txg is not yet syncing,
	 * but it begins to sync a moment later, that's OK because the
	 * sync thread will block in dbuf_sync_leaf() until we drop db_mtx.
	 */
	mutex_enter(&db->db_mtx);

	if (txg <= spa_last_synced_txg(os->os_spa)) {
		/*
		 * This txg has already synced.  There's nothing to do.
		 */
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(EEXIST));
	}

	if (txg <= spa_syncing_txg(os->os_spa)) {
		/*
		 * This txg is currently syncing, so we can't mess with
		 * the dirty record anymore; just write a new log block.
		 */
		mutex_exit(&db->db_mtx);
		return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb));
	}

	/* Walk the dirty-record list looking for the record of this txg. */
	dr = db->db_last_dirty;
	while (dr && dr->dr_txg != txg)
		dr = dr->dr_next;

	if (dr == NULL) {
		/*
		 * There's no dr for this dbuf, so it must have been freed.
		 * There's no need to log writes to freed blocks, so we're done.
		 */
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(ENOENT));
	}

	ASSERT(dr->dr_next == NULL || dr->dr_next->dr_txg < txg);

	if (db->db_blkptr != NULL) {
		/*
		 * We need to fill in zgd_bp with the current blkptr so that
		 * the nopwrite code can check if we're writing the same
		 * data that's already on disk.  We can only nopwrite if we
		 * are sure that after making the copy, db_blkptr will not
		 * change until our i/o completes.  We ensure this by
		 * holding the db_mtx, and only allowing nopwrite if the
		 * block is not already dirty (see below).  This is verified
		 * by dmu_sync_done(), which VERIFYs that the db_blkptr has
		 * not changed.
		 */
		*zgd->zgd_bp = *db->db_blkptr;
	}

	/*
	 * Assume the on-disk data is X, the current syncing data (in
	 * txg - 1) is Y, and the current in-memory data is Z (currently
	 * in dmu_sync).
	 *
	 * We usually want to perform a nopwrite if X and Z are the
	 * same.  However, if Y is different (i.e. the BP is going to
	 * change before this write takes effect), then a nopwrite will
	 * be incorrect - we would override with X, which could have
	 * been freed when Y was written.
	 *
	 * (Note that this is not a concern when we are nop-writing from
	 * syncing context, because X and Y must be identical, because
	 * all previous txgs have been synced.)
	 *
	 * Therefore, we disable nopwrite if the current BP could change
	 * before this TXG.  There are two ways it could change: by
	 * being dirty (dr_next is non-NULL), or by being freed
	 * (dnode_block_freed()).  This behavior is verified by
	 * zio_done(), which VERIFYs that the override BP is identical
	 * to the on-disk BP.
	 */
	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	if (dr->dr_next != NULL || dnode_block_freed(dn, db->db_blkid))
		zp.zp_nopwrite = B_FALSE;
	DB_DNODE_EXIT(db);

	ASSERT(dr->dr_txg == txg);
	if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC ||
	    dr->dt.dl.dr_override_state == DR_OVERRIDDEN) {
		/*
		 * We have already issued a sync write for this buffer,
		 * or this buffer has already been synced.  It could not
		 * have been dirtied since, or we would have cleared the state.
		 */
		mutex_exit(&db->db_mtx);
		return (SET_ERROR(EALREADY));
	}

	/* Claim the dirty record before dropping db_mtx. */
	ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
	dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC;
	mutex_exit(&db->db_mtx);

	/* Freed by dmu_sync_done(). */
	dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP);
	dsa->dsa_dr = dr;
	dsa->dsa_done = done;
	dsa->dsa_zgd = zgd;
	dsa->dsa_tx = NULL;

	zio_nowait(arc_write(pio, os->os_spa, txg,
	    zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db),
	    &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa,
	    ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb));

	return (0);
}

/*
 * Set the data block size ('size') and indirect block shift ('ibs') of
 * 'object' through dnode_set_blksz().  Returns the error from dnode_hold()
 * or dnode_set_blksz().
 */
int
dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs,
    dmu_tx_t *tx)
{
	dnode_t *dn;
	int err;

	err = dnode_hold(os, object, FTAG, &dn);
	if (err)
		return (err);
	err = dnode_set_blksz(dn, size, ibs, tx);
	dnode_rele(dn, FTAG);
	return (err);
}

/*
 * Set the checksum function of 'object' and mark its dnode dirty in 'tx'.
 * The value is range-checked by assertion only; a failed dnode_hold()
 * trips VERIFY0.
 */
void
dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	/*
	 * Send streams include each object's checksum function.  This
	 * check ensures that the receiving system can understand the
	 * checksum function transmitted.
	 */
	ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS);

	VERIFY0(dnode_hold(os, object, FTAG, &dn));
	ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS);
	dn->dn_checksum = checksum;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

/*
 * Set the compression function of 'object' and mark its dnode dirty in
 * 'tx'.  The value is range-checked by assertion only; a failed
 * dnode_hold() trips VERIFY0.
 */
void
dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx)
{
	dnode_t *dn;

	/*
	 * Send streams include each object's compression function.  This
	 * check ensures that the receiving system can understand the
	 * compression function transmitted.
	 */
	ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS);

	VERIFY0(dnode_hold(os, object, FTAG, &dn));
	dn->dn_compress = compress;
	dnode_setdirty(dn, tx);
	dnode_rele(dn, FTAG);
}

/*
 * When nonzero, dmu_write_policy() selects ZIO_COMPRESS_EMPTY for metadata
 * instead of the default "on" compression.
 */
int zfs_mdcomp_disable = 0;

/*
 * When the "redundant_metadata" property is set to "most", only indirect
 * blocks of this level and higher will have an additional ditto block.
*/
int zfs_redundant_metadata_most_ditto_level = 2;

/*
 * Fill in 'zp' with the write properties (checksum, compression, type,
 * level, copies, dedup, dedup-verify, nopwrite) for a block of 'dn' at
 * 'level', given the objset's settings and the WP_* flags in 'wp'.
 *
 * 'dn' may be NULL, in which case the block is treated as DMU_OT_OBJSET.
 */
void
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
{
	dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET;
	boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) ||
	    (wp & WP_SPILL));
	enum zio_checksum checksum = os->os_checksum;
	enum zio_compress compress = os->os_compress;
	enum zio_checksum dedup_checksum = os->os_dedup_checksum;
	boolean_t dedup = B_FALSE;
	boolean_t nopwrite = B_FALSE;
	boolean_t dedup_verify = os->os_dedup_verify;
	int copies = os->os_copies;

	/*
	 * We maintain different write policies for each of the following
	 * types of data:
	 *	 1. metadata
	 *	 2. preallocated blocks (i.e. level-0 blocks of a dump device)
	 *	 3. all other level 0 blocks
	 */
	if (ismd) {
		if (zfs_mdcomp_disable) {
			compress = ZIO_COMPRESS_EMPTY;
		} else {
			/*
			 * XXX -- we should design a compression algorithm
			 * that specializes in arrays of bps.
			 */
			compress = zio_compress_select(os->os_spa,
			    ZIO_COMPRESS_ON, ZIO_COMPRESS_ON);
		}

		/*
		 * Metadata always gets checksummed.  If the data
		 * checksum is multi-bit correctable, and it's not a
		 * ZBT-style checksum, then it's suitable for metadata
		 * as well.  Otherwise, the metadata checksum defaults
		 * to fletcher4.
		 */
		if (!(zio_checksum_table[checksum].ci_flags &
		    ZCHECKSUM_FLAG_METADATA) ||
		    (zio_checksum_table[checksum].ci_flags &
		    ZCHECKSUM_FLAG_EMBEDDED))
			checksum = ZIO_CHECKSUM_FLETCHER_4;

		/* Add one ditto copy according to redundant_metadata. */
		if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
		    (os->os_redundant_metadata ==
		    ZFS_REDUNDANT_METADATA_MOST &&
		    (level >= zfs_redundant_metadata_most_ditto_level ||
		    DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
			copies++;
	} else if (wp & WP_NOFILL) {
		ASSERT(level == 0);

		/*
		 * If we're writing preallocated blocks, we aren't actually
		 * writing them so don't set any policy properties.  These
		 * blocks are currently only used by an external subsystem
		 * outside of zfs (i.e. dump) and not written by the zio
		 * pipeline.
		 */
		compress = ZIO_COMPRESS_OFF;
		checksum = ZIO_CHECKSUM_NOPARITY;
	} else {
		compress = zio_compress_select(os->os_spa, dn->dn_compress,
		    compress);

		/* A dedup checksum, when set, overrides the data checksum. */
		checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ?
		    zio_checksum_select(dn->dn_checksum, checksum) :
		    dedup_checksum;

		/*
		 * Determine dedup setting.  If we are in dmu_sync(),
		 * we won't actually dedup now because that's all
		 * done in syncing context; but we do want to use the
		 * dedup checksum.  If the checksum is not strong
		 * enough to ensure unique signatures, force
		 * dedup_verify.
		 */
		if (dedup_checksum != ZIO_CHECKSUM_OFF) {
			dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE;
			if (!(zio_checksum_table[checksum].ci_flags &
			    ZCHECKSUM_FLAG_DEDUP))
				dedup_verify = B_TRUE;
		}

		/*
		 * Enable nopwrite if we have secure enough checksum
		 * algorithm (see comment in zio_nop_write) and
		 * compression is enabled.  We don't enable nopwrite if
		 * dedup is enabled as the two features are mutually
		 * exclusive.
		 */
		nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags &
		    ZCHECKSUM_FLAG_NOPWRITE) &&
		    compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled);
	}

	zp->zp_checksum = checksum;
	zp->zp_compress = compress;
	ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT);

	zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
	zp->zp_level = level;
	/* Never request more copies than the pool can replicate. */
	zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
	zp->zp_dedup = dedup;
	zp->zp_dedup_verify = dedup && dedup_verify;
	zp->zp_nopwrite = nopwrite;
}

/*
 * Search 'object' from *off for the next hole (when 'hole' is set) or the
 * next data, using dnode_next_offset(), which receives 'off' to update.
 * Pending changes to the object are synced first.  Returns the error from
 * dmu_object_wait_synced(), dnode_hold() or dnode_next_offset().
 */
int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
	dnode_t *dn;
	int err;

	/*
	 * Sync any current changes before
	 * we go trundling through the block pointers.
	 */
	err = dmu_object_wait_synced(os, object);
	if (err) {
		return (err);
	}

	err = dnode_hold(os, object, FTAG, &dn);
	if (err) {
		return (err);
	}

	err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
	dnode_rele(dn, FTAG);

	return (err);
}

/*
 * Given the ZFS object, if it contains any dirty nodes
 * this function flushes all dirty blocks to disk.
This * ensures the DMU object info is updated. A more efficient * future version might just find the TXG with the maximum * ID and wait for that to be synced. */ int dmu_object_wait_synced(objset_t *os, uint64_t object) { dnode_t *dn; int error, i; error = dnode_hold(os, object, FTAG, &dn); if (error) { return (error); } for (i = 0; i < TXG_SIZE; i++) { if (list_link_active(&dn->dn_dirty_link[i])) { break; } } dnode_rele(dn, FTAG); if (i != TXG_SIZE) { txg_wait_synced(dmu_objset_pool(os), 0); } return (0); } void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { dnode_phys_t *dnp; rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); dnp = dn->dn_phys; doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 1ULL << dn->dn_indblkshift : 0; doi->doi_type = dn->dn_type; doi->doi_bonus_type = dn->dn_bonustype; doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; doi->doi_nblkptr = dn->dn_nblkptr; doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; doi->doi_fill_count = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); } /* * Get information on a DMU object. * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); if (doi != NULL) dmu_object_info_from_dnode(dn, doi); dnode_rele(dn, FTAG); return (0); } /* * As above, but faster; can be used when you have a held dbuf in hand. 
*/
void
dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;

	/* The dnode is only used between DB_DNODE_ENTER and DB_DNODE_EXIT. */
	DB_DNODE_ENTER(db);
	dmu_object_info_from_dnode(DB_DNODE(db), doi);
	DB_DNODE_EXIT(db);
}

/*
 * Faster still when you only care about the size.
 * This is specifically optimized for zfs_getattr().
 *
 * *blksize receives the data block size; *nblk512 receives the space used
 * rounded to the nearest SPA_MINBLOCKSIZE unit, plus dn_num_slots.
 */
void
dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize,
    u_longlong_t *nblk512)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);

	*blksize = dn->dn_datablksz;
	/* add in number of slots used for the dnode itself */
	*nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >>
	    SPA_MINBLOCKSHIFT) + dn->dn_num_slots;
	DB_DNODE_EXIT(db);
}

/*
 * Report the on-disk size of the object's dnode, in bytes
 * (dn_num_slots << DNODE_SHIFT), through *dnsize.
 */
void
dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize)
{
	dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake;
	dnode_t *dn;

	DB_DNODE_ENTER(db);
	dn = DB_DNODE(db);
	*dnsize = dn->dn_num_slots << DNODE_SHIFT;
	DB_DNODE_EXIT(db);
}

/*
 * Byte-swap, in place, the 'size'-byte buffer 'vbuf' viewed as an array of
 * 64-bit words.  'size' must be a multiple of 8 (asserted).
 *
 * The index is a size_t so that it has the same type as the element count:
 * no signed/unsigned comparison, and no signed overflow for arrays holding
 * more than INT_MAX elements.
 */
void
byteswap_uint64_array(void *vbuf, size_t size)
{
	uint64_t *buf = vbuf;
	size_t count = size >> 3;
	size_t i;

	ASSERT((size & 7) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_64(buf[i]);
}

/*
 * Byte-swap, in place, the 'size'-byte buffer 'vbuf' viewed as an array of
 * 32-bit words.  'size' must be a multiple of 4 (asserted).
 */
void
byteswap_uint32_array(void *vbuf, size_t size)
{
	uint32_t *buf = vbuf;
	size_t count = size >> 2;
	size_t i;

	ASSERT((size & 3) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_32(buf[i]);
}

/*
 * Byte-swap, in place, the 'size'-byte buffer 'vbuf' viewed as an array of
 * 16-bit words.  'size' must be a multiple of 2 (asserted).
 */
void
byteswap_uint16_array(void *vbuf, size_t size)
{
	uint16_t *buf = vbuf;
	size_t count = size >> 1;
	size_t i;

	ASSERT((size & 1) == 0);

	for (i = 0; i < count; i++)
		buf[i] = BSWAP_16(buf[i]);
}

/*
 * Single bytes have no byte order, so there is nothing to do.
 */
/* ARGSUSED */
void
byteswap_uint8_array(void *vbuf, size_t size)
{
}

/*
 * Initialize the DMU and the subsystems it brings up, in the order listed.
 */
void
dmu_init(void)
{
	abd_init();
	zfs_dbgmsg_init();
	sa_cache_init();
	xuio_stat_init();
	dmu_objset_init();
	dnode_init();
	zfetch_init();
	l2arc_init();
	arc_init();
	dbuf_init();
}

/*
 * Tear down everything dmu_init() set up.
 */
void
dmu_fini(void)
{
	arc_fini(); /* arc depends on l2arc, so arc must go first */
	l2arc_fini();
	zfetch_fini();
	dbuf_fini();
	dnode_fini();
	dmu_objset_fini();
	xuio_stat_fini();
	sa_cache_fini();
	zfs_dbgmsg_fini();
	abd_fini();
}
Index: vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zap.h =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zap.h (revision 353618) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/sys/zap.h (revision 353619) @@ -1,504 +1,507 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ #ifndef _SYS_ZAP_H #define _SYS_ZAP_H /* * ZAP - ZFS Attribute Processor * * The ZAP is a module which sits on top of the DMU (Data Management * Unit) and implements a higher-level storage primitive using DMU * objects. Its primary consumer is the ZPL (ZFS Posix Layer). * * A "zapobj" is a DMU object which the ZAP uses to stores attributes. * Users should use only zap routines to access a zapobj - they should * not access the DMU object directly using DMU routines. * * The attributes stored in a zapobj are name-value pairs. The name is * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including * terminating NULL). 
The value is an array of integers, which may be * 1, 2, 4, or 8 bytes long. The total space used by the array (number * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. * Note that an 8-byte integer value can be used to store the location * (object number) of another dmu object (which may be itself a zapobj). * Note that you can use a zero-length attribute to store a single bit * of information - the attribute is present or not. * * The ZAP routines are thread-safe. However, you must observe the * DMU's restriction that a transaction may not be operated on * concurrently. * * Any of the routines that return an int may return an I/O error (EIO * or ECHECKSUM). * * * Implementation / Performance Notes: * * The ZAP is intended to operate most efficiently on attributes with * short (49 bytes or less) names and single 8-byte values, for which * the microzap will be used. The ZAP should be efficient enough so * that the user does not need to cache these attributes. * * The ZAP's locking scheme makes its routines thread-safe. Operations * on different zapobjs will be processed concurrently. Operations on * the same zapobj which only read data will be processed concurrently. * Operations on the same zapobj which modify data will be processed * concurrently when there are many attributes in the zapobj (because * the ZAP uses per-block locking - more than 128 * (number of cpus) * small attributes will suffice). */ /* * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C * strings) for the names of attributes, rather than a byte string * bounded by an explicit length. If some day we want to support names * in character sets which have embedded zeros (eg. UTF-16, UTF-32), * we'll have to add routines for using length-bounded strings. */ #include #include #ifdef __cplusplus extern "C" { #endif /* * Specifies matching criteria for ZAP lookups. 
* MT_NORMALIZE Use ZAP normalization flags, which can include both * unicode normalization and case-insensitivity. * MT_MATCH_CASE Do case-sensitive lookups even if MT_NORMALIZE is * specified and ZAP normalization flags include * U8_TEXTPREP_TOUPPER. */ typedef enum matchtype { MT_NORMALIZE = 1 << 0, MT_MATCH_CASE = 1 << 1, } matchtype_t; typedef enum zap_flags { /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ ZAP_FLAG_HASH64 = 1 << 0, /* Key is binary, not string (zap_add_uint64() can be used) */ ZAP_FLAG_UINT64_KEY = 1 << 1, /* * First word of key (which must be an array of uint64) is * already randomly distributed. */ ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, } zap_flags_t; /* * Create a new zapobj with no attributes and return its object number. * * dnodesize specifies the on-disk size of the dnode for the new zapobj. * Valid values are multiples of 512 up to DNODE_MAX_SIZE. */ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx); uint64_t 
zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); /* * Initialize an already-allocated object. */ void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) * object number. */ int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); int zap_create_claim_norm(objset_t *ds, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); /* * The zapobj passed in must be a valid ZAP object for all of the * following routines. */ /* * Destroy this zapobj and all its attributes. * * Frees the object number using dmu_object_free. */ int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); /* * Manipulate attributes. * * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. */ /* * Retrieve the contents of the attribute with the given name. * * If the requested attribute does not exist, the call will fail and * return ENOENT. * * If 'integer_size' is smaller than the attribute's integer size, the * call will fail and return EINVAL. * * If 'integer_size' is equal to or larger than the attribute's integer * size, the call will succeed and return 0. * * When converting to a larger integer size, the integers will be treated as * unsigned (ie. no sign-extension will be performed). * * 'num_integers' is the length (in integers) of 'buf'. * * If the attribute is longer than the buffer, as many integers as will * fit will be transferred to 'buf'. 
If the entire attribute was not * transferred, the call will return EOVERFLOW. */ int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); /* * If rn_len is nonzero, realname will be set to the name of the found * entry (which may be different from the requested name if mt includes * MT_NORMALIZE). * * If normalization_conflictp is not NULL, it will be set if there is * another name with the same case/unicode normalized form. */ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *normalization_conflictp); int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints); int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf); int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp); int zap_count_write_by_dnode(dnode_t *dn, const char *name, int add, zfs_refcount_t *towrite, zfs_refcount_t *tooverwrite); /* * Create an attribute with the given name and value. * * If an attribute with the given name already exists, the call will * fail and return EEXIST.
*/ int zap_add(objset_t *ds, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int zap_add_by_dnode(dnode_t *dn, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. If an * attribute with the given name does not exist, it will be created. If * an attribute with the given name already exists, the previous value * will be overwritten. The integer_size may be different from the * existing attribute's integer size, in which case the attribute's * integer size will be updated to the new value. */ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified * attribute. * * If the requested attribute does not exist, the call will fail and * return ENOENT. */ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers); int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers); /* * Remove the specified attribute. * * If the specified attribute does not exist, the call will fail and * return ENOENT. 
*/ int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx); int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap * object. */ int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); /* * Returns (in name) the name of the entry whose (value & mask) * (za_first_integer) is value, or ENOENT if not found. The string * pointed to by name must be at least 256 bytes long. If mask==0, the * match must be exact (ie, same as mask=-1ULL). */ int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name); /* * Transfer all the entries from fromobj into intoobj. Only works on * int_size=8 num_integers=1 values. Fails if there are any duplicated * entries. */ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); /* Same as zap_join, but set the values to 'value'. */ int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, uint64_t value, dmu_tx_t *tx); /* Same as zap_join, but add together any duplicated entries. */ int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); /* * Manipulate entries where the name + value are the "same" (the name is * a stringified version of the value). */ int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, dmu_tx_t *tx); /* Here the key is an int and the value is a different int. 
*/ int zap_add_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx); int zap_update_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx); int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep); int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx); struct zap; struct zap_leaf; typedef struct zap_cursor { /* This structure is opaque! */ objset_t *zc_objset; struct zap *zc_zap; struct zap_leaf *zc_leaf; uint64_t zc_zapobj; uint64_t zc_serialized; uint64_t zc_hash; uint32_t zc_cd; + boolean_t zc_prefetch; } zap_cursor_t; typedef struct { int za_integer_length; /* * za_normalization_conflict will be set if there are additional * entries with this normalized form (eg, "foo" and "Foo"). */ boolean_t za_normalization_conflict; uint64_t za_num_integers; uint64_t za_first_integer; /* no sign extension for <8byte ints */ char za_name[ZAP_MAXNAMELEN]; } zap_attribute_t; /* * The interface for listing all the attributes of a zapobj can be * thought of as cursor moving down a list of the attributes one by * one. The cookie returned by the zap_cursor_serialize routine is * persistent across system calls (and across reboot, even). */ /* * Initialize a zap cursor, pointing to the "first" attribute of the * zapobj. You must _fini the cursor when you are done with it. */ void zap_cursor_init(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj); +void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, + uint64_t zapobj); void zap_cursor_fini(zap_cursor_t *zc); /* * Get the attribute currently pointed to by the cursor. Returns * ENOENT if at the end of the attributes. */ int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); /* * Advance the cursor to the next attribute. */ void zap_cursor_advance(zap_cursor_t *zc); /* * Get a persistent cookie pointing to the current position of the zap * cursor. 
The low 4 bits in the cookie are always zero, and thus can * be used to differentiate a serialized cookie from a different type * of value. The cookie will be less than 2^32 as long as there are * fewer than 2^22 (4.2 million) entries in the zap object. */ uint64_t zap_cursor_serialize(zap_cursor_t *zc); /* * Initialize a zap cursor pointing to the position recorded by * zap_cursor_serialize (in the "serialized" argument). You can also * use a "serialized" argument of 0 to start at the beginning of the * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to * zap_cursor_init(...).) */ void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, uint64_t zapobj, uint64_t serialized); #define ZAP_HISTOGRAM_SIZE 10 typedef struct zap_stats { /* * Size of the pointer table (in number of entries). * This is always a power of 2, or zero if it's a microzap. * In general, it should be considerably greater than zs_num_leafs. */ uint64_t zs_ptrtbl_len; uint64_t zs_blocksize; /* size of zap blocks */ /* * The number of blocks used. Note that some blocks may be * wasted because old ptrtbl's and large name/value blocks are * not reused. (Although their space is reclaimed, we don't * reuse those offsets in the object.) */ uint64_t zs_num_blocks; /* * Pointer table values from zap_ptrtbl in the zap_phys_t */ uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ uint64_t zs_ptrtbl_blks_copied; /* number of source blocks copied */ uint64_t zs_ptrtbl_zt_blk; /* starting block number */ uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ /* * Values of the other members of the zap_phys_t */ uint64_t zs_block_type; /* ZBT_HEADER */ uint64_t zs_magic; /* ZAP_MAGIC */ uint64_t zs_num_leafs; /* The number of leaf blocks */ uint64_t zs_num_entries; /* The number of zap entries */ uint64_t zs_salt; /* salt to stir into hash function */ /* * Histograms.
For all histograms, the last index * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater * than what can be represented. For example * zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number * of leafs with 45 or more entries. */ /* * zs_leafs_with_2n_pointers[n] is the number of leafs with * 2^n pointers to them. */ uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; /* * zs_blocks_with_n5_entries[n] is the number of leafs with * [n*5, (n+1)*5) entries. In the current implementation, there * can be at most 55 entries in any block, but there may be * fewer if the name or value is large, or the block is not * completely full. */ uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; /* * zs_blocks_n_tenths_full[n] is the number of leafs whose * fullness is in the range [n/10, (n+1)/10). */ uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; /* * zs_entries_using_n_chunks[n] is the number of entries which * consume n 24-byte chunks. (Note, large names/values only use * one chunk, but contribute to zs_num_blocks_large.) */ uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; /* * zs_buckets_with_n_entries[n] is the number of buckets (each * leaf has 64 buckets) with n entries. * zs_buckets_with_n_entries[1] should be very close to * zs_num_entries. */ uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; } zap_stats_t; /* * Get statistics about a ZAP object. Note: you need to be aware of the * internal implementation of the ZAP to correctly interpret some of the * statistics. This interface shouldn't be relied on unless you really * know what you're doing.
*/ int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); #ifdef __cplusplus } #endif #endif /* _SYS_ZAP_H */ Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zap.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/zap.c (revision 353618) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/zap.c (revision 353619) @@ -1,1309 +1,1353 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* * This file contains the top half of the zfs directory structure * implementation. The bottom half is in zap_leaf.c. * * The zdir is an extendable hash data structure. There is a table of * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are * each a constant size and hold a variable number of directory entries. * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. * * The pointer table holds a power of 2 number of pointers. * (1<zd_data->zd_phys->zd_prefix_len). 
The bucket pointed to * by the pointer at index i in the table holds entries whose hash value * has a zd_prefix_len-bit prefix */ #include #include #include #include #include #include #include #include #include +/* + * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object + * (all leaf blocks) when we start iterating over it. + * + * For zap_cursor_init(), the callers all intend to iterate through all the + * entries. There are a few cases where an error (typically i/o error) could + * cause it to bail out early. + * + * For zap_cursor_init_serialized(), there are callers that do the iteration + * outside of ZFS. Typically they would iterate over everything, but we + * don't have control of that. E.g. zfs_ioc_snapshot_list_next(), + * zcp_snapshots_iter(), and other iterators over things in the MOS - these + * are called by /sbin/zfs and channel programs. The other example is + * zfs_readdir() which iterates over directory entries for the getdents() + * syscall. /sbin/ls iterates to the end (unless it receives a signal), but + * userland doesn't have to. + * + * Given that the ZAP entries aren't returned in a specific order, the only + * legitimate use cases for partial iteration would be: + * + * 1. Pagination: e.g. you only want to display 100 entries at a time, so you + * get the first 100 and then wait for the user to hit "next page", which + * they may never do. + * + * 2. You want to know if there are more than X entries, without relying on + * the zfs-specific implementation of the directory's st_size (which is + * the number of entries).
+ */ +boolean_t zap_iterate_prefetch = B_TRUE; + int fzap_default_block_shift = 14; /* 16k blocksize */ extern inline zap_phys_t *zap_f_phys(zap_t *zap); static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); void fzap_byteswap(void *vbuf, size_t size) { uint64_t block_type = *(uint64_t *)vbuf; if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) zap_leaf_byteswap(vbuf, size); else { /* it's a ptrtbl block */ byteswap_uint64_array(vbuf, size); } } void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) { ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync; zap->zap_dbu.dbu_evict_func_async = NULL; mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; zap_phys_t *zp = zap_f_phys(zap); /* * explicitly zero it since it might be coming from an * initialized microzap */ bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); zp->zap_block_type = ZBT_HEADER; zp->zap_magic = ZAP_MAGIC; zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); zp->zap_freeblk = 2; /* block 1 will be the first leaf */ zp->zap_num_leafs = 1; zp->zap_num_entries = 0; zp->zap_salt = zap->zap_salt; zp->zap_normflags = zap->zap_normflags; zp->zap_flags = flags; /* block 1 will be the first leaf */ for (int i = 0; i < (1<zap_ptrtbl.zt_shift); i++) ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; /* * set up block 1 - the first leaf */ dmu_buf_t *db; VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, 1<l_dbuf = db; zap_leaf_init(l, zp->zap_normflags != 0); kmem_free(l, sizeof (zap_leaf_t)); dmu_buf_rele(db, FTAG); } static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) { if (RW_WRITE_HELD(&zap->zap_rwlock)) return (1); if (rw_tryupgrade(&zap->zap_rwlock)) { dmu_buf_will_dirty(zap->zap_dbuf, tx); return (1); } return (0); } /* * Generic routines for dealing with the pointer & cookie tables. 
*/ static int zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), dmu_tx_t *tx) { uint64_t newblk; int bs = FZAP_BLOCK_SHIFT(zap); int hepb = 1<<(bs-4); /* hepb = half the number of entries in a block */ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); ASSERT(tbl->zt_numblks > 0); if (tbl->zt_nextblk != 0) { newblk = tbl->zt_nextblk; } else { newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); dmu_prefetch(zap->zap_objset, zap->zap_object, 0, tbl->zt_blk << bs, tbl->zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); } /* * Copy the ptrtbl from the old to new location. */ uint64_t b = tbl->zt_blks_copied; dmu_buf_t *db_old; int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err != 0) return (err); /* first half of entries in old[b] go to new[2*b+0] */ dmu_buf_t *db_new; VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); dmu_buf_rele(db_old, FTAG); tbl->zt_blks_copied++; dprintf("copied block %llu of %llu\n", tbl->zt_blks_copied, tbl->zt_numblks); if (tbl->zt_blks_copied == tbl->zt_numblks) { (void) dmu_free_range(zap->zap_objset, zap->zap_object, tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); tbl->zt_blk = newblk; tbl->zt_numblks *= 2; tbl->zt_shift++; tbl->zt_nextblk = 0; tbl->zt_blks_copied = 0; dprintf("finished; numblocks now %llu (%lluk entries)\n", tbl->zt_numblks, 
1<<(tbl->zt_shift-10)); } return (0); } static int zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, dmu_tx_t *tx) { int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); dprintf("storing %llx at index %llx\n", val, idx); uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); dmu_buf_t *db; int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); dmu_buf_will_dirty(db, tx); if (tbl->zt_nextblk != 0) { uint64_t idx2 = idx * 2; uint64_t blk2 = idx2 >> (bs-3); uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err != 0) { dmu_buf_rele(db, FTAG); return (err); } dmu_buf_will_dirty(db2, tx); ((uint64_t *)db2->db_data)[off2] = val; ((uint64_t *)db2->db_data)[off2+1] = val; dmu_buf_rele(db2, FTAG); } ((uint64_t *)db->db_data)[off] = val; dmu_buf_rele(db, FTAG); return (0); } static int zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) { int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); uint64_t blk = idx >> (bs-3); uint64_t off = idx & ((1<<(bs-3))-1); /* * Note: this is equivalent to dmu_buf_hold(), but we use * _dnode_enter / _by_dnode because it's faster because we don't * have to hold the dnode. */ dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); dmu_buf_t *db; int err = dmu_buf_hold_by_dnode(dn, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); *valp = ((uint64_t *)db->db_data)[off]; dmu_buf_rele(db, FTAG); if (tbl->zt_nextblk != 0) { /* * read the nextblk for the sake of i/o error checking, * so that zap_table_load() will catch errors for * zap_table_store. 
*/ blk = (idx*2) >> (bs-3); dn = dmu_buf_dnode_enter(zap->zap_dbuf); err = dmu_buf_hold_by_dnode(dn, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); dmu_buf_dnode_exit(zap->zap_dbuf); if (err == 0) dmu_buf_rele(db, FTAG); } return (err); } /* * Routines for growing the ptrtbl. */ static void zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) { for (int i = 0; i < n; i++) { uint64_t lb = src[i]; dst[2 * i + 0] = lb; dst[2 * i + 1] = lb; } } static int zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) { /* * The pointer table should never use more hash bits than we * have (otherwise we'd be using useless zero bits to index it). * If we are within 2 bits of running out, stop growing, since * this is already an aberrant condition. */ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) return (SET_ERROR(ENOSPC)); if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* * We are outgrowing the "embedded" ptrtbl (the one * stored in the header block). Give it its own entire * block, which will double the size of the ptrtbl. 
*/ ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); uint64_t newblk = zap_allocate_blocks(zap, 1); dmu_buf_t *db_new; int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err != 0) return (err); dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); dmu_buf_rele(db_new, FTAG); zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; zap_f_phys(zap)->zap_ptrtbl.zt_shift++; ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << (FZAP_BLOCK_SHIFT(zap)-3)); return (0); } else { return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, zap_ptrtbl_transfer, tx)); } } static void zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) { dmu_buf_will_dirty(zap->zap_dbuf, tx); mutex_enter(&zap->zap_f.zap_num_entries_mtx); ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); zap_f_phys(zap)->zap_num_entries += delta; mutex_exit(&zap->zap_f.zap_num_entries_mtx); } static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks) { ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); uint64_t newblk = zap_f_phys(zap)->zap_freeblk; zap_f_phys(zap)->zap_freeblk += nblocks; return (newblk); } static void zap_leaf_evict_sync(void *dbu) { zap_leaf_t *l = dbu; rw_destroy(&l->l_rwlock); kmem_free(l, sizeof (zap_leaf_t)); } static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); 
dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); zap_f_phys(zap)->zap_num_leafs++; return (l); } int fzap_count(zap_t *zap, uint64_t *count) { ASSERT(!zap->zap_ismicro); mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ *count = zap_f_phys(zap)->zap_num_entries; mutex_exit(&zap->zap_f.zap_num_entries_mtx); return (0); } /* * Routines for obtaining zap_leaf_t's */ void zap_put_leaf(zap_leaf_t *l) { rw_exit(&l->l_rwlock); dmu_buf_rele(l->l_dbuf, NULL); } static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { ASSERT(blkid != 0); zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; l->l_bs = highbit64(db->db_size) - 1; l->l_dbuf = db; dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ zap_leaf_evict_sync(&l->l_dbu); l = winner; } /* * lhr_pad was previously used for the next leaf in the leaf * chain. There should be no chained leafs (as we have removed * support for them). 
*/ ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); /* * There should be more hash entries than there can be * chunks to put in the hash table */ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); /* The chunks should begin at the end of the hash table */ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); /* The chunks should end at the end of the block */ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); return (l); } static int zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { dmu_buf_t *db; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); int bs = FZAP_BLOCK_SHIFT(zap); dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); int err = dmu_buf_hold_by_dnode(dn, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); dmu_buf_dnode_exit(zap->zap_dbuf); if (err != 0) return (err); ASSERT3U(db->db_object, ==, zap->zap_object); ASSERT3U(db->db_offset, ==, blkid << bs); ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); zap_leaf_t *l = dmu_buf_get_user(db); if (l == NULL) l = zap_open_leaf(blkid, db); rw_enter(&l->l_rwlock, lt); /* * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, * causing ASSERT below to fail. 
*/ if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); *lp = l; return (0); } static int zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) { ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { ASSERT3U(idx, <, (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); return (0); } else { return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, valp)); } } static int zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) { ASSERT(tx != NULL); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; return (0); } else { return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, blk, tx)); } } static int zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { uint64_t blk; ASSERT(zap->zap_dbuf == NULL || zap_f_phys(zap) == zap->zap_dbuf->db_data); /* Reality check for corrupt zap objects (leaf or header). 
*/ if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { return (SET_ERROR(EIO)); } uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); int err = zap_idx_to_blk(zap, idx, &blk); if (err != 0) return (err); err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); ASSERT(err || ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == zap_leaf_phys(*lp)->l_hdr.lh_prefix); return (err); } static int zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, void *tag, dmu_tx_t *tx, zap_leaf_t **lp) { zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; int err; int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); if (zap_tryupgradedir(zap, tx) == 0 || old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { /* We failed to upgrade, or need to grow the pointer table */ objset_t *os = zap->zap_objset; uint64_t object = zap->zap_object; zap_put_leaf(l); zap_unlockdir(zap, tag); err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; if (err != 0) return (err); ASSERT(!zap->zap_ismicro); while (old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { err = zap_grow_ptrtbl(zap, tx); if (err != 0) return (err); } err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); if (err != 0) return (err); if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { /* it split while our locks were down */ *lp = l; return (0); } } ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - (old_prefix_len + 1); uint64_t sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) 
<< prefix_diff; /* check for i/o errors before doing zap_leaf_split */ for (int i = 0; i < (1ULL << prefix_diff); i++) { uint64_t blk; err = zap_idx_to_blk(zap, sibling + i, &blk); if (err != 0) return (err); ASSERT3U(blk, ==, l->l_blkid); } zap_leaf_t *nl = zap_create_leaf(zap, tx); zap_leaf_split(l, nl, zap->zap_normflags != 0); /* set sibling pointers */ for (int i = 0; i < (1ULL << prefix_diff); i++) { err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); ASSERT0(err); /* we checked for i/o errors above */ } if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { /* we want the sibling */ zap_put_leaf(l); *lp = nl; } else { zap_put_leaf(nl); *lp = l; } return (0); } static void zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, void *tag, dmu_tx_t *tx) { zap_t *zap = zn->zn_zap; int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); zap_put_leaf(l); if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { /* * We are in the middle of growing the pointer table, or * this leaf will soon make us grow it. 
*/ if (zap_tryupgradedir(zap, tx) == 0) { objset_t *os = zap->zap_objset; uint64_t zapobj = zap->zap_object; zap_unlockdir(zap, tag); int err = zap_lockdir(os, zapobj, tx, RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); zap = zn->zn_zap; if (err != 0) return; } /* could have finished growing while our locks were down */ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) (void) zap_grow_ptrtbl(zap, tx); } } static int fzap_checkname(zap_name_t *zn) { if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int fzap_checksize(uint64_t integer_size, uint64_t num_integers) { /* Only integer sizes supported by C */ switch (integer_size) { case 1: case 2: case 4: case 8: break; default: return (SET_ERROR(EINVAL)); } if (integer_size * num_integers > ZAP_MAXVALUELEN) return (E2BIG); return (0); } static int fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) { int err = fzap_checkname(zn); if (err != 0) return (err); return (fzap_checksize(integer_size, num_integers)); } /* * Routines for manipulating attributes. 
*/ int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *ncp) { zap_leaf_t *l; zap_entry_handle_t zeh; int err = fzap_checkname(zn); if (err != 0) return (err); err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { if ((err = fzap_checksize(integer_size, num_integers)) != 0) { zap_put_leaf(l); return (err); } err = zap_entry_read(&zeh, integer_size, num_integers, buf); (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); if (ncp) { *ncp = zap_entry_normalization_conflict(&zeh, zn, NULL, zn->zn_zap); } } zap_put_leaf(l); return (err); } int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); ASSERT(fzap_check(zn, integer_size, num_integers) == 0); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { err = SET_ERROR(EEXIST); goto out; } if (err != ENOENT) goto out; err = zap_entry_create(l, zn, cd, integer_size, num_integers, val, &zeh); if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } out: if (zap != NULL) zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); return (err); } int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, void *tag, dmu_tx_t *tx) { int err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); return (fzap_add_cd(zn, integer_size, num_integers, val, ZAP_NEED_CD, tag, tx)); } int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const 
void *val, void *tag, dmu_tx_t *tx) { zap_leaf_t *l; int err; boolean_t create; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: err = zap_leaf_lookup(l, zn, &zeh); create = (err == ENOENT); ASSERT(err == 0 || err == ENOENT); if (create) { err = zap_entry_create(l, zn, ZAP_NEED_CD, integer_size, num_integers, val, &zeh); if (err == 0) zap_increment_num_entries(zap, 1, tx); } else { err = zap_entry_update(&zeh, integer_size, num_integers, val); } if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tag, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } if (zap != NULL) zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); return (err); } int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err != 0) goto out; if (integer_size != 0) *integer_size = zeh.zeh_integer_size; if (num_integers != 0) *num_integers = zeh.zeh_num_integers; out: zap_put_leaf(l); return (err); } int fzap_remove(zap_name_t *zn, dmu_tx_t *tx) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { zap_entry_remove(&zeh); zap_increment_num_entries(zn->zn_zap, -1, tx); } zap_put_leaf(l); return (err); } void fzap_prefetch(zap_name_t *zn) { uint64_t blk; zap_t *zap = zn->zn_zap; uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, zap_f_phys(zap)->zap_ptrtbl.zt_shift); if (zap_idx_to_blk(zap, idx, &blk) != 0) return; int bs = FZAP_BLOCK_SHIFT(zap); dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, 
ZIO_PRIORITY_SYNC_READ); } /* * Helper functions for consumers. */ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx) { return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx)); } uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx) { uint64_t new_obj; VERIFY((new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, dnodesize, tx)) > 0); VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, tx)); return (new_obj); } int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name) { zap_cursor_t zc; int err; if (mask == 0) mask = -1ULL; zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, zapobj); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { if ((za->za_first_integer & mask) == (value & mask)) { (void) strcpy(name, za->za_name); break; } } zap_cursor_fini(&zc); kmem_free(za, sizeof (*za)); return (err); } int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; int err = 0; zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_add(os, intoobj, za->za_name, 8, 1, &za->za_first_integer, tx); if (err != 0) break; } zap_cursor_fini(&zc); kmem_free(za, sizeof (*za)); return (err); } int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, uint64_t value, dmu_tx_t *tx) { zap_cursor_t zc; int err = 0; zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } 
err = zap_add(os, intoobj, za->za_name, 8, 1, &value, tx); if (err != 0) break; } zap_cursor_fini(&zc); kmem_free(za, sizeof (*za)); return (err); } int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; int err = 0; zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, za) == 0; (void) zap_cursor_advance(&zc)) { uint64_t delta = 0; if (za->za_integer_length != 8 || za->za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta); if (err != 0 && err != ENOENT) break; delta += za->za_first_integer; err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx); if (err != 0) break; } zap_cursor_fini(&zc); kmem_free(za, sizeof (*za)); return (err); } int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_add(os, obj, name, 8, 1, &value, tx)); } int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_remove(os, obj, name, tx)); } int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_lookup(os, obj, name, 8, 1, &value)); } int zap_add_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_add(os, obj, name, 8, 1, &value, tx)); } int zap_update_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_update(os, obj, name, 8, 1, &value, tx)); } int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) { char name[20]; (void) 
snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_lookup(os, obj, name, 8, 1, valuep)); } int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx) { uint64_t value = 0; if (delta == 0) return (0); int err = zap_lookup(os, obj, name, 8, 1, &value); if (err != 0 && err != ENOENT) return (err); value += delta; if (value == 0) err = zap_remove(os, obj, name, tx); else err = zap_update(os, obj, name, 8, 1, &value, tx); return (err); } int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_increment(os, obj, name, delta, tx)); } /* * Routines for iterating over the attributes. */ int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) { int err = ENOENT; zap_entry_handle_t zeh; zap_leaf_t *l; /* retrieve the next entry at or after zc_hash/zc_cd */ /* if no entry, return ENOENT */ + + /* + * If we are reading from the beginning, we're almost + * certain to iterate over the entire ZAP object. If there are + * multiple leaf blocks (freeblk > 2), prefetch the whole + * object, so that we read the leaf blocks concurrently. + * (Unless noprefetch was requested via zap_cursor_init_noprefetch()). 
+ */ + if (zc->zc_hash == 0 && zap_iterate_prefetch && + zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { + dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), + ZIO_PRIORITY_ASYNC_READ); + } if (zc->zc_leaf && (ZAP_HASH_IDX(zc->zc_hash, zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; } again: if (zc->zc_leaf == NULL) { err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, &zc->zc_leaf); if (err != 0) return (err); } else { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); } l = zc->zc_leaf; err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); if (err == ENOENT) { uint64_t nocare = (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; zc->zc_cd = 0; if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) { zc->zc_hash = -1ULL; } else { zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; goto again; } } if (err == 0) { zc->zc_hash = zeh.zeh_hash; zc->zc_cd = zeh.zeh_cd; za->za_integer_length = zeh.zeh_integer_size; za->za_num_integers = zeh.zeh_num_integers; if (zeh.zeh_num_integers == 0) { za->za_first_integer = 0; } else { err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); ASSERT(err == 0 || err == EOVERFLOW); } err = zap_entry_read_name(zap, &zeh, sizeof (za->za_name), za->za_name); ASSERT(err == 0); za->za_normalization_conflict = zap_entry_normalization_conflict(&zeh, NULL, za->za_name, zap); } rw_exit(&zc->zc_leaf->l_rwlock); return (err); } static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { uint64_t lastblk = 0; /* * NB: if a leaf has more pointers than an entire ptrtbl block * can hold, then it'll be accounted for more than once, since * we won't have lastblk. 
*/ for (int i = 0; i < len; i++) { zap_leaf_t *l; if (tbl[i] == lastblk) continue; lastblk = tbl[i]; int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); if (err == 0) { zap_leaf_stats(zap, l, zs); zap_put_leaf(l); } } } void fzap_get_stats(zap_t *zap, zap_stats_t *zs) { int bs = FZAP_BLOCK_SHIFT(zap); zs->zs_blocksize = 1ULL << bs; /* * Set zap_phys_t fields */ zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; zs->zs_block_type = zap_f_phys(zap)->zap_block_type; zs->zs_magic = zap_f_phys(zap)->zap_magic; zs->zs_salt = zap_f_phys(zap)->zap_salt; /* * Set zap_ptrtbl fields */ zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; zs->zs_ptrtbl_blks_copied = zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* the ptrtbl is entirely in the header block. 
*/ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { dmu_prefetch(zap->zap_objset, zap->zap_object, 0, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, ZIO_PRIORITY_SYNC_READ); for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { dmu_buf_t *db; int err; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs); dmu_buf_rele(db, FTAG); } } } } Index: vendor-sys/illumos/dist/uts/common/fs/zfs/zap_micro.c =================================================================== --- vendor-sys/illumos/dist/uts/common/fs/zfs/zap_micro.c (revision 353618) +++ vendor-sys/illumos/dist/uts/common/fs/zfs/zap_micro.c (revision 353619) @@ -1,1540 +1,1561 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2011, 2017 by Delphix. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
* Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif extern inline mzap_phys_t *zap_m_phys(zap_t *zap); static int mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) { if (zap->zap_ismicro) return (0); return (zap_f_phys(zap)->zap_flags); } int zap_hashbits(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return (48); else return (28); } uint32_t zap_maxcd(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return ((1<<16)-1); else return (-1U); } static uint64_t zap_hash(zap_name_t *zn) { zap_t *zap = zn->zn_zap; uint64_t h = 0; if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); h = *(uint64_t *)zn->zn_key_orig; } else { h = zap->zap_salt; ASSERT(h != 0); ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { const uint64_t *wp = zn->zn_key_norm; ASSERT(zn->zn_key_intlen == 8); for (int i = 0; i < zn->zn_key_norm_numints; wp++, i++) { uint64_t word = *wp; for (int j = 0; j < zn->zn_key_intlen; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; } } } else { const uint8_t *cp = zn->zn_key_norm; /* * We previously stored the terminating null on * disk, but didn't hash it, so we need to * continue to not hash it. (The * zn_key_*_numints includes the terminating * null for non-binary keys.) */ int len = zn->zn_key_norm_numints - 1; ASSERT(zn->zn_key_intlen == 1); for (int i = 0; i < len; cp++, i++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF]; } } } /* * Don't use all 64 bits, since we need some in the cookie for * the collision differentiator. We MUST use the high bits, * since those are the ones that we first pay attention to when * chosing the bucket. 
*/ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); return (h); } static int zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags) { ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); size_t inlen = strlen(name) + 1; size_t outlen = ZAP_MAXNAMELEN; int err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } boolean_t zap_match(zap_name_t *zn, const char *matchname) { ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); if (zn->zn_matchtype & MT_NORMALIZE) { char norm[ZAP_MAXNAMELEN]; if (zap_normalize(zn->zn_zap, matchname, norm, zn->zn_normflags) != 0) return (B_FALSE); return (strcmp(zn->zn_key_norm, norm) == 0); } else { return (strcmp(zn->zn_key_orig, matchname) == 0); } } void zap_name_free(zap_name_t *zn) { kmem_free(zn, sizeof (zap_name_t)); } zap_name_t * zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = key; zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; zn->zn_matchtype = mt; zn->zn_normflags = zap->zap_normflags; /* * If we're dealing with a case sensitive lookup on a mixed or * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup * will fold case to all caps overriding the lookup request. */ if (mt & MT_MATCH_CASE) zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; if (zap->zap_normflags) { /* * We *must* use zap_normflags because this normalization is * what the hash is computed from. 
*/ if (zap_normalize(zap, key, zn->zn_normbuf, zap->zap_normflags) != 0) { zap_name_free(zn); return (NULL); } zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { if (mt != 0) { zap_name_free(zn); return (NULL); } zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm_numints = zn->zn_key_orig_numints; } zn->zn_hash = zap_hash(zn); if (zap->zap_normflags != zn->zn_normflags) { /* * We *must* use zn_normflags because this normalization is * what the matching is based on. (Not the hash!) */ if (zap_normalize(zap, key, zn->zn_normbuf, zn->zn_normflags) != 0) { zap_name_free(zn); return (NULL); } zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } return (zn); } zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); ASSERT(zap->zap_normflags == 0); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = zn->zn_key_norm = key; zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; zn->zn_matchtype = 0; zn->zn_hash = zap_hash(zn); return (zn); } static void mzap_byteswap(mzap_phys_t *buf, size_t size) { buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); buf->mz_normflags = BSWAP_64(buf->mz_normflags); int max = (size / MZAP_ENT_LEN) - 1; for (int i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = BSWAP_64(buf->mz_chunk[i].mze_value); buf->mz_chunk[i].mze_cd = BSWAP_32(buf->mz_chunk[i].mze_cd); } } void zap_byteswap(void *buf, size_t size) { uint64_t block_type = *(uint64_t *)buf; if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { /* ASSERT(magic == ZAP_LEAF_MAGIC); */ mzap_byteswap(buf, size); } else { fzap_byteswap(buf, size); } } static int mze_compare(const void *arg1, const void *arg2) { const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; if (mze1->mze_hash > mze2->mze_hash) return (+1); if (mze1->mze_hash < mze2->mze_hash) return (-1); if 
(mze1->mze_cd > mze2->mze_cd) return (+1); if (mze1->mze_cd < mze2->mze_cd) return (-1); return (0); } static void mze_insert(zap_t *zap, int chunkid, uint64_t hash) { ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); avl_add(&zap->zap_m.zap_avl, mze); } static mzap_ent_t * mze_find(zap_name_t *zn) { mzap_ent_t mze_tofind; mzap_ent_t *mze; avl_index_t idx; avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); mze_tofind.mze_hash = zn->zn_hash; mze_tofind.mze_cd = 0; mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } return (NULL); } static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; avl_index_t idx; avl_tree_t *avl = &zap->zap_m.zap_avl; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; uint32_t cd = 0; for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { if (mze->mze_cd != cd) break; cd++; } return (cd); } static void mze_remove(zap_t *zap, mzap_ent_t *mze) { ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); avl_remove(&zap->zap_m.zap_avl, mze); kmem_free(mze, sizeof (mzap_ent_t)); } static void mze_destroy(zap_t *zap) { mzap_ent_t *mze; void *avlcookie = NULL; while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) kmem_free(mze, sizeof (mzap_ent_t)); avl_destroy(&zap->zap_m.zap_avl); } static zap_t * mzap_open(objset_t *os, uint64_t obj, 
dmu_buf_t *db) { zap_t *winner; uint64_t *zap_hdr = (uint64_t *)db->db_data; uint64_t zap_block_type = zap_hdr[0]; uint64_t zap_magic = zap_hdr[1]; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, 0, 0, 0); rw_enter(&zap->zap_rwlock, RW_WRITER); zap->zap_objset = os; zap->zap_object = obj; zap->zap_dbuf = db; if (zap_block_type != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { winner = NULL; /* No actual winner here... */ goto handle_winner; } } else { zap->zap_ismicro = TRUE; } /* * Make sure that zap_ismicro is set before we let others see * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf); winner = dmu_buf_set_user(db, &zap->zap_dbu); if (winner != NULL) goto handle_winner; if (zap->zap_ismicro) { zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { zap_name_t *zn; zap->zap_m.zap_num_entries++; zn = zap_name_alloc(zap, mze->mze_name, 0); mze_insert(zap, i, zn->zn_hash); zap_name_free(zn); } } } else { zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_normflags = zap_f_phys(zap)->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); /* * The embedded pointer table should not overlap the * other members. 
*/ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, &zap_f_phys(zap)->zap_salt); /* * The embedded pointer table should end at the end of * the block */ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 1<zap_dbuf->db_size); } rw_exit(&zap->zap_rwlock); return (zap); handle_winner: rw_exit(&zap->zap_rwlock); rw_destroy(&zap->zap_rwlock); if (!zap->zap_ismicro) mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); return (winner); } /* * This routine "consumes" the caller's hold on the dbuf, which must * have the specified tag. */ static int zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { ASSERT0(db->db_offset); objset_t *os = dmu_buf_get_objset(db); uint64_t obj = db->db_object; *zapp = NULL; zap_t *zap = dmu_buf_get_user(db); if (zap == NULL) { zap = mzap_open(os, obj, db); if (zap == NULL) { /* * mzap_open() didn't like what it saw on-disk. * Check for corruption! */ return (SET_ERROR(EIO)); } } /* * We're checking zap_ismicro without the lock held, in order to * tell what type of lock we want. Once we have some sort of * lock, see if it really is the right type. In practice this * can only be different if it was upgraded from micro to fat, * and micro wanted WRITER but fat only needs READER. */ krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; rw_enter(&zap->zap_rwlock, lt); if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { /* it was upgraded, now we only need reader */ ASSERT(lt == RW_WRITER); ASSERT(RW_READER == (!zap->zap_ismicro && fatreader) ? 
RW_READER : lti); rw_downgrade(&zap->zap_rwlock); lt = RW_READER; } zap->zap_objset = os; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3P(zap->zap_dbuf, ==, db); ASSERT(!zap->zap_ismicro || zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; if (newsz > MZAP_MAX_BLKSZ) { dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; int err = mzap_upgrade(zapp, tag, tx, 0); if (err != 0) rw_exit(&zap->zap_rwlock); return (err); } VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; } *zapp = zap; return (0); } static int zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) { return (err); } #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) { dmu_buf_rele(db, tag); } return (err); } int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) { dmu_buf_t *db; int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); if (err != 0) return (err); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); if (err != 0) dmu_buf_rele(db, tag); return (err); } void zap_unlockdir(zap_t *zap, void *tag) { rw_exit(&zap->zap_rwlock); dmu_buf_rele(zap->zap_dbuf, tag); } static int mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t 
flags) { int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); int sz = zap->zap_dbuf->db_size; mzap_phys_t *mzp = zio_buf_alloc(sz); bcopy(zap->zap_dbuf->db_data, mzp, sz); int nchunks = zap->zap_m.zap_num_chunks; if (!flags) { err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); if (err != 0) { zio_buf_free(mzp, sz); return (err); } } dprintf("upgrading obj=%llu with %u chunks\n", zap->zap_object, nchunks); /* XXX destroy the avl later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx, flags); for (int i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, mze->mze_value); zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tag, tx); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ zap_name_free(zn); if (err != 0) break; } zio_buf_free(mzp, sz); *zapp = zap; return (err); } /* * The "normflags" determine the behavior of the matchtype_t which is * passed to zap_lookup_norm(). Names which have the same normalized * version will be stored with the same hash value, and therefore we can * perform normalization-insensitive lookups. We can be Unicode form- * insensitive and/or case-insensitive. The following flags are valid for * "normflags": * * U8_TEXTPREP_NFC * U8_TEXTPREP_NFD * U8_TEXTPREP_NFKC * U8_TEXTPREP_NFKD * U8_TEXTPREP_TOUPPER * * The *_NF* (Normalization Form) flags are mutually exclusive; at most one * of them may be supplied. 
*/ void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_tx_t *tx) { dmu_buf_t *db; VERIFY0(dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db, tx); mzap_phys_t *zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); zap_unlockdir(zap, FTAG); } else { dmu_buf_rele(db, FTAG); } } int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, 0, tx)); } int zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { return (zap_create_claim_norm_dnsize(os, obj, 0, ot, bonustype, bonuslen, dnodesize, tx)); } int zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, bonuslen, 0, tx)); } int zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { int err; err = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, dnodesize, tx); if (err != 0) return (err); mzap_create_impl(os, obj, normflags, 0, tx); return (0); } uint64_t zap_create(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); } uint64_t zap_create_dnsize(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t 
*tx) { return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, dnodesize, tx)); } uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, 0, tx)); } uint64_t zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, dnodesize, tx); mzap_create_impl(os, obj, normflags, 0, tx); return (obj); } uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); return (zap_create_flags_dnsize(os, normflags, flags, ot, leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); } uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) { uint64_t obj = dmu_object_alloc_dnsize(os, ot, 0, bonustype, bonuslen, dnodesize, tx); ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT && indirect_blockshift >= SPA_MINBLOCKSHIFT && indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT); VERIFY(dmu_object_set_blocksize(os, obj, 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); mzap_create_impl(os, obj, normflags, flags, tx); return (obj); } int zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) { /* * dmu_object_free will free the object number and free the * data. Freeing the data will cause our pageout function to be * called, which will destroy our data (zap_leaf_t's and zap_t). 
*/ return (dmu_object_free(os, zapobj, tx)); } void zap_evict_sync(void *dbu) { zap_t *zap = dbu; rw_destroy(&zap->zap_rwlock); if (zap->zap_ismicro) mze_destroy(zap); else mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); } int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); if (!zap->zap_ismicro) { err = fzap_count(zap, count); } else { *count = zap->zap_m.zap_num_entries; } zap_unlockdir(zap, FTAG); return (err); } /* * zn may be NULL; if not specified, it will be computed if needed. * See also the comment above zap_entry_normalization_conflict(). */ static boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) { int direction = AVL_BEFORE; boolean_t allocdzn = B_FALSE; if (zap->zap_normflags == 0) return (B_FALSE); again: for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); other && other->mze_hash == mze->mze_hash; other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { if (zn == NULL) { zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, MT_NORMALIZE); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } if (direction == AVL_BEFORE) { direction = AVL_AFTER; goto again; } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for manipulating attributes. 
*/ int zap_lookup(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm(os, zapobj, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } static int zap_lookup_impl(zap_t *zap, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { int err = 0; zap_name_t *zn = zap_name_alloc(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (num_integers < 1) { err = SET_ERROR(EOVERFLOW); } else if (integer_size != 8) { err = SET_ERROR(EINVAL); } else { *(uint64_t *)buf = MZE_PHYS(zap, mze)->mze_value; (void) strlcpy(realname, MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze); } } } } zap_name_free(zn); return (err); } int zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } int zap_lookup_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm_by_dnode(dn, name, integer_size, num_integers, buf, 0, NULL, 0, NULL)); } int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); err = 
zap_lookup_impl(zap, name, integer_size, num_integers, buf, mt, realname, rn_len, ncp); zap_unlockdir(zap, FTAG); return (err); } int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } fzap_prefetch(zn); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_lookup(zn, integer_size, num_integers, buf, NULL, 0, NULL); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } int zap_contains(objset_t *os, uint64_t zapobj, const char *name) { int err = zap_lookup_norm(os, zapobj, name, 0, 0, NULL, 0, NULL, 0, NULL); if (err == EOVERFLOW || err == EINVAL) err = 0; /* found, but skipped reading the value */ return (err); } int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (integer_size) *integer_size = 8; if (num_integers) *num_integers = 1; } } zap_name_free(zn); zap_unlockdir(zap, FTAG); return 
(err); } int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_length(zn, integer_size, num_integers); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } static void mzap_addent(zap_name_t *zn, uint64_t value) { zap_t *zap = zn->zn_zap; int start = zap->zap_m.zap_alloc_next; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd < zap_maxcd(zap)); again: for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; (void) strcpy(mze->mze_name, zn->zn_key_orig); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; mze_insert(zap, i, zn->zn_hash); return; } } if (start != 0) { start = 0; goto again; } ASSERT(!"out of entries!"); } static int zap_add_impl(zap_t *zap, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx, void *tag) { const uint64_t *intval = val; int err = 0; zap_name_t *zn = zap_name_alloc(zap, key, 0); if (zn == NULL) { zap_unlockdir(zap, tag); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || 
num_integers != 1 || strlen(key) >= MZAP_NAME_LEN) { err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); if (err == 0) { err = fzap_add(zn, integer_size, num_integers, val, tag, tx); } zap = zn->zn_zap; /* fzap_add() may change zap */ } else { if (mze_find(zn) != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap, tag); return (err); } int zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); /* zap_add_impl() calls zap_unlockdir() */ return (err); } int zap_add_by_dnode(dnode_t *dn, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); /* zap_add_impl() calls zap_unlockdir() */ return (err); } int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap, FTAG); return (err); } int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, 
dmu_tx_t *tx) { zap_t *zap; uint64_t oldval; const uint64_t *intval = val; #ifdef ZFS_DEBUG /* * If there is an old value, it shouldn't change across the * lockdir (eg, due to bprewrite's xlation). */ if (integer_size == 8 && num_integers == 1) (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); #endif int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc(zap, name, 0); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); if (err == 0) { err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); } zap = zn->zn_zap; /* fzap_update() may change zap */ } else { mzap_ent_t *mze = mze_find(zn); if (mze != NULL) { ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); MZE_PHYS(zap, mze)->mze_value = *intval; } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap, FTAG); return (err); } int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed 
*/ zap_unlockdir(zap, FTAG); return (err); } int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { return (zap_remove_norm(os, zapobj, name, 0, tx)); } static int zap_remove_impl(zap_t *zap, const char *name, matchtype_t mt, dmu_tx_t *tx) { int err = 0; zap_name_t *zn = zap_name_alloc(zap, name, mt); if (zn == NULL) return (SET_ERROR(ENOTSUP)); if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { mzap_ent_t *mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], sizeof (mzap_ent_phys_t)); mze_remove(zap, mze); } } zap_name_free(zn); return (err); } int zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); err = zap_remove_impl(zap, name, mt, tx); zap_unlockdir(zap, FTAG); return (err); } int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) { zap_t *zap; int err; err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err) return (err); err = zap_remove_impl(zap, name, 0, tx); zap_unlockdir(zap, FTAG); return (err); } int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; int err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); if (err != 0) return (err); zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap, FTAG); return (SET_ERROR(ENOTSUP)); } err = fzap_remove(zn, tx); zap_name_free(zn); zap_unlockdir(zap, FTAG); return (err); } /* * Routines for iterating over the attributes. 
*/ -void -zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, - uint64_t serialized) +static void +zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized, boolean_t prefetch) { zc->zc_objset = os; zc->zc_zap = NULL; zc->zc_leaf = NULL; zc->zc_zapobj = zapobj; zc->zc_serialized = serialized; zc->zc_hash = 0; zc->zc_cd = 0; + zc->zc_prefetch = prefetch; } +void +zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized) +{ + zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); +} +/* + * Initialize a cursor at the beginning of the ZAP object. The entire + * ZAP object will be prefetched. + */ void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { - zap_cursor_init_serialized(zc, os, zapobj, 0); + zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); +} + +/* + * Initialize a cursor at the beginning, but request that we not prefetch + * the entire ZAP object. + */ +void +zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); } void zap_cursor_fini(zap_cursor_t *zc) { if (zc->zc_zap) { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); zap_unlockdir(zc->zc_zap, NULL); zc->zc_zap = NULL; } if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; } zc->zc_objset = NULL; } uint64_t zap_cursor_serialize(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return (-1ULL); if (zc->zc_zap == NULL) return (zc->zc_serialized); ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); /* * We want to keep the high 32 bits of the cursor zero if we can, so * that 32-bit programs can access this. So usually use a small * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits * of the cursor. 
* * [ collision differentiator | zap_hashbits()-bit hash value ] */ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); } int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { int err; if (zc->zc_hash == -1ULL) return (SET_ERROR(ENOENT)); if (zc->zc_zap == NULL) { int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); if (err != 0) return (err); /* * To support zap_cursor_init_serialized, advance, retrieve, * we must add to the existing zc_cd, which may already * be 1 due to the zap_cursor_advance. */ ASSERT(zc->zc_hash == 0); hb = zap_hashbits(zc->zc_zap); zc->zc_hash = zc->zc_serialized << (64 - hb); zc->zc_cd += zc->zc_serialized >> hb; if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { avl_index_t idx; mzap_ent_t mze_tofind; mze_tofind.mze_hash = zc->zc_hash; mze_tofind.mze_cd = zc->zc_cd; mzap_ent_t *mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, idx, AVL_AFTER); } if (mze) { mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; (void) strcpy(za->za_name, mzep->mze_name); zc->zc_hash = mze->mze_hash; zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; err = SET_ERROR(ENOENT); } } rw_exit(&zc->zc_zap->zap_rwlock); return (err); } void zap_cursor_advance(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return; zc->zc_cd++; } int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) { zap_t *zap; int err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, 
&zap); if (err != 0) return (err); bzero(zs, sizeof (zap_stats_t)); if (zap->zap_ismicro) { zs->zs_blocksize = zap->zap_dbuf->db_size; zs->zs_num_entries = zap->zap_m.zap_num_entries; zs->zs_num_blocks = 1; } else { fzap_get_stats(zap, zs); } zap_unlockdir(zap, FTAG); return (0); }