Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c	(revision 337168)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_object.c	(revision 337169)
@@ -1,241 +1,251 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
 * Copyright 2014 HybridCluster. All rights reserved.
 */

#include <sys/dmu.h>
#include <sys/dmu_objset.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/zap.h>
#include <sys/zfeature.h>

uint64_t
-dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+    int indirect_blockshift,
    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
    uint64_t object;
    uint64_t L1_dnode_count = DNODES_PER_BLOCK <<
        (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT);
    dnode_t *dn = NULL;

    mutex_enter(&os->os_obj_lock);
    for (;;) {
        object = os->os_obj_next;
        /*
         * Each time we polish off an L1 bp worth of dnodes (2^12
         * objects), move to another L1 bp that's still reasonably
         * sparse (at most 1/4 full).  Look from the beginning at most
         * once per txg, but after that keep looking from here.
         * os_scan_dnodes is set during txg sync if enough objects
         * have been freed since the previous rescan to justify
         * backfilling again.  If we can't find a suitable block, just
         * keep going from here.
         *
         * Note that dmu_traverse depends on the behavior that we use
         * multiple blocks of the dnode object before going back to
         * reuse objects.  Any change to this algorithm should preserve
         * that property or find another solution to the issues
         * described in traverse_visitbp.
         */
        if (P2PHASE(object, L1_dnode_count) == 0) {
            uint64_t offset;
            int error;
            if (os->os_rescan_dnodes) {
                offset = 0;
                os->os_rescan_dnodes = B_FALSE;
            } else {
                offset = object << DNODE_SHIFT;
            }
            error = dnode_next_offset(DMU_META_DNODE(os),
                DNODE_FIND_HOLE,
                &offset, 2, DNODES_PER_BLOCK >> 2, 0);
            if (error == 0)
                object = offset >> DNODE_SHIFT;
        }
        os->os_obj_next = ++object;

        /*
         * XXX We should check for an i/o error here and return
         * up to our caller.  Actually we should pre-read it in
         * dmu_tx_assign(), but there is currently no mechanism
         * to do so.
         */
        (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE,
            FTAG, &dn);
        if (dn)
            break;

        if (dmu_object_next(os, &object, B_TRUE, 0) == 0)
            os->os_obj_next = object - 1;
    }

-   dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
+   dnode_allocate(dn, ot, blocksize, indirect_blockshift,
+       bonustype, bonuslen, tx);
    mutex_exit(&os->os_obj_lock);

    dmu_tx_add_new_object(tx, dn);
    dnode_rele(dn, FTAG);

    return (object);
+}
+
+uint64_t
+dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
+{
+   return (dmu_object_alloc_ibs(os, ot, blocksize, 0,
+       bonustype, bonuslen, tx));
}

int
dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
    dnode_t *dn;
    int err;

    if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx))
        return (SET_ERROR(EBADF));

    err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn);
    if (err)
        return (err);
    dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx);
    dmu_tx_add_new_object(tx, dn);

    dnode_rele(dn, FTAG);

    return (0);
}

int
dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx)
{
    dnode_t *dn;
    int err;

    if (object == DMU_META_DNODE_OBJECT)
        return (SET_ERROR(EBADF));

    err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
        FTAG, &dn);
    if (err)
        return (err);

    dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx);

    dnode_rele(dn, FTAG);
    return (err);
}

int
dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx)
{
    dnode_t *dn;
    int err;

    ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx));

    err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED,
        FTAG, &dn);
    if (err)
        return (err);

    ASSERT(dn->dn_type != DMU_OT_NONE);
    dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
    dnode_free(dn, tx);
    dnode_rele(dn, FTAG);

    return (0);
}

/*
 * Return (in *objectp) the next object which is allocated (or a hole)
 * after *object, taking into account only objects that may have been modified
 * after the specified txg.
 */
int
dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg)
{
    uint64_t offset = (*objectp + 1) << DNODE_SHIFT;
    int error;

    error = dnode_next_offset(DMU_META_DNODE(os),
        (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg);

    *objectp = offset >> DNODE_SHIFT;

    return (error);
}

/*
 * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the
 * refcount on SPA_FEATURE_EXTENSIBLE_DATASET.
 *
 * Only for use from syncing context, on MOS objects.
 */
void
dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type,
    dmu_tx_t *tx)
{
    dnode_t *dn;

    ASSERT(dmu_tx_is_syncing(tx));

    VERIFY0(dnode_hold(mos, object, FTAG, &dn));
    if (dn->dn_type == DMU_OTN_ZAP_METADATA) {
        dnode_rele(dn, FTAG);
        return;
    }
    ASSERT3U(dn->dn_type, ==, old_type);
    ASSERT0(dn->dn_maxblkid);

    /*
     * We must initialize the ZAP data before changing the type,
     * so that concurrent calls to *_is_zapified() can determine if
     * the object has been completely zapified by checking the type.
     */
    mzap_create_impl(mos, object, 0, 0, tx);

    dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type =
        DMU_OTN_ZAP_METADATA;
    dnode_setdirty(dn, tx);
    dnode_rele(dn, FTAG);

    spa_feature_incr(dmu_objset_spa(mos),
        SPA_FEATURE_EXTENSIBLE_DATASET, tx);
}

void
dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx)
{
    dnode_t *dn;
    dmu_object_type_t t;

    ASSERT(dmu_tx_is_syncing(tx));

    VERIFY0(dnode_hold(mos, object, FTAG, &dn));
    t = dn->dn_type;
    dnode_rele(dn, FTAG);

    if (t == DMU_OTN_ZAP_METADATA) {
        spa_feature_decr(dmu_objset_spa(mos),
            SPA_FEATURE_EXTENSIBLE_DATASET, tx);
    }
    VERIFY0(dmu_object_free(mos, object, tx));
}
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	(revision 337168)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c	(revision 337169)
@@ -1,1089 +1,1101 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
 */

#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_tx.h>
#include <sys/dnode.h>
#include <sys/dsl_pool.h>
#include <sys/sysctl.h>
#include <sys/zio.h>
#include <sys/space_map.h>
#include <sys/refcount.h>
#include <sys/zfeature.h>

SYSCTL_DECL(_vfs_zfs);

/*
 * Note on space map block size:
 *
 * The data for a given space map can be kept on blocks of any size.
 * Larger blocks entail fewer I/O operations, but they also cause the
 * DMU to keep more data in-core, and also to waste more I/O bandwidth
 * when only a few blocks have changed since the last transaction group.
 */

/*
 * Enabled whenever we want to stress test the use of double-word
 * space map entries.
 */
boolean_t zfs_force_some_double_word_sm_entries = B_FALSE;

+/*
+ * Override the default indirect block size of 128K, instead using 16K for
+ * spacemaps (2^14 bytes).  This dramatically reduces write inflation since
+ * appending to a spacemap typically has to write one data block (4KB) and one
+ * or two indirect blocks (16K-32K, rather than 128K).
+ */
+int space_map_ibs = 14;
+
+SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN,
+    &space_map_ibs, 0, "Space map indirect block shift");
+
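/*
 * Illustrative sketch (editorial, not part of this change): the write
 * inflation arithmetic behind the comment above.  Appending one entry to a
 * spacemap dirties the tail data block plus the indirect block(s) covering
 * it, so the indirect block size dominates the cost.  The helper name is
 * hypothetical.
 */
static inline uint64_t
sm_append_write_estimate(uint64_t data_blksz, int indirect_blockshift,
    int indirect_levels)
{
    /* one data block, plus one indirect block per indirect level */
    return (data_blksz + (uint64_t)indirect_levels *
        (1ULL << indirect_blockshift));
}
/*
 * With one indirect level: sm_append_write_estimate(4096, 17, 1) is
 * 4K + 128K, while sm_append_write_estimate(4096, 14, 1) is 4K + 16K.
 */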
boolean_t
sm_entry_is_debug(uint64_t e)
{
    return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX);
}

boolean_t
sm_entry_is_single_word(uint64_t e)
{
    uint8_t prefix = SM_PREFIX_DECODE(e);
    return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX);
}

boolean_t
sm_entry_is_double_word(uint64_t e)
{
    return (SM_PREFIX_DECODE(e) == SM2_PREFIX);
}

/*
 * Iterate through the space map, invoking the callback on each (non-debug)
 * space map entry.
 */
int
space_map_iterate(space_map_t *sm, sm_cb_t callback, void *arg)
{
    uint64_t sm_len = space_map_length(sm);
    ASSERT3U(sm->sm_blksz, !=, 0);

    dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, sm_len,
        ZIO_PRIORITY_SYNC_READ);

    uint64_t blksz = sm->sm_blksz;
    int error = 0;
    for (uint64_t block_base = 0; block_base < sm_len && error == 0;
        block_base += blksz) {
        dmu_buf_t *db;
        error = dmu_buf_hold(sm->sm_os, space_map_object(sm),
            block_base, FTAG, &db, DMU_READ_PREFETCH);
        if (error != 0)
            return (error);

        uint64_t *block_start = db->db_data;
        uint64_t block_length = MIN(sm_len - block_base, blksz);
        uint64_t *block_end = block_start +
            (block_length / sizeof (uint64_t));

        VERIFY0(P2PHASE(block_length, sizeof (uint64_t)));
        VERIFY3U(block_length, !=, 0);
        ASSERT3U(blksz, ==, db->db_size);

        for (uint64_t *block_cursor = block_start;
            block_cursor < block_end && error == 0; block_cursor++) {
            uint64_t e = *block_cursor;

            if (sm_entry_is_debug(e)) /* Skip debug entries */
                continue;

            uint64_t raw_offset, raw_run, vdev_id;
            maptype_t type;
            if (sm_entry_is_single_word(e)) {
                type = SM_TYPE_DECODE(e);
                vdev_id = SM_NO_VDEVID;
                raw_offset = SM_OFFSET_DECODE(e);
                raw_run = SM_RUN_DECODE(e);
            } else {
                /* it is a two-word entry */
                ASSERT(sm_entry_is_double_word(e));
                raw_run = SM2_RUN_DECODE(e);
                vdev_id = SM2_VDEV_DECODE(e);

                /* move on to the second word */
                block_cursor++;
                e = *block_cursor;
                VERIFY3P(block_cursor, <=, block_end);

                type = SM2_TYPE_DECODE(e);
                raw_offset = SM2_OFFSET_DECODE(e);
            }

            uint64_t entry_offset = (raw_offset << sm->sm_shift) +
                sm->sm_start;
            uint64_t entry_run = raw_run << sm->sm_shift;

            VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
            VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
            ASSERT3U(entry_offset, >=, sm->sm_start);
            ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size);
            ASSERT3U(entry_run, <=, sm->sm_size);
            ASSERT3U(entry_offset + entry_run, <=,
                sm->sm_start + sm->sm_size);

            space_map_entry_t sme = {
                .sme_type = type,
                .sme_vdev = vdev_id,
                .sme_offset = entry_offset,
                .sme_run = entry_run
            };
            error = callback(&sme, arg);
        }
        dmu_buf_rele(db, FTAG);
    }
    return (error);
}
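/*
 * Illustrative sketch (editorial, not part of this change): a minimal
 * sm_cb_t consumer for space_map_iterate() above, tallying the bytes the
 * map records as allocated.  The callback name and cookie use are
 * hypothetical.
 */
static int
sm_count_alloc_cb(space_map_entry_t *sme, void *arg)
{
    uint64_t *allocated = arg;

    if (sme->sme_type == SM_ALLOC)
        *allocated += sme->sme_run;
    return (0);	/* a nonzero return would stop the iteration */
}
/*
 * Usage: uint64_t bytes = 0;
 *        error = space_map_iterate(sm, sm_count_alloc_cb, &bytes);
 */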
/*
 * Reads the entries from the last block of the space map into
 * buf in reverse order. Populates nwords with number of words
 * in the last block.
 *
 * Refer to block comment within space_map_incremental_destroy()
 * to understand why this function is needed.
 */
static int
space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf,
    uint64_t bufsz, uint64_t *nwords)
{
    int error = 0;
    dmu_buf_t *db;

    /*
     * Find the offset of the last word in the space map and use
     * that to read the last block of the space map with
     * dmu_buf_hold().
     */
    uint64_t last_word_offset =
        sm->sm_phys->smp_objsize - sizeof (uint64_t);
    error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset,
        FTAG, &db, DMU_READ_NO_PREFETCH);
    if (error != 0)
        return (error);

    ASSERT3U(sm->sm_object, ==, db->db_object);
    ASSERT3U(sm->sm_blksz, ==, db->db_size);
    ASSERT3U(bufsz, >=, db->db_size);
    ASSERT(nwords != NULL);

    uint64_t *words = db->db_data;
    *nwords =
        (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);

    ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t));

    uint64_t n = *nwords;
    uint64_t j = n - 1;
    for (uint64_t i = 0; i < n; i++) {
        uint64_t entry = words[i];
        if (sm_entry_is_double_word(entry)) {
            /*
             * Since we are populating the buffer backwards
             * we have to be extra careful and add the two
             * words of the double-word entry in the right
             * order.
             */
            ASSERT3U(j, >, 0);
            buf[j - 1] = entry;

            i++;
            ASSERT3U(i, <, n);
            entry = words[i];
            buf[j] = entry;
            j -= 2;
        } else {
            ASSERT(sm_entry_is_debug(entry) ||
                sm_entry_is_single_word(entry));
            buf[j] = entry;
            j--;
        }
    }

    /*
     * Assert that we wrote backwards all the
     * way to the beginning of the buffer.
     */
    ASSERT3S(j, ==, -1);

    dmu_buf_rele(db, FTAG);
    return (error);
}

/*
 * Note: This function performs destructive actions - specifically
 * it deletes entries from the end of the space map. Thus, callers
 * should ensure that they are holding the appropriate locks for
 * the space map that they provide.
 */
int
space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg,
    dmu_tx_t *tx)
{
    uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE);
    uint64_t *buf = zio_buf_alloc(bufsz);

    dmu_buf_will_dirty(sm->sm_dbuf, tx);

    /*
     * Ideally we would want to iterate from the beginning of the
     * space map to the end in incremental steps. The issue with this
     * approach is that we don't have any field on-disk that points
     * us where to start between each step. We could try zeroing out
     * entries that we've destroyed, but this doesn't work either as
     * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]).
     *
     * As a result, we destroy its entries incrementally starting from
     * the end after applying the callback to each of them.
     *
     * The problem with this approach is that we cannot literally
     * iterate through the words in the space map backwards as we
     * can't distinguish two-word space map entries from their second
     * word. Thus we do the following:
     *
     * 1] We get all the entries from the last block of the space map
     *    and put them into a buffer in reverse order. This way the
     *    last entry comes first in the buffer, the second to last is
     *    second, etc.
     * 2] We iterate through the entries in the buffer and we apply
     *    the callback to each one. As we move from entry to entry we
     *    decrease the size of the space map, effectively deleting
     *    each entry.
     * 3] If there are no more entries in the space map or the callback
     *    returns a value other than 0, we stop iterating over the
     *    space map. If there are entries remaining and the callback
     *    returned 0, we go back to step [1].
     */
    int error = 0;
    while (space_map_length(sm) > 0 && error == 0) {
        uint64_t nwords = 0;
        error = space_map_reversed_last_block_entries(sm, buf, bufsz,
            &nwords);
        if (error != 0)
            break;

        ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t));

        for (uint64_t i = 0; i < nwords; i++) {
            uint64_t e = buf[i];

            if (sm_entry_is_debug(e)) {
                sm->sm_phys->smp_objsize -= sizeof (uint64_t);
                space_map_update(sm);
                continue;
            }

            int words = 1;
            uint64_t raw_offset, raw_run, vdev_id;
            maptype_t type;
            if (sm_entry_is_single_word(e)) {
                type = SM_TYPE_DECODE(e);
                vdev_id = SM_NO_VDEVID;
                raw_offset = SM_OFFSET_DECODE(e);
                raw_run = SM_RUN_DECODE(e);
            } else {
                ASSERT(sm_entry_is_double_word(e));
                words = 2;

                raw_run = SM2_RUN_DECODE(e);
                vdev_id = SM2_VDEV_DECODE(e);

                /* move to the second word */
                i++;
                e = buf[i];

                ASSERT3P(i, <=, nwords);

                type = SM2_TYPE_DECODE(e);
                raw_offset = SM2_OFFSET_DECODE(e);
            }

            uint64_t entry_offset =
                (raw_offset << sm->sm_shift) + sm->sm_start;
            uint64_t entry_run = raw_run << sm->sm_shift;

            VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift));
            VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift));
            VERIFY3U(entry_offset, >=, sm->sm_start);
            VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size);
            VERIFY3U(entry_run, <=, sm->sm_size);
            VERIFY3U(entry_offset + entry_run, <=,
                sm->sm_start + sm->sm_size);

            space_map_entry_t sme = {
                .sme_type = type,
                .sme_vdev = vdev_id,
                .sme_offset = entry_offset,
                .sme_run = entry_run
            };
            error = callback(&sme, arg);
            if (error != 0)
                break;

            if (type == SM_ALLOC)
                sm->sm_phys->smp_alloc -= entry_run;
            else
                sm->sm_phys->smp_alloc += entry_run;
            sm->sm_phys->smp_objsize -= words * sizeof (uint64_t);
            space_map_update(sm);
        }
    }

    if (space_map_length(sm) == 0) {
        ASSERT0(error);
        ASSERT0(sm->sm_phys->smp_objsize);
        ASSERT0(sm->sm_alloc);
    }

    zio_buf_free(buf, bufsz);
    return (error);
}

typedef struct space_map_load_arg {
    space_map_t *smla_sm;
    range_tree_t *smla_rt;
    maptype_t   smla_type;
} space_map_load_arg_t;

static int
space_map_load_callback(space_map_entry_t *sme, void *arg)
{
    space_map_load_arg_t *smla = arg;
    if (sme->sme_type == smla->smla_type) {
        VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=,
            smla->smla_sm->sm_size);
        range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run);
    } else {
        range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run);
    }

    return (0);
}

/*
 * Load the space map from disk into the specified range tree. Segments of
 * maptype are added to the range tree, other segment types are removed.
 */
int
space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype)
{
    uint64_t space;
    int err;
    space_map_load_arg_t smla;

    VERIFY0(range_tree_space(rt));
    space = space_map_allocated(sm);

    if (maptype == SM_FREE) {
        range_tree_add(rt, sm->sm_start, sm->sm_size);
        space = sm->sm_size - space;
    }

    smla.smla_rt = rt;
    smla.smla_sm = sm;
    smla.smla_type = maptype;
    err = space_map_iterate(sm, space_map_load_callback, &smla);

    if (err == 0) {
        VERIFY3U(range_tree_space(rt), ==, space);
    } else {
        range_tree_vacate(rt, NULL, NULL);
    }

    return (err);
}

void
space_map_histogram_clear(space_map_t *sm)
{
    if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
        return;

    bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram));
}

boolean_t
space_map_histogram_verify(space_map_t *sm, range_tree_t *rt)
{
    /*
     * Verify that the in-core range tree does not have any
     * ranges smaller than our sm_shift size.
     */
    for (int i = 0; i < sm->sm_shift; i++) {
        if (rt->rt_histogram[i] != 0)
            return (B_FALSE);
    }
    return (B_TRUE);
}
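/*
 * Illustrative sketch (editorial, not part of this change): the bucket
 * folding that space_map_histogram_add() below performs, isolated as one
 * expression.  The helper name is hypothetical.
 */
static inline uint64_t
sm_histogram_contrib(uint64_t count, int rt_bucket, int sm_idx, int sm_shift)
{
    /* mirrors: rt_histogram[i] << (i - idx - sm->sm_shift) */
    return (count << (rt_bucket - sm_idx - sm_shift));
}
/*
 * Example from the comment below: with sm_shift = 9 and the space map
 * index saturated at 31, sm_histogram_contrib(5, 44, 31, 9) == 5 << 4,
 * i.e. five 16TB segments are counted as eighty 1TB segments.
 */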
void
space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx)
{
    int idx = 0;

    ASSERT(dmu_tx_is_syncing(tx));
    VERIFY3U(space_map_object(sm), !=, 0);

    if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t))
        return;

    dmu_buf_will_dirty(sm->sm_dbuf, tx);

    ASSERT(space_map_histogram_verify(sm, rt));
    /*
     * Transfer the content of the range tree histogram to the space
     * map histogram. The space map histogram contains 32 buckets ranging
     * from 2^sm_shift to 2^(32+sm_shift-1). The range tree,
     * however, can represent ranges from 2^0 to 2^63. Since the space
     * map only cares about allocatable blocks (minimum of sm_shift) we
     * can safely ignore all ranges in the range tree smaller than sm_shift.
     */
    for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) {

        /*
         * Since the largest histogram bucket in the space map is
         * 2^(32+sm_shift-1), we need to normalize the values in
         * the range tree for any bucket larger than that size. For
         * example given an sm_shift of 9, ranges larger than 2^40
         * would get normalized as if they were 1TB ranges. Assume
         * the range tree had a count of 5 in the 2^44 (16TB) bucket,
         * the calculation below would normalize this to 5 * 2^4 (16).
         */
        ASSERT3U(i, >=, idx + sm->sm_shift);
        sm->sm_phys->smp_histogram[idx] +=
            rt->rt_histogram[i] << (i - idx - sm->sm_shift);

        /*
         * Increment the space map's index as long as we haven't
         * reached the maximum bucket size. Accumulate all ranges
         * larger than the max bucket size into the last bucket.
         */
        if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) {
            ASSERT3U(idx + sm->sm_shift, ==, i);
            idx++;
            ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE);
        }
    }
}

static void
space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx)
{
    dmu_buf_will_dirty(sm->sm_dbuf, tx);

    uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
        SM_DEBUG_ACTION_ENCODE(maptype) |
        SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) |
        SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));

    dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_objsize,
        sizeof (dentry), &dentry, tx);

    sm->sm_phys->smp_objsize += sizeof (dentry);
}

/*
 * Writes one or more entries given a segment.
 *
 * Note: The function may release the dbuf from the pointer initially
 * passed to it, and return a different dbuf. Also, the space map's
 * dbuf must be dirty for the changes in sm_phys to take effect.
 */
static void
space_map_write_seg(space_map_t *sm, range_seg_t *rs, maptype_t maptype,
    uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, void *tag, dmu_tx_t *tx)
{
    ASSERT3U(words, !=, 0);
    ASSERT3U(words, <=, 2);

    /* ensure the vdev_id can be represented by the space map */
    ASSERT3U(vdev_id, <=, SM_NO_VDEVID);

    /*
     * if this is a single word entry, ensure that no vdev was
     * specified.
     */
    IMPLY(words == 1, vdev_id == SM_NO_VDEVID);

    dmu_buf_t *db = *dbp;
    ASSERT3U(db->db_size, ==, sm->sm_blksz);

    uint64_t *block_base = db->db_data;
    uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t));
    uint64_t *block_cursor = block_base +
        (sm->sm_phys->smp_objsize - db->db_offset) / sizeof (uint64_t);

    ASSERT3P(block_cursor, <=, block_end);

    uint64_t size = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
    uint64_t start = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
    uint64_t run_max = (words == 2) ?
        SM2_RUN_MAX : SM_RUN_MAX;

    ASSERT3U(rs->rs_start, >=, sm->sm_start);
    ASSERT3U(rs->rs_start, <, sm->sm_start + sm->sm_size);
    ASSERT3U(rs->rs_end - rs->rs_start, <=, sm->sm_size);
    ASSERT3U(rs->rs_end, <=, sm->sm_start + sm->sm_size);

    while (size != 0) {
        ASSERT3P(block_cursor, <=, block_end);

        /*
         * If we are at the end of this block, flush it and start
         * writing again from the beginning.
         */
        if (block_cursor == block_end) {
            dmu_buf_rele(db, tag);

            uint64_t next_word_offset = sm->sm_phys->smp_objsize;
            VERIFY0(dmu_buf_hold(sm->sm_os,
                space_map_object(sm), next_word_offset,
                tag, &db, DMU_READ_PREFETCH));
            dmu_buf_will_dirty(db, tx);

            /* update caller's dbuf */
            *dbp = db;

            ASSERT3U(db->db_size, ==, sm->sm_blksz);

            block_base = db->db_data;
            block_cursor = block_base;
            block_end = block_base +
                (db->db_size / sizeof (uint64_t));
        }

        /*
         * If we are writing a two-word entry and we only have one
         * word left on this block, just pad it with an empty debug
         * entry and write the two-word entry in the next block.
         */
        uint64_t *next_entry = block_cursor + 1;
        if (next_entry == block_end && words > 1) {
            ASSERT3U(words, ==, 2);
            *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) |
                SM_DEBUG_ACTION_ENCODE(0) |
                SM_DEBUG_SYNCPASS_ENCODE(0) |
                SM_DEBUG_TXG_ENCODE(0);
            block_cursor++;
            sm->sm_phys->smp_objsize += sizeof (uint64_t);
            ASSERT3P(block_cursor, ==, block_end);
            continue;
        }

        uint64_t run_len = MIN(size, run_max);
        switch (words) {
        case 1:
            *block_cursor = SM_OFFSET_ENCODE(start) |
                SM_TYPE_ENCODE(maptype) |
                SM_RUN_ENCODE(run_len);
            block_cursor++;
            break;
        case 2:
            /* write the first word of the entry */
            *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) |
                SM2_RUN_ENCODE(run_len) |
                SM2_VDEV_ENCODE(vdev_id);
            block_cursor++;

            /* move on to the second word of the entry */
            ASSERT3P(block_cursor, <, block_end);
            *block_cursor = SM2_TYPE_ENCODE(maptype) |
                SM2_OFFSET_ENCODE(start);
            block_cursor++;
            break;
        default:
            panic("%d-word space map entries are not supported",
                words);
            break;
        }
        sm->sm_phys->smp_objsize += words * sizeof (uint64_t);

        start += run_len;
        size -= run_len;
    }
    ASSERT0(size);
}
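/*
 * Illustrative sketch (editorial, not part of this change): how a reader
 * pairs the words that space_map_write_seg() above emits.  Debug entries
 * must be filtered out by the caller first, as space_map_iterate() does.
 * Offsets and runs come out raw, i.e. still scaled by sm_shift.  The
 * function name is hypothetical; the macros are the ones used in this file.
 */
static int
sm_decode_entry(const uint64_t *words, space_map_entry_t *sme)
{
    if (sm_entry_is_single_word(words[0])) {
        sme->sme_type = SM_TYPE_DECODE(words[0]);
        sme->sme_vdev = SM_NO_VDEVID;
        sme->sme_offset = SM_OFFSET_DECODE(words[0]);
        sme->sme_run = SM_RUN_DECODE(words[0]);
        return (1);		/* consumed one word */
    }
    ASSERT(sm_entry_is_double_word(words[0]));
    sme->sme_run = SM2_RUN_DECODE(words[0]);
    sme->sme_vdev = SM2_VDEV_DECODE(words[0]);
    sme->sme_type = SM2_TYPE_DECODE(words[1]);
    sme->sme_offset = SM2_OFFSET_DECODE(words[1]);
    return (2);			/* consumed two words */
}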
/*
 * Note: The space map's dbuf must be dirty for the changes in sm_phys to
 * take effect.
 */
static void
space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
    spa_t *spa = tx->tx_pool->dp_spa;
    dmu_buf_t *db;

    space_map_write_intro_debug(sm, maptype, tx);

#ifdef DEBUG
    /*
     * We do this right after we write the intro debug entry
     * because the estimate does not take it into account.
     */
    uint64_t initial_objsize = sm->sm_phys->smp_objsize;
    uint64_t estimated_growth =
        space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID);
    uint64_t estimated_final_objsize = initial_objsize + estimated_growth;
#endif

    /*
     * Find the offset right after the last word in the space map
     * and use that to get a hold of the last block, so we can
     * start appending to it.
     */
    uint64_t next_word_offset = sm->sm_phys->smp_objsize;
    VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm),
        next_word_offset, FTAG, &db, DMU_READ_PREFETCH));
    ASSERT3U(db->db_size, ==, sm->sm_blksz);

    dmu_buf_will_dirty(db, tx);

    avl_tree_t *t = &rt->rt_root;
    for (range_seg_t *rs = avl_first(t); rs != NULL; rs = AVL_NEXT(t, rs)) {
        uint64_t offset = (rs->rs_start - sm->sm_start) >> sm->sm_shift;
        uint64_t length = (rs->rs_end - rs->rs_start) >> sm->sm_shift;
        uint8_t words = 1;

        /*
         * We only write two-word entries when both of the following
         * are true:
         *
         * [1] The feature is enabled.
         * [2] The offset or run is too big for a single-word entry,
-        *     or the vdev_id is set (meaning not equal to
-        *     SM_NO_VDEVID).
+        *	or the vdev_id is set (meaning not equal to
+        *	SM_NO_VDEVID).
         *
         * Note that for purposes of testing we've added the case that
         * we write two-word entries occasionally when the feature is
         * enabled and zfs_force_some_double_word_sm_entries has been
         * set.
         */
        if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) &&
            (offset >= (1ULL << SM_OFFSET_BITS) ||
            length > SM_RUN_MAX ||
            vdev_id != SM_NO_VDEVID ||
            (zfs_force_some_double_word_sm_entries &&
            spa_get_random(100) == 0)))
            words = 2;

        space_map_write_seg(sm, rs, maptype, vdev_id, words, &db,
            FTAG, tx);
    }

    dmu_buf_rele(db, FTAG);

#ifdef DEBUG
    /*
     * We expect our estimation to be based on the worst case
     * scenario [see comment in space_map_estimate_optimal_size()].
     * Therefore we expect the actual objsize to be equal or less
     * than whatever we estimated it to be.
     */
    ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_objsize);
#endif
}

/*
 * Note: This function manipulates the state of the given space map but
 * does not hold any locks implicitly. Thus the caller is responsible
 * for synchronizing writes to the space map.
 */
void
space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype,
    uint64_t vdev_id, dmu_tx_t *tx)
{
    objset_t *os = sm->sm_os;

    ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
    VERIFY3U(space_map_object(sm), !=, 0);

    dmu_buf_will_dirty(sm->sm_dbuf, tx);

    /*
     * This field is no longer necessary since the in-core space map
     * now contains the object number but is maintained for backwards
     * compatibility.
     */
    sm->sm_phys->smp_object = sm->sm_object;

    if (range_tree_is_empty(rt)) {
        VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object);
        return;
    }

    if (maptype == SM_ALLOC)
        sm->sm_phys->smp_alloc += range_tree_space(rt);
    else
        sm->sm_phys->smp_alloc -= range_tree_space(rt);

    uint64_t nodes = avl_numnodes(&rt->rt_root);
    uint64_t rt_space = range_tree_space(rt);

    space_map_write_impl(sm, rt, maptype, vdev_id, tx);

    /*
     * Ensure that the space_map's accounting wasn't changed
     * while we were in the middle of writing it out.
     */
    VERIFY3U(nodes, ==, avl_numnodes(&rt->rt_root));
    VERIFY3U(range_tree_space(rt), ==, rt_space);
}
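/*
 * Illustrative sketch (editorial, not part of this change): the usual
 * syncing-context pattern around space_map_write(), condensed from callers
 * like metaslab_sync().  The function and variable names are hypothetical.
 */
static void
sm_sync_frees_example(space_map_t *sm, range_tree_t *frees, dmu_tx_t *tx)
{
    ASSERT(dmu_tx_is_syncing(tx));
    /* persist this txg's freed ranges, then drain the in-core tree */
    space_map_write(sm, frees, SM_FREE, SM_NO_VDEVID, tx);
    range_tree_vacate(frees, NULL, NULL);
}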
static int
space_map_open_impl(space_map_t *sm)
{
    int error;
    u_longlong_t blocks;

    error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf);
    if (error)
        return (error);

    dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks);
    sm->sm_phys = sm->sm_dbuf->db_data;
    return (0);
}

int
space_map_open(space_map_t **smp, objset_t *os, uint64_t object,
    uint64_t start, uint64_t size, uint8_t shift)
{
    space_map_t *sm;
    int error;

    ASSERT(*smp == NULL);
    ASSERT(os != NULL);
    ASSERT(object != 0);

    sm = kmem_zalloc(sizeof (space_map_t), KM_SLEEP);

    sm->sm_start = start;
    sm->sm_size = size;
    sm->sm_shift = shift;
    sm->sm_os = os;
    sm->sm_object = object;

    error = space_map_open_impl(sm);
    if (error != 0) {
        space_map_close(sm);
        return (error);
    }
    *smp = sm;

    return (0);
}

void
space_map_close(space_map_t *sm)
{
    if (sm == NULL)
        return;

    if (sm->sm_dbuf != NULL)
        dmu_buf_rele(sm->sm_dbuf, sm);
    sm->sm_dbuf = NULL;
    sm->sm_phys = NULL;

    kmem_free(sm, sizeof (*sm));
}

void
space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx)
{
    objset_t *os = sm->sm_os;
    spa_t *spa = dmu_objset_spa(os);
    dmu_object_info_t doi;

    ASSERT(dsl_pool_sync_context(dmu_objset_pool(os)));
    ASSERT(dmu_tx_is_syncing(tx));
    VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa));

    dmu_object_info_from_db(sm->sm_dbuf, &doi);

    /*
     * If the space map has the wrong bonus size (because
     * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or
     * the wrong block size (because space_map_blksz has changed),
     * free and re-allocate its object with the updated sizes.
     *
     * Otherwise, just truncate the current object.
     */
    if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) &&
        doi.doi_bonus_size != sizeof (space_map_phys_t)) ||
-       doi.doi_data_block_size != blocksize) {
+       doi.doi_data_block_size != blocksize ||
+       doi.doi_metadata_block_size != 1 << space_map_ibs) {
        zfs_dbgmsg("txg %llu, spa %s, sm %p, reallocating "
            "object[%llu]: old bonus %u, old blocksz %u",
            dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object,
            doi.doi_bonus_size, doi.doi_data_block_size);

        space_map_free(sm, tx);
        dmu_buf_rele(sm->sm_dbuf, sm);

        sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx);
        VERIFY0(space_map_open_impl(sm));
    } else {
        VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx));

        /*
         * If the spacemap is reallocated, its histogram
         * will be reset. Do the same in the common case so that
         * bugs related to the uncommon case do not go unnoticed.
         */
        bzero(sm->sm_phys->smp_histogram,
            sizeof (sm->sm_phys->smp_histogram));
    }

    dmu_buf_will_dirty(sm->sm_dbuf, tx);
    sm->sm_phys->smp_objsize = 0;
    sm->sm_phys->smp_alloc = 0;
}

/*
 * Update the in-core space_map allocation and length values.
 */
void
space_map_update(space_map_t *sm)
{
    if (sm == NULL)
        return;

    sm->sm_alloc = sm->sm_phys->smp_alloc;
    sm->sm_length = sm->sm_phys->smp_objsize;
}

uint64_t
space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx)
{
    spa_t *spa = dmu_objset_spa(os);
    uint64_t object;
    int bonuslen;

    if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
        spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
        bonuslen = sizeof (space_map_phys_t);
        ASSERT3U(bonuslen, <=, dmu_bonus_max());
    } else {
        bonuslen = SPACE_MAP_SIZE_V0;
    }

-   object = dmu_object_alloc(os, DMU_OT_SPACE_MAP, blocksize,
-       DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);
+   object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize,
+       space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx);

    return (object);
}

void
space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx)
{
    spa_t *spa = dmu_objset_spa(os);
    if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) {
        dmu_object_info_t doi;

        VERIFY0(dmu_object_info(os, smobj, &doi));
        if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) {
            spa_feature_decr(spa,
                SPA_FEATURE_SPACEMAP_HISTOGRAM, tx);
        }
    }

    VERIFY0(dmu_object_free(os, smobj, tx));
}

void
space_map_free(space_map_t *sm, dmu_tx_t *tx)
{
    if (sm == NULL)
        return;

    space_map_free_obj(sm->sm_os, space_map_object(sm), tx);
    sm->sm_object = 0;
}

/*
 * Given a range tree, it makes a worst-case estimate of how much
 * space the tree's segments would take if they were written to
 * the given space map.
 */
uint64_t
space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt,
    uint64_t vdev_id)
{
    spa_t *spa = dmu_objset_spa(sm->sm_os);
    uint64_t shift = sm->sm_shift;
    uint64_t *histogram = rt->rt_histogram;
    uint64_t entries_for_seg = 0;

    /*
     * In order to get a quick estimate of the optimal size that this
     * range tree would have on-disk as a space map, we iterate through
     * its histogram buckets instead of iterating through its nodes.
     *
     * Note that this is a highest-bound/worst-case estimate for the
     * following reasons:
     *
     * 1] We assume that we always add a debug padding for each block
     *    we write and we also assume that we start at the last word
     *    of a block attempting to write a two-word entry.
     * 2] Rounding up errors due to the way segments are distributed
     *    in the buckets of the range tree's histogram.
     * 3] The activation of zfs_force_some_double_word_sm_entries
     *    (tunable) when testing.
     *
     * = Math and Rounding Errors =
     *
     * rt_histogram[i] bucket of a range tree represents the number
     * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given
     * that, we want to divide the buckets into groups: Buckets that
     * can be represented using a single-word entry, ones that can
     * be represented with a double-word entry, and ones that can
     * only be represented with multiple two-word entries.
     *
     * [Note that if the new encoding feature is not enabled there
     * are only two groups: single-word entry buckets and multiple
     * single-word entry buckets. The information below assumes
     * two-word entries enabled, but it can easily be applied when
     * the feature is not enabled]
     *
     * To find the highest bucket that can be represented with a
     * single-word entry we look at the maximum run that such entry
     * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that
     * the run of a space map entry is shifted by sm_shift, thus we
     * add it to the exponent]. This way, excluding the value of the
     * maximum run that can be represented by a single-word entry,
     * all runs that are smaller exist in buckets 0 to
     * SM_RUN_BITS + shift - 1.
     *
     * To find the highest bucket that can be represented with a
     * double-word entry, we follow the same approach. Finally, any
     * bucket higher than that are represented with multiple two-word
     * entries. To be more specific, if the highest bucket whose
     * segments can be represented with a single two-word entry is X,
     * then bucket X+1 will need 2 two-word entries for each of its
     * segments, X+2 will need 4, X+3 will need 8, ...etc.
     *
     * With all of the above we make our estimation based on bucket
     * groups. There is a rounding error though. As we mentioned in
     * the example with the one-word entry, the maximum run that can
     * be represented in a one-word entry 2^(SM_RUN_BITS + shift) is
     * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of
     * that length fall into the next bucket (and bucket group) where
     * we start counting two-word entries and this is one more reason
     * why the estimated size may end up being bigger than the actual
     * size written.
     */
    uint64_t size = 0;
    uint64_t idx = 0;

    if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) ||
        (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) {

        /*
         * If we are trying to force some double word entries just
         * assume the worst-case of every single word entry being
         * written as a double word entry.
         */
        uint64_t entry_size =
            (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) &&
            zfs_force_some_double_word_sm_entries) ?
            (2 * sizeof (uint64_t)) : sizeof (uint64_t);

        uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1;
        for (; idx <= single_entry_max_bucket; idx++)
            size += histogram[idx] * entry_size;

        if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) {
            for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
                ASSERT3U(idx, >=, single_entry_max_bucket);
                entries_for_seg =
                    1ULL << (idx - single_entry_max_bucket);
                size += histogram[idx] *
                    entries_for_seg * entry_size;
            }
            return (size);
        }
    }

    ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2));

    uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1;
    for (; idx <= double_entry_max_bucket; idx++)
        size += histogram[idx] * 2 * sizeof (uint64_t);

    for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) {
        ASSERT3U(idx, >=, double_entry_max_bucket);
        entries_for_seg = 1ULL << (idx - double_entry_max_bucket);
        size += histogram[idx] *
            entries_for_seg * 2 * sizeof (uint64_t);
    }

    /*
     * Assume the worst case where we start with the padding at the end
     * of the current block and we add an extra padding entry at the end
     * of all subsequent blocks.
     */
    size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t);

    return (size);
}

uint64_t
space_map_object(space_map_t *sm)
{
    return (sm != NULL ? sm->sm_object : 0);
}

/*
 * Returns the already synced, on-disk allocated space.
 */
uint64_t
space_map_allocated(space_map_t *sm)
{
    return (sm != NULL ? sm->sm_alloc : 0);
}

/*
 * Returns the already synced, on-disk length.
 */
uint64_t
space_map_length(space_map_t *sm)
{
    return (sm != NULL ? sm->sm_length : 0);
}

/*
 * Returns the allocated space that is currently syncing.
 */
int64_t
space_map_alloc_delta(space_map_t *sm)
{
    if (sm == NULL)
        return (0);
    ASSERT(sm->sm_dbuf != NULL);
    return (sm->sm_phys->smp_alloc - space_map_allocated(sm));
}
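/*
 * Illustrative sketch (editorial, not part of this change): the bucket
 * group boundary that space_map_estimate_optimal_size() above derives.
 * Buckets up to SM2_RUN_BITS + shift - 1 fit in one two-word entry; each
 * bucket past that doubles the number of two-word entries per segment.
 * The helper name is hypothetical.
 */
static inline uint64_t
sm_two_word_entries_for_bucket(int rt_bucket, int shift)
{
    int double_entry_max_bucket = SM2_RUN_BITS + shift - 1;

    if (rt_bucket <= double_entry_max_bucket)
        return (1);
    return (1ULL << (rt_bucket - double_entry_max_bucket));
}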
Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
===================================================================
--- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 337168)
+++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h	(revision 337169)
@@ -1,990 +1,993 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2011, 2017 by Delphix. All rights reserved.
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 * Copyright 2013 DEY Storage Systems, Inc.
 * Copyright 2014 HybridCluster. All rights reserved.
 * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
 * Copyright 2013 Saso Kiselkov. All rights reserved.
 * Copyright (c) 2014 Integros [integros.com]
 */

/* Portions Copyright 2010 Robert Milkowski */

#ifndef _SYS_DMU_H
#define _SYS_DMU_H

/*
 * This file describes the interface that the DMU provides for its
 * consumers.
 *
 * The DMU also interacts with the SPA.  That interface is described in
 * dmu_spa.h.
 */

#include <sys/zfs_context.h>
#include <sys/cred.h>
#include <sys/fs/zfs.h>
#include <sys/zio_compress.h>
#include <sys/zio_priority.h>

#ifdef __cplusplus
extern "C" {
#endif

struct uio;
struct xuio;
struct page;
struct vnode;
struct spa;
struct zilog;
struct zio;
struct blkptr;
struct zap_cursor;
struct dsl_dataset;
struct dsl_pool;
struct dnode;
struct drr_begin;
struct drr_end;
struct zbookmark_phys;
struct spa;
struct nvlist;
struct arc_buf;
struct zio_prop;
struct sa_handle;
struct file;

typedef struct objset objset_t;
typedef struct dmu_tx dmu_tx_t;
typedef struct dsl_dir dsl_dir_t;
typedef struct dnode dnode_t;

typedef enum dmu_object_byteswap {
    DMU_BSWAP_UINT8,
    DMU_BSWAP_UINT16,
    DMU_BSWAP_UINT32,
    DMU_BSWAP_UINT64,
    DMU_BSWAP_ZAP,
    DMU_BSWAP_DNODE,
    DMU_BSWAP_OBJSET,
    DMU_BSWAP_ZNODE,
    DMU_BSWAP_OLDACL,
    DMU_BSWAP_ACL,
    /*
     * Allocating a new byteswap type number makes the on-disk format
     * incompatible with any other format that uses the same number.
     *
     * Data can usually be structured to work with one of the
     * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types.
     */
    DMU_BSWAP_NUMFUNCS
} dmu_object_byteswap_t;

#define	DMU_OT_NEWTYPE 0x80
#define	DMU_OT_METADATA 0x40
#define	DMU_OT_BYTESWAP_MASK 0x3f

/*
 * Defines a uint8_t object type. Object types specify if the data
 * in the object is metadata (boolean) and how to byteswap the data
 * (dmu_object_byteswap_t). All of the types created by this method
 * are cached in the dbuf metadata cache.
 */
#define	DMU_OT(byteswap, metadata) \
	(DMU_OT_NEWTYPE | \
	((metadata) ? DMU_OT_METADATA : 0) | \
	((byteswap) & DMU_OT_BYTESWAP_MASK))
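/*
 * Illustrative sketch (editorial, not part of this header): DMU_OT()
 * expanded for one of the DMU_OTN_* names defined below.
 *
 *	DMU_OT(DMU_BSWAP_ZAP, B_TRUE)
 *	    == DMU_OT_NEWTYPE | DMU_OT_METADATA | DMU_BSWAP_ZAP
 *	    == 0x80 | 0x40 | 0x04
 *	    == 0xc4 (DMU_OTN_ZAP_METADATA)
 *
 * The compile-time check below is a hypothetical addition that encodes
 * this expansion.
 */
typedef char dmu_ot_example_check[
    DMU_OT(DMU_BSWAP_ZAP, B_TRUE) == 0xc4 ? 1 : -1];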
#define	DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \
	((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \
	(ot) < DMU_OT_NUMTYPES)

#define	DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \
	((ot) & DMU_OT_METADATA) : \
	dmu_ot[(ot)].ot_metadata)

#define	DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \
	B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache)

/*
 * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't
 * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill
 * is repurposed for embedded BPs.
 */
#define	DMU_OT_HAS_FILL(ot) \
	((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET)

#define	DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \
	((ot) & DMU_OT_BYTESWAP_MASK) : \
	dmu_ot[(ot)].ot_byteswap)

typedef enum dmu_object_type {
    DMU_OT_NONE,
    /* general: */
    DMU_OT_OBJECT_DIRECTORY,	/* ZAP */
    DMU_OT_OBJECT_ARRAY,		/* UINT64 */
    DMU_OT_PACKED_NVLIST,		/* UINT8 (XDR by nvlist_pack/unpack) */
    DMU_OT_PACKED_NVLIST_SIZE,	/* UINT64 */
    DMU_OT_BPOBJ,			/* UINT64 */
    DMU_OT_BPOBJ_HDR,		/* UINT64 */
    /* spa: */
    DMU_OT_SPACE_MAP_HEADER,	/* UINT64 */
    DMU_OT_SPACE_MAP,		/* UINT64 */
    /* zil: */
    DMU_OT_INTENT_LOG,		/* UINT64 */
    /* dmu: */
    DMU_OT_DNODE,			/* DNODE */
    DMU_OT_OBJSET,			/* OBJSET */
    /* dsl: */
    DMU_OT_DSL_DIR,			/* UINT64 */
    DMU_OT_DSL_DIR_CHILD_MAP,	/* ZAP */
    DMU_OT_DSL_DS_SNAP_MAP,		/* ZAP */
    DMU_OT_DSL_PROPS,		/* ZAP */
    DMU_OT_DSL_DATASET,		/* UINT64 */
    /* zpl: */
    DMU_OT_ZNODE,			/* ZNODE */
    DMU_OT_OLDACL,			/* Old ACL */
    DMU_OT_PLAIN_FILE_CONTENTS,	/* UINT8 */
    DMU_OT_DIRECTORY_CONTENTS,	/* ZAP */
    DMU_OT_MASTER_NODE,		/* ZAP */
    DMU_OT_UNLINKED_SET,		/* ZAP */
    /* zvol: */
    DMU_OT_ZVOL,			/* UINT8 */
    DMU_OT_ZVOL_PROP,		/* ZAP */
    /* other; for testing only! */
    DMU_OT_PLAIN_OTHER,		/* UINT8 */
    DMU_OT_UINT64_OTHER,		/* UINT64 */
    DMU_OT_ZAP_OTHER,		/* ZAP */
    /* new object types: */
    DMU_OT_ERROR_LOG,		/* ZAP */
    DMU_OT_SPA_HISTORY,		/* UINT8 */
    DMU_OT_SPA_HISTORY_OFFSETS,	/* spa_his_phys_t */
    DMU_OT_POOL_PROPS,		/* ZAP */
    DMU_OT_DSL_PERMS,		/* ZAP */
    DMU_OT_ACL,			/* ACL */
    DMU_OT_SYSACL,			/* SYSACL */
    DMU_OT_FUID,			/* FUID table (Packed NVLIST UINT8) */
    DMU_OT_FUID_SIZE,		/* FUID table size UINT64 */
    DMU_OT_NEXT_CLONES,		/* ZAP */
    DMU_OT_SCAN_QUEUE,		/* ZAP */
    DMU_OT_USERGROUP_USED,		/* ZAP */
    DMU_OT_USERGROUP_QUOTA,		/* ZAP */
    DMU_OT_USERREFS,		/* ZAP */
    DMU_OT_DDT_ZAP,			/* ZAP */
    DMU_OT_DDT_STATS,		/* ZAP */
    DMU_OT_SA,			/* System attr */
    DMU_OT_SA_MASTER_NODE,		/* ZAP */
    DMU_OT_SA_ATTR_REGISTRATION,	/* ZAP */
    DMU_OT_SA_ATTR_LAYOUTS,		/* ZAP */
    DMU_OT_SCAN_XLATE,		/* ZAP */
    DMU_OT_DEDUP,			/* fake dedup BP from ddt_bp_create() */
    DMU_OT_DEADLIST,		/* ZAP */
    DMU_OT_DEADLIST_HDR,		/* UINT64 */
    DMU_OT_DSL_CLONES,		/* ZAP */
    DMU_OT_BPOBJ_SUBOBJ,		/* UINT64 */
    /*
     * Do not allocate new object types here. Doing so makes the on-disk
     * format incompatible with any other format that uses the same object
     * type number.
     *
     * When creating an object which does not have one of the above types
     * use the DMU_OTN_* type with the correct byteswap and metadata
     * values.
     *
     * The DMU_OTN_* types do not have entries in the dmu_ot table,
     * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead
     * of indexing into dmu_ot directly (this works for both DMU_OT_* types
     * and DMU_OTN_* types).
     */
    DMU_OT_NUMTYPES,

    /*
     * Names for valid types declared with DMU_OT().
     */
    DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE),
    DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE),
    DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE),
    DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE),
    DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE),
    DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE),
    DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE),
    DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE),
    DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE),
    DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE),
} dmu_object_type_t;

/*
 * These flags are intended to be used to specify the "txg_how"
 * parameter when calling the dmu_tx_assign() function. See the comment
 * above dmu_tx_assign() for more details on the meaning of these flags.
 */
#define	TXG_NOWAIT	(0ULL)
#define	TXG_WAIT	(1ULL<<0)
#define	TXG_NOTHROTTLE	(1ULL<<1)

void byteswap_uint64_array(void *buf, size_t size);
void byteswap_uint32_array(void *buf, size_t size);
void byteswap_uint16_array(void *buf, size_t size);
void byteswap_uint8_array(void *buf, size_t size);
void zap_byteswap(void *buf, size_t size);
void zfs_oldacl_byteswap(void *buf, size_t size);
void zfs_acl_byteswap(void *buf, size_t size);
void zfs_znode_byteswap(void *buf, size_t size);

#define	DS_FIND_SNAPSHOTS	(1<<0)
#define	DS_FIND_CHILDREN	(1<<1)
#define	DS_FIND_SERIALIZE	(1<<2)

/*
 * The maximum number of bytes that can be accessed as part of one
 * operation, including metadata.
 */
#define	DMU_MAX_ACCESS (32 * 1024 * 1024)	/* 32MB */
#define	DMU_MAX_DELETEBLKCNT (20480)	/* ~5MB of indirect blocks */

#define	DMU_USERUSED_OBJECT	(-1ULL)
#define	DMU_GROUPUSED_OBJECT	(-2ULL)

/*
 * artificial blkids for bonus buffer and spill blocks
 */
#define	DMU_BONUS_BLKID		(-1ULL)
#define	DMU_SPILL_BLKID		(-2ULL)
/*
 * Public routines to create, destroy, open, and close objsets.
 */
int dmu_objset_hold(const char *name, void *tag, objset_t **osp);
int dmu_objset_own(const char *name, dmu_objset_type_t type,
    boolean_t readonly, void *tag, objset_t **osp);
void dmu_objset_rele(objset_t *os, void *tag);
void dmu_objset_disown(objset_t *os, void *tag);
int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp);

void dmu_objset_evict_dbufs(objset_t *os);
int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags,
    void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg);
int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname,
    struct nvlist *snaps);
int dmu_objset_clone(const char *name, const char *origin);
int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer,
    struct nvlist *errlist);
int dmu_objset_snapshot_one(const char *fsname, const char *snapname);
int dmu_objset_snapshot_tmp(const char *, const char *, int);
int dmu_objset_find(char *name, int func(const char *, void *), void *arg,
    int flags);
void dmu_objset_byteswap(void *buf, size_t size);
int dsl_dataset_rename_snapshot(const char *fsname,
    const char *oldsnapname, const char *newsnapname, boolean_t recursive);
int dmu_objset_remap_indirects(const char *fsname);

typedef struct dmu_buf {
    uint64_t db_object;		/* object that this buffer is part of */
    uint64_t db_offset;		/* byte offset in this object */
    uint64_t db_size;		/* size of buffer in bytes */
    void *db_data;			/* data in buffer */
} dmu_buf_t;

/*
 * The names of zap entries in the DIRECTORY_OBJECT of the MOS.
 */
#define	DMU_POOL_DIRECTORY_OBJECT	1
#define	DMU_POOL_CONFIG			"config"
#define	DMU_POOL_FEATURES_FOR_WRITE	"features_for_write"
#define	DMU_POOL_FEATURES_FOR_READ	"features_for_read"
#define	DMU_POOL_FEATURE_DESCRIPTIONS	"feature_descriptions"
#define	DMU_POOL_FEATURE_ENABLED_TXG	"feature_enabled_txg"
#define	DMU_POOL_ROOT_DATASET		"root_dataset"
#define	DMU_POOL_SYNC_BPOBJ		"sync_bplist"
#define	DMU_POOL_ERRLOG_SCRUB		"errlog_scrub"
#define	DMU_POOL_ERRLOG_LAST		"errlog_last"
#define	DMU_POOL_SPARES			"spares"
#define	DMU_POOL_DEFLATE		"deflate"
#define	DMU_POOL_HISTORY		"history"
#define	DMU_POOL_PROPS			"pool_props"
#define	DMU_POOL_L2CACHE		"l2cache"
#define	DMU_POOL_TMP_USERREFS		"tmp_userrefs"
#define	DMU_POOL_DDT			"DDT-%s-%s-%s"
#define	DMU_POOL_DDT_STATS		"DDT-statistics"
#define	DMU_POOL_CREATION_VERSION	"creation_version"
#define	DMU_POOL_SCAN			"scan"
#define	DMU_POOL_FREE_BPOBJ		"free_bpobj"
#define	DMU_POOL_BPTREE_OBJ		"bptree_obj"
#define	DMU_POOL_EMPTY_BPOBJ		"empty_bpobj"
#define	DMU_POOL_CHECKSUM_SALT		"org.illumos:checksum_salt"
#define	DMU_POOL_VDEV_ZAP_MAP		"com.delphix:vdev_zap_map"
#define	DMU_POOL_REMOVING		"com.delphix:removing"
#define	DMU_POOL_OBSOLETE_BPOBJ		"com.delphix:obsolete_bpobj"
#define	DMU_POOL_CONDENSING_INDIRECT	"com.delphix:condensing_indirect"
#define	DMU_POOL_ZPOOL_CHECKPOINT	"com.delphix:zpool_checkpoint"

/*
 * Allocate an object from this objset.  The range of object numbers
 * available is (0, DN_MAX_OBJECT).  Object 0 is the meta-dnode.
 *
 * The transaction must be assigned to a txg.  The newly allocated
 * object will be "held" in the transaction (ie. you can modify the
 * newly allocated object in this transaction).
 *
 * dmu_object_alloc() chooses an object and returns it in *objectp.
 *
 * dmu_object_claim() allocates a specific object number.  If that
 * number is already allocated, it fails and returns EEXIST.
 *
 * Return 0 on success, or ENOSPC or EEXIST as specified above.
 */
uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot,
+    int blocksize, int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
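/*
 * Illustrative sketch (editorial, not part of this header): allocating an
 * object with 16K (1 << 14) indirect blocks via the new entry point, the
 * way space_map_alloc() in space_map.c now does.  The function name and
 * the choice of object type are hypothetical; tx must already be assigned
 * to a txg.
 */
static inline uint64_t
alloc_object_with_small_ibs(objset_t *os, dmu_tx_t *tx)
{
	return (dmu_object_alloc_ibs(os, DMU_OT_UINT64_OTHER, 4096,
	    14, DMU_OT_NONE, 0, tx));
}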
/*
 * Free an object from this objset.
 *
 * The object's data will be freed as well (ie. you don't need to call
 * dmu_free(object, 0, -1, tx)).
 *
 * The object need not be held in the transaction.
 *
 * If there are any holds on this object's buffers (via dmu_buf_hold()),
 * or tx holds on the object (via dmu_tx_hold_object()), you can not
 * free it; it fails and returns EBUSY.
 *
 * If the object is not allocated, it fails and returns ENOENT.
 *
 * Return 0 on success, or EBUSY or ENOENT as specified above.
 */
int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);

/*
 * Find the next allocated or free object.
 *
 * The objectp parameter is in-out.  It will be updated to be the next
 * object which is allocated.  Ignore objects which have not been
 * modified since txg.
 *
 * XXX Can only be called on an objset with no dirty data.
 *
 * Returns 0 on success, or ENOENT if there are no more objects.
 */
int dmu_object_next(objset_t *os, uint64_t *objectp,
    boolean_t hole, uint64_t txg);

/*
 * Set the data blocksize for an object.
 *
 * The object cannot have any blocks allocated beyond the first.  If
 * the first block is allocated already, the new size must be greater
 * than the current block size.  If these conditions are not met,
 * ENOTSUP will be returned.
 *
 * Returns 0 on success, or EBUSY if there are any holds on the object
 * contents, or ENOTSUP as described above.
 */
int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
    int ibs, dmu_tx_t *tx);

/*
 * Set the checksum property on a dnode.  The new checksum algorithm will
 * apply to all newly written blocks; existing blocks will not be affected.
 */
void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum,
    dmu_tx_t *tx);

/*
 * Set the compress property on a dnode.  The new compression algorithm will
 * apply to all newly written blocks; existing blocks will not be affected.
 */
void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
    dmu_tx_t *tx);

int dmu_object_remap_indirects(objset_t *os, uint64_t object, uint64_t txg);

void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset,
    void *data, uint8_t etype, uint8_t comp, int uncompressed_size,
    int compressed_size, int byteorder, dmu_tx_t *tx);

/*
 * Decide how to write a block: checksum, compression, number of copies, etc.
 */
#define	WP_NOFILL	0x1
#define	WP_DMU_SYNC	0x2
#define	WP_SPILL	0x4

void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp,
    struct zio_prop *zp);
/*
 * The bonus data is accessed more or less like a regular buffer.
 * You must dmu_bonus_hold() to get the buffer, which will give you a
 * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus
 * data.  As with any normal buffer, you must call dmu_buf_will_dirty()
 * before modifying it, and the
 * object must be held in an assigned transaction before calling
 * dmu_buf_will_dirty.  You may use dmu_buf_set_user() on the bonus
 * buffer as well.  You must release your hold with dmu_buf_rele().
 *
 * Returns ENOENT, EIO, or 0.
 */
int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *);
int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *);
dmu_object_type_t dmu_get_bonustype(dmu_buf_t *);
int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *);

/*
 * Special spill buffer support used by "SA" framework
 */

int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags,
    void *tag, dmu_buf_t **dbp);
int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp);

/*
 * Obtain the DMU buffer from the specified object which contains the
 * specified offset.  dmu_buf_hold() puts a "hold" on the buffer, so
 * that it will remain in memory.  You must release the hold with
 * dmu_buf_rele().  You mustn't access the dmu_buf_t after releasing your
 * hold.  You must have a hold on any dmu_buf_t* you pass to the DMU.
 *
 * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill
 * on the returned buffer before reading or writing the buffer's
 * db_data.  The comments for those routines describe what particular
 * operations are valid after calling them.
 *
 * The object number must be a valid, allocated object number.
 */
int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset,
    void *tag, dmu_buf_t **, int flags);
int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset,
    void *tag, dmu_buf_t **dbp, int flags);
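/*
 * Illustrative sketch (editorial, not part of this header): the
 * hold/access/release discipline the comment above describes, reading the
 * first word of an object's first block.  The function name is
 * hypothetical; DMU_READ_PREFETCH is the flag used elsewhere in this
 * change.
 */
static inline int
peek_first_word(objset_t *os, uint64_t object, uint64_t *valp)
{
	dmu_buf_t *db;
	int err;

	err = dmu_buf_hold(os, object, 0, FTAG, &db, DMU_READ_PREFETCH);
	if (err != 0)
		return (err);
	*valp = *(uint64_t *)db->db_data;	/* valid only while held */
	dmu_buf_rele(db, FTAG);
	return (0);
}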
/*
 * Add a reference to a dmu buffer that has already been held via
 * dmu_buf_hold() in the current context.
 */
void dmu_buf_add_ref(dmu_buf_t *db, void* tag);

/*
 * Attempt to add a reference to a dmu buffer that is in an unknown state,
 * using a pointer that may have been invalidated by eviction processing.
 * The request will succeed if the passed in dbuf still represents the
 * same os/object/blkid, is ineligible for eviction, and has at least
 * one hold by a user other than the syncer.
 */
boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object,
    uint64_t blkid, void *tag);

void dmu_buf_rele(dmu_buf_t *db, void *tag);
uint64_t dmu_buf_refcount(dmu_buf_t *db);

/*
 * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a
 * range of an object.  A pointer to an array of dmu_buf_t*'s is
 * returned (in *dbpp).
 *
 * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and
 * frees the array.  The hold on the array of buffers MUST be released
 * with dmu_buf_rele_array.  You can NOT release the hold on each buffer
 * individually with dmu_buf_rele.
 */
int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset,
    uint64_t length, boolean_t read, void *tag,
    int *numbufsp, dmu_buf_t ***dbpp);
void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag);

typedef void dmu_buf_evict_func_t(void *user_ptr);

/*
 * A DMU buffer user object may be associated with a dbuf for the
 * duration of its lifetime.  This allows the user of a dbuf (client)
 * to attach private data to a dbuf (e.g. in-core only data such as a
 * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified
 * when that dbuf has been evicted.  Clients typically respond to the
 * eviction notification by freeing their private data, thus ensuring
 * the same lifetime for both dbuf and private data.
 *
 * The mapping from a dmu_buf_user_t to any client private data is the
 * client's responsibility.  All current consumers of the API with private
 * data embed a dmu_buf_user_t as the first member of the structure for
 * their private data.  This allows conversions between the two types
 * with a simple cast.  Since the DMU buf user API never needs access
 * to the private data, other strategies can be employed if necessary
 * or convenient for the client (e.g. using container_of() to do the
 * conversion for private data that cannot have the dmu_buf_user_t as
 * its first member).
 *
 * Eviction callbacks are executed without the dbuf mutex held or any
 * other type of mechanism to guarantee that the dbuf is still available.
 * For this reason, users must assume the dbuf has already been freed
 * and not reference the dbuf from the callback context.
 *
 * Users requesting "immediate eviction" are notified as soon as the dbuf
 * is only referenced by dirty records (dirties == holds).  Otherwise the
 * notification occurs after eviction processing for the dbuf begins.
 */
typedef struct dmu_buf_user {
    /*
     * Asynchronous user eviction callback state.
     */
    taskq_ent_t	dbu_tqent;

    /*
     * This instance's eviction function pointers.
     *
     * dbu_evict_func_sync is called synchronously and then
     * dbu_evict_func_async is executed asynchronously on a taskq.
     */
    dmu_buf_evict_func_t *dbu_evict_func_sync;
    dmu_buf_evict_func_t *dbu_evict_func_async;
#ifdef ZFS_DEBUG
    /*
     * Pointer to user's dbuf pointer.
	 * NULL for clients that do not associate a dbuf with their user data.
	 *
	 * The dbuf pointer is cleared upon eviction so as to catch
	 * use-after-evict bugs in clients.
	 */
	dmu_buf_t **dbu_clear_on_evict_dbufp;
#endif
} dmu_buf_user_t;

/*
 * Initialize the given dmu_buf_user_t instance with the eviction function
 * evict_func, to be called when the user is evicted.
 *
 * NOTE: This function should only be called once on a given dmu_buf_user_t.
 * To allow enforcement of this, dbu must already be zeroed on entry.
 */
/*ARGSUSED*/
inline void
dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync,
    dmu_buf_evict_func_t *evict_func_async, dmu_buf_t **clear_on_evict_dbufp)
{
	ASSERT(dbu->dbu_evict_func_sync == NULL);
	ASSERT(dbu->dbu_evict_func_async == NULL);

	/* must have at least one evict func */
	IMPLY(evict_func_sync == NULL, evict_func_async != NULL);
	dbu->dbu_evict_func_sync = evict_func_sync;
	dbu->dbu_evict_func_async = evict_func_async;
#ifdef ZFS_DEBUG
	dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp;
#endif
}

/*
 * Attach user data to a dbuf and mark it for normal (when the dbuf's
 * data is cleared or its reference count goes to zero) eviction processing.
 *
 * Returns NULL on success, or the existing user if another user currently
 * owns the buffer.
 */
void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user);

/*
 * Attach user data to a dbuf and mark it for immediate (its dirty and
 * reference counts are equal) eviction processing.
 *
 * Returns NULL on success, or the existing user if another user currently
 * owns the buffer.
 */
void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user);

/*
 * Replace the current user of a dbuf.
 *
 * If given the current user of a dbuf, replaces the dbuf's user with
 * "new_user" and returns the user data pointer that was replaced.
 * Otherwise returns the current, and unmodified, dbuf user pointer.
 */
void *dmu_buf_replace_user(dmu_buf_t *db, dmu_buf_user_t *old_user,
    dmu_buf_user_t *new_user);

/*
 * Remove the specified user data for a DMU buffer.
 *
 * Returns the user that was removed on success, or the current user if
 * another user currently owns the buffer.
 */
void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user);

/*
 * Returns the user data (dmu_buf_user_t *) associated with this dbuf.
 */
void *dmu_buf_get_user(dmu_buf_t *db);

objset_t *dmu_buf_get_objset(dmu_buf_t *db);
dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db);
void dmu_buf_dnode_exit(dmu_buf_t *db);

/* Block until any in-progress dmu buf user evictions complete. */
void dmu_buf_user_evict_wait(void);

/*
 * Returns the blkptr associated with this dbuf, or NULL if not set.
 */
struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db);

/*
 * Indicate that you are going to modify the buffer's data (db_data).
 *
 * The transaction (tx) must be assigned to a txg (i.e. you've called
 * dmu_tx_assign()). The buffer's object must be held in the tx
 * (i.e. you've called dmu_tx_hold_object(tx, db->db_object)).
 */
void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);

/*
 * You must create a transaction, then hold the objects which you will
 * (or might) modify as part of this transaction. Then you must assign
 * the transaction to a transaction group. Once the transaction has
 * been assigned, you can modify buffers which belong to held objects as
 * part of this transaction. You can't modify buffers before the
 * transaction has been assigned; you can't modify buffers which don't
 * belong to objects which this transaction holds; you can't hold
 * objects once the transaction has been assigned.
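 *
 * A minimal sketch of that lifecycle (an editorial example, not part of
 * the original comment; error handling abbreviated):
 *
 *	dmu_tx_t *tx = dmu_tx_create(os);
 *	dmu_tx_hold_write(tx, object, off, len);
 *	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
 *		dmu_tx_abort(tx);
 *		return;
 *	}
 *	dmu_write(os, object, off, len, buf, tx);
 *	dmu_tx_commit(tx);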
 * You may hold an object which you are going to free (with
 * dmu_object_free()), but you don't have to.
 *
 * You can abort the transaction before it has been assigned.
 *
 * Note that you may hold buffers (with dmu_buf_hold) at any time,
 * regardless of transaction state.
 */

#define	DMU_NEW_OBJECT	(-1ULL)
#define	DMU_OBJECT_END	(-1ULL)

dmu_tx_t *dmu_tx_create(objset_t *os);
void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
    int len);
void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
    uint64_t len);
void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
    uint64_t len);
void dmu_tx_hold_remap_l1indirect(dmu_tx_t *tx, uint64_t object);
void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
    const char *name);
void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn);
void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
void dmu_tx_abort(dmu_tx_t *tx);
int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
void dmu_tx_wait(dmu_tx_t *tx);
void dmu_tx_commit(dmu_tx_t *tx);
void dmu_tx_mark_netfree(dmu_tx_t *tx);

/*
 * To register a commit callback, dmu_tx_callback_register() must be called.
 *
 * dcb_data is a pointer to caller private data that is passed on as a
 * callback parameter. The caller is responsible for properly allocating and
 * freeing it.
 *
 * When registering a callback, the transaction must be already created, but
 * it cannot be committed or aborted. It can be assigned to a txg or not.
 *
 * The callback will be called after the transaction has been safely written
 * to stable storage and will also be called if the dmu_tx is aborted.
 * If there is any error which prevents the transaction from being committed to
 * disk, the callback will be called with a value of error != 0.
 */
typedef void dmu_tx_callback_func_t(void *dcb_data, int error);

void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
    void *dcb_data);

/*
 * Free up the data blocks for a defined range of a file. If size is
 * -1, the range from offset to end-of-file is freed.
 */
int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, dmu_tx_t *tx);
int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size);
int dmu_free_long_object(objset_t *os, uint64_t object);

/*
 * Convenience functions.
 *
 * Canfail routines will return 0 on success, or an errno if there is a
 * nonrecoverable I/O error.
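 *
 * For example (an illustrative, editorial sketch; "tx" is assumed to be
 * an already-assigned transaction):
 *
 *	char buf[512];
 *	if (dmu_read(os, object, 0, sizeof (buf), buf,
 *	    DMU_READ_PREFETCH) == 0) {
 *		... use buf ...
 *	}
 *	dmu_write(os, object, 0, sizeof (buf), buf, tx);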
 */
#define	DMU_READ_PREFETCH	0 /* prefetch */
#define	DMU_READ_NO_PREFETCH	1 /* don't prefetch */
int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    void *buf, uint32_t flags);
int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf,
    uint32_t flags);
void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx);
void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size,
    const void *buf, dmu_tx_t *tx);
void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, dmu_tx_t *tx);
int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio,
    uint64_t size);
int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size);
int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio,
    uint64_t size, dmu_tx_t *tx);
int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size,
    dmu_tx_t *tx);
#ifdef _KERNEL
#ifdef illumos
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, struct page *pp, dmu_tx_t *tx);
#else
int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, struct vm_page **ppa, dmu_tx_t *tx);
int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count,
    int *rbehind, int *rahead, int last_size);
#endif
#endif
struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size);
void dmu_return_arcbuf(struct arc_buf *buf);
void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset,
    struct arc_buf *buf, dmu_tx_t *tx);
int dmu_xuio_init(struct xuio *uio, int niov);
void dmu_xuio_fini(struct xuio *uio);
int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off,
    size_t n);
int dmu_xuio_cnt(struct xuio *uio);
struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i);
void dmu_xuio_clear(struct xuio *uio, int i);
void xuio_stat_wbuf_copied(void);
void xuio_stat_wbuf_nocopy(void);

extern boolean_t zfs_prefetch_disable;
extern int zfs_max_recordsize;

/*
 * Asynchronously try to read in the data.
 */
void dmu_prefetch(objset_t *os, uint64_t object, int64_t level,
    uint64_t offset, uint64_t len, enum zio_priority pri);

typedef struct dmu_object_info {
	/* All sizes are in bytes unless otherwise indicated. */
	uint32_t doi_data_block_size;
	uint32_t doi_metadata_block_size;
	dmu_object_type_t doi_type;
	dmu_object_type_t doi_bonus_type;
	uint64_t doi_bonus_size;
	uint8_t doi_indirection;		/* 2 = dnode->indirect->data */
	uint8_t doi_checksum;
	uint8_t doi_compress;
	uint8_t doi_nblkptr;
	uint8_t doi_pad[4];
	uint64_t doi_physical_blocks_512;	/* data + metadata, 512b blks */
	uint64_t doi_max_offset;
	uint64_t doi_fill_count;		/* number of non-empty blocks */
} dmu_object_info_t;

typedef void arc_byteswap_func_t(void *buf, size_t size);

typedef struct dmu_object_type_info {
	dmu_object_byteswap_t	ot_byteswap;
	boolean_t		ot_metadata;
	boolean_t		ot_dbuf_metadata_cache;
	char			*ot_name;
} dmu_object_type_info_t;

typedef struct dmu_object_byteswap_info {
	arc_byteswap_func_t	*ob_func;
	char			*ob_name;
} dmu_object_byteswap_info_t;

extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES];
extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];

/*
 * Get information on a DMU object.
 *
 * Return 0 on success or ENOENT if object is not allocated.
 *
 * If doi is NULL, the call just indicates whether the object exists.
 */
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);

/* Like dmu_object_info, but faster if you have a held dnode in hand.
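 *
 * (Editorial sketch of the general dmu_object_info() pattern, not part
 * of the original comment; "object" is assumed to be a valid object
 * number, e.g. one returned by dmu_object_next():)
 *
 *	dmu_object_info_t doi;
 *	if (dmu_object_info(os, object, &doi) == 0)
 *		blksize = doi.doi_data_block_size;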
 */
void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi);

/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);

/*
 * Like dmu_object_info_from_db, but faster still when you only care about
 * the size. This is specifically optimized for zfs_getattr().
 */
void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
    u_longlong_t *nblk512);

typedef struct dmu_objset_stats {
	uint64_t dds_num_clones; /* number of clones of this */
	uint64_t dds_creation_txg;
	uint64_t dds_guid;
	dmu_objset_type_t dds_type;
	uint8_t dds_is_snapshot;
	uint8_t dds_inconsistent;
	char dds_origin[ZFS_MAX_DATASET_NAME_LEN];
} dmu_objset_stats_t;

/*
 * Get stats on a dataset.
 */
void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat);

/*
 * Add entries to the nvlist for all the objset's properties. See
 * zfs_prop_table[] and zfs(1m) for details on the properties.
 */
void dmu_objset_stats(objset_t *os, struct nvlist *nv);

/*
 * Get the space usage statistics for statvfs().
 *
 * refdbytes is the amount of space "referenced" by this objset.
 * availbytes is the amount of space available to this objset, taking
 * into account quotas & reservations, assuming that no other objsets
 * use the space first. These values correspond to the 'referenced' and
 * 'available' properties, described in the zfs(1m) manpage.
 *
 * usedobjs and availobjs are the numbers of objects currently allocated
 * and available.
 */
void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp);

/*
 * The fsid_guid is a 56-bit ID that can change to avoid collisions.
 * (Contrast with the ds_guid which is a 64-bit ID that will never
 * change, so there is a small probability that it will collide.)
 */
uint64_t dmu_objset_fsid_guid(objset_t *os);

/*
 * Get the [cm]time for an objset's snapshot dir.
 */
timestruc_t dmu_objset_snap_cmtime(objset_t *os);

int dmu_objset_is_snapshot(objset_t *os);

extern struct spa *dmu_objset_spa(objset_t *os);
extern struct zilog *dmu_objset_zil(objset_t *os);
extern struct dsl_pool *dmu_objset_pool(objset_t *os);
extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
extern void dmu_objset_name(objset_t *os, char *buf);
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
extern uint64_t dmu_objset_id(objset_t *os);
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
    uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
    int maxlen, boolean_t *conflict);
extern int dmu_dir_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp);

typedef int objset_used_cb_t(dmu_object_type_t bonustype,
    void *bonus, uint64_t *userp, uint64_t *groupp);
extern void dmu_objset_register_type(dmu_objset_type_t ost,
    objset_used_cb_t *cb);
extern void dmu_objset_set_user(objset_t *os, void *user_ptr);
extern void *dmu_objset_get_user(objset_t *os);

/*
 * Return the txg number for the given assigned transaction.
 */
uint64_t dmu_tx_get_txg(dmu_tx_t *tx);

/*
 * Synchronous write.
 * If a parent zio is provided this function initiates a write on the
 * provided buffer as a child of the parent zio.
 * In the absence of a parent zio, the write is completed synchronously.
 * At write completion, blk is filled with the bp of the written block.
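 *
 * A rough sketch of how a ZIL get_data path might drive this (editorial
 * illustration only; "lwb", "txg", and the done callback stand in for
 * caller state and are not defined here):
 *
 *	zgd_t *zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP);
 *	zgd->zgd_lwb = lwb;
 *	error = dmu_sync(zio, txg, my_get_done, zgd);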
 * Note that while the data covered by this function will be on stable
 * storage when the write completes, this new data does not become a
 * permanent part of the file until the associated transaction commits.
 */

/*
 * {zfs,zvol,ztest}_get_done() args
 */
typedef struct zgd {
	struct lwb	*zgd_lwb;
	struct blkptr	*zgd_bp;
	dmu_buf_t	*zgd_db;
	struct rl	*zgd_rl;
	void		*zgd_private;
} zgd_t;

typedef void dmu_sync_cb_t(zgd_t *arg, int error);
int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd);

/*
 * Find the next hole or data block in the file starting at *off.
 * Return the found offset in *off. Return ESRCH for end of file.
 */
int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole,
    uint64_t *off);

/*
 * Check if a DMU object has any dirty blocks. If so, sync out
 * all pending transaction groups. Otherwise, this function
 * does not alter DMU state. This could be improved to only sync
 * out the necessary transaction groups for this particular
 * object.
 */
int dmu_object_wait_synced(objset_t *os, uint64_t object);

/*
 * Initial setup and final teardown.
 */
extern void dmu_init(void);
extern void dmu_fini(void);

typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp,
    uint64_t object, uint64_t offset, int len);
void dmu_traverse_objset(objset_t *os, uint64_t txg_start,
    dmu_traverse_cb_t cb, void *arg);

int dmu_diff(const char *tosnap_name, const char *fromsnap_name,
    struct file *fp, offset_t *offp);

/* CRC64 table */
#define	ZFS_CRC64_POLY	0xC96C5795D7870F42ULL	/* ECMA-182, reflected form */
extern uint64_t zfs_crc64_table[256];

extern int zfs_mdcomp_disable;

#ifdef __cplusplus
}
#endif

#endif	/* _SYS_DMU_H */
Index: head/sys/cddl/contrib/opensolaris
===================================================================
--- head/sys/cddl/contrib/opensolaris	(revision 337168)
+++ head/sys/cddl/contrib/opensolaris	(revision 337169)

Property changes on: head/sys/cddl/contrib/opensolaris
___________________________________________________________________
Modified: svn:mergeinfo
## -0,0 +0,1 ##
   Merged /vendor-sys/illumos/dist:r337167