diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index 5ad3b2f269d6..345803e42f2f 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -1,3081 +1,3082 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2022 Hewlett Packard Enterprise Development LP. 
*/ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include #include /* * Needed to close a window in dnode_move() that allows the objset to be freed * before it can be safely accessed. */ krwlock_t os_lock; /* * Tunable to overwrite the maximum number of threads for the parallelization * of dmu_objset_find_dp, needed to speed up the import of pools with many * datasets. * Default is 4 times the number of leaf vdevs. */ int dmu_find_threads = 0; /* * Backfill lower metadnode objects after this many have been freed. * Backfilling negatively impacts object creation rates, so only do it * if there are enough holes to fill. */ int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT; static char *upgrade_tag = "upgrade_tag"; static void dmu_objset_find_dp_cb(void *arg); static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb); static void dmu_objset_upgrade_stop(objset_t *os); void dmu_objset_init(void) { rw_init(&os_lock, NULL, RW_DEFAULT, NULL); } void dmu_objset_fini(void) { rw_destroy(&os_lock); } spa_t * dmu_objset_spa(objset_t *os) { return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { return (os->os_zil); } dsl_pool_t * dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? 
ds->ds_object : 0);
}

/*
 * Return the dnode size cached in the in-core objset (os_dnodesize).
 */
uint64_t
dmu_objset_dnodesize(objset_t *os)
{
	return (os->os_dnodesize);
}

/*
 * Return the "sync" setting cached in the in-core objset (os_sync).
 */
zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
	return (os->os_sync);
}

/*
 * Return the "logbias" setting cached in the in-core objset (os_logbias).
 */
zfs_logbias_op_t
dmu_objset_logbias(objset_t *os)
{
	return (os->os_logbias);
}

/*
 * Property-change callbacks.
 *
 * Each callback below receives the objset_t as 'arg' and the new property
 * value as 'newval', and caches the (possibly translated) value in the
 * corresponding os_* field of the objset.
 */

/*
 * Cache the checksum setting; zio_checksum_select() resolves 'newval'
 * against ZIO_CHECKSUM_ON_VALUE.
 */
static void
checksum_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE);
}

/*
 * Cache the compression setting.  'newval' carries both an algorithm and a
 * level, which are extracted with ZIO_COMPRESS_ALGO() and
 * ZIO_COMPRESS_LEVEL() and stored in os_compress and os_complevel.
 */
static void
compression_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval != ZIO_COMPRESS_INHERIT);

	os->os_compress = zio_compress_select(os->os_spa,
	    ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON);
	/* The level is selected against the algorithm chosen just above. */
	os->os_complevel = zio_complevel_select(os->os_spa, os->os_compress,
	    ZIO_COMPRESS_LEVEL(newval), ZIO_COMPLEVEL_DEFAULT);
}

/*
 * Cache the number of copies; asserted to lie in
 * [1, spa_max_replication()].
 */
static void
copies_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval > 0);
	ASSERT(newval <= spa_max_replication(os->os_spa));

	os->os_copies = newval;
}

/*
 * Cache the dedup setting.  The value returned by
 * zio_checksum_dedup_select() is split into the checksum function
 * (ZIO_CHECKSUM_MASK bits) and the verify flag (ZIO_CHECKSUM_VERIFY bit).
 */
static void
dedup_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;
	spa_t *spa = os->os_spa;
	enum zio_checksum checksum;

	/*
	 * Inheritance should have been done by now.
	 */
	ASSERT(newval != ZIO_CHECKSUM_INHERIT);

	checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF);

	os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK;
	os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY);
}

/*
 * Cache the primary cache policy (all / none / metadata).
 */
static void
primary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_primary_cache = newval;
}

/*
 * Cache the secondary cache policy (all / none / metadata).
 */
static void
secondary_cache_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE ||
	    newval == ZFS_CACHE_METADATA);

	os->os_secondary_cache = newval;
}

/*
 * Cache the sync policy and, when the objset already has a ZIL, forward
 * the new value to it via zil_set_sync().
 */
static void
sync_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS ||
	    newval == ZFS_SYNC_DISABLED);

	os->os_sync = newval;
	if (os->os_zil)
		zil_set_sync(os->os_zil, newval);
}

/*
 * Cache the redundant_metadata policy (all / most / some / none).
 */
static void
redundant_metadata_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
	    newval == ZFS_REDUNDANT_METADATA_MOST ||
	    newval == ZFS_REDUNDANT_METADATA_SOME ||
	    newval == ZFS_REDUNDANT_METADATA_NONE);

	os->os_redundant_metadata = newval;
}

/*
 * Translate the dnodesize property into a byte size in os_dnodesize.
 *
 * NOTE: the switch has no default case, so a value that matches none of
 * the listed ZFS_DNSIZE_* constants leaves os_dnodesize unchanged.
 */
static void
dnodesize_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	switch (newval) {
	case ZFS_DNSIZE_LEGACY:
		os->os_dnodesize = DNODE_MIN_SIZE;
		break;
	case ZFS_DNSIZE_AUTO:
		/*
		 * Choose a dnode size that will work well for most
		 * workloads if the user specified "auto". Future code
		 * improvements could dynamically select a dnode size
		 * based on observed workload patterns.
		 */
		os->os_dnodesize = DNODE_MIN_SIZE * 2;
		break;
	/* For the explicit sizes the property value is stored as-is. */
	case ZFS_DNSIZE_1K:
	case ZFS_DNSIZE_2K:
	case ZFS_DNSIZE_4K:
	case ZFS_DNSIZE_8K:
	case ZFS_DNSIZE_16K:
		os->os_dnodesize = newval;
		break;
	}
}

/*
 * Cache the special_small_blocks threshold; asserted to be at most
 * SPA_MAXBLOCKSIZE and to satisfy ISP2().
 */
static void
smallblk_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	/*
	 * Inheritance and range checking should have been done by now.
	 */
	ASSERT(newval <= SPA_MAXBLOCKSIZE);
	ASSERT(ISP2(newval));

	os->os_zpl_special_smallblock = newval;
}

/*
 * Cache the logbias policy and, when the objset already has a ZIL, forward
 * the new value to it via zil_set_logbias().
 */
static void
logbias_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	ASSERT(newval == ZFS_LOGBIAS_LATENCY ||
	    newval == ZFS_LOGBIAS_THROUGHPUT);
	os->os_logbias = newval;
	if (os->os_zil)
		zil_set_logbias(os->os_zil, newval);
}

/*
 * Cache the recordsize property.  No validation is performed here.
 */
static void
recordsize_changed_cb(void *arg, uint64_t newval)
{
	objset_t *os = arg;

	os->os_recordsize = newval;
}

/*
 * Byteswap an objset_phys_t in place.
 *
 * 'size' must be one of the three known on-disk layouts
 * (OBJSET_PHYS_SIZE_V1, OBJSET_PHYS_SIZE_V2 or sizeof (objset_phys_t));
 * the accounting dnodes are only swapped when the buffer is large enough
 * to contain them.
 */
void
dmu_objset_byteswap(void *buf, size_t size)
{
	objset_phys_t *osp = buf;

	ASSERT(size == OBJSET_PHYS_SIZE_V1 || size == OBJSET_PHYS_SIZE_V2 ||
	    size == sizeof (objset_phys_t));
	dnode_byteswap(&osp->os_meta_dnode);
	byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t));
	osp->os_type = BSWAP_64(osp->os_type);
	osp->os_flags = BSWAP_64(osp->os_flags);
	if (size >= OBJSET_PHYS_SIZE_V2) {
		/* V2 and later carry the user/group accounting dnodes. */
		dnode_byteswap(&osp->os_userused_dnode);
		dnode_byteswap(&osp->os_groupused_dnode);
		/* Only the full-size layout carries the project dnode. */
		if (size >= sizeof (objset_phys_t))
			dnode_byteswap(&osp->os_projectused_dnode);
	}
}

/*
 * The hash is a CRC-based hash of the objset_t pointer and the object number.
 *
 * One byte of the (shifted) pointer and the low three bytes of the object
 * number are run through zfs_crc64_table; the remaining higher-order bits
 * of both are then XORed into the result.
 */
static uint64_t
dnode_hash(const objset_t *os, uint64_t obj)
{
	uintptr_t osv = (uintptr_t)os;
	uint64_t crc = -1ULL;

	ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
	/*
	 * The low 6 bits of the pointer don't have much entropy, because
	 * the objset_t is larger than 2^6 bytes long.
	 */
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF];
	crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF];

	crc ^= (osv>>14) ^ (obj>>24);

	return (crc);
}

static unsigned int
dnode_multilist_index_func(multilist_t *ml, void *obj)
{
	dnode_t *dn = obj;

	/*
	 * The low order bits of the hash value are thought to be
	 * distributed evenly. Otherwise, in the case that the multilist
	 * has a power of two number of sublists, each sublists' usage
	 * would not be evenly distributed.
In this context full 64bit * division would be a waste of time, so limit it to 32 bits. */ return ((unsigned int)dnode_hash(dn->dn_objset, dn->dn_object) % multilist_get_num_sublists(ml)); } static inline boolean_t dmu_os_is_l2cacheable(objset_t *os) { vdev_t *vd = NULL; zfs_cache_type_t cache = os->os_secondary_cache; blkptr_t *bp = os->os_rootbp; if (bp != NULL && !BP_IS_HOLE(bp)) { uint64_t vdev = DVA_GET_VDEV(bp->blk_dva); vdev_t *rvd = os->os_spa->spa_root_vdev; if (vdev < rvd->vdev_children) vd = rvd->vdev_child[vdev]; if (cache == ZFS_CACHE_ALL || cache == ZFS_CACHE_METADATA) { if (vd == NULL) return (B_TRUE); if ((vd->vdev_alloc_bias != VDEV_BIAS_SPECIAL && vd->vdev_alloc_bias != VDEV_BIAS_DEDUP) || l2arc_exclude_special == 0) return (B_TRUE); } } return (B_FALSE); } /* * Instantiates the objset_t in-memory structure corresponding to the * objset_phys_t that's pointed to by the specified blkptr_t. */ int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_t **osp) { objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); ASSERT(!BP_IS_REDACTED(bp)); /* * We need the pool config lock to get properties. */ ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool)); /* * The $ORIGIN dataset (if it exists) doesn't have an associated * objset, so there's no reason to open it. The $ORIGIN dataset * will not exist on pools older than SPA_VERSION_ORIGIN. */ if (ds != NULL && spa_get_dsl(spa) != NULL && spa_get_dsl(spa)->dp_origin_snap != NULL) { ASSERT3P(ds->ds_dir, !=, spa_get_dsl(spa)->dp_origin_snap->ds_dir); } os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; int size; enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; SET_BOOKMARK(&zb, ds ? 
ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (dmu_os_is_l2cacheable(os)) aflags |= ARC_FLAG_L2CACHE; if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) { ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); ASSERT(BP_IS_AUTHENTICATED(bp)); zio_flags |= ZIO_FLAG_RAW; } dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (err != 0) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } if (spa_version(spa) < SPA_VERSION_USERSPACE) size = OBJSET_PHYS_SIZE_V1; else if (!spa_feature_is_enabled(spa, SPA_FEATURE_PROJECT_QUOTA)) size = OBJSET_PHYS_SIZE_V2; else size = sizeof (objset_phys_t); /* Increase the blocksize if we are permitted. */ if (arc_buf_size(os->os_phys_buf) < size) { arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, ARC_BUFC_METADATA, size); bzero(buf->b_data, size); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } os->os_phys = os->os_phys_buf->b_data; os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_PHYS_SIZE_V1; os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, ARC_BUFC_METADATA, size); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); } /* * These properties will be filled in by the logic in zfs_get_zplprop() * when they are queried for the first time. */ os->os_version = OBJSET_PROP_UNINITIALIZED; os->os_normalization = OBJSET_PROP_UNINITIALIZED; os->os_utf8only = OBJSET_PROP_UNINITIALIZED; os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED; /* * Note: the changed_cb will be called once before the register * func returns, thus changing the checksum/compression from the * default (fletcher2/off). 
Snapshots don't need to know about * checksum/compression/copies. */ if (ds != NULL) { os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0); err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name( ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DNODESIZE), dnodesize_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name( ZFS_PROP_SPECIAL_SMALL_BLOCKS), smallblk_changed_cb, os); } } if (err != 0) { arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); kmem_free(os, sizeof (objset_t)); return (err); } } else { /* It's the meta-objset. 
*/ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_ON; os->os_complevel = ZIO_COMPLEVEL_DEFAULT; os->os_encrypted = B_FALSE; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; os->os_dedup_verify = B_FALSE; os->os_logbias = ZFS_LOGBIAS_LATENCY; os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; os->os_dnodesize = DNODE_MIN_SIZE; } if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { multilist_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i]), dnode_multilist_index_func); } list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); list_link_init(&os->os_evicting_node); mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); os->os_obj_next_percpu_len = boot_ncpus; os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0]), KM_SLEEP); dnode_special_open(os, &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (OBJSET_BUF_HAS_USERUSED(os->os_phys_buf)) { dnode_special_open(os, &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, &os->os_userused_dnode); dnode_special_open(os, &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); if (OBJSET_BUF_HAS_PROJECTUSED(os->os_phys_buf)) dnode_special_open(os, &os->os_phys->os_projectused_dnode, DMU_PROJECTUSED_OBJECT, &os->os_projectused_dnode); } mutex_init(&os->os_upgrade_lock, NULL, MUTEX_DEFAULT, NULL); *osp = os; return (0); } int dmu_objset_from_ds(dsl_dataset_t *ds, objset_t 
**osp) { int err = 0; /* * We need the pool_config lock to manipulate the dsl_dataset_t. * Even if the dataset is long-held, we need the pool_config lock * to open the objset, as it needs to get properties. */ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { objset_t *os; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, dsl_dataset_get_blkptr(ds), &os); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (err == 0) { mutex_enter(&ds->ds_lock); ASSERT(ds->ds_objset == NULL); ds->ds_objset = os; mutex_exit(&ds->ds_lock); } } *osp = ds->ds_objset; mutex_exit(&ds->ds_opening_lock); return (err); } /* * Holds the pool while the objset is held. Therefore only one objset * can be held at a time. */ int dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; ds_hold_flags_t flags; flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } err = dmu_objset_from_ds(ds, osp); if (err != 0) { dsl_dataset_rele(ds, tag); dsl_pool_rele(dp, tag); } return (err); } int dmu_objset_hold(const char *name, void *tag, objset_t **osp) { return (dmu_objset_hold_flags(name, B_FALSE, tag, osp)); } static int dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) { (void) tag; int err = dmu_objset_from_ds(ds, osp); if (err != 0) { return (err); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { return (SET_ERROR(EINVAL)); } else if (!readonly && dsl_dataset_is_snapshot(ds)) { return (SET_ERROR(EROFS)); } else if (!readonly && decrypt && dsl_dir_incompatible_encryption_version(ds->ds_dir)) { return (SET_ERROR(EROFS)); } /* if we are decrypting, 
we can now check MACs in os->os_phys_buf */ if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa, &zb, B_FALSE); if (err != 0) return (err); ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf)); } return (0); } /* * dsl_pool must not be held when this is called. * Upon successful return, there will be a longhold on the dataset, * and the dsl_pool will not be held. */ int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; ds_hold_flags_t flags; flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_own(dp, name, flags, tag, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp); if (err != 0) { dsl_dataset_disown(ds, flags, tag); dsl_pool_rele(dp, FTAG); return (err); } /* * User accounting requires the dataset to be decrypted and rw. * We also don't begin user accounting during claiming to help * speed up pool import times and to keep this txg reserved * completely for recovery work. */ if (!readonly && !dp->dp_spa->spa_claiming && (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) { if (dmu_objset_userobjspace_upgradable(*osp) || dmu_objset_projectquota_upgradable(*osp)) { dmu_objset_id_quota_upgrade(*osp); } else if (dmu_objset_userused_enabled(*osp)) { dmu_objset_userspace_upgrade(*osp); } } dsl_pool_rele(dp, FTAG); return (0); } int dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) { dsl_dataset_t *ds; int err; ds_hold_flags_t flags; flags = (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
	err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds);
	if (err != 0)
		return (err);

	err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp);
	if (err != 0) {
		/* Undo the ownership taken above, using the same flags. */
		dsl_dataset_disown(ds, flags, tag);
		return (err);
	}

	return (0);
}

/*
 * Release an objset hold: drops the dataset hold (passing
 * DS_HOLD_FLAG_DECRYPT when 'decrypt' is set) and then the pool hold,
 * both under 'tag'.
 */
void
dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag)
{
	ds_hold_flags_t flags;
	/* The pool pointer is fetched before the dataset hold is dropped. */
	dsl_pool_t *dp = dmu_objset_pool(os);

	flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
	dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag);
	dsl_pool_rele(dp, tag);
}

/*
 * Convenience wrapper: release an objset hold without the decrypt flag.
 */
void
dmu_objset_rele(objset_t *os, void *tag)
{
	dmu_objset_rele_flags(os, B_FALSE, tag);
}

/*
 * When we are called, ds MUST refer to a dataset that is owned by 'tag';
 * that is, it is held and long held by 'tag' and ds_owner == tag.  We then
 * release and reacquire ownership of the dataset while holding the pool
 * config_rwlock, so that no intervening namespace or ownership changes can
 * occur.  The reacquired dataset is returned through 'newds'; failure to
 * reacquire ownership is fatal (VERIFY0).
 *
 * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to
 * release the hold on its dataset and acquire a new one on the dataset of the
 * same name so that it can be partially torn down and reconstructed.
 */
void
dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds,
    boolean_t decrypt, void *tag)
{
	dsl_pool_t *dp;
	char name[ZFS_MAX_DATASET_NAME_LEN];
	ds_hold_flags_t flags;

	flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
	VERIFY3P(ds, !=, NULL);
	VERIFY3P(ds->ds_owner, ==, tag);
	VERIFY(dsl_dataset_long_held(ds));

	/* Capture the name and pool before ownership is dropped. */
	dsl_dataset_name(ds, name);
	dp = ds->ds_dir->dd_pool;
	dsl_pool_config_enter(dp, FTAG);
	dsl_dataset_disown(ds, flags, tag);
	VERIFY0(dsl_dataset_own(dp, name, flags, tag, newds));
	dsl_pool_config_exit(dp, FTAG);
}

void
dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag)
{
	ds_hold_flags_t flags;

	flags = (decrypt) ?
DS_HOLD_FLAG_DECRYPT : DS_HOLD_FLAG_NONE;
	/*
	 * Stop the upgrade thread before ownership is given up.
	 */
	dmu_objset_upgrade_stop(os);
	dsl_dataset_disown(os->os_dsl_dataset, flags, tag);
}

/*
 * Evict the dbufs of every dnode on os->os_dnodes that currently has a
 * hold, followed by the dbufs of the special accounting dnodes (when
 * present) and finally those of the meta dnode.
 */
void
dmu_objset_evict_dbufs(objset_t *os)
{
	dnode_t *dn_marker;
	dnode_t *dn;

	/*
	 * dn_marker is a placeholder list element: it is linked in right
	 * after the dnode being processed so that the position in the list
	 * can be found again after os_lock has been dropped and retaken.
	 */
	dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP);

	mutex_enter(&os->os_lock);
	dn = list_head(&os->os_dnodes);
	while (dn != NULL) {
		/*
		 * Skip dnodes without holds.  We have to do this dance
		 * because dnode_add_ref() only works if there is already a
		 * hold.  If the dnode has no holds, then it has no dbufs.
		 */
		if (dnode_add_ref(dn, FTAG)) {
			list_insert_after(&os->os_dnodes, dn, dn_marker);
			/* os_lock is not held across the eviction itself. */
			mutex_exit(&os->os_lock);

			dnode_evict_dbufs(dn);
			dnode_rele(dn, FTAG);

			mutex_enter(&os->os_lock);
			/* Resume from the marker, then unlink it again. */
			dn = list_next(&os->os_dnodes, dn_marker);
			list_remove(&os->os_dnodes, dn_marker);
		} else {
			dn = list_next(&os->os_dnodes, dn);
		}
	}
	mutex_exit(&os->os_lock);

	kmem_free(dn_marker, sizeof (dnode_t));

	if (DMU_USERUSED_DNODE(os) != NULL) {
		if (DMU_PROJECTUSED_DNODE(os) != NULL)
			dnode_evict_dbufs(DMU_PROJECTUSED_DNODE(os));
		dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os));
		dnode_evict_dbufs(DMU_USERUSED_DNODE(os));
	}
	dnode_evict_dbufs(DMU_META_DNODE(os));
}

/*
 * Objset eviction processing is split into two pieces.
 * The first marks the objset as evicting, evicts any dbufs that
 * have a refcount of zero, and then queues up the objset for the
 * second phase of eviction.  Once os->os_dnodes has been cleared by
 * dnode_buf_pageout()->dnode_destroy(), the second phase is executed.
 * The second phase closes the special dnodes, dequeues the objset from
 * the list of those undergoing eviction, and finally frees the objset.
 *
 * NOTE: Due to asynchronous eviction processing (invocation of
 * dnode_buf_pageout()), it is possible for the meta dnode for the
 * objset to have no holds even though os->os_dnodes is not empty.
 */
void
dmu_objset_evict(objset_t *os)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;

	/* The objset must not be dirty in any txg when it is evicted. */
	for (int t = 0; t < TXG_SIZE; t++)
		ASSERT(!dmu_objset_is_dirty(os, t));

	if (ds)
		dsl_prop_unregister_all(ds, os);

	if (os->os_sa)
		sa_tear_down(os);

	dmu_objset_evict_dbufs(os);

	mutex_enter(&os->os_lock);
	spa_evicting_os_register(os->os_spa, os);
	if (list_is_empty(&os->os_dnodes)) {
		/* No dnodes remain: run the second phase right away. */
		mutex_exit(&os->os_lock);
		dmu_objset_evict_done(os);
	} else {
		/* Dnodes remain: the second phase is not run from here. */
		mutex_exit(&os->os_lock);
	}
}

void
dmu_objset_evict_done(objset_t *os)
{
	ASSERT3P(list_head(&os->os_dnodes), ==, NULL);

	dnode_special_close(&os->os_meta_dnode);
	if (DMU_USERUSED_DNODE(os)) {
		if (DMU_PROJECTUSED_DNODE(os))
			dnode_special_close(&os->os_projectused_dnode);
		dnode_special_close(&os->os_userused_dnode);
		dnode_special_close(&os->os_groupused_dnode);
	}
	zil_free(os->os_zil);

	arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf);

	/*
	 * This is a barrier to prevent the objset from going away in
	 * dnode_move() until we can safely ensure that the objset is still in
	 * use. We consider the objset valid before the barrier and invalid
	 * after the barrier.
*/ rw_enter(&os_lock, RW_READER); rw_exit(&os_lock); kmem_free(os->os_obj_next_percpu, os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0])); mutex_destroy(&os->os_lock); mutex_destroy(&os->os_userused_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); mutex_destroy(&os->os_upgrade_lock); for (int i = 0; i < TXG_SIZE; i++) multilist_destroy(&os->os_dirty_dnodes[i]); spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } inode_timespec_t dmu_objset_snap_cmtime(objset_t *os) { return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } objset_t * dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx) { objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); if (blksz == 0) blksz = DNODE_BLOCK_SIZE; if (ibs == 0) ibs = DN_MAX_INDBLKSHIFT; if (ds != NULL) VERIFY0(dmu_objset_from_ds(ds, &os)); else VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0, DNODE_MIN_SLOTS, tx); /* * We don't want to have to increase the meta-dnode's nlevels * later, because then we could do it in quiescing context while * we are also accessing it in open context. * * This precaution is not necessary for the MOS (ds == NULL), * because the MOS is only updated in syncing context. * This is most fortunate: the MOS is the only objset that * needs to be synced multiple times as spa_sync() iterates * to convergence, so minimizing its dn_nlevels matters. */ if (ds != NULL) { if (levels == 0) { levels = 1; /* * Determine the number of levels necessary for the * meta-dnode to contain DN_MAX_OBJECT dnodes. Note * that in order to ensure that we do not overflow * 64 bits, there has to be a nlevels that gives us a * number of blocks > DN_MAX_OBJECT but < 2^64. 
* Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) * (10) must be less than (64 - log2(DN_MAX_OBJECT)) * (16). */ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < DN_MAX_OBJECT) levels++; } mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = mdn->dn_nlevels = levels; } ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); os->os_phys->os_type = type; /* * Enable user accounting if it is enabled and this is not an * encrypted receive. */ if (dmu_objset_userused_enabled(os) && (!os->os_encrypted || !dmu_objset_is_receiving(os))) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; if (dmu_objset_userobjused_enabled(os)) { ds->ds_feature_activation[ SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; os->os_phys->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; } if (dmu_objset_projectquota_enabled(os)) { ds->ds_feature_activation[ SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; os->os_phys->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE; } os->os_flags = os->os_phys->os_flags; } dsl_dataset_dirty(ds, tx); return (os); } /* called from dsl for meta-objset */ objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx)); } typedef struct dmu_objset_create_arg { const char *doca_name; cred_t *doca_cred; proc_t *doca_proc; void (*doca_userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *doca_userarg; dmu_objset_type_t doca_type; uint64_t doca_flags; dsl_crypto_params_t *doca_dcp; } dmu_objset_create_arg_t; static int dmu_objset_create_check(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; dsl_dataset_t *parentds; objset_t *parentos; const char *tail; int error; if (strchr(doca->doca_name, '@') != NULL) return (SET_ERROR(EINVAL)); if 
(strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) return (SET_ERROR(ENAMETOOLONG)); if (dataset_nestcheck(doca->doca_name) != 0) return (SET_ERROR(ENAMETOOLONG)); error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (error); } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred, doca->doca_proc); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (error); } /* can't create below anything but filesystems (eg. no ZVOLs) */ error = dsl_dataset_hold_obj(pdd->dd_pool, dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (error); } error = dmu_objset_from_ds(parentds, &parentos); if (error != 0) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(pdd, FTAG); return (error); } if (dmu_objset_type(parentos) != DMU_OST_ZFS) { dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(pdd, FTAG); return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); } dsl_dataset_rele(parentds, FTAG); dsl_dir_rele(pdd, FTAG); return (error); } static void dmu_objset_create_sync(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; dsl_dir_t *pdd; const char *tail; dsl_dataset_t *ds; uint64_t obj; blkptr_t *bp; objset_t *os; zio_t *rzio; VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, doca->doca_cred, doca->doca_dcp, tx); VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj, DS_HOLD_FLAG_DECRYPT, FTAG, &ds)); rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); bp = dsl_dataset_get_blkptr(ds); os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx); rrw_exit(&ds->ds_bp_rwlock, FTAG); if (doca->doca_userfunc != NULL) { doca->doca_userfunc(os, 
doca->doca_userarg, doca->doca_cred, tx); } /* * The doca_userfunc() may write out some data that needs to be * encrypted if the dataset is encrypted (specifically the root * directory). This data must be written out before the encryption * key mapping is removed by dsl_dataset_rele_flags(). Force the * I/O to occur immediately by invoking the relevant sections of * dsl_pool_sync(). */ if (os->os_encrypted) { dsl_dataset_t *tmpds = NULL; boolean_t need_sync_done = B_FALSE; mutex_enter(&ds->ds_lock); ds->ds_owner = FTAG; mutex_exit(&ds->ds_lock); rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds, tx->tx_txg); if (tmpds != NULL) { dsl_dataset_sync(ds, rzio, tx); need_sync_done = B_TRUE; } VERIFY0(zio_wait(rzio)); dmu_objset_sync_done(os, tx); taskq_wait(dp->dp_sync_taskq); if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(spa, ds->ds_key_mapping, ds); } rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds, tx->tx_txg); if (tmpds != NULL) { dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, rzio, tx); } VERIFY0(zio_wait(rzio)); if (need_sync_done) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(spa, ds->ds_key_mapping, ds); dsl_dataset_sync_done(ds, tx); + dmu_buf_rele(ds->ds_dbuf, ds); } mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; mutex_exit(&ds->ds_lock); } spa_history_log_internal_ds(ds, "create", tx, " "); dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg) { dmu_objset_create_arg_t doca; dsl_crypto_params_t tmp_dcp = { 0 }; doca.doca_name = name; doca.doca_cred = CRED(); doca.doca_proc = curproc; doca.doca_flags = flags; doca.doca_userfunc = func; doca.doca_userarg = arg; 
doca.doca_type = type;

	/*
	 * Some callers (mostly for testing) do not provide a dcp on their
	 * own but various code inside the sync task will require it to be
	 * allocated. Rather than adding NULL checks throughout this code
	 * or adding dummy dcp's to all of the callers we simply create a
	 * dummy one here and use that. This zero dcp will have the same
	 * effect as asking for inheritance of all encryption params.
	 */
	doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp;

	int rv = dsl_sync_task(name,
	    dmu_objset_create_check, dmu_objset_create_sync, &doca,
	    6, ZFS_SPACE_CHECK_NORMAL);

	/* only on success */
	if (rv == 0)
		zvol_create_minor(name);
	return (rv);
}

/*
 * Arguments handed by dmu_objset_clone() to dmu_objset_clone_check() and
 * dmu_objset_clone_sync().
 */
typedef struct dmu_objset_clone_arg {
	const char *doca_clone;		/* name of the clone to create */
	const char *doca_origin;	/* name of the dataset to clone from */
	cred_t *doca_cred;
	proc_t *doca_proc;
} dmu_objset_clone_arg_t;

/*
 * Check callback for dmu_objset_clone().
 *
 * Returns EINVAL if the clone name contains '@', ENAMETOOLONG if it is
 * ZFS_MAX_DATASET_NAME_LEN characters or longer, the dsl_dir_hold() error if
 * that fails, EEXIST if dsl_dir_hold() leaves no tail, EDQUOT if
 * dsl_fs_ss_limit_check() fails for any reason, the dsl_dataset_hold() error
 * if the origin cannot be held, EINVAL if the origin is not a snapshot, and
 * 0 otherwise.  All holds taken here are released before returning.
 */
static int
dmu_objset_clone_check(void *arg, dmu_tx_t *tx)
{
	dmu_objset_clone_arg_t *doca = arg;
	dsl_dir_t *pdd;
	const char *tail;
	int error;
	dsl_dataset_t *origin;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	if (strchr(doca->doca_clone, '@') != NULL)
		return (SET_ERROR(EINVAL));

	if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));

	error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail);
	if (error != 0)
		return (error);
	if (tail == NULL) {
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EEXIST));
	}

	error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL,
	    doca->doca_cred, doca->doca_proc);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		/* the specific limit-check error is replaced by EDQUOT */
		return (SET_ERROR(EDQUOT));
	}

	error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin);
	if (error != 0) {
		dsl_dir_rele(pdd, FTAG);
		return (error);
	}

	/* You can only clone snapshots, not the head datasets.
*/
	if (!origin->ds_is_snapshot) {
		dsl_dataset_rele(origin, FTAG);
		dsl_dir_rele(pdd, FTAG);
		return (SET_ERROR(EINVAL));
	}

	dsl_dataset_rele(origin, FTAG);
	dsl_dir_rele(pdd, FTAG);

	return (0);
}

/*
 * Sync callback for dmu_objset_clone(): creates the dataset named doca_clone
 * with doca_origin as its origin and writes a "clone" history record that
 * names the origin and its object number.  Holds are VERIFY0()'d and
 * released before returning.
 */
static void
dmu_objset_clone_sync(void *arg, dmu_tx_t *tx)
{
	dmu_objset_clone_arg_t *doca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dir_t *pdd;
	const char *tail;
	dsl_dataset_t *origin, *ds;
	uint64_t obj;
	char namebuf[ZFS_MAX_DATASET_NAME_LEN];

	VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail));
	VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin));

	obj = dsl_dataset_create_sync(pdd, tail, origin, 0,
	    doca->doca_cred, NULL, tx);

	VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds));
	dsl_dataset_name(origin, namebuf);
	spa_history_log_internal_ds(ds, "clone", tx,
	    "origin=%s (%llu)", namebuf, (u_longlong_t)origin->ds_object);
	dsl_dataset_rele(ds, FTAG);
	dsl_dataset_rele(origin, FTAG);
	dsl_dir_rele(pdd, FTAG);
}

/*
 * Create the dataset named clone as a clone of origin.
 *
 * Runs dmu_objset_clone_check()/dmu_objset_clone_sync() through
 * dsl_sync_task() with the current credentials and process.  Returns the
 * dsl_sync_task() result; when that is 0, zvol_create_minor(clone) is called
 * as well.
 */
int
dmu_objset_clone(const char *clone, const char *origin)
{
	dmu_objset_clone_arg_t doca;

	doca.doca_clone = clone;
	doca.doca_origin = origin;
	doca.doca_cred = CRED();
	doca.doca_proc = curproc;

	int rv = dsl_sync_task(clone,
	    dmu_objset_clone_check, dmu_objset_clone_sync, &doca,
	    6, ZFS_SPACE_CHECK_NORMAL);

	if (rv == 0)
		zvol_create_minor(clone);

	return (rv);
}

/*
 * Take the single snapshot "fsname@snapname".
 *
 * Builds a one-entry nvlist holding that name, passes it to
 * dsl_dataset_snapshot() and returns its result.  The temporary string and
 * the nvlist are freed before returning.
 */
int
dmu_objset_snapshot_one(const char *fsname, const char *snapname)
{
	int err;
	char *longsnap = kmem_asprintf("%s@%s", fsname, snapname);
	nvlist_t *snaps = fnvlist_alloc();

	fnvlist_add_boolean(snaps, longsnap);
	kmem_strfree(longsnap);
	err = dsl_dataset_snapshot(snaps, NULL, NULL);
	fnvlist_free(snaps);
	return (err);
}

/*
 * Taskq callback dispatched by dmu_objset_upgrade(); data is the objset.
 *
 * os_upgrade_status is preset to EINTR.  Unless os_upgrade_exit is already
 * set, os_upgrade_cb is run with os_upgrade_lock dropped and its return
 * value becomes os_upgrade_status.  Afterwards os_upgrade_exit is set,
 * os_upgrade_id is cleared and the upgrade_tag long hold is released.
 */
static void
dmu_objset_upgrade_task_cb(void *data)
{
	objset_t *os = data;

	mutex_enter(&os->os_upgrade_lock);
	os->os_upgrade_status = EINTR;
	if (!os->os_upgrade_exit) {
		int status;

		/* the callback runs without os_upgrade_lock held */
		mutex_exit(&os->os_upgrade_lock);

		status = os->os_upgrade_cb(os);

		mutex_enter(&os->os_upgrade_lock);

		os->os_upgrade_status = status;
	}
	os->os_upgrade_exit = B_TRUE;
	os->os_upgrade_id = 0;
mutex_exit(&os->os_upgrade_lock);
	dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
}

/*
 * Start an asynchronous upgrade of os that runs cb on spa_upgrade_taskq.
 *
 * Returns immediately if os_upgrade_id is already non-zero.  Otherwise a
 * long hold (upgrade_tag) is taken on the dataset; it is released again here
 * when nothing is dispatched (an id is already set or os_upgrade_status is
 * non-zero) or when taskq_dispatch() returns TASKQID_INVALID, in which case
 * os_upgrade_status is set to ENOMEM.  On a successful dispatch the hold is
 * released by dmu_objset_upgrade_task_cb().
 *
 * The caller must hold the pool config lock (asserted).
 */
static void
dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb)
{
	/* unlocked fast-path check; re-checked under os_upgrade_lock below */
	if (os->os_upgrade_id != 0)
		return;

	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));
	dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag);

	mutex_enter(&os->os_upgrade_lock);
	if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) {
		os->os_upgrade_exit = B_FALSE;
		os->os_upgrade_cb = cb;
		os->os_upgrade_id = taskq_dispatch(
		    os->os_spa->spa_upgrade_taskq,
		    dmu_objset_upgrade_task_cb, os, TQ_SLEEP);
		if (os->os_upgrade_id == TASKQID_INVALID) {
			dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
			os->os_upgrade_status = ENOMEM;
		}
	} else {
		dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
	}
	mutex_exit(&os->os_upgrade_lock);
}

/*
 * Ask an upgrade started by dmu_objset_upgrade() to stop.
 *
 * Sets os_upgrade_exit.  If a task id is recorded it is cleared and
 * taskq_cancel_id() is called with os_upgrade_lock dropped; when that
 * returns 0 the upgrade_tag long hold is released here (presumably because
 * the task was cancelled before it could release the hold itself -- TODO
 * confirm against the taskq API).  Finally waits in txg_wait_synced().
 */
static void
dmu_objset_upgrade_stop(objset_t *os)
{
	mutex_enter(&os->os_upgrade_lock);
	os->os_upgrade_exit = B_TRUE;
	if (os->os_upgrade_id != 0) {
		taskqid_t id = os->os_upgrade_id;

		os->os_upgrade_id = 0;
		mutex_exit(&os->os_upgrade_lock);

		if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) {
			dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag);
		}
		txg_wait_synced(os->os_spa->spa_dsl_pool, 0);
	} else {
		mutex_exit(&os->os_upgrade_lock);
	}
}

/*
 * Sync every dnode on the given dirty sublist.
 *
 * Each dnode is removed from list, given an extra reference tagged with its
 * objset's os_synced_dnodes list, inserted on that list, and passed to
 * dnode_sync().  sync_dnodes_task() calls this with the sublist locked.
 */
static void
dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx)
{
	dnode_t *dn;

	while ((dn = multilist_sublist_head(list)) != NULL) {
		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
		ASSERT(dn->dn_dbuf->db_data_pending);
		/*
		 * Initialize dn_zio outside dnode_sync() because the
		 * meta-dnode needs to set it outside dnode_sync().
		 */
		dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio;
		ASSERT(dn->dn_zio);

		ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS);
		multilist_sublist_remove(list, dn);

		/*
		 * See the comment above dnode_rele_task() for an explanation
		 * of why this dnode hold is always needed (even when not
		 * doing user accounting).
*/
		multilist_t *newlist = &dn->dn_objset->os_synced_dnodes;
		(void) dnode_add_ref(dn, newlist);
		multilist_insert(newlist, dn);

		dnode_sync(dn, tx);
	}
}

/*
 * Callback passed to arc_write() by dmu_objset_sync() (together with
 * dmu_objset_write_done()); arg is the objset.
 *
 * Sets the fill count of the root block pointer to the sum of the fill
 * counts of the meta-dnode's block pointers and copies the result into
 * *os->os_rootbp.  When the objset has a dataset, the copy is made with
 * ds_bp_rwlock held as writer.
 */
static void
dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	(void) abuf;
	blkptr_t *bp = zio->io_bp;
	objset_t *os = arg;
	dnode_phys_t *dnp = &os->os_phys->os_meta_dnode;
	uint64_t fill = 0;

	ASSERT(!BP_IS_EMBEDDED(bp));
	ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET);
	ASSERT0(BP_GET_LEVEL(bp));

	/*
	 * Update rootbp fill count: it should be the number of objects
	 * allocated in the object set (not counting the "special"
	 * objects that are stored in the objset_phys_t -- the meta
	 * dnode and user/group/project accounting objects).
	 */
	for (int i = 0; i < dnp->dn_nblkptr; i++)
		fill += BP_GET_FILL(&dnp->dn_blkptr[i]);

	BP_SET_FILL(bp, fill);

	if (os->os_dsl_dataset != NULL)
		rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG);
	*os->os_rootbp = *bp;
	if (os->os_dsl_dataset != NULL)
		rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG);
}

/*
 * Callback passed to arc_write() by dmu_objset_sync(); arg is the objset.
 *
 * For a ZIO_FLAG_IO_REWRITE zio the block pointer is only asserted to be
 * unchanged.  Otherwise the original block is reported as killed and the new
 * one as born to the dataset, using the tx saved in os_synctx.  The block
 * pointer is freed in both cases.
 */
static void
dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg)
{
	(void) abuf;
	blkptr_t *bp = zio->io_bp;
	blkptr_t *bp_orig = &zio->io_bp_orig;
	objset_t *os = arg;

	if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
		ASSERT(BP_EQUAL(bp, bp_orig));
	} else {
		dsl_dataset_t *ds = os->os_dsl_dataset;
		dmu_tx_t *tx = os->os_synctx;

		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
		dsl_dataset_block_born(ds, bp, tx);
	}
	/* presumably the blkptr_copy allocated in dmu_objset_sync() */
	kmem_free(bp, sizeof (*bp));
}

/* Per-sublist work item for sync_dnodes_task(); freed by the task. */
typedef struct sync_dnodes_arg {
	multilist_t *sda_list;		/* dirty dnode multilist */
	int sda_sublist_idx;		/* sublist this task handles */
	/* NOTE(review): sda_newlist is not set or read in the visible code */
	multilist_t *sda_newlist;
	dmu_tx_t *sda_tx;
} sync_dnodes_arg_t;

/*
 * Taskq callback: lock the sublist named by arg, sync its dnodes with
 * dmu_objset_sync_dnodes(), unlock it and free arg.
 */
static void
sync_dnodes_task(void *arg)
{
	sync_dnodes_arg_t *sda = arg;

	multilist_sublist_t *ms =
	    multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx);

	dmu_objset_sync_dnodes(ms, sda->sda_tx);

	multilist_sublist_unlock(ms);

	kmem_free(sda, sizeof (*sda));
}

/*
 * called from dsl
 *
 * Write out os for the syncing tx: creates the root block write as a child
 * of pio, syncs the special dnodes and then the dirty dnodes (in parallel
 * on dp_sync_taskq), issues the meta-dnode's dirty records, calls zil_sync()
 * and finally issues the root block zio.
 */
void
dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx)
{
	int txgoff;
	zbookmark_phys_t zb;
	zio_prop_t zp;
	zio_t
*zio;
	list_t *list;
	dbuf_dirty_record_t *dr;
	int num_sublists;
	multilist_t *ml;
	/* heap copy of the root bp; it is handed to arc_write() below */
	blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP);
	*blkptr_copy = *os->os_rootbp;

	dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", (u_longlong_t)tx->tx_txg);

	ASSERT(dmu_tx_is_syncing(tx));
	/* XXX the write_done callback should really give us the tx... */
	os->os_synctx = tx;

	if (os->os_dsl_dataset == NULL) {
		/*
		 * This is the MOS.  If we have upgraded,
		 * spa_max_replication() could change, so reset
		 * os_copies here.
		 */
		os->os_copies = spa_max_replication(os->os_spa);
	}

	/*
	 * Create the root block IO
	 */
	SET_BOOKMARK(&zb, os->os_dsl_dataset ?
	    os->os_dsl_dataset->ds_object : DMU_META_OBJSET,
	    ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID);
	arc_release(os->os_phys_buf, &os->os_phys_buf);

	dmu_write_policy(os, NULL, 0, 0, &zp);

	/*
	 * If we are either claiming the ZIL or doing a raw receive, write
	 * out the os_phys_buf raw. Neither of these actions will affect the
	 * MAC at this point.
	 */
	if (os->os_raw_receive ||
	    os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
		ASSERT(os->os_encrypted);
		arc_convert_to_raw(os->os_phys_buf,
		    os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER,
		    DMU_OT_OBJSET, NULL, NULL, NULL);
	}

	zio = arc_write(pio, os->os_spa, tx->tx_txg,
	    blkptr_copy, os->os_phys_buf, dmu_os_is_l2cacheable(os),
	    &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done,
	    os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);

	/*
	 * Sync special dnodes - the parent IO for the sync is the root block
	 */
	DMU_META_DNODE(os)->dn_zio = zio;
	dnode_sync(DMU_META_DNODE(os), tx);

	os->os_phys->os_flags = os->os_flags;

	/* user and group accounting dnodes are synced as a pair */
	if (DMU_USERUSED_DNODE(os) &&
	    DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
		DMU_USERUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_USERUSED_DNODE(os), tx);
		DMU_GROUPUSED_DNODE(os)->dn_zio = zio;
		dnode_sync(DMU_GROUPUSED_DNODE(os), tx);
	}

	if (DMU_PROJECTUSED_DNODE(os) &&
	    DMU_PROJECTUSED_DNODE(os)->dn_type != DMU_OT_NONE) {
		DMU_PROJECTUSED_DNODE(os)->dn_zio = zio;
dnode_sync(DMU_PROJECTUSED_DNODE(os), tx);
	}

	txgoff = tx->tx_txg & TXG_MASK;

	/*
	 * We must create the list here because it uses the
	 * dn_dirty_link[] of this txg.  But it may already
	 * exist because we call dsl_dataset_sync() twice per txg.
	 */
	if (os->os_synced_dnodes.ml_sublists == NULL) {
		multilist_create(&os->os_synced_dnodes, sizeof (dnode_t),
		    offsetof(dnode_t, dn_dirty_link[txgoff]),
		    dnode_multilist_index_func);
	} else {
		ASSERT3U(os->os_synced_dnodes.ml_offset, ==,
		    offsetof(dnode_t, dn_dirty_link[txgoff]));
	}

	/* one task per non-empty dirty sublist, then wait for all of them */
	ml = &os->os_dirty_dnodes[txgoff];
	num_sublists = multilist_get_num_sublists(ml);
	for (int i = 0; i < num_sublists; i++) {
		if (multilist_sublist_is_empty_idx(ml, i))
			continue;
		sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP);
		sda->sda_list = ml;
		sda->sda_sublist_idx = i;
		sda->sda_tx = tx;
		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
		    sync_dnodes_task, sda, 0);
		/* callback frees sda */
	}
	taskq_wait(dmu_objset_pool(os)->dp_sync_taskq);

	/* issue the zio of every dirty record left on the meta-dnode */
	list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff];
	while ((dr = list_head(list)) != NULL) {
		ASSERT0(dr->dr_dbuf->db_level);
		list_remove(list, dr);
		zio_nowait(dr->dr_zio);
	}

	/* Enable dnode backfill if enough objects have been freed. */
	if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) {
		os->os_rescan_dnodes = B_TRUE;
		os->os_freed_dnodes = 0;
	}

	/*
	 * Free intent log blocks up to this tx.
*/
	zil_sync(os->os_zil, tx);
	os->os_phys->os_zil_header = os->os_zil_header;
	/* everything below the root block has been issued; issue the root */
	zio_nowait(zio);
}

/*
 * Returns B_TRUE if os has at least one dnode on its dirty list for txg
 * (the list is selected with txg & TXG_MASK).
 */
boolean_t
dmu_objset_is_dirty(objset_t *os, uint64_t txg)
{
	return (!multilist_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]));
}

/*
 * File-info callbacks indexed by objset type; entries are installed by
 * dmu_objset_register_type() and are NULL until then.
 */
static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES];

/*
 * Install cb as the file-info callback for objset type ost.
 * NOTE(review): ost is used as an array index without a range check.
 */
void
dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb)
{
	file_cbs[ost] = cb;
}

/*
 * Run the file-info callback registered for os's type on (bonustype, data),
 * filling in *zfi.  Returns EINVAL when no callback is registered for that
 * type, otherwise the callback's return value.
 */
int
dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data,
    zfs_file_info_t *zfi)
{
	file_info_cb_t *cb = file_cbs[os->os_phys->os_type];
	if (cb == NULL)
		return (EINVAL);
	return (cb(bonustype, data, zfi));
}

/*
 * User/group space accounting is available when the pool version is at
 * least SPA_VERSION_USERSPACE, a file-info callback is registered for this
 * objset type, and the objset has a USERUSED dnode.
 */
boolean_t
dmu_objset_userused_enabled(objset_t *os)
{
	return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE &&
	    file_cbs[os->os_phys->os_type] != NULL &&
	    DMU_USERUSED_DNODE(os) != NULL);
}

/*
 * User object accounting additionally requires the
 * SPA_FEATURE_USEROBJ_ACCOUNTING pool feature to be enabled.
 */
boolean_t
dmu_objset_userobjused_enabled(objset_t *os)
{
	return (dmu_objset_userused_enabled(os) &&
	    spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING));
}

/*
 * Project quota accounting requires a registered file-info callback, a
 * PROJECTUSED dnode, and the SPA_FEATURE_PROJECT_QUOTA pool feature.
 */
boolean_t
dmu_objset_projectquota_enabled(objset_t *os)
{
	return (file_cbs[os->os_phys->os_type] != NULL &&
	    DMU_PROJECTUSED_DNODE(os) != NULL &&
	    spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA));
}

/* One pending accounting delta, keyed by the id string in uqn_id. */
typedef struct userquota_node {
	/* must be in the first field, see userquota_update_cache() */
	char		uqn_id[20 + DMU_OBJACCT_PREFIX_LEN];
	int64_t		uqn_delta;
	avl_node_t	uqn_node;
} userquota_node_t;

/* Per-task trees of pending deltas for user, group and project ids. */
typedef struct userquota_cache {
	avl_tree_t uqc_user_deltas;
	avl_tree_t uqc_group_deltas;
	avl_tree_t uqc_project_deltas;
} userquota_cache_t;

/*
 * AVL comparator for userquota_node_t: orders nodes by strcmp() of uqn_id
 * and passes the result through TREE_ISIGN().
 */
static int
userquota_compare(const void *l, const void *r)
{
	const userquota_node_t *luqn = l;
	const userquota_node_t *ruqn = r;
	int rv;

	/*
	 * NB: can only access uqn_id because userquota_update_cache() doesn't
	 * pass in an entire userquota_node_t.
*/
	rv = strcmp(luqn->uqn_id, ruqn->uqn_id);

	return (TREE_ISIGN(rv));
}

/*
 * Apply every delta collected in cache to the on-disk accounting objects
 * and tear the cache down.
 *
 * Each node of the user and group trees is added to DMU_USERUSED_OBJECT /
 * DMU_GROUPUSED_OBJECT with zap_increment() under os_userused_lock and then
 * freed; the trees are destroyed afterwards.  The project tree is handled
 * the same way, but only when project quota is enabled for os.  tx must be
 * a syncing tx (asserted).
 */
static void
do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx)
{
	void *cookie;
	userquota_node_t *uqn;

	ASSERT(dmu_tx_is_syncing(tx));

	cookie = NULL;
	while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas,
	    &cookie)) != NULL) {
		/*
		 * os_userused_lock protects against concurrent calls to
		 * zap_increment_int().  It's needed because zap_increment_int()
		 * is not thread-safe (i.e. not atomic).
		 *
		 * NOTE(review): the call below is zap_increment(); the
		 * function name in this comment looks stale.
		 */
		mutex_enter(&os->os_userused_lock);
		VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT,
		    uqn->uqn_id, uqn->uqn_delta, tx));
		mutex_exit(&os->os_userused_lock);
		kmem_free(uqn, sizeof (*uqn));
	}
	avl_destroy(&cache->uqc_user_deltas);

	cookie = NULL;
	while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas,
	    &cookie)) != NULL) {
		mutex_enter(&os->os_userused_lock);
		VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT,
		    uqn->uqn_id, uqn->uqn_delta, tx));
		mutex_exit(&os->os_userused_lock);
		kmem_free(uqn, sizeof (*uqn));
	}
	avl_destroy(&cache->uqc_group_deltas);

	/* mirrors the conditional avl_create() in userquota_updates_task() */
	if (dmu_objset_projectquota_enabled(os)) {
		cookie = NULL;
		while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas,
		    &cookie)) != NULL) {
			mutex_enter(&os->os_userused_lock);
			VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT,
			    uqn->uqn_id, uqn->uqn_delta, tx));
			mutex_exit(&os->os_userused_lock);
			kmem_free(uqn, sizeof (*uqn));
		}
		avl_destroy(&cache->uqc_project_deltas);
	}
}

/*
 * Add delta to the node keyed by id in avl, creating a zeroed node first if
 * none exists.  id must be shorter than sizeof (uqn_id) (asserted).
 */
static void
userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta)
{
	userquota_node_t *uqn;
	avl_index_t idx;

	ASSERT(strlen(id) < sizeof (uqn->uqn_id));
	/*
	 * Use id directly for searching because uqn_id is the first field of
	 * userquota_node_t and fields after uqn_id won't be accessed in
	 * avl_find().
*/
	uqn = avl_find(avl, (const void *)id, &idx);
	if (uqn == NULL) {
		uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP);
		strlcpy(uqn->uqn_id, id, sizeof (uqn->uqn_id));
		avl_insert(avl, uqn, idx);
	}
	uqn->uqn_delta += delta;
}

/*
 * Queue a space-accounting delta for one dnode.
 *
 * Does nothing unless flags has DNODE_FLAG_USERUSED_ACCOUNTED.  The delta is
 * DNODE_MIN_SIZE + used, negated when subtract is set, and is recorded in
 * cache under the hexadecimal string of user, of group and, when project
 * quota is enabled for os, of project.
 */
static void
do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used,
    uint64_t flags, uint64_t user, uint64_t group, uint64_t project,
    boolean_t subtract)
{
	if (flags & DNODE_FLAG_USERUSED_ACCOUNTED) {
		int64_t delta = DNODE_MIN_SIZE + used;
		char name[20];

		if (subtract)
			delta = -delta;

		(void) snprintf(name, sizeof (name), "%llx", (longlong_t)user);
		userquota_update_cache(&cache->uqc_user_deltas, name, delta);

		(void) snprintf(name, sizeof (name), "%llx", (longlong_t)group);
		userquota_update_cache(&cache->uqc_group_deltas, name, delta);

		if (dmu_objset_projectquota_enabled(os)) {
			(void) snprintf(name, sizeof (name), "%llx",
			    (longlong_t)project);
			userquota_update_cache(&cache->uqc_project_deltas,
			    name, delta);
		}
	}
}

/*
 * Queue an object-count delta (+1, or -1 when subtract is set) for one
 * dnode.
 *
 * Does nothing unless flags has DNODE_FLAG_USEROBJUSED_ACCOUNTED.  The keys
 * are the same hexadecimal ids as in do_userquota_update(), prefixed with
 * DMU_OBJACCT_PREFIX.
 */
static void
do_userobjquota_update(objset_t *os, userquota_cache_t *cache, uint64_t flags,
    uint64_t user, uint64_t group, uint64_t project, boolean_t subtract)
{
	if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) {
		char name[20 + DMU_OBJACCT_PREFIX_LEN];
		int delta = subtract ?
-1 : 1;

		(void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
		    (longlong_t)user);
		userquota_update_cache(&cache->uqc_user_deltas, name, delta);

		(void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx",
		    (longlong_t)group);
		userquota_update_cache(&cache->uqc_group_deltas, name, delta);

		if (dmu_objset_projectquota_enabled(os)) {
			(void) snprintf(name, sizeof (name),
			    DMU_OBJACCT_PREFIX "%llx", (longlong_t)project);
			userquota_update_cache(&cache->uqc_project_deltas,
			    name, delta);
		}
	}
}

/*
 * Work item for userquota_updates_task() and dnode_rele_task(); allocated
 * by dmu_objset_sync_done() and freed by the task.
 */
typedef struct userquota_updates_arg {
	objset_t *uua_os;
	int uua_sublist_idx;	/* os_synced_dnodes sublist to process */
	dmu_tx_t *uua_tx;
} userquota_updates_arg_t;

/*
 * Taskq callback: update user/group/project accounting for every dnode on
 * one sublist of os_synced_dnodes.
 *
 * For each dnode the old ids are charged a negative delta and the new ids a
 * positive one (collected in a local userquota_cache_t), the dnode's "old"
 * id fields are refreshed from the "new" ones under dn_mtx, and the dnode is
 * removed from the sublist and released.  The collected deltas are then
 * flushed with do_userquota_cacheflush() and arg is freed.
 */
static void
userquota_updates_task(void *arg)
{
	userquota_updates_arg_t *uua = arg;
	objset_t *os = uua->uua_os;
	dmu_tx_t *tx = uua->uua_tx;
	dnode_t *dn;
	userquota_cache_t cache = { { 0 } };

	multilist_sublist_t *list =
	    multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);

	ASSERT(multilist_sublist_head(list) == NULL ||
	    dmu_objset_userused_enabled(os));
	avl_create(&cache.uqc_user_deltas, userquota_compare,
	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
	avl_create(&cache.uqc_group_deltas, userquota_compare,
	    sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node));
	/* the project tree only exists when project quota is enabled */
	if (dmu_objset_projectquota_enabled(os))
		avl_create(&cache.uqc_project_deltas, userquota_compare,
		    sizeof (userquota_node_t), offsetof(userquota_node_t,
		    uqn_node));

	while ((dn = multilist_sublist_head(list)) != NULL) {
		int flags;
		ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object));
		ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE ||
		    dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED);

		flags = dn->dn_id_flags;
		ASSERT(flags);
		/* un-charge the previous owner ids */
		if (flags & DN_ID_OLD_EXIST)  {
			do_userquota_update(os, &cache, dn->dn_oldused,
			    dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid,
			    dn->dn_oldprojid, B_TRUE);
			do_userobjquota_update(os, &cache, dn->dn_oldflags,
			    dn->dn_olduid, dn->dn_oldgid,
			    dn->dn_oldprojid, B_TRUE);
		}
		/* charge the current owner ids */
		if (flags & DN_ID_NEW_EXIST) {
			do_userquota_update(os, &cache,
DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags,
			    dn->dn_newuid, dn->dn_newgid,
			    dn->dn_newprojid, B_FALSE);
			do_userobjquota_update(os, &cache,
			    dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid,
			    dn->dn_newprojid, B_FALSE);
		}

		/* roll the "new" ids over into the "old" slots under dn_mtx */
		mutex_enter(&dn->dn_mtx);
		dn->dn_oldused = 0;
		dn->dn_oldflags = 0;
		if (dn->dn_id_flags & DN_ID_NEW_EXIST) {
			dn->dn_olduid = dn->dn_newuid;
			dn->dn_oldgid = dn->dn_newgid;
			dn->dn_oldprojid = dn->dn_newprojid;
			dn->dn_id_flags |= DN_ID_OLD_EXIST;
			if (dn->dn_bonuslen == 0)
				dn->dn_id_flags |= DN_ID_CHKED_SPILL;
			else
				dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		}
		dn->dn_id_flags &= ~(DN_ID_NEW_EXIST);
		mutex_exit(&dn->dn_mtx);

		/* drops the reference taken in dmu_objset_sync_dnodes() */
		multilist_sublist_remove(list, dn);
		dnode_rele(dn, &os->os_synced_dnodes);
	}
	do_userquota_cacheflush(os, &cache, tx);
	multilist_sublist_unlock(list);
	kmem_free(uua, sizeof (*uua));
}

/*
 * Release dnode holds from dmu_objset_sync_dnodes().  When the dnode is being
 * synced (i.e. we have issued the zio's for blocks in the dnode), it can't be
 * evicted because the block containing the dnode can't be evicted until it is
 * written out.  However, this hold is necessary to prevent the dnode_t from
 * being moved (via dnode_move()) while it's still referenced by
 * dbuf_dirty_record_t:dr_dnode.  And dr_dnode is needed for
 * dirty_lightweight_leaf-type dirty records.
 *
 * If we are doing user-object accounting, the dnode_rele() happens from
 * userquota_updates_task() instead.
 *
 * arg is a userquota_updates_arg_t; only uua_os and uua_sublist_idx are
 * used, and arg is freed before returning.
 */
static void
dnode_rele_task(void *arg)
{
	userquota_updates_arg_t *uua = arg;
	objset_t *os = uua->uua_os;

	multilist_sublist_t *list =
	    multilist_sublist_lock(&os->os_synced_dnodes, uua->uua_sublist_idx);

	dnode_t *dn;
	while ((dn = multilist_sublist_head(list)) != NULL) {
		multilist_sublist_remove(list, dn);
		dnode_rele(dn, &os->os_synced_dnodes);
	}
	multilist_sublist_unlock(list);
	kmem_free(uua, sizeof (*uua));
}

/*
 * Return TRUE if userquota updates are needed.
*/
/*
 * Returns B_FALSE when user accounting is not enabled for os, when os is
 * encrypted and being received, or when tx's txg is not past
 * spa_claim_max_txg.  Otherwise the user/group (and, if project quota is
 * enabled, project) accounting objects are created if their dnodes are
 * still DMU_OT_NONE, and B_TRUE is returned.
 */
static boolean_t
dmu_objset_do_userquota_updates_prep(objset_t *os, dmu_tx_t *tx)
{
	if (!dmu_objset_userused_enabled(os))
		return (B_FALSE);

	/*
	 * If this is a raw receive just return and handle accounting
	 * later when we have the keys loaded. We also don't do user
	 * accounting during claiming since the datasets are not owned
	 * for the duration of claiming and this txg should only be
	 * used for recovery.
	 */
	if (os->os_encrypted && dmu_objset_is_receiving(os))
		return (B_FALSE);

	if (tx->tx_txg <= os->os_spa->spa_claim_max_txg)
		return (B_FALSE);

	/* Allocate the user/group/project used objects if necessary. */
	if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
		VERIFY0(zap_create_claim(os,
		    DMU_USERUSED_OBJECT,
		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
		VERIFY0(zap_create_claim(os,
		    DMU_GROUPUSED_OBJECT,
		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
	}

	if (dmu_objset_projectquota_enabled(os) &&
	    DMU_PROJECTUSED_DNODE(os)->dn_type == DMU_OT_NONE) {
		VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT,
		    DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx));
	}
	return (B_TRUE);
}

/*
 * Dispatch taskq tasks to dp_sync_taskq to update the user accounting, and
 * also release the holds on the dnodes from dmu_objset_sync_dnodes().
 * The caller must taskq_wait(dp_sync_taskq).
 *
 * One task is dispatched per sublist of os_synced_dnodes, whether or not
 * the sublist is empty.
 */
void
dmu_objset_sync_done(objset_t *os, dmu_tx_t *tx)
{
	boolean_t need_userquota = dmu_objset_do_userquota_updates_prep(os, tx);

	int num_sublists = multilist_get_num_sublists(&os->os_synced_dnodes);
	for (int i = 0; i < num_sublists; i++) {
		userquota_updates_arg_t *uua =
		    kmem_alloc(sizeof (*uua), KM_SLEEP);
		uua->uua_os = os;
		uua->uua_sublist_idx = i;
		uua->uua_tx = tx;

		/*
		 * If we don't need to update userquotas, use
		 * dnode_rele_task() to call dnode_rele()
		 */
		(void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq,
		    need_userquota ?
userquota_updates_task : dnode_rele_task, uua, 0);
		/* callback frees uua */
	}
}

/*
 * Returns a pointer to data to find uid/gid from
 *
 * If a dirty record for transaction group that is syncing can't
 * be found then NULL is returned.  In the NULL case it is assumed
 * the uid/gid aren't changing.
 *
 * When db has no dirty records at all, its current db_data is returned.
 */
static void *
dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
	dbuf_dirty_record_t *dr;
	void *data;

	if (db->db_dirtycnt == 0)
		return (db->db.db_data);  /* Nothing is changing */

	dr = dbuf_find_dirty_eq(db, tx->tx_txg);

	if (dr == NULL) {
		data = NULL;
	} else {
		/*
		 * For the spill block of a dnode without a bonus buffer,
		 * dr_data is dereferenced through ->b_data; otherwise
		 * dr_data is used directly.
		 */
		if (dr->dr_dnode->dn_bonuslen == 0 &&
		    dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID)
			data = dr->dt.dl.dr_data->b_data;
		else
			data = dr->dt.dl.dr_data;
	}

	return (data);
}

/*
 * Record the user/group/project ids of dn for accounting.
 *
 * With before set, the ids found in the current bonus or spill data are
 * stored as dn_olduid/dn_oldgid/dn_oldprojid; otherwise the ids found in the
 * data being synced in tx are stored as the dn_new* fields.  The ids are
 * extracted by the file-info callback registered for the objset type.
 * dn_id_flags is updated to reflect what was found and checked.
 *
 * Returns without doing anything when user accounting is not enabled for
 * the objset.
 */
void
dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx)
{
	objset_t *os = dn->dn_objset;
	void *data = NULL;
	dmu_buf_impl_t *db = NULL;
	int flags = dn->dn_id_flags;
	int error;
	boolean_t have_spill = B_FALSE;

	if (!dmu_objset_userused_enabled(dn->dn_objset))
		return;

	/*
	 * Raw receives introduce a problem with user accounting. Raw
	 * receives cannot update the user accounting info because the
	 * user ids and the sizes are encrypted. To guarantee that we
	 * never end up with bad user accounting, we simply disable it
	 * during raw receives. We also disable this for normal receives
	 * so that an incremental raw receive may be done on top of an
	 * existing non-raw receive.
*/
	if (os->os_encrypted && dmu_objset_is_receiving(os))
		return;

	/* the "before" ids only need to be captured once */
	if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST|
	    DN_ID_CHKED_SPILL)))
		return;

	/*
	 * Pick the buffer to read ids from.  db is left NULL when the
	 * on-disk bonus area is used; otherwise db->db_mtx is held from
	 * here until after the callback has run.
	 */
	if (before && dn->dn_bonuslen != 0)
		data = DN_BONUS(dn->dn_phys);
	else if (!before && dn->dn_bonuslen != 0) {
		if (dn->dn_bonus) {
			db = dn->dn_bonus;
			mutex_enter(&db->db_mtx);
			data = dmu_objset_userquota_find_data(db, tx);
		} else {
			data = DN_BONUS(dn->dn_phys);
		}
	} else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) {
			/* no bonus data: look in the spill block instead */
			int rf = 0;

			if (RW_WRITE_HELD(&dn->dn_struct_rwlock))
				rf |= DB_RF_HAVESTRUCT;
			error = dmu_spill_hold_by_dnode(dn,
			    rf | DB_RF_MUST_SUCCEED,
			    FTAG, (dmu_buf_t **)&db);
			ASSERT(error == 0);
			mutex_enter(&db->db_mtx);
			data = (before) ? db->db.db_data :
			    dmu_objset_userquota_find_data(db, tx);
			have_spill = B_TRUE;
	} else {
		/* nothing to read ids from; just mark the bonus as checked */
		mutex_enter(&dn->dn_mtx);
		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
		mutex_exit(&dn->dn_mtx);
		return;
	}

	/*
	 * Must always call the callback in case the object
	 * type has changed and that type isn't an object type to track
	 */
	zfs_file_info_t zfi;
	error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi);

	if (before) {
		ASSERT(data);
		dn->dn_olduid = zfi.zfi_user;
		dn->dn_oldgid = zfi.zfi_group;
		dn->dn_oldprojid = zfi.zfi_project;
	} else if (data) {
		dn->dn_newuid = zfi.zfi_user;
		dn->dn_newgid = zfi.zfi_group;
		dn->dn_newprojid = zfi.zfi_project;
	}

	/*
	 * Preserve existing uid/gid when the callback can't determine
	 * what the new uid/gid are and the callback returned EEXIST.
	 * The EEXIST error tells us to just use the existing uid/gid.
	 * If we don't know what the old values are then just assign
	 * them to 0, since that is a new file being created.
*/
	if (!before && data == NULL && error == EEXIST) {
		if (flags & DN_ID_OLD_EXIST) {
			dn->dn_newuid = dn->dn_olduid;
			dn->dn_newgid = dn->dn_oldgid;
			dn->dn_newprojid = dn->dn_oldprojid;
		} else {
			dn->dn_newuid = 0;
			dn->dn_newgid = 0;
			dn->dn_newprojid = ZFS_DEFAULT_PROJID;
		}
		error = 0;
	}

	if (db)
		mutex_exit(&db->db_mtx);

	/* publish the outcome in dn_id_flags under dn_mtx */
	mutex_enter(&dn->dn_mtx);
	if (error == 0 && before)
		dn->dn_id_flags |= DN_ID_OLD_EXIST;
	if (error == 0 && !before)
		dn->dn_id_flags |= DN_ID_NEW_EXIST;

	if (have_spill) {
		dn->dn_id_flags |= DN_ID_CHKED_SPILL;
	} else {
		dn->dn_id_flags |= DN_ID_CHKED_BONUS;
	}
	mutex_exit(&dn->dn_mtx);
	/* drop the hold taken by dmu_spill_hold_by_dnode() */
	if (have_spill)
		dmu_buf_rele((dmu_buf_t *)db, FTAG);
}

/*
 * Tests OBJSET_FLAG_USERACCOUNTING_COMPLETE in the on-disk objset flags.
 * The result is the masked flag value, i.e. non-zero rather than exactly
 * B_TRUE when the flag is set.
 */
boolean_t
dmu_objset_userspace_present(objset_t *os)
{
	return (os->os_phys->os_flags &
	    OBJSET_FLAG_USERACCOUNTING_COMPLETE);
}

/*
 * Tests OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE in the on-disk objset flags
 * (masked flag value, see dmu_objset_userspace_present()).
 */
boolean_t
dmu_objset_userobjspace_present(objset_t *os)
{
	return (os->os_phys->os_flags &
	    OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE);
}

/*
 * Tests OBJSET_FLAG_PROJECTQUOTA_COMPLETE in the on-disk objset flags
 * (masked flag value, see dmu_objset_userspace_present()).
 */
boolean_t
dmu_objset_projectquota_present(objset_t *os)
{
	return (os->os_phys->os_flags &
	    OBJSET_FLAG_PROJECTQUOTA_COMPLETE);
}

/*
 * Dirty the bonus buffer of every object in os so that each one is synced
 * out again and thereby accounted.
 *
 * Returns EINTR when os_upgrade_exit is set or a signal is pending, and 0
 * once the object iteration ends.  Objects whose bonus buffer cannot be held
 * or whose tx cannot be assigned are skipped.
 */
static int
dmu_objset_space_upgrade(objset_t *os)
{
	uint64_t obj;
	int err = 0;

	/*
	 * We simply need to mark every object dirty, so that it will be
	 * synced out and now accounted.  If this is called
	 * concurrently, or if we already did some work before crashing,
	 * that's fine, since we track each object's accounted state
	 * independently.
*/

	/*
	 * The loop ends on the first non-zero dmu_object_next() result;
	 * that value is not propagated (0 is returned after the loop).
	 */
	for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) {
		dmu_tx_t *tx;
		dmu_buf_t *db;
		int objerr;

		/* stop early if dmu_objset_upgrade_stop() asked us to */
		mutex_enter(&os->os_upgrade_lock);
		if (os->os_upgrade_exit)
			err = SET_ERROR(EINTR);
		mutex_exit(&os->os_upgrade_lock);
		if (err != 0)
			return (err);

		if (issig(JUSTLOOKING) && issig(FORREAL))
			return (SET_ERROR(EINTR));

		objerr = dmu_bonus_hold(os, obj, FTAG, &db);
		if (objerr != 0)
			continue;
		tx = dmu_tx_create(os);
		dmu_tx_hold_bonus(tx, obj);
		objerr = dmu_tx_assign(tx, TXG_WAIT);
		if (objerr != 0) {
			dmu_buf_rele(db, FTAG);
			dmu_tx_abort(tx);
			continue;
		}
		dmu_buf_will_dirty(db, tx);
		dmu_buf_rele(db, FTAG);
		dmu_tx_commit(tx);
	}
	return (0);
}

/*
 * Upgrade callback that brings user/group space accounting up to date.
 *
 * Returns 0 immediately if accounting is already complete, EINVAL for a
 * snapshot, ENOTSUP if user accounting is not enabled, or the error from
 * dmu_objset_space_upgrade().  On success the USERACCOUNTING_COMPLETE flag
 * is set in os_flags and the function waits in txg_wait_synced().
 */
static int
dmu_objset_userspace_upgrade_cb(objset_t *os)
{
	int err = 0;

	if (dmu_objset_userspace_present(os))
		return (0);
	if (dmu_objset_is_snapshot(os))
		return (SET_ERROR(EINVAL));
	if (!dmu_objset_userused_enabled(os))
		return (SET_ERROR(ENOTSUP));

	err = dmu_objset_space_upgrade(os);
	if (err)
		return (err);

	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}

/*
 * Start the user/group space accounting upgrade for os asynchronously via
 * dmu_objset_upgrade().
 */
void
dmu_objset_userspace_upgrade(objset_t *os)
{
	dmu_objset_upgrade(os, dmu_objset_userspace_upgrade_cb);
}

/*
 * Upgrade callback that brings user-object accounting and project quota
 * accounting up to date.
 *
 * Returns 0 immediately if both are already complete, EINVAL for a snapshot,
 * ENOTSUP if user accounting is not enabled or if project quota is not
 * enabled while user-object accounting is already complete, or the error
 * from dmu_objset_space_upgrade().  The corresponding dataset feature
 * activations are requested before the upgrade pass, and the matching
 * *_COMPLETE flags are set in os_flags afterwards.
 */
static int
dmu_objset_id_quota_upgrade_cb(objset_t *os)
{
	int err = 0;

	if (dmu_objset_userobjspace_present(os) &&
	    dmu_objset_projectquota_present(os))
		return (0);
	if (dmu_objset_is_snapshot(os))
		return (SET_ERROR(EINVAL));
	if (!dmu_objset_userused_enabled(os))
		return (SET_ERROR(ENOTSUP));
	if (!dmu_objset_projectquota_enabled(os) &&
	    dmu_objset_userobjspace_present(os))
		return (SET_ERROR(ENOTSUP));

	if (dmu_objset_userobjused_enabled(os))
		dmu_objset_ds(os)->ds_feature_activation[
		    SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE;
	if (dmu_objset_projectquota_enabled(os))
		dmu_objset_ds(os)->ds_feature_activation[
		    SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE;

	err = dmu_objset_space_upgrade(os);
	if (err)
		return (err);

	os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE;
	if (dmu_objset_userobjused_enabled(os))
		os->os_flags
|= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE;
	if (dmu_objset_projectquota_enabled(os))
		os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE;

	txg_wait_synced(dmu_objset_pool(os), 0);
	return (0);
}

/*
 * Start the user-object / project quota accounting upgrade for os
 * asynchronously via dmu_objset_upgrade().
 */
void
dmu_objset_id_quota_upgrade(objset_t *os)
{
	dmu_objset_upgrade(os, dmu_objset_id_quota_upgrade_cb);
}

/*
 * True when os is a non-snapshot DMU_OST_ZFS objset on a writeable pool for
 * which user-object accounting is enabled but not yet complete.
 */
boolean_t
dmu_objset_userobjspace_upgradable(objset_t *os)
{
	return (dmu_objset_type(os) == DMU_OST_ZFS &&
	    !dmu_objset_is_snapshot(os) &&
	    dmu_objset_userobjused_enabled(os) &&
	    !dmu_objset_userobjspace_present(os) &&
	    spa_writeable(dmu_objset_spa(os)));
}

/*
 * True when os is a non-snapshot DMU_OST_ZFS objset on a writeable pool for
 * which project quota is enabled but its accounting is not yet complete.
 */
boolean_t
dmu_objset_projectquota_upgradable(objset_t *os)
{
	return (dmu_objset_type(os) == DMU_OST_ZFS &&
	    !dmu_objset_is_snapshot(os) &&
	    dmu_objset_projectquota_enabled(os) &&
	    !dmu_objset_projectquota_present(os) &&
	    spa_writeable(dmu_objset_spa(os)));
}

/*
 * Fill in the four space counters for os by forwarding to
 * dsl_dataset_space() on its dataset.
 */
void
dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp,
    uint64_t *usedobjsp, uint64_t *availobjsp)
{
	dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp,
	    usedobjsp, availobjsp);
}

/* Returns dsl_dataset_fsid_guid() of os's dataset. */
uint64_t
dmu_objset_fsid_guid(objset_t *os)
{
	return (dsl_dataset_fsid_guid(os->os_dsl_dataset));
}

/*
 * Fill in stat: dds_type always comes from the objset; the rest is filled by
 * dsl_dataset_fast_stat() when the objset has a dataset.
 */
void
dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat)
{
	stat->dds_type = os->os_phys->os_type;
	if (os->os_dsl_dataset)
		dsl_dataset_fast_stat(os->os_dsl_dataset, stat);
}

/*
 * Add os's statistics to nv: the dataset's stats (when there is a dataset),
 * then ZFS_PROP_TYPE and ZFS_PROP_USERACCOUNTING.  An objset without a
 * dataset must be of type DMU_OST_META (asserted).
 */
void
dmu_objset_stats(objset_t *os, nvlist_t *nv)
{
	ASSERT(os->os_dsl_dataset ||
	    os->os_phys->os_type == DMU_OST_META);

	if (os->os_dsl_dataset != NULL)
		dsl_dataset_stats(os->os_dsl_dataset, nv);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE,
	    os->os_phys->os_type);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING,
	    dmu_objset_userspace_present(os));
}

/*
 * Returns the dataset's ds_is_snapshot, or B_FALSE for an objset without a
 * dataset.
 */
int
dmu_objset_is_snapshot(objset_t *os)
{
	if (os->os_dsl_dataset != NULL)
		return (os->os_dsl_dataset->ds_is_snapshot);
	else
		return (B_FALSE);
}

/*
 * Look up snapshot name in os's snapshot-name ZAP with MT_NORMALIZE and
 * return the stored spelling in real (at most maxlen bytes) and the
 * normalization-conflict indication in *conflict.
 *
 * Returns ENOENT when the dataset has no snapshot-name ZAP object, otherwise
 * the zap_lookup_norm() result.
 */
int
dmu_snapshot_realname(objset_t *os, const char *name, char *real, int maxlen,
    boolean_t *conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	uint64_t ignored;

	if
(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
		return (SET_ERROR(ENOENT));

	/* the looked-up value itself is discarded (stored in ignored) */
	return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored,
	    MT_NORMALIZE, real, maxlen, conflict));
}

/*
 * Return the snapshot of os found at cursor position *offp.
 *
 * On success the snapshot name is copied into name (namelen bytes
 * available), *idp (if non-NULL) receives the entry's first integer,
 * *case_conflict (if non-NULL) its normalization-conflict flag, and *offp is
 * advanced to the next entry; 0 is returned.
 *
 * Returns ENOENT when the dataset has no snapshot-name ZAP object or the
 * cursor has nothing to retrieve, and ENAMETOOLONG when the name plus its
 * terminator does not fit in namelen; *offp is left unchanged in those
 * cases.  The caller must hold the pool config lock (asserted).
 */
int
dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp, boolean_t *case_conflict)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	zap_cursor_t cursor;
	zap_attribute_t attr;

	ASSERT(dsl_pool_config_held(dmu_objset_pool(os)));

	if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0)
		return (SET_ERROR(ENOENT));

	zap_cursor_init_serialized(&cursor,
	    ds->ds_dir->dd_pool->dp_meta_objset,
	    dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENOENT));
	}

	if (strlen(attr.za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENAMETOOLONG));
	}

	(void) strlcpy(name, attr.za_name, namelen);
	if (idp)
		*idp = attr.za_first_integer;
	if (case_conflict)
		*case_conflict = attr.za_normalization_conflict;
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);

	return (0);
}

/*
 * Look up snapshot name on os's dataset via dsl_dataset_snap_lookup() and
 * return its result; the looked-up value is stored in *value.
 */
int
dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value)
{
	return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value));
}

/*
 * Return the child directory of os's dsl_dir found at cursor position
 * *offp.
 *
 * Output parameters and the ENOENT/ENAMETOOLONG returns follow the same
 * pattern as dmu_snapshot_list_next(), minus the case-conflict flag.
 * ENOENT is also returned when os is not the head dataset of its dir.
 */
int
dmu_dir_list_next(objset_t *os, int namelen, char *name,
    uint64_t *idp, uint64_t *offp)
{
	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
	zap_cursor_t cursor;
	zap_attribute_t attr;

	/* there is no next dir on a snapshot!
*/
	if (os->os_dsl_dataset->ds_object !=
	    dsl_dir_phys(dd)->dd_head_dataset_obj)
		return (SET_ERROR(ENOENT));

	zap_cursor_init_serialized(&cursor,
	    dd->dd_pool->dp_meta_objset,
	    dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp);

	if (zap_cursor_retrieve(&cursor, &attr) != 0) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENOENT));
	}

	if (strlen(attr.za_name) + 1 > namelen) {
		zap_cursor_fini(&cursor);
		return (SET_ERROR(ENAMETOOLONG));
	}

	(void) strlcpy(name, attr.za_name, namelen);
	if (idp)
		*idp = attr.za_first_integer;
	zap_cursor_advance(&cursor);
	*offp = zap_cursor_serialize(&cursor);
	zap_cursor_fini(&cursor);

	return (0);
}

/*
 * Work item describing one dsl_dir to visit in dmu_objset_find_dp_impl().
 * The item (and dc_ddname, if set) is freed by dmu_objset_find_dp_impl().
 */
typedef struct dmu_objset_find_ctx {
	taskq_t		*dc_tq;		/* NULL: recurse synchronously */
	dsl_pool_t	*dc_dp;
	uint64_t	dc_ddobj;	/* dsl_dir object to visit */
	char		*dc_ddname; /* last component of ddobj's name */
	int		(*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *);
	void		*dc_arg;	/* passed through to dc_func */
	int		dc_flags;	/* DS_FIND_* flags */
	kmutex_t	*dc_error_lock;	/* protects *dc_error */
	int		*dc_error;	/* first error, shared by all items */
} dmu_objset_find_ctx_t;

/*
 * Visit the dsl_dir described by dcp.
 *
 * Depending on dc_flags, recurses into (or dispatches tasks for) the child
 * dirs and calls dc_func on each snapshot, then calls dc_func on the dir's
 * head dataset.  The first error is stored in *dc_error; nothing is done if
 * an error is already recorded there.  dcp and its dc_ddname are always
 * freed before returning.
 */
static void
dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp)
{
	dsl_pool_t *dp = dcp->dc_dp;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	zap_cursor_t zc;
	zap_attribute_t *attr;
	uint64_t thisobj;
	int err = 0;

	/* don't process if there already was an error */
	if (*dcp->dc_error != 0)
		goto out;

	/*
	 * Note: passing the name (dc_ddname) here is optional, but it
	 * improves performance because we don't need to call
	 * zap_value_search() to determine the name.
	 */
	err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd);
	if (err != 0)
		goto out;

	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
	if (dd->dd_myname[0] == '$') {
		dsl_dir_rele(dd, FTAG);
		goto out;
	}

	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	/*
	 * Iterate over all children.
*/ if (dcp->dc_flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); dmu_objset_find_ctx_t *child_dcp = kmem_alloc(sizeof (*child_dcp), KM_SLEEP); *child_dcp = *dcp; child_dcp->dc_ddobj = attr->za_first_integer; child_dcp->dc_ddname = spa_strdup(attr->za_name); if (dcp->dc_tq != NULL) (void) taskq_dispatch(dcp->dc_tq, dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP); else dmu_objset_find_dp_impl(child_dcp); } zap_cursor_fini(&zc); } /* * Iterate over all snapshots. */ if (dcp->dc_flags & DS_FIND_SNAPSHOTS) { dsl_dataset_t *ds; err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); err = dsl_dataset_hold_obj(dp, attr->za_first_integer, FTAG, &ds); if (err != 0) break; err = dcp->dc_func(dp, ds, dcp->dc_arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } zap_cursor_fini(&zc); } } kmem_free(attr, sizeof (zap_attribute_t)); if (err != 0) { dsl_dir_rele(dd, FTAG); goto out; } /* * Apply to self. */ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); /* * Note: we hold the dir while calling dsl_dataset_hold_obj() so * that the dir will remain cached, and we won't have to re-instantiate * it (which could be expensive due to finding its name via * zap_value_search()). 
*/ dsl_dir_rele(dd, FTAG); if (err != 0) goto out; err = dcp->dc_func(dp, ds, dcp->dc_arg); dsl_dataset_rele(ds, FTAG); out: if (err != 0) { mutex_enter(dcp->dc_error_lock); /* only keep first error */ if (*dcp->dc_error == 0) *dcp->dc_error = err; mutex_exit(dcp->dc_error_lock); } if (dcp->dc_ddname != NULL) spa_strfree(dcp->dc_ddname); kmem_free(dcp, sizeof (*dcp)); } static void dmu_objset_find_dp_cb(void *arg) { dmu_objset_find_ctx_t *dcp = arg; dsl_pool_t *dp = dcp->dc_dp; /* * We need to get a pool_config_lock here, as there are several * assert(pool_config_held) down the stack. Getting a lock via * dsl_pool_config_enter is risky, as it might be stalled by a * pending writer. This would deadlock, as the write lock can * only be granted when our parent thread gives up the lock. * The _prio interface gives us priority over a pending writer. */ dsl_pool_config_enter_prio(dp, FTAG); dmu_objset_find_dp_impl(dcp); dsl_pool_config_exit(dp, FTAG); } /* * Find objsets under and including ddobj, call func(ds) on each. * The order for the enumeration is completely undefined. * func is called with dsl_pool_config held. */ int dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) { int error = 0; taskq_t *tq = NULL; int ntasks; dmu_objset_find_ctx_t *dcp; kmutex_t err_lock; mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL); dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP); dcp->dc_tq = NULL; dcp->dc_dp = dp; dcp->dc_ddobj = ddobj; dcp->dc_ddname = NULL; dcp->dc_func = func; dcp->dc_arg = arg; dcp->dc_flags = flags; dcp->dc_error_lock = &err_lock; dcp->dc_error = &error; if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) { /* * In case a write lock is held we can't make use of * parallelism, as down the stack of the worker threads * the lock is asserted via dsl_pool_config_held. 
* In case of a read lock this is solved by getting a read * lock in each worker thread, which isn't possible in case * of a writer lock. So we fall back to the synchronous path * here. * In the future it might be possible to get some magic into * dsl_pool_config_held in a way that it returns true for * the worker threads so that a single lock held from this * thread suffices. For now, stay single threaded. */ dmu_objset_find_dp_impl(dcp); mutex_destroy(&err_lock); return (error); } ntasks = dmu_find_threads; if (ntasks == 0) ntasks = vdev_count_leaves(dp->dp_spa) * 4; tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks, INT_MAX, 0); if (tq == NULL) { kmem_free(dcp, sizeof (*dcp)); mutex_destroy(&err_lock); return (SET_ERROR(ENOMEM)); } dcp->dc_tq = tq; /* dcp will be freed by task */ (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP); /* * PORTING: this code relies on the property of taskq_wait to wait * until no more tasks are queued and no more tasks are active. As * we always queue new tasks from within other tasks, task_wait * reliably waits for the full recursion to finish, even though we * enqueue new tasks after taskq_wait has been called. * On platforms other than illumos, taskq_wait may not have this * property. */ taskq_wait(tq); taskq_destroy(tq); mutex_destroy(&err_lock); return (error); } /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. * The dp_config_rwlock must not be held when this is called, and it * will not be held when the callback is called. * Therefore this function should only be used when the pool is not changing * (e.g. in syncing context), or the callback can deal with the possible races. 
*/
/*
 * Implementation summary, taken from the body below:
 *  - 'name' is resolved with dsl_dir_hold() while the pool config lock is
 *    held.  A hold failure is returned as-is.  A directory whose name begins
 *    with '$' returns 0 and no callback is made.
 *  - With DS_FIND_CHILDREN, every entry of the child-dir ZAP is visited by
 *    recursing on "name/child".
 *  - With DS_FIND_SNAPSHOTS, func is called with "name@snap" for every entry
 *    of the head dataset's snapshot-name ZAP.
 *  - The config lock is dropped before each recursion and each func call and
 *    retaken afterwards, so func never runs with the lock held by this code.
 *  - The first non-zero result (from a hold, a recursion, or func) stops the
 *    walk and is returned.  Otherwise func(name, arg) is called last and its
 *    value is the return value.
 */
static int
dmu_objset_find_impl(spa_t *spa, const char *name,
    int func(const char *, void *), void *arg, int flags)
{
	dsl_dir_t *dd;
	dsl_pool_t *dp = spa_get_dsl(spa);
	dsl_dataset_t *ds;
	zap_cursor_t zc;
	zap_attribute_t *attr;
	char *child;
	uint64_t thisobj;
	int err;

	dsl_pool_config_enter(dp, FTAG);

	err = dsl_dir_hold(dp, name, FTAG, &dd, NULL);
	if (err != 0) {
		dsl_pool_config_exit(dp, FTAG);
		return (err);
	}

	/* Don't visit hidden ($MOS & $ORIGIN) objsets. */
	if (dd->dd_myname[0] == '$') {
		dsl_dir_rele(dd, FTAG);
		dsl_pool_config_exit(dp, FTAG);
		return (0);
	}

	thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	/* One attribute buffer is reused by both ZAP walks below. */
	attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);

	/*
	 * Iterate over all children.
	 */
	if (flags & DS_FIND_CHILDREN) {
		for (zap_cursor_init(&zc, dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_child_dir_zapobj);
		    zap_cursor_retrieve(&zc, attr) == 0;
		    (void) zap_cursor_advance(&zc)) {
			ASSERT3U(attr->za_integer_length, ==,
			    sizeof (uint64_t));
			ASSERT3U(attr->za_num_integers, ==, 1);

			child = kmem_asprintf("%s/%s", name, attr->za_name);
			/* Recurse without the config lock; retake it after. */
			dsl_pool_config_exit(dp, FTAG);
			err = dmu_objset_find_impl(spa, child,
			    func, arg, flags);
			dsl_pool_config_enter(dp, FTAG);
			kmem_strfree(child);
			if (err != 0)
				break;
		}
		zap_cursor_fini(&zc);

		/* A failed child aborts the walk before snapshots and self. */
		if (err != 0) {
			dsl_dir_rele(dd, FTAG);
			dsl_pool_config_exit(dp, FTAG);
			kmem_free(attr, sizeof (zap_attribute_t));
			return (err);
		}
	}

	/*
	 * Iterate over all snapshots.
	 */
	if (flags & DS_FIND_SNAPSHOTS) {
		err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds);

		if (err == 0) {
			uint64_t snapobj;

			/*
			 * Only the ZAP object number is needed, so the
			 * dataset hold is dropped before iterating.
			 */
			snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
			dsl_dataset_rele(ds, FTAG);

			for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj);
			    zap_cursor_retrieve(&zc, attr) == 0;
			    (void) zap_cursor_advance(&zc)) {
				ASSERT3U(attr->za_integer_length, ==,
				    sizeof (uint64_t));
				ASSERT3U(attr->za_num_integers, ==, 1);

				child = kmem_asprintf("%s@%s",
				    name, attr->za_name);
				/* The callback runs with the lock dropped. */
				dsl_pool_config_exit(dp, FTAG);
				err = func(child, arg);
				dsl_pool_config_enter(dp, FTAG);
				kmem_strfree(child);
				if (err != 0)
					break;
			}
			zap_cursor_fini(&zc);
		}
	}

	dsl_dir_rele(dd, FTAG);
	kmem_free(attr, sizeof (zap_attribute_t));
	dsl_pool_config_exit(dp, FTAG);

	if (err != 0)
		return (err);

	/* Apply to self. */
	return (func(name, arg));
}

/*
 * See comment above dmu_objset_find_impl().
 *
 * Opens the pool for 'name' with spa_open(), runs the walk, and closes the
 * pool again.  Returns the spa_open() error if the open fails, otherwise the
 * result of dmu_objset_find_impl().
 */
int
dmu_objset_find(const char *name, int func(const char *, void *),
    void *arg, int flags)
{
	spa_t *spa;
	int error;

	error = spa_open(name, &spa, FTAG);
	if (error != 0)
		return (error);
	error = dmu_objset_find_impl(spa, name, func, arg, flags);
	spa_close(spa, FTAG);
	return (error);
}

/*
 * Returns the result of dsl_dir_incompatible_encryption_version() for the
 * dsl_dir of the objset's dataset.  os->os_dsl_dataset is dereferenced
 * without a NULL check.
 */
boolean_t
dmu_objset_incompatible_encryption_version(objset_t *os)
{
	return (dsl_dir_incompatible_encryption_version(
	    os->os_dsl_dataset->ds_dir));
}

/*
 * Store the user pointer of the objset.  The caller must already hold
 * os->os_user_ptr_lock (asserted).
 */
void
dmu_objset_set_user(objset_t *os, void *user_ptr)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	os->os_user_ptr = user_ptr;
}

/*
 * Return the user pointer of the objset.  The caller must already hold
 * os->os_user_ptr_lock (asserted).
 */
void *
dmu_objset_get_user(objset_t *os)
{
	ASSERT(MUTEX_HELD(&os->os_user_ptr_lock));
	return (os->os_user_ptr);
}

/*
 * Determine name of filesystem, given name of snapshot.
* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes
 */
/*
 * Return values of dmu_fsname(), as implemented below:
 *   0                        the text before the first '@' was copied to buf
 *   SET_ERROR(EINVAL)        snapname contains no '@'
 *   SET_ERROR(ENAMETOOLONG)  the text before '@' is ZFS_MAX_DATASET_NAME_LEN
 *                            characters or longer
 * buf is not written on either error path.
 */
int
dmu_fsname(const char *snapname, char *buf)
{
	char *atp = strchr(snapname, '@');
	if (atp == NULL)
		return (SET_ERROR(EINVAL));
	if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN)
		return (SET_ERROR(ENAMETOOLONG));
	/*
	 * The size passed is the prefix length plus one for the terminator,
	 * so the copy ends immediately before the '@'.
	 */
	(void) strlcpy(buf, snapname, atp - snapname + 1);
	return (0);
}

/*
 * Call when we think we're going to write/free space in open context
 * to track the amount of dirty data in the open txg, which is also the
 * amount of memory that can not be evicted until this txg syncs.
 *
 * Note that there are two conditions where this can be called from
 * syncing context:
 *
 * [1] When we just created the dataset, in which case we go on with
 *     updating any accounting of dirty data as usual.
 * [2] When we are dirtying MOS data, in which case we only update the
 *     pool's accounting of dirty data.
 */
void
dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx)
{
	dsl_dataset_t *ds = os->os_dsl_dataset;
	/* Value from spa_get_worst_case_asize(); charged to the dsl_dir. */
	int64_t aspace = spa_get_worst_case_asize(os->os_spa, space);

	/* An objset without a dataset skips the per-dir charge. */
	if (ds != NULL) {
		dsl_dir_willuse_space(ds->ds_dir, aspace, tx);
	}

	/* The pool-wide dirty accounting receives the raw 'space' value. */
	dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx);
}

/* Symbol exports for kernel builds; the list continues past this span. */
#if defined(_KERNEL)
EXPORT_SYMBOL(dmu_objset_zil);
EXPORT_SYMBOL(dmu_objset_pool);
EXPORT_SYMBOL(dmu_objset_ds);
EXPORT_SYMBOL(dmu_objset_type);
EXPORT_SYMBOL(dmu_objset_name);
EXPORT_SYMBOL(dmu_objset_hold);
EXPORT_SYMBOL(dmu_objset_hold_flags);
EXPORT_SYMBOL(dmu_objset_own);
EXPORT_SYMBOL(dmu_objset_rele);
EXPORT_SYMBOL(dmu_objset_rele_flags);
EXPORT_SYMBOL(dmu_objset_disown);
EXPORT_SYMBOL(dmu_objset_from_ds);
EXPORT_SYMBOL(dmu_objset_create);
EXPORT_SYMBOL(dmu_objset_clone);
EXPORT_SYMBOL(dmu_objset_stats);
EXPORT_SYMBOL(dmu_objset_fast_stat);
EXPORT_SYMBOL(dmu_objset_spa);
EXPORT_SYMBOL(dmu_objset_space);
EXPORT_SYMBOL(dmu_objset_fsid_guid);
EXPORT_SYMBOL(dmu_objset_find);
EXPORT_SYMBOL(dmu_objset_byteswap);
EXPORT_SYMBOL(dmu_objset_evict_dbufs);
EXPORT_SYMBOL(dmu_objset_snap_cmtime);
EXPORT_SYMBOL(dmu_objset_dnodesize);
EXPORT_SYMBOL(dmu_objset_sync); EXPORT_SYMBOL(dmu_objset_is_dirty); EXPORT_SYMBOL(dmu_objset_create_impl_dnstats); EXPORT_SYMBOL(dmu_objset_create_impl); EXPORT_SYMBOL(dmu_objset_open_impl); EXPORT_SYMBOL(dmu_objset_evict); EXPORT_SYMBOL(dmu_objset_register_type); EXPORT_SYMBOL(dmu_objset_sync_done); EXPORT_SYMBOL(dmu_objset_userquota_get_ids); EXPORT_SYMBOL(dmu_objset_userused_enabled); EXPORT_SYMBOL(dmu_objset_userspace_upgrade); EXPORT_SYMBOL(dmu_objset_userspace_present); EXPORT_SYMBOL(dmu_objset_userobjused_enabled); EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable); EXPORT_SYMBOL(dmu_objset_userobjspace_present); EXPORT_SYMBOL(dmu_objset_projectquota_enabled); EXPORT_SYMBOL(dmu_objset_projectquota_present); EXPORT_SYMBOL(dmu_objset_projectquota_upgradable); EXPORT_SYMBOL(dmu_objset_id_quota_upgrade); #endif diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index bf1f55e68ff5..872174f5f90d 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -1,2876 +1,2873 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2017, Datto, Inc. All rights reserved. * Copyright (c) 2018 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include /* * This file's primary purpose is for managing master encryption keys in * memory and on disk. For more info on how these keys are used, see the * block comment in zio_crypt.c. * * All master keys are stored encrypted on disk in the form of the DSL * Crypto Key ZAP object. 
The binary key data in this object is always * randomly generated and is encrypted with the user's wrapping key. This * layer of indirection allows the user to change their key without * needing to re-encrypt the entire dataset. The ZAP also holds on to the * (non-encrypted) encryption algorithm identifier, IV, and MAC needed to * safely decrypt the master key. For more info on the user's key see the * block comment in libzfs_crypto.c * * In-memory encryption keys are managed through the spa_keystore. The * keystore consists of 3 AVL trees, which are as follows: * * The Wrapping Key Tree: * The wrapping key (wkey) tree stores the user's keys that are fed into the * kernel through 'zfs load-key' and related commands. Datasets inherit their * parent's wkey by default, so these structures are refcounted. The wrapping * keys remain in memory until they are explicitly unloaded (with * "zfs unload-key"). Unloading is only possible when no datasets are using * them (refcount=0). * * The DSL Crypto Key Tree: * The DSL Crypto Keys (DCK) are the in-memory representation of decrypted * master keys. They are used by the functions in zio_crypt.c to perform * encryption, decryption, and authentication. Snapshots and clones of a given * dataset will share a DSL Crypto Key, so they are also refcounted. Once the * refcount on a key hits zero, it is immediately zeroed out and freed. * * The Crypto Key Mapping Tree: * The zio layer needs to lookup master keys by their dataset object id. Since * the DSL Crypto Keys can belong to multiple datasets, we maintain a tree of * dsl_key_mapping_t's which essentially just map the dataset object id to its * appropriate DSL Crypto Key. The management for creating and destroying these * mappings hooks into the code for owning and disowning datasets. 
Usually, * there will only be one active dataset owner, but there are times * (particularly during dataset creation and destruction) when this may not be * true or the dataset may not be initialized enough to own. As a result, this * object is also refcounted. */ /* * This tunable allows datasets to be raw received even if the stream does * not include IVset guids or if the guids don't match. This is used as part * of the resolution for ZPOOL_ERRATA_ZOL_8308_ENCRYPTION. */ int zfs_disable_ivset_guid_check = 0; static void dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, void *tag) { (void) zfs_refcount_add(&wkey->wk_refcnt, tag); } static void dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, void *tag) { (void) zfs_refcount_remove(&wkey->wk_refcnt, tag); } static void dsl_wrapping_key_free(dsl_wrapping_key_t *wkey) { ASSERT0(zfs_refcount_count(&wkey->wk_refcnt)); if (wkey->wk_key.ck_data) { bzero(wkey->wk_key.ck_data, CRYPTO_BITS2BYTES(wkey->wk_key.ck_length)); kmem_free(wkey->wk_key.ck_data, CRYPTO_BITS2BYTES(wkey->wk_key.ck_length)); } zfs_refcount_destroy(&wkey->wk_refcnt); kmem_free(wkey, sizeof (dsl_wrapping_key_t)); } static void dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat, uint64_t salt, uint64_t iters, dsl_wrapping_key_t **wkey_out) { dsl_wrapping_key_t *wkey; /* allocate the wrapping key */ wkey = kmem_alloc(sizeof (dsl_wrapping_key_t), KM_SLEEP); /* allocate and initialize the underlying crypto key */ wkey->wk_key.ck_data = kmem_alloc(WRAPPING_KEY_LEN, KM_SLEEP); wkey->wk_key.ck_format = CRYPTO_KEY_RAW; wkey->wk_key.ck_length = CRYPTO_BYTES2BITS(WRAPPING_KEY_LEN); bcopy(wkeydata, wkey->wk_key.ck_data, WRAPPING_KEY_LEN); /* initialize the rest of the struct */ zfs_refcount_create(&wkey->wk_refcnt); wkey->wk_keyformat = keyformat; wkey->wk_salt = salt; wkey->wk_iters = iters; *wkey_out = wkey; } int dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props, nvlist_t *crypto_args, dsl_crypto_params_t **dcp_out) { int ret; uint64_t 
crypt = ZIO_CRYPT_INHERIT; uint64_t keyformat = ZFS_KEYFORMAT_NONE; uint64_t salt = 0, iters = 0; dsl_crypto_params_t *dcp = NULL; dsl_wrapping_key_t *wkey = NULL; uint8_t *wkeydata = NULL; uint_t wkeydata_len = 0; char *keylocation = NULL; dcp = kmem_zalloc(sizeof (dsl_crypto_params_t), KM_SLEEP); dcp->cp_cmd = cmd; /* get relevant arguments from the nvlists */ if (props != NULL) { (void) nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION), &crypt); (void) nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &keyformat); (void) nvlist_lookup_string(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); (void) nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), &salt); (void) nvlist_lookup_uint64(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &iters); dcp->cp_crypt = crypt; } if (crypto_args != NULL) { (void) nvlist_lookup_uint8_array(crypto_args, "wkeydata", &wkeydata, &wkeydata_len); } /* check for valid command */ if (dcp->cp_cmd >= DCP_CMD_MAX) { ret = SET_ERROR(EINVAL); goto error; } else { dcp->cp_cmd = cmd; } /* check for valid crypt */ if (dcp->cp_crypt >= ZIO_CRYPT_FUNCTIONS) { ret = SET_ERROR(EINVAL); goto error; } else { dcp->cp_crypt = crypt; } /* check for valid keyformat */ if (keyformat >= ZFS_KEYFORMAT_FORMATS) { ret = SET_ERROR(EINVAL); goto error; } /* check for a valid keylocation (of any kind) and copy it in */ if (keylocation != NULL) { if (!zfs_prop_valid_keylocation(keylocation, B_FALSE)) { ret = SET_ERROR(EINVAL); goto error; } dcp->cp_keylocation = spa_strdup(keylocation); } /* check wrapping key length, if given */ if (wkeydata != NULL && wkeydata_len != WRAPPING_KEY_LEN) { ret = SET_ERROR(EINVAL); goto error; } /* if the user asked for the default crypt, determine that now */ if (dcp->cp_crypt == ZIO_CRYPT_ON) dcp->cp_crypt = ZIO_CRYPT_ON_VALUE; /* create the wrapping key from the raw data */ if (wkeydata != NULL) { /* create the wrapping key with the verified parameters */ 
dsl_wrapping_key_create(wkeydata, keyformat, salt, iters, &wkey); dcp->cp_wkey = wkey; } /* * Remove the encryption properties from the nvlist since they are not * maintained through the DSL. */ (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION)); (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT)); (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT)); (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS)); *dcp_out = dcp; return (0); error: kmem_free(dcp, sizeof (dsl_crypto_params_t)); *dcp_out = NULL; return (ret); } void dsl_crypto_params_free(dsl_crypto_params_t *dcp, boolean_t unload) { if (dcp == NULL) return; if (dcp->cp_keylocation != NULL) spa_strfree(dcp->cp_keylocation); if (unload && dcp->cp_wkey != NULL) dsl_wrapping_key_free(dcp->cp_wkey); kmem_free(dcp, sizeof (dsl_crypto_params_t)); } static int spa_crypto_key_compare(const void *a, const void *b) { const dsl_crypto_key_t *dcka = a; const dsl_crypto_key_t *dckb = b; if (dcka->dck_obj < dckb->dck_obj) return (-1); if (dcka->dck_obj > dckb->dck_obj) return (1); return (0); } static int spa_key_mapping_compare(const void *a, const void *b) { const dsl_key_mapping_t *kma = a; const dsl_key_mapping_t *kmb = b; if (kma->km_dsobj < kmb->km_dsobj) return (-1); if (kma->km_dsobj > kmb->km_dsobj) return (1); return (0); } static int spa_wkey_compare(const void *a, const void *b) { const dsl_wrapping_key_t *wka = a; const dsl_wrapping_key_t *wkb = b; if (wka->wk_ddobj < wkb->wk_ddobj) return (-1); if (wka->wk_ddobj > wkb->wk_ddobj) return (1); return (0); } void spa_keystore_init(spa_keystore_t *sk) { rw_init(&sk->sk_dk_lock, NULL, RW_DEFAULT, NULL); rw_init(&sk->sk_km_lock, NULL, RW_DEFAULT, NULL); rw_init(&sk->sk_wkeys_lock, NULL, RW_DEFAULT, NULL); avl_create(&sk->sk_dsl_keys, spa_crypto_key_compare, sizeof (dsl_crypto_key_t), offsetof(dsl_crypto_key_t, dck_avl_link)); avl_create(&sk->sk_key_mappings, spa_key_mapping_compare, sizeof 
(dsl_key_mapping_t), offsetof(dsl_key_mapping_t, km_avl_link)); avl_create(&sk->sk_wkeys, spa_wkey_compare, sizeof (dsl_wrapping_key_t), offsetof(dsl_wrapping_key_t, wk_avl_link)); } void spa_keystore_fini(spa_keystore_t *sk) { dsl_wrapping_key_t *wkey; void *cookie = NULL; ASSERT(avl_is_empty(&sk->sk_dsl_keys)); ASSERT(avl_is_empty(&sk->sk_key_mappings)); while ((wkey = avl_destroy_nodes(&sk->sk_wkeys, &cookie)) != NULL) dsl_wrapping_key_free(wkey); avl_destroy(&sk->sk_wkeys); avl_destroy(&sk->sk_key_mappings); avl_destroy(&sk->sk_dsl_keys); rw_destroy(&sk->sk_wkeys_lock); rw_destroy(&sk->sk_km_lock); rw_destroy(&sk->sk_dk_lock); } static int dsl_dir_get_encryption_root_ddobj(dsl_dir_t *dd, uint64_t *rddobj) { if (dd->dd_crypto_obj == 0) return (SET_ERROR(ENOENT)); return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, rddobj)); } static int dsl_dir_get_encryption_version(dsl_dir_t *dd, uint64_t *version) { *version = 0; if (dd->dd_crypto_obj == 0) return (SET_ERROR(ENOENT)); /* version 0 is implied by ENOENT */ (void) zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, DSL_CRYPTO_KEY_VERSION, 8, 1, version); return (0); } boolean_t dsl_dir_incompatible_encryption_version(dsl_dir_t *dd) { int ret; uint64_t version = 0; ret = dsl_dir_get_encryption_version(dd, &version); if (ret != 0) return (B_FALSE); return (version != ZIO_CRYPT_KEY_CURRENT_VERSION); } static int spa_keystore_wkey_hold_ddobj_impl(spa_t *spa, uint64_t ddobj, void *tag, dsl_wrapping_key_t **wkey_out) { int ret; dsl_wrapping_key_t search_wkey; dsl_wrapping_key_t *found_wkey; ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_wkeys_lock)); /* init the search wrapping key */ search_wkey.wk_ddobj = ddobj; /* lookup the wrapping key */ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &search_wkey, NULL); if (!found_wkey) { ret = SET_ERROR(ENOENT); goto error; } /* increment the refcount */ dsl_wrapping_key_hold(found_wkey, tag); *wkey_out = found_wkey; return 
(0); error: *wkey_out = NULL; return (ret); } static int spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, dsl_wrapping_key_t **wkey_out) { int ret; dsl_wrapping_key_t *wkey; uint64_t rddobj; boolean_t locked = B_FALSE; if (!RW_WRITE_HELD(&spa->spa_keystore.sk_wkeys_lock)) { rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_READER); locked = B_TRUE; } /* get the ddobj that the keylocation property was inherited from */ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); if (ret != 0) goto error; /* lookup the wkey in the avl tree */ ret = spa_keystore_wkey_hold_ddobj_impl(spa, rddobj, tag, &wkey); if (ret != 0) goto error; /* unlock the wkey tree if we locked it */ if (locked) rw_exit(&spa->spa_keystore.sk_wkeys_lock); *wkey_out = wkey; return (0); error: if (locked) rw_exit(&spa->spa_keystore.sk_wkeys_lock); *wkey_out = NULL; return (ret); } int dsl_crypto_can_set_keylocation(const char *dsname, const char *keylocation) { int ret = 0; dsl_dir_t *dd = NULL; dsl_pool_t *dp = NULL; uint64_t rddobj; /* hold the dsl dir */ ret = dsl_pool_hold(dsname, FTAG, &dp); if (ret != 0) goto out; ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL); if (ret != 0) { dd = NULL; goto out; } /* if dd is not encrypted, the value may only be "none" */ if (dd->dd_crypto_obj == 0) { if (strcmp(keylocation, "none") != 0) { ret = SET_ERROR(EACCES); goto out; } ret = 0; goto out; } /* check for a valid keylocation for encrypted datasets */ if (!zfs_prop_valid_keylocation(keylocation, B_TRUE)) { ret = SET_ERROR(EINVAL); goto out; } /* check that this is an encryption root */ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); if (ret != 0) goto out; if (rddobj != dd->dd_object) { ret = SET_ERROR(EACCES); goto out; } dsl_dir_rele(dd, FTAG); dsl_pool_rele(dp, FTAG); return (0); out: if (dd != NULL) dsl_dir_rele(dd, FTAG); if (dp != NULL) dsl_pool_rele(dp, FTAG); return (ret); } static void dsl_crypto_key_free(dsl_crypto_key_t *dck) { ASSERT(zfs_refcount_count(&dck->dck_holds) == 0); 
/* destroy the zio_crypt_key_t */ zio_crypt_key_destroy(&dck->dck_key); /* free the refcount, wrapping key, and lock */ zfs_refcount_destroy(&dck->dck_holds); if (dck->dck_wkey) dsl_wrapping_key_rele(dck->dck_wkey, dck); /* free the key */ kmem_free(dck, sizeof (dsl_crypto_key_t)); } static void dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag) { if (zfs_refcount_remove(&dck->dck_holds, tag) == 0) dsl_crypto_key_free(dck); } static int dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey, uint64_t dckobj, void *tag, dsl_crypto_key_t **dck_out) { int ret; uint64_t crypt = 0, guid = 0, version = 0; uint8_t raw_keydata[MASTER_KEY_MAX_LEN]; uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN]; uint8_t iv[WRAPPING_IV_LEN]; uint8_t mac[WRAPPING_MAC_LEN]; dsl_crypto_key_t *dck; /* allocate and initialize the key */ dck = kmem_zalloc(sizeof (dsl_crypto_key_t), KM_SLEEP); /* fetch all of the values we need from the ZAP */ ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, &crypt); if (ret != 0) goto error; ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid); if (ret != 0) goto error; ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1, MASTER_KEY_MAX_LEN, raw_keydata); if (ret != 0) goto error; ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1, SHA512_HMAC_KEYLEN, raw_hmac_keydata); if (ret != 0) goto error; ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN, iv); if (ret != 0) goto error; ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN, mac); if (ret != 0) goto error; /* the initial on-disk format for encryption did not have a version */ (void) zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_VERSION, 8, 1, &version); /* * Unwrap the keys. If there is an error return EACCES to indicate * an authentication failure. 
*/ ret = zio_crypt_key_unwrap(&wkey->wk_key, crypt, version, guid, raw_keydata, raw_hmac_keydata, iv, mac, &dck->dck_key); if (ret != 0) { ret = SET_ERROR(EACCES); goto error; } /* finish initializing the dsl_crypto_key_t */ zfs_refcount_create(&dck->dck_holds); dsl_wrapping_key_hold(wkey, dck); dck->dck_wkey = wkey; dck->dck_obj = dckobj; zfs_refcount_add(&dck->dck_holds, tag); *dck_out = dck; return (0); error: if (dck != NULL) { bzero(dck, sizeof (dsl_crypto_key_t)); kmem_free(dck, sizeof (dsl_crypto_key_t)); } *dck_out = NULL; return (ret); } static int spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, void *tag, dsl_crypto_key_t **dck_out) { int ret; dsl_crypto_key_t search_dck; dsl_crypto_key_t *found_dck; ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_dk_lock)); /* init the search key */ search_dck.dck_obj = dckobj; /* find the matching key in the keystore */ found_dck = avl_find(&spa->spa_keystore.sk_dsl_keys, &search_dck, NULL); if (!found_dck) { ret = SET_ERROR(ENOENT); goto error; } /* increment the refcount */ zfs_refcount_add(&found_dck->dck_holds, tag); *dck_out = found_dck; return (0); error: *dck_out = NULL; return (ret); } static int spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, dsl_crypto_key_t **dck_out) { int ret; avl_index_t where; dsl_crypto_key_t *dck_io = NULL, *dck_ks = NULL; dsl_wrapping_key_t *wkey = NULL; uint64_t dckobj = dd->dd_crypto_obj; /* Lookup the key in the tree of currently loaded keys */ rw_enter(&spa->spa_keystore.sk_dk_lock, RW_READER); ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck_ks); rw_exit(&spa->spa_keystore.sk_dk_lock); if (ret == 0) { *dck_out = dck_ks; return (0); } /* Lookup the wrapping key from the keystore */ ret = spa_keystore_wkey_hold_dd(spa, dd, FTAG, &wkey); if (ret != 0) { *dck_out = NULL; return (SET_ERROR(EACCES)); } /* Read the key from disk */ ret = dsl_crypto_key_open(spa->spa_meta_objset, wkey, dckobj, tag, &dck_io); if (ret != 0) { dsl_wrapping_key_rele(wkey, 
FTAG); *dck_out = NULL; return (ret); } /* * Add the key to the keystore. It may already exist if it was * added while performing the read from disk. In this case discard * it and return the key from the keystore. */ rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER); ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck_ks); if (ret != 0) { avl_find(&spa->spa_keystore.sk_dsl_keys, dck_io, &where); avl_insert(&spa->spa_keystore.sk_dsl_keys, dck_io, where); *dck_out = dck_io; } else { dsl_crypto_key_free(dck_io); *dck_out = dck_ks; } /* Release the wrapping key (the dsl key now has a reference to it) */ dsl_wrapping_key_rele(wkey, FTAG); rw_exit(&spa->spa_keystore.sk_dk_lock); return (0); } void spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag) { rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER); if (zfs_refcount_remove(&dck->dck_holds, tag) == 0) { avl_remove(&spa->spa_keystore.sk_dsl_keys, dck); dsl_crypto_key_free(dck); } rw_exit(&spa->spa_keystore.sk_dk_lock); } int spa_keystore_load_wkey_impl(spa_t *spa, dsl_wrapping_key_t *wkey) { int ret; avl_index_t where; dsl_wrapping_key_t *found_wkey; rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER); /* insert the wrapping key into the keystore */ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where); if (found_wkey != NULL) { ret = SET_ERROR(EEXIST); goto error_unlock; } avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where); rw_exit(&spa->spa_keystore.sk_wkeys_lock); return (0); error_unlock: rw_exit(&spa->spa_keystore.sk_wkeys_lock); return (ret); } int spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp, boolean_t noop) { int ret; dsl_dir_t *dd = NULL; dsl_crypto_key_t *dck = NULL; dsl_wrapping_key_t *wkey = dcp->cp_wkey; dsl_pool_t *dp = NULL; uint64_t rddobj, keyformat, salt, iters; /* * We don't validate the wrapping key's keyformat, salt, or iters * since they will never be needed after the DCK has been wrapped. 
*/ if (dcp->cp_wkey == NULL || dcp->cp_cmd != DCP_CMD_NONE || dcp->cp_crypt != ZIO_CRYPT_INHERIT || dcp->cp_keylocation != NULL) return (SET_ERROR(EINVAL)); ret = dsl_pool_hold(dsname, FTAG, &dp); if (ret != 0) goto error; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) { ret = SET_ERROR(ENOTSUP); goto error; } /* hold the dsl dir */ ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL); if (ret != 0) { dd = NULL; goto error; } /* confirm that dd is the encryption root */ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); if (ret != 0 || rddobj != dd->dd_object) { ret = SET_ERROR(EINVAL); goto error; } /* initialize the wkey's ddobj */ wkey->wk_ddobj = dd->dd_object; /* verify that the wkey is correct by opening its dsl key */ ret = dsl_crypto_key_open(dp->dp_meta_objset, wkey, dd->dd_crypto_obj, FTAG, &dck); if (ret != 0) goto error; /* initialize the wkey encryption parameters from the DSL Crypto Key */ ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &keyformat); if (ret != 0) goto error; ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt); if (ret != 0) goto error; ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters); if (ret != 0) goto error; ASSERT3U(keyformat, <, ZFS_KEYFORMAT_FORMATS); ASSERT3U(keyformat, !=, ZFS_KEYFORMAT_NONE); IMPLY(keyformat == ZFS_KEYFORMAT_PASSPHRASE, iters != 0); IMPLY(keyformat == ZFS_KEYFORMAT_PASSPHRASE, salt != 0); IMPLY(keyformat != ZFS_KEYFORMAT_PASSPHRASE, iters == 0); IMPLY(keyformat != ZFS_KEYFORMAT_PASSPHRASE, salt == 0); wkey->wk_keyformat = keyformat; wkey->wk_salt = salt; wkey->wk_iters = iters; /* * At this point we have verified the wkey and confirmed that it can * be used to decrypt a DSL Crypto Key. We can simply cleanup and * return if this is all the user wanted to do. 
*/ if (noop) goto error; /* insert the wrapping key into the keystore */ ret = spa_keystore_load_wkey_impl(dp->dp_spa, wkey); if (ret != 0) goto error; dsl_crypto_key_rele(dck, FTAG); dsl_dir_rele(dd, FTAG); dsl_pool_rele(dp, FTAG); /* create any zvols under this ds */ zvol_create_minors_recursive(dsname); return (0); error: if (dck != NULL) dsl_crypto_key_rele(dck, FTAG); if (dd != NULL) dsl_dir_rele(dd, FTAG); if (dp != NULL) dsl_pool_rele(dp, FTAG); return (ret); } int spa_keystore_unload_wkey_impl(spa_t *spa, uint64_t ddobj) { int ret; dsl_wrapping_key_t search_wkey; dsl_wrapping_key_t *found_wkey; /* init the search wrapping key */ search_wkey.wk_ddobj = ddobj; rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER); /* remove the wrapping key from the keystore */ found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &search_wkey, NULL); if (!found_wkey) { ret = SET_ERROR(EACCES); goto error_unlock; } else if (zfs_refcount_count(&found_wkey->wk_refcnt) != 0) { ret = SET_ERROR(EBUSY); goto error_unlock; } avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey); rw_exit(&spa->spa_keystore.sk_wkeys_lock); /* free the wrapping key */ dsl_wrapping_key_free(found_wkey); return (0); error_unlock: rw_exit(&spa->spa_keystore.sk_wkeys_lock); return (ret); } int spa_keystore_unload_wkey(const char *dsname) { int ret = 0; dsl_dir_t *dd = NULL; dsl_pool_t *dp = NULL; spa_t *spa = NULL; ret = spa_open(dsname, &spa, FTAG); if (ret != 0) return (ret); /* * Wait for any outstanding txg IO to complete, releasing any * remaining references on the wkey. 
*/
	if (spa_mode(spa) != SPA_MODE_READ)
		txg_wait_synced(spa->spa_dsl_pool, 0);

	spa_close(spa, FTAG);

	/* hold the pool; the dsl dir itself is held further below */
	ret = dsl_pool_hold(dsname, FTAG, &dp);
	if (ret != 0)
		goto error;

	if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) {
		ret = (SET_ERROR(ENOTSUP));
		goto error;
	}

	ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL);
	if (ret != 0) {
		/* make sure the error path does not release dd */
		dd = NULL;
		goto error;
	}

	/* unload the wkey */
	ret = spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object);
	if (ret != 0)
		goto error;

	dsl_dir_rele(dd, FTAG);
	dsl_pool_rele(dp, FTAG);

	/*
	 * remove any zvols under this ds
	 *
	 * NOTE(review): dp is dereferenced here after dsl_pool_rele();
	 * confirm the pool cannot go away at this point.
	 */
	zvol_remove_minors(dp->dp_spa, dsname, B_TRUE);

	return (0);

error:
	/* release only the holds that were actually obtained */
	if (dd != NULL)
		dsl_dir_rele(dd, FTAG);
	if (dp != NULL)
		dsl_pool_rele(dp, FTAG);

	return (ret);
}

/*
 * Take an additional reference on the key mapping km on behalf of tag.
 * The mapping must already have at least one reference (asserted).
 */
void
key_mapping_add_ref(dsl_key_mapping_t *km, void *tag)
{
	ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1);
	zfs_refcount_add(&km->km_refcnt, tag);
}

/*
 * The locking here is a little tricky to ensure we don't cause unnecessary
 * performance problems. We want to release a key mapping whenever someone
 * decrements the refcount to 0, but freeing the mapping requires removing
 * it from the spa_keystore, which requires holding sk_km_lock as a writer.
 * Most of the time we don't want to hold this lock as a writer, since the
 * same lock is held as a reader for each IO that needs to encrypt / decrypt
 * data for any dataset and in practice we will only actually free the
 * mapping after unmounting a dataset.
 */
void
key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag)
{
	ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1);

	/* fast path: other references remain, nothing to free */
	if (zfs_refcount_remove(&km->km_refcnt, tag) != 0)
		return;

	/*
	 * We think we are going to need to free the mapping. Add a
	 * reference to prevent most other releasers from thinking
	 * this might be their responsibility. This is inherently
	 * racy, so we will confirm that we are legitimately the
	 * last holder once we have the sk_km_lock as a writer.
*/
	zfs_refcount_add(&km->km_refcnt, FTAG);
	rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER);
	if (zfs_refcount_remove(&km->km_refcnt, FTAG) != 0) {
		/* references remain after dropping ours; do not free */
		rw_exit(&spa->spa_keystore.sk_km_lock);
		return;
	}

	/* unlink under the writer lock, tear down after dropping it */
	avl_remove(&spa->spa_keystore.sk_key_mappings, km);
	rw_exit(&spa->spa_keystore.sk_km_lock);

	spa_keystore_dsl_key_rele(spa, km->km_key, km);
	zfs_refcount_destroy(&km->km_refcnt);
	kmem_free(km, sizeof (dsl_key_mapping_t));
}

/*
 * Find or create the key mapping for dataset ds and take a reference on
 * it for tag. If km_out is non-NULL it receives the mapping on success
 * and NULL on failure. Fails with the error returned by
 * spa_keystore_dsl_key_hold_dd().
 */
int
spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag,
    dsl_key_mapping_t **km_out)
{
	int ret;
	avl_index_t where;
	dsl_key_mapping_t *km, *found_km;
	boolean_t should_free = B_FALSE;

	/* Allocate and initialize the mapping */
	km = kmem_zalloc(sizeof (dsl_key_mapping_t), KM_SLEEP);
	zfs_refcount_create(&km->km_refcnt);

	/* the DSL Crypto Key hold is tagged with the mapping itself */
	ret = spa_keystore_dsl_key_hold_dd(spa, ds->ds_dir, km, &km->km_key);
	if (ret != 0) {
		zfs_refcount_destroy(&km->km_refcnt);
		kmem_free(km, sizeof (dsl_key_mapping_t));

		if (km_out != NULL)
			*km_out = NULL;
		return (ret);
	}

	km->km_dsobj = ds->ds_object;

	rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER);

	/*
	 * If a mapping already exists, simply increment its refcount and
	 * cleanup the one we made. We want to allocate / free outside of
	 * the lock because this lock is also used by the zio layer to lookup
	 * key mappings. Otherwise, use the one we created. Normally, there will
	 * only be one active reference at a time (the objset owner), but there
	 * are times when there could be multiple async users.
*/ found_km = avl_find(&spa->spa_keystore.sk_key_mappings, km, &where); if (found_km != NULL) { should_free = B_TRUE; zfs_refcount_add(&found_km->km_refcnt, tag); if (km_out != NULL) *km_out = found_km; } else { zfs_refcount_add(&km->km_refcnt, tag); avl_insert(&spa->spa_keystore.sk_key_mappings, km, where); if (km_out != NULL) *km_out = km; } rw_exit(&spa->spa_keystore.sk_km_lock); if (should_free) { spa_keystore_dsl_key_rele(spa, km->km_key, km); zfs_refcount_destroy(&km->km_refcnt); kmem_free(km, sizeof (dsl_key_mapping_t)); } return (0); } int spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag) { int ret; dsl_key_mapping_t search_km; dsl_key_mapping_t *found_km; /* init the search key mapping */ search_km.km_dsobj = dsobj; rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER); /* find the matching mapping */ found_km = avl_find(&spa->spa_keystore.sk_key_mappings, &search_km, NULL); if (found_km == NULL) { ret = SET_ERROR(ENOENT); goto error_unlock; } rw_exit(&spa->spa_keystore.sk_km_lock); key_mapping_rele(spa, found_km, tag); return (0); error_unlock: rw_exit(&spa->spa_keystore.sk_km_lock); return (ret); } /* * This function is primarily used by the zio and arc layer to lookup * DSL Crypto Keys for encryption. Callers must release the key with * spa_keystore_dsl_key_rele(). The function may also be called with * dck_out == NULL and tag == NULL to simply check that a key exists * without getting a reference to it. 
*/
int
spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag,
    dsl_crypto_key_t **dck_out)
{
	int ret = 0;
	dsl_key_mapping_t search_km;
	dsl_key_mapping_t *found_km;
	dsl_crypto_key_t *dck = NULL;

	/* tag and dck_out must be both set or both NULL */
	ASSERT((tag != NULL && dck_out != NULL) ||
	    (tag == NULL && dck_out == NULL));

	/* init the search key mapping */
	search_km.km_dsobj = dsobj;

	rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER);

	/* look up the mapping for this dataset */
	found_km = avl_find(&spa->spa_keystore.sk_key_mappings, &search_km,
	    NULL);
	if (found_km == NULL) {
		ret = SET_ERROR(ENOENT);
	} else {
		/*
		 * Read the key pointer while the lock is still held:
		 * key_mapping_rele() frees mappings under this lock, so
		 * found_km must not be dereferenced once it is dropped.
		 */
		dck = found_km->km_key;
		if (tag != NULL)
			zfs_refcount_add(&dck->dck_holds, tag);
	}

	rw_exit(&spa->spa_keystore.sk_km_lock);

	/* on failure the caller's pointer is reset to NULL */
	if (dck_out != NULL)
		*dck_out = dck;
	return (ret);
}

/*
 * Check whether a wrapping key can be held for dd. Returns 0 if so;
 * any failure of spa_keystore_wkey_hold_dd() is reported as EACCES.
 */
static int
dmu_objset_check_wkey_loaded(dsl_dir_t *dd)
{
	int ret;
	dsl_wrapping_key_t *wkey = NULL;

	ret = spa_keystore_wkey_hold_dd(dd->dd_pool->dp_spa, dd, FTAG,
	    &wkey);
	if (ret != 0)
		return (SET_ERROR(EACCES));

	/* the hold was only a probe; release it right away */
	dsl_wrapping_key_rele(wkey, FTAG);

	return (0);
}

/*
 * Report the key status of dd: NONE when it has no DSL Crypto Key object,
 * otherwise AVAILABLE or UNAVAILABLE depending on whether its wrapping
 * key is loaded.
 */
static zfs_keystatus_t
dsl_dataset_get_keystatus(dsl_dir_t *dd)
{
	/* check if this dd has a dsl key */
	if (dd->dd_crypto_obj == 0)
		return (ZFS_KEYSTATUS_NONE);

	return (dmu_objset_check_wkey_loaded(dd) == 0 ?
ZFS_KEYSTATUS_AVAILABLE : ZFS_KEYSTATUS_UNAVAILABLE); } static int dsl_dir_get_crypt(dsl_dir_t *dd, uint64_t *crypt) { if (dd->dd_crypto_obj == 0) { *crypt = ZIO_CRYPT_OFF; return (0); } return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, crypt)); } static void dsl_crypto_key_sync_impl(objset_t *mos, uint64_t dckobj, uint64_t crypt, uint64_t root_ddobj, uint64_t guid, uint8_t *iv, uint8_t *mac, uint8_t *keydata, uint8_t *hmac_keydata, uint64_t keyformat, uint64_t salt, uint64_t iters, dmu_tx_t *tx) { VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, &crypt, tx)); VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, &root_ddobj, tx)); VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid, tx)); VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN, iv, tx)); VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN, mac, tx)); VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1, MASTER_KEY_MAX_LEN, keydata, tx)); VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1, SHA512_HMAC_KEYLEN, hmac_keydata, tx)); VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &keyformat, tx)); VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt, tx)); VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters, tx)); } static void dsl_crypto_key_sync(dsl_crypto_key_t *dck, dmu_tx_t *tx) { zio_crypt_key_t *key = &dck->dck_key; dsl_wrapping_key_t *wkey = dck->dck_wkey; uint8_t keydata[MASTER_KEY_MAX_LEN]; uint8_t hmac_keydata[SHA512_HMAC_KEYLEN]; uint8_t iv[WRAPPING_IV_LEN]; uint8_t mac[WRAPPING_MAC_LEN]; ASSERT(dmu_tx_is_syncing(tx)); ASSERT3U(key->zk_crypt, <, ZIO_CRYPT_FUNCTIONS); /* encrypt and store the keys along with the IV and MAC */ VERIFY0(zio_crypt_key_wrap(&dck->dck_wkey->wk_key, key, iv, mac, keydata, hmac_keydata)); /* update the ZAP with the obtained values */ 
dsl_crypto_key_sync_impl(tx->tx_pool->dp_meta_objset, dck->dck_obj, key->zk_crypt, wkey->wk_ddobj, key->zk_guid, iv, mac, keydata, hmac_keydata, wkey->wk_keyformat, wkey->wk_salt, wkey->wk_iters, tx); } typedef struct spa_keystore_change_key_args { const char *skcka_dsname; dsl_crypto_params_t *skcka_cp; } spa_keystore_change_key_args_t; static int spa_keystore_change_key_check(void *arg, dmu_tx_t *tx) { int ret; dsl_dir_t *dd = NULL; dsl_pool_t *dp = dmu_tx_pool(tx); spa_keystore_change_key_args_t *skcka = arg; dsl_crypto_params_t *dcp = skcka->skcka_cp; uint64_t rddobj; /* check for the encryption feature */ if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) { ret = SET_ERROR(ENOTSUP); goto error; } /* check for valid key change command */ if (dcp->cp_cmd != DCP_CMD_NEW_KEY && dcp->cp_cmd != DCP_CMD_INHERIT && dcp->cp_cmd != DCP_CMD_FORCE_NEW_KEY && dcp->cp_cmd != DCP_CMD_FORCE_INHERIT) { ret = SET_ERROR(EINVAL); goto error; } /* hold the dd */ ret = dsl_dir_hold(dp, skcka->skcka_dsname, FTAG, &dd, NULL); if (ret != 0) { dd = NULL; goto error; } /* verify that the dataset is encrypted */ if (dd->dd_crypto_obj == 0) { ret = SET_ERROR(EINVAL); goto error; } /* clones must always use their origin's key */ if (dsl_dir_is_clone(dd)) { ret = SET_ERROR(EINVAL); goto error; } /* lookup the ddobj we are inheriting the keylocation from */ ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); if (ret != 0) goto error; /* Handle inheritance */ if (dcp->cp_cmd == DCP_CMD_INHERIT || dcp->cp_cmd == DCP_CMD_FORCE_INHERIT) { /* no other encryption params should be given */ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT || dcp->cp_keylocation != NULL || dcp->cp_wkey != NULL) { ret = SET_ERROR(EINVAL); goto error; } /* check that this is an encryption root */ if (dd->dd_object != rddobj) { ret = SET_ERROR(EINVAL); goto error; } /* check that the parent is encrypted */ if (dd->dd_parent->dd_crypto_obj == 0) { ret = SET_ERROR(EINVAL); goto error; } /* if we are rewrapping check 
that both keys are loaded */ if (dcp->cp_cmd == DCP_CMD_INHERIT) { ret = dmu_objset_check_wkey_loaded(dd); if (ret != 0) goto error; ret = dmu_objset_check_wkey_loaded(dd->dd_parent); if (ret != 0) goto error; } dsl_dir_rele(dd, FTAG); return (0); } /* handle forcing an encryption root without rewrapping */ if (dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) { /* no other encryption params should be given */ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT || dcp->cp_keylocation != NULL || dcp->cp_wkey != NULL) { ret = SET_ERROR(EINVAL); goto error; } /* check that this is not an encryption root */ if (dd->dd_object == rddobj) { ret = SET_ERROR(EINVAL); goto error; } dsl_dir_rele(dd, FTAG); return (0); } /* crypt cannot be changed after creation */ if (dcp->cp_crypt != ZIO_CRYPT_INHERIT) { ret = SET_ERROR(EINVAL); goto error; } /* we are not inheritting our parent's wkey so we need one ourselves */ if (dcp->cp_wkey == NULL) { ret = SET_ERROR(EINVAL); goto error; } /* check for a valid keyformat for the new wrapping key */ if (dcp->cp_wkey->wk_keyformat >= ZFS_KEYFORMAT_FORMATS || dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_NONE) { ret = SET_ERROR(EINVAL); goto error; } /* * If this dataset is not currently an encryption root we need a new * keylocation for this dataset's new wrapping key. Otherwise we can * just keep the one we already had. 
*/ if (dd->dd_object != rddobj && dcp->cp_keylocation == NULL) { ret = SET_ERROR(EINVAL); goto error; } /* check that the keylocation is valid if it is not NULL */ if (dcp->cp_keylocation != NULL && !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE)) { ret = SET_ERROR(EINVAL); goto error; } /* passphrases require pbkdf2 salt and iters */ if (dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_PASSPHRASE) { if (dcp->cp_wkey->wk_salt == 0 || dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS) { ret = SET_ERROR(EINVAL); goto error; } } else { if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0) { ret = SET_ERROR(EINVAL); goto error; } } /* make sure the dd's wkey is loaded */ ret = dmu_objset_check_wkey_loaded(dd); if (ret != 0) goto error; dsl_dir_rele(dd, FTAG); return (0); error: if (dd != NULL) dsl_dir_rele(dd, FTAG); return (ret); } /* * This function deals with the intricacies of updating wrapping * key references and encryption roots recursively in the event * of a call to 'zfs change-key' or 'zfs promote'. The 'skip' * parameter should always be set to B_FALSE when called * externally. */ static void spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, uint64_t new_rddobj, dsl_wrapping_key_t *wkey, boolean_t skip, dmu_tx_t *tx) { int ret; zap_cursor_t *zc; zap_attribute_t *za; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd = NULL; dsl_crypto_key_t *dck = NULL; uint64_t curr_rddobj; ASSERT(RW_WRITE_HELD(&dp->dp_spa->spa_keystore.sk_wkeys_lock)); /* hold the dd */ VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); /* ignore special dsl dirs */ if (dd->dd_myname[0] == '$' || dd->dd_myname[0] == '%') { dsl_dir_rele(dd, FTAG); return; } ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj); VERIFY(ret == 0 || ret == ENOENT); /* * Stop recursing if this dsl dir didn't inherit from the root * or if this dd is a clone. 
*/ if (ret == ENOENT || (!skip && (curr_rddobj != rddobj || dsl_dir_is_clone(dd)))) { dsl_dir_rele(dd, FTAG); return; } /* * If we don't have a wrapping key just update the dck to reflect the * new encryption root. Otherwise rewrap the entire dck and re-sync it * to disk. If skip is set, we don't do any of this work. */ if (!skip) { if (wkey == NULL) { VERIFY0(zap_update(dp->dp_meta_objset, dd->dd_crypto_obj, DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, &new_rddobj, tx)); } else { VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd, FTAG, &dck)); dsl_wrapping_key_hold(wkey, dck); dsl_wrapping_key_rele(dck->dck_wkey, dck); dck->dck_wkey = wkey; dsl_crypto_key_sync(dck, tx); spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG); } } zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* Recurse into all child dsl dirs. */ for (zap_cursor_init(zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { spa_keystore_change_key_sync_impl(rddobj, za->za_first_integer, new_rddobj, wkey, B_FALSE, tx); } zap_cursor_fini(zc); /* * Recurse into all dsl dirs of clones. We utilize the skip parameter * here so that we don't attempt to process the clones directly. This * is because the clone and its origin share the same dck, which has * already been updated. 
*/ for (zap_cursor_init(zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_clones); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dataset_t *clone; VERIFY0(dsl_dataset_hold_obj(dp, za->za_first_integer, FTAG, &clone)); spa_keystore_change_key_sync_impl(rddobj, clone->ds_dir->dd_object, new_rddobj, wkey, B_TRUE, tx); dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(zc); kmem_free(za, sizeof (zap_attribute_t)); kmem_free(zc, sizeof (zap_cursor_t)); dsl_dir_rele(dd, FTAG); } static void spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_t *ds; avl_index_t where; dsl_pool_t *dp = dmu_tx_pool(tx); spa_t *spa = dp->dp_spa; spa_keystore_change_key_args_t *skcka = arg; dsl_crypto_params_t *dcp = skcka->skcka_cp; dsl_wrapping_key_t *wkey = NULL, *found_wkey; dsl_wrapping_key_t wkey_search; char *keylocation = dcp->cp_keylocation; uint64_t rddobj, new_rddobj; /* create and initialize the wrapping key */ VERIFY0(dsl_dataset_hold(dp, skcka->skcka_dsname, FTAG, &ds)); ASSERT(!ds->ds_is_snapshot); if (dcp->cp_cmd == DCP_CMD_NEW_KEY || dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) { /* * We are changing to a new wkey. Set additional properties * which can be sent along with this ioctl. Note that this * command can set keylocation even if it can't normally be * set via 'zfs set' due to a non-local keylocation. */ if (dcp->cp_cmd == DCP_CMD_NEW_KEY) { wkey = dcp->cp_wkey; wkey->wk_ddobj = ds->ds_dir->dd_object; } else { keylocation = "prompt"; } if (keylocation != NULL) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, keylocation, tx); } VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj)); new_rddobj = ds->ds_dir->dd_object; } else { /* * We are inheritting the parent's wkey. Unset any local * keylocation and grab a reference to the wkey. 
*/ if (dcp->cp_cmd == DCP_CMD_INHERIT) { VERIFY0(spa_keystore_wkey_hold_dd(spa, ds->ds_dir->dd_parent, FTAG, &wkey)); } dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), ZPROP_SRC_NONE, 0, 0, NULL, tx); rddobj = ds->ds_dir->dd_object; VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir->dd_parent, &new_rddobj)); } if (wkey == NULL) { ASSERT(dcp->cp_cmd == DCP_CMD_FORCE_INHERIT || dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY); } rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER); /* recurse through all children and rewrap their keys */ spa_keystore_change_key_sync_impl(rddobj, ds->ds_dir->dd_object, new_rddobj, wkey, B_FALSE, tx); /* * All references to the old wkey should be released now (if it * existed). Replace the wrapping key. */ wkey_search.wk_ddobj = ds->ds_dir->dd_object; found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &wkey_search, NULL); if (found_wkey != NULL) { ASSERT0(zfs_refcount_count(&found_wkey->wk_refcnt)); avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey); dsl_wrapping_key_free(found_wkey); } if (dcp->cp_cmd == DCP_CMD_NEW_KEY) { avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where); avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where); } else if (wkey != NULL) { dsl_wrapping_key_rele(wkey, FTAG); } rw_exit(&spa->spa_keystore.sk_wkeys_lock); dsl_dataset_rele(ds, FTAG); } int spa_keystore_change_key(const char *dsname, dsl_crypto_params_t *dcp) { spa_keystore_change_key_args_t skcka; /* initialize the args struct */ skcka.skcka_dsname = dsname; skcka.skcka_cp = dcp; /* * Perform the actual work in syncing context. The blocks modified * here could be calculated but it would require holding the pool * lock and traversing all of the datasets that will have their keys * changed. 
*/ return (dsl_sync_task(dsname, spa_keystore_change_key_check, spa_keystore_change_key_sync, &skcka, 15, ZFS_SPACE_CHECK_RESERVED)); } int dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent) { int ret; uint64_t curr_rddobj, parent_rddobj; if (dd->dd_crypto_obj == 0) return (0); ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj); if (ret != 0) goto error; /* * if this is not an encryption root, we must make sure we are not * moving dd to a new encryption root */ if (dd->dd_object != curr_rddobj) { ret = dsl_dir_get_encryption_root_ddobj(newparent, &parent_rddobj); if (ret != 0) goto error; if (parent_rddobj != curr_rddobj) { ret = SET_ERROR(EACCES); goto error; } } return (0); error: return (ret); } /* * Check to make sure that a promote from targetdd to origindd will not require * any key rewraps. */ int dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin) { int ret; uint64_t rddobj, op_rddobj, tp_rddobj; /* If the dataset is not encrypted we don't need to check anything */ if (origin->dd_crypto_obj == 0) return (0); /* * If we are not changing the first origin snapshot in a chain * the encryption root won't change either. */ if (dsl_dir_is_clone(origin)) return (0); /* * If the origin is the encryption root we will update * the DSL Crypto Key to point to the target instead. */ ret = dsl_dir_get_encryption_root_ddobj(origin, &rddobj); if (ret != 0) return (ret); if (rddobj == origin->dd_object) return (0); /* * The origin is inheriting its encryption root from its parent. * Check that the parent of the target has the same encryption root. 
*/ ret = dsl_dir_get_encryption_root_ddobj(origin->dd_parent, &op_rddobj); if (ret == ENOENT) return (SET_ERROR(EACCES)); else if (ret != 0) return (ret); ret = dsl_dir_get_encryption_root_ddobj(target->dd_parent, &tp_rddobj); if (ret == ENOENT) return (SET_ERROR(EACCES)); else if (ret != 0) return (ret); if (op_rddobj != tp_rddobj) return (SET_ERROR(EACCES)); return (0); } void dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, dmu_tx_t *tx) { uint64_t rddobj; dsl_pool_t *dp = target->dd_pool; dsl_dataset_t *targetds; dsl_dataset_t *originds; char *keylocation; if (origin->dd_crypto_obj == 0) return; if (dsl_dir_is_clone(origin)) return; VERIFY0(dsl_dir_get_encryption_root_ddobj(origin, &rddobj)); if (rddobj != origin->dd_object) return; /* * If the target is being promoted to the encryption root update the * DSL Crypto Key and keylocation to reflect that. We also need to * update the DSL Crypto Keys of all children inheritting their * encryption root to point to the new target. Otherwise, the check * function ensured that the encryption root will not change. 
*/ keylocation = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(target)->dd_head_dataset_obj, FTAG, &targetds)); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(origin)->dd_head_dataset_obj, FTAG, &originds)); VERIFY0(dsl_prop_get_dd(origin, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), 1, ZAP_MAXVALUELEN, keylocation, NULL, B_FALSE)); dsl_prop_set_sync_impl(targetds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, keylocation, tx); dsl_prop_set_sync_impl(originds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), ZPROP_SRC_NONE, 0, 0, NULL, tx); rw_enter(&dp->dp_spa->spa_keystore.sk_wkeys_lock, RW_WRITER); spa_keystore_change_key_sync_impl(rddobj, origin->dd_object, target->dd_object, NULL, B_FALSE, tx); rw_exit(&dp->dp_spa->spa_keystore.sk_wkeys_lock); dsl_dataset_rele(targetds, FTAG); dsl_dataset_rele(originds, FTAG); kmem_free(keylocation, ZAP_MAXVALUELEN); } int dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, boolean_t *will_encrypt) { int ret; uint64_t pcrypt, crypt; dsl_crypto_params_t dummy_dcp = { 0 }; if (will_encrypt != NULL) *will_encrypt = B_FALSE; if (dcp == NULL) dcp = &dummy_dcp; if (dcp->cp_cmd != DCP_CMD_NONE) return (SET_ERROR(EINVAL)); if (parentdd != NULL) { ret = dsl_dir_get_crypt(parentdd, &pcrypt); if (ret != 0) return (ret); } else { pcrypt = ZIO_CRYPT_OFF; } crypt = (dcp->cp_crypt == ZIO_CRYPT_INHERIT) ? pcrypt : dcp->cp_crypt; ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT); ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT); /* check for valid dcp with no encryption (inherited or local) */ if (crypt == ZIO_CRYPT_OFF) { /* Must not specify encryption params */ if (dcp->cp_wkey != NULL || (dcp->cp_keylocation != NULL && strcmp(dcp->cp_keylocation, "none") != 0)) return (SET_ERROR(EINVAL)); return (0); } if (will_encrypt != NULL) *will_encrypt = B_TRUE; /* * We will now definitely be encrypting. Check the feature flag. 
When * creating the pool the caller will check this for us since we won't * technically have the feature activated yet. */ if (parentdd != NULL && !spa_feature_is_enabled(parentdd->dd_pool->dp_spa, SPA_FEATURE_ENCRYPTION)) { return (SET_ERROR(EOPNOTSUPP)); } /* Check for errata #4 (encryption enabled, bookmark_v2 disabled) */ if (parentdd != NULL && !spa_feature_is_enabled(parentdd->dd_pool->dp_spa, SPA_FEATURE_BOOKMARK_V2)) { return (SET_ERROR(EOPNOTSUPP)); } /* handle inheritance */ if (dcp->cp_wkey == NULL) { ASSERT3P(parentdd, !=, NULL); /* key must be fully unspecified */ if (dcp->cp_keylocation != NULL) return (SET_ERROR(EINVAL)); /* parent must have a key to inherit */ if (pcrypt == ZIO_CRYPT_OFF) return (SET_ERROR(EINVAL)); /* check for parent key */ ret = dmu_objset_check_wkey_loaded(parentdd); if (ret != 0) return (ret); return (0); } /* At this point we should have a fully specified key. Check location */ if (dcp->cp_keylocation == NULL || !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE)) return (SET_ERROR(EINVAL)); /* Must have fully specified keyformat */ switch (dcp->cp_wkey->wk_keyformat) { case ZFS_KEYFORMAT_HEX: case ZFS_KEYFORMAT_RAW: /* requires no pbkdf2 iters and salt */ if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0) return (SET_ERROR(EINVAL)); break; case ZFS_KEYFORMAT_PASSPHRASE: /* requires pbkdf2 iters and salt */ if (dcp->cp_wkey->wk_salt == 0 || dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS) return (SET_ERROR(EINVAL)); break; case ZFS_KEYFORMAT_NONE: default: /* keyformat must be specified and valid */ return (SET_ERROR(EINVAL)); } return (0); } void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, dsl_dataset_t *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; uint64_t crypt; dsl_wrapping_key_t *wkey; /* clones always use their origin's wrapping key */ if (dsl_dir_is_clone(dd)) { ASSERT3P(dcp, ==, NULL); /* * If this is an encrypted clone we just need to clone the * dck 
into dd. Zapify the dd so we can do that. */ if (origin->ds_dir->dd_crypto_obj != 0) { dmu_buf_will_dirty(dd->dd_dbuf, tx); dsl_dir_zapify(dd, tx); dd->dd_crypto_obj = dsl_crypto_key_clone_sync(origin->ds_dir, tx); VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object, DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj, tx)); } return; } /* * A NULL dcp at this point indicates this is the origin dataset * which does not have an objset to encrypt. Raw receives will handle * encryption separately later. In both cases we can simply return. */ if (dcp == NULL || dcp->cp_cmd == DCP_CMD_RAW_RECV) return; crypt = dcp->cp_crypt; wkey = dcp->cp_wkey; /* figure out the effective crypt */ if (crypt == ZIO_CRYPT_INHERIT && dd->dd_parent != NULL) VERIFY0(dsl_dir_get_crypt(dd->dd_parent, &crypt)); /* if we aren't doing encryption just return */ if (crypt == ZIO_CRYPT_OFF || crypt == ZIO_CRYPT_INHERIT) return; /* zapify the dd so that we can add the crypto key obj to it */ dmu_buf_will_dirty(dd->dd_dbuf, tx); dsl_dir_zapify(dd, tx); /* use the new key if given or inherit from the parent */ if (wkey == NULL) { VERIFY0(spa_keystore_wkey_hold_dd(dp->dp_spa, dd->dd_parent, FTAG, &wkey)); } else { wkey->wk_ddobj = dd->dd_object; } ASSERT3P(wkey, !=, NULL); /* Create or clone the DSL crypto key and activate the feature */ dd->dd_crypto_obj = dsl_crypto_key_create_sync(crypt, wkey, tx); VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object, DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj, tx)); dsl_dataset_activate_feature(dsobj, SPA_FEATURE_ENCRYPTION, (void *)B_TRUE, tx); /* * If we inherited the wrapping key we release our reference now. * Otherwise, this is a new key and we need to load it into the * keystore. 
*/ if (dcp->cp_wkey == NULL) { dsl_wrapping_key_rele(wkey, FTAG); } else { VERIFY0(spa_keystore_load_wkey_impl(dp->dp_spa, wkey)); } } typedef struct dsl_crypto_recv_key_arg { uint64_t dcrka_dsobj; uint64_t dcrka_fromobj; dmu_objset_type_t dcrka_ostype; nvlist_t *dcrka_nvl; boolean_t dcrka_do_key; } dsl_crypto_recv_key_arg_t; static int dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds, dmu_objset_type_t ostype, nvlist_t *nvl, dmu_tx_t *tx) { int ret; objset_t *os; dnode_t *mdn; uint8_t *buf = NULL; uint_t len; uint64_t intval, nlevels, blksz, ibs; uint64_t nblkptr, maxblkid; if (ostype != DMU_OST_ZFS && ostype != DMU_OST_ZVOL) return (SET_ERROR(EINVAL)); /* raw receives also need info about the structure of the metadnode */ ret = nvlist_lookup_uint64(nvl, "mdn_compress", &intval); if (ret != 0 || intval >= ZIO_COMPRESS_LEGACY_FUNCTIONS) return (SET_ERROR(EINVAL)); ret = nvlist_lookup_uint64(nvl, "mdn_checksum", &intval); if (ret != 0 || intval >= ZIO_CHECKSUM_LEGACY_FUNCTIONS) return (SET_ERROR(EINVAL)); ret = nvlist_lookup_uint64(nvl, "mdn_nlevels", &nlevels); if (ret != 0 || nlevels > DN_MAX_LEVELS) return (SET_ERROR(EINVAL)); ret = nvlist_lookup_uint64(nvl, "mdn_blksz", &blksz); if (ret != 0 || blksz < SPA_MINBLOCKSIZE) return (SET_ERROR(EINVAL)); else if (blksz > spa_maxblocksize(tx->tx_pool->dp_spa)) return (SET_ERROR(ENOTSUP)); ret = nvlist_lookup_uint64(nvl, "mdn_indblkshift", &ibs); if (ret != 0 || ibs < DN_MIN_INDBLKSHIFT || ibs > DN_MAX_INDBLKSHIFT) return (SET_ERROR(ENOTSUP)); ret = nvlist_lookup_uint64(nvl, "mdn_nblkptr", &nblkptr); if (ret != 0 || nblkptr != DN_MAX_NBLKPTR) return (SET_ERROR(ENOTSUP)); ret = nvlist_lookup_uint64(nvl, "mdn_maxblkid", &maxblkid); if (ret != 0) return (SET_ERROR(EINVAL)); ret = nvlist_lookup_uint8_array(nvl, "portable_mac", &buf, &len); if (ret != 0 || len != ZIO_OBJSET_MAC_LEN) return (SET_ERROR(EINVAL)); ret = dmu_objset_from_ds(ds, &os); if (ret != 0) return (ret); mdn = DMU_META_DNODE(os); /* 
* If we already created the objset, make sure its unchangeable
	 * properties match the ones received in the nvlist.
	 */
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	if (!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) &&
	    (mdn->dn_nlevels != nlevels || mdn->dn_datablksz != blksz ||
	    mdn->dn_indblkshift != ibs || mdn->dn_nblkptr != nblkptr)) {
		rrw_exit(&ds->ds_bp_rwlock, FTAG);
		return (SET_ERROR(EINVAL));
	}
	rrw_exit(&ds->ds_bp_rwlock, FTAG);

	/*
	 * Check that the ivset guid of the fromds matches the one from the
	 * send stream. Older versions of the encryption code did not have
	 * an ivset guid on the from dataset and did not send one in the
	 * stream. For these streams we provide the
	 * zfs_disable_ivset_guid_check tunable to allow these datasets to
	 * be received with a generated ivset guid.
	 */
	if (fromds != NULL && !zfs_disable_ivset_guid_check) {
		uint64_t from_ivset_guid = 0;
		intval = 0;

		(void) nvlist_lookup_uint64(nvl, "from_ivset_guid", &intval);
		(void) zap_lookup(tx->tx_pool->dp_meta_objset,
		    fromds->ds_object, DS_FIELD_IVSET_GUID,
		    sizeof (from_ivset_guid), 1, &from_ivset_guid);

		if (intval == 0 || from_ivset_guid == 0)
			return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISSING));

		if (intval != from_ivset_guid)
			return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISMATCH));
	}

	return (0);
}

/*
 * Apply the objset-level parameters carried in "nvl" to the objset of "ds".
 *
 * Reads the "mdn_*" values and "portable_mac" from the nvlist, creates the
 * objset if the dataset's block pointer is still a hole, stores the portable
 * MAC (zeroing the local MAC), marks the objset to be written raw for this
 * txg, and sets the meta-dnode compression, checksum and max block id.
 * A newly created dataset is simply dirtied; an existing one is synced
 * inline through a private root zio.
 *
 * The lookups use fnvlist_*()/VERIFY0(), so the nvlist contents are not
 * re-validated here.
 *
 * NOTE(review): the lines prefixed with '-' near the end of this function
 * are removal markers from the diff this text was taken from; they are
 * reproduced verbatim.
 */
static void
dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype,
    nvlist_t *nvl, dmu_tx_t *tx)
{
	dsl_pool_t *dp = tx->tx_pool;
	objset_t *os;
	dnode_t *mdn;
	zio_t *zio;
	uint8_t *portable_mac;
	uint_t len;
	uint64_t compress, checksum, nlevels, blksz, ibs, maxblkid;
	boolean_t newds = B_FALSE;

	VERIFY0(dmu_objset_from_ds(ds, &os));
	mdn = DMU_META_DNODE(os);

	/*
	 * Fetch the values we need from the nvlist. "to_ivset_guid" must
	 * be set on the snapshot, which doesn't exist yet. The receive
	 * code will take care of this for us later.
	 */
	compress = fnvlist_lookup_uint64(nvl, "mdn_compress");
	checksum = fnvlist_lookup_uint64(nvl, "mdn_checksum");
	nlevels = fnvlist_lookup_uint64(nvl, "mdn_nlevels");
	blksz = fnvlist_lookup_uint64(nvl, "mdn_blksz");
	ibs = fnvlist_lookup_uint64(nvl, "mdn_indblkshift");
	maxblkid = fnvlist_lookup_uint64(nvl, "mdn_maxblkid");
	VERIFY0(nvlist_lookup_uint8_array(nvl, "portable_mac", &portable_mac,
	    &len));

	/* if we haven't created an objset for the ds yet, do that now */
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	if (BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) {
		(void) dmu_objset_create_impl_dnstats(dp->dp_spa, ds,
		    dsl_dataset_get_blkptr(ds), ostype, nlevels, blksz,
		    ibs, tx);
		newds = B_TRUE;
	}
	rrw_exit(&ds->ds_bp_rwlock, FTAG);

	/*
	 * Set the portable MAC. The local MAC will always be zero since the
	 * incoming data will all be portable and user accounting will be
	 * deferred until the next mount. Afterwards, flag the os to be
	 * written out raw next time.
	 */
	arc_release(os->os_phys_buf, &os->os_phys_buf);
	bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN);
	bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN);
	os->os_flags &= ~OBJSET_FLAG_USERACCOUNTING_COMPLETE;
	os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE;

	/* set metadnode compression and checksum */
	mdn->dn_compress = compress;
	mdn->dn_checksum = checksum;

	rw_enter(&mdn->dn_struct_rwlock, RW_WRITER);
	dnode_new_blkid(mdn, maxblkid, tx, B_FALSE, B_TRUE);
	rw_exit(&mdn->dn_struct_rwlock);

	/*
	 * We can't normally dirty the dataset in syncing context unless
	 * we are creating a new dataset. In this case, we perform a
	 * pseudo txg sync here instead.
	 */
	if (newds) {
		dsl_dataset_dirty(ds, tx);
	} else {
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		dsl_dataset_sync(ds, zio, tx);
		VERIFY0(zio_wait(zio));
-
-		/* dsl_dataset_sync_done will drop this reference. */
-		dmu_buf_add_ref(ds->ds_dbuf, ds);
		dsl_dataset_sync_done(ds, tx);
	}
}

/*
 * Validate the DSL Crypto Key description in "nvl" for a raw receive
 * into "ds".
 *
 * Returns 0 when every required entry is present and well formed.
 * Returns EINVAL for a missing or out-of-range crypto suite, key guid,
 * key buffers of the wrong length, key format, or pbkdf2 iters/salt that
 * are inconsistent with the key format; EACCES when the dataset already
 * has a crypto object whose key guid differs from the one in the nvlist;
 * ENOTSUP when the key version is absent or not the current version; or
 * the zap_lookup() error if the existing key guid cannot be read.
 */
int
dsl_crypto_recv_raw_key_check(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx)
{
	int ret;
	objset_t *mos = tx->tx_pool->dp_meta_objset;
	uint8_t *buf = NULL;
	uint_t len;
	uint64_t intval, key_guid, version;
	boolean_t is_passphrase = B_FALSE;

	ASSERT(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT);

	/*
	 * Read and check all the encryption values from the nvlist. We need
	 * all of the fields of a DSL Crypto Key, as well as a fully specified
	 * wrapping key.
	 */
	ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, &intval);
	if (ret != 0 || intval >= ZIO_CRYPT_FUNCTIONS ||
	    intval <= ZIO_CRYPT_OFF)
		return (SET_ERROR(EINVAL));

	ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID, &intval);
	if (ret != 0)
		return (SET_ERROR(EINVAL));

	/*
	 * If this is an incremental receive make sure the given key guid
	 * matches the one we already have.
	 */
	if (ds->ds_dir->dd_crypto_obj != 0) {
		ret = zap_lookup(mos, ds->ds_dir->dd_crypto_obj,
		    DSL_CRYPTO_KEY_GUID, 8, 1, &key_guid);
		if (ret != 0)
			return (ret);
		if (intval != key_guid)
			return (SET_ERROR(EACCES));
	}

	/* Only the presence and length of each key buffer is checked here. */
	ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
	    &buf, &len);
	if (ret != 0 || len != MASTER_KEY_MAX_LEN)
		return (SET_ERROR(EINVAL));

	ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
	    &buf, &len);
	if (ret != 0 || len != SHA512_HMAC_KEYLEN)
		return (SET_ERROR(EINVAL));

	ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &buf, &len);
	if (ret != 0 || len != WRAPPING_IV_LEN)
		return (SET_ERROR(EINVAL));

	ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &buf, &len);
	if (ret != 0 || len != WRAPPING_MAC_LEN)
		return (SET_ERROR(EINVAL));

	/*
	 * We don't support receiving old on-disk formats. The version 0
	 * implementation protected several fields in an objset that were
	 * not always portable during a raw receive. As a result, we call
	 * the old version an on-disk errata #3.
	 */
	ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_VERSION, &version);
	if (ret != 0 || version != ZIO_CRYPT_KEY_CURRENT_VERSION)
		return (SET_ERROR(ENOTSUP));

	ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT),
	    &intval);
	if (ret != 0 || intval >= ZFS_KEYFORMAT_FORMATS ||
	    intval == ZFS_KEYFORMAT_NONE)
		return (SET_ERROR(EINVAL));

	is_passphrase = (intval == ZFS_KEYFORMAT_PASSPHRASE);

	/*
	 * for raw receives we allow any number of pbkdf2iters since there
	 * won't be a chance for the user to change it.
	 */
	ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS),
	    &intval);
	/* passphrase keys need a non-zero value, all others need zero */
	if (ret != 0 || (is_passphrase == (intval == 0)))
		return (SET_ERROR(EINVAL));

	ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT),
	    &intval);
	if (ret != 0 || (is_passphrase == (intval == 0)))
		return (SET_ERROR(EINVAL));

	return (0);
}

/*
 * Write the DSL Crypto Key described by "nvl" for the dsl_dir of "ds".
 *
 * When the dsl_dir has no crypto object yet, one is created with a
 * refcount of 1 and the current key version, the encryption feature is
 * activated on the dataset, the object number is recorded in the dsl_dir
 * ZAP, and keylocation is set to "prompt". Otherwise the existing
 * encryption root is looked up. In both cases the key material is then
 * synced with dsl_crypto_key_sync_impl().
 *
 * All nvlist lookups use fnvlist_*()/VERIFY0(), so nothing is validated
 * here.
 */
void
dsl_crypto_recv_raw_key_sync(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx)
{
	dsl_pool_t *dp = tx->tx_pool;
	objset_t *mos = dp->dp_meta_objset;
	dsl_dir_t *dd = ds->ds_dir;
	uint_t len;
	uint64_t rddobj, one = 1;
	uint8_t *keydata, *hmac_keydata, *iv, *mac;
	uint64_t crypt, key_guid, keyformat, iters, salt;
	uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION;
	char *keylocation = "prompt";

	/* lookup the values we need to create the DSL Crypto Key */
	crypt = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE);
	key_guid = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID);
	keyformat = fnvlist_lookup_uint64(nvl,
	    zfs_prop_to_name(ZFS_PROP_KEYFORMAT));
	iters = fnvlist_lookup_uint64(nvl,
	    zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS));
	salt = fnvlist_lookup_uint64(nvl,
	    zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT));
	VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
	    &keydata, &len));
	VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
	    &hmac_keydata, &len));
	VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &iv, &len));
	VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &mac, &len));

	/* if this is a new dataset setup the DSL Crypto Key. */
	if (dd->dd_crypto_obj == 0) {
		/* zapify the dsl dir so we can add the key object to it */
		dmu_buf_will_dirty(dd->dd_dbuf, tx);
		dsl_dir_zapify(dd, tx);

		/* create the DSL Crypto Key on disk and activate the feature */
		dd->dd_crypto_obj = zap_create(mos,
		    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);
		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
		    dd->dd_crypto_obj, DSL_CRYPTO_KEY_REFCOUNT,
		    sizeof (uint64_t), 1, &one, tx));
		VERIFY0(zap_update(tx->tx_pool->dp_meta_objset,
		    dd->dd_crypto_obj, DSL_CRYPTO_KEY_VERSION,
		    sizeof (uint64_t), 1, &version, tx));

		dsl_dataset_activate_feature(ds->ds_object,
		    SPA_FEATURE_ENCRYPTION, (void *)B_TRUE, tx);
		ds->ds_feature[SPA_FEATURE_ENCRYPTION] = (void *)B_TRUE;

		/* save the dd_crypto_obj on disk */
		VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_CRYPTO_KEY_OBJ,
		    sizeof (uint64_t), 1, &dd->dd_crypto_obj, tx));

		/*
		 * Set the keylocation to prompt by default. If keylocation
		 * has been provided via the properties, this will be overridden
		 * later.
		 */
		dsl_prop_set_sync_impl(ds,
		    zfs_prop_to_name(ZFS_PROP_KEYLOCATION),
		    ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1,
		    keylocation, tx);

		/* a freshly keyed dsl_dir is recorded as its own root */
		rddobj = dd->dd_object;
	} else {
		VERIFY0(dsl_dir_get_encryption_root_ddobj(dd, &rddobj));
	}

	/* sync the key data to the ZAP object on disk */
	dsl_crypto_key_sync_impl(mos, dd->dd_crypto_obj, crypt,
	    rddobj, key_guid, iv, mac, keydata, hmac_keydata, keyformat, salt,
	    iters, tx);
}

/*
 * Check callback paired with dsl_crypto_recv_key_sync().
 *
 * "arg" is a dsl_crypto_recv_key_arg_t. Holds the target dataset (and the
 * "from" dataset when dcrka_fromobj is non-zero), runs the objset check and
 * the key check, and releases both holds on every path. Returns 0 or the
 * first error encountered.
 */
static int
dsl_crypto_recv_key_check(void *arg, dmu_tx_t *tx)
{
	int ret;
	dsl_crypto_recv_key_arg_t *dcrka = arg;
	dsl_dataset_t *ds = NULL, *fromds = NULL;

	ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj,
	    FTAG, &ds);
	if (ret != 0)
		goto out;

	if (dcrka->dcrka_fromobj != 0) {
		ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_fromobj,
		    FTAG, &fromds);
		if (ret != 0)
			goto out;
	}

	ret = dsl_crypto_recv_raw_objset_check(ds, fromds,
	    dcrka->dcrka_ostype, dcrka->dcrka_nvl, tx);
	if (ret != 0)
		goto out;

	/*
	 * We run this check even if we won't be doing this part of
	 * the receive now so that we don't make the user wait until
	 * the receive finishes to fail.
	 */
	ret = dsl_crypto_recv_raw_key_check(ds, dcrka->dcrka_nvl, tx);
	if (ret != 0)
		goto out;

out:
	if (ds != NULL)
		dsl_dataset_rele(ds, FTAG);
	if (fromds != NULL)
		dsl_dataset_rele(fromds, FTAG);
	return (ret);
}

/*
 * Sync callback paired with dsl_crypto_recv_key_check().
 *
 * Always syncs the objset parameters; syncs the key itself only when
 * dcrka_do_key is set.
 */
static void
dsl_crypto_recv_key_sync(void *arg, dmu_tx_t *tx)
{
	dsl_crypto_recv_key_arg_t *dcrka = arg;
	dsl_dataset_t *ds;

	VERIFY0(dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj,
	    FTAG, &ds));
	dsl_crypto_recv_raw_objset_sync(ds, dcrka->dcrka_ostype,
	    dcrka->dcrka_nvl, tx);
	if (dcrka->dcrka_do_key)
		dsl_crypto_recv_raw_key_sync(ds, dcrka->dcrka_nvl, tx);
	dsl_dataset_rele(ds, FTAG);
}

/*
 * This function is used to sync an nvlist representing a DSL Crypto Key and
 * the associated encryption parameters. The key will be written exactly as is
 * without wrapping it.
*/
int
dsl_crypto_recv_raw(const char *poolname, uint64_t dsobj, uint64_t fromobj,
    dmu_objset_type_t ostype, nvlist_t *nvl, boolean_t do_key)
{
	dsl_crypto_recv_key_arg_t dcrka;

	/* bundle the arguments for the check/sync callbacks */
	dcrka.dcrka_dsobj = dsobj;
	dcrka.dcrka_fromobj = fromobj;
	dcrka.dcrka_ostype = ostype;
	dcrka.dcrka_nvl = nvl;
	dcrka.dcrka_do_key = do_key;

	return (dsl_sync_task(poolname, dsl_crypto_recv_key_check,
	    dsl_crypto_recv_key_sync, &dcrka, 1, ZFS_SPACE_CHECK_NORMAL));
}

/*
 * Build an nvlist describing the DSL Crypto Key and meta-dnode parameters
 * of the objset "os".
 *
 * On success returns 0 and stores a newly allocated nvlist in *nvl_out.
 * On failure the nvlist is freed, *nvl_out is set to NULL and the error is
 * returned (ENOTSUP when the on-disk key version is missing or is not the
 * current version, in which case spa_errata is also set).
 *
 * "from_ivset_guid" is copied into the nvlist unchanged.
 */
int
dsl_crypto_populate_key_nvlist(objset_t *os, uint64_t from_ivset_guid,
    nvlist_t **nvl_out)
{
	int ret;
	dsl_dataset_t *ds = os->os_dsl_dataset;
	dnode_t *mdn;
	uint64_t rddobj;
	nvlist_t *nvl = NULL;
	uint64_t dckobj = ds->ds_dir->dd_crypto_obj;
	dsl_dir_t *rdd = NULL;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t crypt = 0, key_guid = 0, format = 0;
	uint64_t iters = 0, salt = 0, version = 0;
	uint64_t to_ivset_guid = 0;
	uint8_t raw_keydata[MASTER_KEY_MAX_LEN];
	uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN];
	uint8_t iv[WRAPPING_IV_LEN];
	uint8_t mac[WRAPPING_MAC_LEN];

	ASSERT(dckobj != 0);

	mdn = DMU_META_DNODE(os);

	nvl = fnvlist_alloc();

	/* lookup values from the DSL Crypto Key */
	ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1,
	    &crypt);
	if (ret != 0)
		goto error;

	ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &key_guid);
	if (ret != 0)
		goto error;

	ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1,
	    MASTER_KEY_MAX_LEN, raw_keydata);
	if (ret != 0)
		goto error;

	ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1,
	    SHA512_HMAC_KEYLEN, raw_hmac_keydata);
	if (ret != 0)
		goto error;

	ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN,
	    iv);
	if (ret != 0)
		goto error;

	ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN,
	    mac);
	if (ret != 0)
		goto error;

	/* see zfs_disable_ivset_guid_check tunable for errata info */
	ret = zap_lookup(mos, ds->ds_object, DS_FIELD_IVSET_GUID, 8, 1,
	    &to_ivset_guid);
	/* a failed lookup is not fatal: to_ivset_guid simply stays 0 */
	if (ret != 0)
		ASSERT3U(dp->dp_spa->spa_errata, !=, 0);

	/*
	 * We don't support raw sends of legacy on-disk formats. See the
	 * comment in dsl_crypto_recv_key_check() for details.
	 */
	ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_VERSION, 8, 1, &version);
	if (ret != 0 || version != ZIO_CRYPT_KEY_CURRENT_VERSION) {
		dp->dp_spa->spa_errata = ZPOOL_ERRATA_ZOL_6845_ENCRYPTION;
		ret = SET_ERROR(ENOTSUP);
		goto error;
	}

	/*
	 * Lookup wrapping key properties. An early version of the code did
	 * not correctly add these values to the wrapping key or the DSL
	 * Crypto Key on disk for non encryption roots, so to be safe we
	 * always take the slightly circuitous route of looking it up from
	 * the encryption root's key.
	 */
	ret = dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj);
	if (ret != 0)
		goto error;

	dsl_pool_config_enter(dp, FTAG);

	ret = dsl_dir_hold_obj(dp, rddobj, NULL, FTAG, &rdd);
	if (ret != 0)
		goto error_unlock;

	ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj,
	    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &format);
	if (ret != 0)
		goto error_unlock;

	/* iters and salt are only read for passphrase keys; else they stay 0 */
	if (format == ZFS_KEYFORMAT_PASSPHRASE) {
		ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj,
		    zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters);
		if (ret != 0)
			goto error_unlock;

		ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj,
		    zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt);
		if (ret != 0)
			goto error_unlock;
	}

	dsl_dir_rele(rdd, FTAG);
	dsl_pool_config_exit(dp, FTAG);

	fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, crypt);
	fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_GUID, key_guid);
	fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_VERSION, version);
	VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY,
	    raw_keydata, MASTER_KEY_MAX_LEN));
	VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY,
	    raw_hmac_keydata, SHA512_HMAC_KEYLEN));
	VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_IV, iv,
	    WRAPPING_IV_LEN));
	VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, mac,
	    WRAPPING_MAC_LEN));
	VERIFY0(nvlist_add_uint8_array(nvl, "portable_mac",
	    os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN));
	fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), format);
	fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), iters);
	fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt);
	fnvlist_add_uint64(nvl, "mdn_checksum", mdn->dn_checksum);
	fnvlist_add_uint64(nvl, "mdn_compress", mdn->dn_compress);
	fnvlist_add_uint64(nvl, "mdn_nlevels", mdn->dn_nlevels);
	fnvlist_add_uint64(nvl, "mdn_blksz", mdn->dn_datablksz);
	fnvlist_add_uint64(nvl, "mdn_indblkshift", mdn->dn_indblkshift);
	fnvlist_add_uint64(nvl, "mdn_nblkptr", mdn->dn_nblkptr);
	fnvlist_add_uint64(nvl, "mdn_maxblkid", mdn->dn_maxblkid);
	fnvlist_add_uint64(nvl, "to_ivset_guid", to_ivset_guid);
	fnvlist_add_uint64(nvl, "from_ivset_guid", from_ivset_guid);

	*nvl_out = nvl;
	return (0);

error_unlock:
	dsl_pool_config_exit(dp, FTAG);
error:
	/* rdd is only non-NULL if the hold succeeded before the failure */
	if (rdd != NULL)
		dsl_dir_rele(rdd, FTAG);
	nvlist_free(nvl);

	*nvl_out = NULL;
	return (ret);
}

/*
 * Create a new DSL Crypto Key ZAP object for crypto suite "crypt", wrapped
 * by "wkey", and return its object number.
 *
 * The key is generated in a stack-local dsl_crypto_key_t, synced to disk,
 * given a refcount of 1 and the current version, and then destroyed and
 * zeroed before returning. Must be called in syncing context (asserted).
 */
uint64_t
dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey,
    dmu_tx_t *tx)
{
	dsl_crypto_key_t dck;
	uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION;
	uint64_t one = 1ULL;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS);
	ASSERT3U(crypt, >, ZIO_CRYPT_OFF);

	/* create the DSL Crypto Key ZAP object */
	dck.dck_obj = zap_create(tx->tx_pool->dp_meta_objset,
	    DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx);

	/* fill in the key (on the stack) and sync it to disk */
	dck.dck_wkey = wkey;
	VERIFY0(zio_crypt_key_init(crypt, &dck.dck_key));

	dsl_crypto_key_sync(&dck, tx);
	VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj,
	    DSL_CRYPTO_KEY_REFCOUNT, sizeof (uint64_t), 1, &one, tx));
	VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj,
	    DSL_CRYPTO_KEY_VERSION, sizeof (uint64_t), 1, &version, tx));

	/* scrub the stack copy of the key before it goes out of scope */
	zio_crypt_key_destroy(&dck.dck_key);
	bzero(&dck.dck_key, sizeof (zio_crypt_key_t));

	return (dck.dck_obj);
}

/*
 * Share the origin's DSL Crypto Key with a clone: bump the key's on-disk
 * refcount by one and return the same crypto object number. Must be called
 * in syncing context (asserted).
 */
uint64_t
dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx)
{
	objset_t *mos = tx->tx_pool->dp_meta_objset;

	ASSERT(dmu_tx_is_syncing(tx));

	VERIFY0(zap_increment(mos, origindd->dd_crypto_obj,
	    DSL_CRYPTO_KEY_REFCOUNT, 1, tx));

	return (origindd->dd_crypto_obj);
}

/*
 * Drop one reference on the DSL Crypto Key object "dckobj", destroying the
 * ZAP object when the stored refcount is exactly 1.
 */
void
dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx)
{
	objset_t *mos = tx->tx_pool->dp_meta_objset;
	uint64_t refcnt;

	/* Decrement the refcount, destroy if this is the last reference */
	VERIFY0(zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT,
	    sizeof (uint64_t), 1, &refcnt));

	if (refcnt != 1) {
		VERIFY0(zap_increment(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT,
		    -1, tx));
	} else {
		VERIFY0(zap_destroy(mos, dckobj, tx));
	}
}

/*
 * Add the encryption-related properties of "ds" to the nvlist "nv".
 *
 * Does nothing for a dataset whose dsl_dir has no crypto object. The key
 * status is always added; every other property is added only if its
 * lookup succeeds, so failures are silently skipped.
 */
void
dsl_dataset_crypt_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
	uint64_t intval;
	dsl_dir_t *dd = ds->ds_dir;
	dsl_dir_t *enc_root;
	char buf[ZFS_MAX_DATASET_NAME_LEN];

	if (dd->dd_crypto_obj == 0)
		return;

	intval = dsl_dataset_get_keystatus(dd);
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYSTATUS, intval);

	if (dsl_dir_get_crypt(dd, &intval) == 0)
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_ENCRYPTION, intval);
	if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
	    DSL_CRYPTO_KEY_GUID, 8, 1, &intval) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEY_GUID, intval);
	}
	if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
	    zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &intval) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYFORMAT, intval);
	}
	if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
	    zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &intval) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_SALT, intval);
	}
	if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj,
	    zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &intval) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_ITERS, intval);
	}
	if (zap_lookup(dd->dd_pool->dp_meta_objset, ds->ds_object,
	    DS_FIELD_IVSET_GUID, 8, 1, &intval) == 0) {
		dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_IVSET_GUID, intval);
	}

	/* intval is reused here to carry the encryption root's dd object */
	if (dsl_dir_get_encryption_root_ddobj(dd, &intval) == 0) {
		if (dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG,
		    &enc_root) == 0) {
			dsl_dir_name(enc_root, buf);
			dsl_dir_rele(enc_root, FTAG);
			dsl_prop_nvlist_add_string(nv,
			    ZFS_PROP_ENCRYPTION_ROOT, buf);
		}
	}
}

/*
 * Fetch the current salt of the key for dataset object "dsobj" into
 * "salt". Returns 0 on success or the error from the keystore lookup or
 * from zio_crypt_key_get_salt(); the key hold is released on every path.
 */
int
spa_crypt_get_salt(spa_t *spa, uint64_t dsobj, uint8_t *salt)
{
	int ret;
	dsl_crypto_key_t *dck = NULL;

	/* look up the key from the spa's keystore */
	ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
	if (ret != 0)
		goto error;

	ret = zio_crypt_key_get_salt(&dck->dck_key, salt);
	if (ret != 0)
		goto error;

	spa_keystore_dsl_key_rele(spa, dck, FTAG);
	return (0);

error:
	if (dck != NULL)
		spa_keystore_dsl_key_rele(spa, dck, FTAG);
	return (ret);
}

/*
 * Objset blocks are a special case for MAC generation. These blocks have 2
 * 256-bit MACs which are embedded within the block itself, rather than a
 * single 128 bit MAC. As a result, this function handles encoding and decoding
 * the MACs on its own, unlike other functions in this file.
 *
 * With "generate" set, both computed MACs are written into the borrowed
 * objset_phys_t and copied back to the abd. Otherwise they are compared
 * with the embedded ones and ECKSUM is returned on a mismatch, except when
 * the computed local MAC is all zeros. Other errors come from the keystore
 * lookup or the HMAC computation.
 */
int
spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj,
    abd_t *abd, uint_t datalen, boolean_t byteswap)
{
	int ret;
	dsl_crypto_key_t *dck = NULL;
	void *buf = abd_borrow_buf_copy(abd, datalen);
	objset_phys_t *osp = buf;
	uint8_t portable_mac[ZIO_OBJSET_MAC_LEN];
	uint8_t local_mac[ZIO_OBJSET_MAC_LEN];
	const uint8_t zeroed_mac[ZIO_OBJSET_MAC_LEN] = {0};

	/* look up the key from the spa's keystore */
	ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
	if (ret != 0)
		goto error;

	/* calculate both HMACs */
	ret = zio_crypt_do_objset_hmacs(&dck->dck_key, buf, datalen,
	    byteswap, portable_mac, local_mac);
	if (ret != 0)
		goto error;

	/* the key is not needed past this point; no later path uses "error" */
	spa_keystore_dsl_key_rele(spa, dck, FTAG);

	/* if we are generating encode the HMACs in the objset_phys_t */
	if (generate) {
		bcopy(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN);
		bcopy(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN);
		abd_return_buf_copy(abd, buf, datalen);
		return (0);
	}

	/*
	 * NOTE(review): because both comparisons share one branch, a
	 * portable-MAC mismatch is also tolerated whenever the computed
	 * local MAC is all zeros -- confirm that this is intended.
	 */
	if (memcmp(portable_mac, osp->os_portable_mac,
	    ZIO_OBJSET_MAC_LEN) != 0 ||
	    memcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) {
		/*
		 * If the MAC is zeroed out, we failed to decrypt it.
		 * This should only arise, at least on Linux,
		 * if we hit edge case handling for useraccounting, since we
		 * shouldn't get here without bailing out on error earlier
		 * otherwise.
		 *
		 * So if we're in that case, we can just fall through and
		 * special-casing noticing that it's zero will handle it
		 * elsewhere, since we can just regenerate it.
		 */
		if (memcmp(local_mac, zeroed_mac, ZIO_OBJSET_MAC_LEN) != 0) {
			abd_return_buf(abd, buf, datalen);
			return (SET_ERROR(ECKSUM));
		}
	}

	abd_return_buf(abd, buf, datalen);

	return (0);

error:
	if (dck != NULL)
		spa_keystore_dsl_key_rele(spa, dck, FTAG);
	abd_return_buf(abd, buf, datalen);
	return (ret);
}

/*
 * Compute the HMAC of the first "datalen" bytes of "abd" with the key of
 * dataset object "dsobj".
 *
 * With "generate" set, the digest is copied into "mac". Otherwise it is
 * compared against "mac" and ECKSUM is returned on a mismatch. Other
 * errors come from the keystore lookup or the HMAC computation.
 */
int
spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, abd_t *abd,
    uint_t datalen, uint8_t *mac)
{
	int ret;
	dsl_crypto_key_t *dck = NULL;
	uint8_t *buf = abd_borrow_buf_copy(abd, datalen);
	uint8_t digestbuf[ZIO_DATA_MAC_LEN];

	/* look up the key from the spa's keystore */
	ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck);
	if (ret != 0)
		goto error;

	/* perform the hmac */
	ret = zio_crypt_do_hmac(&dck->dck_key, buf, datalen,
	    digestbuf, ZIO_DATA_MAC_LEN);
	if (ret != 0)
		goto error;

	abd_return_buf(abd, buf, datalen);
	spa_keystore_dsl_key_rele(spa, dck, FTAG);

	/*
	 * Truncate and fill in mac buffer if we were asked to generate a MAC.
	 * Otherwise verify that the MAC matched what we expected.
	 */
	if (generate) {
		bcopy(digestbuf, mac, ZIO_DATA_MAC_LEN);
		return (0);
	}

	if (bcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0)
		return (SET_ERROR(ECKSUM));

	return (0);

error:
	if (dck != NULL)
		spa_keystore_dsl_key_rele(spa, dck, FTAG);
	abd_return_buf(abd, buf, datalen);
	return (ret);
}

/*
 * This function serves as a multiplexer for encryption and decryption of
 * all blocks (except the L2ARC). For encryption, it will populate the IV,
 * salt, MAC, and cabd (the ciphertext).
On decryption it will simply use
 * these fields to populate pabd (the plaintext).
 */
int
spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb,
    dmu_object_type_t ot, boolean_t dedup, boolean_t bswap, uint8_t *salt,
    uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd,
    boolean_t *no_crypt)
{
	int ret;
	dsl_crypto_key_t *dck = NULL;
	uint8_t *plainbuf = NULL, *cipherbuf = NULL;

	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION));

	/* look up the key from the spa's keystore */
	ret = spa_keystore_lookup_key(spa, zb->zb_objset, FTAG, &dck);
	if (ret != 0) {
		/* any lookup failure is reported to the caller as EACCES */
		ret = SET_ERROR(EACCES);
		return (ret);
	}

	/*
	 * Borrow linear buffers from both abds. Only the input side is
	 * borrowed with a copy of its contents; the output side is filled
	 * in by the crypto call below.
	 */
	if (encrypt) {
		plainbuf = abd_borrow_buf_copy(pabd, datalen);
		cipherbuf = abd_borrow_buf(cabd, datalen);
	} else {
		plainbuf = abd_borrow_buf(pabd, datalen);
		cipherbuf = abd_borrow_buf_copy(cabd, datalen);
	}

	/*
	 * Both encryption and decryption functions need a salt for key
	 * generation and an IV. When encrypting a non-dedup block, we
	 * generate the salt and IV randomly to be stored by the caller. Dedup
	 * blocks perform a (more expensive) HMAC of the plaintext to obtain
	 * the salt and the IV. ZIL blocks have their salt and IV generated
	 * at allocation time in zio_alloc_zil(). On decryption, we simply use
	 * the provided values.
	 */
	if (encrypt && ot != DMU_OT_INTENT_LOG && !dedup) {
		ret = zio_crypt_key_get_salt(&dck->dck_key, salt);
		if (ret != 0)
			goto error;

		ret = zio_crypt_generate_iv(iv);
		if (ret != 0)
			goto error;
	} else if (encrypt && dedup) {
		ret = zio_crypt_generate_iv_salt_dedup(&dck->dck_key,
		    plainbuf, datalen, iv, salt);
		if (ret != 0)
			goto error;
	}

	/* call lower level function to perform encryption / decryption */
	ret = zio_do_crypt_data(encrypt, &dck->dck_key, ot, bswap, salt, iv,
	    mac, datalen, plainbuf, cipherbuf, no_crypt);

	/*
	 * Handle injected decryption faults. Unfortunately, we cannot inject
	 * faults for dnode blocks because we might trigger the panic in
	 * dbuf_prepare_encrypted_dnode_leaf(), which exists because syncing
	 * context is not prepared to handle malicious decryption failures.
	 */
	if (zio_injection_enabled && !encrypt && ot != DMU_OT_DNODE && ret == 0)
		ret = zio_handle_decrypt_injection(spa, zb, ot, ECKSUM);
	if (ret != 0)
		goto error;

	/* hand the buffers back, copying only the side that was produced */
	if (encrypt) {
		abd_return_buf(pabd, plainbuf, datalen);
		abd_return_buf_copy(cabd, cipherbuf, datalen);
	} else {
		abd_return_buf_copy(pabd, plainbuf, datalen);
		abd_return_buf(cabd, cipherbuf, datalen);
	}

	spa_keystore_dsl_key_rele(spa, dck, FTAG);

	return (0);

error:
	if (encrypt) {
		/* zero out any state we might have changed while encrypting */
		bzero(salt, ZIO_DATA_SALT_LEN);
		bzero(iv, ZIO_DATA_IV_LEN);
		bzero(mac, ZIO_DATA_MAC_LEN);
		abd_return_buf(pabd, plainbuf, datalen);
		abd_return_buf_copy(cabd, cipherbuf, datalen);
	} else {
		abd_return_buf_copy(pabd, plainbuf, datalen);
		abd_return_buf(cabd, cipherbuf, datalen);
	}

	spa_keystore_dsl_key_rele(spa, dck, FTAG);

	return (ret);
}

ZFS_MODULE_PARAM(zfs, zfs_, disable_ivset_guid_check, INT, ZMOD_RW,
	"Set to allow raw receives without IVset guids");
diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c
index a844059de495..3bc697303914 100644
--- a/module/zfs/dsl_dataset.c
+++ b/module/zfs/dsl_dataset.c
@@ -1,5006 +1,5001 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude * Copyright (c) 2020 The FreeBSD Foundation [1] * * [1] Portions of this software were developed by Allan Jude * under sponsorship from the FreeBSD Foundation. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The SPA supports block sizes up to 16MB. However, very large blocks * can have an impact on i/o latency (e.g. tying up a spinning disk for * ~300ms), and also potentially on the memory allocator. Therefore, * we do not allow the recordsize to be set larger than zfs_max_recordsize * (default 1MB). Larger blocks can be created by changing this tunable, * and pools with larger blocks can always be imported and used, regardless * of this setting. 
*/
int zfs_max_recordsize = 1 * 1024 * 1024;
int zfs_allow_redacted_dataset_mount = 0;

/* Swap the values of two uint64_t lvalues through a temporary. */
#define	SWITCH64(x, y) \
	{ \
		uint64_t __tmp = (x); \
		(x) = (y); \
		(y) = __tmp; \
	}

#define	DS_REF_MAX	(1ULL << 62)

static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
    uint64_t obj, dmu_tx_t *tx);
static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
    dmu_tx_t *tx);

static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f);

extern int spa_asize_inflation;

static zil_header_t zero_zil;

/*
 * Figure out how much of this delta should be propagated to the dsl_dir
 * layer. If there's a refreservation, that space has already been
 * partially accounted for in our ancestors.
 *
 * With no reservation the delta is returned unchanged. Otherwise the
 * result is the change in MAX(unique bytes, reservation) caused by
 * applying "delta" to the unique bytes.
 */
static int64_t
parent_delta(dsl_dataset_t *ds, int64_t delta)
{
	dsl_dataset_phys_t *ds_phys;
	uint64_t old_bytes, new_bytes;

	if (ds->ds_reserved == 0)
		return (delta);

	ds_phys = dsl_dataset_phys(ds);
	old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved);
	new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved);

	ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta));
	return (new_bytes - old_bytes);
}

/*
 * Account for a newly written block "bp" in dataset "ds".
 *
 * Holes and redacted block pointers are ignored. With ds == NULL the
 * space is charged to the MOS via dsl_pool_mos_diduse_space(). Otherwise
 * the dataset's referenced, compressed, uncompressed and unique byte
 * counters are increased under ds_lock, features implied by the block's
 * size, checksum and compression are flagged for activation, the block is
 * queued on the dsl_dir's pending-allocs list when a livelist is open, and
 * the space delta is passed on to the dsl_dir. Must be called in syncing
 * context (asserted).
 */
void
dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx)
{
	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
	int used = bp_get_dsize_sync(spa, bp);
	int compressed = BP_GET_PSIZE(bp);
	int uncompressed = BP_GET_UCSIZE(bp);
	int64_t delta;
	spa_feature_t f;

	dprintf_bp(bp, "ds=%p", ds);

	ASSERT(dmu_tx_is_syncing(tx));
	/* It could have been compressed away to nothing */
	if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp))
		return;
	ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE);
	ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp)));
	if (ds == NULL) {
		dsl_pool_mos_diduse_space(tx->tx_pool,
		    used, compressed, uncompressed);
		return;
	}

	ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);
	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	mutex_enter(&ds->ds_lock);
	/* compute the delta before ds_unique_bytes is updated below */
	delta = parent_delta(ds, used);
	dsl_dataset_phys(ds)->ds_referenced_bytes += used;
	dsl_dataset_phys(ds)->ds_compressed_bytes += compressed;
	dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed;
	dsl_dataset_phys(ds)->ds_unique_bytes += used;

	if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) {
		ds->ds_feature_activation[SPA_FEATURE_LARGE_BLOCKS] =
		    (void *)B_TRUE;
	}

	f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp));
	if (f != SPA_FEATURE_NONE) {
		ASSERT3S(spa_feature_table[f].fi_type, ==,
		    ZFEATURE_TYPE_BOOLEAN);
		ds->ds_feature_activation[f] = (void *)B_TRUE;
	}

	f = zio_compress_to_feature(BP_GET_COMPRESS(bp));
	if (f != SPA_FEATURE_NONE) {
		ASSERT3S(spa_feature_table[f].fi_type, ==,
		    ZFEATURE_TYPE_BOOLEAN);
		ds->ds_feature_activation[f] = (void *)B_TRUE;
	}

	/*
	 * Track block for livelist, but ignore embedded blocks because
	 * they do not need to be freed.
	 */
	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) &&
	    bp->blk_birth > ds->ds_dir->dd_origin_txg &&
	    !(BP_IS_EMBEDDED(bp))) {
		ASSERT(dsl_dir_is_clone(ds->ds_dir));
		ASSERT(spa_feature_is_enabled(spa,
		    SPA_FEATURE_LIVELIST));
		bplist_append(&ds->ds_dir->dd_pending_allocs, bp);
	}

	mutex_exit(&ds->ds_lock);
	dsl_dir_diduse_transfer_space(ds->ds_dir, delta,
	    compressed, uncompressed, used,
	    DD_USED_REFRSRV, DD_USED_HEAD, tx);
}

/*
 * Called when the specified segment has been remapped, and is thus no
 * longer referenced in the head dataset. The vdev must be indirect.
 *
 * If the segment is referenced by a snapshot, put it on the remap deadlist.
 * Otherwise, add this segment to the obsolete spacemap.
*/ void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx) { spa_t *spa = ds->ds_dir->dd_pool->dp_spa; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(birth <= tx->tx_txg); ASSERT(!ds->ds_is_snapshot); if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); } else { blkptr_t fakebp; dva_t *dva = &fakebp.blk_dva[0]; ASSERT(ds != NULL); mutex_enter(&ds->ds_remap_deadlist_lock); if (!dsl_dataset_remap_deadlist_exists(ds)) { dsl_dataset_create_remap_deadlist(ds, tx); } mutex_exit(&ds->ds_remap_deadlist_lock); BP_ZERO(&fakebp); fakebp.blk_birth = birth; DVA_SET_VDEV(dva, vdev); DVA_SET_OFFSET(dva, offset); DVA_SET_ASIZE(dva, size); dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE, tx); } } int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; int used = bp_get_dsize_sync(spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) return (0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(bp->blk_birth <= tx->tx_txg); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); dsl_pool_mos_diduse_space(tx->tx_pool, -used, -compressed, -uncompressed); return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); ASSERT(!ds->ds_is_snapshot); dmu_buf_will_dirty(ds->ds_dbuf, tx); /* * Track block for livelist, but ignore embedded blocks because * they do not need to be freed. 
*/ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && bp->blk_birth > ds->ds_dir->dd_origin_txg && !(BP_IS_EMBEDDED(bp))) { ASSERT(dsl_dir_is_clone(ds->ds_dir)); ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LIVELIST)); bplist_append(&ds->ds_dir->dd_pending_frees, bp); } if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; dprintf_bp(bp, "freeing ds=%llu", (u_longlong_t)ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_lock); ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || !DS_UNIQUE_IS_ACCURATE(ds)); delta = parent_delta(ds, -used); dsl_dataset_phys(ds)->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_transfer_space(ds->ds_dir, delta, -compressed, -uncompressed, -used, DD_USED_REFRSRV, DD_USED_HEAD, tx); } else { dprintf_bp(bp, "putting on dead list: %s", ""); if (async) { /* * We are here as part of zio's write done callback, * which means we're a zio interrupt thread. We can't * call dsl_deadlist_insert() now because it may block * waiting for I/O. Instead, put bp on the deferred * queue and let dsl_pool_sync() finish the job. 
*/
			bplist_append(&ds->ds_pending_deadlist, bp);
		} else {
			dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx);
		}
		ASSERT3U(ds->ds_prev->ds_object, ==,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj);
		ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0);
		/* if (bp->blk_birth > prev prev snap txg) prev unique += bs */
		if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj ==
		    ds->ds_object && bp->blk_birth >
		    dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) {
			dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx);
			mutex_enter(&ds->ds_prev->ds_lock);
			dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used;
			mutex_exit(&ds->ds_prev->ds_lock);
		}
		if (bp->blk_birth > ds->ds_dir->dd_origin_txg) {
			dsl_dir_transfer_space(ds->ds_dir, used,
			    DD_USED_HEAD, DD_USED_SNAP, tx);
		}
	}

	dsl_bookmark_block_killed(ds, bp, tx);

	mutex_enter(&ds->ds_lock);
	ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used);
	dsl_dataset_phys(ds)->ds_referenced_bytes -= used;
	ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed);
	dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed;
	ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed);
	dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed;
	mutex_exit(&ds->ds_lock);

	return (used);
}

/*
 * In-memory value of a ZFEATURE_TYPE_UINT64_ARRAY per-dataset feature.
 * "length" is the number of uint64_t elements in "array"; both the array
 * and this wrapper are kmem allocations (see load_zfeature() and
 * unload_zfeature()).
 */
struct feature_type_uint64_array_arg {
	uint64_t length;
	uint64_t *array;
};

/*
 * Free the in-memory state held in ds->ds_feature[f].
 *
 * Boolean features keep no allocation, so there is nothing to do.  For
 * uint64-array features both the array and its wrapper are freed.  The
 * ds->ds_feature[f] slot itself is not cleared here.  Panics on an unknown
 * feature type.
 */
static void
unload_zfeature(dsl_dataset_t *ds, spa_feature_t f)
{
	switch (spa_feature_table[f].fi_type) {
	case ZFEATURE_TYPE_BOOLEAN:
		break;
	case ZFEATURE_TYPE_UINT64_ARRAY:
	{
		struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f];
		kmem_free(ftuaa->array, ftuaa->length * sizeof (uint64_t));
		kmem_free(ftuaa, sizeof (*ftuaa));
		break;
	}
	default:
		panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
	}
}

/*
 * Populate ds->ds_feature[f] from the ZAP entry named by the feature's guid
 * on the dataset object in "mos".
 *
 * Boolean features: the slot is set to (void *)B_TRUE when the entry exists.
 * Uint64-array features: the array is read into a fresh allocation and
 * wrapped in a feature_type_uint64_array_arg.
 *
 * A missing entry leaves the slot untouched.  Lookup failures are asserted
 * to be ENOENT and then cleared, so every non-panicking path shown here
 * returns 0.
 */
static int
load_zfeature(objset_t *mos, dsl_dataset_t *ds, spa_feature_t f)
{
	int err = 0;
	switch (spa_feature_table[f].fi_type) {
	case ZFEATURE_TYPE_BOOLEAN:
		err = zap_contains(mos, ds->ds_object,
		    spa_feature_table[f].fi_guid);
		if (err == 0) {
			ds->ds_feature[f] = (void *)B_TRUE;
		} else {
			ASSERT3U(err, ==, ENOENT);
			err = 0;
		}
		break;
	case ZFEATURE_TYPE_UINT64_ARRAY:
	{
		uint64_t int_size, num_int;
		uint64_t *data;
		err = zap_length(mos, ds->ds_object,
		    spa_feature_table[f].fi_guid, &int_size, &num_int);
		if (err != 0) {
			ASSERT3U(err, ==, ENOENT);
			err = 0;
			break;
		}
		ASSERT3U(int_size, ==, sizeof (uint64_t));
		data = kmem_alloc(int_size * num_int, KM_SLEEP);
		/* The entry was just sized, so the read is required to succeed. */
		VERIFY0(zap_lookup(mos, ds->ds_object,
		    spa_feature_table[f].fi_guid, int_size, num_int, data));
		struct feature_type_uint64_array_arg *ftuaa =
		    kmem_alloc(sizeof (*ftuaa), KM_SLEEP);
		ftuaa->length = num_int;
		ftuaa->array = data;
		ds->ds_feature[f] = ftuaa;
		break;
	}
	default:
		panic("Invalid zfeature type %d", spa_feature_table[f].fi_type);
	}
	return (err);
}

/*
 * We have to release the fsid synchronously or we risk that a subsequent
 * mount of the same dataset will fail to unique_insert the fsid.  This
 * failure would manifest itself as the fsid of this dataset changing
 * between mounts which makes NFS clients quite unhappy.
*/ static void dsl_dataset_evict_sync(void *dbu) { dsl_dataset_t *ds = dbu; ASSERT(ds->ds_owner == NULL); unique_remove(ds->ds_fsid_guid); } static void dsl_dataset_evict_async(void *dbu) { dsl_dataset_t *ds = dbu; ASSERT(ds->ds_owner == NULL); ds->ds_dbuf = NULL; if (ds->ds_objset != NULL) dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { dsl_dataset_rele(ds->ds_prev, ds); ds->ds_prev = NULL; } dsl_bookmark_fini_ds(ds); bplist_destroy(&ds->ds_pending_deadlist); if (dsl_deadlist_is_open(&ds->ds_deadlist)) dsl_deadlist_close(&ds->ds_deadlist); if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) dsl_deadlist_close(&ds->ds_remap_deadlist); if (ds->ds_dir) dsl_dir_async_rele(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (dsl_dataset_feature_is_active(ds, f)) unload_zfeature(ds, f); } list_destroy(&ds->ds_prop_cbs); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_sendstream_lock); mutex_destroy(&ds->ds_remap_deadlist_lock); zfs_refcount_destroy(&ds->ds_longholds); rrw_destroy(&ds->ds_bp_rwlock); kmem_free(ds, sizeof (dsl_dataset_t)); } int dsl_dataset_get_snapname(dsl_dataset_t *ds) { dsl_dataset_phys_t *headphys; int err; dmu_buf_t *headdbuf; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; if (ds->ds_snapname[0]) return (0); if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) return (0); err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &headdbuf); if (err != 0) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); if (err != 0 && zfs_recover == B_TRUE) { err = 0; (void) snprintf(ds->ds_snapname, sizeof (ds->ds_snapname), "SNAPOBJ=%llu-ERR=%d", (unsigned long long)ds->ds_object, err); } dmu_buf_rele(headdbuf, FTAG); return (err); } int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t 
*value)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
	matchtype_t mt = 0;
	int err;

	/* Case-insensitive datasets look the name up with normalization. */
	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_NORMALIZE;

	err = zap_lookup_norm(mos, snapobj, name, 8, 1,
	    value, mt, NULL, 0, NULL);
	/* Fall back to a plain lookup if the ZAP rejects normalization. */
	if (err == ENOTSUP && (mt & MT_NORMALIZE))
		err = zap_lookup(mos, snapobj, name, 8, 1, value);
	return (err);
}

/*
 * Remove snapshot "name" from ds's snapshot-name ZAP in transaction tx.
 * When adj_cnt is set and the removal succeeded, the directory's snapshot
 * count is decremented by one.  Returns the ZAP removal error, if any.
 */
int
dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx,
    boolean_t adj_cnt)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj;
	matchtype_t mt = 0;
	int err;

	dsl_dir_snap_cmtime_update(ds->ds_dir);

	if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET)
		mt = MT_NORMALIZE;

	err = zap_remove_norm(mos, snapobj, name, mt, tx);
	/* Same ENOTSUP fallback as dsl_dataset_snap_lookup(). */
	if (err == ENOTSUP && (mt & MT_NORMALIZE))
		err = zap_remove(mos, snapobj, name, tx);

	if (err == 0 && adj_cnt)
		dsl_fs_ss_count_adjust(ds->ds_dir, -1,
		    DD_FIELD_SNAPSHOT_COUNT, tx);

	return (err);
}

/*
 * Try to take an additional hold (with "tag") on a dataset the caller
 * already has a pointer to.  Returns B_TRUE only if a reference on the
 * underlying dbuf was obtained and that dbuf's user is still this ds;
 * otherwise any reference taken is dropped and B_FALSE is returned.
 */
boolean_t
dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag)
{
	dmu_buf_t *dbuf = ds->ds_dbuf;
	boolean_t result = B_FALSE;

	if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset,
	    ds->ds_object, DMU_BONUS_BLKID, tag)) {

		if (ds == dmu_buf_get_user(dbuf))
			result = B_TRUE;
		else
			dmu_buf_rele(dbuf, tag);
	}

	return (result);
}

/*
 * Hold the dataset whose MOS object number is dsobj and return it in *dsp.
 *
 * The bonus buffer of dsobj is held with "tag".  If the buffer already has
 * a dsl_dataset_t attached as its user, that one is returned.  Otherwise a
 * new dsl_dataset_t is built and installed with dmu_buf_set_user_ie(); if
 * another thread installed one first ("winner"), the local copy is torn
 * down and the winner is returned instead.
 *
 * Returns 0 on success, EINVAL if dsobj's bonus type is not
 * DMU_OT_DSL_DATASET, or the error from a failed step; on error the bonus
 * hold is released and *dsp is not written.  The pool config lock must be
 * held.
 */
int
dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
    dsl_dataset_t **dsp)
{
	objset_t *mos = dp->dp_meta_objset;
	dmu_buf_t *dbuf;
	dsl_dataset_t *ds;
	int err;
	dmu_object_info_t doi;

	ASSERT(dsl_pool_config_held(dp));

	err = dmu_bonus_hold(mos, dsobj, tag, &dbuf);
	if (err != 0)
		return (err);

	/* Make sure dsobj has the correct object type. */
	dmu_object_info_from_db(dbuf, &doi);
	if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) {
		dmu_buf_rele(dbuf, tag);
		return (SET_ERROR(EINVAL));
	}

	ds = dmu_buf_get_user(dbuf);
	if (ds == NULL) {
		dsl_dataset_t *winner = NULL;

		ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP);
		ds->ds_dbuf = dbuf;
		ds->ds_object = dsobj;
		ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0;
		list_link_init(&ds->ds_synced_link);

		err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj,
		    NULL, ds, &ds->ds_dir);
		if (err != 0) {
			/* Nothing else has been initialized yet. */
			kmem_free(ds, sizeof (dsl_dataset_t));
			dmu_buf_rele(dbuf, tag);
			return (err);
		}

		mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL);
		mutex_init(&ds->ds_remap_deadlist_lock,
		    NULL, MUTEX_DEFAULT, NULL);
		rrw_init(&ds->ds_bp_rwlock, B_FALSE);
		zfs_refcount_create(&ds->ds_longholds);

		bplist_create(&ds->ds_pending_deadlist);

		list_create(&ds->ds_sendstreams, sizeof (dmu_sendstatus_t),
		    offsetof(dmu_sendstatus_t, dss_link));

		list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
		    offsetof(dsl_prop_cb_record_t, cbr_ds_node));

		/* Per-dataset features live in the zapified dataset object. */
		if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
			spa_feature_t f;

			for (f = 0; f < SPA_FEATURES; f++) {
				if (!(spa_feature_table[f].fi_flags &
				    ZFEATURE_FLAG_PER_DATASET))
					continue;
				err = load_zfeature(mos, ds, f);
			}
		}

		if (!ds->ds_is_snapshot) {
			ds->ds_snapname[0] = '\0';
			if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
				err = dsl_dataset_hold_obj(dp,
				    dsl_dataset_phys(ds)->ds_prev_snap_obj,
				    ds, &ds->ds_prev);
			}
			/*
			 * NOTE(review): this assignment overwrites any error
			 * returned by the ds_prev hold just above -- confirm
			 * whether that error should short-circuit here.
			 */
			err = dsl_bookmark_init_ds(ds);
		} else {
			if (zfs_flags & ZFS_DEBUG_SNAPNAMES)
				err = dsl_dataset_get_snapname(ds);
			if (err == 0 &&
			    dsl_dataset_phys(ds)->ds_userrefs_obj != 0) {
				err = zap_count(
				    ds->ds_dir->dd_pool->dp_meta_objset,
				    dsl_dataset_phys(ds)->ds_userrefs_obj,
				    &ds->ds_userrefs);
			}
		}

		/* Only non-snapshots carry refreservation/refquota. */
		if (err == 0 && !ds->ds_is_snapshot) {
			err = dsl_prop_get_int_ds(ds,
			    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
			    &ds->ds_reserved);
			if (err == 0) {
				err = dsl_prop_get_int_ds(ds,
				    zfs_prop_to_name(ZFS_PROP_REFQUOTA),
				    &ds->ds_quota);
			}
		} else {
			ds->ds_reserved = ds->ds_quota = 0;
		}

		/*
		 * An encrypted snapshot without an IVSET guid entry marks the
		 * pool with the ZOL 8308 encryption errata.
		 */
		if (err == 0 && ds->ds_dir->dd_crypto_obj != 0 &&
		    ds->ds_is_snapshot &&
		    zap_contains(mos, dsobj, DS_FIELD_IVSET_GUID) != 0) {
			dp->dp_spa->spa_errata =
			    ZPOOL_ERRATA_ZOL_8308_ENCRYPTION;
		}

		dsl_deadlist_open(&ds->ds_deadlist,
		    mos, dsl_dataset_phys(ds)->ds_deadlist_obj);
		uint64_t remap_deadlist_obj =
		    dsl_dataset_get_remap_deadlist_object(ds);
		if (remap_deadlist_obj != 0) {
			dsl_deadlist_open(&ds->ds_remap_deadlist, mos,
			    remap_deadlist_obj);
		}

		dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync,
		    dsl_dataset_evict_async, &ds->ds_dbuf);
		/* Only publish a fully constructed dataset. */
		if (err == 0)
			winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu);

		if (err != 0 || winner != NULL) {
			/* Failed, or lost the race: undo the construction. */
			bplist_destroy(&ds->ds_pending_deadlist);
			dsl_deadlist_close(&ds->ds_deadlist);
			if (dsl_deadlist_is_open(&ds->ds_remap_deadlist))
				dsl_deadlist_close(&ds->ds_remap_deadlist);
			dsl_bookmark_fini_ds(ds);
			if (ds->ds_prev)
				dsl_dataset_rele(ds->ds_prev, ds);
			dsl_dir_rele(ds->ds_dir, ds);
			for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
				if (dsl_dataset_feature_is_active(ds, f))
					unload_zfeature(ds, f);
			}

			list_destroy(&ds->ds_prop_cbs);
			list_destroy(&ds->ds_sendstreams);
			mutex_destroy(&ds->ds_lock);
			mutex_destroy(&ds->ds_opening_lock);
			mutex_destroy(&ds->ds_sendstream_lock);
			mutex_destroy(&ds->ds_remap_deadlist_lock);
			zfs_refcount_destroy(&ds->ds_longholds);
			rrw_destroy(&ds->ds_bp_rwlock);
			kmem_free(ds, sizeof (dsl_dataset_t));
			if (err != 0) {
				dmu_buf_rele(dbuf, tag);
				return (err);
			}
			ds = winner;
		} else {
			/* We installed our ds; register its fsid guid. */
			ds->ds_fsid_guid =
			    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
			if (ds->ds_fsid_guid !=
			    dsl_dataset_phys(ds)->ds_fsid_guid) {
				zfs_dbgmsg("ds_fsid_guid changed from "
				    "%llx to %llx for pool %s dataset id %llu",
				    (long long)
				    dsl_dataset_phys(ds)->ds_fsid_guid,
				    (long long)ds->ds_fsid_guid,
				    spa_name(dp->dp_spa),
				    (u_longlong_t)dsobj);
			}
		}
	}

	ASSERT3P(ds->ds_dbuf, ==, dbuf);
	ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data);
	ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 ||
	    spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN ||
	    dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap);
	*dsp = ds;

	return (0);
}

/*
 * Create a keystore mapping for ds (stored in ds->ds_key_mapping, tagged
 * with ds itself).  Returns 0 without doing anything when the dataset's
 * directory has no crypto object.
 */
int
dsl_dataset_create_key_mapping(dsl_dataset_t *ds)
{
	dsl_dir_t *dd = ds->ds_dir;

	if (dd->dd_crypto_obj == 0)
		return (0);

	return (spa_keystore_create_mapping(dd->dd_pool->dp_spa,
	    ds, ds, &ds->ds_key_mapping));
}

/*
 * dsl_dataset_hold_obj() plus, when DS_HOLD_FLAG_DECRYPT is set in flags,
 * creation of the key mapping.  If the key mapping fails the hold is
 * released and the error returned.
 */
int
dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj,
    ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp)
{
	int err;

	err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp);
	if (err != 0)
		return (err);

	ASSERT3P(*dsp, !=, NULL);

	if (flags & DS_HOLD_FLAG_DECRYPT) {
		err = dsl_dataset_create_key_mapping(*dsp);
		if (err != 0)
			dsl_dataset_rele(*dsp, tag);
	}

	return (err);
}

/*
 * Hold a dataset by name.  "name" may refer to a head dataset or, with an
 * "@snap" suffix, to one of its snapshots.  On success *dsp is set and 0 is
 * returned; on failure *dsp is left unwritten.  ENOENT is returned when the
 * directory has no head dataset or the trailing component does not start
 * with '@'.
 */
int
dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags,
    void *tag, dsl_dataset_t **dsp)
{
	dsl_dir_t *dd;
	const char *snapname;
	uint64_t obj;
	int err = 0;
	dsl_dataset_t *ds;

	err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname);
	if (err != 0)
		return (err);

	ASSERT(dsl_pool_config_held(dp));
	obj = dsl_dir_phys(dd)->dd_head_dataset_obj;
	if (obj != 0)
		err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds);
	else
		err = SET_ERROR(ENOENT);

	/* we may be looking for a snapshot */
	if (err == 0 && snapname != NULL) {
		dsl_dataset_t *snap_ds;

		/* Skip over the '@' separator; anything else is not ours. */
		if (*snapname++ != '@') {
			dsl_dataset_rele_flags(ds, flags, tag);
			dsl_dir_rele(dd, FTAG);
			return (SET_ERROR(ENOENT));
		}

		dprintf("looking for snapshot '%s'\n", snapname);
		err = dsl_dataset_snap_lookup(ds, snapname, &obj);
		if (err == 0) {
			err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag,
			    &snap_ds);
		}
		/* The head was only needed to find the snapshot object. */
		dsl_dataset_rele_flags(ds, flags, tag);

		if (err == 0) {
			/* Cache the name we were given if none is set yet. */
			mutex_enter(&snap_ds->ds_lock);
			if (snap_ds->ds_snapname[0] == 0)
				(void) strlcpy(snap_ds->ds_snapname, snapname,
				    sizeof (snap_ds->ds_snapname));
			mutex_exit(&snap_ds->ds_lock);
			ds = snap_ds;
		}
	}
	if (err == 0)
		*dsp = ds;
	dsl_dir_rele(dd, FTAG);
	return (err);
}

int
dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp)); } static int dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp); if (err != 0) return (err); if (!dsl_dataset_tryown(*dsp, tag, override)) { dsl_dataset_rele_flags(*dsp, flags, tag); *dsp = NULL; return (SET_ERROR(EBUSY)); } return (0); } int dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp)); } int dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp)); } static int dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, void *tag, boolean_t override, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp); if (err != 0) return (err); if (!dsl_dataset_tryown(*dsp, tag, override)) { dsl_dataset_rele_flags(*dsp, flags, tag); return (SET_ERROR(EBUSY)); } return (0); } int dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp)); } int dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp)); } /* * See the comment above dsl_pool_hold() for details. In summary, a long * hold is used to prevent destruction of a dataset while the pool hold * is dropped, allowing other concurrent operations (e.g. spa_sync()). * * The dataset and pool must be held when this function is called. 
After it * is called, the pool hold may be released while the dataset is still held * and accessed. */ void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) { ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); (void) zfs_refcount_add(&ds->ds_longholds, tag); } void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) { (void) zfs_refcount_remove(&ds->ds_longholds, tag); } /* Return B_TRUE if there are any long holds on this dataset. */ boolean_t dsl_dataset_long_held(dsl_dataset_t *ds) { return (!zfs_refcount_is_zero(&ds->ds_longholds)); } void dsl_dataset_name(dsl_dataset_t *ds, char *name) { if (ds == NULL) { (void) strlcpy(name, "mos", ZFS_MAX_DATASET_NAME_LEN); } else { dsl_dir_name(ds->ds_dir, name); VERIFY0(dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); /* * We use a "recursive" mutex so that we * can call dprintf_ds() with ds_lock held. */ if (!MUTEX_HELD(&ds->ds_lock)) { mutex_enter(&ds->ds_lock); VERIFY3U(strlcat(name, ds->ds_snapname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); mutex_exit(&ds->ds_lock); } else { VERIFY3U(strlcat(name, ds->ds_snapname, ZFS_MAX_DATASET_NAME_LEN), <, ZFS_MAX_DATASET_NAME_LEN); } } } } int dsl_dataset_namelen(dsl_dataset_t *ds) { VERIFY0(dsl_dataset_get_snapname(ds)); mutex_enter(&ds->ds_lock); int len = strlen(ds->ds_snapname); mutex_exit(&ds->ds_lock); /* add '@' if ds is a snap */ if (len > 0) len++; len += dsl_dir_namelen(ds->ds_dir); return (len); } void dsl_dataset_rele(dsl_dataset_t *ds, void *tag) { dmu_buf_rele(ds->ds_dbuf, tag); } void dsl_dataset_remove_key_mapping(dsl_dataset_t *ds) { dsl_dir_t *dd = ds->ds_dir; if (dd == NULL || dd->dd_crypto_obj == 0) return; (void) spa_keystore_remove_mapping(dd->dd_pool->dp_spa, ds->ds_object, ds); } void dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) { if (flags & DS_HOLD_FLAG_DECRYPT) dsl_dataset_remove_key_mapping(ds); 
dsl_dataset_rele(ds, tag); } void dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) { ASSERT3P(ds->ds_owner, ==, tag); ASSERT(ds->ds_dbuf != NULL); mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; mutex_exit(&ds->ds_lock); dsl_dataset_long_rele(ds, tag); dsl_dataset_rele_flags(ds, flags, tag); } boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override) { boolean_t gotit = FALSE; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && (override || !(DS_IS_INCONSISTENT(ds) || (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS) && !zfs_allow_redacted_dataset_mount)))) { ds->ds_owner = tag; dsl_dataset_long_hold(ds, tag); gotit = TRUE; } mutex_exit(&ds->ds_lock); return (gotit); } boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds) { boolean_t rv; mutex_enter(&ds->ds_lock); rv = (ds->ds_owner != NULL); mutex_exit(&ds->ds_lock); return (rv); } boolean_t zfeature_active(spa_feature_t f, void *arg) { switch (spa_feature_table[f].fi_type) { case ZFEATURE_TYPE_BOOLEAN: { boolean_t val = (boolean_t)(uintptr_t)arg; ASSERT(val == B_FALSE || val == B_TRUE); return (val); } case ZFEATURE_TYPE_UINT64_ARRAY: /* * In this case, arg is a uint64_t array. The feature is active * if the array is non-null. */ return (arg != NULL); default: panic("Invalid zfeature type %d", spa_feature_table[f].fi_type); return (B_FALSE); } } boolean_t dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f) { return (zfeature_active(f, ds->ds_feature[f])); } /* * The buffers passed out by this function are references to internal buffers; * they should not be freed by callers of this function, and they should not be * used after the dataset has been released. 
*/ boolean_t dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, spa_feature_t f, uint64_t *outlength, uint64_t **outp) { VERIFY(spa_feature_table[f].fi_type & ZFEATURE_TYPE_UINT64_ARRAY); if (!dsl_dataset_feature_is_active(ds, f)) { return (B_FALSE); } struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f]; *outp = ftuaa->array; *outlength = ftuaa->length; return (B_TRUE); } void dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; uint64_t zero = 0; VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); spa_feature_incr(spa, f, tx); dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); switch (spa_feature_table[f].fi_type) { case ZFEATURE_TYPE_BOOLEAN: ASSERT3S((boolean_t)(uintptr_t)arg, ==, B_TRUE); VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, sizeof (zero), 1, &zero, tx)); break; case ZFEATURE_TYPE_UINT64_ARRAY: { struct feature_type_uint64_array_arg *ftuaa = arg; VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, sizeof (uint64_t), ftuaa->length, ftuaa->array, tx)); break; } default: panic("Invalid zfeature type %d", spa_feature_table[f].fi_type); } } static void dsl_dataset_deactivate_feature_impl(dsl_dataset_t *ds, spa_feature_t f, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; uint64_t dsobj = ds->ds_object; VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx)); spa_feature_decr(spa, f, tx); ds->ds_feature[f] = NULL; } void dsl_dataset_deactivate_feature(dsl_dataset_t *ds, spa_feature_t f, dmu_tx_t *tx) { unload_zfeature(ds, f); dsl_dataset_deactivate_feature_impl(ds, f, tx); } uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; dmu_buf_t *dbuf; 
dsl_dataset_phys_t *dsphys; uint64_t dsobj; objset_t *mos = dp->dp_meta_objset; if (origin == NULL) origin = dp->dp_origin_snap; ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_snapnames_zapobj = zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; if (origin == NULL) { dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); } else { dsl_dataset_t *ohds; /* head of the origin snapshot */ dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = dsl_dataset_phys(origin)->ds_creation_txg; dsphys->ds_referenced_bytes = dsl_dataset_phys(origin)->ds_referenced_bytes; dsphys->ds_compressed_bytes = dsl_dataset_phys(origin)->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = dsl_dataset_phys(origin)->ds_uncompressed_bytes; rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG); dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp; rrw_exit(&origin->ds_bp_rwlock, FTAG); /* * Inherit flags that describe the dataset's contents * (INCONSISTENT) or properties (Case Insensitive). 
*/ dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (zfeature_active(f, origin->ds_feature[f])) { dsl_dataset_activate_feature(dsobj, f, origin->ds_feature[f], tx); } } dmu_buf_will_dirty(origin->ds_dbuf, tx); dsl_dataset_phys(origin)->ds_num_children++; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj, FTAG, &ohds)); dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); dsl_dataset_rele(ohds, FTAG); if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) { dsl_dataset_phys(origin)->ds_next_clones_obj = zap_create(mos, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(mos, dsl_dataset_phys(origin)->ds_next_clones_obj, dsobj, tx)); } dmu_buf_will_dirty(dd->dd_dbuf, tx); dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object; if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); dsl_dir_phys(origin->ds_dir)->dd_clones = zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(mos, dsl_dir_phys(origin->ds_dir)->dd_clones, dsobj, tx)); } } /* handle encryption */ dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj; return (dsobj); } static void dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) { objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { dsl_pool_t *dp = ds->ds_dir->dd_pool; zio_t *zio; bzero(&os->os_zil_header, sizeof (os->os_zil_header)); if (os->os_encrypted) 
os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dsl_dataset_sync(ds, zio, tx); VERIFY0(zio_wait(zio)); - - /* dsl_dataset_sync_done will drop this reference. */ - dmu_buf_add_ref(ds->ds_dbuf, ds); dsl_dataset_sync_done(ds, tx); } } uint64_t dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dsl_crypto_params_t *dcp, dmu_tx_t *tx) { dsl_pool_t *dp = pdd->dd_pool; uint64_t dsobj, ddobj; dsl_dir_t *dd; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(lastname[0] != '@'); /* * Filesystems will eventually have their origin set to dp_origin_snap, * but that's taken care of in dsl_dataset_create_sync_dd. When * creating a filesystem, this function is called with origin equal to * NULL. */ if (origin != NULL) ASSERT3P(origin, !=, dp->dp_origin_snap); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp, flags & ~DS_CREATE_FLAG_NODIRTY, tx); dsl_deleg_set_create_perms(dd, tx, cr); /* * If we are creating a clone and the livelist feature is enabled, * add the entry DD_FIELD_LIVELIST to ZAP. */ if (origin != NULL && spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) { objset_t *mos = dd->dd_pool->dp_meta_objset; dsl_dir_zapify(dd, tx); uint64_t obj = dsl_deadlist_alloc(mos, tx); VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj, tx)); spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx); } /* * Since we're creating a new node we know it's a leaf, so we can * initialize the counts if the limit feature is active. 
*/ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { uint64_t cnt = 0; objset_t *os = dd->dd_pool->dp_meta_objset; dsl_dir_zapify(dd, tx); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (cnt), 1, &cnt, tx)); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (cnt), 1, &cnt, tx)); } dsl_dir_rele(dd, FTAG); /* * If we are creating a clone, make sure we zero out any stale * data from the origin snapshots zil header. */ if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); dsl_dataset_zero_zil(ds, tx); dsl_dataset_rele(ds, FTAG); } return (dsobj); } /* * The unique space in the head dataset can be calculated by subtracting * the space used in the most recent snapshot, that is still being used * in this file system, from the space currently in use. To figure out * the space in the most recent snapshot still in use, we need to take * the total space used in the snapshot and subtract out the space that * has been freed up since the snapshot was taken. 
*/
void
dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds)
{
	uint64_t mrs_used;
	uint64_t dlused, dlcomp, dluncomp;

	ASSERT(!ds->ds_is_snapshot);

	/* mrs_used: bytes referenced by the most recent snapshot, if any. */
	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0)
		mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes;
	else
		mrs_used = 0;

	dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp);

	ASSERT3U(dlused, <=, mrs_used);
	dsl_dataset_phys(ds)->ds_unique_bytes =
	    dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused);

	if (spa_version(ds->ds_dir->dd_pool->dp_spa) >=
	    SPA_VERSION_UNIQUE_ACCURATE)
		dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE;
}

/*
 * Remove object "obj" from ds's next-clones ZAP in transaction tx.  A
 * missing entry (ENOENT) is tolerated; any other error is fatal.
 */
void
dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj,
    dmu_tx_t *tx)
{
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	uint64_t count __maybe_unused;
	int err;

	ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2);
	err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
	    obj, tx);
	/*
	 * The err should not be ENOENT, but a bug in a previous version
	 * of the code could cause upgrade_clones_cb() to not set
	 * ds_next_snap_obj when it should, leading to a missing entry.
	 * If we knew that the pool was created after
	 * SPA_VERSION_NEXT_CLONES, we could assert that it isn't
	 * ENOENT.  However, at least we can check that we don't have
	 * too many entries in the next_clones_obj even after failing to
	 * remove this one.
	 */
	if (err != ENOENT)
		VERIFY0(err);
	ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
	    &count));
	ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2);
}

/* Return a pointer to the dataset's root block pointer (ds_bp). */
blkptr_t *
dsl_dataset_get_blkptr(dsl_dataset_t *ds)
{
	return (&dsl_dataset_phys(ds)->ds_bp);
}

/* Return the spa (pool) that ds belongs to. */
spa_t *
dsl_dataset_get_spa(dsl_dataset_t *ds)
{
	return (ds->ds_dir->dd_pool->dp_spa);
}

/*
 * Add ds to the pool's dirty-dataset list for tx's txg.  A NULL ds (the
 * meta-objset) is ignored.  Dirtying a dataset that has a next snapshot
 * panics.  The first time the dataset is added for a txg, an extra
 * reference is taken on its dbuf and, for encrypted datasets not being
 * written raw, on its key mapping.
 */
void
dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_pool_t *dp;

	if (ds == NULL) /* this is the meta-objset */
		return;

	ASSERT(ds->ds_objset != NULL);

	if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0)
		panic("dirtying snapshot!");

	/* Must not dirty a dataset in the same txg where it got snapshotted. */
	ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg);

	dp = ds->ds_dir->dd_pool;
	if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) {
		objset_t *os = ds->ds_objset;

		/* up the hold count until we can be written out */
		dmu_buf_add_ref(ds->ds_dbuf, ds);

		/* if this dataset is encrypted, grab a reference to the DCK */
		if (ds->ds_dir->dd_crypto_obj != 0 &&
		    !os->os_raw_receive &&
		    !os->os_next_write_raw[tx->tx_txg & TXG_MASK]) {
			ASSERT3P(ds->ds_key_mapping, !=, NULL);
			key_mapping_add_ref(ds->ds_key_mapping, ds);
		}
	}
}

static int
dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	uint64_t asize;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	/*
	 * If there's an fs-only reservation, any blocks that might become
	 * owned by the snapshot dataset must be accommodated by space
	 * outside of the reservation.
	 */
	ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds));
	asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved);
	if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE))
		return (SET_ERROR(ENOSPC));

	/*
	 * Propagate any reserved space for this snapshot to other
	 * snapshot checks in this sync group.
*/
	if (asize > 0)
		dsl_dir_willuse_space(ds->ds_dir, asize, tx);

	return (0);
}

/*
 * Check whether a snapshot called "snapname" may be taken of ds.
 *
 * Always records tx's txg in ds->ds_trysnap_txg.  Outside syncing context
 * no further checks are made and 0 is returned.  In syncing context returns
 * EAGAIN if ds was already snapshotted in this txg or later, EEXIST if the
 * name is taken, EBUSY if ds is inconsistent (unless "recv"), the
 * snapshot-limit error when cnt and cr are supplied, or ENOSPC from the
 * reservation check.
 */
int
dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname,
    dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr, proc_t *proc)
{
	int error;
	uint64_t value;

	ds->ds_trysnap_txg = tx->tx_txg;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	/*
	 * We don't allow multiple snapshots of the same txg.  If there
	 * is already one, try again.
	 */
	if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg)
		return (SET_ERROR(EAGAIN));

	/*
	 * Check for conflicting snapshot name.
	 */
	error = dsl_dataset_snap_lookup(ds, snapname, &value);
	if (error == 0)
		return (SET_ERROR(EEXIST));
	if (error != ENOENT)
		return (error);

	/*
	 * We don't allow taking snapshots of inconsistent datasets, such as
	 * those into which we are currently receiving.  However, if we are
	 * creating this snapshot as part of a receive, this check will be
	 * executed atomically with respect to the completion of the receive
	 * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this
	 * case we ignore this, knowing it will be fixed up for us shortly in
	 * dmu_recv_end_sync().
	 */
	if (!recv && DS_IS_INCONSISTENT(ds))
		return (SET_ERROR(EBUSY));

	/*
	 * Skip the check for temporary snapshots or if we have already checked
	 * the counts in dsl_dataset_snapshot_check.  This means we really only
	 * check the count here when we're receiving a stream.
	 */
	if (cnt != 0 && cr != NULL) {
		error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
		    ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr, proc);
		if (error != 0)
			return (error);
	}

	error = dsl_dataset_snapshot_reserve_space(ds, tx);
	if (error != 0)
		return (error);

	return (0);
}

/*
 * Check function for creating the snapshots listed in ddsa->ddsa_snaps
 * (names of the form "fs@snap").  Per-name errors are added to
 * ddsa->ddsa_errors when that list is non-NULL.  Returns 0 if every check
 * passed, otherwise the last error recorded.
 */
int
dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_snapshot_arg_t *ddsa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;
	int rv = 0;

	/*
	 * Pre-compute how many total new snapshots will be created for each
	 * level in the tree and below.  This is needed for validating the
	 * snapshot limit when either taking a recursive snapshot or when
	 * taking multiple snapshots.
	 *
	 * The problem is that the counts are not actually adjusted when
	 * we are checking, only when we finally sync.  For a single snapshot,
	 * this is easy, the count will increase by 1 at each node up the tree,
	 * but its more complicated for the recursive/multiple snapshot case.
	 *
	 * The dsl_fs_ss_limit_check function does recursively check the count
	 * at each level up the tree but since it is validating each snapshot
	 * independently we need to be sure that we are validating the complete
	 * count for the entire set of snapshots.  We do this by rolling up the
	 * counts for each component of the name into an nvlist and then
	 * checking each of those cases with the aggregated count.
	 *
	 * This approach properly handles not only the recursive snapshot
	 * case (where we get all of those on the ddsa_snaps list) but also
	 * the sibling case (e.g. snapshot a/b and a/c so that we will also
	 * validate the limit on 'a' using a count of 2).
	 *
	 * We validate the snapshot names in the third loop and only report
	 * name errors once.
	 */
	if (dmu_tx_is_syncing(tx)) {
		char *nm;
		nvlist_t *cnt_track = NULL;
		cnt_track = fnvlist_alloc();

		nm = kmem_alloc(MAXPATHLEN, KM_SLEEP);

		/* Rollup aggregated counts into the cnt_track list */
		for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
		    pair != NULL;
		    pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
			char *pdelim;
			uint64_t val;

			/* Work on a copy: the name is truncated in place. */
			(void) strlcpy(nm, nvpair_name(pair), MAXPATHLEN);
			pdelim = strchr(nm, '@');
			if (pdelim == NULL)
				continue;
			*pdelim = '\0';

			/* Count once for each ancestor, leaf to root. */
			do {
				if (nvlist_lookup_uint64(cnt_track, nm,
				    &val) == 0) {
					/* update existing entry */
					fnvlist_add_uint64(cnt_track, nm,
					    val + 1);
				} else {
					/* add to list */
					fnvlist_add_uint64(cnt_track, nm, 1);
				}

				pdelim = strrchr(nm, '/');
				if (pdelim != NULL)
					*pdelim = '\0';
			} while (pdelim != NULL);
		}

		kmem_free(nm, MAXPATHLEN);

		/* Check aggregated counts at each level */
		for (pair = nvlist_next_nvpair(cnt_track, NULL);
		    pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) {
			int error = 0;
			char *name;
			uint64_t cnt = 0;
			dsl_dataset_t *ds;

			name = nvpair_name(pair);
			cnt = fnvpair_value_uint64(pair);
			ASSERT(cnt > 0);

			error = dsl_dataset_hold(dp, name, FTAG, &ds);
			if (error == 0) {
				error = dsl_fs_ss_limit_check(ds->ds_dir, cnt,
				    ZFS_PROP_SNAPSHOT_LIMIT, NULL,
				    ddsa->ddsa_cr, ddsa->ddsa_proc);
				dsl_dataset_rele(ds, FTAG);
			}

			if (error != 0) {
				if (ddsa->ddsa_errors != NULL)
					fnvlist_add_int32(ddsa->ddsa_errors,
					    name, error);
				rv = error;
				/* only report one error for this check */
				break;
			}
		}
		nvlist_free(cnt_track);
	}

	/* Third loop: validate each requested "fs@snap" name individually. */
	for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL);
	    pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) {
		int error = 0;
		dsl_dataset_t *ds;
		char *name, *atp = NULL;
		char dsname[ZFS_MAX_DATASET_NAME_LEN];

		name = nvpair_name(pair);
		if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN)
			error = SET_ERROR(ENAMETOOLONG);
		if (error == 0) {
			atp = strchr(name, '@');
			if (atp == NULL)
				error = SET_ERROR(EINVAL);
			/* Copy only the part before '@' (plus the NUL). */
			if (error == 0)
				(void) strlcpy(dsname, name, atp - name + 1);
		}
		if (error == 0)
			error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
		if (error == 0) {
			/* passing 0/NULL skips dsl_fs_ss_limit_check */
			error = dsl_dataset_snapshot_check_impl(ds,
			    atp + 1, tx, B_FALSE, 0, NULL, NULL);
			dsl_dataset_rele(ds, FTAG);
		}

		if (error != 0) {
			if (ddsa->ddsa_errors != NULL) {
				fnvlist_add_int32(ddsa->ddsa_errors,
				    name, error);
			}
			rv = error;
		}
	}

	return (rv);
}

void
dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname,
    dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	dmu_buf_t *dbuf;
	dsl_dataset_phys_t *dsphys;
	uint64_t dsobj, crtxg;
	objset_t *mos = dp->dp_meta_objset;
	static zil_header_t zero_zil __maybe_unused;
	objset_t *os __maybe_unused;

	ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock));

	/*
	 * If we are on an old pool, the zil must not be active, in which
	 * case it will be zeroed.  Usually zil_suspend() accomplishes this.
	 */
	ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP ||
	    dmu_objset_from_ds(ds, &os) != 0 ||
	    bcmp(&os->os_phys->os_zil_header, &zero_zil,
	    sizeof (zero_zil)) == 0);

	/* Should not snapshot a dirty dataset.
*/ ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, ds, tx->tx_txg)); dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); /* * The origin's ds_creation_txg has to be < TXG_INITIAL */ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) crtxg = 1; else crtxg = tx->tx_txg; dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsphys->ds_next_snap_obj = ds->ds_object; dsphys->ds_num_children = 1; dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes; dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = dsl_dataset_phys(ds)->ds_uncompressed_bytes; dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags; rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; rrw_exit(&ds->ds_bp_rwlock, FTAG); dmu_buf_rele(dbuf, FTAG); for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (zfeature_active(f, ds->ds_feature[f])) { dsl_dataset_activate_feature(dsobj, f, ds->ds_feature[f], tx); } } ASSERT3U(ds->ds_prev != 0, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); if (ds->ds_prev) { uint64_t next_clones_obj = dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj; ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object || dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1); if 
(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, dsl_dataset_phys(ds->ds_prev)->ds_creation_txg); dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { dsl_dataset_remove_from_next_clones(ds->ds_prev, dsphys->ds_next_snap_obj, tx); VERIFY0(zap_add_int(mos, next_clones_obj, dsobj, tx)); } } /* * If we have a reference-reservation on this dataset, we will * need to increase the amount of refreservation being charged * since our unique space is going to zero. */ if (ds->ds_reserved) { int64_t delta; ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); } dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_open(&ds->ds_deadlist, mos, dsl_dataset_phys(ds)->ds_deadlist_obj); dsl_deadlist_add_key(&ds->ds_deadlist, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); dsl_bookmark_snapshotted(ds, tx); if (dsl_dataset_remap_deadlist_exists(ds)) { uint64_t remap_deadlist_obj = dsl_dataset_get_remap_deadlist_object(ds); /* * Move the remap_deadlist to the snapshot. The head * will create a new remap deadlist on demand, from * dsl_dataset_block_remapped(). */ dsl_dataset_unset_remap_deadlist_object(ds, tx); dsl_deadlist_close(&ds->ds_remap_deadlist); dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx)); } /* * Create a ivset guid for this snapshot if the dataset is * encrypted. This may be overridden by a raw receive. 
A * previous implementation of this code did not have this * field as part of the on-disk format for ZFS encryption * (see errata #4). As part of the remediation for this * issue, we ask the user to enable the bookmark_v2 feature * which is now a dependency of the encryption feature. We * use this as a heuristic to determine when the user has * elected to correct any datasets created with the old code. * As a result, we only do this step if the bookmark_v2 * feature is enabled, which limits the number of states a * given pool / dataset can be in with regards to terms of * correcting the issue. */ if (ds->ds_dir->dd_crypto_obj != 0 && spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2)) { uint64_t ivset_guid = unique_create(); dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_add(mos, dsobj, DS_FIELD_IVSET_GUID, sizeof (ivset_guid), 1, &ivset_guid, tx)); } ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; dsl_dataset_phys(ds)->ds_unique_bytes = 0; if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, snapname, 8, 1, &dsobj, tx)); if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); dsl_scan_ds_snapshotted(ds, tx); dsl_dir_snap_cmtime_update(ds->ds_dir); spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " "); } void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_arg_t *ddsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); nvpair_t *pair; for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { dsl_dataset_t *ds; char *name, *atp; char dsname[ZFS_MAX_DATASET_NAME_LEN]; name = nvpair_name(pair); atp = strchr(name, '@'); (void) 
strlcpy(dsname, name, atp - name + 1); VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); if (ddsa->ddsa_props != NULL) { dsl_props_set_sync_impl(ds->ds_prev, ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); } dsl_dataset_rele(ds, FTAG); } } /* * The snapshots must all be in the same pool. * All-or-nothing: if there are any failures, nothing will be modified. */ int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) { dsl_dataset_snapshot_arg_t ddsa; nvpair_t *pair; boolean_t needsuspend; int error; spa_t *spa; char *firstname; nvlist_t *suspended = NULL; pair = nvlist_next_nvpair(snaps, NULL); if (pair == NULL) return (0); firstname = nvpair_name(pair); error = spa_open(firstname, &spa, FTAG); if (error != 0) return (error); needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); spa_close(spa, FTAG); if (needsuspend) { suspended = fnvlist_alloc(); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *snapname = nvpair_name(pair); char *atp; void *cookie; atp = strchr(snapname, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } (void) strlcpy(fsname, snapname, atp - snapname + 1); error = zil_suspend(fsname, &cookie); if (error != 0) break; fnvlist_add_uint64(suspended, fsname, (uintptr_t)cookie); } } ddsa.ddsa_snaps = snaps; ddsa.ddsa_props = props; ddsa.ddsa_errors = errors; ddsa.ddsa_cr = CRED(); ddsa.ddsa_proc = curproc; if (error == 0) { error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, dsl_dataset_snapshot_sync, &ddsa, fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL); } if (suspended != NULL) { for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; pair = nvlist_next_nvpair(suspended, pair)) { zil_resume((void *)(uintptr_t) fnvpair_value_uint64(pair)); } fnvlist_free(suspended); } if (error == 0) { for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = 
nvlist_next_nvpair(snaps, pair)) { zvol_create_minor(nvpair_name(pair)); } } return (error); } typedef struct dsl_dataset_snapshot_tmp_arg { const char *ddsta_fsname; const char *ddsta_snapname; minor_t ddsta_cleanup_minor; const char *ddsta_htag; } dsl_dataset_snapshot_tmp_arg_t; static int dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); if (error != 0) return (error); /* NULL cred means no limit check for tmp snapshot */ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, tx, B_FALSE, 0, NULL, NULL); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, B_TRUE, tx); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds = NULL; VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, minor_t cleanup_minor, const char *htag) { dsl_dataset_snapshot_tmp_arg_t ddsta; int error; spa_t *spa; boolean_t needsuspend; void *cookie; ddsta.ddsta_fsname = fsname; ddsta.ddsta_snapname = snapname; ddsta.ddsta_cleanup_minor = cleanup_minor; ddsta.ddsta_htag = htag; error = spa_open(fsname, &spa, FTAG); if (error != 0) return (error); needsuspend = 
(spa_version(spa) < SPA_VERSION_FAST_SNAP); spa_close(spa, FTAG); if (needsuspend) { error = zil_suspend(fsname, &cookie); if (error != 0) return (error); } error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED); if (needsuspend) zil_resume(cookie); return (error); } void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0); /* * in case we had to change ds_fsid_guid when we opened it, * sync it out now. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; } dmu_objset_sync(ds->ds_objset, zio, tx); } /* * Check if the percentage of blocks shared between the clone and the * snapshot (as opposed to those that are clone only) is below a certain * threshold */ static boolean_t dsl_livelist_should_disable(dsl_dataset_t *ds) { uint64_t used, referenced; int percent_shared; used = dsl_dir_get_usedds(ds->ds_dir); referenced = dsl_get_referenced(ds); ASSERT3U(referenced, >=, 0); ASSERT3U(used, >=, 0); if (referenced == 0) return (B_FALSE); percent_shared = (100 * (referenced - used)) / referenced; if (percent_shared <= zfs_livelist_min_percent_shared) return (B_TRUE); return (B_FALSE); } /* * Check if it is 
possible to combine two livelist entries into one.
 * This is the case if the combined number of 'live' blkptrs (ALLOCs that
 * don't have a matching FREE) is under the maximum sublist size.
 * We check this by subtracting twice the total number of frees from the total
 * number of blkptrs. FREEs are counted twice because each FREE blkptr
 * will cancel out an ALLOC blkptr when the livelist is processed.
 */
static boolean_t
dsl_livelist_should_condense(dsl_deadlist_entry_t *first,
    dsl_deadlist_entry_t *next)
{
	/* Both counters are summed over the two candidate entries. */
	uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed +
	    next->dle_bpobj.bpo_phys->bpo_num_freed;
	uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs +
	    next->dle_bpobj.bpo_phys->bpo_num_blkptrs;
	/*
	 * NOTE(review): the subtraction is done in uint64_t, so if
	 * 2 * total_free ever exceeded total_entries the result would wrap
	 * to a large value and the pair would be reported as not
	 * condensable -- confirm that this cannot happen or is intended.
	 */
	if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries)
		return (B_TRUE);
	return (B_FALSE);
}

/*
 * Argument bundle handed to dsl_livelist_try_condense() through
 * dsl_deadlist_iterate() (see dsl_flush_pending_livelist()).
 */
typedef struct try_condense_arg {
	spa_t *spa;
	dsl_dataset_t *ds;
} try_condense_arg_t;

/*
 * Iterate over the livelist entries, searching for a pair to condense.
 * A nonzero return value means stop, 0 means keep looking.
 *
 * 'arg' is a try_condense_arg_t and 'first' is the entry being visited; the
 * candidate pair is 'first' and its successor in the livelist's AVL tree.
 * When a pair qualifies it is recorded in spa->spa_to_condense and the
 * condense zthr is woken up.
 */
static int
dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first)
{
	try_condense_arg_t *tca = arg;
	spa_t *spa = tca->spa;
	dsl_dataset_t *ds = tca->ds;
	dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist;
	dsl_deadlist_entry_t *next;

	/* The condense thread has not yet been created at import */
	if (spa->spa_livelist_condense_zthr == NULL)
		return (1);

	/* A condense is already in progress */
	if (spa->spa_to_condense.ds != NULL)
		return (1);

	next = AVL_NEXT(&ll->dl_tree, &first->dle_node);
	/* The livelist has only one entry - don't condense it */
	if (next == NULL)
		return (1);

	/* Next is the newest entry - don't condense it */
	if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL)
		return (1);

	/* This pair is not ready to condense but keep looking */
	if (!dsl_livelist_should_condense(first, next))
		return (0);

	/*
	 * Add a ref to prevent the dataset from being evicted while
	 * the condense zthr or synctask are running. Ref will be
	 * released at the end of the condense synctask
	 */
	dmu_buf_add_ref(ds->ds_dbuf, spa);

	/* Publish the chosen pair, reset its state flags, wake the zthr. */
	spa->spa_to_condense.ds = ds;
	spa->spa_to_condense.first = first;
	spa->spa_to_condense.next = next;
	spa->spa_to_condense.syncing = B_FALSE;
	spa->spa_to_condense.cancelled = B_FALSE;

	zthr_wakeup(spa->spa_livelist_condense_zthr);
	return (1);
}

/*
 * Write the pending allocs and frees of the dataset's dsl_dir into its
 * on-disk livelist. Before doing so, a new sub-livelist keyed by
 * (tx->tx_txg - 1) is added when the livelist is empty or, on sync pass 1
 * only, when the newest entry holds more than zfs_livelist_max_entries
 * blkptrs that have not been freed. Afterwards the livelist is walked with
 * dsl_livelist_try_condense() to look for a pair of entries to condense.
 */
static void
dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	dsl_dir_t *dd = ds->ds_dir;
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;
	dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist);

	/* Check if we need to add a new sub-livelist */
	if (last == NULL) {
		/* The livelist is empty */
		dsl_deadlist_add_key(&dd->dd_livelist,
		    tx->tx_txg - 1, tx);
	} else if (spa_sync_pass(spa) == 1) {
		/*
		 * Check if the newest entry is full. If it is, make a new one.
		 * We only do this once per sync because we could overfill a
		 * sublist in one sync pass and don't want to add another entry
		 * for a txg that is already represented. This ensures that
		 * blkptrs born in the same txg are stored in the same sublist.
		 */
		bpobj_t bpobj = last->dle_bpobj;
		uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs;
		uint64_t free = bpobj.bpo_phys->bpo_num_freed;
		/* "alloc" is the number of entries not cancelled by a FREE */
		uint64_t alloc = all - free;
		if (alloc > zfs_livelist_max_entries) {
			dsl_deadlist_add_key(&dd->dd_livelist,
			    tx->tx_txg - 1, tx);
		}
	}

	/* Insert each entry into the on-disk livelist */
	bplist_iterate(&dd->dd_pending_allocs,
	    dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx);
	bplist_iterate(&dd->dd_pending_frees,
	    dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx);

	/* Attempt to condense every pair of adjacent entries */
	try_condense_arg_t arg = {
	    .spa = spa,
	    .ds = ds
	};
	dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense,
	    &arg);
}

/*
 * Completion work for a dataset in transaction 'tx': moves
 * ds_pending_deadlist into ds_deadlist, flushes the livelist when it is open
 * (removing it if dsl_livelist_should_disable() says so), calls
 * dsl_bookmark_sync_done(), destroys the objset's os_synced_dnodes multilist,
 * clears os_next_write_raw for this txg on encrypted objsets, and activates
 * any feature recorded in ds_feature_activation that is not yet active in
 * ds_feature.
 */
void
dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	objset_t *os = ds->ds_objset;

	bplist_iterate(&ds->ds_pending_deadlist,
	    dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx);

	if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
		dsl_flush_pending_livelist(ds, tx);
		if (dsl_livelist_should_disable(ds)) {
			dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE);
		}
	}

	dsl_bookmark_sync_done(ds, tx);

	multilist_destroy(&os->os_synced_dnodes);

	if (os->os_encrypted)
		os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE;
	else
		ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]);

	/*
	 * NOTE(review): the leading '-' and '+' characters below are
	 * unified-diff markers carried along with this text, not C operators.
	 * The change they describe drops the dmu_buf_rele() of ds_dbuf from
	 * this function and moves the "objset is not dirty" assertion after
	 * the feature-activation loop.
	 */
-	ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
-
-	dmu_buf_rele(ds->ds_dbuf, ds);
-
	for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
		if (zfeature_active(f, ds->ds_feature_activation[f])) {
			/* already active on the dataset: nothing to do */
			if (zfeature_active(f, ds->ds_feature[f]))
				continue;
			dsl_dataset_activate_feature(ds->ds_object, f,
			    ds->ds_feature_activation[f], tx);
			ds->ds_feature[f] = ds->ds_feature_activation[f];
		}
	}
+
+	ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
}

/*
 * For every entry in ds_next_clones_obj, add the dsl_dir name of the dataset
 * it refers to to 'val' as a boolean nvpair.
 *
 * Returns ENOENT, before adding anything, when the number of entries in
 * ds_next_clones_obj does not equal ds_num_children - 1; returns 0 otherwise.
 * Asserts that the pool config is held.
 */
int
get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)
{
	uint64_t count = 0;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	zap_cursor_t zc;
	zap_attribute_t za;

	ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));

	/*
	 * There may be missing entries in ds_next_clones_obj
	 * due to a bug in a previous version of the code.
	 * Only trust it if it has the right number of entries.
	 */
	if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
		VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
		    &count));
	}
	if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
		return (SET_ERROR(ENOENT));
	}
	for (zap_cursor_init(&zc, mos,
	    dsl_dataset_phys(ds)->ds_next_clones_obj);
	    zap_cursor_retrieve(&zc, &za) == 0;
	    zap_cursor_advance(&zc)) {
		dsl_dataset_t *clone;
		char buf[ZFS_MAX_DATASET_NAME_LEN];
		/* za_first_integer is used as the clone's dataset object */
		VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
		    za.za_first_integer, FTAG, &clone));
		dsl_dir_name(clone->ds_dir, buf);
		fnvlist_add_boolean(val, buf);
		dsl_dataset_rele(clone, FTAG);
	}
	zap_cursor_fini(&zc);
	return (0);
}

/*
 * Add the clone list of 'ds' to 'nv' under the ZFS_PROP_CLONES property name,
 * wrapped in an nvlist as its ZPROP_VALUE entry. Nothing is added to 'nv'
 * when get_clones_stat_impl() fails. Both temporary nvlists are freed before
 * returning.
 */
void
get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
{
	nvlist_t *propval = fnvlist_alloc();
	nvlist_t *val = fnvlist_alloc();

	if (get_clones_stat_impl(ds, val) == 0) {
		fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
		fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
		    propval);
	}

	nvlist_free(val);
	nvlist_free(propval);
}

/*
 * Returns a string that represents the receive resume stats token. It should
 * be freed with strfree().
*/
/*
 * Implementation notes for get_receive_resume_stats_impl():
 *
 * When the dataset has no resume-receive state an empty string is returned.
 * Otherwise the DS_FIELD_RESUME_* entries found on the dataset object are
 * collected into an nvlist, which is packed, gzip-compressed (level 6) and
 * hex-encoded. The result has the form
 *	<ZFS_SEND_RESUME_TOKEN_VERSION>-<cksum word 0>-<packed size>-<hex data>
 * where the checksum is a fletcher-4 of the compressed bytes.
 *
 * Either way the returned string is allocated by kmem_strdup() or
 * kmem_asprintf(); callers in this file release it with kmem_strfree().
 */
char *
get_receive_resume_stats_impl(dsl_dataset_t *ds)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	if (!dsl_dataset_has_resume_receive_state(ds))
		return (kmem_strdup(""));

	objset_t *mos = dp->dp_meta_objset;

	/* uint64 resume fields and the token key each is published under */
	const struct {
		const char *field;
		const char *key;
	} u64_fields[] = {
		{ DS_FIELD_RESUME_FROMGUID,	"fromguid" },
		{ DS_FIELD_RESUME_OBJECT,	"object" },
		{ DS_FIELD_RESUME_OFFSET,	"offset" },
		{ DS_FIELD_RESUME_BYTES,	"bytes" },
		{ DS_FIELD_RESUME_TOGUID,	"toguid" },
	};
	/* presence-only resume fields and their boolean token keys */
	const struct {
		const char *field;
		const char *key;
	} flag_fields[] = {
		{ DS_FIELD_RESUME_LARGEBLOCK,	"largeblockok" },
		{ DS_FIELD_RESUME_EMBEDOK,	"embedok" },
		{ DS_FIELD_RESUME_COMPRESSOK,	"compressok" },
		{ DS_FIELD_RESUME_RAWOK,	"rawok" },
	};

	nvlist_t *token_nv = fnvlist_alloc();
	uint64_t val;

	/* Fields that are missing from the ZAP are simply left out. */
	for (size_t i = 0;
	    i < sizeof (u64_fields) / sizeof (u64_fields[0]); i++) {
		if (zap_lookup(mos, ds->ds_object, u64_fields[i].field,
		    sizeof (val), 1, &val) == 0) {
			fnvlist_add_uint64(token_nv, u64_fields[i].key, val);
		}
	}

	char buf[MAXNAMELEN];
	if (zap_lookup(mos, ds->ds_object,
	    DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) {
		fnvlist_add_string(token_nv, "toname", buf);
	}

	for (size_t i = 0;
	    i < sizeof (flag_fields) / sizeof (flag_fields[0]); i++) {
		if (zap_contains(mos, ds->ds_object,
		    flag_fields[i].field) == 0) {
			fnvlist_add_boolean(token_nv, flag_fields[i].key);
		}
	}

	if (dsl_dataset_feature_is_active(ds,
	    SPA_FEATURE_REDACTED_DATASETS)) {
		uint64_t num_redact_snaps;
		uint64_t *redact_snaps;
		VERIFY(dsl_dataset_get_uint64_array_feature(ds,
		    SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps,
		    &redact_snaps));
		fnvlist_add_uint64_array(token_nv, "redact_snaps",
		    redact_snaps, num_redact_snaps);
	}

	if (zap_contains(mos, ds->ds_object,
	    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) {
		uint64_t num_redact_snaps, int_size;
		uint64_t *redact_snaps;
		VERIFY0(zap_length(mos, ds->ds_object,
		    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size,
		    &num_redact_snaps));
		ASSERT3U(int_size, ==, sizeof (uint64_t));

		/* temporary copy of the array, freed once it is in the nvlist */
		redact_snaps = kmem_alloc(int_size * num_redact_snaps,
		    KM_SLEEP);
		VERIFY0(zap_lookup(mos, ds->ds_object,
		    DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size,
		    num_redact_snaps, redact_snaps));
		fnvlist_add_uint64_array(token_nv, "book_redact_snaps",
		    redact_snaps, num_redact_snaps);
		kmem_free(redact_snaps, int_size * num_redact_snaps);
	}

	size_t packed_size;
	void *packed = fnvlist_pack(token_nv, &packed_size);
	fnvlist_free(token_nv);

	/* The output buffer is sized like the input (packed_size bytes). */
	uint8_t *compressed = kmem_alloc(packed_size, KM_SLEEP);
	size_t compressed_size = gzip_compress(packed, compressed,
	    packed_size, packed_size, 6);

	zio_cksum_t cksum;
	fletcher_4_native_varsize(compressed, compressed_size, &cksum);

	/* Two hex digits per compressed byte plus the terminating NUL. */
	size_t alloc_size = compressed_size * 2 + 1;
	char *str = kmem_alloc(alloc_size, KM_SLEEP);
	/* size_t index: it is compared with and scaled into size_t values */
	for (size_t i = 0; i < compressed_size; i++) {
		size_t offset = i * 2;
		(void) snprintf(str + offset, alloc_size - offset,
		    "%02x", compressed[i]);
	}
	str[compressed_size * 2] = '\0';

	char *propval = kmem_asprintf("%u-%llx-%llx-%s",
	    ZFS_SEND_RESUME_TOKEN_VERSION,
	    (longlong_t)cksum.zc_word[0],
	    (longlong_t)packed_size, str);

	kmem_free(packed, packed_size);
	kmem_free(str, alloc_size);
	/* 'compressed' was allocated with packed_size, not compressed_size */
	kmem_free(compressed, packed_size);
	return (propval);
}

/*
 * Returns a string that represents the receive resume stats token of the
 * dataset's child. It should be freed with kmem_strfree().
*/ char * get_child_receive_stats(dsl_dataset_t *ds) { char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; dsl_dataset_t *recv_ds; dsl_dataset_name(ds, recvname); if (strlcat(recvname, "/", sizeof (recvname)) < sizeof (recvname) && strlcat(recvname, recv_clone_name, sizeof (recvname)) < sizeof (recvname) && dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG, &recv_ds) == 0) { char *propval = get_receive_resume_stats_impl(recv_ds); dsl_dataset_rele(recv_ds, FTAG); return (propval); } return (kmem_strdup("")); } static void get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) { char *propval = get_receive_resume_stats_impl(ds); if (strcmp(propval, "") != 0) { dsl_prop_nvlist_add_string(nv, ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); } else { char *childval = get_child_receive_stats(ds); if (strcmp(childval, "") != 0) { dsl_prop_nvlist_add_string(nv, ZFS_PROP_RECEIVE_RESUME_TOKEN, childval); } kmem_strfree(childval); } kmem_strfree(propval); } uint64_t dsl_get_refratio(dsl_dataset_t *ds) { uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 
100 : (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 / dsl_dataset_phys(ds)->ds_compressed_bytes); return (ratio); } uint64_t dsl_get_logicalreferenced(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_uncompressed_bytes); } uint64_t dsl_get_compressratio(dsl_dataset_t *ds) { if (ds->ds_is_snapshot) { return (dsl_get_refratio(ds)); } else { dsl_dir_t *dd = ds->ds_dir; mutex_enter(&dd->dd_lock); uint64_t val = dsl_dir_get_compressratio(dd); mutex_exit(&dd->dd_lock); return (val); } } uint64_t dsl_get_used(dsl_dataset_t *ds) { if (ds->ds_is_snapshot) { return (dsl_dataset_phys(ds)->ds_unique_bytes); } else { dsl_dir_t *dd = ds->ds_dir; mutex_enter(&dd->dd_lock); uint64_t val = dsl_dir_get_used(dd); mutex_exit(&dd->dd_lock); return (val); } } uint64_t dsl_get_creation(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_creation_time); } uint64_t dsl_get_creationtxg(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_creation_txg); } uint64_t dsl_get_refquota(dsl_dataset_t *ds) { return (ds->ds_quota); } uint64_t dsl_get_refreservation(dsl_dataset_t *ds) { return (ds->ds_reserved); } uint64_t dsl_get_guid(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_guid); } uint64_t dsl_get_unique(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_unique_bytes); } uint64_t dsl_get_objsetid(dsl_dataset_t *ds) { return (ds->ds_object); } uint64_t dsl_get_userrefs(dsl_dataset_t *ds) { return (ds->ds_userrefs); } uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds) { return (DS_IS_DEFER_DESTROY(ds) ? 1 : 0); } uint64_t dsl_get_referenced(dsl_dataset_t *ds) { return (dsl_dataset_phys(ds)->ds_referenced_bytes); } uint64_t dsl_get_numclones(dsl_dataset_t *ds) { ASSERT(ds->ds_is_snapshot); return (dsl_dataset_phys(ds)->ds_num_children - 1); } uint64_t dsl_get_inconsistent(dsl_dataset_t *ds) { return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ? 
1 : 0); } uint64_t dsl_get_redacted(dsl_dataset_t *ds) { return (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)); } uint64_t dsl_get_available(dsl_dataset_t *ds) { uint64_t refdbytes = dsl_get_referenced(ds); uint64_t availbytes = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { availbytes += ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; } if (ds->ds_quota != 0) { /* * Adjust available bytes according to refquota */ if (refdbytes < ds->ds_quota) { availbytes = MIN(availbytes, ds->ds_quota - refdbytes); } else { availbytes = 0; } } return (availbytes); } int dsl_get_written(dsl_dataset_t *ds, uint64_t *written) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_dataset_t *prev; int err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err == 0) { uint64_t comp, uncomp; err = dsl_dataset_space_written(prev, ds, written, &comp, &uncomp); dsl_dataset_rele(prev, FTAG); } return (err); } /* * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN. */ int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap) { dsl_pool_t *dp = ds->ds_dir->dd_pool; if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) { dsl_dataset_name(ds->ds_prev, snap); return (0); } else { return (SET_ERROR(ENOENT)); } } void dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval) { uint64_t nsnaps; uint64_t *snaps; if (dsl_dataset_get_uint64_array_feature(ds, SPA_FEATURE_REDACTED_DATASETS, &nsnaps, &snaps)) { fnvlist_add_uint64_array(propval, ZPROP_VALUE, snaps, nsnaps); } } /* * Returns the mountpoint property and source for the given dataset in the value * and source buffers. The value buffer must be at least as large as MAXPATHLEN * and the source buffer as least as large a ZFS_MAX_DATASET_NAME_LEN. * Returns 0 on success and an error on failure. 
*/
/*
 * Implementation notes for dsl_get_mountpoint():
 *
 * 'value' is used both as the buffer the stored property is read into and as
 * the buffer the final path is formatted into; up to ZAP_MAXVALUELEN bytes
 * are written to it. Because the mountpoint component of the final path
 * lives inside 'value', it is duplicated before formatting: passing a
 * pointer into the destination buffer as a "%s" argument to snprintf() is
 * undefined behavior.
 *
 * Only values starting with '/' are rewritten; anything else (e.g. 'legacy'
 * or 'none') is returned as stored.
 */
int
dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
    char *source)
{
	int error;
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	/* Retrieve the mountpoint value stored in the zap object */
	error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
	    ZAP_MAXVALUELEN, value, source);
	if (error != 0) {
		return (error);
	}

	/*
	 * Process the dsname and source to find the full mountpoint string.
	 * Can be skipped for 'legacy' or 'none'.
	 */
	if (value[0] == '/') {
		/* 'buf' owns the allocation; 'root' may be advanced past "/" */
		char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
		char *root = buf;
		const char *relpath;

		/*
		 * If we inherit the mountpoint, even from a dataset
		 * with a received value, the source will be the path of
		 * the dataset we inherit from. If source is
		 * ZPROP_SOURCE_VAL_RECVD, the received value is not
		 * inherited.
		 */
		if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) {
			relpath = "";
		} else {
			ASSERT0(strncmp(dsname, source, strlen(source)));
			relpath = dsname + strlen(source);
			if (relpath[0] == '/')
				relpath++;
		}

		spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN);

		/*
		 * Special case an alternate root of '/'. This will
		 * avoid having multiple leading slashes in the
		 * mountpoint path.
		 */
		if (strcmp(root, "/") == 0)
			root++;

		/*
		 * If the mountpoint is '/' then skip over this
		 * if we are obtaining either an alternate root or
		 * an inherited mountpoint.
		 */
		const char *stored = value;
		if (value[1] == '\0' && (root[0] != '\0' ||
		    relpath[0] != '\0'))
			stored = value + 1;

		/*
		 * 'stored' points into 'value', which is also the
		 * snprintf() destination below; format from a private copy
		 * so that source and destination never overlap.
		 */
		char *mnt = kmem_strdup(stored);

		if (relpath[0] == '\0') {
			(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s",
			    root, mnt);
		} else {
			/* a relpath starting with '@' is appended as-is */
			(void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s",
			    root, mnt, relpath[0] == '@' ? "" : "/",
			    relpath);
		}

		kmem_strfree(mnt);
		kmem_free(buf, ZAP_MAXVALUELEN);
	}

	return (0);
}

void
dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;

	ASSERT(dsl_pool_config_held(dp));

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO,
	    dsl_get_refratio(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED,
	    dsl_get_logicalreferenced(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO,
	    dsl_get_compressratio(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED,
	    dsl_get_used(ds));

	if (ds->ds_is_snapshot) {
		get_clones_stat(ds, nv);
	} else {
		char buf[ZFS_MAX_DATASET_NAME_LEN];
		if (dsl_get_prev_snap(ds, buf) == 0)
			dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP,
			    buf);
		dsl_dir_stats(ds->ds_dir, nv);
	}

	nvlist_t *propval = fnvlist_alloc();
	dsl_get_redact_snaps(ds, propval);
	fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS),
	    propval);
	nvlist_free(propval);

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE,
	    dsl_get_available(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED,
	    dsl_get_referenced(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION,
	    dsl_get_creation(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG,
	    dsl_get_creationtxg(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA,
	    dsl_get_refquota(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION,
	    dsl_get_refreservation(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID,
	    dsl_get_guid(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE,
	    dsl_get_unique(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID,
	    dsl_get_objsetid(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS,
	    dsl_get_userrefs(ds));
	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY,
	    dsl_get_defer_destroy(ds));
	dsl_dataset_crypt_stats(ds, nv);

	if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		uint64_t written;
		if (dsl_get_written(ds, &written) == 0) {
			dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN,
			    written);
		}
	}

	if (!dsl_dataset_is_snapshot(ds)) {
		/*
		 * A failed "newfs" (e.g.
full) resumable receive leaves * the stats set on this dataset. Check here for the prop. */ get_receive_resume_stats(ds, nv); /* * A failed incremental resumable receive leaves the * stats set on our child named "%recv". Check the child * for the prop. */ /* 6 extra bytes for /%recv */ char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; dsl_dataset_t *recv_ds; dsl_dataset_name(ds, recvname); if (strlcat(recvname, "/", sizeof (recvname)) < sizeof (recvname) && strlcat(recvname, recv_clone_name, sizeof (recvname)) < sizeof (recvname) && dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { get_receive_resume_stats(recv_ds, nv); dsl_dataset_rele(recv_ds, FTAG); } } } void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); stat->dds_creation_txg = dsl_get_creationtxg(ds); stat->dds_inconsistent = dsl_get_inconsistent(ds); stat->dds_guid = dsl_get_guid(ds); stat->dds_redacted = dsl_get_redacted(ds); stat->dds_origin[0] = '\0'; if (ds->ds_is_snapshot) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = dsl_get_numclones(ds); } else { stat->dds_is_snapshot = B_FALSE; stat->dds_num_clones = 0; if (dsl_dir_is_clone(ds->ds_dir)) { dsl_dir_get_origin(ds->ds_dir, stat->dds_origin); } } } uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds) { return (ds->ds_fsid_guid); } void dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) *availbytesp += ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; if (ds->ds_quota != 0) { /* * Adjust available bytes according to refquota */ if (*refdbytesp < ds->ds_quota) *availbytesp = MIN(*availbytesp, ds->ds_quota - *refdbytesp); else *availbytesp = 0; } rrw_enter(&ds->ds_bp_rwlock, 
RW_READER, FTAG);
	*usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
	/* Whatever is not used out of DN_MAX_OBJECT is reported as free. */
	*availobjsp = DN_MAX_OBJECT - *usedobjsp;
}

/*
 * Return B_TRUE if ds looks modified relative to snap, B_FALSE otherwise.
 *
 * A NULL snap yields B_FALSE. If the birth txg of ds's block pointer is not
 * newer than snap's creation txg the answer is B_FALSE. Otherwise the meta
 * dnodes of the two objsets are compared byte-for-byte; failure to open
 * either objset is reported as "modified".
 *
 * Asserts that the pool config is held.
 */
boolean_t
dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap)
{
	dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool;
	uint64_t birth;

	ASSERT(dsl_pool_config_held(dp));
	if (snap == NULL)
		return (B_FALSE);
	/* The block pointer is read under ds_bp_rwlock held as reader. */
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	birth = dsl_dataset_get_blkptr(ds)->blk_birth;
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
	if (birth > dsl_dataset_phys(snap)->ds_creation_txg) {
		objset_t *os, *os_snap;
		/*
		 * It may be that only the ZIL differs, because it was
		 * reset in the head. Don't count that as being
		 * modified.
		 */
		if (dmu_objset_from_ds(ds, &os) != 0)
			return (B_TRUE);
		if (dmu_objset_from_ds(snap, &os_snap) != 0)
			return (B_TRUE);
		return (bcmp(&os->os_phys->os_meta_dnode,
		    &os_snap->os_phys->os_meta_dnode,
		    sizeof (os->os_phys->os_meta_dnode)) != 0);
	}
	return (B_FALSE);
}

/*
 * Argument bundle shared by the check and sync halves of the
 * rename-snapshot sync task.
 */
typedef struct dsl_dataset_rename_snapshot_arg {
	const char *ddrsa_fsname;	/* dataset whose snapshot is renamed */
	const char *ddrsa_oldsnapname;	/* current snapshot name */
	const char *ddrsa_newsnapname;	/* requested snapshot name */
	boolean_t ddrsa_recursive;	/* also visit child datasets */
	dmu_tx_t *ddrsa_tx;		/* set by the sync function */
} dsl_dataset_rename_snapshot_arg_t;

/*
 * Per-dataset check for a snapshot rename on head dataset hds.
 *
 * A missing old snapshot is not an error. Otherwise the new name must not
 * already exist (EEXIST) and the resulting full name must fit in
 * ZFS_MAX_DATASET_NAME_LEN (ENAMETOOLONG). dp is unused.
 */
static int
dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp,
    dsl_dataset_t *hds, void *arg)
{
	(void) dp;
	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
	int error;
	uint64_t val;

	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
	if (error != 0) {
		/* ignore nonexistent snapshots */
		return (error == ENOENT ?
0 : error);
	}

	/* new name should not exist */
	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val);
	if (error == 0)
		error = SET_ERROR(EEXIST);
	else if (error == ENOENT)
		error = 0;

	/* dataset name + 1 for the "@" + the new snapshot name must fit */
	if (dsl_dir_namelen(hds->ds_dir) + 1 +
	    strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN)
		error = SET_ERROR(ENAMETOOLONG);

	return (error);
}

/*
 * Check function of the rename-snapshot sync task.
 *
 * Holds the dataset named by ddrsa_fsname and runs the per-dataset check on
 * it, or on it and its children (DS_FIND_CHILDREN) for a recursive rename.
 * Returns the hold error or the first check error.
 */
static int
dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	int error;

	error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds);
	if (error != 0)
		return (error);

	if (ddrsa->ddrsa_recursive) {
		error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
		    dsl_dataset_rename_snapshot_check_impl, ddrsa,
		    DS_FIND_CHILDREN);
	} else {
		error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa);
	}
	dsl_dataset_rele(hds, FTAG);
	return (error);
}

/*
 * Per-dataset sync step of a snapshot rename on head dataset hds.
 *
 * Does nothing when the old snapshot does not exist. Otherwise it logs the
 * rename, removes the old name entry, updates ds_snapname under ds_lock,
 * adds the new name to the snapnames ZAP and renames the zvol minors.
 * Always returns 0; failures of the individual steps trip VERIFY0.
 */
static int
dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp,
    dsl_dataset_t *hds, void *arg)
{
	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
	dsl_dataset_t *ds;
	uint64_t val;
	dmu_tx_t *tx = ddrsa->ddrsa_tx;
	int error;

	error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val);
	ASSERT(error == 0 || error == ENOENT);
	if (error == ENOENT) {
		/* ignore nonexistent snapshots */
		return (0);
	}

	VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds));

	/* log before we change the name */
	spa_history_log_internal_ds(ds, "rename", tx,
	    "-> @%s", ddrsa->ddrsa_newsnapname);

	VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx,
	    B_FALSE));
	/* The in-core snapshot name is only rewritten with ds_lock held. */
	mutex_enter(&ds->ds_lock);
	(void) strlcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname,
	    sizeof (ds->ds_snapname));
	mutex_exit(&ds->ds_lock);
	VERIFY0(zap_add(dp->dp_meta_objset,
	    dsl_dataset_phys(hds)->ds_snapnames_zapobj,
	    ds->ds_snapname, 8, 1, &ds->ds_object, tx));
	zvol_rename_minors(dp->dp_spa, ddrsa->ddrsa_oldsnapname,
	    ddrsa->ddrsa_newsnapname, B_TRUE);

	dsl_dataset_rele(ds,
FTAG);
	return (0);
}

/*
 * Sync function of the rename-snapshot sync task.
 *
 * Stores tx in the argument bundle so the per-dataset step can use it, then
 * applies that step to the named dataset, or to it and its children for a
 * recursive rename. Every step is wrapped in VERIFY0.
 */
static void
dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rename_snapshot_arg_t *ddrsa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds = NULL;

	VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds));
	ddrsa->ddrsa_tx = tx;
	if (ddrsa->ddrsa_recursive) {
		VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object,
		    dsl_dataset_rename_snapshot_sync_impl, ddrsa,
		    DS_FIND_CHILDREN));
	} else {
		VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa));
	}
	dsl_dataset_rele(hds, FTAG);
}

/*
 * Rename snapshot oldsnapname of fsname to newsnapname, optionally doing the
 * same for all child datasets (recursive).
 *
 * Runs the check/sync pair above as a sync task and returns its result.
 */
int
dsl_dataset_rename_snapshot(const char *fsname,
    const char *oldsnapname, const char *newsnapname, boolean_t recursive)
{
	dsl_dataset_rename_snapshot_arg_t ddrsa;

	/* ddrsa_tx is left unset here; the sync function fills it in. */
	ddrsa.ddrsa_fsname = fsname;
	ddrsa.ddrsa_oldsnapname = oldsnapname;
	ddrsa.ddrsa_newsnapname = newsnapname;
	ddrsa.ddrsa_recursive = recursive;

	return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check,
	    dsl_dataset_rename_snapshot_sync, &ddrsa,
	    1, ZFS_SPACE_CHECK_RESERVED));
}

/*
 * If we're doing an ownership handoff, we need to make sure that there is
 * only one long hold on the dataset. We're not allowed to change anything here
 * so we don't permanently release the long hold or regular hold here. We want
 * to do this only when syncing to avoid the dataset unexpectedly going away
 * when we release the long hold.
 *
 * Returns 0 when tx is not syncing or no unexpected long holds are found,
 * EBUSY otherwise.
 */
static int
dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
{
	boolean_t held = B_FALSE;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	dsl_dir_t *dd = ds->ds_dir;
	mutex_enter(&dd->dd_activity_lock);
	/* One long hold is discounted when an owner was passed in. */
	uint64_t holds = zfs_refcount_count(&ds->ds_longholds) -
	    (owner != NULL ? 1 : 0);
	/*
	 * The value of dd_activity_waiters can chance as soon as we drop the
	 * lock, but we're fine with that; new waiters coming in or old
	 * waiters leaving doesn't cause problems, since we're going to cancel
	 * waiters later anyway.
The goal of this check is to verify that no
	 * non-waiters have long-holds, and all new long-holds will be
	 * prevented because we're holding the pool config as writer.
	 */
	if (holds != dd->dd_activity_waiters)
		held = B_TRUE;
	mutex_exit(&dd->dd_activity_lock);

	if (held)
		return (SET_ERROR(EBUSY));

	return (0);
}

/*
 * Check function of the rollback sync task (arg is a
 * dsl_dataset_rollback_arg_t).
 *
 * Returns 0 when the rollback may proceed, otherwise:
 *   the dsl_dataset_hold() error if ddra_fsname cannot be held
 *   EINVAL - the dataset is a snapshot
 *   ESRCH  - there is no previous snapshot, or ddra_tosnap does not exist
 *            or is not a valid rollback target
 *   EAGAIN - syncing, and the previous snapshot is from this txg or later
 *   EEXIST - ddra_tosnap is not the latest snapshot, or a bookmark is newer
 *            than the latest snapshot
 *   the dsl_dataset_handoff_check() error (EBUSY) for unexpected long holds
 *   EDQUOT - the previous snapshot references more than the refquota
 *   ENOSPC - not enough space for the temporary refreservation charge
 * The hold on the dataset is dropped on every path.
 */
int
dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rollback_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int64_t unused_refres_delta;
	int error;

	error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
	if (error != 0)
		return (error);

	/* must not be a snapshot */
	if (ds->ds_is_snapshot) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* must have a most recent snapshot */
	if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(ESRCH));
	}

	/*
	 * No rollback to a snapshot created in the current txg, because
	 * the rollback may dirty the dataset and create blocks that are
	 * not reachable from the rootbp while having a birth txg that
	 * falls into the snapshot's range.
	 */
	if (dmu_tx_is_syncing(tx) &&
	    dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EAGAIN));
	}

	/*
	 * If the expected target snapshot is specified, then check that
	 * the latest snapshot is it.
	 */
	if (ddra->ddra_tosnap != NULL) {
		dsl_dataset_t *snapds;

		/* Check if the target snapshot exists at all. */
		error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);
		if (error != 0) {
			/*
			 * ESRCH is used to signal that the target snapshot does
			 * not exist, while ENOENT is used to report that
			 * the rolled back dataset does not exist.
			 * ESRCH is also used to cover other cases where the
			 * target snapshot is not related to the dataset being
			 * rolled back such as being in a different pool.
			 */
			if (error == ENOENT || error == EXDEV)
				error = SET_ERROR(ESRCH);
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}
		ASSERT(snapds->ds_is_snapshot);

		/* Check if the snapshot is the latest snapshot indeed. */
		if (snapds != ds->ds_prev) {
			/*
			 * Distinguish between the case where the only problem
			 * is intervening snapshots (EEXIST) vs the snapshot
			 * not being a valid target for rollback (ESRCH).
			 */
			if (snapds->ds_dir == ds->ds_dir ||
			    (dsl_dir_is_clone(ds->ds_dir) &&
			    dsl_dir_phys(ds->ds_dir)->dd_origin_obj ==
			    snapds->ds_object)) {
				error = SET_ERROR(EEXIST);
			} else {
				error = SET_ERROR(ESRCH);
			}
			dsl_dataset_rele(snapds, FTAG);
			dsl_dataset_rele(ds, FTAG);
			return (error);
		}
		dsl_dataset_rele(snapds, FTAG);
	}

	/* must not have any bookmarks after the most recent snapshot */
	if (dsl_bookmark_latest_txg(ds) >
	    dsl_dataset_phys(ds)->ds_prev_snap_txg) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EEXIST));
	}

	error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	/*
	 * Check if the snap we are rolling back to uses more than
	 * the refquota.
	 */
	if (ds->ds_quota != 0 &&
	    dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EDQUOT));
	}

	/*
	 * When we do the clone swap, we will temporarily use more space
	 * due to the refreservation (the head will no longer have any
	 * unique space, so the entire amount of the refreservation will need
	 * to be free). We will immediately destroy the clone, freeing
	 * this space, but the freeing happens over many txg's.
	 */
	unused_refres_delta = (int64_t)MIN(ds->ds_reserved,
	    dsl_dataset_phys(ds)->ds_unique_bytes);

	if (unused_refres_delta > 0 &&
	    unused_refres_delta >
	    dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(ENOSPC));
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

/*
 * Sync function of the rollback sync task.
 *
 * Records the name of ds->ds_prev under "target" in ddra_result, creates a
 * "%rollback" clone of ds->ds_prev, swaps that clone with the head via
 * dsl_dataset_clone_swap_sync_impl(), calls dsl_dataset_zero_zil() on the
 * head and finally destroys the clone.
 */
void
dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_rollback_arg_t *ddra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds, *clone;
	uint64_t cloneobj;
	char namebuf[ZFS_MAX_DATASET_NAME_LEN];

	VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds));

	dsl_dataset_name(ds->ds_prev, namebuf);
	fnvlist_add_string(ddra->ddra_result, "target", namebuf);

	cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback",
	    ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx);

	VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone));

	dsl_dataset_clone_swap_sync_impl(clone, ds, tx);
	dsl_dataset_zero_zil(ds, tx);

	dsl_destroy_head_sync_impl(clone, tx);

	dsl_dataset_rele(clone, FTAG);
	dsl_dataset_rele(ds, FTAG);
}

/*
 * Rolls back the given filesystem or volume to the most recent snapshot.
 * The name of the most recent snapshot will be returned under key "target"
 * in the result nvlist.
 *
 * If owner != NULL:
 * - The existing dataset MUST be owned by the specified owner at entry
 * - Upon return, dataset will still be held by the same owner, whether we
 *   succeed or not.
 *
 * This mode is required any time the existing filesystem is mounted. See
 * notes above zfs_suspend_fs() for further details.
*/
int
dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
    nvlist_t *result)
{
	dsl_dataset_rollback_arg_t ddra;

	/* Bundle the arguments for the check/sync pair of the sync task. */
	ddra.ddra_fsname = fsname;
	ddra.ddra_tosnap = tosnap;
	ddra.ddra_owner = owner;
	ddra.ddra_result = result;

	return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
	    dsl_dataset_rollback_sync, &ddra,
	    1, ZFS_SPACE_CHECK_RESERVED));
}

/*
 * Element of the snapshot lists used by the promote code: a list link plus
 * the dataset the element refers to.
 */
struct promotenode {
	list_node_t link;
	dsl_dataset_t *ds;
};

static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
    void *tag);
static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);

/*
 * Check function of the promote sync task (arg is a
 * dsl_dataset_promote_arg_t).
 *
 * Takes the promote holds, validates the request and, only when tx is
 * syncing, computes the space figures (unique, used, comp, uncomp,
 * cloneusedsnap, originusedsnap) that the sync function consumes. Names of
 * conflicting snapshots and bookmarks are collected in ddpa->err_ds.
 * All holds are released again before returning.
 *
 * Returns 0 on success, otherwise e.g. EXDEV (DS_FLAG_NOPROMOTE set),
 * ENOENT (an expected snapshot list is empty), EBUSY (a shared snapshot is
 * long-held), ENAMETOOLONG, EEXIST (name conflicts), or an error passed up
 * from one of the helpers called.
 */
int
dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_promote_arg_t *ddpa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	struct promotenode *snap;
	int err;
	uint64_t unused;
	uint64_t ss_mv_cnt;
	size_t max_snap_len;
	boolean_t conflicting_snaps;

	err = promote_hold(ddpa, dp, FTAG);
	if (err != 0)
		return (err);

	hds = ddpa->ddpa_clone;
	/* Longest snapshot/bookmark name that still fits after the clone name. */
	max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;

	if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
		promote_rele(ddpa, FTAG);
		return (SET_ERROR(EXDEV));
	}

	snap = list_head(&ddpa->shared_snaps);
	if (snap == NULL) {
		err = SET_ERROR(ENOENT);
		goto out;
	}
	dsl_dataset_t *const origin_ds = snap->ds;

	/*
	 * Encrypted clones share a DSL Crypto Key with their origin's dsl dir.
	 * When doing a promote we must make sure the encryption root for
	 * both the target and the target's origin does not change to avoid
	 * needing to rewrap encryption keys
	 */
	err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir);
	if (err != 0)
		goto out;

	/*
	 * Compute and check the amount of space to transfer. Since this is
	 * so expensive, don't do the preliminary check.
*/
	if (!dmu_tx_is_syncing(tx)) {
		promote_rele(ddpa, FTAG);
		return (0);
	}

	/* compute origin's new unique space */
	snap = list_tail(&ddpa->clone_snaps);
	ASSERT(snap != NULL);
	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
	    origin_ds->ds_object);
	dsl_deadlist_space_range(&snap->ds->ds_deadlist,
	    dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
	    &ddpa->unique, &unused, &unused);

	/*
	 * Walk the snapshots that we are moving
	 *
	 * Compute space to transfer. Consider the incremental changes
	 * to used by each snapshot:
	 * (my used) = (prev's used) + (blocks born) - (blocks killed)
	 * So each snapshot gave birth to:
	 * (blocks born) = (my used) - (prev's used) + (blocks killed)
	 * So a sequence would look like:
	 * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
	 * Which simplifies to:
	 * uN + kN + kN-1 + ... + k1 + k0
	 * Note however, if we stop before we reach the ORIGIN we get:
	 * uN + kN + kN-1 + ... + kM - uM-1
	 */
	conflicting_snaps = B_FALSE;
	ss_mv_cnt = 0;
	ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
	ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
	ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
	for (snap = list_head(&ddpa->shared_snaps); snap;
	    snap = list_next(&ddpa->shared_snaps, snap)) {
		uint64_t val, dlused, dlcomp, dluncomp;
		dsl_dataset_t *ds = snap->ds;

		/* Count of snapshots that will move to the clone's dir. */
		ss_mv_cnt++;

		/*
		 * If there are long holds, we won't be able to evict
		 * the objset.
		 */
		if (dsl_dataset_long_held(ds)) {
			err = SET_ERROR(EBUSY);
			goto out;
		}

		/* Check that the snapshot name does not conflict */
		VERIFY0(dsl_dataset_get_snapname(ds));
		if (strlen(ds->ds_snapname) >= max_snap_len) {
			err = SET_ERROR(ENAMETOOLONG);
			goto out;
		}
		err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val);
		if (err == 0) {
			/* Remember the conflict but keep scanning. */
			fnvlist_add_boolean(ddpa->err_ds,
			    snap->ds->ds_snapname);
			conflicting_snaps = B_TRUE;
		} else if (err != ENOENT) {
			goto out;
		}

		/* The very first snapshot does not have a deadlist */
		if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0)
			continue;

		dsl_deadlist_space(&ds->ds_deadlist,
		    &dlused, &dlcomp, &dluncomp);
		ddpa->used += dlused;
		ddpa->comp += dlcomp;
		ddpa->uncomp += dluncomp;
	}

	/*
	 * Check that bookmarks that are being transferred don't have
	 * name conflicts.
	 */
	for (dsl_bookmark_node_t *dbn = avl_first(&origin_ds->ds_bookmarks);
	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
	    dbn = AVL_NEXT(&origin_ds->ds_bookmarks, dbn)) {
		if (strlen(dbn->dbn_name) >= max_snap_len) {
			err = SET_ERROR(ENAMETOOLONG);
			goto out;
		}
		zfs_bookmark_phys_t bm;
		err = dsl_bookmark_lookup_impl(ddpa->ddpa_clone,
		    dbn->dbn_name, &bm);

		if (err == 0) {
			fnvlist_add_boolean(ddpa->err_ds, dbn->dbn_name);
			conflicting_snaps = B_TRUE;
		} else if (err == ESRCH) {
			/* ESRCH from the lookup is treated as "no conflict". */
			err = 0;
		} else if (err != 0) {
			goto out;
		}
	}

	/*
	 * In order to return the full list of conflicting snapshots, we check
	 * whether there was a conflict after traversing all of them.
	 */
	if (conflicting_snaps) {
		err = SET_ERROR(EEXIST);
		goto out;
	}

	/*
	 * If we are a clone of a clone then we never reached ORIGIN,
	 * so we need to subtract out the clone origin's used space.
	 */
	if (ddpa->origin_origin) {
		ddpa->used -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes;
		ddpa->comp -=
		    dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes;
		ddpa->uncomp -=
		    dsl_dataset_phys(ddpa->origin_origin)->
		    ds_uncompressed_bytes;
	}

	/* Check that there is enough space and limit headroom here */
	err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir,
	    0, ss_mv_cnt, ddpa->used, ddpa->cr, ddpa->proc);
	if (err != 0)
		goto out;

	/*
	 * Compute the amounts of space that will be used by snapshots
	 * after the promotion (for both origin and clone). For each,
	 * it is the amount of space that will be on all of their
	 * deadlists (that was not born before their new origin).
	 */
	if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) {
		uint64_t space;

		/*
		 * Note, typically this will not be a clone of a clone,
		 * so dd_origin_txg will be < TXG_INITIAL, so
		 * these snaplist_space() -> dsl_deadlist_space_range()
		 * calls will be fast because they do not have to
		 * iterate over all bps.
		 */
		snap = list_head(&ddpa->origin_snaps);
		if (snap == NULL) {
			err = SET_ERROR(ENOENT);
			goto out;
		}
		err = snaplist_space(&ddpa->shared_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap);
		if (err != 0)
			goto out;

		err = snaplist_space(&ddpa->clone_snaps,
		    snap->ds->ds_dir->dd_origin_txg, &space);
		if (err != 0)
			goto out;
		ddpa->cloneusedsnap += space;
	}
	if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags &
	    DD_FLAG_USED_BREAKDOWN) {
		err = snaplist_space(&ddpa->origin_snaps,
		    dsl_dataset_phys(origin_ds)->ds_creation_txg,
		    &ddpa->originusedsnap);
		if (err != 0)
			goto out;
	}

out:
	/* Success and failure both drop the holds taken by promote_hold(). */
	promote_rele(ddpa, FTAG);
	return (err);
}

/*
 * Sync function of the promote sync task (arg is a
 * dsl_dataset_promote_arg_t).
 *
 * Re-takes the promote holds, then rewrites the origin links of the two
 * dsl_dirs, moves bookmarks and the shared snapshots into the clone's
 * dsl_dir, and applies the space accounting computed by the check function.
 * Asserts that no conflicts were recorded in ddpa->err_ds.
 */
void
dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_promote_arg_t *ddpa = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *hds;
	struct promotenode *snap;
	dsl_dataset_t *origin_ds;
	dsl_dataset_t *origin_head;
	dsl_dir_t *dd;
	dsl_dir_t *odd = NULL;
	uint64_t oldnext_obj;
	int64_t delta;

	ASSERT(nvlist_empty(ddpa->err_ds));

	VERIFY0(promote_hold(ddpa, dp, FTAG));
	hds = ddpa->ddpa_clone;

	ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE);

	/* Head of shared_snaps is the origin snapshot of the clone. */
	snap = list_head(&ddpa->shared_snaps);
	origin_ds = snap->ds;
	dd = hds->ds_dir;

	/* Head of origin_snaps is the head dataset of the origin's dir. */
	snap = list_head(&ddpa->origin_snaps);
	origin_head = snap->ds;

	/*
	 * We need to explicitly open odd, since origin_ds's dd will be
	 * changing.
*/
	VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object,
	    NULL, FTAG, &odd));

	dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx);

	/* change origin's next snap */
	dmu_buf_will_dirty(origin_ds->ds_dbuf, tx);
	/* Saved so the old successor can be re-registered/skipped below. */
	oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj;
	snap = list_tail(&ddpa->clone_snaps);
	ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
	    origin_ds->ds_object);
	dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object;

	/* change the origin's next clone */
	if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) {
		dsl_dataset_remove_from_next_clones(origin_ds,
		    snap->ds->ds_object, tx);
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dataset_phys(origin_ds)->ds_next_clones_obj,
		    oldnext_obj, tx));
	}

	/* change origin */
	dmu_buf_will_dirty(dd->dd_dbuf, tx);
	ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object);
	/* The clone's dir inherits the origin dir's origin ... */
	dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj;
	dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg;
	/* ... and the old origin dir now originates at origin_ds. */
	dmu_buf_will_dirty(odd->dd_dbuf, tx);
	dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object;
	origin_head->ds_dir->dd_origin_txg =
	    dsl_dataset_phys(origin_ds)->ds_creation_txg;

	/* change dd_clone entries */
	if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
		VERIFY0(zap_remove_int(dp->dp_meta_objset,
		    dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx));
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
		    hds->ds_object, tx));

		VERIFY0(zap_remove_int(dp->dp_meta_objset,
		    dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones,
		    origin_head->ds_object, tx));
		/* Create the clone dir's dd_clones ZAP on first use. */
		if (dsl_dir_phys(dd)->dd_clones == 0) {
			dsl_dir_phys(dd)->dd_clones =
			    zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES,
			    DMU_OT_NONE, 0, tx);
		}
		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx));
	}

	/*
	 * Move bookmarks to this dir.
	 */
	dsl_bookmark_node_t *dbn_next;
	for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks);
	    dbn != NULL && dbn->dbn_phys.zbm_creation_txg <=
	    dsl_dataset_phys(origin_ds)->ds_creation_txg;
	    dbn = dbn_next) {
		/* Fetch the successor first: dbn is removed from the tree. */
		dbn_next = AVL_NEXT(&origin_head->ds_bookmarks, dbn);

		avl_remove(&origin_head->ds_bookmarks, dbn);
		VERIFY0(zap_remove(dp->dp_meta_objset,
		    origin_head->ds_bookmarks_obj, dbn->dbn_name, tx));

		dsl_bookmark_node_add(hds, dbn, tx);
	}

	dsl_bookmark_next_changed(hds, origin_ds, tx);

	/* move snapshots to this dir */
	for (snap = list_head(&ddpa->shared_snaps); snap;
	    snap = list_next(&ddpa->shared_snaps, snap)) {
		dsl_dataset_t *ds = snap->ds;

		/*
		 * Property callbacks are registered to a particular
		 * dsl_dir. Since ours is changing, evict the objset
		 * so that they will be unregistered from the old dsl_dir.
		 */
		if (ds->ds_objset) {
			dmu_objset_evict(ds->ds_objset);
			ds->ds_objset = NULL;
		}

		/* move snap name entry */
		VERIFY0(dsl_dataset_get_snapname(ds));
		VERIFY0(dsl_dataset_snap_remove(origin_head,
		    ds->ds_snapname, tx, B_TRUE));
		VERIFY0(zap_add(dp->dp_meta_objset,
		    dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname,
		    8, 1, &ds->ds_object, tx));
		dsl_fs_ss_count_adjust(hds->ds_dir, 1,
		    DD_FIELD_SNAPSHOT_COUNT, tx);

		/* change containing dsl_dir */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object);
		dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object;
		ASSERT3P(ds->ds_dir, ==, odd);
		/* Swap the snapshot's dir hold (tagged with ds) from odd to dd. */
		dsl_dir_rele(ds->ds_dir, ds);
		VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object,
		    NULL, ds, &ds->ds_dir));

		/* move any clone references */
		if (dsl_dataset_phys(ds)->ds_next_clones_obj &&
		    spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) {
			zap_cursor_t zc;
			zap_attribute_t za;

			for (zap_cursor_init(&zc, dp->dp_meta_objset,
			    dsl_dataset_phys(ds)->ds_next_clones_obj);
			    zap_cursor_retrieve(&zc, &za) == 0;
			    zap_cursor_advance(&zc)) {
				dsl_dataset_t *cnds;
				uint64_t o;

				if (za.za_first_integer == oldnext_obj) {
					/*
					 * We've already moved the
					 * origin's reference.
					 */
					continue;
				}

				VERIFY0(dsl_dataset_hold_obj(dp,
				    za.za_first_integer, FTAG, &cnds));
				o = dsl_dir_phys(cnds->ds_dir)->
				    dd_head_dataset_obj;

				VERIFY0(zap_remove_int(dp->dp_meta_objset,
				    dsl_dir_phys(odd)->dd_clones, o, tx));
				VERIFY0(zap_add_int(dp->dp_meta_objset,
				    dsl_dir_phys(dd)->dd_clones, o, tx));
				dsl_dataset_rele(cnds, FTAG);
			}
			zap_cursor_fini(&zc);
		}

		ASSERT(!dsl_prop_hascb(ds));
	}

	/*
	 * Change space accounting.
	 * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either
	 * both be valid, or both be 0 (resulting in delta == 0). This
	 * is true for each of {clone,origin} independently.
	 */

	/* Clone's dir: snapshot space grows, the rest is charged to the head. */
	delta = ddpa->cloneusedsnap -
	    dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, >=, 0);
	ASSERT3U(ddpa->used, >=, delta);
	dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(dd, DD_USED_HEAD,
	    ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx);

	/* Origin's dir: the mirror image, so both deltas are non-positive. */
	delta = ddpa->originusedsnap -
	    dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP];
	ASSERT3S(delta, <=, 0);
	ASSERT3U(ddpa->used, >=, -delta);
	dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx);
	dsl_dir_diduse_space(odd, DD_USED_HEAD,
	    -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx);

	dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique;

	/*
	 * Since livelists are specific to a clone's origin txg, they
	 * are no longer accurate. Destroy the livelist from the clone being
	 * promoted. If the origin dataset is a clone, destroy its livelist
	 * as well.
	 */
	dsl_dir_remove_livelist(dd, tx, B_TRUE);
	dsl_dir_remove_livelist(odd, tx, B_TRUE);

	/* log history record */
	spa_history_log_internal_ds(hds, "promote", tx, " ");

	dsl_dir_rele(odd, FTAG);
	promote_rele(ddpa, FTAG);
}

/*
 * Make a list of dsl_dataset_t's for the snapshots between first_obj
 * (exclusive) and last_obj (inclusive). The list will be in reverse
 * order (last_obj will be the list_head()). If first_obj == 0, do all
 * snapshots back to this dataset's origin.
*/
static int
snaplist_make(dsl_pool_t *dp,
    uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag)
{
	uint64_t obj = last_obj;

	list_create(l, sizeof (struct promotenode),
	    offsetof(struct promotenode, link));

	/* Walk backwards along ds_prev_snap_obj until first_obj is reached. */
	while (obj != first_obj) {
		dsl_dataset_t *ds;
		struct promotenode *snap;
		int err;

		/*
		 * On failure the nodes added so far stay on the list; the
		 * list is not torn down here.
		 */
		err = dsl_dataset_hold_obj(dp, obj, tag, &ds);
		ASSERT(err != ENOENT);
		if (err != 0)
			return (err);

		/* first_obj == 0: stop at the origin of this dataset's dir. */
		if (first_obj == 0)
			first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj;

		snap = kmem_alloc(sizeof (*snap), KM_SLEEP);
		snap->ds = ds;
		list_insert_tail(l, snap);
		obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
	}

	return (0);
}

/*
 * Store in *spacep the sum, over every dataset on list l, of the "used"
 * value dsl_deadlist_space_range() reports for its deadlist in the range
 * [mintxg, UINT64_MAX]. Always returns 0.
 */
static int
snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep)
{
	struct promotenode *snap;

	*spacep = 0;
	for (snap = list_head(l); snap; snap = list_next(l, snap)) {
		/* comp and uncomp are required outputs but are not used. */
		uint64_t used, comp, uncomp;
		dsl_deadlist_space_range(&snap->ds->ds_deadlist,
		    mintxg, UINT64_MAX, &used, &comp, &uncomp);
		*spacep += used;
	}
	return (0);
}

/*
 * Release every dataset on list l (held with tag), free the list nodes and
 * destroy the list. A NULL list, or one whose head link is not active, is
 * left alone.
 */
static void
snaplist_destroy(list_t *l, void *tag)
{
	struct promotenode *snap;

	if (l == NULL || !list_link_active(&l->list_head))
		return;

	while ((snap = list_tail(l)) != NULL) {
		list_remove(l, snap);
		dsl_dataset_rele(snap->ds, tag);
		kmem_free(snap, sizeof (*snap));
	}
	list_destroy(l);
}

/*
 * Take all holds needed to promote the clone named ddpa->ddpa_clonename:
 * the clone itself, the shared_snaps / clone_snaps / origin_snaps lists and,
 * when the origin's dir has an origin of its own, ddpa->origin_origin.
 *
 * Returns EINVAL if the dataset is a snapshot or not a clone. On any later
 * failure the holds taken so far are dropped through promote_rele().
 */
static int
promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag)
{
	int error;
	dsl_dir_t *dd;
	struct promotenode *snap;

	error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag,
	    &ddpa->ddpa_clone);
	if (error != 0)
		return (error);
	dd = ddpa->ddpa_clone->ds_dir;

	if (ddpa->ddpa_clone->ds_is_snapshot ||
	    !dsl_dir_is_clone(dd)) {
		dsl_dataset_rele(ddpa->ddpa_clone, tag);
		return (SET_ERROR(EINVAL));
	}

	/* Snapshots from the clone's origin back to that dataset's origin. */
	error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj,
	    &ddpa->shared_snaps, tag);
	if (error != 0)
		goto out;

	/* The clone head and its own snapshots. */
	error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object,
	    &ddpa->clone_snaps, tag);
	if (error != 0)
		goto out;

	snap = list_head(&ddpa->shared_snaps);
	ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj);
	error =
snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj,
	    dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj,
	    &ddpa->origin_snaps, tag);
	if (error != 0)
		goto out;

	/* Only a clone-of-a-clone has an origin_origin to hold. */
	if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) {
		error = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj,
		    tag, &ddpa->origin_origin);
		if (error != 0)
			goto out;
	}
out:
	if (error != 0)
		promote_rele(ddpa, tag);
	return (error);
}

/*
 * Drop everything promote_hold() may have taken with tag: the three
 * snapshot lists, origin_origin when set, and the clone itself.
 */
static void
promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag)
{
	snaplist_destroy(&ddpa->shared_snaps, tag);
	snaplist_destroy(&ddpa->clone_snaps, tag);
	snaplist_destroy(&ddpa->origin_snaps, tag);
	if (ddpa->origin_origin != NULL)
		dsl_dataset_rele(ddpa->origin_origin, tag);
	dsl_dataset_rele(ddpa->ddpa_clone, tag);
}

/*
 * Promote a clone.
 *
 * If it fails due to a conflicting snapshot name, "conflsnap" will be filled
 * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.)
 */
int
dsl_dataset_promote(const char *name, char *conflsnap)
{
	dsl_dataset_promote_arg_t ddpa = { 0 };
	uint64_t numsnaps;
	int error;
	nvpair_t *snap_pair;
	objset_t *os;

	/*
	 * We will modify space proportional to the number of
	 * snapshots. Compute numsnaps.
	 */
	error = dmu_objset_hold(name, FTAG, &os);
	if (error != 0)
		return (error);
	error = zap_count(dmu_objset_pool(os)->dp_meta_objset,
	    dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj,
	    &numsnaps);
	dmu_objset_rele(os, FTAG);
	if (error != 0)
		return (error);

	ddpa.ddpa_clonename = name;
	/* Collects the names of conflicting snapshots/bookmarks. */
	ddpa.err_ds = fnvlist_alloc();
	ddpa.cr = CRED();
	ddpa.proc = curproc;

	error = dsl_sync_task(name, dsl_dataset_promote_check,
	    dsl_dataset_promote_sync, &ddpa,
	    2 + numsnaps, ZFS_SPACE_CHECK_RESERVED);

	/*
	 * Return the first conflicting snapshot found.
*/
	snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL);
	if (snap_pair != NULL && conflsnap != NULL)
		(void) strlcpy(conflsnap, nvpair_name(snap_pair),
		    ZFS_MAX_DATASET_NAME_LEN);

	fnvlist_free(ddpa.err_ds);
	return (error);
}

/*
 * Check whether clone may be swapped with origin_head.
 *
 * Returns 0 if the swap is allowed, otherwise:
 *   EINVAL  - either dataset is a snapshot, the branch points differ (and
 *             !force), or the two datasets are not related as required
 *   ETXTBSY - origin_head was modified since its latest snapshot (and !force)
 *   EBUSY   - dsl_dataset_handoff_check() reports unexpected long holds
 *   ENOSPC  - not enough space for the change in unconsumed refreservation
 *   EDQUOT  - clone references more than the refquota plus the slack
 */
int
dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone,
    dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx)
{
	/*
	 * "slack" factor for received datasets with refquota set on them.
	 * See the bottom of this function for details on its use.
	 */
	uint64_t refquota_slack = (uint64_t)DMU_MAX_ACCESS *
	    spa_asize_inflation;
	int64_t unused_refres_delta;

	/* they should both be heads */
	if (clone->ds_is_snapshot ||
	    origin_head->ds_is_snapshot)
		return (SET_ERROR(EINVAL));

	/* if we are not forcing, the branch point should be just before them */
	if (!force && clone->ds_prev != origin_head->ds_prev)
		return (SET_ERROR(EINVAL));

	/* clone should be the clone (unless they are unrelated) */
	if (clone->ds_prev != NULL &&
	    clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap &&
	    origin_head->ds_dir != clone->ds_prev->ds_dir)
		return (SET_ERROR(EINVAL));

	/* the clone should be a child of the origin */
	if (clone->ds_dir->dd_parent != origin_head->ds_dir)
		return (SET_ERROR(EINVAL));

	/* origin_head shouldn't be modified unless 'force' */
	if (!force &&
	    dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev))
		return (SET_ERROR(ETXTBSY));

	/* origin_head should have no long holds (e.g. is not mounted) */
	if (dsl_dataset_handoff_check(origin_head, owner, tx))
		return (SET_ERROR(EBUSY));

	/* check amount of any unconsumed refreservation */
	unused_refres_delta =
	    (int64_t)MIN(origin_head->ds_reserved,
	    dsl_dataset_phys(origin_head)->ds_unique_bytes) -
	    (int64_t)MIN(origin_head->ds_reserved,
	    dsl_dataset_phys(clone)->ds_unique_bytes);

	if (unused_refres_delta > 0 &&
	    unused_refres_delta >
	    dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE))
		return (SET_ERROR(ENOSPC));

	/*
	 * The clone can't be too much over the head's refquota.
*
	 * To ensure that the entire refquota can be used, we allow one
	 * transaction to exceed the refquota. Therefore, this check
	 * needs to also allow for the space referenced to be more than the
	 * refquota. The maximum amount of space that one transaction can use
	 * on disk is DMU_MAX_ACCESS * spa_asize_inflation. Allowing this
	 * overage ensures that we are able to receive a filesystem that
	 * exceeds the refquota on the source system.
	 *
	 * So that overage is the refquota_slack we use below.
	 */
	if (origin_head->ds_quota != 0 &&
	    dsl_dataset_phys(clone)->ds_referenced_bytes >
	    origin_head->ds_quota + refquota_slack)
		return (SET_ERROR(EDQUOT));

	return (0);
}

/*
 * Exchange the remap deadlists of clone and origin.
 *
 * Both existing deadlists are closed and unset first; afterwards each
 * dataset is given the object the other one had before (if any) and the
 * deadlist is reopened on it. Must run in syncing context (asserted).
 */
static void
dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone,
    dsl_dataset_t *origin, dmu_tx_t *tx)
{
	uint64_t clone_remap_dl_obj, origin_remap_dl_obj;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	ASSERT(dsl_pool_sync_context(dp));

	/* Capture both object numbers before anything is unset. */
	clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone);
	origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin);

	if (clone_remap_dl_obj != 0) {
		dsl_deadlist_close(&clone->ds_remap_deadlist);
		dsl_dataset_unset_remap_deadlist_object(clone, tx);
	}
	if (origin_remap_dl_obj != 0) {
		dsl_deadlist_close(&origin->ds_remap_deadlist);
		dsl_dataset_unset_remap_deadlist_object(origin, tx);
	}

	/* Re-attach crosswise: origin gets clone's object and vice versa. */
	if (clone_remap_dl_obj != 0) {
		dsl_dataset_set_remap_deadlist_object(origin,
		    clone_remap_dl_obj, tx);
		dsl_deadlist_open(&origin->ds_remap_deadlist,
		    dp->dp_meta_objset, clone_remap_dl_obj);
	}
	if (origin_remap_dl_obj != 0) {
		dsl_dataset_set_remap_deadlist_object(clone,
		    origin_remap_dl_obj, tx);
		dsl_deadlist_open(&clone->ds_remap_deadlist,
		    dp->dp_meta_objset, origin_remap_dl_obj);
	}
}

void
dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone,
    dsl_dataset_t *origin_head, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	int64_t unused_refres_delta;

	ASSERT(clone->ds_reserved == 0);
	/*
	 * NOTE: On DEBUG kernels there could be a race between this and
	 * the check function if spa_asize_inflation is adjusted...
*/ ASSERT(origin_head->ds_quota == 0 || dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota + DMU_MAX_ACCESS * spa_asize_inflation); ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); dsl_dir_cancel_waiters(origin_head->ds_dir); /* * Swap per-dataset feature flags. */ for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { if (!(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET)) { ASSERT(!dsl_dataset_feature_is_active(clone, f)); ASSERT(!dsl_dataset_feature_is_active(origin_head, f)); continue; } boolean_t clone_inuse = dsl_dataset_feature_is_active(clone, f); void *clone_feature = clone->ds_feature[f]; boolean_t origin_head_inuse = dsl_dataset_feature_is_active(origin_head, f); void *origin_head_feature = origin_head->ds_feature[f]; if (clone_inuse) dsl_dataset_deactivate_feature_impl(clone, f, tx); if (origin_head_inuse) dsl_dataset_deactivate_feature_impl(origin_head, f, tx); if (clone_inuse) { dsl_dataset_activate_feature(origin_head->ds_object, f, clone_feature, tx); origin_head->ds_feature[f] = clone_feature; } if (origin_head_inuse) { dsl_dataset_activate_feature(clone->ds_object, f, origin_head_feature, tx); clone->ds_feature[f] = origin_head_feature; } } dmu_buf_will_dirty(clone->ds_dbuf, tx); dmu_buf_will_dirty(origin_head->ds_dbuf, tx); if (clone->ds_objset != NULL) { dmu_objset_evict(clone->ds_objset); clone->ds_objset = NULL; } if (origin_head->ds_objset != NULL) { dmu_objset_evict(origin_head->ds_objset); origin_head->ds_objset = NULL; } unused_refres_delta = (int64_t)MIN(origin_head->ds_reserved, dsl_dataset_phys(origin_head)->ds_unique_bytes) - (int64_t)MIN(origin_head->ds_reserved, dsl_dataset_phys(clone)->ds_unique_bytes); /* * Reset origin's unique bytes. 
*/
	{
		/* The snapshot that clone (and origin_head) is based on. */
		dsl_dataset_t *origin = clone->ds_prev;
		uint64_t comp, uncomp;

		dmu_buf_will_dirty(origin->ds_dbuf, tx);
		/*
		 * The result is written straight into origin's
		 * ds_unique_bytes; the compressed/uncompressed totals are
		 * discarded.
		 */
		dsl_deadlist_space_range(&clone->ds_deadlist,
		    dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX,
		    &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp);
	}

	/* swap blkptrs (both ds_bp_rwlocks held as writer throughout) */
	{
		rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG);
		rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG);
		blkptr_t tmp;
		tmp = dsl_dataset_phys(origin_head)->ds_bp;
		dsl_dataset_phys(origin_head)->ds_bp =
		    dsl_dataset_phys(clone)->ds_bp;
		dsl_dataset_phys(clone)->ds_bp = tmp;
		rrw_exit(&origin_head->ds_bp_rwlock, FTAG);
		rrw_exit(&clone->ds_bp_rwlock, FTAG);
	}

	/* set dd_*_bytes */
	{
		int64_t dused, dcomp, duncomp;
		uint64_t cdl_used, cdl_comp, cdl_uncomp;
		uint64_t odl_used, odl_comp, odl_uncomp;

		ASSERT3U(dsl_dir_phys(clone->ds_dir)->
		    dd_used_breakdown[DD_USED_SNAP], ==, 0);

		dsl_deadlist_space(&clone->ds_deadlist,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space(&origin_head->ds_deadlist,
		    &odl_used, &odl_comp, &odl_uncomp);

		/*
		 * Each delta is (clone's referenced + clone's deadlist)
		 * minus (origin_head's referenced + origin_head's deadlist).
		 */
		dused = dsl_dataset_phys(clone)->ds_referenced_bytes +
		    cdl_used -
		    (dsl_dataset_phys(origin_head)->ds_referenced_bytes +
		    odl_used);
		dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes +
		    cdl_comp -
		    (dsl_dataset_phys(origin_head)->ds_compressed_bytes +
		    odl_comp);
		duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes +
		    cdl_uncomp -
		    (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes +
		    odl_uncomp);

		/* Charge the delta to origin_head's dir, credit clone's dir. */
		dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD,
		    dused, dcomp, duncomp, tx);
		dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD,
		    -dused, -dcomp, -duncomp, tx);

		/*
		 * The difference in the space used by snapshots is the
		 * difference in snapshot space due to the head's
		 * deadlist (since that's the only thing that's
		 * changing that affects the snapused).
*/
		/* Only deadlist entries after the dir's origin txg count. */
		dsl_deadlist_space_range(&clone->ds_deadlist,
		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
		    &cdl_used, &cdl_comp, &cdl_uncomp);
		dsl_deadlist_space_range(&origin_head->ds_deadlist,
		    origin_head->ds_dir->dd_origin_txg, UINT64_MAX,
		    &odl_used, &odl_comp, &odl_uncomp);
		/* Move the difference from the HEAD to the SNAP bucket. */
		dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used,
		    DD_USED_HEAD, DD_USED_SNAP, tx);
	}

	/* swap ds_*_bytes */
	SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes,
	    dsl_dataset_phys(clone)->ds_referenced_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes,
	    dsl_dataset_phys(clone)->ds_compressed_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes,
	    dsl_dataset_phys(clone)->ds_uncompressed_bytes);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes,
	    dsl_dataset_phys(clone)->ds_unique_bytes);

	/* apply any parent delta for change in unconsumed refreservation */
	dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV,
	    unused_refres_delta, 0, 0, tx);

	/*
	 * Swap deadlists.
	 *
	 * Both in-core deadlists are closed, the on-disk object numbers
	 * are exchanged, and each deadlist is reopened on its new object.
	 */
	dsl_deadlist_close(&clone->ds_deadlist);
	dsl_deadlist_close(&origin_head->ds_deadlist);
	SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj,
	    dsl_dataset_phys(clone)->ds_deadlist_obj);
	dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset,
	    dsl_dataset_phys(clone)->ds_deadlist_obj);
	dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset,
	    dsl_dataset_phys(origin_head)->ds_deadlist_obj);
	dsl_dataset_swap_remap_deadlists(clone, origin_head, tx);

	/*
	 * If there is a bookmark at the origin, its "next dataset" is
	 * changing, so we need to reset its FBN.
	 */
	dsl_bookmark_next_changed(origin_head, origin_head->ds_prev, tx);

	dsl_scan_ds_clone_swapped(origin_head, clone, tx);

	/*
	 * Destroy any livelists associated with the clone or the origin,
	 * since after the swap the corresponding livelists are no longer
	 * valid.
*/
	dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE);
	dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE);

	spa_history_log_internal_ds(clone, "clone swap", tx,
	    "parent=%s", origin_head->ds_dir->dd_myname);
}

/*
 * Given a pool name and a dataset object number in that pool,
 * return the name of that dataset.
 *
 * The name is written to 'buf' by dsl_dataset_name(); 'buf' is left
 * untouched on failure. Returns 0 on success, or the error from
 * dsl_pool_hold() or dsl_dataset_hold_obj(). The pool and dataset holds
 * taken here are released before returning.
 */
int
dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int error;

	error = dsl_pool_hold(pname, FTAG, &dp);
	if (error != 0)
		return (error);

	error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds);
	if (error == 0) {
		dsl_dataset_name(ds, buf);
		dsl_dataset_rele(ds, FTAG);
	}
	/* The pool hold is dropped whether or not the dataset was found. */
	dsl_pool_rele(dp, FTAG);

	return (error);
}

/*
 * Adjust '*used' for ds's unconsumed refreservation and, when
 * 'check_quota' is set and the dataset has a refquota, decide whether
 * 'inflight' additional bytes would exceed it.
 *
 * On return '*ref_rsrv' holds the portion of 'asize' that will come from
 * unconsumed refreservation (0 if there is none). Returns 0, ERESTART
 * or EDQUOT. ds_lock is held while the dataset fields are examined.
 */
int
dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota,
    uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv)
{
	int error = 0;

	ASSERT3S(asize, >, 0);

	/*
	 * *ref_rsrv is the portion of asize that will come from any
	 * unconsumed refreservation space.
	 */
	*ref_rsrv = 0;

	mutex_enter(&ds->ds_lock);
	/*
	 * Make a space adjustment for reserved bytes.
	 */
	if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
		ASSERT3U(*used, >=,
		    ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
		*used -=
		    (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes);
		*ref_rsrv =
		    asize - MIN(asize, parent_delta(ds, asize + inflight));
	}

	/* Nothing more to check without a refquota. */
	if (!check_quota || ds->ds_quota == 0) {
		mutex_exit(&ds->ds_lock);
		return (0);
	}
	/*
	 * If they are requesting more space, and our current estimate
	 * is over quota, they get to try again unless the actual
	 * on-disk is over quota and there are no pending changes (which
	 * may free up space for us).
*/ if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >= ds->ds_quota) { if (inflight > 0 || dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota) error = SET_ERROR(ERESTART); else error = SET_ERROR(EDQUOT); } mutex_exit(&ds->ds_lock); return (error); } typedef struct dsl_dataset_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; uint64_t ddsqra_value; } dsl_dataset_set_qr_arg_t; static int dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) { dsl_dataset_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t newval; if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) return (SET_ERROR(ENOTSUP)); error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_REFQUOTA), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (newval == 0) { dsl_dataset_rele(ds, FTAG); return (0); } if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes || newval < ds->ds_reserved) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOSPC)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds = NULL; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFQUOTA), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); if (ds->ds_quota != newval) { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_quota = newval; } dsl_dataset_rele(ds, FTAG); } int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, uint64_t refquota) { 
dsl_dataset_set_qr_arg_t ddsqra;

	/*
	 * Package the request and run the refquota check/sync pair as a
	 * sync task; the task's result is returned to the caller.
	 */
	ddsqra.ddsqra_name = dsname;
	ddsqra.ddsqra_source = source;
	ddsqra.ddsqra_value = refquota;

	return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check,
	    dsl_dataset_set_refquota_sync, &ddsqra, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}

/*
 * Check function for setting refreservation.
 *
 * Fails with ENOTSUP on pools older than SPA_VERSION_REFRESERVATION,
 * EINVAL for snapshots, and any error from dsl_dataset_hold() or
 * dsl_prop_predict(). The space check (ENOSPC) is only performed when
 * 'tx' is syncing; in open context the function returns 0 once the
 * value has been predicted.
 */
static int
dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;
	uint64_t newval, unique;

	if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION)
		return (SET_ERROR(ENOTSUP));

	error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds);
	if (error != 0)
		return (error);

	if (ds->ds_is_snapshot) {
		dsl_dataset_rele(ds, FTAG);
		return (SET_ERROR(EINVAL));
	}

	/* newval receives the value the property would take effect as. */
	error = dsl_prop_predict(ds->ds_dir,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval);
	if (error != 0) {
		dsl_dataset_rele(ds, FTAG);
		return (error);
	}

	/*
	 * If we are doing the preliminary check in open context, the
	 * space estimates may be inaccurate.
*/
	if (!dmu_tx_is_syncing(tx)) {
		dsl_dataset_rele(ds, FTAG);
		return (0);
	}

	/* Make sure ds_unique_bytes is current before sampling it. */
	mutex_enter(&ds->ds_lock);
	if (!DS_UNIQUE_IS_ACCURATE(ds))
		dsl_dataset_recalc_head_uniq(ds);
	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
	mutex_exit(&ds->ds_lock);

	/*
	 * Only an increase in MAX(unique, reservation) needs new space.
	 * Reject it if the dir cannot supply the increase or if the new
	 * reservation would exceed a non-zero refquota.
	 */
	if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) {
		uint64_t delta = MAX(unique, newval) -
		    MAX(unique, ds->ds_reserved);

		if (delta >
		    dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) ||
		    (ds->ds_quota > 0 && newval > ds->ds_quota)) {
			dsl_dataset_rele(ds, FTAG);
			return (SET_ERROR(ENOSPC));
		}
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

/*
 * Store a new refreservation on an already-held dataset and update the
 * dir's DD_USED_REFRSRV accounting by the change in unconsumed
 * reservation.
 *
 * The property is written first and its effective value read back; that
 * value becomes ds_reserved. ds_unique_bytes must already be accurate
 * (asserted). dd_lock is taken before ds_lock and held across the
 * dsl_dir_diduse_space() call.
 */
void
dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds,
    zprop_source_t source, uint64_t value, dmu_tx_t *tx)
{
	uint64_t newval;
	uint64_t unique;
	int64_t delta;

	dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION),
	    source, sizeof (value), 1, &value, tx);

	VERIFY0(dsl_prop_get_int_ds(ds,
	    zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval));

	dmu_buf_will_dirty(ds->ds_dbuf, tx);
	mutex_enter(&ds->ds_dir->dd_lock);
	mutex_enter(&ds->ds_lock);
	ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
	unique = dsl_dataset_phys(ds)->ds_unique_bytes;
	/* (new unconsumed reservation) - (old unconsumed reservation) */
	delta = MAX(0, (int64_t)(newval - unique)) -
	    MAX(0, (int64_t)(ds->ds_reserved - unique));
	ds->ds_reserved = newval;
	mutex_exit(&ds->ds_lock);

	dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx);
	mutex_exit(&ds->ds_dir->dd_lock);
}

/*
 * Sync function for setting refreservation: hold the named dataset,
 * apply the change through the _impl above, and release the hold.
 */
static void
dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_set_qr_arg_t *ddsqra = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds = NULL;

	VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds));
	dsl_dataset_set_refreservation_sync_impl(ds,
	    ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx);
	dsl_dataset_rele(ds, FTAG);
}

/*
 * Set the refreservation of dataset 'dsname' by running the
 * refreservation check/sync pair as a sync task; returns that task's
 * result.
 */
int
dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source,
    uint64_t refreservation)
{
	dsl_dataset_set_qr_arg_t ddsqra;

	ddsqra.ddsqra_name = dsname;
	ddsqra.ddsqra_source = source;
	ddsqra.ddsqra_value = refreservation;

	return
(dsl_sync_task(dsname, dsl_dataset_set_refreservation_check,
	    dsl_dataset_set_refreservation_sync, &ddsqra, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}

/* Arguments handed to the compression sync task. */
typedef struct dsl_dataset_set_compression_arg {
	const char *ddsca_name;
	zprop_source_t ddsca_source;
	uint64_t ddsca_value;
} dsl_dataset_set_compression_arg_t;

/*
 * Check function for the compression sync task.
 *
 * Maps the requested algorithm to a pool feature. Returns EINVAL if the
 * algorithm maps to SPA_FEATURE_NONE, ENOTSUP if the feature is not
 * enabled on the pool, and 0 otherwise.
 */
static int
dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_set_compression_arg_t *ddsca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);

	uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);
	spa_feature_t f = zio_compress_to_feature(compval);

	if (f == SPA_FEATURE_NONE)
		return (SET_ERROR(EINVAL));

	if (!spa_feature_is_enabled(dp->dp_spa, f))
		return (SET_ERROR(ENOTSUP));

	return (0);
}

/*
 * Sync function for the compression sync task: activate the feature
 * that corresponds to the requested algorithm on the named dataset,
 * unless it is already active there. The feature must be of boolean
 * type (asserted).
 */
static void
dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_set_compression_arg_t *ddsca = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds = NULL;

	uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value);
	spa_feature_t f = zio_compress_to_feature(compval);
	ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN);

	VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds));
	if (zfeature_active(f, ds->ds_feature[f]) != B_TRUE) {
		/* Boolean features carry B_TRUE as their feature value. */
		ds->ds_feature_activation[f] = (void *)B_TRUE;
		dsl_dataset_activate_feature(ds->ds_object, f,
		    ds->ds_feature_activation[f], tx);
		ds->ds_feature[f] = ds->ds_feature_activation[f];
	}
	dsl_dataset_rele(ds, FTAG);
}

/*
 * Entry point invoked when the compression property of 'dsname' is set.
 * Returns 0 immediately for every algorithm other than zstd; for zstd
 * it returns the result of the compression sync task.
 */
int
dsl_dataset_set_compression(const char *dsname, zprop_source_t source,
    uint64_t compression)
{
	dsl_dataset_set_compression_arg_t ddsca;

	/*
	 * The sync task is only required for zstd in order to activate
	 * the feature flag when the property is first set.
*/
	if (ZIO_COMPRESS_ALGO(compression) != ZIO_COMPRESS_ZSTD)
		return (0);

	/* zstd only from here on: package the request for the sync task. */
	ddsca.ddsca_name = dsname;
	ddsca.ddsca_source = source;
	ddsca.ddsca_value = compression;

	return (dsl_sync_task(dsname, dsl_dataset_set_compression_check,
	    dsl_dataset_set_compression_sync, &ddsca, 0,
	    ZFS_SPACE_CHECK_EXTRA_RESERVED));
}

/*
 * Return (in *usedp) the amount of space referenced by "new" that was not
 * referenced at the time the bookmark corresponds to. "New" may be a
 * snapshot or a head. The bookmark must be before new, in
 * new's filesystem (or its origin) -- caller verifies this.
 *
 * The written space is calculated by considering two components: First, we
 * ignore any freed space, and calculate the written as new's used space
 * minus old's used space. Next, we add in the amount of space that was freed
 * between the two time points, thus reducing new's used space relative to
 * old's. Specifically, this is the space that was born before
 * zbm_creation_txg, and freed before new (ie. on new's deadlist or a
 * previous deadlist).
 *
 * space freed         [---------------------]
 * snapshots       ---O-------O--------O-------O------
 *                 bookmark                   new
 *
 * Note, the bookmark's zbm_*_bytes_refd must be valid, but if the HAS_FBN
 * flag is not set, we will calculate the freed_before_next based on the
 * next snapshot's deadlist, rather than using zbm_*_freed_before_next_snap.
*/
/*
 * Error handling: if an intermediate snapshot cannot be held while
 * walking back from 'new', the error from dsl_dataset_hold_obj() is
 * returned immediately and the contents of *usedp, *compp and *uncompp
 * are only partially accumulated; callers must ignore them. No hold
 * taken by this function is outstanding on return, and the caller's
 * hold on 'new' is never released here.
 */
static int
dsl_dataset_space_written_impl(zfs_bookmark_phys_t *bmp,
    dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	int err = 0;
	dsl_pool_t *dp = new->ds_dir->dd_pool;

	ASSERT(dsl_pool_config_held(dp));
	if (dsl_dataset_is_snapshot(new)) {
		ASSERT3U(bmp->zbm_creation_txg, <,
		    dsl_dataset_phys(new)->ds_creation_txg);
	}

	/* Component 1: new's referenced space minus the bookmark's. */
	*usedp = 0;
	*usedp += dsl_dataset_phys(new)->ds_referenced_bytes;
	*usedp -= bmp->zbm_referenced_bytes_refd;

	*compp = 0;
	*compp += dsl_dataset_phys(new)->ds_compressed_bytes;
	*compp -= bmp->zbm_compressed_bytes_refd;

	*uncompp = 0;
	*uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes;
	*uncompp -= bmp->zbm_uncompressed_bytes_refd;

	/*
	 * Component 2: walk back from 'new' through the previous
	 * snapshots, adding the space on each deadlist that was born at or
	 * before the bookmark. 'snap' is either 'new' itself (held by the
	 * caller) or a snapshot held here with FTAG.
	 */
	dsl_dataset_t *snap = new;

	while (dsl_dataset_phys(snap)->ds_prev_snap_txg >
	    bmp->zbm_creation_txg) {
		uint64_t used, comp, uncomp;

		dsl_deadlist_space_range(&snap->ds_deadlist,
		    0, bmp->zbm_creation_txg,
		    &used, &comp, &uncomp);
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		uint64_t snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj;
		if (snap != new)
			dsl_dataset_rele(snap, FTAG);
		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap);
		if (err != 0) {
			/*
			 * The hold on the previous 'snap' has already been
			 * dropped and no new hold was obtained, so 'snap'
			 * must not be dereferenced or released again.
			 * Bail out here instead of falling through to the
			 * code below, which does both.
			 */
			return (err);
		}
	}

	/*
	 * We might not have the FBN if we are calculating written from
	 * a snapshot (because we didn't know the correct "next" snapshot
	 * until now).
	 */
	if (bmp->zbm_flags & ZBM_FLAG_HAS_FBN) {
		*usedp += bmp->zbm_referenced_freed_before_next_snap;
		*compp += bmp->zbm_compressed_freed_before_next_snap;
		*uncompp += bmp->zbm_uncompressed_freed_before_next_snap;
	} else {
		ASSERT3U(dsl_dataset_phys(snap)->ds_prev_snap_txg, ==,
		    bmp->zbm_creation_txg);
		uint64_t used, comp, uncomp;
		dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp);
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;
	}
	if (snap != new)
		dsl_dataset_rele(snap, FTAG);
	return (0);
}

/*
 * Return (in *usedp) the amount of space written in new that was not
 * present at the time the bookmark corresponds to. New may be a
 * snapshot or the head.
Old must be a bookmark before new, in
 * new's filesystem (or its origin) -- caller verifies this.
 */
int
dsl_dataset_space_written_bookmark(zfs_bookmark_phys_t *bmp,
    dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	/* Bookmarks that lack the recorded FBN values are rejected. */
	if (!(bmp->zbm_flags & ZBM_FLAG_HAS_FBN))
		return (SET_ERROR(ENOTSUP));
	return (dsl_dataset_space_written_impl(bmp, new,
	    usedp, compp, uncompp));
}

/*
 * Return (in *usedp) the amount of space written in new that is not
 * present in oldsnap. New may be a snapshot or the head. Old must be
 * a snapshot before new, in new's filesystem (or its origin). If not then
 * fail and return EINVAL.
 */
int
dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	if (!dsl_dataset_is_before(new, oldsnap, 0))
		return (SET_ERROR(EINVAL));

	/*
	 * Describe oldsnap as a temporary, zero-initialized bookmark so
	 * that the bookmark-based implementation can be reused.
	 */
	zfs_bookmark_phys_t zbm = { 0 };
	dsl_dataset_phys_t *dsp = dsl_dataset_phys(oldsnap);
	zbm.zbm_guid = dsp->ds_guid;
	zbm.zbm_creation_txg = dsp->ds_creation_txg;
	zbm.zbm_creation_time = dsp->ds_creation_time;
	zbm.zbm_referenced_bytes_refd = dsp->ds_referenced_bytes;
	zbm.zbm_compressed_bytes_refd = dsp->ds_compressed_bytes;
	zbm.zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes;

	/*
	 * If oldsnap is the origin (or origin's origin, ...) of new,
	 * we can't easily calculate the effective FBN. Therefore,
	 * we do not set ZBM_FLAG_HAS_FBN, so that the _impl will calculate
	 * it relative to the correct "next": the next snapshot towards "new",
	 * rather than the next snapshot in oldsnap's dsl_dir.
	 */
	return (dsl_dataset_space_written_impl(&zbm, new,
	    usedp, compp, uncompp));
}

/*
 * Return (in *usedp) the amount of space that will be reclaimed if firstsnap,
 * lastsnap, and all snapshots in between are deleted.
*
 * blocks that would be freed            [---------------------------]
 * snapshots                       ---O-------O--------O-------O--------O
 *                                        firstsnap        lastsnap
 *
 * This is the set of blocks that were born after the snap before firstsnap,
 * (birth > firstsnap->prev_snap_txg) and died before the snap after the
 * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist).
 * We calculate this by iterating over the relevant deadlists (from the snap
 * after lastsnap, backward to the snap after firstsnap), summing up the
 * space on the deadlist that was born after the snap before firstsnap.
 */
int
dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap,
    dsl_dataset_t *lastsnap,
    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
{
	/*
	 * Returns 0 on success, EINVAL if the arguments fail the check
	 * below, or the error from dsl_dataset_hold_obj(); in the latter
	 * case the outputs hold whatever was summed before the failure.
	 */
	int err = 0;
	uint64_t snapobj;
	dsl_pool_t *dp = firstsnap->ds_dir->dd_pool;

	ASSERT(firstsnap->ds_is_snapshot);
	ASSERT(lastsnap->ds_is_snapshot);

	/*
	 * Check that the snapshots are in the same dsl_dir, and firstsnap
	 * is before lastsnap.
	 */
	if (firstsnap->ds_dir != lastsnap->ds_dir ||
	    dsl_dataset_phys(firstsnap)->ds_creation_txg >
	    dsl_dataset_phys(lastsnap)->ds_creation_txg)
		return (SET_ERROR(EINVAL));

	*usedp = *compp = *uncompp = 0;

	/* Start at the dataset after lastsnap and walk backward. */
	snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj;
	while (snapobj != firstsnap->ds_object) {
		dsl_dataset_t *ds;
		uint64_t used, comp, uncomp;

		err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds);
		if (err != 0)
			break;

		dsl_deadlist_space_range(&ds->ds_deadlist,
		    dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX,
		    &used, &comp, &uncomp);
		*usedp += used;
		*compp += comp;
		*uncompp += uncomp;

		/* Read the next object to visit before dropping the hold. */
		snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
		ASSERT3U(snapobj, !=, 0);
		dsl_dataset_rele(ds, FTAG);
	}
	return (err);
}

/*
 * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline.
 * For example, they could both be snapshots of the same filesystem, and
 * 'earlier' is before 'later'. Or 'earlier' could be the origin of
 * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's
 * filesystem.
Or 'earlier' could be the origin's origin. * * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg. */ boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, uint64_t earlier_txg) { dsl_pool_t *dp = later->ds_dir->dd_pool; int error; boolean_t ret; ASSERT(dsl_pool_config_held(dp)); ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); if (earlier_txg == 0) earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; if (later->ds_is_snapshot && earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) return (B_FALSE); if (later->ds_dir == earlier->ds_dir) return (B_TRUE); /* * We check dd_origin_obj explicitly here rather than using * dsl_dir_is_clone() so that we will return TRUE if "earlier" * is $ORIGIN@$ORIGIN. dsl_dataset_space_written() depends on * this behavior. */ if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == 0) return (B_FALSE); dsl_dataset_t *origin; error = dsl_dataset_hold_obj(dp, dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); if (error != 0) return (B_FALSE); if (dsl_dataset_phys(origin)->ds_creation_txg == earlier_txg && origin->ds_dir == earlier->ds_dir) { dsl_dataset_rele(origin, FTAG); return (B_TRUE); } ret = dsl_dataset_is_before(origin, earlier, earlier_txg); dsl_dataset_rele(origin, FTAG); return (ret); } void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); } boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds) { dmu_object_info_t doi; dmu_object_info_from_db(ds->ds_dbuf, &doi); return (doi.doi_type == DMU_OTN_ZAP_METADATA); } boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) { return (dsl_dataset_is_zapified(ds) && zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); } uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds) { uint64_t remap_deadlist_obj; int err; if (!dsl_dataset_is_zapified(ds)) 
return (0);

	/* Look up the object number stored under DS_FIELD_REMAP_DEADLIST. */
	err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
	    DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1,
	    &remap_deadlist_obj);

	/* A missing entry is the only lookup failure that is tolerated. */
	if (err != 0) {
		VERIFY3S(err, ==, ENOENT);
		return (0);
	}

	ASSERT(remap_deadlist_obj != 0);
	return (remap_deadlist_obj);
}

/*
 * Report whether the dataset's in-core remap deadlist is open. The
 * EQUIV() check ties that state to whether a remap deadlist object is
 * recorded for the dataset.
 */
boolean_t
dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds)
{
	EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist),
	    dsl_dataset_get_remap_deadlist_object(ds) != 0);
	return (dsl_deadlist_is_open(&ds->ds_remap_deadlist));
}

/*
 * Record 'obj' (non-zero, asserted) as the dataset's remap deadlist
 * object, zapifying the dataset first. The zap_add() must succeed.
 */
static void
dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj,
    dmu_tx_t *tx)
{
	ASSERT(obj != 0);
	dsl_dataset_zapify(ds, tx);
	VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object,
	    DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx));
}

/*
 * Remove the dataset's DS_FIELD_REMAP_DEADLIST entry. The zap_remove()
 * must succeed, so the entry has to exist.
 */
static void
dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset,
	    ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx));
}

/*
 * Tear down the dataset's remap deadlist in syncing context: close the
 * in-core deadlist, free its object, remove the dataset's reference to
 * it and decrement SPA_FEATURE_OBSOLETE_COUNTS. The deadlist must exist
 * (asserted).
 */
void
dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	uint64_t remap_deadlist_object;
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dsl_dataset_remap_deadlist_exists(ds));

	/* Save the object number before the deadlist is closed. */
	remap_deadlist_object = ds->ds_remap_deadlist.dl_object;
	dsl_deadlist_close(&ds->ds_remap_deadlist);
	dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx);
	dsl_dataset_unset_remap_deadlist_object(ds, tx);
	spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
}

/*
 * Create a remap deadlist for the dataset in syncing context, with
 * ds_remap_deadlist_lock held by the caller (asserted). The new deadlist
 * is cloned from the dataset's ordinary deadlist, recorded on the
 * dataset, opened, and SPA_FEATURE_OBSOLETE_COUNTS is incremented.
 */
void
dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx)
{
	uint64_t remap_deadlist_obj;
	spa_t *spa = ds->ds_dir->dd_pool->dp_spa;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock));
	/*
	 * Currently we only create remap deadlists when there are indirect
	 * vdevs with referenced mappings.
*/
	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL));

	remap_deadlist_obj = dsl_deadlist_clone(
	    &ds->ds_deadlist, UINT64_MAX,
	    dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
	dsl_dataset_set_remap_deadlist_object(ds,
	    remap_deadlist_obj, tx);
	dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa),
	    remap_deadlist_obj);
	spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
}

/*
 * Activate SPA_FEATURE_REDACTED_DATASETS on 'ds', using a private copy
 * of the 'num_redact_snaps' entries of 'redact_snaps' as the feature
 * argument.
 *
 * The argument structure (and, when the list is non-empty, the copied
 * array) is allocated here with KM_SLEEP and is not freed here: the
 * pointer is stored in ds->ds_feature[].
 */
void
dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps,
    uint64_t num_redact_snaps, dmu_tx_t *tx)
{
	uint64_t dsobj = ds->ds_object;
	struct feature_type_uint64_array_arg *ftuaa =
	    kmem_zalloc(sizeof (*ftuaa), KM_SLEEP);
	ftuaa->length = (int64_t)num_redact_snaps;
	/* With an empty list, array keeps the NULL from kmem_zalloc(). */
	if (num_redact_snaps > 0) {
		ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t),
		    KM_SLEEP);
		bcopy(redact_snaps, ftuaa->array, num_redact_snaps *
		    sizeof (uint64_t));
	}
	dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS,
	    ftuaa, tx);
	ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa;
}

/* BEGIN CSTYLED */
/* zfs_max_recordsize is writable on LP64 builds and read-only otherwise. */
#if defined(_LP64)
#define RECORDSIZE_PERM ZMOD_RW
#else
/* Limited to 1M on 32-bit platforms due to lack of virtual address space */
#define RECORDSIZE_PERM ZMOD_RD
#endif
ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM,
	"Max allowed record size");

ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW,
	"Allow mounting of redacted datasets");
/* END CSTYLED */

EXPORT_SYMBOL(dsl_dataset_hold);
EXPORT_SYMBOL(dsl_dataset_hold_flags);
EXPORT_SYMBOL(dsl_dataset_hold_obj);
EXPORT_SYMBOL(dsl_dataset_hold_obj_flags);
EXPORT_SYMBOL(dsl_dataset_own);
EXPORT_SYMBOL(dsl_dataset_own_obj);
EXPORT_SYMBOL(dsl_dataset_name);
EXPORT_SYMBOL(dsl_dataset_rele);
EXPORT_SYMBOL(dsl_dataset_rele_flags);
EXPORT_SYMBOL(dsl_dataset_disown);
EXPORT_SYMBOL(dsl_dataset_tryown);
EXPORT_SYMBOL(dsl_dataset_create_sync);
EXPORT_SYMBOL(dsl_dataset_create_sync_dd);
EXPORT_SYMBOL(dsl_dataset_snapshot_check);
EXPORT_SYMBOL(dsl_dataset_snapshot_sync);
EXPORT_SYMBOL(dsl_dataset_promote); EXPORT_SYMBOL(dsl_dataset_user_hold); EXPORT_SYMBOL(dsl_dataset_user_release); EXPORT_SYMBOL(dsl_dataset_get_holds); EXPORT_SYMBOL(dsl_dataset_get_blkptr); EXPORT_SYMBOL(dsl_dataset_get_spa); EXPORT_SYMBOL(dsl_dataset_modified_since_snap); EXPORT_SYMBOL(dsl_dataset_space_written); EXPORT_SYMBOL(dsl_dataset_space_wouldfree); EXPORT_SYMBOL(dsl_dataset_sync); EXPORT_SYMBOL(dsl_dataset_block_born); EXPORT_SYMBOL(dsl_dataset_block_kill); EXPORT_SYMBOL(dsl_dataset_dirty); EXPORT_SYMBOL(dsl_dataset_stats); EXPORT_SYMBOL(dsl_dataset_fast_stat); EXPORT_SYMBOL(dsl_dataset_space); EXPORT_SYMBOL(dsl_dataset_fsid_guid); EXPORT_SYMBOL(dsl_dsobj_to_dsname); EXPORT_SYMBOL(dsl_dataset_check_quota); EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl); EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl); diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 4036c8671f2d..277560aabfd1 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -1,1495 +1,1496 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. 
All rights reserved. * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ZFS Write Throttle * ------------------ * * ZFS must limit the rate of incoming writes to the rate at which it is able * to sync data modifications to the backend storage. Throttling by too much * creates an artificial limit; throttling by too little can only be sustained * for short periods and would lead to highly lumpy performance. On a per-pool * basis, ZFS tracks the amount of modified (dirty) data. As operations change * data, the amount of dirty data increases; as ZFS syncs out data, the amount * of dirty data decreases. When the amount of dirty data exceeds a * predetermined threshold further modifications are blocked until the amount * of dirty data decreases (as data is synced out). * * The limit on dirty data is tunable, and should be adjusted according to * both the IO capacity and available memory of the system. The larger the * window, the more ZFS is able to aggregate and amortize metadata (and data) * changes. However, memory is a limited resource, and allowing for more dirty * data comes at the cost of keeping other useful data in memory (for example * ZFS data cached by the ARC). * * Implementation * * As buffers are modified dsl_pool_willuse_space() increments both the per- * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of * dirty space used; dsl_pool_dirty_space() decrements those values as data * is synced out from dsl_pool_sync(). While only the poolwide value is * relevant, the per-txg value is useful for debugging. The tunable * zfs_dirty_data_max determines the dirty space limit. Once that value is * exceeded, new writes are halted until space frees up. 
* * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we * ensure that there is a txg syncing (see the comment in txg.c for a full * description of transaction group stages). * * The IO scheduler uses both the dirty space limit and current amount of * dirty data as inputs. Those values affect the number of concurrent IOs ZFS * issues. See the comment in vdev_queue.c for details of the IO scheduler. * * The delay is also calculated based on the amount of dirty data. See the * comment above dmu_tx_delay() for details. */ /* * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, * capped at zfs_dirty_data_max_max. It can also be overridden with a module * parameter. */ unsigned long zfs_dirty_data_max = 0; unsigned long zfs_dirty_data_max_max = 0; int zfs_dirty_data_max_percent = 10; int zfs_dirty_data_max_max_percent = 25; /* * The upper limit of TX_WRITE log data. Write operations are throttled * when approaching the limit until log data is cleared out after txg sync. * It only counts TX_WRITE log with WR_COPIED or WR_NEED_COPY. */ unsigned long zfs_wrlog_data_max = 0; /* * If there's at least this much dirty data (as a percentage of * zfs_dirty_data_max), push out a txg. This should be less than * zfs_vdev_async_write_active_min_dirty_percent. */ int zfs_dirty_data_sync_percent = 20; /* * Once there is this amount of dirty data, the dmu_tx_delay() will kick in * and delay each transaction. * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. */ int zfs_delay_min_dirty_percent = 60; /* * This controls how quickly the delay approaches infinity. * Larger values cause it to delay more for a given amount of dirty data. * Therefore larger values will cause there to be less dirty data for a * given throughput. * * For the smoothest delay, this value should be about 1 billion divided * by the maximum number of operations per second. This will smoothly * handle between 10x and 1/10th this number. 
* * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the * multiply in dmu_tx_delay(). */ unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; /* * This determines the number of threads used by the dp_sync_taskq. */ int zfs_sync_taskq_batch_pct = 75; /* * These tunables determine the behavior of how zil_itxg_clean() is * called via zil_clean() in the context of spa_sync(). When an itxg * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching. * If the dispatch fails, the call to zil_itxg_clean() will occur * synchronously in the context of spa_sync(), which can negatively * impact the performance of spa_sync() (e.g. in the case of the itxg * list having a large number of itxs that needs to be cleaned). * * Thus, these tunables can be used to manipulate the behavior of the * taskq used by zil_clean(); they determine the number of taskq entries * that are pre-populated when the taskq is first created (via the * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of * taskq entries that are cached after an on-demand allocation (via the * "zfs_zil_clean_taskq_maxalloc"). * * The idea being, we want to try reasonably hard to ensure there will * already be a taskq entry pre-allocated by the time that it is needed * by zil_clean(). This way, we can avoid the possibility of an * on-demand allocation of a new taskq entry from failing, which would * result in zil_itxg_clean() being called synchronously from zil_clean() * (which can adversely affect performance of spa_sync()). * * Additionally, the number of threads used by the taskq can be * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable. 
*/
int zfs_zil_clean_taskq_nthr_pct = 100;
int zfs_zil_clean_taskq_minalloc = 1024;
int zfs_zil_clean_taskq_maxalloc = 1024 * 1024;

/*
 * Look up the child directory "name" of the pool's root dsl_dir and hold
 * it. The hold is taken with "dp" as the tag, so the caller must release
 * *ddp with dsl_dir_rele(dd, dp). Returns 0 or the error from the lookup
 * or the hold.
 */
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_hold_obj(dp, obj, name, dp, ddp));
}

/*
 * Allocate a zeroed dsl_pool_t for "spa" and set up its in-core state:
 * config lock, txg and mmp state, per-txg dirty lists, taskqs, dp_lock,
 * the space-available cv and the write-log counters. Nothing is read
 * from the MOS here; everything created here is torn down by
 * dsl_pool_close().
 */
static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rrw_init(&dp->dp_config_rwlock, B_TRUE);
	txg_init(dp, txg);
	mmp_init(spa);

	txg_list_create(&dp->dp_dirty_datasets, spa,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_zilogs, spa,
	    offsetof(zilog_t, zl_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs, spa,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks, spa,
	    offsetof(dsl_sync_task_t, dst_node));
	txg_list_create(&dp->dp_early_sync_tasks, spa,
	    offsetof(dsl_sync_task_t, dst_node));

	/* Thread count is a percentage of CPUs (TASKQ_THREADS_CPU_PCT). */
	dp->dp_sync_taskq = taskq_create("dp_sync_taskq",
	    zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX,
	    TASKQ_THREADS_CPU_PCT);

	/* Sized by the zfs_zil_clean_taskq_* tunables defined above. */
	dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq",
	    zfs_zil_clean_taskq_nthr_pct, minclsyspri,
	    zfs_zil_clean_taskq_minalloc,
	    zfs_zil_clean_taskq_maxalloc,
	    TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT);

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL);

	aggsum_init(&dp->dp_wrlog_total, 0);
	for (int i = 0; i < TXG_SIZE; i++) {
		aggsum_init(&dp->dp_wrlog_pertxg[i], 0);
	}

	dp->dp_zrele_taskq = taskq_create("z_zrele", 100, defclsyspri,
	    boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC |
	    TASKQ_THREADS_CPU_PCT);
	dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain",
	    100, defclsyspri, boot_ncpus, INT_MAX,
	    TASKQ_PREPOPULATE | TASKQ_DYNAMIC | TASKQ_THREADS_CPU_PCT);

	return (dp);
}

/*
 * Create the in-core dsl_pool_t for "spa" and open its meta objset.
 * On success *dpp points at the new pool; on failure the pool is closed
 * again, *dpp is reset to NULL and the error from
 * dmu_objset_open_impl() is returned.
 */
int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	/*
	 * Initialize the caller's dsl_pool_t structure before we actually open
	 * the meta objset. This is done because a self-healing write zio may
	 * be issued as part of dmu_objset_open_impl() and the spa needs its
	 * dsl_pool_t initialized in order to handle the write.
	 */
	*dpp = dp;

	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err != 0) {
		dsl_pool_close(dp);
		*dpp = NULL;
	}

	return (err);
}

/*
 * Load the pool-wide DSL state from the MOS: the root dir, the special
 * dirs ($MOS, $ORIGIN, $FREE, $LEAK), the free/obsolete bpobjs, the
 * bptree, empty-bpobj and temporary-userrefs objects, and finally the
 * scan state. Runs with dp_config_rwlock held as writer throughout.
 * Returns 0 or the first error encountered; holds taken before the
 * failure are not dropped here.
 */
int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		/*
		 * Hold the origin snapshot: it is the previous snapshot of
		 * the $ORIGIN dir's head dataset. The dir and head dataset
		 * holds are only needed for the lookup and are dropped
		 * again, whether or not the snapshot hold succeeded.
		 */
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds);
		if (err == 0) {
			err = dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		dsl_dir_rele(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err == 0) {
			VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj,
			    dp->dp_meta_objset, obj));
		} else if (err == ENOENT) {
			/*
			 * We might not have created the remap bpobj yet.
			 */
			err = 0;
		} else {
			goto out;
		}
	}

	/*
	 * Note: errors are ignored, because these special dirs, used for
	 * space accounting, are only created on demand.
	 */
	(void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME,
	    &dp->dp_leak_dir);

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) {
		err = zap_lookup(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ,
		    sizeof (uint64_t), 1, &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) {
		err = zap_lookup(dp->dp_meta_objset,
		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_EMPTY_BPOBJ,
		    sizeof (uint64_t), 1, &dp->dp_empty_bpobj);
		if (err != 0)
			goto out;
	}

	/* A missing temporary-userrefs object is not an error. */
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rrw_exit(&dp->dp_config_rwlock, FTAG);
	return (err);
}

/*
 * Tear down a dsl_pool_t: drop the holds taken while opening or creating
 * the pool, evict the meta objset, destroy everything that
 * dsl_pool_open_impl() set up, and free the structure itself. Fields
 * that were never populated (NULL) are skipped, so this can be called on
 * a partially opened pool, as dsl_pool_init() does on failure.
 */
void
dsl_pool_close(dsl_pool_t *dp)
{
	/*
	 * Drop our references from dsl_pool_open().
	 *
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap != NULL)
		dsl_dataset_rele(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir != NULL)
		dsl_dir_rele(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir != NULL)
		dsl_dir_rele(dp->dp_free_dir, dp);
	if (dp->dp_leak_dir != NULL)
		dsl_dir_rele(dp->dp_leak_dir, dp);
	if (dp->dp_root_dir != NULL)
		dsl_dir_rele(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);
	bpobj_close(&dp->dp_obsolete_bpobj);

	/*
	 * Undo the dmu_objset_open_impl(mos) from dsl_pool_init(), or the
	 * dmu_objset_create_impl(mos) from dsl_pool_create().
	 */
	if (dp->dp_meta_objset != NULL)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_early_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	taskq_destroy(dp->dp_zil_clean_taskq);
	taskq_destroy(dp->dp_sync_taskq);

	/*
	 * We can't set retry to TRUE since we're explicitly specifying
	 * a spa to flush. This is good enough; any missed buffers for
	 * this spa won't cause trouble, and they'll eventually fall
	 * out of the ARC just like any other unused buffer.
	 */
	arc_flush(dp->dp_spa, FALSE);

	mmp_fini(dp->dp_spa);
	txg_fini(dp);
	dsl_scan_fini(dp);
	dmu_buf_user_evict_wait();

	rrw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	cv_destroy(&dp->dp_spaceavail_cv);

	/* All write-log accounting must have drained by now. */
	ASSERT0(aggsum_value(&dp->dp_wrlog_total));
	aggsum_fini(&dp->dp_wrlog_total);
	for (int i = 0; i < TXG_SIZE; i++) {
		ASSERT0(aggsum_value(&dp->dp_wrlog_pertxg[i]));
		aggsum_fini(&dp->dp_wrlog_pertxg[i]);
	}

	taskq_destroy(dp->dp_unlinked_drain_taskq);
	taskq_destroy(dp->dp_zrele_taskq);
	if (dp->dp_blkstats != NULL)
		vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}

/*
 * Allocate the obsolete bpobj, open it as dp_obsolete_bpobj, record its
 * object number in the pool directory and bump the refcount of
 * SPA_FEATURE_OBSOLETE_COUNTS.
 */
void
dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t obj;
	/*
	 * Currently, we only create the obsolete_bpobj where there are
	 * indirect vdevs with referenced mappings.
	 */
	ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL));
	/* create and open the obsolete_bpobj */
	obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
	VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj));
	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
}

/*
 * Inverse of dsl_pool_create_obsolete_bpobj(): drop the feature refcount,
 * remove the pool directory entry, then free and close the bpobj.
 */
void
dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx)
{
	spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx);
	VERIFY0(zap_remove(dp->dp_meta_objset,
	    DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_OBSOLETE_BPOBJ, tx));
	bpobj_free(dp->dp_meta_objset,
	    dp->dp_obsolete_bpobj.bpo_object, tx);
	bpobj_close(&dp->dp_obsolete_bpobj);
}

/*
 * Create a brand-new pool in transaction group "txg": the MOS, the pool
 * directory, the root/$MOS/$FREE dirs, the free bpobj, the origin
 * dataset, the feature ZAPs, and the root dataset with its objset. Which
 * pieces are created depends on spa_version(). Runs with
 * dp_config_rwlock held as writer and commits the transaction before
 * returning the new dsl_pool_t. "zplprops" is only consumed when
 * building for the kernel (zfs_create_fs()).
 */
dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops __attribute__((unused)),
    dsl_crypto_params_t *dcp, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
#ifdef _KERNEL
	objset_t *os;
#else
	objset_t *os __attribute__((unused));
#endif
	dsl_dataset_t *ds;
	uint64_t obj;

	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);
	spa->spa_meta_objset = dp->dp_meta_objset;

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT0(err);

	/* Initialize scan structures */
	VERIFY0(dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp, dp->dp_root_dir,
		    FREE_DIR_NAME, tx);
		VERIFY0(dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
		VERIFY0(bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/*
	 * Some features may be needed when creating the root dataset, so we
	 * create the feature objects here.
	 */
	if (spa_version(spa) >= SPA_VERSION_FEATURES)
		spa_feature_create_zap_objects(spa, tx);

	/* Enable encryption only when the root dataset actually uses it. */
	if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF &&
	    dcp->cp_crypt != ZIO_CRYPT_INHERIT)
		spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx);

	/* create the root objset */
	VERIFY0(dsl_dataset_hold_obj_flags(dp, obj,
	    DS_HOLD_FLAG_DECRYPT, FTAG, &ds));
	rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
	os = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx);
	rrw_exit(&ds->ds_bp_rwlock, FTAG);
#ifdef _KERNEL
	zfs_create_fs(os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG);

	dmu_tx_commit(tx);

	rrw_exit(&dp->dp_config_rwlock, FTAG);

	return (dp);
}

/*
 * Account for the meta-objset space in its placeholder dsl_dir.
*/ void dsl_pool_mos_diduse_space(dsl_pool_t *dp, int64_t used, int64_t comp, int64_t uncomp) { ASSERT3U(comp, ==, uncomp); /* it's all metadata */ mutex_enter(&dp->dp_lock); dp->dp_mos_used_delta += used; dp->dp_mos_compressed_delta += comp; dp->dp_mos_uncompressed_delta += uncomp; mutex_exit(&dp->dp_lock); } static void dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) { zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(dp->dp_meta_objset, zio, tx); VERIFY0(zio_wait(zio)); dmu_objset_sync_done(dp->dp_meta_objset, tx); taskq_wait(dp->dp_sync_taskq); multilist_destroy(&dp->dp_meta_objset->os_synced_dnodes); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } static void dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) { ASSERT(MUTEX_HELD(&dp->dp_lock)); if (delta < 0) ASSERT3U(-delta, <=, dp->dp_dirty_total); dp->dp_dirty_total += delta; /* * Note: we signal even when increasing dp_dirty_total. * This ensures forward progress -- each thread wakes the next waiter. 
*/ if (dp->dp_dirty_total < zfs_dirty_data_max) cv_signal(&dp->dp_spaceavail_cv); } void dsl_pool_wrlog_count(dsl_pool_t *dp, int64_t size, uint64_t txg) { ASSERT3S(size, >=, 0); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], size); aggsum_add(&dp->dp_wrlog_total, size); /* Choose a value slightly bigger than min dirty sync bytes */ uint64_t sync_min = zfs_wrlog_data_max * (zfs_dirty_data_sync_percent + 10) / 200; if (aggsum_compare(&dp->dp_wrlog_pertxg[txg & TXG_MASK], sync_min) > 0) txg_kick(dp, txg); } boolean_t dsl_pool_need_wrlog_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_wrlog_data_max * zfs_delay_min_dirty_percent / 100; return (aggsum_compare(&dp->dp_wrlog_total, delay_min_bytes) > 0); } static void dsl_pool_wrlog_clear(dsl_pool_t *dp, uint64_t txg) { int64_t delta; delta = -(int64_t)aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); aggsum_add(&dp->dp_wrlog_pertxg[txg & TXG_MASK], delta); aggsum_add(&dp->dp_wrlog_total, delta); /* Compact per-CPU sums after the big change. */ (void) aggsum_value(&dp->dp_wrlog_pertxg[txg & TXG_MASK]); (void) aggsum_value(&dp->dp_wrlog_total); } #ifdef ZFS_DEBUG static boolean_t dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) { spa_t *spa = dp->dp_spa; vdev_t *rvd = spa->spa_root_vdev; for (uint64_t c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; txg_list_t *tl = &vd->vdev_ms_list; metaslab_t *ms; for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { VERIFY(range_tree_is_empty(ms->ms_freeing)); VERIFY(range_tree_is_empty(ms->ms_checkpointing)); } } return (B_TRUE); } #endif void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { zio_t *zio; dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; objset_t *mos = dp->dp_meta_objset; list_t synced_datasets; list_create(&synced_datasets, sizeof (dsl_dataset_t), offsetof(dsl_dataset_t, ds_synced_link)); tx = dmu_tx_create_assigned(dp, txg); /* * Run all early sync tasks before writing out any dirty blocks. 
* For more info on early sync tasks see block comment in * dsl_early_sync_task(). */ if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) { dsl_sync_task_t *dst; ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); while ((dst = txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) { ASSERT(dsl_early_sync_task_verify(dp, txg)); dsl_sync_task_sync(dst, tx); } ASSERT(dsl_early_sync_task_verify(dp, txg)); } /* * Write out all dirty blocks of dirty datasets. */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because * we may have taken a snapshot of them. However, we * may sync newly-created datasets on pass 2. */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } VERIFY0(zio_wait(zio)); /* * Update the long range free counter after * we're done syncing user data */ mutex_enter(&dp->dp_lock); ASSERT(spa_sync_pass(dp->dp_spa) == 1 || dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0); dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0; mutex_exit(&dp->dp_lock); /* * After the data blocks have been written (ensured by the zio_wait() * above), update the user/group/project space accounting. This happens * in tasks dispatched to dp_sync_taskq, so wait for them before * continuing. */ for (ds = list_head(&synced_datasets); ds != NULL; ds = list_next(&synced_datasets, ds)) { dmu_objset_sync_done(ds->ds_objset, tx); } taskq_wait(dp->dp_sync_taskq); /* * Sync the datasets again to push out the changes due to * userspace updates. This must be done before we process the * sync tasks, so that any snapshots will have the correct * user accounting information (and we won't get confused * about which blocks are part of the snapshot). 
*/ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { objset_t *os = ds->ds_objset; ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, zio, tx); /* * Release any key mappings created by calls to * dsl_dataset_dirty() from the userquota accounting * code paths. */ if (os->os_encrypted && !os->os_raw_receive && !os->os_next_write_raw[txg & TXG_MASK]) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } } VERIFY0(zio_wait(zio)); /* * Now that the datasets have been completely synced, we can * clean up our in-memory structures accumulated while syncing: * * - move dead blocks from the pending deadlist and livelists * to the on-disk versions * - release hold from dsl_dataset_dirty() * - release key mapping hold from dsl_dataset_dirty() */ while ((ds = list_remove_head(&synced_datasets)) != NULL) { objset_t *os = ds->ds_objset; if (os->os_encrypted && !os->os_raw_receive && !os->os_next_write_raw[txg & TXG_MASK]) { ASSERT3P(ds->ds_key_mapping, !=, NULL); key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); } dsl_dataset_sync_done(ds, tx); + dmu_buf_rele(ds->ds_dbuf, ds); } while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { dsl_dir_sync(dd, tx); } /* * The MOS's space is accounted for in the pool/$MOS * (dp_mos_dir). We can't modify the mos while we're syncing * it, so we remember the deltas and apply them here. 
*/ if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || dp->dp_mos_uncompressed_delta != 0) { dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, dp->dp_mos_used_delta, dp->dp_mos_compressed_delta, dp->dp_mos_uncompressed_delta, tx); dp->dp_mos_used_delta = 0; dp->dp_mos_compressed_delta = 0; dp->dp_mos_uncompressed_delta = 0; } if (dmu_objset_is_dirty(mos, txg)) { dsl_pool_sync_mos(dp, tx); } /* * We have written all of the accounted dirty data, so our * dp_space_towrite should now be zero. However, some seldom-used * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up * the accounting of any dirtied space now. * * Note that, besides any dirty data from datasets, the amount of * dirty data in the MOS is also accounted by the pool. Therefore, * we want to do this cleanup after dsl_pool_sync_mos() so we don't * attempt to update the accounting for the same dirty data twice. * (i.e. at this point we only update the accounting for the space * that we know that we "leaked"). */ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); /* * If we modify a dataset in the same txg that we want to destroy it, * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. * dsl_dir_destroy_check() will fail if there are unexpected holds. * Therefore, we want to sync the MOS (thus syncing the dd_dbuf * and clearing the hold on it) before we process the sync_tasks. * The MOS data dirtied by the sync_tasks will be synced on the next * pass. */ if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { dsl_sync_task_t *dst; /* * No more sync tasks should have been added while we * were syncing. 
*/ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) dsl_sync_task_sync(dst, tx); } dmu_tx_commit(tx); DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); } void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { zilog_t *zilog; while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg))) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); /* * We don't remove the zilog from the dp_dirty_zilogs * list until after we've cleaned it. This ensures that * callers of zilog_is_dirty() receive an accurate * answer when they are racing with the spa sync thread. */ zil_clean(zilog, txg); (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg); ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } dsl_pool_wrlog_clear(dp, txg); ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } /* * TRUE if the current thread is the tx_sync_thread or if we * are being called from SPA context during pool initialization. */ int dsl_pool_sync_context(dsl_pool_t *dp) { return (curthread == dp->dp_tx.tx_sync_thread || spa_is_initializing(dp->dp_spa) || taskq_member(dp->dp_sync_taskq, curthread)); } /* * This function returns the amount of allocatable space in the pool * minus whatever space is currently reserved by ZFS for specific * purposes. Specifically: * * 1] Any reserved SLOP space * 2] Any space used by the checkpoint * 3] Any space used for deferred frees * * The latter 2 are especially important because they are needed to * rectify the SPA's and DMU's different understanding of how much space * is used. Now the DMU is aware of that extra space tracked by the SPA * without having to maintain a separate special dir (e.g similar to * $MOS, $FREEING, and $LEAKED). * * Note: By deferred frees here, we mean the frees that were deferred * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the * segments placed in ms_defer trees during metaslab_sync_done(). 
*/
uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy)
{
	spa_t *spa = dp->dp_spa;
	uint64_t space, resv, adjustedsize;
	uint64_t spa_deferred_frees =
	    spa->spa_deferred_bpobj.bpo_phys->bpo_bytes;

	space = spa_get_dspace(spa)
	    - spa_get_checkpoint_space(spa) - spa_deferred_frees;
	resv = spa_get_slop_space(spa);

	/* The policy selects how much of the slop space stays reserved. */
	switch (slop_policy) {
	case ZFS_SPACE_CHECK_NORMAL:
		break;
	case ZFS_SPACE_CHECK_RESERVED:
		/* keep only half of the slop space reserved */
		resv >>= 1;
		break;
	case ZFS_SPACE_CHECK_EXTRA_RESERVED:
		/* keep only a quarter of the slop space reserved */
		resv >>= 2;
		break;
	case ZFS_SPACE_CHECK_NONE:
		resv = 0;
		break;
	default:
		panic("invalid slop policy value: %d", slop_policy);
		break;
	}
	/* Clamp at zero rather than wrapping when the reserve is larger. */
	adjustedsize = (space >= resv) ? (space - resv) : 0;

	return (adjustedsize);
}

/*
 * Return dsl_pool_adjustedsize() minus the deferred space of the normal
 * metaslab class, clamped at zero.
 */
uint64_t
dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy)
{
	uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy);
	uint64_t deferred =
	    metaslab_class_get_deferred(spa_normal_class(dp->dp_spa));
	uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0;
	return (quota);
}

/*
 * Return the deferred space of the pool's normal metaslab class.
 */
uint64_t
dsl_pool_deferred_space(dsl_pool_t *dp)
{
	return (metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)));
}

/*
 * Return B_TRUE when the pool's total dirty data exceeds
 * zfs_delay_min_dirty_percent percent of zfs_dirty_data_max.
 * dp_dirty_total is sampled under dp_lock.
 */
boolean_t
dsl_pool_need_dirty_delay(dsl_pool_t *dp)
{
	uint64_t delay_min_bytes =
	    zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;

	mutex_enter(&dp->dp_lock);
	uint64_t dirty = dp->dp_dirty_total;
	mutex_exit(&dp->dp_lock);

	return (dirty > delay_min_bytes);
}

/*
 * Return B_TRUE when the dirty data charged to "txg" exceeds
 * zfs_dirty_data_sync_percent percent of zfs_dirty_data_max.
 * The caller must hold dp_lock.
 */
static boolean_t
dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg)
{
	ASSERT(MUTEX_HELD(&dp->dp_lock));

	uint64_t dirty_min_bytes =
	    zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
	uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];

	return (dirty > dirty_min_bytes);
}

/*
 * Charge "space" bytes of dirty data to the txg of "tx" and to the pool
 * total. Non-positive amounts are ignored. When the tx is not a syncing
 * tx and the txg has crossed the sync threshold, the txg is kicked
 * (after dp_lock has been dropped).
 */
void
dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space;
		dsl_pool_dirty_delta(dp, space);
		boolean_t needsync = !dmu_tx_is_syncing(tx) &&
		    dsl_pool_need_dirty_sync(dp, tx->tx_txg);
		mutex_exit(&dp->dp_lock);

		if (needsync)
			txg_kick(dp, tx->tx_txg);
	}
}

/*
 * Remove up to "space" bytes of dirty data from the accounting of "txg"
 * and from the pool total. The amount is capped at what is currently
 * charged to the txg, so the per-txg counter does not go below zero.
 */
void
dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg)
{
	ASSERT3S(space, >=, 0);
	if (space == 0)
		return;

	mutex_enter(&dp->dp_lock);
	if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) {
		/* XXX writing something we didn't dirty? */
		space = dp->dp_dirty_pertxg[txg & TXG_MASK];
	}
	ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space);
	dp->dp_dirty_pertxg[txg & TXG_MASK] -= space;
	ASSERT3U(dp->dp_dirty_total, >=, space);
	dsl_pool_dirty_delta(dp, -space);
	mutex_exit(&dp->dp_lock);
}

/*
 * dmu_objset_find_dp() callback used by dsl_pool_upgrade_clones().
 * "arg" is the syncing dmu_tx_t.
 *
 * Starting from the head dataset "hds", walk back through the previous
 * snapshots for as long as each previous snapshot names the current
 * dataset as its next snapshot. If the walk runs out of previous
 * snapshots, the dataset reached is attached to the pool's origin
 * snapshot. Either way the dataset reached is then recorded in the
 * next-clones ZAP of "prev", which is created on demand.
 */
/* ARGSUSED */
static int
upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;

	err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds);
	if (err)
		return (err);

	while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp,
		    dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		/* Stop at a prev that does not point back at ds. */
		if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		/* No previous snapshot left: attach ds to the origin. */
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG);
		ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth);
		rrw_exit(&ds->ds_bp_rwlock, FTAG);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object;
		dsl_dataset_phys(ds)->ds_prev_snap_txg =
		    dsl_dataset_phys(prev)->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_num_children++;

		if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY0(dsl_dataset_hold_obj(dp,
			    dsl_dataset_phys(ds)->ds_prev_snap_obj,
			    ds, &ds->ds_prev));
		}
	}

	ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object);
	ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object);

	if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		dsl_dataset_phys(prev)->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY0(zap_add_int(dp->dp_meta_objset,
	    dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	/* dp_origin_snap is not held with FTAG, so it is not released. */
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

/*
 * Run upgrade_clones_cb() over every dataset below the root dir, in
 * syncing context. Requires the origin snapshot to exist already.
 */
void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

/*
 * dmu_objset_find_dp() callback used by dsl_pool_upgrade_dir_clones().
 * "arg" is the syncing dmu_tx_t. For a dataset whose dsl_dir has an
 * origin, add the dataset's object number to the dd_clones ZAP of the
 * origin's dsl_dir, creating that ZAP if it does not exist yet.
 */
/* ARGSUSED */
static int
upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
	dmu_tx_t *tx = arg;
	objset_t *mos = dp->dp_meta_objset;

	if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) {
		dsl_dataset_t *origin;

		VERIFY0(dsl_dataset_hold_obj(dp,
		    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin));

		if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			dsl_dir_phys(origin->ds_dir)->dd_clones =
			    zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE,
			    0, tx);
		}

		VERIFY0(zap_add_int(dp->dp_meta_objset,
		    dsl_dir_phys(origin->ds_dir)->dd_clones,
		    ds->ds_object, tx));

		dsl_dataset_rele(origin, FTAG);
	}
	return (0);
}

/*
 * In syncing context: create and open the free dir and the free bpobj,
 * then run upgrade_dir_clones_cb() over every dataset below the root
 * dir.
 */
void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT(dmu_tx_is_syncing(tx));

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY0(dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support. So call dmu_object_alloc() directly.
	 */
	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
	VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj));

	VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE));
}

/*
 * In syncing context and with dp_config_rwlock held as writer: create
 * the $ORIGIN dataset, snapshot it, and hold that snapshot as
 * dp_origin_snap with "dp" as the tag.
 */
void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);
	ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER));

	/* create the origin dir, ds, & snap-ds */
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, NULL, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx);
	VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
}

/* Accessor for the pool's "z_zrele" taskq. */
taskq_t *
dsl_pool_zrele_taskq(dsl_pool_t *dp)
{
	return (dp->dp_zrele_taskq);
}

/* Accessor for the pool's "z_unlinked_drain" taskq. */
taskq_t *
dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp)
{
	return (dp->dp_unlinked_drain_taskq);
}

/*
 * Walk through the pool-wide zap object of temporary snapshot
user holds * and release them. */ void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) { zap_attribute_t za; zap_cursor_t zc; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; nvlist_t *holds; if (zapobj == 0) return; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); holds = fnvlist_alloc(); for (zap_cursor_init(&zc, mos, zapobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { char *htag; nvlist_t *tags; htag = strchr(za.za_name, '-'); *htag = '\0'; ++htag; if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { tags = fnvlist_alloc(); fnvlist_add_boolean(tags, htag); fnvlist_add_nvlist(holds, za.za_name, tags); fnvlist_free(tags); } else { fnvlist_add_boolean(tags, htag); } } dsl_dataset_user_release_tmp(dp, holds); fnvlist_free(holds); zap_cursor_fini(&zc); } /* * Create the pool-wide zap object for storing temporary snapshot holds. */ static void dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) { objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; char *name; int error; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); ASSERT(dmu_tx_is_syncing(tx)); /* * If the pool was created prior to SPA_VERSION_USERREFS, the * zap object for temporary holds might not exist yet. 
*/ if (zapobj == 0) { if (holding) { dsl_pool_user_hold_create_obj(dp, tx); zapobj = dp->dp_tmp_userrefs_obj; } else { return (SET_ERROR(ENOENT)); } } name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); if (holding) error = zap_add(mos, zapobj, name, 8, 1, &now, tx); else error = zap_remove(mos, zapobj, name, tx); kmem_strfree(name); return (error); } /* * Add a temporary hold for the given dataset object and tag. */ int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t now, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); } /* * Release a temporary hold for the given dataset object and tag. */ int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE)); } /* * DSL Pool Configuration Lock * * The dp_config_rwlock protects against changes to DSL state (e.g. dataset * creation / destruction / rename / property setting). It must be held for * read to hold a dataset or dsl_dir. I.e. you must call * dsl_pool_config_enter() or dsl_pool_hold() before calling * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock * must be held continuously until all datasets and dsl_dirs are released. * * The only exception to this rule is that if a "long hold" is placed on * a dataset, then the dp_config_rwlock may be dropped while the dataset * is still held. The long hold will prevent the dataset from being * destroyed -- the destroy will fail with EBUSY. A long hold can be * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset * (by calling dsl_{dataset,objset}_{try}own{_obj}). * * Legitimate long-holders (including owners) should be long-running, cancelable * tasks that should cause "zfs destroy" to fail. This includes DMU * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), * "zfs send", and "zfs diff". 
There are several other long-holders whose * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). * * The usual formula for long-holding would be: * dsl_pool_hold() * dsl_dataset_hold() * ... perform checks ... * dsl_dataset_long_hold() * dsl_pool_rele() * ... perform long-running task ... * dsl_dataset_long_rele() * dsl_dataset_rele() * * Note that when the long hold is released, the dataset is still held but * the pool is not held. The dataset may change arbitrarily during this time * (e.g. it could be destroyed). Therefore you shouldn't do anything to the * dataset except release it. * * Operations generally fall somewhere into the following taxonomy: * * Read-Only Modifying * * Dataset Layer / MOS zfs get zfs destroy * * Individual Dataset read() write() * * * Dataset Layer Operations * * Modifying operations should generally use dsl_sync_task(). The synctask * infrastructure enforces proper locking strategy with respect to the * dp_config_rwlock. See the comment above dsl_sync_task() for details. * * Read-only operations will manually hold the pool, then the dataset, obtain * information from the dataset, then release the pool and dataset. * dmu_objset_{hold,rele}() are convenience routines that also do the pool * hold/rele. * * * Operations On Individual Datasets * * Objects _within_ an objset should only be modified by the current 'owner' * of the objset to prevent incorrect concurrent modification. Thus, use * {dmu_objset,dsl_dataset}_own to mark some entity as the current owner, * and fail with EBUSY if there is already an owner. The owner can then * implement its own locking strategy, independent of the dataset layer's * locking infrastructure. * (E.g., the ZPL has its own set of locks to control concurrency. A regular * vnop will not reach into the dataset layer). * * Ideally, objects would also only be read by the objset’s owner, so that we * don’t observe state mid-modification. * (E.g. 
the ZPL is creating a new object and linking it into a directory; if * you don’t coordinate with the ZPL to hold ZPL-level locks, you could see an * intermediate state. The ioctl level violates this but in pretty benign * ways, e.g. reading the zpl props object.) */ int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) { spa_t *spa; int error; error = spa_open(name, &spa, tag); if (error == 0) { *dp = spa_get_dsl(spa); dsl_pool_config_enter(*dp, tag); } return (error); } void dsl_pool_rele(dsl_pool_t *dp, void *tag) { dsl_pool_config_exit(dp, tag); spa_close(dp->dp_spa, tag); } void dsl_pool_config_enter(dsl_pool_t *dp, void *tag) { /* * We use a "reentrant" reader-writer lock, but not reentrantly. * * The rrwlock can (with the track_all flag) track all reading threads, * which is very useful for debugging which code path failed to release * the lock, and for verifying that the *current* thread does hold * the lock. * * (Unlike a rwlock, which knows that N threads hold it for * read, but not *which* threads, so rw_held(RW_READER) returns TRUE * if any thread holds it for read, even if this thread doesn't). */ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); } void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag) { ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter_read_prio(&dp->dp_config_rwlock, tag); } void dsl_pool_config_exit(dsl_pool_t *dp, void *tag) { rrw_exit(&dp->dp_config_rwlock, tag); } boolean_t dsl_pool_config_held(dsl_pool_t *dp) { return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); } boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp) { return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); } EXPORT_SYMBOL(dsl_pool_config_enter); EXPORT_SYMBOL(dsl_pool_config_exit); /* BEGIN CSTYLED */ /* zfs_dirty_data_max_percent only applied at module load in arc_init(). 
*/
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, INT, ZMOD_RD,
	"Max percent of RAM allowed to be dirty");

/* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, INT, ZMOD_RD,
	"zfs_dirty_data_max upper bound as % of RAM");

/*
 * NOTE(review): the parameters registered with ZMOD_RW are presumably
 * adjustable at runtime, unlike the ZMOD_RD ones that carry an "only applied
 * at module load" note -- confirm against the ZFS_MODULE_PARAM definition.
 */
ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW,
	"Transaction delay threshold");

ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW,
	"Determines the dirty space limit");

ZFS_MODULE_PARAM(zfs, zfs_, wrlog_data_max, ULONG, ZMOD_RW,
	"The size limit of write-transaction zil log data");

/* zfs_dirty_data_max_max only applied at module load in arc_init(). */
ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD,
	"zfs_dirty_data_max upper bound in bytes");

ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW,
	"Dirty data txg sync threshold as a percentage of zfs_dirty_data_max");

ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW,
	"How quickly delay approaches infinity");

ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW,
	"Max percent of CPUs that are used to sync dirty data");

/*
 * The remaining parameters are registered under the zfs_zil scope with the
 * zfs_zil_ name prefix and configure the "clean" taskq.
 *
 * NOTE(review): the description of clean_taskq_nthr_pct mentions
 * dp_sync_taskq although the parameter itself is named after the clean
 * taskq -- confirm whether the wording is intended.
 */
ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW,
	"Max percent of CPUs that are used per dp_sync_taskq");

ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW,
	"Number of taskq entries that are pre-populated");

ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW,
	"Max number of taskq entries that are cached");
/* END CSTYLED */