Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 282755) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 282756) @@ -1,1829 +1,1820 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Needed to close a window in dnode_move() that allows the objset to be freed * before it can be safely accessed. */ krwlock_t os_lock; void dmu_objset_init(void) { rw_init(&os_lock, NULL, RW_DEFAULT, NULL); } void dmu_objset_fini(void) { rw_destroy(&os_lock); } spa_t * dmu_objset_spa(objset_t *os) { return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { return (os->os_zil); } dsl_pool_t * dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? ds->ds_object : 0); } zfs_sync_type_t dmu_objset_syncprop(objset_t *os) { return (os->os_sync); } zfs_logbias_op_t dmu_objset_logbias(objset_t *os) { return (os->os_logbias); } static void checksum_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); } static void compression_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval != ZIO_COMPRESS_INHERIT); os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); } static void copies_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval > 0); ASSERT(newval <= spa_max_replication(os->os_spa)); os->os_copies = newval; } static void dedup_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; spa_t *spa = os->os_spa; enum zio_checksum checksum; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); } static void primary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_primary_cache = newval; } static void secondary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_secondary_cache = newval; } static void sync_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || newval == ZFS_SYNC_DISABLED); os->os_sync = newval; if (os->os_zil) zil_set_sync(os->os_zil, newval); } static void redundant_metadata_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || newval == ZFS_REDUNDANT_METADATA_MOST); os->os_redundant_metadata = newval; } static void logbias_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; ASSERT(newval == ZFS_LOGBIAS_LATENCY || newval == ZFS_LOGBIAS_THROUGHPUT); os->os_logbias = newval; if (os->os_zil) zil_set_logbias(os->os_zil, newval); } static void recordsize_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; os->os_recordsize = newval; } void dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); osp->os_flags = BSWAP_64(osp->os_flags); if (size == sizeof (objset_phys_t)) { dnode_byteswap(&osp->os_userused_dnode); dnode_byteswap(&osp->os_groupused_dnode); } } int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_t **osp) { objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_FLAG_L2CACHE; if (DMU_OS_IS_L2COMPRESSIBLE(os)) aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err != 0) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { arc_buf_t *buf = arc_buf_alloc(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); (void) arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } os->os_phys = os->os_phys_buf->b_data; os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; os->os_phys_buf = arc_buf_alloc(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); } /* * Note: the changed_cb will be called once before the register * func returns, thus changing the checksum/compression from the * default (fletcher2/off). Snapshots don't need to know about * checksum/compression/copies. */ if (ds != NULL) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } if (!dsl_dataset_is_snapshot(ds)) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name( ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os); } } if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); kmem_free(os, sizeof (objset_t)); return (err); } } else { /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_LZJB; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; os->os_dedup_verify = B_FALSE; os->os_logbias = ZFS_LOGBIAS_LATENCY; os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; } if (ds == NULL || !dsl_dataset_is_snapshot(ds)) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); list_create(&os->os_free_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); } list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); DMU_META_DNODE(os) = dnode_special_open(os, &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { DMU_USERUSED_DNODE(os) = dnode_special_open(os, &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, &os->os_userused_dnode); DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } *osp = os; return (0); } int dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { int err = 0; mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { objset_t *os; err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, dsl_dataset_get_blkptr(ds), &os); if (err == 0) { mutex_enter(&ds->ds_lock); ASSERT(ds->ds_objset == NULL); ds->ds_objset = os; mutex_exit(&ds->ds_lock); } } *osp = ds->ds_objset; mutex_exit(&ds->ds_opening_lock); return (err); } /* * Holds the pool while the objset is held. Therefore only one objset * can be held at a time. */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); err = dsl_dataset_hold(dp, name, tag, &ds); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } err = dmu_objset_from_ds(ds, osp); if (err != 0) { dsl_dataset_rele(ds, tag); dsl_pool_rele(dp, tag); } return (err); } /* * dsl_pool must not be held when this is called. * Upon successful return, there will be a longhold on the dataset, * and the dsl_pool will not be held. */ int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_own(dp, name, tag, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } err = dmu_objset_from_ds(ds, osp); dsl_pool_rele(dp, FTAG); if (err != 0) { dsl_dataset_disown(ds, tag); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EINVAL)); } else if (!readonly && dsl_dataset_is_snapshot(ds)) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EROFS)); } return (err); } void dmu_objset_rele(objset_t *os, void *tag) { dsl_pool_t *dp = dmu_objset_pool(os); dsl_dataset_rele(os->os_dsl_dataset, tag); dsl_pool_rele(dp, tag); } /* * When we are called, os MUST refer to an objset associated with a dataset * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner * == tag. We will then release and reacquire ownership of the dataset while * holding the pool config_rwlock to avoid intervening namespace or ownership * changes may occur. * * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to * release the hold on its dataset and acquire a new one on the dataset of the * same name so that it can be partially torn down and reconstructed. */ void dmu_objset_refresh_ownership(objset_t *os, void *tag) { dsl_pool_t *dp; dsl_dataset_t *ds, *newds; char name[MAXNAMELEN]; ds = os->os_dsl_dataset; VERIFY3P(ds, !=, NULL); VERIFY3P(ds->ds_owner, ==, tag); VERIFY(dsl_dataset_long_held(ds)); dsl_dataset_name(ds, name); dp = dmu_objset_pool(os); dsl_pool_config_enter(dp, FTAG); dmu_objset_disown(os, tag); VERIFY0(dsl_dataset_own(dp, name, tag, &newds)); VERIFY3P(newds, ==, os->os_dsl_dataset); dsl_pool_config_exit(dp, FTAG); } void dmu_objset_disown(objset_t *os, void *tag) { dsl_dataset_disown(os->os_dsl_dataset, tag); } void dmu_objset_evict_dbufs(objset_t *os) { dnode_t *dn; mutex_enter(&os->os_lock); /* process the mdn last, since the other dnodes have holds on it */ list_remove(&os->os_dnodes, DMU_META_DNODE(os)); list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); /* * Find the first dnode with holds. We have to do this dance * because dnode_add_ref() only works if you already have a * hold. If there are no holds then it has no dbufs so OK to * skip. */ for (dn = list_head(&os->os_dnodes); dn && !dnode_add_ref(dn, FTAG); dn = list_next(&os->os_dnodes, dn)) continue; while (dn) { dnode_t *next_dn = dn; do { next_dn = list_next(&os->os_dnodes, next_dn); } while (next_dn && !dnode_add_ref(next_dn, FTAG)); mutex_exit(&os->os_lock); dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); mutex_enter(&os->os_lock); dn = next_dn; } mutex_exit(&os->os_lock); } void dmu_objset_evict(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; for (int t = 0; t < TXG_SIZE; t++) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { if (!dsl_dataset_is_snapshot(ds)) { VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os)); } VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os)); } if (os->os_sa) sa_tear_down(os); dmu_objset_evict_dbufs(os); dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); ASSERT3P(list_head(&os->os_dnodes), ==, NULL); VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* * This is a barrier to prevent the objset from going away in * dnode_move() until we can safely ensure that the objset is still in * use. We consider the objset valid before the barrier and invalid * after the barrier. */ rw_enter(&os_lock, RW_READER); rw_exit(&os_lock); mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); kmem_free(os, sizeof (objset_t)); } timestruc_t dmu_objset_snap_cmtime(objset_t *os) { return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } /* called from dsl for meta-objset */ objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); if (ds != NULL) VERIFY0(dmu_objset_from_ds(ds, &os)); else VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); /* * We don't want to have to increase the meta-dnode's nlevels * later, because then we could do it in quescing context while * we are also accessing it in open context. * * This precaution is not necessary for the MOS (ds == NULL), * because the MOS is only updated in syncing context. * This is most fortunate: the MOS is the only objset that * needs to be synced multiple times as spa_sync() iterates * to convergence, so minimizing its dn_nlevels matters. */ if (ds != NULL) { int levels = 1; /* * Determine the number of levels necessary for the meta-dnode * to contain DN_MAX_OBJECT dnodes. */ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < DN_MAX_OBJECT * sizeof (dnode_phys_t)) levels++; mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = mdn->dn_nlevels = levels; } ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); os->os_phys->os_type = type; if (dmu_objset_userused_enabled(os)) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; os->os_flags = os->os_phys->os_flags; } dsl_dataset_dirty(ds, tx); return (os); } typedef struct dmu_objset_create_arg { const char *doca_name; cred_t *doca_cred; void (*doca_userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *doca_userarg; dmu_objset_type_t doca_type; uint64_t doca_flags; } dmu_objset_create_arg_t; /*ARGSUSED*/ static int dmu_objset_create_check(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; int error; if (strchr(doca->doca_name, '@') != NULL) return (SET_ERROR(EINVAL)); error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred); dsl_dir_rele(pdd, FTAG); return (error); } static void dmu_objset_create_sync(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; dsl_dataset_t *ds; uint64_t obj; blkptr_t *bp; objset_t *os; VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, doca->doca_cred, tx); VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); bp = dsl_dataset_get_blkptr(ds); os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, ds, bp, doca->doca_type, tx); if (doca->doca_userfunc != NULL) { doca->doca_userfunc(os, doca->doca_userarg, doca->doca_cred, tx); } spa_history_log_internal_ds(ds, "create", tx, ""); dsl_dataset_rele(ds, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dmu_objset_create_arg_t doca; doca.doca_name = name; doca.doca_cred = CRED(); doca.doca_flags = flags; doca.doca_userfunc = func; doca.doca_userarg = arg; doca.doca_type = type; return (dsl_sync_task(name, dmu_objset_create_check, dmu_objset_create_sync, &doca, 5, ZFS_SPACE_CHECK_NORMAL)); } typedef struct dmu_objset_clone_arg { const char *doca_clone; const char *doca_origin; cred_t *doca_cred; } dmu_objset_clone_arg_t; /*ARGSUSED*/ static int dmu_objset_clone_check(void *arg, dmu_tx_t *tx) { dmu_objset_clone_arg_t *doca = arg; dsl_dir_t *pdd; const char *tail; int error; dsl_dataset_t *origin; dsl_pool_t *dp = dmu_tx_pool(tx); if (strchr(doca->doca_clone, '@') != NULL) return (SET_ERROR(EINVAL)); error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } - /* You can't clone across pools. */ - if (pdd->dd_pool != dp) { - dsl_dir_rele(pdd, FTAG); - return (SET_ERROR(EXDEV)); - } + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EDQUOT)); } dsl_dir_rele(pdd, FTAG); error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); if (error != 0) return (error); - - /* You can't clone across pools. */ - if (origin->ds_dir->dd_pool != dp) { - dsl_dataset_rele(origin, FTAG); - return (SET_ERROR(EXDEV)); - } /* You can only clone snapshots, not the head datasets. */ if (!dsl_dataset_is_snapshot(origin)) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } dsl_dataset_rele(origin, FTAG); return (0); } static void dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) { dmu_objset_clone_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; dsl_dataset_t *origin, *ds; uint64_t obj; char namebuf[MAXNAMELEN]; VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); obj = dsl_dataset_create_sync(pdd, tail, origin, 0, doca->doca_cred, tx); VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); dsl_dataset_name(origin, namebuf); spa_history_log_internal_ds(ds, "clone", tx, "origin=%s (%llu)", namebuf, origin->ds_object); dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_clone(const char *clone, const char *origin) { dmu_objset_clone_arg_t doca; doca.doca_clone = clone; doca.doca_origin = origin; doca.doca_cred = CRED(); return (dsl_sync_task(clone, dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5, ZFS_SPACE_CHECK_NORMAL)); } int dmu_objset_snapshot_one(const char *fsname, const char *snapname) { int err; char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); nvlist_t *snaps = fnvlist_alloc(); fnvlist_add_boolean(snaps, longsnap); strfree(longsnap); err = dsl_dataset_snapshot(snaps, NULL, NULL); fnvlist_free(snaps); return (err); } static void dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { dnode_t *dn; while (dn = list_head(list)) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* * Initialize dn_zio outside dnode_sync() because the * meta-dnode needs to set it ouside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); list_remove(list, dn); if (newlist) { (void) dnode_add_ref(dn, newlist); list_insert_tail(newlist, dn); } dnode_sync(dn, tx); } } /* ARGSUSED */ static void dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3P(bp, ==, os->os_rootbp); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); ASSERT0(BP_GET_LEVEL(bp)); /* * Update rootbp fill count: it should be the number of objects * allocated in the object set (not counting the "special" * objects that are stored in the objset_phys_t -- the meta * dnode and user/group accounting objects). */ bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); } /* ARGSUSED */ static void dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } } /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; list_t *list; list_t *newlist = NULL; dbuf_dirty_record_t *dr; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); ASSERT(dmu_tx_is_syncing(tx)); /* XXX the write_done callback should really give us the tx... */ os->os_synctx = tx; if (os->os_dsl_dataset == NULL) { /* * This is the MOS. If we have upgraded, * spa_max_replication() could change, so reset * os_copies here. */ os->os_copies = spa_max_replication(os->os_spa); } /* * Create the root block IO */ SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); arc_release(os->os_phys_buf, &os->os_phys_buf); dmu_write_policy(os, NULL, 0, 0, &zp); zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync special dnodes - the parent IO for the sync is the root block */ DMU_META_DNODE(os)->dn_zio = zio; dnode_sync(DMU_META_DNODE(os), tx); os->os_phys->os_flags = os->os_flags; if (DMU_USERUSED_DNODE(os) && DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { DMU_USERUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_USERUSED_DNODE(os), tx); DMU_GROUPUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_GROUPUSED_DNODE(os), tx); } txgoff = tx->tx_txg & TXG_MASK; if (dmu_objset_userused_enabled(os)) { newlist = &os->os_synced_dnodes; /* * We must create the list here because it uses the * dn_dirty_link[] of this txg. */ list_create(newlist, sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[txgoff])); } dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while (dr = list_head(list)) { ASSERT0(dr->dr_dbuf->db_level); list_remove(list, dr); if (dr->dr_zio) zio_nowait(dr->dr_zio); } /* * Free intent log blocks up to this tx. */ zil_sync(os->os_zil, tx); os->os_phys->os_zil_header = os->os_zil_header; zio_nowait(zio); } boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg) { return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); } static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) { used_cbs[ost] = cb; } boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && used_cbs[os->os_phys->os_type] != NULL && DMU_USERUSED_DNODE(os) != NULL); } static void do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) { if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { int64_t delta = DNODE_SIZE + used; if (subtract) delta = -delta; VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, user, delta, tx)); VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, group, delta, tx)); } } void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) { dnode_t *dn; list_t *list = &os->os_synced_dnodes; ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); while (dn = list_head(list)) { int flags; ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED); /* Allocate the user/groupused objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { VERIFY(0 == zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); VERIFY(0 == zap_create_claim(os, DMU_GROUPUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } /* * We intentionally modify the zap object even if the * net delta is zero. Otherwise * the block of the zap obj could be shared between * datasets but need to be different between them after * a bprewrite. */ flags = dn->dn_id_flags; ASSERT(flags); if (flags & DN_ID_OLD_EXIST) { do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); } if (flags & DN_ID_NEW_EXIST) { do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid, B_FALSE, tx); } mutex_enter(&dn->dn_mtx); dn->dn_oldused = 0; dn->dn_oldflags = 0; if (dn->dn_id_flags & DN_ID_NEW_EXIST) { dn->dn_olduid = dn->dn_newuid; dn->dn_oldgid = dn->dn_newgid; dn->dn_id_flags |= DN_ID_OLD_EXIST; if (dn->dn_bonuslen == 0) dn->dn_id_flags |= DN_ID_CHKED_SPILL; else dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); mutex_exit(&dn->dn_mtx); list_remove(list, dn); dnode_rele(dn, list); } } /* * Returns a pointer to data to find uid/gid from * * If a dirty record for transaction group that is syncing can't * be found then NULL is returned. In the NULL case it is assumed * the uid/gid aren't changing. */ static void * dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) { dbuf_dirty_record_t *dr, **drp; void *data; if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg == tx->tx_txg) break; if (dr == NULL) { data = NULL; } else { dnode_t *dn; DB_DNODE_ENTER(dr->dr_dbuf); dn = DB_DNODE(dr->dr_dbuf); if (dn->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; DB_DNODE_EXIT(dr->dr_dbuf); } return (data); } void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; void *data = NULL; dmu_buf_impl_t *db = NULL; uint64_t *user = NULL; uint64_t *group = NULL; int flags = dn->dn_id_flags; int error; boolean_t have_spill = B_FALSE; if (!dmu_objset_userused_enabled(dn->dn_objset)) return; if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| DN_ID_CHKED_SPILL))) return; if (before && dn->dn_bonuslen != 0) data = DN_BONUS(dn->dn_phys); else if (!before && dn->dn_bonuslen != 0) { if (dn->dn_bonus) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); data = dmu_objset_userquota_find_data(db, tx); } else { data = DN_BONUS(dn->dn_phys); } } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { int rf = 0; if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; error = dmu_spill_hold_by_dnode(dn, rf | DB_RF_MUST_SUCCEED, FTAG, (dmu_buf_t **)&db); ASSERT(error == 0); mutex_enter(&db->db_mtx); data = (before) ? db->db.db_data : dmu_objset_userquota_find_data(db, tx); have_spill = B_TRUE; } else { mutex_enter(&dn->dn_mtx); dn->dn_id_flags |= DN_ID_CHKED_BONUS; mutex_exit(&dn->dn_mtx); return; } if (before) { ASSERT(data); user = &dn->dn_olduid; group = &dn->dn_oldgid; } else if (data) { user = &dn->dn_newuid; group = &dn->dn_newgid; } /* * Must always call the callback in case the object * type has changed and that type isn't an object type to track */ error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, user, group); /* * Preserve existing uid/gid when the callback can't determine * what the new uid/gid are and the callback returned EEXIST. * The EEXIST error tells us to just use the existing uid/gid. * If we don't know what the old values are then just assign * them to 0, since that is a new file being created. */ if (!before && data == NULL && error == EEXIST) { if (flags & DN_ID_OLD_EXIST) { dn->dn_newuid = dn->dn_olduid; dn->dn_newgid = dn->dn_oldgid; } else { dn->dn_newuid = 0; dn->dn_newgid = 0; } error = 0; } if (db) mutex_exit(&db->db_mtx); mutex_enter(&dn->dn_mtx); if (error == 0 && before) dn->dn_id_flags |= DN_ID_OLD_EXIST; if (error == 0 && !before) dn->dn_id_flags |= DN_ID_NEW_EXIST; if (have_spill) { dn->dn_id_flags |= DN_ID_CHKED_SPILL; } else { dn->dn_id_flags |= DN_ID_CHKED_BONUS; } mutex_exit(&dn->dn_mtx); if (have_spill) dmu_buf_rele((dmu_buf_t *)db, FTAG); } boolean_t dmu_objset_userspace_present(objset_t *os) { return (os->os_phys->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); } int dmu_objset_userspace_upgrade(objset_t *os) { uint64_t obj; int err = 0; if (dmu_objset_userspace_present(os)) return (0); if (!dmu_objset_userused_enabled(os)) return (SET_ERROR(ENOTSUP)); if (dmu_objset_is_snapshot(os)) return (SET_ERROR(EINVAL)); /* * We simply need to mark every object dirty, so that it will be * synced out and now accounted. If this is called * concurrently, or if we already did some work before crashing, * that's fine, since we track each object's accounted state * independently. */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { dmu_tx_t *tx; dmu_buf_t *db; int objerr; if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); objerr = dmu_bonus_hold(os, obj, FTAG, &db); if (objerr != 0) continue; tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); if (objerr != 0) { dmu_tx_abort(tx); continue; } dmu_buf_will_dirty(db, tx); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; txg_wait_synced(dmu_objset_pool(os), 0); return (0); } void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, usedobjsp, availobjsp); } uint64_t dmu_objset_fsid_guid(objset_t *os) { return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); } void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) { stat->dds_type = os->os_phys->os_type; if (os->os_dsl_dataset) dsl_dataset_fast_stat(os->os_dsl_dataset, stat); } void dmu_objset_stats(objset_t *os, nvlist_t *nv) { ASSERT(os->os_dsl_dataset || os->os_phys->os_type == DMU_OST_META); if (os->os_dsl_dataset != NULL) dsl_dataset_stats(os->os_dsl_dataset, nv); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, os->os_phys->os_type); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, dmu_objset_userspace_present(os)); } int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); else return (B_FALSE); } int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, real, maxlen, conflict)); } int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; zap_cursor_t cursor; zap_attribute_t attr; ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (SET_ERROR(ENAMETOOLONG)); } (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; if (case_conflict) *case_conflict = attr.za_normalization_conflict; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp) { dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; zap_cursor_t cursor; zap_attribute_t attr; /* there is no next dir on a snapshot! */ if (os->os_dsl_dataset->ds_object != dsl_dir_phys(dd)->dd_head_dataset_obj) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (SET_ERROR(ENAMETOOLONG)); } (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } /* * Find objsets under and including ddobj, call func(ds) on each. */ int dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) { dsl_dir_t *dd; dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; uint64_t thisobj; int err; ASSERT(dsl_pool_config_held(dp)); err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); if (err != 0) return (err); /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); return (0); } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); err = dmu_objset_find_dp(dp, attr->za_first_integer, func, arg, flags); if (err != 0) break; } zap_cursor_fini(&zc); if (err != 0) { dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } } /* * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { dsl_dataset_t *ds; err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); err = dsl_dataset_hold_obj(dp, attr->za_first_integer, FTAG, &ds); if (err != 0) break; err = func(dp, ds, arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } zap_cursor_fini(&zc); } } dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); if (err != 0) return (err); /* * Apply to self. */ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err != 0) return (err); err = func(dp, ds, arg); dsl_dataset_rele(ds, FTAG); return (err); } /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. * The dp_config_rwlock must not be held when this is called, and it * will not be held when the callback is called. * Therefore this function should only be used when the pool is not changing * (e.g. in syncing context), or the callback can deal with the possible races. */ static int dmu_objset_find_impl(spa_t *spa, const char *name, int func(const char *, void *), void *arg, int flags) { dsl_dir_t *dd; dsl_pool_t *dp = spa_get_dsl(spa); dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; char *child; uint64_t thisobj; int err; dsl_pool_config_enter(dp, FTAG); err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); if (err != 0) { dsl_pool_config_exit(dp, FTAG); return (err); } /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); dsl_pool_config_exit(dp, FTAG); return (0); } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s/%s", name, attr->za_name); dsl_pool_config_exit(dp, FTAG); err = dmu_objset_find_impl(spa, child, func, arg, flags); dsl_pool_config_enter(dp, FTAG); strfree(child); if (err != 0) break; } zap_cursor_fini(&zc); if (err != 0) { dsl_dir_rele(dd, FTAG); dsl_pool_config_exit(dp, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } } /* * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s@%s", name, attr->za_name); dsl_pool_config_exit(dp, FTAG); err = func(child, arg); dsl_pool_config_enter(dp, FTAG); strfree(child); if (err != 0) break; } zap_cursor_fini(&zc); } } dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); dsl_pool_config_exit(dp, FTAG); if (err != 0) return (err); /* Apply to self. */ return (func(name, arg)); } /* * See comment above dmu_objset_find_impl(). */ int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags) { spa_t *spa; int error; error = spa_open(name, &spa, FTAG); if (error != 0) return (error); error = dmu_objset_find_impl(spa, name, func, arg, flags); spa_close(spa, FTAG); return (error); } void dmu_objset_set_user(objset_t *os, void *user_ptr) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); os->os_user_ptr = user_ptr; } void * dmu_objset_get_user(objset_t *os) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); return (os->os_user_ptr); } /* * Determine name of filesystem, given name of snapshot. * buf must be at least MAXNAMELEN bytes */ int dmu_fsname(const char *snapname, char *buf) { char *atp = strchr(snapname, '@'); if (atp == NULL) return (SET_ERROR(EINVAL)); if (atp - snapname >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(buf, snapname, atp - snapname + 1); return (0); } Index: stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c =================================================================== --- stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c (revision 282755) +++ stable/10/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c (revision 282756) @@ -1,1983 +1,1984 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" /* * Filesystem and Snapshot Limits * ------------------------------ * * These limits are used to restrict the number of filesystems and/or snapshots * that can be created at a given level in the tree or below. A typical * use-case is with a delegated dataset where the administrator wants to ensure * that a user within the zone is not creating too many additional filesystems * or snapshots, even though they're not exceeding their space quota. * * The filesystem and snapshot counts are stored as extensible properties. This * capability is controlled by a feature flag and must be enabled to be used. * Once enabled, the feature is not active until the first limit is set. At * that point, future operations to create/destroy filesystems or snapshots * will validate and update the counts. * * Because the count properties will not exist before the feature is active, * the counts are updated when a limit is first set on an uninitialized * dsl_dir node in the tree (The filesystem/snapshot count on a node includes * all of the nested filesystems/snapshots. Thus, a new leaf node has a * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and * snapshot count properties on a node indicate uninitialized counts on that * node.) When first setting a limit on an uninitialized node, the code starts * at the filesystem with the new limit and descends into all sub-filesystems * to add the count properties. * * In practice this is lightweight since a limit is typically set when the * filesystem is created and thus has no children. Once valid, changing the * limit value won't require a re-traversal since the counts are already valid. * When recursively fixing the counts, if a node with a limit is encountered * during the descent, the counts are known to be valid and there is no need to * descend into that filesystem's children. The counts on filesystems above the * one with the new limit will still be uninitialized, unless a limit is * eventually set on one of those filesystems. The counts are always recursively * updated when a limit is set on a dataset, unless there is already a limit. * When a new limit value is set on a filesystem with an existing limit, it is * possible for the new limit to be less than the current count at that level * since a user who can change the limit is also allowed to exceed the limit. * * Once the feature is active, then whenever a filesystem or snapshot is * created, the code recurses up the tree, validating the new count against the * limit at each initialized level. In practice, most levels will not have a * limit set. If there is a limit at any initialized level up the tree, the * check must pass or the creation will fail. Likewise, when a filesystem or * snapshot is destroyed, the counts are recursively adjusted all the way up * the initizized nodes in the tree. Renaming a filesystem into different point * in the tree will first validate, then update the counts on each branch up to * the common ancestor. A receive will also validate the counts and then update * them. * * An exception to the above behavior is that the limit is not enforced if the * user has permission to modify the limit. This is primarily so that * recursive snapshots in the global zone always work. We want to prevent a * denial-of-service in which a lower level delegated dataset could max out its * limit and thus block recursive snapshots from being taken in the global zone. * Because of this, it is possible for the snapshot count to be over the limit * and snapshots taken in the global zone could cause a lower level dataset to * hit or exceed its limit. The administrator taking the global zone recursive * snapshot should be aware of this side-effect and behave accordingly. * For consistency, the filesystem limit is also not enforced if the user can * modify the limit. * * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by * dsl_dir_init_fs_ss_count(). * * There is a special case when we receive a filesystem that already exists. In * this case a temporary clone name of %X is created (see dmu_recv_begin). We * never update the filesystem counts for temporary clones. * * Likewise, we do not update the snapshot counts for temporary snapshots, * such as those created by zfs diff. */ extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); /* ARGSUSED */ static void dsl_dir_evict(dmu_buf_t *db, void *arg) { dsl_dir_t *dd = arg; dsl_pool_t *dp = dd->dd_pool; int t; for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); ASSERT(dd->dd_space_towrite[t] == 0); } if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); spa_close(dd->dd_pool->dp_spa, dd); /* * The props callback list should have been cleaned up by * objset_evict(). */ list_destroy(&dd->dd_prop_cbs); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; int err; ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); } #endif if (dd == NULL) { dsl_dir_t *winner; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_node)); dsl_dir_snap_cmtime_update(dd); if (dsl_dir_phys(dd)->dd_parent_obj) { err = dsl_dir_hold_obj(dp, dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, &dd->dd_parent); if (err != 0) goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { err = zap_value_search(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } if (err != 0) goto errout; } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } if (dsl_dir_is_clone(dd)) { dmu_buf_t *origin_bonus; dsl_dataset_phys_t *origin_phys; /* * We can't open the origin dataset, because * that would require opening this dsl_dir. * Just look at its phys directly instead. */ err = dmu_bonus_hold(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_bonus); if (err != 0) goto errout; origin_phys = origin_bonus->db_data; dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); } winner = dmu_buf_set_user_ie(dbuf, dd, dsl_dir_evict); if (winner) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; } else { spa_open_ref(dp->dp_spa, dd); } } /* * The dsl_dir_t has both open-to-close and instantiate-to-evict * holds on the spa. We need the open-to-close holds because * otherwise the spa_refcnt wouldn't change when we open a * dir which the spa also has open, so we could incorrectly * think it was OK to unload/export/destroy the pool. We need * the instantiate-to-evict hold because the dsl_dir_t has a * pointer to the dd_pool, which has a pointer to the spa_t. */ spa_open_ref(dp->dp_spa, tag); ASSERT3P(dd->dd_pool, ==, dp); ASSERT3U(dd->dd_object, ==, ddobj); ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); errout: if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); } void dsl_dir_rele(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ void dsl_dir_name(dsl_dir_t *dd, char *buf) { if (dd->dd_parent) { dsl_dir_name(dd->dd_parent, buf); (void) strcat(buf, "/"); } else { buf[0] = '\0'; } if (!MUTEX_HELD(&dd->dd_lock)) { /* * recursive mutex so that we can use * dprintf_dd() with dd_lock held */ mutex_enter(&dd->dd_lock); (void) strcat(buf, dd->dd_myname); mutex_exit(&dd->dd_lock); } else { (void) strcat(buf, dd->dd_myname); } } /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ int dsl_dir_namelen(dsl_dir_t *dd) { int result = 0; if (dd->dd_parent) { /* parent's name + 1 for the "/" */ result = dsl_dir_namelen(dd->dd_parent) + 1; } if (!MUTEX_HELD(&dd->dd_lock)) { /* see dsl_dir_name */ mutex_enter(&dd->dd_lock); result += strlen(dd->dd_myname); mutex_exit(&dd->dd_lock); } else { result += strlen(dd->dd_myname); } return (result); } static int getcomponent(const char *path, char *component, const char **nextp) { char *p; if ((path == NULL) || (path[0] == '\0')) return (SET_ERROR(ENOENT)); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); if (p && (p[1] == '/' || p[1] == '@')) { /* two separators in a row */ return (SET_ERROR(EINVAL)); } if (p == NULL || p == path) { /* * if the first thing is an @ or /, it had better be an * @ and it had better not have any more ats or slashes, * and it had better have something after the @. */ if (p != NULL && (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) return (SET_ERROR(EINVAL)); if (strlen(path) >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strcpy(component, path); p = NULL; } else if (p[0] == '/') { if (p - path >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; p++; } else if (p[0] == '@') { /* * if the next separator is an @, there better not be * any more slashes. */ if (strchr(path, '/')) return (SET_ERROR(EINVAL)); if (p - path >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; } else { panic("invalid p=%p", (void *)p); } *nextp = p; return (0); } /* * Return the dsl_dir_t, and possibly the last component which couldn't * be found in *tail. The name must be in the specified dsl_pool_t. This * thread must hold the dp_config_rwlock for the pool. Returns NULL if the * path is bogus, or if tail==NULL and we couldn't parse the whole name. * (*tail)[0] == '@' means that the last component is a snapshot. */ int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { char buf[MAXNAMELEN]; const char *spaname, *next, *nextnext = NULL; int err; dsl_dir_t *dd; uint64_t ddobj; err = getcomponent(name, buf, &next); if (err != 0) return (err); /* Make sure the name is in the specified pool. */ spaname = spa_name(dp->dp_spa); if (strcmp(buf, spaname) != 0) - return (SET_ERROR(EINVAL)); + return (SET_ERROR(EXDEV)); ASSERT(dsl_pool_config_held(dp)); err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); if (err != 0) { return (err); } while (next != NULL) { dsl_dir_t *child_ds; err = getcomponent(next, buf, &nextnext); if (err != 0) break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); if (err != 0) { if (err == ENOENT) err = 0; break; } err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds); if (err != 0) break; dsl_dir_rele(dd, tag); dd = child_ds; next = nextnext; } if (err != 0) { dsl_dir_rele(dd, tag); return (err); } /* * It's an error if there's more than one component left, or * tailp==NULL and there's any component left. */ if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ dsl_dir_rele(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); err = SET_ERROR(ENOENT); } if (tailp != NULL) *tailp = next; *ddp = dd; return (err); } /* * If the counts are already initialized for this filesystem and its * descendants then do nothing, otherwise initialize the counts. * * The counts on this filesystem, and those below, may be uninitialized due to * either the use of a pre-existing pool which did not support the * filesystem/snapshot limit feature, or one in which the feature had not yet * been enabled. * * Recursively descend the filesystem tree and update the filesystem/snapshot * counts on each filesystem below, then update the cumulative count on the * current filesystem. If the filesystem already has a count set on it, * then we know that its counts, and the counts on the filesystems below it, * are already correct, so we don't have to update this filesystem. */ static void dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) { uint64_t my_fs_cnt = 0; uint64_t my_ss_cnt = 0; dsl_pool_t *dp = dd->dd_pool; objset_t *os = dp->dp_meta_objset; zap_cursor_t *zc; zap_attribute_t *za; dsl_dataset_t *ds; ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); ASSERT(dsl_pool_config_held(dp)); ASSERT(dmu_tx_is_syncing(tx)); dsl_dir_zapify(dd, tx); /* * If the filesystem count has already been initialized then we * don't need to recurse down any further. */ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) return; zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* Iterate my child dirs */ for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dir_t *chld_dd; uint64_t count; VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, &chld_dd)); /* * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and * temporary datasets. */ if (chld_dd->dd_myname[0] == '$' || chld_dd->dd_myname[0] == '%') { dsl_dir_rele(chld_dd, FTAG); continue; } my_fs_cnt++; /* count this child */ dsl_dir_init_fs_ss_count(chld_dd, tx); VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); my_fs_cnt += count; VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); my_ss_cnt += count; dsl_dir_rele(chld_dd, FTAG); } zap_cursor_fini(zc); /* Count my snapshots (we counted children's snapshots above) */ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { /* Don't count temporary snapshots */ if (za->za_name[0] != '%') my_ss_cnt++; } zap_cursor_fini(zc); dsl_dataset_rele(ds, FTAG); kmem_free(zc, sizeof (zap_cursor_t)); kmem_free(za, sizeof (zap_attribute_t)); /* we're in a sync task, update counts */ dmu_buf_will_dirty(dd->dd_dbuf, tx); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); } static int dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; int error; error = dsl_dataset_hold(dp, ddname, FTAG, &ds); if (error != 0) return (error); if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } dd = ds->ds_dir; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && dsl_dir_is_zapified(dd) && zap_contains(dp->dp_meta_objset, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EALREADY)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; spa_t *spa; VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); spa = dsl_dataset_get_spa(ds); if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Since the feature was not active and we're now setting a * limit, increment the feature-active counter so that the * feature becomes active for the first time. * * We are already in a sync task so we can update the MOS. */ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); } /* * Since we are now setting a non-UINT64_MAX limit on the filesystem, * we need to ensure the counts are correct. Descend down the tree from * this point and update all of the counts to be accurate. */ dsl_dir_init_fs_ss_count(ds->ds_dir, tx); dsl_dataset_rele(ds, FTAG); } /* * Make sure the feature is enabled and activate it if necessary. * Since we're setting a limit, ensure the on-disk counts are valid. * This is only called by the ioctl path when setting a limit value. * * We do not need to validate the new limit, since users who can change the * limit are also allowed to exceed the limit. */ int dsl_dir_activate_fs_ss_limit(const char *ddname) { int error; error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, ZFS_SPACE_CHECK_RESERVED); if (error == EALREADY) error = 0; return (error); } /* * Used to determine if the filesystem_limit or snapshot_limit should be * enforced. We allow the limit to be exceeded if the user has permission to * write the property value. We pass in the creds that we got in the open * context since we will always be the GZ root in syncing context. We also have * to handle the case where we are allowed to change the limit on the current * dataset, but there may be another limit in the tree above. * * We can never modify these two properties within a non-global zone. In * addition, the other checks are modeled on zfs_secpolicy_write_perms. We * can't use that function since we are already holding the dp_config_rwlock. * In addition, we already have the dd and dealing with snapshots is simplified * in this code. */ typedef enum { ENFORCE_ALWAYS, ENFORCE_NEVER, ENFORCE_ABOVE } enforce_res_t; static enforce_res_t dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) { enforce_res_t enforce = ENFORCE_ALWAYS; uint64_t obj; dsl_dataset_t *ds; uint64_t zoned; ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); #ifdef _KERNEL #ifdef __FreeBSD__ if (jailed(cr)) #else if (crgetzoneid(cr) != GLOBAL_ZONEID) #endif return (ENFORCE_ALWAYS); if (secpolicy_zfs(cr) == 0) return (ENFORCE_NEVER); #endif if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) return (ENFORCE_ALWAYS); ASSERT(dsl_pool_config_held(dd->dd_pool)); if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) return (ENFORCE_ALWAYS); if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { /* Only root can access zoned fs's from the GZ */ enforce = ENFORCE_ALWAYS; } else { if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) enforce = ENFORCE_ABOVE; } dsl_dataset_rele(ds, FTAG); return (enforce); } /* * Check if adding additional child filesystem(s) would exceed any filesystem * limits or adding additional snapshot(s) would exceed any snapshot limits. * The prop argument indicates which limit to check. * * Note that all filesystem limits up to the root (or the highest * initialized) filesystem or the given ancestor must be satisfied. */ int dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, dsl_dir_t *ancestor, cred_t *cr) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t limit, count; char *count_prop; enforce_res_t enforce; int err = 0; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); /* * If we're allowed to change the limit, don't enforce the limit * e.g. this can happen if a snapshot is taken by an administrative * user in the global zone (i.e. a recursive snapshot by root). * However, we must handle the case of delegated permissions where we * are allowed to change the limit on the current dataset, but there * is another limit in the tree above. */ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); if (enforce == ENFORCE_NEVER) return (0); /* * e.g. if renaming a dataset with no snapshots, count adjustment * is 0. */ if (delta == 0) return (0); if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { /* * We don't enforce the limit for temporary snapshots. This is * indicated by a NULL cred_t argument. */ if (cr == NULL) return (0); count_prop = DD_FIELD_SNAPSHOT_COUNT; } else { count_prop = DD_FIELD_FILESYSTEM_COUNT; } /* * If an ancestor has been provided, stop checking the limit once we * hit that dir. We need this during rename so that we don't overcount * the check once we recurse up to the common ancestor. */ if (ancestor == dd) return (0); /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know there is no limit here (or above). The counts are * not valid on this node and we know we won't touch this node's counts. */ if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, count_prop, sizeof (count), 1, &count) == ENOENT) return (0); err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, B_FALSE); if (err != 0) return (err); /* Is there a limit which we've hit? */ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) return (SET_ERROR(EDQUOT)); if (dd->dd_parent != NULL) err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, ancestor, cr); return (err); } /* * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all * parents. When a new filesystem/snapshot is created, increment the count on * all parents, and when a filesystem/snapshot is destroyed, decrement the * count. */ void dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, dmu_tx_t *tx) { int err; objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t count; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); /* * When we receive an incremental stream into a filesystem that already * exists, a temporary clone is created. We don't count this temporary * clone, whose name begins with a '%'. We also ignore hidden ($FREE, * $MOS & $ORIGIN) objsets. */ if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) return; /* * e.g. if renaming a dataset with no snapshots, count adjustment is 0 */ if (delta == 0) return; /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know the counts are not valid on this node and we * know we shouldn't touch this node's counts. An uninitialized count * on the node indicates that either the feature has not yet been * activated or there are no limits on this part of the tree. */ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, prop, sizeof (count), 1, &count)) == ENOENT) return; VERIFY0(err); count += delta; /* Use a signed verify to make sure we're not neg. */ VERIFY3S(count, >=, 0); VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, tx)); /* Roll up this additional count into our ancestors */ if (dd->dd_parent != NULL) dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); } uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); ddphys = dbuf->db_data; ddphys->dd_creation_time = gethrestime_sec(); if (pds) { ddphys->dd_parent_obj = pds->dd_object; /* update the filesystem counts */ dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); } ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); } boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_origin_obj && (dd->dd_pool->dp_origin_snap == NULL || dsl_dir_phys(dd)->dd_origin_obj != dd->dd_pool->dp_origin_snap->ds_object)); } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dsl_dir_phys(dd)->dd_used_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dsl_dir_phys(dd)->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, dsl_dir_phys(dd)->dd_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 : (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / dsl_dir_phys(dd)->dd_compressed_bytes)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, dsl_dir_phys(dd)->dd_uncompressed_bytes); if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); } mutex_exit(&dd->dd_lock); if (dsl_dir_is_zapified(dd)) { uint64_t count; objset_t *os = dd->dd_pool->dp_meta_objset; if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, count); } if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, count); } } if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } } void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; ASSERT(dsl_dir_phys(dd)); if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } } static int64_t parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) { uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); uint64_t new_accounted = MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); return (new_accounted - old_accounted); } void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); mutex_enter(&dd->dd_lock); ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ dmu_buf_rele(dd->dd_dbuf, dd); } static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd) { uint64_t space = 0; int i; ASSERT(MUTEX_HELD(&dd->dd_lock)); for (i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i&TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); } return (space); } /* * How much space would dd have available if ancestor had delta applied * to it? If ondiskonly is set, we're only interested in what's * on-disk, not estimated pending changes. */ uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly) { uint64_t parentspace, myspace, quota, used; /* * If there are no restrictions otherwise, assume we have * unlimited space available. */ quota = UINT64_MAX; parentspace = UINT64_MAX; if (dd->dd_parent != NULL) { parentspace = dsl_dir_space_available(dd->dd_parent, ancestor, delta, ondiskonly); } mutex_enter(&dd->dd_lock); if (dsl_dir_phys(dd)->dd_quota != 0) quota = dsl_dir_phys(dd)->dd_quota; used = dsl_dir_phys(dd)->dd_used_bytes; if (!ondiskonly) used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); quota = MIN(quota, poolsize); } if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { /* * We have some space reserved, in addition to what our * parent gave us. */ parentspace += dsl_dir_phys(dd)->dd_reserved - used; } if (dd == ancestor) { ASSERT(delta <= 0); ASSERT(used >= -delta); used += delta; if (parentspace != UINT64_MAX) parentspace -= delta; } if (used > quota) { /* over quota */ myspace = 0; } else { /* * the lesser of the space provided by our parent and * the space left in our quota */ myspace = MIN(parentspace, quota - used); } mutex_exit(&dd->dd_lock); return (myspace); } struct tempreserve { list_node_t tr_node; dsl_dir_t *tr_ds; uint64_t tr_size; }; static int dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, dmu_tx_t *tx, boolean_t first) { uint64_t txg = tx->tx_txg; uint64_t est_inflight, used_on_disk, quota, parent_rsrv; uint64_t deferred = 0; struct tempreserve *tr; int retval = EDQUOT; int txgidx = txg & TXG_MASK; int i; uint64_t ref_rsrv = 0; ASSERT3U(txg, !=, 0); ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and * refreservation values. Also, if checkrefquota is set, test if * allocating this space would exceed the dataset's refquota. */ if (first && tx->tx_objset) { int error; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, checkrefquota, asize, est_inflight, &used_on_disk, &ref_rsrv); if (error) { mutex_exit(&dd->dd_lock); return (error); } } /* * If this transaction will result in a net free of space, * we want to let it through. */ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) quota = UINT64_MAX; else quota = dsl_dir_phys(dd)->dd_quota; /* * Adjust the quota against the actual pool size at the root * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. In cases where * we're very close to full, this will allow a steady trickle of * removes to get through. */ if (dd->dd_parent == NULL) { spa_t *spa = dd->dd_pool->dp_spa; uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); deferred = metaslab_class_get_deferred(spa_normal_class(spa)); if (poolsize - deferred < quota) { quota = poolsize - deferred; retval = ENOSPC; } } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). */ if (used_on_disk + est_inflight >= quota) { if (est_inflight > 0 || used_on_disk < quota || (retval == ENOSPC && used_on_disk < quota + deferred)) retval = ERESTART; dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", used_on_disk>>10, est_inflight>>10, quota>>10, asize>>10, retval); mutex_exit(&dd->dd_lock); return (SET_ERROR(retval)); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txgidx] += asize; parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize - ref_rsrv); mutex_exit(&dd->dd_lock); tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_ds = dd; tr->tr_size = asize; list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ if (dd->dd_parent && parent_rsrv) { boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); return (dsl_dir_tempreserve_impl(dd->dd_parent, parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); } else { return (0); } } /* * Reserve space in this dsl_dir, to be used in this tx's txg. * After the space has been dirtied (and dsl_dir_willuse_space() * has been called), the reservation should be canceled, using * dsl_dir_tempreserve_clear(). */ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) { int err; list_t *tr_list; if (asize == 0) { *tr_cookiep = NULL; return (0); } tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); ASSERT3S(fsize, >=, 0); err = arc_tempreserve_space(lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_size = lsize; list_insert_tail(tr_list, tr); } else { if (err == EAGAIN) { /* * If arc_memory_throttle() detected that pageout * is running and we are low on memory, we delay new * non-pageout transactions to give pageout an * advantage. * * It is unfortunate to be delaying while the caller's * locks are held. */ txg_delay(dd->dd_pool, tx->tx_txg, MSEC2NSEC(10), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } if (err == 0) { err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE, asize > usize, tr_list, tx, TRUE); } if (err != 0) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; return (err); } /* * Clear a temporary reservation that we previously made with * dsl_dir_tempreserve_space(). */ void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) { int txgidx = tx->tx_txg & TXG_MASK; list_t *tr_list = tr_cookie; struct tempreserve *tr; ASSERT3U(tx->tx_txg, !=, 0); if (tr_cookie == NULL) return; while ((tr = list_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; mutex_exit(&tr->tr_ds->dd_lock); } else { arc_tempreserve_clear(tr->tr_size); } list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } kmem_free(tr_list, sizeof (list_t)); } /* * This should be called from open context when we think we're going to write * or free space, for example when dirtying data. Be conservative; it's okay * to write less space or free more, but we don't want to write more or free * less than the amount specified. */ void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; mutex_enter(&dd->dd_lock); if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); /* Make sure that we clean up dd_space_to* */ dsl_dir_dirty(dd, tx); /* XXX this is potentially expensive and unnecessary... */ if (parent_space && dd->dd_parent) dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); } /* call from syncing context when we actually write/free space for this dd */ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; /* * dsl_dataset_set_refreservation_sync_impl() calls this with * dd_lock held, so that it can atomically update * ds->ds_reserved and the dsl_dir accounting, so that * dsl_dataset_check_quota() can see dataset and dir accounting * consistently. */ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); dmu_buf_will_dirty(dd->dd_dbuf, tx); if (needlock) mutex_enter(&dd->dd_lock); accounted_delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); ASSERT(compressed >= 0 || dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); dsl_dir_phys(dd)->dd_used_bytes += used; dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; dsl_dir_phys(dd)->dd_compressed_bytes += compressed; if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(used > 0 || dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); dsl_dir_phys(dd)->dd_used_breakdown[type] += used; #ifdef DEBUG dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) u += dsl_dir_phys(dd)->dd_used_breakdown[t]; ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); #endif } if (needlock) mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, accounted_delta, compressed, uncompressed, tx); dsl_dir_transfer_space(dd->dd_parent, used - accounted_delta, DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL); } } void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { ASSERT(tx == NULL || dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); if (delta == 0 || !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; if (tx != NULL) dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; mutex_exit(&dd->dd_lock); } typedef struct dsl_dir_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; uint64_t ddsqra_value; } dsl_dir_set_qr_arg_t; static int dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t towrite, newval; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); error = dsl_prop_predict(ds->ds_dir, "quota", ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (newval == 0) { dsl_dataset_rele(ds, FTAG); return (0); } mutex_enter(&ds->ds_dir->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. */ towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { error = SET_ERROR(ENOSPC); } mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); return (error); } static void dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); } dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); dsl_dir_phys(ds->ds_dir)->dd_quota = newval; mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = quota; return (dsl_sync_task(ddname, dsl_dir_set_quota_check, dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); } int dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; uint64_t newval, used, avail; int error; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); dd = ds->ds_dir; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. */ if (!dmu_tx_is_syncing(tx)) { dsl_dataset_rele(ds, FTAG); return (0); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; mutex_exit(&dd->dd_lock); if (dd->dd_parent) { avail = dsl_dir_space_available(dd->dd_parent, NULL, 0, FALSE); } else { avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { uint64_t delta = MAX(used, newval) - MAX(used, dsl_dir_phys(dd)->dd_reserved); if (delta > avail || (dsl_dir_phys(dd)->dd_quota > 0 && newval > dsl_dir_phys(dd)->dd_quota)) error = SET_ERROR(ENOSPC); } dsl_dataset_rele(ds, FTAG); return (error); } void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { uint64_t used; int64_t delta; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); dsl_dir_phys(dd)->dd_reserved = value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, delta, 0, 0, tx); } mutex_exit(&dd->dd_lock); } static void dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_RESERVATION), (longlong_t)newval); } dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = reservation; return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); } static dsl_dir_t * closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) { for (; ds1; ds1 = ds1->dd_parent) { dsl_dir_t *dd; for (dd = ds2; dd; dd = dd->dd_parent) { if (ds1 == dd) return (dd); } } return (NULL); } /* * If delta is applied to dd, how much of that delta would be applied to * ancestor? Syncing context only. */ static int64_t would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) { if (dd == ancestor) return (delta); mutex_enter(&dd->dd_lock); delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } typedef struct dsl_dir_rename_arg { const char *ddra_oldname; const char *ddra_newname; cred_t *ddra_cred; } dsl_dir_rename_arg_t; /* ARGSUSED */ static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { int *deltap = arg; char namebuf[MAXNAMELEN]; dsl_dataset_name(ds, namebuf); if (strlen(namebuf) + *deltap >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int dsl_dir_rename_check(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; int error; int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); /* target dir should exist */ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); if (error != 0) return (error); /* new parent should exist */ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname); if (error != 0) { dsl_dir_rele(dd, FTAG); return (error); } /* can't rename to different pool */ if (dd->dd_pool != newparent->dd_pool) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ENXIO)); } /* new name should not already exist */ if (mynewname == NULL) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EEXIST)); } /* if the name length is growing, validate child name lengths */ if (delta > 0) { error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } if (dmu_tx_is_syncing(tx)) { if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Although this is the check function and we don't * normally make on-disk changes in check functions, * we need to do that here. * * Ensure this portion of the tree's counts have been * initialized in case the new parent has limits set. */ dsl_dir_init_fs_ss_count(dd, tx); } } if (newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = MAX(dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_reserved); objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; if (dsl_dir_is_zapified(dd)) { int err; err = zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } /* * have to add 1 for the filesystem itself that we're * moving */ fs_cnt++; err = zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } } /* no rename into our descendant */ if (closest_common_ancestor(dd, newparent) == dd) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dir_transfer_possible(dd->dd_parent, newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (0); } static void dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; int error; objset_t *mos = dp->dp_meta_objset; VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname)); /* Log this before we change the name. */ spa_history_log_internal_dd(dd, "rename", tx, "-> %s", ddra->ddra_newname); if (newparent != dd->dd_parent) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; /* * We already made sure the dd counts were initialized in the * check function. */ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt)); /* add 1 for the filesystem itself that we're moving */ fs_cnt++; VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt)); } dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(newparent, fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_fs_ss_count_adjust(newparent, ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dsl_dir_phys(dd)->dd_used_bytes, -dsl_dir_phys(dd)->dd_compressed_bytes, -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD, dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_compressed_bytes, dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); if (dsl_dir_phys(dd)->dd_reserved > dsl_dir_phys(dd)->dd_used_bytes) { uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - dsl_dir_phys(dd)->dd_used_bytes; dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ error = zap_remove(mos, dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, dd->dd_myname, tx); ASSERT0(error); (void) strcpy(dd->dd_myname, mynewname); dsl_dir_rele(dd->dd_parent, dd); dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; VERIFY0(dsl_dir_hold_obj(dp, newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx)); #ifdef __FreeBSD__ #ifdef _KERNEL zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); #endif #endif dsl_prop_notify_all(dd); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); } int dsl_dir_rename(const char *oldname, const char *newname) { dsl_dir_rename_arg_t ddra; ddra.ddra_oldname = oldname; ddra.ddra_newname = newname; ddra.ddra_cred = CRED(); return (dsl_sync_task(oldname, dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3, ZFS_SPACE_CHECK_RESERVED)); } int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; int err; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); if (avail < space) return (SET_ERROR(ENOSPC)); err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, ancestor, cr); if (err != 0) return (err); err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, ancestor, cr); if (err != 0) return (err); return (0); } timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd) { timestruc_t t; mutex_enter(&dd->dd_lock); t = dd->dd_snap_cmtime; mutex_exit(&dd->dd_lock); return (t); } void dsl_dir_snap_cmtime_update(dsl_dir_t *dd) { timestruc_t t; gethrestime(&t); mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; mutex_exit(&dd->dd_lock); } void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); } boolean_t dsl_dir_is_zapified(dsl_dir_t *dd) { dmu_object_info_t doi; dmu_object_info_from_db(dd->dd_dbuf, &doi); return (doi.doi_type == DMU_OTN_ZAP_METADATA); } Index: stable/10 =================================================================== --- stable/10 (revision 282755) +++ stable/10 (revision 282756) Property changes on: stable/10 ___________________________________________________________________ Modified: svn:mergeinfo ## -0,0 +0,1 ## Merged /head:r282126