Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c (revision 286575) @@ -1,2922 +1,3004 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Number of times that zfs_free_range() took the slow path while doing * a zfs receive. A nonzero value indicates a potential performance problem. */ uint64_t zfs_free_range_recv_miss; static void dbuf_destroy(dmu_buf_impl_t *db); static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); +#ifndef __lint +extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, + dmu_buf_evict_func_t *evict_func, dmu_buf_t **clear_on_evict_dbufp); +#endif /* ! __lint */ + /* * Global data structures and functions for the dbuf cache. */ static kmem_cache_t *dbuf_cache; +static taskq_t *dbu_evict_taskq; /* ARGSUSED */ static int dbuf_cons(void *vdb, void *unused, int kmflag) { dmu_buf_impl_t *db = vdb; bzero(db, sizeof (dmu_buf_impl_t)); mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); refcount_create(&db->db_holds); return (0); } /* ARGSUSED */ static void dbuf_dest(void *vdb, void *unused) { dmu_buf_impl_t *db = vdb; mutex_destroy(&db->db_mtx); cv_destroy(&db->db_changed); refcount_destroy(&db->db_holds); } /* * dbuf hash table routines */ static dbuf_hash_table_t dbuf_hash_table; static uint64_t dbuf_hash_count; static uint64_t dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) { uintptr_t osv = (uintptr_t)os; uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (lvl)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 0)) & 0xFF]; crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (blkid >> 8)) & 0xFF]; crc ^= (osv>>14) ^ (obj>>16) ^ (blkid>>16); return (crc); } #define DBUF_HASH(os, obj, level, blkid) dbuf_hash(os, obj, level, blkid); #define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ ((dbuf)->db.db_object == (obj) && \ (dbuf)->db_objset == (os) && \ (dbuf)->db_level == (level) && \ (dbuf)->db_blkid == (blkid)) dmu_buf_impl_t * dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv = DBUF_HASH(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *db; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { if (DBUF_EQUAL(db, os, obj, level, blkid)) { mutex_enter(&db->db_mtx); if (db->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (db); } mutex_exit(&db->db_mtx); } } mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (NULL); } static dmu_buf_impl_t * dbuf_find_bonus(objset_t *os, uint64_t object) { dnode_t *dn; dmu_buf_impl_t *db = NULL; if (dnode_hold(os, object, FTAG, &dn) == 0) { rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_bonus != NULL) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); } rw_exit(&dn->dn_struct_rwlock); dnode_rele(dn, FTAG); } return (db); } /* * Insert an entry into the hash table. If there is already an element * equal to elem in the hash table, then the already existing element * will be returned and the new element will not be inserted. * Otherwise returns NULL. */ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; uint64_t hv = DBUF_HASH(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf; mutex_enter(DBUF_HASH_MUTEX(h, idx)); for (dbf = h->hash_table[idx]; dbf != NULL; dbf = dbf->db_hash_next) { if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { mutex_enter(&dbf->db_mtx); if (dbf->db_state != DB_EVICTING) { mutex_exit(DBUF_HASH_MUTEX(h, idx)); return (dbf); } mutex_exit(&dbf->db_mtx); } } mutex_enter(&db->db_mtx); db->db_hash_next = h->hash_table[idx]; h->hash_table[idx] = db; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_inc_64(&dbuf_hash_count); return (NULL); } /* * Remove an entry from the hash table. It must be in the EVICTING state. */ static void dbuf_hash_remove(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; uint64_t hv = DBUF_HASH(db->db_objset, db->db.db_object, db->db_level, db->db_blkid); uint64_t idx = hv & h->hash_table_mask; dmu_buf_impl_t *dbf, **dbp; /* * We musn't hold db_mtx to maintain lock ordering: * DBUF_HASH_MUTEX > db_mtx. */ ASSERT(refcount_is_zero(&db->db_holds)); ASSERT(db->db_state == DB_EVICTING); ASSERT(!MUTEX_HELD(&db->db_mtx)); mutex_enter(DBUF_HASH_MUTEX(h, idx)); dbp = &h->hash_table[idx]; while ((dbf = *dbp) != db) { dbp = &dbf->db_hash_next; ASSERT(dbf != NULL); } *dbp = db->db_hash_next; db->db_hash_next = NULL; mutex_exit(DBUF_HASH_MUTEX(h, idx)); atomic_dec_64(&dbuf_hash_count); } static arc_evict_func_t dbuf_do_evict; +typedef enum { + DBVU_EVICTING, + DBVU_NOT_EVICTING +} dbvu_verify_type_t; + static void +dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) +{ +#ifdef ZFS_DEBUG + int64_t holds; + + if (db->db_user == NULL) + return; + + /* Only data blocks support the attachment of user data. */ + ASSERT(db->db_level == 0); + + /* Clients must resolve a dbuf before attaching user data. */ + ASSERT(db->db.db_data != NULL); + ASSERT3U(db->db_state, ==, DB_CACHED); + + holds = refcount_count(&db->db_holds); + if (verify_type == DBVU_EVICTING) { + /* + * Immediate eviction occurs when holds == dirtycnt. + * For normal eviction buffers, holds is zero on + * eviction, except when dbuf_fix_old_data() calls + * dbuf_clear_data(). However, the hold count can grow + * during eviction even though db_mtx is held (see + * dmu_bonus_hold() for an example), so we can only + * test the generic invariant that holds >= dirtycnt. + */ + ASSERT3U(holds, >=, db->db_dirtycnt); + } else { + if (db->db_immediate_evict == TRUE) + ASSERT3U(holds, >=, db->db_dirtycnt); + else + ASSERT3U(holds, >, 0); + } +#endif +} + +static void dbuf_evict_user(dmu_buf_impl_t *db) { + dmu_buf_user_t *dbu = db->db_user; + ASSERT(MUTEX_HELD(&db->db_mtx)); - if (db->db_level != 0 || db->db_evict_func == NULL) + if (dbu == NULL) return; - db->db_evict_func(&db->db, db->db_user_ptr); - db->db_user_ptr = NULL; - db->db_evict_func = NULL; + dbuf_verify_user(db, DBVU_EVICTING); + db->db_user = NULL; + +#ifdef ZFS_DEBUG + if (dbu->dbu_clear_on_evict_dbufp != NULL) + *dbu->dbu_clear_on_evict_dbufp = NULL; +#endif + + /* + * Invoke the callback from a taskq to avoid lock order reversals + * and limit stack depth. + */ + taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func, dbu, 0, + &dbu->dbu_tqent); } boolean_t dbuf_is_metadata(dmu_buf_impl_t *db) { if (db->db_level > 0) { return (B_TRUE); } else { boolean_t is_metadata; DB_DNODE_ENTER(db); is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); DB_DNODE_EXIT(db); return (is_metadata); } } void dbuf_evict(dmu_buf_impl_t *db) { ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_buf == NULL); ASSERT(db->db_data_pending == NULL); dbuf_clear(db); dbuf_destroy(db); } void dbuf_init(void) { uint64_t hsize = 1ULL << 16; dbuf_hash_table_t *h = &dbuf_hash_table; int i; /* * The hash table is big enough to fill all of physical memory * with an average 4K block size. The table will take up * totalmem*sizeof(void*)/4K (i.e. 2MB/GB with 8-byte pointers). */ while (hsize * 4096 < (uint64_t)physmem * PAGESIZE) hsize <<= 1; retry: h->hash_table_mask = hsize - 1; h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); if (h->hash_table == NULL) { /* XXX - we should really return an error instead of assert */ ASSERT(hsize > (1ULL << 10)); hsize >>= 1; goto retry; } dbuf_cache = kmem_cache_create("dmu_buf_impl_t", sizeof (dmu_buf_impl_t), 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); for (i = 0; i < DBUF_MUTEXES; i++) mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + + /* + * All entries are queued via taskq_dispatch_ent(), so min/maxalloc + * configuration is not required. + */ + dbu_evict_taskq = taskq_create("dbu_evict", 1, minclsyspri, 0, 0, 0); } void dbuf_fini(void) { dbuf_hash_table_t *h = &dbuf_hash_table; int i; for (i = 0; i < DBUF_MUTEXES; i++) mutex_destroy(&h->hash_mutexes[i]); kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); kmem_cache_destroy(dbuf_cache); + taskq_destroy(dbu_evict_taskq); } /* * Other stuff. */ #ifdef ZFS_DEBUG static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn; dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) return; ASSERT(db->db_objset != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn == NULL) { ASSERT(db->db_parent == NULL); ASSERT(db->db_blkptr == NULL); } else { ASSERT3U(db->db.db_object, ==, dn->dn_object); ASSERT3P(db->db_objset, ==, dn->dn_objset); ASSERT3U(db->db_level, <, dn->dn_nlevels); ASSERT(db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID || !avl_is_empty(&dn->dn_dbufs)); } if (db->db_blkid == DMU_BONUS_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); } else if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn != NULL); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT0(db->db.db_offset); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) ASSERT(dr->dr_dbuf == db); /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. */ ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); } /* verify db->db_blkptr */ if (db->db_blkptr) { if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); } else { /* db is pointed to by an indirect block */ int epb = db->db_parent->db.db_size >> SPA_BLKPTRSHIFT; ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); ASSERT3U(db->db_parent->db.db_object, ==, db->db.db_object); /* * dnode_grow_indblksz() can make this fail if we don't * have the struct_rwlock. XXX indblksz no longer * grows. safe to do this now? */ if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) { ASSERT3P(db->db_blkptr, ==, ((blkptr_t *)db->db_parent->db.db_data + db->db_blkid % epb)); } } } if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && (db->db_buf == NULL || db->db_buf->b_data) && db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && db->db_state != DB_FILL && !dn->dn_free_txg) { /* * If the blkptr isn't set but they have nonzero data, * it had better be dirty, otherwise we'll lose that * data when we evict this buffer. */ if (db->db_dirtycnt == 0) { uint64_t *buf = db->db.db_data; int i; for (i = 0; i < db->db.db_size >> 3; i++) { ASSERT(buf[i] == 0); } } } DB_DNODE_EXIT(db); } #endif static void +dbuf_clear_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + dbuf_evict_user(db); + db->db_buf = NULL; + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) + db->db_state = DB_UNCACHED; +} + +static void dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) { ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(buf != NULL); + db->db_buf = buf; - if (buf != NULL) { - ASSERT(buf->b_data != NULL); - db->db.db_data = buf->b_data; - if (!arc_released(buf)) - arc_set_callback(buf, dbuf_do_evict, db); - } else { - dbuf_evict_user(db); - db->db.db_data = NULL; - if (db->db_state != DB_NOFILL) - db->db_state = DB_UNCACHED; - } + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; + if (!arc_released(buf)) + arc_set_callback(buf, dbuf_do_evict, db); } /* * Loan out an arc_buf for read. Return the loaned arc_buf. */ arc_buf_t * dbuf_loan_arcbuf(dmu_buf_impl_t *db) { arc_buf_t *abuf; mutex_enter(&db->db_mtx); if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { int blksz = db->db.db_size; spa_t *spa = db->db_objset->os_spa; mutex_exit(&db->db_mtx); abuf = arc_loan_buf(spa, blksz); bcopy(db->db.db_data, abuf->b_data, blksz); } else { abuf = db->db_buf; arc_loan_inuse_buf(abuf, db); - dbuf_set_data(db, NULL); + dbuf_clear_data(db); mutex_exit(&db->db_mtx); } return (abuf); } uint64_t dbuf_whichblock(dnode_t *dn, uint64_t offset) { if (dn->dn_datablkshift) { return (offset >> dn->dn_datablkshift); } else { ASSERT3U(offset, <, dn->dn_datablksz); return (0); } } static void dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; mutex_enter(&db->db_mtx); ASSERT3U(db->db_state, ==, DB_READ); /* * All reads are synchronous, so we must have a hold on the dbuf */ ASSERT(refcount_count(&db->db_holds) > 0); ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); if (db->db_level == 0 && db->db_freed_in_flight) { /* we were freed in flight; disregard any error */ arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else if (zio == NULL || zio->io_error == 0) { dbuf_set_data(db, buf); db->db_state = DB_CACHED; } else { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT3P(db->db_buf, ==, NULL); VERIFY(arc_buf_remove_ref(buf, db)); db->db_state = DB_UNCACHED; } cv_broadcast(&db->db_changed); dbuf_rele_and_unlock(db, NULL); } static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { dnode_t *dn; zbookmark_phys_t zb; arc_flags_t aflags = ARC_FLAG_NOWAIT; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. */ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DMU_BONUS_BLKID) { int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); if (bonuslen) bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); DB_DNODE_EXIT(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } /* * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() * processes the delete record and clears the bp while we are waiting * for the dn_mtx (resulting in a "no" from block_freed). */ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); DB_DNODE_EXIT(db); dbuf_set_data(db, arc_buf_alloc(db->db_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; *flags |= DB_RF_CACHED; mutex_exit(&db->db_mtx); return; } DB_DNODE_EXIT(db); db->db_state = DB_READ; mutex_exit(&db->db_mtx); if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_FLAG_L2CACHE; if (DBUF_IS_L2COMPRESSIBLE(db)) aflags |= ARC_FLAG_L2COMPRESS; SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); (void) arc_read(zio, db->db_objset->os_spa, db->db_blkptr, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); if (aflags & ARC_FLAG_CACHED) *flags |= DB_RF_CACHED; } int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) { int err = 0; boolean_t havepzio = (zio != NULL); boolean_t prefetch; dnode_t *dn; /* * We don't have to hold the mutex to check db_state because it * can't be freed while we have a hold on the buffer. */ ASSERT(!refcount_is_zero(&db->db_holds)); if (db->db_state == DB_NOFILL) return (SET_ERROR(EIO)); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_enter(&dn->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); } else if (db->db_state == DB_UNCACHED) { spa_t *spa = dn->dn_objset->os_spa; if (zio == NULL) zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); dbuf_read_impl(db, zio, &flags); /* dbuf_read_impl has dropped db_mtx for us */ if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, flags & DB_RF_CACHED); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); if (!havepzio) err = zio_wait(zio); } else { /* * Another reader came in while the dbuf was in flight * between UNCACHED and CACHED. Either a writer will finish * writing the buffer (sending the dbuf to CACHED) or the * first reader's request will reach the read_done callback * and send the dbuf to CACHED. Otherwise, a failure * occurred and the dbuf went to UNCACHED. */ mutex_exit(&db->db_mtx); if (prefetch) dmu_zfetch(&dn->dn_zfetch, db->db.db_offset, db->db.db_size, TRUE); if ((flags & DB_RF_HAVESTRUCT) == 0) rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); /* Skip the wait per the caller's request. */ mutex_enter(&db->db_mtx); if ((flags & DB_RF_NEVERWAIT) == 0) { while (db->db_state == DB_READ || db->db_state == DB_FILL) { ASSERT(db->db_state == DB_READ || (flags & DB_RF_HAVESTRUCT) == 0); DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, db, zio_t *, zio); cv_wait(&db->db_changed, &db->db_mtx); } if (db->db_state == DB_UNCACHED) err = SET_ERROR(EIO); } mutex_exit(&db->db_mtx); } ASSERT(err || havepzio || db->db_state == DB_CACHED); return (err); } static void dbuf_noread(dmu_buf_impl_t *db) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); if (db->db_state == DB_UNCACHED) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; ASSERT(db->db_buf == NULL); ASSERT(db->db.db_data == NULL); dbuf_set_data(db, arc_buf_alloc(spa, db->db.db_size, db, type)); db->db_state = DB_FILL; } else if (db->db_state == DB_NOFILL) { - dbuf_set_data(db, NULL); + dbuf_clear_data(db); } else { ASSERT3U(db->db_state, ==, DB_CACHED); } mutex_exit(&db->db_mtx); } /* * This is our just-in-time copy function. It makes a copy of * buffers, that have been modified in a previous transaction * group, before we modify them in the current active group. * * This function is used in two places: when we are dirtying a * buffer for the first time in a txg, and when we are freeing * a range in a dnode that includes this buffer. * * Note that when we are called from dbuf_free_range() we do * not put a hold on the buffer, we just traverse the active * dbuf list for the dnode. */ static void dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db.db_data != NULL); ASSERT(db->db_level == 0); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); if (dr == NULL || (dr->dt.dl.dr_data != ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) return; /* * If the last dirty record for this dbuf has not yet synced * and its referencing the dbuf data, either: * reset the reference to point to a new copy, * or (if there a no active holders) * just null out the current db_data pointer. */ ASSERT(dr->dr_txg >= txg - 2); if (db->db_blkid == DMU_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); spa_t *spa = db->db_objset->os_spa; dr->dt.dl.dr_data = arc_buf_alloc(spa, size, db, type); bcopy(db->db.db_data, dr->dt.dl.dr_data->b_data, size); } else { - dbuf_set_data(db, NULL); + dbuf_clear_data(db); } } void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); ASSERT(db->db_level == 0); if (db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; ASSERT(db->db_data_pending != dr); /* free this block */ if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; /* * Release the already-written buffer, so we leave it in * a consistent dirty state. Note that all callers are * modifying the buffer, so they will immediately do * another (redundant) arc_release(). Therefore, leave * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ arc_release(dr->dt.dl.dr_data, db); } /* * Evict (if its unreferenced) or clear (if its referenced) any level-0 * data blocks in the free range, so that any future readers will find * empty blocks. * * This is a no-op if the dataset is in the middle of an incremental * receive; see comment below for details. */ void dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, dmu_tx_t *tx) { - dmu_buf_impl_t *db, *db_next, db_search; + dmu_buf_impl_t db_search; + dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; avl_index_t where; if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID)) end_blkid = dn->dn_maxblkid; dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); db_search.db_level = 0; db_search.db_blkid = start_blkid; db_search.db_state = DB_SEARCH; mutex_enter(&dn->dn_dbufs_mtx); if (start_blkid >= dn->dn_unlisted_l0_blkid) { /* There can't be any dbufs in this range; no need to search. */ #ifdef DEBUG db = avl_find(&dn->dn_dbufs, &db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); ASSERT(db == NULL || db->db_level > 0); #endif mutex_exit(&dn->dn_dbufs_mtx); return; } else if (dmu_objset_is_receiving(dn->dn_objset)) { /* * If we are receiving, we expect there to be no dbufs in * the range to be freed, because receive modifies each * block at most once, and in offset order. If this is * not the case, it can lead to performance problems, * so note that we unexpectedly took the slow path. */ atomic_inc_64(&zfs_free_range_recv_miss); } db = avl_find(&dn->dn_dbufs, &db_search, &where); ASSERT3P(db, ==, NULL); db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); for (; db != NULL; db = db_next) { db_next = AVL_NEXT(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DMU_BONUS_BLKID); if (db->db_level != 0 || db->db_blkid > end_blkid) { break; } ASSERT3U(db->db_blkid, >=, start_blkid); /* found a level 0 buffer in the range */ mutex_enter(&db->db_mtx); if (dbuf_undirty(db, tx)) { /* mutex has been dropped and dbuf destroyed */ continue; } if (db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL || db->db_state == DB_EVICTING) { ASSERT(db->db.db_data == NULL); mutex_exit(&db->db_mtx); continue; } if (db->db_state == DB_READ || db->db_state == DB_FILL) { /* will be handled in dbuf_read_done or dbuf_rele */ db->db_freed_in_flight = TRUE; mutex_exit(&db->db_mtx); continue; } if (refcount_count(&db->db_holds) == 0) { ASSERT(db->db_buf); dbuf_clear(db); continue; } /* The dbuf is referenced */ if (db->db_last_dirty != NULL) { dbuf_dirty_record_t *dr = db->db_last_dirty; if (dr->dr_txg == txg) { /* * This buffer is "in-use", re-adjust the file * size to reflect that this buffer may * contain new data when we sync. */ if (db->db_blkid != DMU_SPILL_BLKID && db->db_blkid > dn->dn_maxblkid) dn->dn_maxblkid = db->db_blkid; dbuf_unoverride(dr); } else { /* * This dbuf is not dirty in the open context. * Either uncache it (if its not referenced in * the open context) or reset its contents to * empty. */ dbuf_fix_old_data(db, txg); } } /* clear the contents if its cached */ if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); arc_release(db->db_buf, db); bzero(db->db.db_data, db->db.db_size); arc_buf_freeze(db->db_buf); } mutex_exit(&db->db_mtx); } mutex_exit(&dn->dn_dbufs_mtx); } static int dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; uint64_t birth_txg = 0; /* * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. * * This logic ensures that only block births for * filled blocks are considered. */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_last_dirty && (db->db_blkptr == NULL || !BP_IS_HOLE(db->db_blkptr))) { birth_txg = db->db_last_dirty->dr_txg; } else if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { birth_txg = db->db_blkptr->blk_birth; } /* * If this block don't exist or is in a snapshot, it can't be freed. * Don't pass the bp to dsl_dataset_block_freeable() since we * are holding the db_mtx lock and might deadlock if we are * prefetching a dedup-ed block. */ if (birth_txg != 0) return (ds == NULL || dsl_dataset_block_freeable(ds, NULL, birth_txg)); else return (B_FALSE); } void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) { arc_buf_t *buf, *obuf; int osize = db->db.db_size; arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dnode_t *dn; ASSERT(db->db_blkid != DMU_BONUS_BLKID); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* XXX does *this* func really need the lock? */ ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * This call to dmu_buf_will_dirty() with the dn_struct_rwlock held * is OK, because there can be no other references to the db * when we are changing its size, so no concurrent DB_FILL can * be happening. */ /* * XXX we should be doing a dbuf_read, checking the return * value and returning that up to our callers */ dmu_buf_will_dirty(&db->db, tx); /* create the data buffer for the new block */ buf = arc_buf_alloc(dn->dn_objset->os_spa, size, db, type); /* copy old block data to the new block */ obuf = db->db_buf; bcopy(obuf->b_data, buf->b_data, MIN(osize, size)); /* zero the remainder */ if (size > osize) bzero((uint8_t *)buf->b_data + osize, size - osize); mutex_enter(&db->db_mtx); dbuf_set_data(db, buf); VERIFY(arc_buf_remove_ref(obuf, db)); db->db.db_size = size; if (db->db_level == 0) { ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); db->db_last_dirty->dt.dl.dr_data = buf; } mutex_exit(&db->db_mtx); dnode_willuse_space(dn, size-osize, tx); DB_DNODE_EXIT(db); } void dbuf_release_bp(dmu_buf_impl_t *db) { objset_t *os = db->db_objset; ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); ASSERT(arc_released(os->os_phys_buf) || list_link_active(&os->os_dsl_dataset->ds_synced_link)); ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); (void) arc_release(db->db_buf, db); } dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; objset_t *os; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DMU_TX_DIRTY_BUF(tx, db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. */ ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_objset->os_dsl_dataset == NULL); /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty * in syncing context. */ ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); mutex_enter(&db->db_mtx); /* * XXX make this true for indirects too? The problem is that * transactions created with dmu_tx_create_assigned() from * syncing context don't bother holding ahead. */ ASSERT(db->db_level != 0 || db->db_state == DB_CACHED || db->db_state == DB_FILL || db->db_state == DB_NOFILL); mutex_enter(&dn->dn_mtx); /* * Don't set dirtyctx to SYNC if we're just modifying this as we * initialize the objset. */ if (dn->dn_dirtyctx == DN_UNDIRTIED && !BP_IS_HOLE(dn->dn_objset->os_rootbp)) { dn->dn_dirtyctx = (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN); ASSERT(dn->dn_dirtyctx_firstset == NULL); dn->dn_dirtyctx_firstset = kmem_alloc(1, KM_SLEEP); } mutex_exit(&dn->dn_mtx); if (db->db_blkid == DMU_SPILL_BLKID) dn->dn_have_spill = B_TRUE; /* * If this buffer is already dirty, we're done. */ drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) drp = &dr->dr_next; if (dr && dr->dr_txg == tx->tx_txg) { DB_DNODE_EXIT(db); if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT && db->db_state != DB_NOFILL) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); return (dr); } /* * Only valid if not already dirty. */ ASSERT(dn->dn_object == 0 || dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || dn->dn_phys->dn_nlevels > db->db_level || dn->dn_next_nlevels[txgoff] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); /* * We should only be dirtying in syncing context if it's the * mos or we're initializing the os or it's a special object. * However, we are allowed to dirty in syncing context provided * we already dirtied it in open context. Hence we must make * this assertion only if we're not already dirty. */ os = dn->dn_objset; ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); if (db->db_blkid != DMU_BONUS_BLKID) { /* * Update the accounting. * Note: we delay "free accounting" until after we drop * the db_mtx. This keeps us from grabbing other locks * (and possibly deadlocking) in bp_get_dsize() while * also holding the db_mtx. */ dnode_willuse_space(dn, db->db.db_size, tx); do_free_accounting = dbuf_block_freeable(db); } /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this * transaction group won't leak out when we sync the older txg. */ dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); if (db->db_level == 0) { void *data_old = db->db_buf; if (db->db_state != DB_NOFILL) { if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db.db_data; } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { /* * Release the data buffer from the cache so * that we can modify it without impacting * possible other users of this cached data * block. Note that indirect blocks and * private objects are not released until the * syncing state (since they are only modified * then). */ arc_release(db->db_buf, db); dbuf_fix_old_data(db, tx->tx_txg); data_old = db->db_buf; } ASSERT(data_old != NULL); } dr->dt.dl.dr_data = data_old; } else { mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_DEFAULT, NULL); list_create(&dr->dt.di.dr_children, sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } if (db->db_blkid != DMU_BONUS_BLKID && os->os_dsl_dataset != NULL) dr->dr_accounted = db->db.db_size; dr->dr_dbuf = db; dr->dr_txg = tx->tx_txg; dr->dr_next = *drp; *drp = dr; /* * We could have been freed_in_flight between the dbuf_noread * and dbuf_dirty. We win, as though the dbuf_noread() had * happened after the free. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_free_ranges[txgoff] != NULL) { range_tree_clear(dn->dn_free_ranges[txgoff], db->db_blkid, 1); } mutex_exit(&dn->dn_mtx); db->db_freed_in_flight = FALSE; } /* * This buffer is now part of this txg */ dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); db->db_dirtycnt += 1; ASSERT3U(db->db_dirtycnt, <=, 3); mutex_exit(&db->db_mtx); if (db->db_blkid == DMU_BONUS_BLKID || db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } else if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? bp_get_dsize(os->os_spa, bp) : db->db.db_size; /* * This is only a guess -- if the dbuf is dirty * in a previous txg, we don't know how much * space it will use on disk yet. We should * really have the struct_rwlock to access * db_blkptr, but since this is just a guess, * it's OK if we get an odd answer. */ ddt_prefetch(os->os_spa, bp); dnode_willuse_space(dn, -willfree, tx); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (db->db_level == 0) { dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); ASSERT(dn->dn_maxblkid >= db->db_blkid); } if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; int parent_held = FALSE; if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; parent = dbuf_hold_level(dn, db->db_level+1, db->db_blkid >> epbs, FTAG); ASSERT(parent != NULL); parent_held = TRUE; } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); ASSERT3U(db->db_level+1, ==, parent->db_level); di = dbuf_dirty(parent, tx); if (parent_held) dbuf_rele(parent, FTAG); mutex_enter(&db->db_mtx); /* * Since we've dropped the mutex, it's possible that * dbuf_undirty() might have changed this out from under us. */ if (db->db_last_dirty == dr || dn->dn_object == DMU_META_DNODE_OBJECT) { mutex_enter(&di->dt.di.dr_mtx); ASSERT3U(di->dr_txg, ==, tx->tx_txg); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&di->dt.di.dr_children, dr); mutex_exit(&di->dt.di.dr_mtx); dr->dr_parent = di; } mutex_exit(&db->db_mtx); } else { ASSERT(db->db_level+1 == dn->dn_nlevels); ASSERT(db->db_blkid < dn->dn_nblkptr); ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); ASSERT(!list_link_active(&dr->dr_dirty_node)); list_insert_tail(&dn->dn_dirty_records[txgoff], dr); mutex_exit(&dn->dn_mtx); if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } dnode_setdirty(dn, tx); DB_DNODE_EXIT(db); return (dr); } /* * Undirty a buffer in the transaction group referenced by the given * transaction. Return whether this evicted the dbuf. */ static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn; uint64_t txg = tx->tx_txg; dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); /* * Due to our use of dn_nlevels below, this can only be called * in open context, unless we are operating on the MOS. * From syncing context, dn_nlevels may be different from the * dn_nlevels used when dbuf was dirtied. */ ASSERT(db->db_objset == dmu_objset_pool(db->db_objset)->dp_meta_objset || txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); /* * If this buffer is not dirty, we're done. */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) return (B_FALSE); ASSERT(dr->dr_txg == txg); ASSERT(dr->dr_dbuf == db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); ASSERT(db->db.db_size != 0); dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), dr->dr_accounted, txg); *drp = dr->dr_next; /* * Note that there are three places in dbuf_dirty() * where this dirty record may be put on a list. * Make sure to do a list_remove corresponding to * every one of those list_insert calls. */ if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); } DB_DNODE_EXIT(db); if (db->db_state != DB_NOFILL) { dbuf_unoverride(dr); ASSERT(db->db_buf != NULL); ASSERT(dr->dt.dl.dr_data != NULL); if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { arc_buf_t *buf = db->db_buf; ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); - dbuf_set_data(db, NULL); + dbuf_clear_data(db); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); return (B_TRUE); } return (B_FALSE); } void dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); DB_DNODE_ENTER(db); if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; DB_DNODE_EXIT(db); (void) dbuf_read(db, NULL, rf); (void) dbuf_dirty(db, tx); } void dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; db->db_state = DB_NOFILL; dmu_buf_will_fill(db_fake, tx); } void dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(tx->tx_txg != 0); ASSERT(db->db_level == 0); ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); dbuf_noread(db); (void) dbuf_dirty(db, tx); } #pragma weak dmu_buf_fill_done = dbuf_fill_done /* ARGSUSED */ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) { mutex_enter(&db->db_mtx); DBUF_VERIFY(db); if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? */ bzero(db->db.db_data, db->db.db_size); db->db_freed_in_flight = FALSE; } db->db_state = DB_CACHED; cv_broadcast(&db->db_changed); } mutex_exit(&db->db_mtx); } void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; struct dirty_leaf *dl; dmu_object_type_t type; DB_DNODE_ENTER(db); type = DB_DNODE(db)->dn_type; DB_DNODE_EXIT(db); ASSERT0(db->db_level); ASSERT(db->db_blkid != DMU_BONUS_BLKID); dmu_buf_will_not_fill(dbuf, tx); ASSERT3U(db->db_last_dirty->dr_txg, ==, tx->tx_txg); dl = &db->db_last_dirty->dt.dl; encode_embedded_bp_compressed(&dl->dr_overridden_by, data, comp, uncompressed_size, compressed_size); BPE_SET_ETYPE(&dl->dr_overridden_by, etype); BP_SET_TYPE(&dl->dr_overridden_by, type); BP_SET_LEVEL(&dl->dr_overridden_by, 0); BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); dl->dr_override_state = DR_OVERRIDDEN; dl->dr_overridden_by.blk_birth = db->db_last_dirty->dr_txg; } /* * Directly assign a provided arc buf to a given dbuf if it's not referenced * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. */ void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) { ASSERT(!refcount_is_zero(&db->db_holds)); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(db->db_level == 0); ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); ASSERT(buf != NULL); ASSERT(arc_buf_size(buf) == db->db.db_size); ASSERT(tx->tx_txg != 0); arc_return_buf(buf, db); ASSERT(arc_released(buf)); mutex_enter(&db->db_mtx); while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); if (db->db_state == DB_CACHED && refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); bcopy(buf->b_data, db->db.db_data, db->db.db_size); VERIFY(arc_buf_remove_ref(buf, db)); xuio_stat_wbuf_copied(); return; } xuio_stat_wbuf_nocopy(); if (db->db_state == DB_CACHED) { dbuf_dirty_record_t *dr = db->db_last_dirty; ASSERT(db->db_buf != NULL); if (dr != NULL && dr->dr_txg == tx->tx_txg) { ASSERT(dr->dt.dl.dr_data == db->db_buf); if (!arc_released(db->db_buf)) { ASSERT(dr->dt.dl.dr_override_state == DR_OVERRIDDEN); arc_release(db->db_buf, db); } dr->dt.dl.dr_data = buf; VERIFY(arc_buf_remove_ref(db->db_buf, db)); } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { arc_release(db->db_buf, db); VERIFY(arc_buf_remove_ref(db->db_buf, db)); } db->db_buf = NULL; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); db->db_state = DB_FILL; mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); dmu_buf_fill_done(&db->db, tx); } /* * "Clear" the contents of this dbuf. This will mark the dbuf * EVICTING and clear *most* of its references. Unfortunately, * when we are not holding the dn_dbufs_mtx, we can't clear the * entry in the dn_dbufs list. We have to wait until dbuf_destroy() * in this case. For callers from the DMU we will usually see: * dbuf_clear()->arc_clear_callback()->dbuf_do_evict()->dbuf_destroy() * For the arc callback, we will usually see: * dbuf_do_evict()->dbuf_clear();dbuf_destroy() * Sometimes, though, we will get a mix of these two: * DMU: dbuf_clear()->arc_clear_callback() * ARC: dbuf_do_evict()->dbuf_destroy() * * This routine will dissociate the dbuf from the arc, by calling * arc_clear_callback(), but will not evict the data from the ARC. */ void dbuf_clear(dmu_buf_impl_t *db) { dnode_t *dn; dmu_buf_impl_t *parent = db->db_parent; dmu_buf_impl_t *dndb; boolean_t dbuf_gone = B_FALSE; ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(refcount_is_zero(&db->db_holds)); dbuf_evict_user(db); if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); if (db->db_blkid == DMU_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; db->db_state = DB_UNCACHED; } ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); ASSERT(db->db_data_pending == NULL); db->db_state = DB_EVICTING; db->db_blkptr = NULL; DB_DNODE_ENTER(db); dn = DB_DNODE(db); dndb = dn->dn_dbuf; if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); membar_producer(); DB_DNODE_EXIT(db); /* * Decrementing the dbuf count means that the hold corresponding * to the removed dbuf is no longer discounted in dnode_move(), * so the dnode cannot be moved until after we release the hold. * The membar_producer() ensures visibility of the decremented * value in dnode_move(), since DB_DNODE_EXIT doesn't actually * release any lock. */ dnode_rele(dn, db); db->db_dnode_handle = NULL; } else { DB_DNODE_EXIT(db); } if (db->db_buf) dbuf_gone = arc_clear_callback(db->db_buf); if (!dbuf_gone) mutex_exit(&db->db_mtx); /* * If this dbuf is referenced from an indirect dbuf, * decrement the ref count on the indirect dbuf. */ if (parent && parent != dndb) dbuf_rele(parent, db); } static int dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, dmu_buf_impl_t **parentp, blkptr_t **bpp) { int nlevels, epbs; *parentp = NULL; *bpp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); if (blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); if (dn->dn_have_spill && (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) *bpp = &dn->dn_phys->dn_spill; else *bpp = NULL; dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; mutex_exit(&dn->dn_mtx); return (0); } if (dn->dn_phys->dn_nlevels == 0) nlevels = 1; else nlevels = dn->dn_phys->dn_nlevels; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(level * epbs, <, 64); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (level >= nlevels || (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { /* the buffer has no parent yet */ return (SET_ERROR(ENOENT)); } else if (level < nlevels-1) { /* this block is referenced from an indirect block */ int err = dbuf_hold_impl(dn, level+1, blkid >> epbs, fail_sparse, NULL, parentp); if (err) return (err); err = dbuf_read(*parentp, NULL, (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); if (err) { dbuf_rele(*parentp, NULL); *parentp = NULL; return (err); } *bpp = ((blkptr_t *)(*parentp)->db.db_data) + (blkid & ((1ULL << epbs) - 1)); return (0); } else { /* the block is referenced from the dnode */ ASSERT3U(level, ==, nlevels-1); ASSERT(dn->dn_phys->dn_nblkptr == 0 || blkid < dn->dn_phys->dn_nblkptr); if (dn->dn_dbuf) { dbuf_add_ref(dn->dn_dbuf, NULL); *parentp = dn->dn_dbuf; } *bpp = &dn->dn_phys->dn_blkptr[blkid]; return (0); } } static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_type != DMU_OT_NONE); db = kmem_cache_alloc(dbuf_cache, KM_SLEEP); db->db_objset = os; db->db.db_object = dn->dn_object; db->db_level = level; db->db_blkid = blkid; db->db_last_dirty = NULL; db->db_dirtycnt = 0; db->db_dnode_handle = dn->dn_handle; db->db_parent = parent; db->db_blkptr = blkptr; - db->db_user_ptr = NULL; - db->db_evict_func = NULL; + db->db_user = NULL; db->db_immediate_evict = 0; db->db_freed_in_flight = 0; if (blkid == DMU_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); db->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DMU_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); } else if (blkid == DMU_SPILL_BLKID) { db->db.db_size = (blkptr != NULL) ? BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; db->db.db_offset = 0; } else { int blocksize = db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; db->db.db_size = blocksize; db->db.db_offset = db->db_blkid * blocksize; } /* * Hold the dn_dbufs_mtx while we get the new dbuf * in the hash table *and* added to the dbufs list. * This prevents a possible deadlock with someone * trying to look up this dbuf before its added to the * dn_dbufs list. */ mutex_enter(&dn->dn_dbufs_mtx); db->db_state = DB_EVICTING; if ((odb = dbuf_hash_insert(db)) != NULL) { /* someone else inserted it first */ kmem_cache_free(dbuf_cache, db); mutex_exit(&dn->dn_dbufs_mtx); return (odb); } avl_add(&dn->dn_dbufs, db); if (db->db_level == 0 && db->db_blkid >= dn->dn_unlisted_l0_blkid) dn->dn_unlisted_l0_blkid = db->db_blkid + 1; db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || refcount_count(&dn->dn_holds) > 0); (void) refcount_add(&dn->dn_holds, db); atomic_inc_32(&dn->dn_dbufs_count); dprintf_dbuf(db, "db=%p\n", db); return (db); } static int dbuf_do_evict(void *private) { dmu_buf_impl_t *db = private; if (!MUTEX_HELD(&db->db_mtx)) mutex_enter(&db->db_mtx); ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_state != DB_EVICTING) { ASSERT(db->db_state == DB_CACHED); DBUF_VERIFY(db); db->db_buf = NULL; dbuf_evict(db); } else { mutex_exit(&db->db_mtx); dbuf_destroy(db); } return (0); } static void dbuf_destroy(dmu_buf_impl_t *db) { ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_blkid != DMU_BONUS_BLKID) { /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. */ if (db->db_dnode_handle != NULL) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); mutex_enter(&dn->dn_dbufs_mtx); avl_remove(&dn->dn_dbufs, db); atomic_dec_32(&dn->dn_dbufs_count); mutex_exit(&dn->dn_dbufs_mtx); DB_DNODE_EXIT(db); /* * Decrementing the dbuf count means that the hold * corresponding to the removed dbuf is no longer * discounted in dnode_move(), so the dnode cannot be * moved until after we release the hold. */ dnode_rele(dn, db); db->db_dnode_handle = NULL; } dbuf_hash_remove(db); } db->db_parent = NULL; db->db_buf = NULL; ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } void dbuf_prefetch(dnode_t *dn, uint64_t blkid, zio_priority_t prio) { dmu_buf_impl_t *db = NULL; blkptr_t *bp = NULL; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); if (dnode_block_freed(dn, blkid)) return; /* dbuf_find() returns with db_mtx held */ if (db = dbuf_find(dn->dn_objset, dn->dn_object, 0, blkid)) { /* * This dbuf is already in the cache. We assume that * it is already CACHED, or else about to be either * read or filled. */ mutex_exit(&db->db_mtx); return; } if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, dn->dn_object, 0, blkid); (void) arc_read(NULL, dn->dn_objset->os_spa, bp, NULL, NULL, prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } if (db) dbuf_rele(db, NULL); } } /* * Returns with db_holds incremented, and db_mtx not held. * Note: dn_struct_rwlock must be held. */ int dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, int fail_sparse, void *tag, dmu_buf_impl_t **dbp) { dmu_buf_impl_t *db, *parent = NULL; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT3U(dn->dn_nlevels, >, level); *dbp = NULL; top: /* dbuf_find() returns with db_mtx held */ db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); if (db == NULL) { blkptr_t *bp = NULL; int err; ASSERT3P(parent, ==, NULL); err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); if (fail_sparse) { if (err == 0 && bp && BP_IS_HOLE(bp)) err = SET_ERROR(ENOENT); if (err) { if (parent) dbuf_rele(parent, NULL); return (err); } } if (err && err != ENOENT) return (err); db = dbuf_create(dn, level, blkid, parent, bp); } if (db->db_buf && refcount_is_zero(&db->db_holds)) { arc_buf_add_ref(db->db_buf, db); if (db->db_buf->b_data == NULL) { dbuf_clear(db); if (parent) { dbuf_rele(parent, NULL); parent = NULL; } goto top; } ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); } ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); /* * If this buffer is currently syncing out, and we are are * still referencing it from db_data, we need to make a copy * of it in case we decide we want to dirty it again in this txg. */ if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && dn->dn_object != DMU_META_DNODE_OBJECT && db->db_state == DB_CACHED && db->db_data_pending) { dbuf_dirty_record_t *dr = db->db_data_pending; if (dr->dt.dl.dr_data == db->db_buf) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bcopy(dr->dt.dl.dr_data->b_data, db->db.db_data, db->db.db_size); } } (void) refcount_add(&db->db_holds, tag); DBUF_VERIFY(db); mutex_exit(&db->db_mtx); /* NOTE: we can't rele the parent until after we drop the db_mtx */ if (parent) dbuf_rele(parent, NULL); ASSERT3P(DB_DNODE(db), ==, dn); ASSERT3U(db->db_blkid, ==, blkid); ASSERT3U(db->db_level, ==, level); *dbp = db; return (0); } dmu_buf_impl_t * dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, 0, blkid, FALSE, tag, &db); return (err ? NULL : db); } dmu_buf_impl_t * dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) { dmu_buf_impl_t *db; int err = dbuf_hold_impl(dn, level, blkid, FALSE, tag, &db); return (err ? NULL : db); } void dbuf_create_bonus(dnode_t *dn) { ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); } int dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dnode_t *dn; if (db->db_blkid != DMU_SPILL_BLKID) return (SET_ERROR(ENOTSUP)); if (blksz == 0) blksz = SPA_MINBLOCKSIZE; ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); DB_DNODE_ENTER(db); dn = DB_DNODE(db); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dbuf_new_size(db, blksz, tx); rw_exit(&dn->dn_struct_rwlock); DB_DNODE_EXIT(db); return (0); } void dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) { dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); } #pragma weak dmu_buf_add_ref = dbuf_add_ref void dbuf_add_ref(dmu_buf_impl_t *db, void *tag) { int64_t holds = refcount_add(&db->db_holds, tag); ASSERT(holds > 1); } #pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref boolean_t dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, void *tag) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; dmu_buf_impl_t *found_db; boolean_t result = B_FALSE; if (db->db_blkid == DMU_BONUS_BLKID) found_db = dbuf_find_bonus(os, obj); else found_db = dbuf_find(os, obj, 0, blkid); if (found_db != NULL) { if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { (void) refcount_add(&db->db_holds, tag); result = B_TRUE; } mutex_exit(&db->db_mtx); } return (result); } /* * If you call dbuf_rele() you had better not be referencing the dnode handle * unless you have some other direct or indirect hold on the dnode. (An indirect * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the * dnode's parent dbuf evicting its dnode handles. */ void dbuf_rele(dmu_buf_impl_t *db, void *tag) { mutex_enter(&db->db_mtx); dbuf_rele_and_unlock(db, tag); } void dmu_buf_rele(dmu_buf_t *db, void *tag) { dbuf_rele((dmu_buf_impl_t *)db, tag); } /* * dbuf_rele() for an already-locked dbuf. This is necessary to allow * db_dirtycnt and db_holds to be updated atomically. */ void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); /* * Remove the reference to the dbuf before removing its hold on the * dnode so we can guarantee in dnode_move() that a referenced bonus * buffer has a corresponding dnode hold. */ holds = refcount_remove(&db->db_holds, tag); ASSERT(holds >= 0); /* * We can't freeze indirects if there is a possibility that they * may be modified in the current syncing context. */ if (db->db_buf && holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) arc_buf_freeze(db->db_buf); if (holds == db->db_dirtycnt && db->db_level == 0 && db->db_immediate_evict) dbuf_evict_user(db); if (holds == 0) { if (db->db_blkid == DMU_BONUS_BLKID) { dnode_t *dn; /* * If the dnode moves here, we cannot cross this * barrier until the move completes. */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); atomic_dec_32(&dn->dn_dbufs_count); /* * Decrementing the dbuf count means that the bonus * buffer's dnode hold is no longer discounted in * dnode_move(). The dnode cannot move until after * the dnode_rele_and_unlock() below. */ DB_DNODE_EXIT(db); /* * Do not reference db after its lock is dropped. * Another thread may evict it. */ mutex_exit(&db->db_mtx); /* * If the dnode has been freed, evict the bonus * buffer immediately. The data in the bonus * buffer is no longer relevant and this prevents * a stale bonus buffer from being associated * with this dnode_t should the dnode_t be reused * prior to being destroyed. */ mutex_enter(&dn->dn_mtx); if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) { /* * Drop dn_mtx. It is a leaf lock and * cannot be held when dnode_evict_bonus() * acquires other locks in order to * perform the eviction. * * Freed dnodes cannot be reused until the * last hold is released. Since this bonus * buffer has a hold, the dnode will remain * in the free state, even without dn_mtx * held, until the dnode_rele_and_unlock() * below. */ mutex_exit(&dn->dn_mtx); dnode_evict_bonus(dn); mutex_enter(&dn->dn_mtx); } dnode_rele_and_unlock(dn, db); } else if (db->db_buf == NULL) { /* * This is a special case: we never associated this * dbuf with any data allocated from the ARC. */ ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); dbuf_evict(db); } else if (arc_released(db->db_buf)) { arc_buf_t *buf = db->db_buf; /* * This dbuf has anonymous data associated with it. */ - dbuf_set_data(db, NULL); + dbuf_clear_data(db); VERIFY(arc_buf_remove_ref(buf, db)); dbuf_evict(db); } else { VERIFY(!arc_buf_remove_ref(db->db_buf, db)); /* * A dbuf will be eligible for eviction if either the * 'primarycache' property is set or a duplicate * copy of this buffer is already cached in the arc. * * In the case of the 'primarycache' a buffer * is considered for eviction if it matches the * criteria set in the property. * * To decide if our buffer is considered a * duplicate, we must call into the arc to determine * if multiple buffers are referencing the same * block on-disk. If so, then we simply evict * ourselves. */ if (!DBUF_IS_CACHEABLE(db)) { if (db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr) && !BP_IS_EMBEDDED(db->db_blkptr)) { spa_t *spa = dmu_objset_spa(db->db_objset); blkptr_t bp = *db->db_blkptr; dbuf_clear(db); arc_freed(spa, &bp); } else { dbuf_clear(db); } - } else if (arc_buf_eviction_needed(db->db_buf)) { + } else if (db->db_objset->os_evicting || + arc_buf_eviction_needed(db->db_buf)) { dbuf_clear(db); } else { mutex_exit(&db->db_mtx); } } } else { mutex_exit(&db->db_mtx); } } #pragma weak dmu_buf_refcount = dbuf_refcount uint64_t dbuf_refcount(dmu_buf_impl_t *db) { return (refcount_count(&db->db_holds)); } void * -dmu_buf_set_user(dmu_buf_t *db_fake, void *user_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user) { - return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + mutex_enter(&db->db_mtx); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + if (db->db_user == old_user) + db->db_user = new_user; + else + old_user = db->db_user; + dbuf_verify_user(db, DBVU_NOT_EVICTING); + mutex_exit(&db->db_mtx); + + return (old_user); } void * -dmu_buf_set_user_ie(dmu_buf_t *db_fake, void *user_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - - db->db_immediate_evict = TRUE; - return (dmu_buf_update_user(db_fake, NULL, user_ptr, evict_func)); + return (dmu_buf_replace_user(db_fake, NULL, user)); } void * -dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, void *user_ptr, - dmu_buf_evict_func_t *evict_func) +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(db->db_level == 0); - ASSERT((user_ptr == NULL) == (evict_func == NULL)); + db->db_immediate_evict = TRUE; + return (dmu_buf_set_user(db_fake, user)); +} - mutex_enter(&db->db_mtx); - - if (db->db_user_ptr == old_user_ptr) { - db->db_user_ptr = user_ptr; - db->db_evict_func = evict_func; - } else { - old_user_ptr = db->db_user_ptr; - } - - mutex_exit(&db->db_mtx); - return (old_user_ptr); +void * +dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + return (dmu_buf_replace_user(db_fake, user, NULL)); } void * dmu_buf_get_user(dmu_buf_t *db_fake) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - ASSERT(!refcount_is_zero(&db->db_holds)); - return (db->db_user_ptr); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + return (db->db_user); +} + +void +dmu_buf_user_evict_wait() +{ + taskq_wait(dbu_evict_taskq); } boolean_t dmu_buf_freeable(dmu_buf_t *dbuf) { boolean_t res = B_FALSE; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; if (db->db_blkptr) res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, db->db_blkptr, db->db_blkptr->blk_birth); return (res); } blkptr_t * dmu_buf_get_blkptr(dmu_buf_t *db) { dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; return (dbi->db_blkptr); } static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { /* ASSERT(dmu_tx_is_syncing(tx) */ ASSERT(MUTEX_HELD(&db->db_mtx)); if (db->db_blkptr != NULL) return; if (db->db_blkid == DMU_SPILL_BLKID) { db->db_blkptr = &dn->dn_phys->dn_spill; BP_ZERO(db->db_blkptr); return; } if (db->db_level == dn->dn_phys->dn_nlevels-1) { /* * This buffer was allocated at a time when there was * no available blkptrs from the dnode, or it was * inappropriate to hook it in (i.e., nlevels mis-match). */ ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); ASSERT(db->db_parent == NULL); db->db_parent = dn->dn_dbuf; db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; DBUF_VERIFY(db); } else { dmu_buf_impl_t *parent = db->db_parent; int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT(dn->dn_phys->dn_nlevels > 1); if (parent == NULL) { mutex_exit(&db->db_mtx); rw_enter(&dn->dn_struct_rwlock, RW_READER); (void) dbuf_hold_impl(dn, db->db_level+1, db->db_blkid >> epbs, FALSE, db, &parent); rw_exit(&dn->dn_struct_rwlock); mutex_enter(&db->db_mtx); db->db_parent = parent; } db->db_blkptr = (blkptr_t *)parent->db.db_data + (db->db_blkid & ((1ULL << epbs) - 1)); DBUF_VERIFY(db); } } static void dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; zio_t *zio; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); ASSERT(db->db_level > 0); DBUF_VERIFY(db); /* Read the block if it hasn't been read yet. */ if (db->db_buf == NULL) { mutex_exit(&db->db_mtx); (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); mutex_enter(&db->db_mtx); } ASSERT3U(db->db_state, ==, DB_CACHED); ASSERT(db->db_buf != NULL); DB_DNODE_ENTER(db); dn = DB_DNODE(db); /* Indirect block size must match what the dnode thinks it is. */ ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); dbuf_check_blkptr(dn, db); DB_DNODE_EXIT(db); /* Provide the pending dirty record to child dbufs */ db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); } static void dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) { arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; uint64_t txg = tx->tx_txg; ASSERT(dmu_tx_is_syncing(tx)); dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); mutex_enter(&db->db_mtx); /* * To be synced, we must be dirtied. But we * might have been freed after the dirty. */ if (db->db_state == DB_UNCACHED) { /* This buffer has been freed since it was dirtied */ ASSERT(db->db.db_data == NULL); } else if (db->db_state == DB_FILL) { /* This buffer was freed and is now being re-filled */ ASSERT(db->db.db_data != dr->dt.dl.dr_data); } else { ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL); } DBUF_VERIFY(db); DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (db->db_blkid == DMU_SPILL_BLKID) { mutex_enter(&dn->dn_mtx); dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* * If this is a bonus buffer, simply copy the bonus data into the * dnode. It will be written out when the dnode is synced (and it * will be synced, since it must have been dirty for dbuf_sync to * be called). */ if (db->db_blkid == DMU_BONUS_BLKID) { dbuf_dirty_record_t **drp; ASSERT(*datap != NULL); ASSERT0(db->db_level); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); DB_DNODE_EXIT(db); if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; ASSERT(dr->dr_next == NULL); ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); return; } os = dn->dn_objset; /* * This function may have dropped the db_mtx lock allowing a dmu_sync * operation to sneak in. As a result, we need to ensure that we * don't check the dr_override_state until we have returned from * dbuf_check_blkptr. */ dbuf_check_blkptr(dn, db); /* * If this buffer is in the middle of an immediate write, * wait for the synchronous IO to complete. */ while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); cv_wait(&db->db_changed, &db->db_mtx); ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } if (db->db_state != DB_NOFILL && dn->dn_object != DMU_META_DNODE_OBJECT && refcount_count(&db->db_holds) > 1 && dr->dt.dl.dr_override_state != DR_OVERRIDDEN && *datap == db->db_buf) { /* * If this buffer is currently "in use" (i.e., there * are active holds and db_data still references it), * then make a copy before we start the write so that * any modifications from the open txg will not leak * into this write. * * NOTE: this copy does not need to be made for * objects only modified in the syncing context (e.g. * DNONE_DNODE blocks). */ int blksz = arc_buf_size(*datap); arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); *datap = arc_buf_alloc(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; mutex_exit(&db->db_mtx); dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) { list_insert_tail(&dn->dn_dirty_records[txg&TXG_MASK], dr); DB_DNODE_EXIT(db); } else { /* * Although zio_nowait() does not "wait for an IO", it does * initiate the IO. If this is an empty write it seems plausible * that the IO could actually be completed before the nowait * returns. We need to DB_DNODE_EXIT() first in case * zio_nowait() invalidates the dbuf. */ DB_DNODE_EXIT(db); zio_nowait(dr->dr_zio); } } void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; while (dr = list_head(list)) { if (dr->dr_zio != NULL) { /* * If we find an already initialized zio then we * are processing the meta-dnode, and we have finished. * The dbufs for all dnodes are put back on the list * during processing, so that we can zio_wait() * these IOs after initiating all child IOs. */ ASSERT3U(dr->dr_dbuf->db.db_object, ==, DMU_META_DNODE_OBJECT); break; } if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { VERIFY3U(dr->dr_dbuf->db_level, ==, level); } list_remove(list, dr); if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); else dbuf_sync_leaf(dr, tx); } } /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; dnode_t *dn; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; spa_t *spa = zio->io_spa; int64_t delta; uint64_t fill = 0; int i; ASSERT3P(db->db_blkptr, ==, bp); DB_DNODE_ENTER(db); dn = DB_DNODE(db); delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); dnode_diduse_space(dn, delta - zio->io_prev_space_delta); zio->io_prev_space_delta = delta; if (bp->blk_birth != 0) { ASSERT((db->db_blkid != DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_type) || (db->db_blkid == DMU_SPILL_BLKID && BP_GET_TYPE(bp) == dn->dn_bonustype) || BP_IS_EMBEDDED(bp)); ASSERT(BP_GET_LEVEL(bp) == db->db_level); } mutex_enter(&db->db_mtx); #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); } #endif if (db->db_level == 0) { mutex_enter(&dn->dn_mtx); if (db->db_blkid > dn->dn_phys->dn_maxblkid && db->db_blkid != DMU_SPILL_BLKID) dn->dn_phys->dn_maxblkid = db->db_blkid; mutex_exit(&dn->dn_mtx); if (dn->dn_type == DMU_OT_DNODE) { dnode_phys_t *dnp = db->db.db_data; for (i = db->db.db_size >> DNODE_SHIFT; i > 0; i--, dnp++) { if (dnp->dn_type != DMU_OT_NONE) fill++; } } else { if (BP_IS_HOLE(bp)) { fill = 0; } else { fill = 1; } } } else { blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; fill += BP_GET_FILL(ibp); } } DB_DNODE_EXIT(db); if (!BP_IS_EMBEDDED(bp)) bp->blk_fill = fill; mutex_exit(&db->db_mtx); } /* * The SPA will call this callback several times for each zio - once * for every physical child i/o (zio->io_phys_children times). This * allows the DMU to monitor the progress of each logical i/o. For example, * there may be 2 copies of an indirect block, or many fragments of a RAID-Z * block. There may be a long delay before all copies/fragments are completed, * so this callback allows us to retire dirty space gradually, as the physical * i/os complete. */ /* ARGSUSED */ static void dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) { dmu_buf_impl_t *db = arg; objset_t *os = db->db_objset; dsl_pool_t *dp = dmu_objset_pool(os); dbuf_dirty_record_t *dr; int delta = 0; dr = db->db_data_pending; ASSERT3U(dr->dr_txg, ==, zio->io_txg); /* * The callback will be called io_phys_children times. Retire one * portion of our dirty space each time we are called. Any rounding * error will be cleaned up by dsl_pool_sync()'s call to * dsl_pool_undirty_space(). */ delta = dr->dr_accounted / zio->io_phys_children; dsl_pool_undirty_space(dp, delta, zio->io_txg); } /* ARGSUSED */ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; blkptr_t *bp_orig = &zio->io_bp_orig; blkptr_t *bp = db->db_blkptr; objset_t *os = db->db_objset; dmu_tx_t *tx = os->os_synctx; dbuf_dirty_record_t **drp, *dr; ASSERT0(zio->io_error); ASSERT(db->db_blkptr == bp); /* * For nopwrites and rewrites we ensure that the bp matches our * original and bypass all the accounting. */ if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } mutex_enter(&db->db_mtx); DBUF_VERIFY(db); drp = &db->db_last_dirty; while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; #ifdef ZFS_DEBUG if (db->db_blkid == DMU_SPILL_BLKID) { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR); ASSERT(!(BP_IS_HOLE(db->db_blkptr)) && db->db_blkptr == &dn->dn_phys->dn_spill); DB_DNODE_EXIT(db); } #endif if (db->db_level == 0) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); else if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); } } else { dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; ASSERT3U(db->db_blkid, <=, dn->dn_phys->dn_maxblkid >> (db->db_level * epbs)); ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, db->db.db_size); if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); } DB_DNODE_EXIT(db); mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); cv_broadcast(&db->db_changed); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg); } static void dbuf_write_nofill_ready(zio_t *zio) { dbuf_write_ready(zio, NULL, zio->io_private); } static void dbuf_write_nofill_done(zio_t *zio) { dbuf_write_done(zio, NULL, zio->io_private); } static void dbuf_write_override_ready(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; dbuf_write_ready(zio, NULL, db); } static void dbuf_write_override_done(zio_t *zio) { dbuf_dirty_record_t *dr = zio->io_private; dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *obp = &dr->dt.dl.dr_overridden_by; mutex_enter(&db->db_mtx); if (!BP_EQUAL(zio->io_bp, obp)) { if (!BP_IS_HOLE(obp)) dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); arc_release(dr->dt.dl.dr_data, db); } mutex_exit(&db->db_mtx); dbuf_write_done(zio, NULL, db); } /* Issue I/O to commit a dirty buffer to disk. */ static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn; objset_t *os; dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; int wp_flag = 0; DB_DNODE_ENTER(db); dn = DB_DNODE(db); os = dn->dn_objset; if (db->db_state != DB_NOFILL) { if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { /* * Private object buffers are released here rather * than in dbuf_dirty() since they are only modified * in the syncing context and we don't want the * overhead of making multiple copies of the data. */ if (BP_IS_HOLE(db->db_blkptr)) { arc_buf_thaw(data); } else { dbuf_release_bp(db); } } } if (parent != dn->dn_dbuf) { /* Our parent is an indirect block. */ /* We have a dirty parent that has been scheduled for write. */ ASSERT(parent && parent->db_data_pending); /* Our parent's buffer is one level closer to the dnode. */ ASSERT(db->db_level == parent->db_level-1); /* * We're about to modify our parent's db_data by modifying * our block pointer, so the parent must be released. */ ASSERT(arc_released(parent->db_buf)); zio = parent->db_data_pending->dr_zio; } else { /* Our parent is the dnode itself. */ ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && db->db_blkid != DMU_SPILL_BLKID) || (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); if (db->db_blkid != DMU_SPILL_BLKID) ASSERT3P(db->db_blkptr, ==, &dn->dn_phys->dn_blkptr[db->db_blkid]); zio = dn->dn_zio; } ASSERT(db->db_level == 0 || data == db->db_buf); ASSERT3U(db->db_blkptr->blk_birth, <=, txg); ASSERT(zio); SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, db->db.db_object, db->db_level, db->db_blkid); if (db->db_blkid == DMU_SPILL_BLKID) wp_flag = WP_SPILL; wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); DB_DNODE_EXIT(db); if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* * The BP for this block has been provided by open context * (by dmu_sync() or dmu_buf_write_embedded()). */ void *contents = (data != NULL) ? data->b_data : NULL; dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, contents, db->db.db_size, &zp, dbuf_write_override_ready, NULL, dbuf_write_override_done, dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); mutex_enter(&db->db_mtx); dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); mutex_exit(&db->db_mtx); } else if (db->db_state == DB_NOFILL) { ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); dr->dr_zio = zio_write(zio, os->os_spa, txg, db->db_blkptr, NULL, db->db.db_size, &zp, dbuf_write_nofill_ready, NULL, dbuf_write_nofill_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); } else { ASSERT(arc_released(data)); dr->dr_zio = arc_write(zio, os->os_spa, txg, db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), DBUF_IS_L2COMPRESSIBLE(db), &zp, dbuf_write_ready, dbuf_write_physdone, dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_objset.c (revision 286575) @@ -1,1821 +1,1846 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Needed to close a window in dnode_move() that allows the objset to be freed * before it can be safely accessed. */ krwlock_t os_lock; void dmu_objset_init(void) { rw_init(&os_lock, NULL, RW_DEFAULT, NULL); } void dmu_objset_fini(void) { rw_destroy(&os_lock); } spa_t * dmu_objset_spa(objset_t *os) { return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { return (os->os_zil); } dsl_pool_t * dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? ds->ds_object : 0); } zfs_sync_type_t dmu_objset_syncprop(objset_t *os) { return (os->os_sync); } zfs_logbias_op_t dmu_objset_logbias(objset_t *os) { return (os->os_logbias); } static void checksum_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); } static void compression_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval != ZIO_COMPRESS_INHERIT); os->os_compress = zio_compress_select(os->os_spa, newval, ZIO_COMPRESS_ON); } static void copies_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval > 0); ASSERT(newval <= spa_max_replication(os->os_spa)); os->os_copies = newval; } static void dedup_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; spa_t *spa = os->os_spa; enum zio_checksum checksum; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); } static void primary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_primary_cache = newval; } static void secondary_cache_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); os->os_secondary_cache = newval; } static void sync_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || newval == ZFS_SYNC_DISABLED); os->os_sync = newval; if (os->os_zil) zil_set_sync(os->os_zil, newval); } static void redundant_metadata_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || newval == ZFS_REDUNDANT_METADATA_MOST); os->os_redundant_metadata = newval; } static void logbias_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; ASSERT(newval == ZFS_LOGBIAS_LATENCY || newval == ZFS_LOGBIAS_THROUGHPUT); os->os_logbias = newval; if (os->os_zil) zil_set_logbias(os->os_zil, newval); } static void recordsize_changed_cb(void *arg, uint64_t newval) { objset_t *os = arg; os->os_recordsize = newval; } void dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); osp->os_flags = BSWAP_64(osp->os_flags); if (size == sizeof (objset_phys_t)) { dnode_byteswap(&osp->os_userused_dnode); dnode_byteswap(&osp->os_groupused_dnode); } } int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, objset_t **osp) { objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; os->os_spa = spa; os->os_rootbp = bp; if (!BP_IS_HOLE(os->os_rootbp)) { arc_flags_t aflags = ARC_FLAG_WAIT; zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_FLAG_L2CACHE; if (DMU_OS_IS_L2COMPRESSIBLE(os)) aflags |= ARC_FLAG_L2COMPRESS; dprintf_bp(os->os_rootbp, "reading %s", ""); err = arc_read(NULL, spa, os->os_rootbp, arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err != 0) { kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } /* Increase the blocksize if we are permitted. */ if (spa_version(spa) >= SPA_VERSION_USERSPACE && arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { arc_buf_t *buf = arc_buf_alloc(spa, sizeof (objset_phys_t), &os->os_phys_buf, ARC_BUFC_METADATA); bzero(buf->b_data, sizeof (objset_phys_t)); bcopy(os->os_phys_buf->b_data, buf->b_data, arc_buf_size(os->os_phys_buf)); (void) arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf); os->os_phys_buf = buf; } os->os_phys = os->os_phys_buf->b_data; os->os_flags = os->os_phys->os_flags; } else { int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; os->os_phys_buf = arc_buf_alloc(spa, size, &os->os_phys_buf, ARC_BUFC_METADATA); os->os_phys = os->os_phys_buf->b_data; bzero(os->os_phys, size); } /* * Note: the changed_cb will be called once before the register * func returns, thus changing the checksum/compression from the * default (fletcher2/off). Snapshots don't need to know about * checksum/compression/copies. */ if (ds != NULL) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os); if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os); } - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name( ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os); } if (err == 0) { err = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os); } } if (err != 0) { VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); kmem_free(os, sizeof (objset_t)); return (err); } } else { /* It's the meta-objset. */ os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; os->os_compress = ZIO_COMPRESS_ON; os->os_copies = spa_max_replication(spa); os->os_dedup_checksum = ZIO_CHECKSUM_OFF; os->os_dedup_verify = B_FALSE; os->os_logbias = ZFS_LOGBIAS_LATENCY; os->os_sync = ZFS_SYNC_STANDARD; os->os_primary_cache = ZFS_CACHE_ALL; os->os_secondary_cache = ZFS_CACHE_ALL; } - if (ds == NULL || !dsl_dataset_is_snapshot(ds)) + if (ds == NULL || !ds->ds_is_snapshot) os->os_zil_header = os->os_phys->os_zil_header; os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); list_create(&os->os_free_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); } list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - DMU_META_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT, - &os->os_meta_dnode); + dnode_special_open(os, &os->os_phys->os_meta_dnode, + DMU_META_DNODE_OBJECT, &os->os_meta_dnode); if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { - DMU_USERUSED_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT, - &os->os_userused_dnode); - DMU_GROUPUSED_DNODE(os) = dnode_special_open(os, - &os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT, - &os->os_groupused_dnode); + dnode_special_open(os, &os->os_phys->os_userused_dnode, + DMU_USERUSED_OBJECT, &os->os_userused_dnode); + dnode_special_open(os, &os->os_phys->os_groupused_dnode, + DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); } *osp = os; return (0); } int dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { int err = 0; mutex_enter(&ds->ds_opening_lock); if (ds->ds_objset == NULL) { objset_t *os; err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), ds, dsl_dataset_get_blkptr(ds), &os); if (err == 0) { mutex_enter(&ds->ds_lock); ASSERT(ds->ds_objset == NULL); ds->ds_objset = os; mutex_exit(&ds->ds_lock); } } *osp = ds->ds_objset; mutex_exit(&ds->ds_opening_lock); return (err); } /* * Holds the pool while the objset is held. Therefore only one objset * can be held at a time. */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(name, tag, &dp); if (err != 0) return (err); err = dsl_dataset_hold(dp, name, tag, &ds); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } err = dmu_objset_from_ds(ds, osp); if (err != 0) { dsl_dataset_rele(ds, tag); dsl_pool_rele(dp, tag); } return (err); } /* * dsl_pool must not be held when this is called. * Upon successful return, there will be a longhold on the dataset, * and the dsl_pool will not be held. */ int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_own(dp, name, tag, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } err = dmu_objset_from_ds(ds, osp); dsl_pool_rele(dp, FTAG); if (err != 0) { dsl_dataset_disown(ds, tag); } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EINVAL)); - } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + } else if (!readonly && ds->ds_is_snapshot) { dsl_dataset_disown(ds, tag); return (SET_ERROR(EROFS)); } return (err); } void dmu_objset_rele(objset_t *os, void *tag) { dsl_pool_t *dp = dmu_objset_pool(os); dsl_dataset_rele(os->os_dsl_dataset, tag); dsl_pool_rele(dp, tag); } /* * When we are called, os MUST refer to an objset associated with a dataset * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner * == tag. We will then release and reacquire ownership of the dataset while * holding the pool config_rwlock to avoid intervening namespace or ownership * changes may occur. * * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to * release the hold on its dataset and acquire a new one on the dataset of the * same name so that it can be partially torn down and reconstructed. */ void dmu_objset_refresh_ownership(objset_t *os, void *tag) { dsl_pool_t *dp; dsl_dataset_t *ds, *newds; char name[MAXNAMELEN]; ds = os->os_dsl_dataset; VERIFY3P(ds, !=, NULL); VERIFY3P(ds->ds_owner, ==, tag); VERIFY(dsl_dataset_long_held(ds)); dsl_dataset_name(ds, name); dp = dmu_objset_pool(os); dsl_pool_config_enter(dp, FTAG); dmu_objset_disown(os, tag); VERIFY0(dsl_dataset_own(dp, name, tag, &newds)); VERIFY3P(newds, ==, os->os_dsl_dataset); dsl_pool_config_exit(dp, FTAG); } void dmu_objset_disown(objset_t *os, void *tag) { dsl_dataset_disown(os->os_dsl_dataset, tag); } void dmu_objset_evict_dbufs(objset_t *os) { + dnode_t dn_marker; dnode_t *dn; mutex_enter(&os->os_lock); + dn = list_head(&os->os_dnodes); + while (dn != NULL) { + /* + * Skip dnodes without holds. We have to do this dance + * because dnode_add_ref() only works if there is already a + * hold. If the dnode has no holds, then it has no dbufs. + */ + if (dnode_add_ref(dn, FTAG)) { + list_insert_after(&os->os_dnodes, dn, &dn_marker); + mutex_exit(&os->os_lock); - /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&os->os_dnodes, DMU_META_DNODE(os)); - list_insert_tail(&os->os_dnodes, DMU_META_DNODE(os)); + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); - /* - * Find the first dnode with holds. We have to do this dance - * because dnode_add_ref() only works if you already have a - * hold. If there are no holds then it has no dbufs so OK to - * skip. - */ - for (dn = list_head(&os->os_dnodes); - dn && !dnode_add_ref(dn, FTAG); - dn = list_next(&os->os_dnodes, dn)) - continue; - - while (dn) { - dnode_t *next_dn = dn; - - do { - next_dn = list_next(&os->os_dnodes, next_dn); - } while (next_dn && !dnode_add_ref(next_dn, FTAG)); - - mutex_exit(&os->os_lock); - dnode_evict_dbufs(dn); - dnode_rele(dn, FTAG); - mutex_enter(&os->os_lock); - dn = next_dn; + mutex_enter(&os->os_lock); + dn = list_next(&os->os_dnodes, &dn_marker); + list_remove(&os->os_dnodes, &dn_marker); + } else { + dn = list_next(&os->os_dnodes, dn); + } } mutex_exit(&os->os_lock); + + if (DMU_USERUSED_DNODE(os) != NULL) { + dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); + dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); + } + dnode_evict_dbufs(DMU_META_DNODE(os)); } +/* + * Objset eviction processing is split into into two pieces. + * The first marks the objset as evicting, evicts any dbufs that + * have a refcount of zero, and then queues up the objset for the + * second phase of eviction. Once os->os_dnodes has been cleared by + * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. + * The second phase closes the special dnodes, dequeues the objset from + * the list of those undergoing eviction, and finally frees the objset. + * + * NOTE: Due to asynchronous eviction processing (invocation of + * dnode_buf_pageout()), it is possible for the meta dnode for the + * objset to have no holds even though os->os_dnodes is not empty. + */ void dmu_objset_evict(objset_t *os) { dsl_dataset_t *ds = os->os_dsl_dataset; for (int t = 0; t < TXG_SIZE; t++) ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_CHECKSUM), checksum_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_COMPRESSION), compression_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_COPIES), copies_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEDUP), dedup_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_LOGBIAS), logbias_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SYNC), sync_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA), redundant_metadata_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), recordsize_changed_cb, os)); } VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), primary_cache_changed_cb, os)); VERIFY0(dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), secondary_cache_changed_cb, os)); } if (os->os_sa) sa_tear_down(os); + os->os_evicting = B_TRUE; dmu_objset_evict_dbufs(os); + mutex_enter(&os->os_lock); + spa_evicting_os_register(os->os_spa, os); + if (list_is_empty(&os->os_dnodes)) { + mutex_exit(&os->os_lock); + dmu_objset_evict_done(os); + } else { + mutex_exit(&os->os_lock); + } +} + +void +dmu_objset_evict_done(objset_t *os) +{ + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + dnode_special_close(&os->os_meta_dnode); if (DMU_USERUSED_DNODE(os)) { dnode_special_close(&os->os_userused_dnode); dnode_special_close(&os->os_groupused_dnode); } zil_free(os->os_zil); - ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf)); /* * This is a barrier to prevent the objset from going away in * dnode_move() until we can safely ensure that the objset is still in * use. We consider the objset valid before the barrier and invalid * after the barrier. */ rw_enter(&os_lock, RW_READER); rw_exit(&os_lock); mutex_destroy(&os->os_lock); mutex_destroy(&os->os_obj_lock); mutex_destroy(&os->os_user_ptr_lock); + spa_evicting_os_deregister(os->os_spa, os); kmem_free(os, sizeof (objset_t)); } timestruc_t dmu_objset_snap_cmtime(objset_t *os) { return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } /* called from dsl for meta-objset */ objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); if (ds != NULL) VERIFY0(dmu_objset_from_ds(ds, &os)); else VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); mdn = DMU_META_DNODE(os); dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); /* * We don't want to have to increase the meta-dnode's nlevels * later, because then we could do it in quescing context while * we are also accessing it in open context. * * This precaution is not necessary for the MOS (ds == NULL), * because the MOS is only updated in syncing context. * This is most fortunate: the MOS is the only objset that * needs to be synced multiple times as spa_sync() iterates * to convergence, so minimizing its dn_nlevels matters. */ if (ds != NULL) { int levels = 1; /* * Determine the number of levels necessary for the meta-dnode * to contain DN_MAX_OBJECT dnodes. */ while ((uint64_t)mdn->dn_nblkptr << (mdn->dn_datablkshift + (levels - 1) * (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < DN_MAX_OBJECT * sizeof (dnode_phys_t)) levels++; mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = mdn->dn_nlevels = levels; } ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); os->os_phys->os_type = type; if (dmu_objset_userused_enabled(os)) { os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; os->os_flags = os->os_phys->os_flags; } dsl_dataset_dirty(ds, tx); return (os); } typedef struct dmu_objset_create_arg { const char *doca_name; cred_t *doca_cred; void (*doca_userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *doca_userarg; dmu_objset_type_t doca_type; uint64_t doca_flags; } dmu_objset_create_arg_t; /*ARGSUSED*/ static int dmu_objset_create_check(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; int error; if (strchr(doca->doca_name, '@') != NULL) return (SET_ERROR(EINVAL)); error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred); dsl_dir_rele(pdd, FTAG); return (error); } static void dmu_objset_create_sync(void *arg, dmu_tx_t *tx) { dmu_objset_create_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; dsl_dataset_t *ds; uint64_t obj; blkptr_t *bp; objset_t *os; VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, doca->doca_cred, tx); VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); bp = dsl_dataset_get_blkptr(ds); os = dmu_objset_create_impl(pdd->dd_pool->dp_spa, ds, bp, doca->doca_type, tx); if (doca->doca_userfunc != NULL) { doca->doca_userfunc(os, doca->doca_userarg, doca->doca_cred, tx); } spa_history_log_internal_ds(ds, "create", tx, ""); dsl_dataset_rele(ds, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dmu_objset_create_arg_t doca; doca.doca_name = name; doca.doca_cred = CRED(); doca.doca_flags = flags; doca.doca_userfunc = func; doca.doca_userarg = arg; doca.doca_type = type; return (dsl_sync_task(name, dmu_objset_create_check, dmu_objset_create_sync, &doca, 5, ZFS_SPACE_CHECK_NORMAL)); } typedef struct dmu_objset_clone_arg { const char *doca_clone; const char *doca_origin; cred_t *doca_cred; } dmu_objset_clone_arg_t; /*ARGSUSED*/ static int dmu_objset_clone_check(void *arg, dmu_tx_t *tx) { dmu_objset_clone_arg_t *doca = arg; dsl_dir_t *pdd; const char *tail; int error; dsl_dataset_t *origin; dsl_pool_t *dp = dmu_tx_pool(tx); if (strchr(doca->doca_clone, '@') != NULL) return (SET_ERROR(EINVAL)); error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); if (error != 0) return (error); if (tail == NULL) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EEXIST)); } error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, doca->doca_cred); if (error != 0) { dsl_dir_rele(pdd, FTAG); return (SET_ERROR(EDQUOT)); } dsl_dir_rele(pdd, FTAG); error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); if (error != 0) return (error); /* You can only clone snapshots, not the head datasets. */ - if (!dsl_dataset_is_snapshot(origin)) { + if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); return (SET_ERROR(EINVAL)); } dsl_dataset_rele(origin, FTAG); return (0); } static void dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) { dmu_objset_clone_arg_t *doca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *pdd; const char *tail; dsl_dataset_t *origin, *ds; uint64_t obj; char namebuf[MAXNAMELEN]; VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); obj = dsl_dataset_create_sync(pdd, tail, origin, 0, doca->doca_cred, tx); VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); dsl_dataset_name(origin, namebuf); spa_history_log_internal_ds(ds, "clone", tx, "origin=%s (%llu)", namebuf, origin->ds_object); dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(origin, FTAG); dsl_dir_rele(pdd, FTAG); } int dmu_objset_clone(const char *clone, const char *origin) { dmu_objset_clone_arg_t doca; doca.doca_clone = clone; doca.doca_origin = origin; doca.doca_cred = CRED(); return (dsl_sync_task(clone, dmu_objset_clone_check, dmu_objset_clone_sync, &doca, 5, ZFS_SPACE_CHECK_NORMAL)); } int dmu_objset_snapshot_one(const char *fsname, const char *snapname) { int err; char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); nvlist_t *snaps = fnvlist_alloc(); fnvlist_add_boolean(snaps, longsnap); strfree(longsnap); err = dsl_dataset_snapshot(snaps, NULL, NULL); fnvlist_free(snaps); return (err); } static void dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { dnode_t *dn; while (dn = list_head(list)) { ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* * Initialize dn_zio outside dnode_sync() because the * meta-dnode needs to set it ouside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); list_remove(list, dn); if (newlist) { (void) dnode_add_ref(dn, newlist); list_insert_tail(newlist, dn); } dnode_sync(dn, tx); } } /* ARGSUSED */ static void dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ASSERT(!BP_IS_EMBEDDED(bp)); ASSERT3P(bp, ==, os->os_rootbp); ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); ASSERT0(BP_GET_LEVEL(bp)); /* * Update rootbp fill count: it should be the number of objects * allocated in the object set (not counting the "special" * objects that are stored in the objset_phys_t -- the meta * dnode and user/group accounting objects). */ bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += BP_GET_FILL(&dnp->dn_blkptr[i]); } /* ARGSUSED */ static void dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { ASSERT(BP_EQUAL(bp, bp_orig)); } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); dsl_dataset_block_born(ds, bp, tx); } } /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_phys_t zb; zio_prop_t zp; zio_t *zio; list_t *list; list_t *newlist = NULL; dbuf_dirty_record_t *dr; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); ASSERT(dmu_tx_is_syncing(tx)); /* XXX the write_done callback should really give us the tx... */ os->os_synctx = tx; if (os->os_dsl_dataset == NULL) { /* * This is the MOS. If we have upgraded, * spa_max_replication() could change, so reset * os_copies here. */ os->os_copies = spa_max_replication(os->os_spa); } /* * Create the root block IO */ SET_BOOKMARK(&zb, os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); arc_release(os->os_phys_buf, &os->os_phys_buf); dmu_write_policy(os, NULL, 0, 0, &zp); zio = arc_write(pio, os->os_spa, tx->tx_txg, os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), DMU_OS_IS_L2COMPRESSIBLE(os), &zp, dmu_objset_write_ready, NULL, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* * Sync special dnodes - the parent IO for the sync is the root block */ DMU_META_DNODE(os)->dn_zio = zio; dnode_sync(DMU_META_DNODE(os), tx); os->os_phys->os_flags = os->os_flags; if (DMU_USERUSED_DNODE(os) && DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { DMU_USERUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_USERUSED_DNODE(os), tx); DMU_GROUPUSED_DNODE(os)->dn_zio = zio; dnode_sync(DMU_GROUPUSED_DNODE(os), tx); } txgoff = tx->tx_txg & TXG_MASK; if (dmu_objset_userused_enabled(os)) { newlist = &os->os_synced_dnodes; /* * We must create the list here because it uses the * dn_dirty_link[] of this txg. */ list_create(newlist, sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[txgoff])); } dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; while (dr = list_head(list)) { ASSERT0(dr->dr_dbuf->db_level); list_remove(list, dr); if (dr->dr_zio) zio_nowait(dr->dr_zio); } /* * Free intent log blocks up to this tx. */ zil_sync(os->os_zil, tx); os->os_phys->os_zil_header = os->os_zil_header; zio_nowait(zio); } boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg) { return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); } static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) { used_cbs[ost] = cb; } boolean_t dmu_objset_userused_enabled(objset_t *os) { return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && used_cbs[os->os_phys->os_type] != NULL && DMU_USERUSED_DNODE(os) != NULL); } static void do_userquota_update(objset_t *os, uint64_t used, uint64_t flags, uint64_t user, uint64_t group, boolean_t subtract, dmu_tx_t *tx) { if ((flags & DNODE_FLAG_USERUSED_ACCOUNTED)) { int64_t delta = DNODE_SIZE + used; if (subtract) delta = -delta; VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, user, delta, tx)); VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, group, delta, tx)); } } void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) { dnode_t *dn; list_t *list = &os->os_synced_dnodes; ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); while (dn = list_head(list)) { int flags; ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED); /* Allocate the user/groupused objects if necessary. */ if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { VERIFY(0 == zap_create_claim(os, DMU_USERUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); VERIFY(0 == zap_create_claim(os, DMU_GROUPUSED_OBJECT, DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); } /* * We intentionally modify the zap object even if the * net delta is zero. Otherwise * the block of the zap obj could be shared between * datasets but need to be different between them after * a bprewrite. */ flags = dn->dn_id_flags; ASSERT(flags); if (flags & DN_ID_OLD_EXIST) { do_userquota_update(os, dn->dn_oldused, dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, B_TRUE, tx); } if (flags & DN_ID_NEW_EXIST) { do_userquota_update(os, DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid, B_FALSE, tx); } mutex_enter(&dn->dn_mtx); dn->dn_oldused = 0; dn->dn_oldflags = 0; if (dn->dn_id_flags & DN_ID_NEW_EXIST) { dn->dn_olduid = dn->dn_newuid; dn->dn_oldgid = dn->dn_newgid; dn->dn_id_flags |= DN_ID_OLD_EXIST; if (dn->dn_bonuslen == 0) dn->dn_id_flags |= DN_ID_CHKED_SPILL; else dn->dn_id_flags |= DN_ID_CHKED_BONUS; } dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); mutex_exit(&dn->dn_mtx); list_remove(list, dn); dnode_rele(dn, list); } } /* * Returns a pointer to data to find uid/gid from * * If a dirty record for transaction group that is syncing can't * be found then NULL is returned. In the NULL case it is assumed * the uid/gid aren't changing. */ static void * dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) { dbuf_dirty_record_t *dr, **drp; void *data; if (db->db_dirtycnt == 0) return (db->db.db_data); /* Nothing is changing */ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg == tx->tx_txg) break; if (dr == NULL) { data = NULL; } else { dnode_t *dn; DB_DNODE_ENTER(dr->dr_dbuf); dn = DB_DNODE(dr->dr_dbuf); if (dn->dn_bonuslen == 0 && dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) data = dr->dt.dl.dr_data->b_data; else data = dr->dt.dl.dr_data; DB_DNODE_EXIT(dr->dr_dbuf); } return (data); } void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; void *data = NULL; dmu_buf_impl_t *db = NULL; uint64_t *user = NULL; uint64_t *group = NULL; int flags = dn->dn_id_flags; int error; boolean_t have_spill = B_FALSE; if (!dmu_objset_userused_enabled(dn->dn_objset)) return; if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| DN_ID_CHKED_SPILL))) return; if (before && dn->dn_bonuslen != 0) data = DN_BONUS(dn->dn_phys); else if (!before && dn->dn_bonuslen != 0) { if (dn->dn_bonus) { db = dn->dn_bonus; mutex_enter(&db->db_mtx); data = dmu_objset_userquota_find_data(db, tx); } else { data = DN_BONUS(dn->dn_phys); } } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { int rf = 0; if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) rf |= DB_RF_HAVESTRUCT; error = dmu_spill_hold_by_dnode(dn, rf | DB_RF_MUST_SUCCEED, FTAG, (dmu_buf_t **)&db); ASSERT(error == 0); mutex_enter(&db->db_mtx); data = (before) ? db->db.db_data : dmu_objset_userquota_find_data(db, tx); have_spill = B_TRUE; } else { mutex_enter(&dn->dn_mtx); dn->dn_id_flags |= DN_ID_CHKED_BONUS; mutex_exit(&dn->dn_mtx); return; } if (before) { ASSERT(data); user = &dn->dn_olduid; group = &dn->dn_oldgid; } else if (data) { user = &dn->dn_newuid; group = &dn->dn_newgid; } /* * Must always call the callback in case the object * type has changed and that type isn't an object type to track */ error = used_cbs[os->os_phys->os_type](dn->dn_bonustype, data, user, group); /* * Preserve existing uid/gid when the callback can't determine * what the new uid/gid are and the callback returned EEXIST. * The EEXIST error tells us to just use the existing uid/gid. * If we don't know what the old values are then just assign * them to 0, since that is a new file being created. */ if (!before && data == NULL && error == EEXIST) { if (flags & DN_ID_OLD_EXIST) { dn->dn_newuid = dn->dn_olduid; dn->dn_newgid = dn->dn_oldgid; } else { dn->dn_newuid = 0; dn->dn_newgid = 0; } error = 0; } if (db) mutex_exit(&db->db_mtx); mutex_enter(&dn->dn_mtx); if (error == 0 && before) dn->dn_id_flags |= DN_ID_OLD_EXIST; if (error == 0 && !before) dn->dn_id_flags |= DN_ID_NEW_EXIST; if (have_spill) { dn->dn_id_flags |= DN_ID_CHKED_SPILL; } else { dn->dn_id_flags |= DN_ID_CHKED_BONUS; } mutex_exit(&dn->dn_mtx); if (have_spill) dmu_buf_rele((dmu_buf_t *)db, FTAG); } boolean_t dmu_objset_userspace_present(objset_t *os) { return (os->os_phys->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); } int dmu_objset_userspace_upgrade(objset_t *os) { uint64_t obj; int err = 0; if (dmu_objset_userspace_present(os)) return (0); if (!dmu_objset_userused_enabled(os)) return (SET_ERROR(ENOTSUP)); if (dmu_objset_is_snapshot(os)) return (SET_ERROR(EINVAL)); /* * We simply need to mark every object dirty, so that it will be * synced out and now accounted. If this is called * concurrently, or if we already did some work before crashing, * that's fine, since we track each object's accounted state * independently. */ for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { dmu_tx_t *tx; dmu_buf_t *db; int objerr; if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); objerr = dmu_bonus_hold(os, obj, FTAG, &db); if (objerr != 0) continue; tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, obj); objerr = dmu_tx_assign(tx, TXG_WAIT); if (objerr != 0) { dmu_tx_abort(tx); continue; } dmu_buf_will_dirty(db, tx); dmu_buf_rele(db, FTAG); dmu_tx_commit(tx); } os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; txg_wait_synced(dmu_objset_pool(os), 0); return (0); } void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, usedobjsp, availobjsp); } uint64_t dmu_objset_fsid_guid(objset_t *os) { return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); } void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) { stat->dds_type = os->os_phys->os_type; if (os->os_dsl_dataset) dsl_dataset_fast_stat(os->os_dsl_dataset, stat); } void dmu_objset_stats(objset_t *os, nvlist_t *nv) { ASSERT(os->os_dsl_dataset || os->os_phys->os_type == DMU_OST_META); if (os->os_dsl_dataset != NULL) dsl_dataset_stats(os->os_dsl_dataset, nv); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, os->os_phys->os_type); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, dmu_objset_userspace_present(os)); } int dmu_objset_is_snapshot(objset_t *os) { if (os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); + return (os->os_dsl_dataset->ds_is_snapshot); else return (B_FALSE); } int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, MT_FIRST, real, maxlen, conflict)); } int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { dsl_dataset_t *ds = os->os_dsl_dataset; zap_cursor_t cursor; zap_attribute_t attr; ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (SET_ERROR(ENAMETOOLONG)); } (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; if (case_conflict) *case_conflict = attr.za_normalization_conflict; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp) { dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; zap_cursor_t cursor; zap_attribute_t attr; /* there is no next dir on a snapshot! */ if (os->os_dsl_dataset->ds_object != dsl_dir_phys(dd)->dd_head_dataset_obj) return (SET_ERROR(ENOENT)); zap_cursor_init_serialized(&cursor, dd->dd_pool->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); if (zap_cursor_retrieve(&cursor, &attr) != 0) { zap_cursor_fini(&cursor); return (SET_ERROR(ENOENT)); } if (strlen(attr.za_name) + 1 > namelen) { zap_cursor_fini(&cursor); return (SET_ERROR(ENAMETOOLONG)); } (void) strcpy(name, attr.za_name); if (idp) *idp = attr.za_first_integer; zap_cursor_advance(&cursor); *offp = zap_cursor_serialize(&cursor); zap_cursor_fini(&cursor); return (0); } /* * Find objsets under and including ddobj, call func(ds) on each. */ int dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) { dsl_dir_t *dd; dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; uint64_t thisobj; int err; ASSERT(dsl_pool_config_held(dp)); err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); if (err != 0) return (err); /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); return (0); } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); err = dmu_objset_find_dp(dp, attr->za_first_integer, func, arg, flags); if (err != 0) break; } zap_cursor_fini(&zc); if (err != 0) { dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } } /* * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { dsl_dataset_t *ds; err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); err = dsl_dataset_hold_obj(dp, attr->za_first_integer, FTAG, &ds); if (err != 0) break; err = func(dp, ds, arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } zap_cursor_fini(&zc); } } dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); if (err != 0) return (err); /* * Apply to self. */ err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err != 0) return (err); err = func(dp, ds, arg); dsl_dataset_rele(ds, FTAG); return (err); } /* * Find all objsets under name, and for each, call 'func(child_name, arg)'. * The dp_config_rwlock must not be held when this is called, and it * will not be held when the callback is called. * Therefore this function should only be used when the pool is not changing * (e.g. in syncing context), or the callback can deal with the possible races. */ static int dmu_objset_find_impl(spa_t *spa, const char *name, int func(const char *, void *), void *arg, int flags) { dsl_dir_t *dd; dsl_pool_t *dp = spa_get_dsl(spa); dsl_dataset_t *ds; zap_cursor_t zc; zap_attribute_t *attr; char *child; uint64_t thisobj; int err; dsl_pool_config_enter(dp, FTAG); err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); if (err != 0) { dsl_pool_config_exit(dp, FTAG); return (err); } /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ if (dd->dd_myname[0] == '$') { dsl_dir_rele(dd, FTAG); dsl_pool_config_exit(dp, FTAG); return (0); } thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* * Iterate over all children. */ if (flags & DS_FIND_CHILDREN) { for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s/%s", name, attr->za_name); dsl_pool_config_exit(dp, FTAG); err = dmu_objset_find_impl(spa, child, func, arg, flags); dsl_pool_config_enter(dp, FTAG); strfree(child); if (err != 0) break; } zap_cursor_fini(&zc); if (err != 0) { dsl_dir_rele(dd, FTAG); dsl_pool_config_exit(dp, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); return (err); } } /* * Iterate over all snapshots. */ if (flags & DS_FIND_SNAPSHOTS) { err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); if (err == 0) { uint64_t snapobj; snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; dsl_dataset_rele(ds, FTAG); for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); zap_cursor_retrieve(&zc, attr) == 0; (void) zap_cursor_advance(&zc)) { ASSERT3U(attr->za_integer_length, ==, sizeof (uint64_t)); ASSERT3U(attr->za_num_integers, ==, 1); child = kmem_asprintf("%s@%s", name, attr->za_name); dsl_pool_config_exit(dp, FTAG); err = func(child, arg); dsl_pool_config_enter(dp, FTAG); strfree(child); if (err != 0) break; } zap_cursor_fini(&zc); } } dsl_dir_rele(dd, FTAG); kmem_free(attr, sizeof (zap_attribute_t)); dsl_pool_config_exit(dp, FTAG); if (err != 0) return (err); /* Apply to self. */ return (func(name, arg)); } /* * See comment above dmu_objset_find_impl(). */ int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags) { spa_t *spa; int error; error = spa_open(name, &spa, FTAG); if (error != 0) return (error); error = dmu_objset_find_impl(spa, name, func, arg, flags); spa_close(spa, FTAG); return (error); } void dmu_objset_set_user(objset_t *os, void *user_ptr) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); os->os_user_ptr = user_ptr; } void * dmu_objset_get_user(objset_t *os) { ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); return (os->os_user_ptr); } /* * Determine name of filesystem, given name of snapshot. * buf must be at least MAXNAMELEN bytes */ int dmu_fsname(const char *snapname, char *buf) { char *atp = strchr(snapname, '@'); if (atp == NULL) return (SET_ERROR(EINVAL)); if (atp - snapname >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strlcpy(buf, snapname, atp - snapname + 1); return (0); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_send.c (revision 286575) @@ -1,2236 +1,2236 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2012, Martin Matuska . All rights reserved. * Copyright 2014 HybridCluster. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __FreeBSD__ #undef dump_write #define dump_write dmu_dump_write #endif /* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */ int zfs_send_corrupt_data = B_FALSE; static char *dmu_recv_tag = "dmu_recv_tag"; static const char *recv_clone_name = "%recv"; static int dump_bytes(dmu_sendarg_t *dsp, void *buf, int len) { dsl_dataset_t *ds = dsp->dsa_os->os_dsl_dataset; struct uio auio; struct iovec aiov; ASSERT0(len % 8); fletcher_4_incremental_native(buf, len, &dsp->dsa_zc); aiov.iov_base = buf; aiov.iov_len = len; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_WRITE; auio.uio_offset = (off_t)-1; auio.uio_td = dsp->dsa_td; #ifdef _KERNEL if (dsp->dsa_fp->f_type == DTYPE_VNODE) bwillwrite(); dsp->dsa_err = fo_write(dsp->dsa_fp, &auio, dsp->dsa_td->td_ucred, 0, dsp->dsa_td); #else fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); dsp->dsa_err = EOPNOTSUPP; #endif mutex_enter(&ds->ds_sendstream_lock); *dsp->dsa_off += len; mutex_exit(&ds->ds_sendstream_lock); return (dsp->dsa_err); } static int dump_free(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, uint64_t length) { struct drr_free *drrf = &(dsp->dsa_drr->drr_u.drr_free); /* * When we receive a free record, dbuf_free_range() assumes * that the receiving system doesn't have any dbufs in the range * being freed. This is always true because there is a one-record * constraint: we only send one WRITE record for any given * object+offset. We know that the one-record constraint is * true because we always send data in increasing order by * object,offset. * * If the increasing-order constraint ever changes, we should find * another way to assert that the one-record constraint is still * satisfied. */ ASSERT(object > dsp->dsa_last_data_object || (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); /* * If we are doing a non-incremental send, then there can't * be any data in the dataset we're receiving into. Therefore * a free record would simply be a no-op. Save space by not * sending it to begin with. */ if (!dsp->dsa_incremental) return (0); if (length != -1ULL && offset + length < offset) length = -1ULL; /* * If there is a pending op, but it's not PENDING_FREE, push it out, * since free block aggregation can only be done for blocks of the * same type (i.e., DRR_FREE records can only be aggregated with * other DRR_FREE records. DRR_FREEOBJECTS records can only be * aggregated with other DRR_FREEOBJECTS records. */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREE) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } if (dsp->dsa_pending_op == PENDING_FREE) { /* * There should never be a PENDING_FREE if length is -1 * (because dump_dnode is the only place where this * function is called with a -1, and only after flushing * any pending record). */ ASSERT(length != -1ULL); /* * Check to see whether this free block can be aggregated * with pending one. */ if (drrf->drr_object == object && drrf->drr_offset + drrf->drr_length == offset) { drrf->drr_length += length; return (0); } else { /* not a continuation. Push out pending record */ if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } } /* create a FREE record and make it pending */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_FREE; drrf->drr_object = object; drrf->drr_offset = offset; drrf->drr_length = length; drrf->drr_toguid = dsp->dsa_toguid; if (length == -1ULL) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); } else { dsp->dsa_pending_op = PENDING_FREE; } return (0); } static int dump_write(dmu_sendarg_t *dsp, dmu_object_type_t type, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) { struct drr_write *drrw = &(dsp->dsa_drr->drr_u.drr_write); /* * We send data in increasing object, offset order. * See comment in dump_free() for details. */ ASSERT(object > dsp->dsa_last_data_object || (object == dsp->dsa_last_data_object && offset > dsp->dsa_last_data_offset)); dsp->dsa_last_data_object = object; dsp->dsa_last_data_offset = offset + blksz - 1; /* * If there is any kind of pending aggregation (currently either * a grouping of free objects or free blocks), push it out to * the stream, since aggregation can't be done across operations * of different types. */ if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write a DATA record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE; drrw->drr_object = object; drrw->drr_type = type; drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; if (bp == NULL || BP_IS_EMBEDDED(bp)) { /* * There's no pre-computed checksum for partial-block * writes or embedded BP's, so (like * fletcher4-checkummed blocks) userland will have to * compute a dedup-capable checksum itself. */ drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; } else { drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); drrw->drr_key.ddk_cksum = bp->blk_cksum; } if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (dump_bytes(dsp, data, blksz) != 0) return (SET_ERROR(EINTR)); return (0); } static int dump_write_embedded(dmu_sendarg_t *dsp, uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp) { char buf[BPE_PAYLOAD_SIZE]; struct drr_write_embedded *drrw = &(dsp->dsa_drr->drr_u.drr_write_embedded); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); dsp->dsa_pending_op = PENDING_NONE; } ASSERT(BP_IS_EMBEDDED(bp)); bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_WRITE_EMBEDDED; drrw->drr_object = object; drrw->drr_offset = offset; drrw->drr_length = blksz; drrw->drr_toguid = dsp->dsa_toguid; drrw->drr_compression = BP_GET_COMPRESS(bp); drrw->drr_etype = BPE_GET_ETYPE(bp); drrw->drr_lsize = BPE_GET_LSIZE(bp); drrw->drr_psize = BPE_GET_PSIZE(bp); decode_embedded_bp_compressed(bp, buf); if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); if (dump_bytes(dsp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) return (EINTR); return (0); } static int dump_spill(dmu_sendarg_t *dsp, uint64_t object, int blksz, void *data) { struct drr_spill *drrs = &(dsp->dsa_drr->drr_u.drr_spill); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write a SPILL record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_SPILL; drrs->drr_object = object; drrs->drr_length = blksz; drrs->drr_toguid = dsp->dsa_toguid; if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t))) return (SET_ERROR(EINTR)); if (dump_bytes(dsp, data, blksz)) return (SET_ERROR(EINTR)); return (0); } static int dump_freeobjects(dmu_sendarg_t *dsp, uint64_t firstobj, uint64_t numobjs) { struct drr_freeobjects *drrfo = &(dsp->dsa_drr->drr_u.drr_freeobjects); /* See comment in dump_free(). */ if (!dsp->dsa_incremental) return (0); /* * If there is a pending op, but it's not PENDING_FREEOBJECTS, * push it out, since free block aggregation can only be done for * blocks of the same type (i.e., DRR_FREE records can only be * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records * can only be aggregated with other DRR_FREEOBJECTS records. */ if (dsp->dsa_pending_op != PENDING_NONE && dsp->dsa_pending_op != PENDING_FREEOBJECTS) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } if (dsp->dsa_pending_op == PENDING_FREEOBJECTS) { /* * See whether this free object array can be aggregated * with pending one */ if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { drrfo->drr_numobjs += numobjs; return (0); } else { /* can't be aggregated. Push out pending record */ if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } } /* write a FREEOBJECTS record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_FREEOBJECTS; drrfo->drr_firstobj = firstobj; drrfo->drr_numobjs = numobjs; drrfo->drr_toguid = dsp->dsa_toguid; dsp->dsa_pending_op = PENDING_FREEOBJECTS; return (0); } static int dump_dnode(dmu_sendarg_t *dsp, uint64_t object, dnode_phys_t *dnp) { struct drr_object *drro = &(dsp->dsa_drr->drr_u.drr_object); if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(dsp, object, 1)); if (dsp->dsa_pending_op != PENDING_NONE) { if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); dsp->dsa_pending_op = PENDING_NONE; } /* write an OBJECT record */ bzero(dsp->dsa_drr, sizeof (dmu_replay_record_t)); dsp->dsa_drr->drr_type = DRR_OBJECT; drro->drr_object = object; drro->drr_type = dnp->dn_type; drro->drr_bonustype = dnp->dn_bonustype; drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; drro->drr_bonuslen = dnp->dn_bonuslen; drro->drr_checksumtype = dnp->dn_checksum; drro->drr_compress = dnp->dn_compress; drro->drr_toguid = dsp->dsa_toguid; if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; if (dump_bytes(dsp, dsp->dsa_drr, sizeof (dmu_replay_record_t)) != 0) return (SET_ERROR(EINTR)); if (dump_bytes(dsp, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) return (SET_ERROR(EINTR)); /* Free anything past the end of the file. */ if (dump_free(dsp, object, (dnp->dn_maxblkid + 1) * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), -1ULL) != 0) return (SET_ERROR(EINTR)); if (dsp->dsa_err != 0) return (SET_ERROR(EINTR)); return (0); } static boolean_t backup_do_embed(dmu_sendarg_t *dsp, const blkptr_t *bp) { if (!BP_IS_EMBEDDED(bp)) return (B_FALSE); /* * Compression function must be legacy, or explicitly enabled. */ if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && !(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4))) return (B_FALSE); /* * Embed type must be explicitly enabled. */ switch (BPE_GET_ETYPE(bp)) { case BP_EMBEDDED_TYPE_DATA: if (dsp->dsa_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) return (B_TRUE); break; default: return (B_FALSE); } return (B_FALSE); } #define BP_SPAN(dnp, level) \ (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) /* ARGSUSED */ static int backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { dmu_sendarg_t *dsp = arg; dmu_object_type_t type = bp ? BP_GET_TYPE(bp) : DMU_OT_NONE; int err = 0; if (issig(JUSTLOOKING) && issig(FORREAL)) return (SET_ERROR(EINTR)); if (zb->zb_object != DMU_META_DNODE_OBJECT && DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { return (0); } else if (zb->zb_level == ZB_ZIL_LEVEL) { /* * If we are sending a non-snapshot (which is allowed on * read-only pools), it may have a ZIL, which must be ignored. */ return (0); } else if (BP_IS_HOLE(bp) && zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(dsp, dnobj, span >> DNODE_SHIFT); } else if (BP_IS_HOLE(bp)) { uint64_t span = BP_SPAN(dnp, zb->zb_level); err = dump_free(dsp, zb->zb_object, zb->zb_blkid * span, span); } else if (zb->zb_level > 0 || type == DMU_OT_OBJSET) { return (0); } else if (type == DMU_OT_DNODE) { dnode_phys_t *blk; int i; int blksz = BP_GET_LSIZE(bp); arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); blk = abuf->b_data; for (i = 0; i < blksz >> DNODE_SHIFT; i++) { uint64_t dnobj = (zb->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; err = dump_dnode(dsp, dnobj, blk+i); if (err != 0) break; } (void) arc_buf_remove_ref(abuf, &abuf); } else if (type == DMU_OT_SA) { arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) return (SET_ERROR(EIO)); err = dump_spill(dsp, zb->zb_object, blksz, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } else if (backup_do_embed(dsp, bp)) { /* it's an embedded level-0 block of a regular object */ int blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; err = dump_write_embedded(dsp, zb->zb_object, zb->zb_blkid * blksz, blksz, bp); } else { /* it's a level-0 block of a regular object */ arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf; int blksz = BP_GET_LSIZE(bp); uint64_t offset; ASSERT3U(blksz, ==, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT0(zb->zb_level); if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb) != 0) { if (zfs_send_corrupt_data) { /* Send a block filled with 0x"zfs badd bloc" */ abuf = arc_buf_alloc(spa, blksz, &abuf, ARC_BUFC_DATA); uint64_t *ptr; for (ptr = abuf->b_data; (char *)ptr < (char *)abuf->b_data + blksz; ptr++) *ptr = 0x2f5baddb10cULL; } else { return (SET_ERROR(EIO)); } } offset = zb->zb_blkid * blksz; if (!(dsp->dsa_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && blksz > SPA_OLD_MAXBLOCKSIZE) { char *buf = abuf->b_data; while (blksz > 0 && err == 0) { int n = MIN(blksz, SPA_OLD_MAXBLOCKSIZE); err = dump_write(dsp, type, zb->zb_object, offset, n, NULL, buf); offset += n; buf += n; blksz -= n; } } else { err = dump_write(dsp, type, zb->zb_object, offset, blksz, bp, abuf->b_data); } (void) arc_buf_remove_ref(abuf, &abuf); } ASSERT(err == 0 || err == EINTR); return (err); } /* * Releases dp using the specified tag. */ static int dmu_send_impl(void *tag, dsl_pool_t *dp, dsl_dataset_t *ds, zfs_bookmark_phys_t *fromzb, boolean_t is_clone, boolean_t embedok, #ifdef illumos boolean_t large_block_ok, int outfd, vnode_t *vp, offset_t *off) #else boolean_t large_block_ok, int outfd, struct file *fp, offset_t *off) #endif { objset_t *os; dmu_replay_record_t *drr; dmu_sendarg_t *dsp; int err; uint64_t fromtxg = 0; uint64_t featureflags = 0; err = dmu_objset_from_ds(ds, &os); if (err != 0) { dsl_pool_rele(dp, tag); return (err); } drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, DMU_SUBSTREAM); #ifdef _KERNEL if (dmu_objset_type(os) == DMU_OST_ZFS) { uint64_t version; if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); dsl_pool_rele(dp, tag); return (SET_ERROR(EINVAL)); } if (version >= ZPL_VERSION_SA) { featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; } } #endif if (large_block_ok && ds->ds_large_blocks) featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; if (embedok && spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA_LZ4; } else { embedok = B_FALSE; } DMU_SET_FEATUREFLAGS(drr->drr_u.drr_begin.drr_versioninfo, featureflags); drr->drr_u.drr_begin.drr_creation_time = dsl_dataset_phys(ds)->ds_creation_time; drr->drr_u.drr_begin.drr_type = dmu_objset_type(os); if (is_clone) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = dsl_dataset_phys(ds)->ds_guid; if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CI_DATA; if (fromzb != NULL) { drr->drr_u.drr_begin.drr_fromguid = fromzb->zbm_guid; fromtxg = fromzb->zbm_creation_txg; } dsl_dataset_name(ds, drr->drr_u.drr_begin.drr_toname); - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { (void) strlcat(drr->drr_u.drr_begin.drr_toname, "@--head--", sizeof (drr->drr_u.drr_begin.drr_toname)); } dsp = kmem_zalloc(sizeof (dmu_sendarg_t), KM_SLEEP); dsp->dsa_drr = drr; dsp->dsa_outfd = outfd; dsp->dsa_proc = curproc; dsp->dsa_td = curthread; dsp->dsa_fp = fp; dsp->dsa_os = os; dsp->dsa_off = off; dsp->dsa_toguid = dsl_dataset_phys(ds)->ds_guid; ZIO_SET_CHECKSUM(&dsp->dsa_zc, 0, 0, 0, 0); dsp->dsa_pending_op = PENDING_NONE; dsp->dsa_incremental = (fromzb != NULL); dsp->dsa_featureflags = featureflags; mutex_enter(&ds->ds_sendstream_lock); list_insert_head(&ds->ds_sendstreams, dsp); mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, tag); if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { err = dsp->dsa_err; goto out; } err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, backup_cb, dsp); if (dsp->dsa_pending_op != PENDING_NONE) if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) err = SET_ERROR(EINTR); if (err != 0) { if (err == EINTR && dsp->dsa_err != 0) err = dsp->dsa_err; goto out; } bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = dsp->dsa_zc; drr->drr_u.drr_end.drr_toguid = dsp->dsa_toguid; if (dump_bytes(dsp, drr, sizeof (dmu_replay_record_t)) != 0) { err = dsp->dsa_err; goto out; } out: mutex_enter(&ds->ds_sendstream_lock); list_remove(&ds->ds_sendstreams, dsp); mutex_exit(&ds->ds_sendstream_lock); kmem_free(drr, sizeof (dmu_replay_record_t)); kmem_free(dsp, sizeof (dmu_sendarg_t)); dsl_dataset_long_rele(ds, FTAG); return (err); } int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, boolean_t embedok, boolean_t large_block_ok, #ifdef illumos int outfd, vnode_t *vp, offset_t *off) #else int outfd, struct file *fp, offset_t *off) #endif { dsl_pool_t *dp; dsl_dataset_t *ds; dsl_dataset_t *fromds = NULL; int err; err = dsl_pool_hold(pool, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_hold_obj(dp, tosnap, FTAG, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != 0) { zfs_bookmark_phys_t zb; boolean_t is_clone; err = dsl_dataset_hold_obj(dp, fromsnap, FTAG, &fromds); if (err != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (fromds->ds_dir != ds->ds_dir); dsl_dataset_rele(fromds, FTAG); err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, large_block_ok, outfd, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, large_block_ok, outfd, fp, off); } dsl_dataset_rele(ds, FTAG); return (err); } int dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, boolean_t large_block_ok, #ifdef illumos int outfd, vnode_t *vp, offset_t *off) #else int outfd, struct file *fp, offset_t *off) #endif { dsl_pool_t *dp; dsl_dataset_t *ds; int err; boolean_t owned = B_FALSE; if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) return (SET_ERROR(EINVAL)); err = dsl_pool_hold(tosnap, FTAG, &dp); if (err != 0) return (err); if (strchr(tosnap, '@') == NULL && spa_writeable(dp->dp_spa)) { /* * We are sending a filesystem or volume. Ensure * that it doesn't change by owning the dataset. */ err = dsl_dataset_own(dp, tosnap, FTAG, &ds); owned = B_TRUE; } else { err = dsl_dataset_hold(dp, tosnap, FTAG, &ds); } if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (fromsnap != NULL) { zfs_bookmark_phys_t zb; boolean_t is_clone = B_FALSE; int fsnamelen = strchr(tosnap, '@') - tosnap; /* * If the fromsnap is in a different filesystem, then * mark the send stream as a clone. */ if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || (fromsnap[fsnamelen] != '@' && fromsnap[fsnamelen] != '#')) { is_clone = B_TRUE; } if (strchr(fromsnap, '@')) { dsl_dataset_t *fromds; err = dsl_dataset_hold(dp, fromsnap, FTAG, &fromds); if (err == 0) { if (!dsl_dataset_is_before(ds, fromds, 0)) err = SET_ERROR(EXDEV); zb.zbm_creation_time = dsl_dataset_phys(fromds)->ds_creation_time; zb.zbm_creation_txg = dsl_dataset_phys(fromds)->ds_creation_txg; zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; is_clone = (ds->ds_dir != fromds->ds_dir); dsl_dataset_rele(fromds, FTAG); } } else { err = dsl_bookmark_lookup(dp, fromsnap, ds, &zb); } if (err != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } err = dmu_send_impl(FTAG, dp, ds, &zb, is_clone, embedok, large_block_ok, outfd, fp, off); } else { err = dmu_send_impl(FTAG, dp, ds, NULL, B_FALSE, embedok, large_block_ok, outfd, fp, off); } if (owned) dsl_dataset_disown(ds, FTAG); else dsl_dataset_rele(ds, FTAG); return (err); } int dmu_send_estimate(dsl_dataset_t *ds, dsl_dataset_t *fromds, uint64_t *sizep) { dsl_pool_t *dp = ds->ds_dir->dd_pool; int err; uint64_t size; ASSERT(dsl_pool_config_held(dp)); /* tosnap must be a snapshot */ - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* fromsnap, if provided, must be a snapshot */ - if (fromds != NULL && !dsl_dataset_is_snapshot(fromds)) + if (fromds != NULL && !fromds->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* * fromsnap must be an earlier snapshot from the same fs as tosnap, * or the origin's fs. */ if (fromds != NULL && !dsl_dataset_is_before(ds, fromds, 0)) return (SET_ERROR(EXDEV)); /* Get uncompressed size estimate of changed data. */ if (fromds == NULL) { size = dsl_dataset_phys(ds)->ds_uncompressed_bytes; } else { uint64_t used, comp; err = dsl_dataset_space_written(fromds, ds, &used, &comp, &size); if (err != 0) return (err); } /* * Assume that space (both on-disk and in-stream) is dominated by * data. We will adjust for indirect blocks and the copies property, * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). */ /* * Subtract out approximate space used by indirect blocks. * Assume most space is used by data blocks (non-indirect, non-dnode). * Assume all blocks are recordsize. Assume ditto blocks and * internal fragmentation counter out compression. * * Therefore, space used by indirect blocks is sizeof(blkptr_t) per * block, which we observe in practice. */ uint64_t recordsize; err = dsl_prop_get_int_ds(ds, "recordsize", &recordsize); if (err != 0) return (err); size -= size / recordsize * sizeof (blkptr_t); /* Add in the space for the record associated with each block. */ size += size / recordsize * sizeof (dmu_replay_record_t); *sizep = size; return (0); } typedef struct dmu_recv_begin_arg { const char *drba_origin; dmu_recv_cookie_t *drba_cookie; cred_t *drba_cred; uint64_t drba_snapobj; } dmu_recv_begin_arg_t; static int recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, uint64_t fromguid) { uint64_t val; int error; dsl_pool_t *dp = ds->ds_dir->dd_pool; /* temporary clone name must not exist */ error = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EBUSY : error); /* new snapshot name must not exist */ error = zap_lookup(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_snapnames_zapobj, drba->drba_cookie->drc_tosnap, 8, 1, &val); if (error != ENOENT) return (error == 0 ? EEXIST : error); /* * Check snapshot limit before receiving. We'll recheck again at the * end, but might as well abort before receiving if we're already over * the limit. * * Note that we do not check the file system limit with * dsl_dir_fscount_check because the temporary %clones don't count * against that limit. */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); if (error != 0) return (error); if (fromguid != 0) { dsl_dataset_t *snap; uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; /* Find snapshot in this dir that matches fromguid. */ while (obj != 0) { error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) return (SET_ERROR(ENODEV)); if (snap->ds_dir != ds->ds_dir) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ENODEV)); } if (dsl_dataset_phys(snap)->ds_guid == fromguid) break; obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); } if (obj == 0) return (SET_ERROR(ENODEV)); if (drba->drba_cookie->drc_force) { drba->drba_snapobj = obj; } else { /* * If we are not forcing, there must be no * changes since fromsnap. */ if (dsl_dataset_modified_since_snap(ds, snap)) { dsl_dataset_rele(snap, FTAG); return (SET_ERROR(ETXTBSY)); } drba->drba_snapobj = ds->ds_prev->ds_object; } dsl_dataset_rele(snap, FTAG); } else { /* if full, then must be forced */ if (!drba->drba_cookie->drc_force) return (SET_ERROR(EEXIST)); /* start from $ORIGIN@$ORIGIN, if supported */ drba->drba_snapobj = dp->dp_origin_snap != NULL ? dp->dp_origin_snap->ds_object : 0; } return (0); } static int dmu_recv_begin_check(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; uint64_t fromguid = drrb->drr_fromguid; int flags = drrb->drr_flags; int error; uint64_t featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); dsl_dataset_t *ds; const char *tofs = drba->drba_cookie->drc_tofs; /* already checked */ ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_COMPOUNDSTREAM || drrb->drr_type >= DMU_OST_NUMTYPES || ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) return (SET_ERROR(EINVAL)); /* Verify pool version supports SA if SA_SPILL feature set */ if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && spa_version(dp->dp_spa) < SPA_VERSION_SA) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate a WRITE_EMBEDDED * record to a plan WRITE record, so the pool must have the * EMBEDDED_DATA feature enabled if the stream has WRITE_EMBEDDED * records. Same with WRITE_EMBEDDED records that use LZ4 compression. */ if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) return (SET_ERROR(ENOTSUP)); if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA_LZ4) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) return (SET_ERROR(ENOTSUP)); /* * The receiving code doesn't know how to translate large blocks * to smaller ones, so the pool must have the LARGE_BLOCKS * feature enabled if the stream has LARGE_BLOCKS. */ if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { /* target fs already exists; recv into temp clone */ /* Can't recv a clone into an existing fs */ if (flags & DRR_FLAG_CLONE) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } error = recv_begin_check_existing_impl(drba, ds, fromguid); dsl_dataset_rele(ds, FTAG); } else if (error == ENOENT) { /* target fs does not exist; must be a full backup or clone */ char buf[MAXNAMELEN]; /* * If it's a non-clone incremental, we are missing the * target fs, so fail the recv. */ if (fromguid != 0 && !(flags & DRR_FLAG_CLONE)) return (SET_ERROR(ENOENT)); /* Open the parent of tofs */ ASSERT3U(strlen(tofs), <, MAXNAMELEN); (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); error = dsl_dataset_hold(dp, buf, FTAG, &ds); if (error != 0) return (error); /* * Check filesystem and snapshot limits before receiving. We'll * recheck snapshot limits again at the end (we create the * filesystems and increment those counts during begin_sync). */ error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, drba->drba_cred); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, NULL, drba->drba_cred); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (drba->drba_origin != NULL) { dsl_dataset_t *origin; error = dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } - if (!dsl_dataset_is_snapshot(origin)) { + if (!origin->ds_is_snapshot) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } if (dsl_dataset_phys(origin)->ds_guid != fromguid) { dsl_dataset_rele(origin, FTAG); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENODEV)); } dsl_dataset_rele(origin, FTAG); } dsl_dataset_rele(ds, FTAG); error = 0; } return (error); } static void dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) { dmu_recv_begin_arg_t *drba = arg; dsl_pool_t *dp = dmu_tx_pool(tx); struct drr_begin *drrb = drba->drba_cookie->drc_drrb; const char *tofs = drba->drba_cookie->drc_tofs; dsl_dataset_t *ds, *newds; uint64_t dsobj; int error; uint64_t crflags; crflags = (drrb->drr_flags & DRR_FLAG_CI_DATA) ? DS_FLAG_CI_DATASET : 0; error = dsl_dataset_hold(dp, tofs, FTAG, &ds); if (error == 0) { /* create temporary clone */ dsl_dataset_t *snap = NULL; if (drba->drba_snapobj != 0) { VERIFY0(dsl_dataset_hold_obj(dp, drba->drba_snapobj, FTAG, &snap)); } dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, snap, crflags, drba->drba_cred, tx); if (drba->drba_snapobj != 0) dsl_dataset_rele(snap, FTAG); dsl_dataset_rele(ds, FTAG); } else { dsl_dir_t *dd; const char *tail; dsl_dataset_t *origin = NULL; VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); if (drba->drba_origin != NULL) { VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, FTAG, &origin)); } /* Create new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, origin, crflags, drba->drba_cred, tx); if (origin != NULL) dsl_dataset_rele(origin, FTAG); dsl_dir_rele(dd, FTAG); drba->drba_cookie->drc_newfs = B_TRUE; } VERIFY0(dsl_dataset_own_obj(dp, dsobj, dmu_recv_tag, &newds)); if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && !newds->ds_large_blocks) { dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); newds->ds_large_blocks = B_TRUE; } dmu_buf_will_dirty(newds->ds_dbuf, tx); dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; /* * If we actually created a non-clone, we need to create the * objset in our new dataset. */ if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds))) { (void) dmu_objset_create_impl(dp->dp_spa, newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); } drba->drba_cookie->drc_ds = newds; spa_history_log_internal_ds(newds, "receive", tx, ""); } /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, boolean_t force, char *origin, dmu_recv_cookie_t *drc) { dmu_recv_begin_arg_t drba = { 0 }; dmu_replay_record_t *drr; bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drrb = drrb; drc->drc_tosnap = tosnap; drc->drc_tofs = tofs; drc->drc_force = force; drc->drc_cred = CRED(); if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) drc->drc_byteswap = B_TRUE; else if (drrb->drr_magic != DMU_BACKUP_MAGIC) return (SET_ERROR(EINVAL)); drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin = *drc->drc_drrb; if (drc->drc_byteswap) { fletcher_4_incremental_byteswap(drr, sizeof (dmu_replay_record_t), &drc->drc_cksum); } else { fletcher_4_incremental_native(drr, sizeof (dmu_replay_record_t), &drc->drc_cksum); } kmem_free(drr, sizeof (dmu_replay_record_t)); if (drc->drc_byteswap) { drrb->drr_magic = BSWAP_64(drrb->drr_magic); drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); } drba.drba_origin = origin; drba.drba_cookie = drc; drba.drba_cred = CRED(); return (dsl_sync_task(tofs, dmu_recv_begin_check, dmu_recv_begin_sync, &drba, 5, ZFS_SPACE_CHECK_NORMAL)); } struct restorearg { int err; boolean_t byteswap; kthread_t *td; struct file *fp; char *buf; uint64_t voff; int bufsize; /* amount of memory allocated for buf */ zio_cksum_t cksum; avl_tree_t *guid_to_ds_map; }; typedef struct guid_map_entry { uint64_t guid; dsl_dataset_t *gme_ds; avl_node_t avlnode; } guid_map_entry_t; static int guid_compare(const void *arg1, const void *arg2) { const guid_map_entry_t *gmep1 = arg1; const guid_map_entry_t *gmep2 = arg2; if (gmep1->guid < gmep2->guid) return (-1); else if (gmep1->guid > gmep2->guid) return (1); return (0); } static void free_guid_map_onexit(void *arg) { avl_tree_t *ca = arg; void *cookie = NULL; guid_map_entry_t *gmep; while ((gmep = avl_destroy_nodes(ca, &cookie)) != NULL) { dsl_dataset_long_rele(gmep->gme_ds, gmep); dsl_dataset_rele(gmep->gme_ds, gmep); kmem_free(gmep, sizeof (guid_map_entry_t)); } avl_destroy(ca); kmem_free(ca, sizeof (avl_tree_t)); } static int restore_bytes(struct restorearg *ra, void *buf, int len, off_t off, ssize_t *resid) { struct uio auio; struct iovec aiov; int error; aiov.iov_base = buf; aiov.iov_len = len; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_resid = len; auio.uio_segflg = UIO_SYSSPACE; auio.uio_rw = UIO_READ; auio.uio_offset = off; auio.uio_td = ra->td; #ifdef _KERNEL error = fo_read(ra->fp, &auio, ra->td->td_ucred, FOF_OFFSET, ra->td); #else fprintf(stderr, "%s: returning EOPNOTSUPP\n", __func__); error = EOPNOTSUPP; #endif *resid = auio.uio_resid; return (error); } static void * restore_read(struct restorearg *ra, int len, char *buf) { int done = 0; if (buf == NULL) buf = ra->buf; /* some things will require 8-byte alignment, so everything must */ ASSERT0(len % 8); ASSERT3U(len, <=, ra->bufsize); while (done < len) { ssize_t resid; ra->err = restore_bytes(ra, buf + done, len - done, ra->voff, &resid); if (resid == len - done) ra->err = SET_ERROR(EINVAL); ra->voff += len - done - resid; done = len - resid; if (ra->err != 0) return (NULL); } ASSERT3U(done, ==, len); if (ra->byteswap) fletcher_4_incremental_byteswap(buf, len, &ra->cksum); else fletcher_4_incremental_native(buf, len, &ra->cksum); return (buf); } static void backup_byteswap(dmu_replay_record_t *drr) { #define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) #define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) drr->drr_type = BSWAP_32(drr->drr_type); drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); DO64(drr_begin.drr_toguid); DO64(drr_begin.drr_fromguid); break; case DRR_OBJECT: DO64(drr_object.drr_object); DO32(drr_object.drr_type); DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); DO64(drr_object.drr_toguid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_length); DO64(drr_write.drr_toguid); DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); DO64(drr_write.drr_key.ddk_prop); break; case DRR_WRITE_BYREF: DO64(drr_write_byref.drr_object); DO64(drr_write_byref.drr_offset); DO64(drr_write_byref.drr_length); DO64(drr_write_byref.drr_toguid); DO64(drr_write_byref.drr_refguid); DO64(drr_write_byref.drr_refobject); DO64(drr_write_byref.drr_refoffset); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); DO64(drr_write_byref.drr_key.ddk_prop); break; case DRR_WRITE_EMBEDDED: DO64(drr_write_embedded.drr_object); DO64(drr_write_embedded.drr_offset); DO64(drr_write_embedded.drr_length); DO64(drr_write_embedded.drr_toguid); DO32(drr_write_embedded.drr_lsize); DO32(drr_write_embedded.drr_psize); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); DO64(drr_free.drr_toguid); break; case DRR_SPILL: DO64(drr_spill.drr_object); DO64(drr_spill.drr_length); DO64(drr_spill.drr_toguid); break; case DRR_END: DO64(drr_end.drr_checksum.zc_word[0]); DO64(drr_end.drr_checksum.zc_word[1]); DO64(drr_end.drr_checksum.zc_word[2]); DO64(drr_end.drr_checksum.zc_word[3]); DO64(drr_end.drr_toguid); break; } #undef DO64 #undef DO32 } static inline uint8_t deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) { if (bonus_type == DMU_OT_SA) { return (1); } else { return (1 + ((DN_MAX_BONUSLEN - bonus_size) >> SPA_BLKPTRSHIFT)); } } static int restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) { dmu_object_info_t doi; dmu_tx_t *tx; void *data = NULL; uint64_t object; int err; if (drro->drr_type == DMU_OT_NONE || !DMU_OT_IS_VALID(drro->drr_type) || !DMU_OT_IS_VALID(drro->drr_bonustype) || drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(os)) || drro->drr_bonuslen > DN_MAX_BONUSLEN) { return (SET_ERROR(EINVAL)); } err = dmu_object_info(os, drro->drr_object, &doi); if (err != 0 && err != ENOENT) return (SET_ERROR(EINVAL)); object = err == 0 ? drro->drr_object : DMU_NEW_OBJECT; if (drro->drr_bonuslen) { data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8), NULL); if (ra->err != 0) return (ra->err); } /* * If we are losing blkptrs or changing the block size this must * be a new file instance. We must clear out the previous file * contents before we can change this type of metadata in the dnode. */ if (err == 0) { int nblkptr; nblkptr = deduce_nblkptr(drro->drr_bonustype, drro->drr_bonuslen); if (drro->drr_blksz != doi.doi_data_block_size || nblkptr < doi.doi_nblkptr) { err = dmu_free_long_range(os, drro->drr_object, 0, DMU_OBJECT_END); if (err != 0) return (SET_ERROR(EINVAL)); } } tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } if (object == DMU_NEW_OBJECT) { /* currently free, want to be allocated */ err = dmu_object_claim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } else if (drro->drr_type != doi.doi_type || drro->drr_blksz != doi.doi_data_block_size || drro->drr_bonustype != doi.doi_bonus_type || drro->drr_bonuslen != doi.doi_bonus_size) { /* currently allocated, but with different properties */ err = dmu_object_reclaim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); } if (err != 0) { dmu_tx_commit(tx); return (SET_ERROR(EINVAL)); } dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, tx); dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { dmu_buf_t *db; VERIFY(0 == dmu_bonus_hold(os, drro->drr_object, FTAG, &db)); dmu_buf_will_dirty(db, tx); ASSERT3U(db->db_size, >=, drro->drr_bonuslen); bcopy(data, db->db_data, drro->drr_bonuslen); if (ra->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drro->drr_bonustype); dmu_ot_byteswap[byteswap].ob_func(db->db_data, drro->drr_bonuslen); } dmu_buf_rele(db, FTAG); } dmu_tx_commit(tx); return (0); } /* ARGSUSED */ static int restore_freeobjects(struct restorearg *ra, objset_t *os, struct drr_freeobjects *drrfo) { uint64_t obj; if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) return (SET_ERROR(EINVAL)); for (obj = drrfo->drr_firstobj; obj < drrfo->drr_firstobj + drrfo->drr_numobjs; (void) dmu_object_next(os, &obj, FALSE, 0)) { int err; if (dmu_object_info(os, obj, NULL) != 0) continue; err = dmu_free_long_object(os, obj); if (err != 0) return (err); } return (0); } static int restore_write(struct restorearg *ra, objset_t *os, struct drr_write *drrw) { dmu_tx_t *tx; void *data; int err; if (drrw->drr_offset + drrw->drr_length < drrw->drr_offset || !DMU_OT_IS_VALID(drrw->drr_type)) return (SET_ERROR(EINVAL)); if (dmu_object_info(os, drrw->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); dmu_buf_t *bonus; if (dmu_bonus_hold(os, drrw->drr_object, FTAG, &bonus) != 0) return (SET_ERROR(EINVAL)); arc_buf_t *abuf = dmu_request_arcbuf(bonus, drrw->drr_length); data = restore_read(ra, drrw->drr_length, abuf->b_data); if (data == NULL) { dmu_return_arcbuf(abuf); dmu_buf_rele(bonus, FTAG); return (ra->err); } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, drrw->drr_object, drrw->drr_offset, drrw->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_return_arcbuf(abuf); dmu_buf_rele(bonus, FTAG); dmu_tx_abort(tx); return (err); } if (ra->byteswap) { dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(drrw->drr_type); dmu_ot_byteswap[byteswap].ob_func(data, drrw->drr_length); } dmu_assign_arcbuf(bonus, drrw->drr_offset, abuf, tx); dmu_tx_commit(tx); dmu_buf_rele(bonus, FTAG); return (0); } /* * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed * streams to refer to a copy of the data that is already on the * system because it came in earlier in the stream. This function * finds the earlier copy of the data, and uses that copy instead of * data from the stream to fulfill this write. */ static int restore_write_byref(struct restorearg *ra, objset_t *os, struct drr_write_byref *drrwbr) { dmu_tx_t *tx; int err; guid_map_entry_t gmesrch; guid_map_entry_t *gmep; avl_index_t where; objset_t *ref_os = NULL; dmu_buf_t *dbp; if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) return (SET_ERROR(EINVAL)); /* * If the GUID of the referenced dataset is different from the * GUID of the target dataset, find the referenced dataset. */ if (drrwbr->drr_toguid != drrwbr->drr_refguid) { gmesrch.guid = drrwbr->drr_refguid; if ((gmep = avl_find(ra->guid_to_ds_map, &gmesrch, &where)) == NULL) { return (SET_ERROR(EINVAL)); } if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) return (SET_ERROR(EINVAL)); } else { ref_os = os; } err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, drrwbr->drr_refoffset, FTAG, &dbp, DMU_READ_PREFETCH); if (err != 0) return (err); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write(os, drrwbr->drr_object, drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); dmu_buf_rele(dbp, FTAG); dmu_tx_commit(tx); return (0); } static int restore_write_embedded(struct restorearg *ra, objset_t *os, struct drr_write_embedded *drrwnp) { dmu_tx_t *tx; int err; void *data; if (drrwnp->drr_offset + drrwnp->drr_length < drrwnp->drr_offset) return (EINVAL); if (drrwnp->drr_psize > BPE_PAYLOAD_SIZE) return (EINVAL); if (drrwnp->drr_etype >= NUM_BP_EMBEDDED_TYPES) return (EINVAL); if (drrwnp->drr_compression >= ZIO_COMPRESS_FUNCTIONS) return (EINVAL); data = restore_read(ra, P2ROUNDUP(drrwnp->drr_psize, 8), NULL); if (data == NULL) return (ra->err); tx = dmu_tx_create(os); dmu_tx_hold_write(tx, drrwnp->drr_object, drrwnp->drr_offset, drrwnp->drr_length); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); return (err); } dmu_write_embedded(os, drrwnp->drr_object, drrwnp->drr_offset, data, drrwnp->drr_etype, drrwnp->drr_compression, drrwnp->drr_lsize, drrwnp->drr_psize, ra->byteswap ^ ZFS_HOST_BYTEORDER, tx); dmu_tx_commit(tx); return (0); } static int restore_spill(struct restorearg *ra, objset_t *os, struct drr_spill *drrs) { dmu_tx_t *tx; void *data; dmu_buf_t *db, *db_spill; int err; if (drrs->drr_length < SPA_MINBLOCKSIZE || drrs->drr_length > spa_maxblocksize(dmu_objset_spa(os))) return (SET_ERROR(EINVAL)); data = restore_read(ra, drrs->drr_length, NULL); if (data == NULL) return (ra->err); if (dmu_object_info(os, drrs->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); VERIFY(0 == dmu_bonus_hold(os, drrs->drr_object, FTAG, &db)); if ((err = dmu_spill_hold_by_bonus(db, FTAG, &db_spill)) != 0) { dmu_buf_rele(db, FTAG); return (err); } tx = dmu_tx_create(os); dmu_tx_hold_spill(tx, db->db_object); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_abort(tx); return (err); } dmu_buf_will_dirty(db_spill, tx); if (db_spill->db_size < drrs->drr_length) VERIFY(0 == dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); bcopy(data, db_spill->db_data, drrs->drr_length); dmu_buf_rele(db, FTAG); dmu_buf_rele(db_spill, FTAG); dmu_tx_commit(tx); return (0); } /* ARGSUSED */ static int restore_free(struct restorearg *ra, objset_t *os, struct drr_free *drrf) { int err; if (drrf->drr_length != -1ULL && drrf->drr_offset + drrf->drr_length < drrf->drr_offset) return (SET_ERROR(EINVAL)); if (dmu_object_info(os, drrf->drr_object, NULL) != 0) return (SET_ERROR(EINVAL)); err = dmu_free_long_range(os, drrf->drr_object, drrf->drr_offset, drrf->drr_length); return (err); } /* used to destroy the drc_ds on error */ static void dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) { char name[MAXNAMELEN]; dsl_dataset_name(drc->drc_ds, name); dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); (void) dsl_destroy_head(name); } /* * NB: callers *must* call dmu_recv_end() if this succeeds. */ int dmu_recv_stream(dmu_recv_cookie_t *drc, struct file *fp, offset_t *voffp, int cleanup_fd, uint64_t *action_handlep) { struct restorearg ra = { 0 }; dmu_replay_record_t *drr; objset_t *os; zio_cksum_t pcksum; int featureflags; ra.byteswap = drc->drc_byteswap; ra.cksum = drc->drc_cksum; ra.td = curthread; ra.fp = fp; ra.voff = *voffp; ra.bufsize = SPA_MAXBLOCKSIZE; ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); /* these were verified in dmu_recv_begin */ ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, DMU_SUBSTREAM); ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ VERIFY0(dmu_objset_from_ds(drc->drc_ds, &os)); ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); /* if this stream is dedup'ed, set up the avl tree for guid mapping */ if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { minor_t minor; if (cleanup_fd == -1) { ra.err = SET_ERROR(EBADF); goto out; } ra.err = zfs_onexit_fd_hold(cleanup_fd, &minor); if (ra.err != 0) { cleanup_fd = -1; goto out; } if (*action_handlep == 0) { ra.guid_to_ds_map = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); avl_create(ra.guid_to_ds_map, guid_compare, sizeof (guid_map_entry_t), offsetof(guid_map_entry_t, avlnode)); ra.err = zfs_onexit_add_cb(minor, free_guid_map_onexit, ra.guid_to_ds_map, action_handlep); if (ra.err != 0) goto out; } else { ra.err = zfs_onexit_cb_data(minor, *action_handlep, (void **)&ra.guid_to_ds_map); if (ra.err != 0) goto out; } drc->drc_guid_to_ds_map = ra.guid_to_ds_map; } /* * Read records and process them. */ pcksum = ra.cksum; while (ra.err == 0 && NULL != (drr = restore_read(&ra, sizeof (*drr), NULL))) { if (issig(JUSTLOOKING) && issig(FORREAL)) { ra.err = SET_ERROR(EINTR); goto out; } if (ra.byteswap) backup_byteswap(drr); switch (drr->drr_type) { case DRR_OBJECT: { /* * We need to make a copy of the record header, * because restore_{object,write} may need to * restore_read(), which will invalidate drr. */ struct drr_object drro = drr->drr_u.drr_object; ra.err = restore_object(&ra, os, &drro); break; } case DRR_FREEOBJECTS: { struct drr_freeobjects drrfo = drr->drr_u.drr_freeobjects; ra.err = restore_freeobjects(&ra, os, &drrfo); break; } case DRR_WRITE: { struct drr_write drrw = drr->drr_u.drr_write; ra.err = restore_write(&ra, os, &drrw); break; } case DRR_WRITE_BYREF: { struct drr_write_byref drrwbr = drr->drr_u.drr_write_byref; ra.err = restore_write_byref(&ra, os, &drrwbr); break; } case DRR_WRITE_EMBEDDED: { struct drr_write_embedded drrwe = drr->drr_u.drr_write_embedded; ra.err = restore_write_embedded(&ra, os, &drrwe); break; } case DRR_FREE: { struct drr_free drrf = drr->drr_u.drr_free; ra.err = restore_free(&ra, os, &drrf); break; } case DRR_END: { struct drr_end drre = drr->drr_u.drr_end; /* * We compare against the *previous* checksum * value, because the stored checksum is of * everything before the DRR_END record. */ if (!ZIO_CHECKSUM_EQUAL(drre.drr_checksum, pcksum)) ra.err = SET_ERROR(ECKSUM); goto out; } case DRR_SPILL: { struct drr_spill drrs = drr->drr_u.drr_spill; ra.err = restore_spill(&ra, os, &drrs); break; } default: ra.err = SET_ERROR(EINVAL); goto out; } pcksum = ra.cksum; } ASSERT(ra.err != 0); out: if ((featureflags & DMU_BACKUP_FEATURE_DEDUP) && (cleanup_fd != -1)) zfs_onexit_fd_rele(cleanup_fd); if (ra.err != 0) { /* * destroy what we created, so we don't leave it in the * inconsistent restoring state. */ dmu_recv_cleanup_ds(drc); } kmem_free(ra.buf, ra.bufsize); *voffp = ra.voff; return (ra.err); } static int dmu_recv_end_check(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); int error; ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); if (!drc->drc_newfs) { dsl_dataset_t *origin_head; error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); if (error != 0) return (error); if (drc->drc_force) { /* * We will destroy any snapshots in tofs (i.e. before * origin_head) that are after the origin (which is * the snap before drc_ds, because drc_ds can not * have any snaps of its own). */ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; error = dsl_dataset_hold_obj(dp, obj, FTAG, &snap); if (error != 0) break; if (snap->ds_dir != origin_head->ds_dir) error = SET_ERROR(EINVAL); if (error == 0) { error = dsl_destroy_snapshot_check_impl( snap, B_FALSE); } obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_dataset_rele(snap, FTAG); if (error != 0) break; } if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } } error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, origin_head, drc->drc_force, drc->drc_owner, tx); if (error != 0) { dsl_dataset_rele(origin_head, FTAG); return (error); } error = dsl_dataset_snapshot_check_impl(origin_head, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); dsl_dataset_rele(origin_head, FTAG); if (error != 0) return (error); error = dsl_destroy_head_check_impl(drc->drc_ds, 1); } else { error = dsl_dataset_snapshot_check_impl(drc->drc_ds, drc->drc_tosnap, tx, B_TRUE, 1, drc->drc_cred); } return (error); } static void dmu_recv_end_sync(void *arg, dmu_tx_t *tx) { dmu_recv_cookie_t *drc = arg; dsl_pool_t *dp = dmu_tx_pool(tx); spa_history_log_internal_ds(drc->drc_ds, "finish receiving", tx, "snap=%s", drc->drc_tosnap); if (!drc->drc_newfs) { dsl_dataset_t *origin_head; VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head)); if (drc->drc_force) { /* * Destroy any snapshots of drc_tofs (origin_head) * after the origin (the snap before drc_ds). */ uint64_t obj; obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; while (obj != dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { dsl_dataset_t *snap; VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &snap)); ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; dsl_destroy_snapshot_sync_impl(snap, B_FALSE, tx); dsl_dataset_rele(snap, FTAG); } } VERIFY3P(drc->drc_ds->ds_prev, ==, origin_head->ds_prev); dsl_dataset_clone_swap_sync_impl(drc->drc_ds, origin_head, tx); dsl_dataset_snapshot_sync_impl(origin_head, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(origin_head->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(origin_head->ds_dbuf, tx); dsl_dataset_phys(origin_head)->ds_flags &= ~DS_FLAG_INCONSISTENT; dsl_dataset_rele(origin_head, FTAG); dsl_destroy_head_sync_impl(drc->drc_ds, tx); if (drc->drc_owner != NULL) VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); } else { dsl_dataset_t *ds = drc->drc_ds; dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); /* set snapshot's creation time and guid */ dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); dsl_dataset_phys(ds->ds_prev)->ds_creation_time = drc->drc_drrb->drr_creation_time; dsl_dataset_phys(ds->ds_prev)->ds_guid = drc->drc_drrb->drr_toguid; dsl_dataset_phys(ds->ds_prev)->ds_flags &= ~DS_FLAG_INCONSISTENT; dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; } drc->drc_newsnapobj = dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; /* * Release the hold from dmu_recv_begin. This must be done before * we return to open context, so that when we free the dataset's dnode, * we can evict its bonus buffer. */ dsl_dataset_disown(drc->drc_ds, dmu_recv_tag); drc->drc_ds = NULL; } static int add_ds_to_guidmap(const char *name, avl_tree_t *guid_map, uint64_t snapobj) { dsl_pool_t *dp; dsl_dataset_t *snapds; guid_map_entry_t *gmep; int err; ASSERT(guid_map != NULL); err = dsl_pool_hold(name, FTAG, &dp); if (err != 0) return (err); gmep = kmem_alloc(sizeof (*gmep), KM_SLEEP); err = dsl_dataset_hold_obj(dp, snapobj, gmep, &snapds); if (err == 0) { gmep->guid = dsl_dataset_phys(snapds)->ds_guid; gmep->gme_ds = snapds; avl_add(guid_map, gmep); dsl_dataset_long_hold(snapds, gmep); } else kmem_free(gmep, sizeof (*gmep)); dsl_pool_rele(dp, FTAG); return (err); } static int dmu_recv_end_modified_blocks = 3; static int dmu_recv_existing_end(dmu_recv_cookie_t *drc) { int error; char name[MAXNAMELEN]; #ifdef _KERNEL /* * We will be destroying the ds; make sure its origin is unmounted if * necessary. */ dsl_dataset_name(drc->drc_ds, name); zfs_destroy_unmount_origin(name); #endif error = dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); if (error != 0) dmu_recv_cleanup_ds(drc); return (error); } static int dmu_recv_new_end(dmu_recv_cookie_t *drc) { int error; error = dsl_sync_task(drc->drc_tofs, dmu_recv_end_check, dmu_recv_end_sync, drc, dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL); if (error != 0) { dmu_recv_cleanup_ds(drc); } else if (drc->drc_guid_to_ds_map != NULL) { (void) add_ds_to_guidmap(drc->drc_tofs, drc->drc_guid_to_ds_map, drc->drc_newsnapobj); } return (error); } int dmu_recv_end(dmu_recv_cookie_t *drc, void *owner) { drc->drc_owner = owner; if (drc->drc_newfs) return (dmu_recv_new_end(drc)); else return (dmu_recv_existing_end(drc)); } /* * Return TRUE if this objset is currently being received into. */ boolean_t dmu_objset_is_receiving(objset_t *os) { return (os->os_dsl_dataset != NULL && os->os_dsl_dataset->ds_owner == dmu_recv_tag); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_traverse.c (revision 286575) @@ -1,650 +1,650 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2015 Chunwei Chen. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include int32_t zfs_pd_bytes_max = 50 * 1024 * 1024; /* 50MB */ typedef struct prefetch_data { kmutex_t pd_mtx; kcondvar_t pd_cv; int32_t pd_bytes_fetched; int pd_flags; boolean_t pd_cancel; boolean_t pd_exited; } prefetch_data_t; typedef struct traverse_data { spa_t *td_spa; uint64_t td_objset; blkptr_t *td_rootbp; uint64_t td_min_txg; zbookmark_phys_t *td_resume; int td_flags; prefetch_data_t *td_pfd; boolean_t td_paused; uint64_t td_hole_birth_enabled_txg; blkptr_cb_t *td_func; void *td_arg; } traverse_data_t; static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object); static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *, uint64_t objset, uint64_t object); static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { traverse_data_t *td = arg; zbookmark_phys_t zb; if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) return (0); SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); return (0); } static int traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { traverse_data_t *td = arg; if (lrc->lrc_txtype == TX_WRITE) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; if (BP_IS_HOLE(bp)) return (0); if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); } return (0); } static void traverse_zil(traverse_data_t *td, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet * replayed; plus, in read-only mode, blocks that are already stable. */ if (claim_txg == 0 && spa_writeable(td->td_spa)) return; zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); (void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td, claim_txg); zil_free(zilog); } typedef enum resume_skip { RESUME_SKIP_ALL, RESUME_SKIP_NONE, RESUME_SKIP_CHILDREN } resume_skip_t; /* * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and * the block indicated by zb does not need to be visited at all. Returns * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the * resume point. This indicates that this block should be visited but not its * children (since they must have been visited in a previous traversal). * Otherwise returns RESUME_SKIP_NONE. */ static resume_skip_t resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) { /* * If we already visited this bp & everything below, * don't bother doing it again. */ if (zbookmark_is_before(dnp, zb, td->td_resume)) return (RESUME_SKIP_ALL); /* * If we found the block we're trying to resume from, zero * the bookmark out to indicate that we have resumed. */ if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { bzero(td->td_resume, sizeof (*zb)); if (td->td_flags & TRAVERSE_POST) return (RESUME_SKIP_CHILDREN); } } return (RESUME_SKIP_NONE); } static void traverse_prefetch_metadata(traverse_data_t *td, const blkptr_t *bp, const zbookmark_phys_t *zb) { arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) return; /* * If we are in the process of resuming, don't prefetch, because * some children will not be needed (and in fact may have already * been freed). */ if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) return; if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) return; if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) return; (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); } static boolean_t prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) { ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) return (B_FALSE); return (B_TRUE); } static int traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb) { zbookmark_phys_t czb; int err = 0; arc_buf_t *buf = NULL; prefetch_data_t *pd = td->td_pfd; boolean_t hard = td->td_flags & TRAVERSE_HARD; switch (resume_skip_check(td, dnp, zb)) { case RESUME_SKIP_ALL: return (0); case RESUME_SKIP_CHILDREN: goto post; case RESUME_SKIP_NONE: break; default: ASSERT(0); } if (bp->blk_birth == 0) { /* * Since this block has a birth time of 0 it must be a * hole created before the SPA_FEATURE_HOLE_BIRTH * feature was enabled. If SPA_FEATURE_HOLE_BIRTH * was enabled before the min_txg for this traveral we * know the hole must have been created before the * min_txg for this traveral, so we can skip it. If * SPA_FEATURE_HOLE_BIRTH was enabled after the min_txg * for this traveral we cannot tell if the hole was * created before or after the min_txg for this * traversal, so we cannot skip it. */ if (td->td_hole_birth_enabled_txg < td->td_min_txg) return (0); } else if (bp->blk_birth <= td->td_min_txg) { return (0); } if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { uint64_t size = BP_GET_LSIZE(bp); mutex_enter(&pd->pd_mtx); ASSERT(pd->pd_bytes_fetched >= 0); while (pd->pd_bytes_fetched < size && !pd->pd_exited) cv_wait(&pd->pd_cv, &pd->pd_mtx); pd->pd_bytes_fetched -= size; cv_broadcast(&pd->pd_cv); mutex_exit(&pd->pd_mtx); } if (BP_IS_HOLE(bp)) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err != 0) goto post; return (0); } if (td->td_flags & TRAVERSE_PRE) { err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err == TRAVERSE_VISIT_NO_CHILDREN) return (0); if (err != 0) goto post; } if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; cbp = buf->b_data; for (i = 0; i < epb; i++) { SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); traverse_prefetch_metadata(td, &cbp[i], &czb); } /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i++) { SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, &cbp[i], &czb); if (err != 0) break; } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; dnode_phys_t *cdnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; cdnp = buf->b_data; for (i = 0; i < epb; i++) { prefetch_dnode_metadata(td, &cdnp[i], zb->zb_objset, zb->zb_blkid * epb + i); } /* recursively visitbp() blocks below this */ for (i = 0; i < epb; i++) { err = traverse_dnode(td, &cdnp[i], zb->zb_objset, zb->zb_blkid * epb + i); if (err != 0) break; } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; dnode_phys_t *mdnp, *gdnp, *udnp; err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); if (err != 0) goto post; osp = buf->b_data; mdnp = &osp->os_meta_dnode; gdnp = &osp->os_groupused_dnode; udnp = &osp->os_userused_dnode; prefetch_dnode_metadata(td, mdnp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { prefetch_dnode_metadata(td, gdnp, zb->zb_objset, DMU_GROUPUSED_OBJECT); prefetch_dnode_metadata(td, udnp, zb->zb_objset, DMU_USERUSED_OBJECT); } err = traverse_dnode(td, mdnp, zb->zb_objset, DMU_META_DNODE_OBJECT); if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { err = traverse_dnode(td, gdnp, zb->zb_objset, DMU_GROUPUSED_OBJECT); } if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { err = traverse_dnode(td, udnp, zb->zb_objset, DMU_USERUSED_OBJECT); } } if (buf) (void) arc_buf_remove_ref(buf, &buf); post: if (err == 0 && (td->td_flags & TRAVERSE_POST)) err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (hard && (err == EIO || err == ECKSUM)) { /* * Ignore this disk error as requested by the HARD flag, * and continue traversal. */ err = 0; } /* * If we are stopping here, set td_resume. */ if (td->td_resume != NULL && err != 0 && !td->td_paused) { td->td_resume->zb_objset = zb->zb_objset; td->td_resume->zb_object = zb->zb_object; td->td_resume->zb_level = 0; /* * If we have stopped on an indirect block (e.g. due to * i/o error), we have not visited anything below it. * Set the bookmark to the first level-0 block that we need * to visit. This way, the resuming code does not need to * deal with resuming from indirect blocks. */ td->td_resume->zb_blkid = zb->zb_blkid << (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT)); td->td_paused = B_TRUE; } return (err); } static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int j; zbookmark_phys_t czb; for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); traverse_prefetch_metadata(td, &dnp->dn_spill, &czb); } } static int traverse_dnode(traverse_data_t *td, const dnode_phys_t *dnp, uint64_t objset, uint64_t object) { int j, err = 0; zbookmark_phys_t czb; for (j = 0; j < dnp->dn_nblkptr; j++) { SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); if (err != 0) break; } if (err == 0 && dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); err = traverse_visitbp(td, dnp, &dnp->dn_spill, &czb); } return (err); } /* ARGSUSED */ static int traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { prefetch_data_t *pfd = arg; arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); if (pfd->pd_cancel) return (SET_ERROR(EINTR)); if (!prefetch_needed(pfd, bp)) return (0); mutex_enter(&pfd->pd_mtx); while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max) cv_wait(&pfd->pd_cv, &pfd->pd_mtx); pfd->pd_bytes_fetched += BP_GET_LSIZE(bp); cv_broadcast(&pfd->pd_cv); mutex_exit(&pfd->pd_mtx); (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, zb); return (0); } static void traverse_prefetch_thread(void *arg) { traverse_data_t *td_main = arg; traverse_data_t td = *td_main; zbookmark_phys_t czb; td.td_func = traverse_prefetcher; td.td_arg = td_main->td_pfd; td.td_pfd = NULL; SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb); mutex_enter(&td_main->td_pfd->pd_mtx); td_main->td_pfd->pd_exited = B_TRUE; cv_broadcast(&td_main->td_pfd->pd_cv); mutex_exit(&td_main->td_pfd->pd_mtx); } /* * NB: dataset must not be changing on-disk (eg, is a snapshot or we are * in syncing context). */ static int traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg) { traverse_data_t td; prefetch_data_t pd = { 0 }; zbookmark_phys_t czb; int err; ASSERT(ds == NULL || objset == ds->ds_object); ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); /* * The data prefetching mechanism (the prefetch thread) is incompatible * with resuming from a bookmark. */ ASSERT(resume == NULL || !(flags & TRAVERSE_PREFETCH_DATA)); td.td_spa = spa; td.td_objset = objset; td.td_rootbp = rootbp; td.td_min_txg = txg_start; td.td_resume = resume; td.td_func = func; td.td_arg = arg; td.td_pfd = &pd; td.td_flags = flags; td.td_paused = B_FALSE; if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { VERIFY(spa_feature_enabled_txg(spa, SPA_FEATURE_HOLE_BIRTH, &td.td_hole_birth_enabled_txg)); } else { td.td_hole_birth_enabled_txg = 0; } pd.pd_flags = flags; mutex_init(&pd.pd_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&pd.pd_cv, NULL, CV_DEFAULT, NULL); /* See comment on ZIL traversal in dsl_scan_visitds. */ - if (ds != NULL && !dsl_dataset_is_snapshot(ds) && !BP_IS_HOLE(rootbp)) { + if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; err = arc_read(NULL, td.td_spa, rootbp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, NULL); if (err != 0) return (err); osp = buf->b_data; traverse_zil(&td, &osp->os_zil_header); (void) arc_buf_remove_ref(buf, &buf); } if (!(flags & TRAVERSE_PREFETCH_DATA) || 0 == taskq_dispatch(system_taskq, traverse_prefetch_thread, &td, TQ_NOQUEUE)) pd.pd_exited = B_TRUE; SET_BOOKMARK(&czb, td.td_objset, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(&td, NULL, rootbp, &czb); mutex_enter(&pd.pd_mtx); pd.pd_cancel = B_TRUE; cv_broadcast(&pd.pd_cv); while (!pd.pd_exited) cv_wait(&pd.pd_cv, &pd.pd_mtx); mutex_exit(&pd.pd_mtx); mutex_destroy(&pd.pd_mtx); cv_destroy(&pd.pd_cv); return (err); } /* * NB: dataset must not be changing on-disk (eg, is a snapshot or we are * in syncing context). */ int traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, &dsl_dataset_phys(ds)->ds_bp, txg_start, NULL, flags, func, arg)); } int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, uint64_t txg_start, zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg) { return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, blkptr, txg_start, resume, flags, func, arg)); } /* * NB: pool must not be changing on-disk (eg, from zdb or sync context). */ int traverse_pool(spa_t *spa, uint64_t txg_start, int flags, blkptr_cb_t func, void *arg) { int err; uint64_t obj; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), txg_start, NULL, flags, func, arg); if (err != 0) return (err); /* visit each dataset */ for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); if (err != 0) { if (hard) continue; break; } if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { dsl_dataset_t *ds; uint64_t txg = txg_start; dsl_pool_config_enter(dp, FTAG); err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); dsl_pool_config_exit(dp, FTAG); if (err != 0) { if (hard) continue; break; } if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg) txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); if (err != 0) break; } } if (err == ESRCH) err = 0; return (err); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c (revision 286575) @@ -1,2022 +1,2045 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include static kmem_cache_t *dnode_cache; /* * Define DNODE_STATS to turn on statistic gathering. By default, it is only * turned on when DEBUG is also defined. */ #ifdef DEBUG #define DNODE_STATS #endif /* DEBUG */ #ifdef DNODE_STATS #define DNODE_STAT_ADD(stat) ((stat)++) #else #define DNODE_STAT_ADD(stat) /* nothing */ #endif /* DNODE_STATS */ static dnode_phys_t dnode_phys_zero; int zfs_default_bs = SPA_MINBLOCKSHIFT; int zfs_default_ibs = DN_MAX_INDBLKSHIFT; #ifdef illumos static kmem_cbrc_t dnode_move(void *, void *, size_t, void *); #endif static int dbuf_compare(const void *x1, const void *x2) { const dmu_buf_impl_t *d1 = x1; const dmu_buf_impl_t *d2 = x2; if (d1->db_level < d2->db_level) { return (-1); } if (d1->db_level > d2->db_level) { return (1); } if (d1->db_blkid < d2->db_blkid) { return (-1); } if (d1->db_blkid > d2->db_blkid) { return (1); } if (d1->db_state == DB_SEARCH) { ASSERT3S(d2->db_state, !=, DB_SEARCH); return (-1); } else if (d2->db_state == DB_SEARCH) { ASSERT3S(d1->db_state, !=, DB_SEARCH); return (1); } if ((uintptr_t)d1 < (uintptr_t)d2) { return (-1); } if ((uintptr_t)d1 > (uintptr_t)d2) { return (1); } return (0); } /* ARGSUSED */ static int dnode_cons(void *arg, void *unused, int kmflag) { dnode_t *dn = arg; int i; rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); /* * Every dbuf has a reference, and dropping a tracked reference is * O(number of references), so don't track dn_holds. */ refcount_create_untracked(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); list_link_init(&dn->dn_link); bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { list_link_init(&dn->dn_dirty_link[i]); dn->dn_free_ranges[i] = NULL; list_create(&dn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); } dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirtyctx = 0; dn->dn_dirtyctx_firstset = NULL; dn->dn_bonus = NULL; dn->dn_have_spill = B_FALSE; dn->dn_zio = NULL; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_id_flags = 0; dn->dn_dbufs_count = 0; dn->dn_unlisted_l0_blkid = 0; avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); dn->dn_moved = 0; POINTER_INVALIDATE(&dn->dn_objset); return (0); } /* ARGSUSED */ static void dnode_dest(void *arg, void *unused) { int i; dnode_t *dn = arg; rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); ASSERT(!list_link_active(&dn->dn_link)); for (i = 0; i < TXG_SIZE; i++) { ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); list_destroy(&dn->dn_dirty_records[i]); ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_blksz[i]); } ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_free_txg); ASSERT0(dn->dn_assigned_txg); ASSERT0(dn->dn_dirtyctx); ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); ASSERT3P(dn->dn_bonus, ==, NULL); ASSERT(!dn->dn_have_spill); ASSERT3P(dn->dn_zio, ==, NULL); ASSERT0(dn->dn_oldused); ASSERT0(dn->dn_oldflags); ASSERT0(dn->dn_olduid); ASSERT0(dn->dn_oldgid); ASSERT0(dn->dn_newuid); ASSERT0(dn->dn_newgid); ASSERT0(dn->dn_id_flags); ASSERT0(dn->dn_dbufs_count); ASSERT0(dn->dn_unlisted_l0_blkid); avl_destroy(&dn->dn_dbufs); } void dnode_init(void) { ASSERT(dnode_cache == NULL); dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); kmem_cache_set_move(dnode_cache, dnode_move); } void dnode_fini(void) { kmem_cache_destroy(dnode_cache); dnode_cache = NULL; } #ifdef ZFS_DEBUG void dnode_verify(dnode_t *dn) { int drop_struct_lock = FALSE; ASSERT(dn->dn_phys); ASSERT(dn->dn_objset); ASSERT(dn->dn_handle->dnh_dnode == dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY)) return; if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { rw_enter(&dn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) { int i; ASSERT3U(dn->dn_indblkshift, >=, 0); ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT); if (dn->dn_datablkshift) { ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT); ASSERT3U(1<dn_datablkshift, ==, dn->dn_datablksz); } ASSERT3U(dn->dn_nlevels, <=, 30); ASSERT(DMU_OT_IS_VALID(dn->dn_type)); ASSERT3U(dn->dn_nblkptr, >=, 1); ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); ASSERT3U(dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); ASSERT3U(dn->dn_datablksz, ==, dn->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0); ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) + dn->dn_bonuslen, <=, DN_MAX_BONUSLEN); for (i = 0; i < TXG_SIZE; i++) { ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels); } } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT))); } if (drop_struct_lock) rw_exit(&dn->dn_struct_rwlock); } #endif void dnode_byteswap(dnode_phys_t *dnp) { uint64_t *buf64 = (void*)&dnp->dn_blkptr; int i; if (dnp->dn_type == DMU_OT_NONE) { bzero(dnp, sizeof (dnode_phys_t)); return; } dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec); dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen); dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid); dnp->dn_used = BSWAP_64(dnp->dn_used); /* * dn_nblkptr is only one byte, so it's OK to read it in either * byte order. We can't read dn_bouslen. */ ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT); ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR); for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++) buf64[i] = BSWAP_64(buf64[i]); /* * OK to check dn_bonuslen for zero, because it won't matter if * we have the wrong byte order. This is necessary because the * dnode dnode is smaller than a regular dnode. */ if (dnp->dn_bonuslen != 0) { /* * Note that the bonus length calculated here may be * longer than the actual bonus buffer. This is because * we always put the bonus buffer after the last block * pointer (instead of packing it against the end of the * dnode buffer). */ int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t); size_t len = DN_MAX_BONUSLEN - off; ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype)); dmu_object_byteswap_t byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype); dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len); } /* Swap SPILL block if we have one */ if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) byteswap_uint64_array(&dnp->dn_spill, sizeof (blkptr_t)); } void dnode_buf_byteswap(void *vbuf, size_t size) { dnode_phys_t *buf = vbuf; int i; ASSERT3U(sizeof (dnode_phys_t), ==, (1<>= DNODE_SHIFT; for (i = 0; i < size; i++) { dnode_byteswap(buf); buf++; } } void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); ASSERT3U(newsize, <=, DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t)); dn->dn_bonuslen = newsize; if (newsize == 0) dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN; else dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; rw_exit(&dn->dn_struct_rwlock); } void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); dnode_setdirty(dn, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dn->dn_bonustype = newtype; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; rw_exit(&dn->dn_struct_rwlock); } void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx) { ASSERT3U(refcount_count(&dn->dn_holds), >=, 1); ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); dnode_setdirty(dn, tx); dn->dn_rm_spillblk[tx->tx_txg&TXG_MASK] = DN_KILL_SPILLBLK; dn->dn_have_spill = B_FALSE; } static void dnode_setdblksz(dnode_t *dn, int size) { ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE)); ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); ASSERT3U(size, >=, SPA_MINBLOCKSIZE); ASSERT3U(size >> SPA_MINBLOCKSHIFT, <, 1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8)); dn->dn_datablksz = size; dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT; dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0; } static dnode_t * dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); + dnode_t *dn; + dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); ASSERT(!POINTER_IS_VALID(dn->dn_objset)); dn->dn_moved = 0; /* * Defer setting dn_objset until the dnode is ready to be a candidate * for the dnode_move() callback. */ dn->dn_object = object; dn->dn_dbuf = db; dn->dn_handle = dnh; dn->dn_phys = dnp; if (dnp->dn_datablkszsec) { dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } else { dn->dn_datablksz = 0; dn->dn_datablkszsec = 0; dn->dn_datablkshift = 0; } dn->dn_indblkshift = dnp->dn_indblkshift; dn->dn_nlevels = dnp->dn_nlevels; dn->dn_type = dnp->dn_type; dn->dn_nblkptr = dnp->dn_nblkptr; dn->dn_checksum = dnp->dn_checksum; dn->dn_compress = dnp->dn_compress; dn->dn_bonustype = dnp->dn_bonustype; dn->dn_bonuslen = dnp->dn_bonuslen; dn->dn_maxblkid = dnp->dn_maxblkid; dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); dn->dn_id_flags = 0; dmu_zfetch_init(&dn->dn_zfetch, dn); ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); mutex_enter(&os->os_lock); - list_insert_head(&os->os_dnodes, dn); + if (dnh->dnh_dnode != NULL) { + /* Lost the allocation race. */ + mutex_exit(&os->os_lock); + kmem_cache_free(dnode_cache, dn); + return (dnh->dnh_dnode); + } + + /* + * Exclude special dnodes from os_dnodes so an empty os_dnodes + * signifies that the special dnodes have no references from + * their children (the entries in os_dnodes). This allows + * dnode_destroy() to easily determine if the last child has + * been removed and then complete eviction of the objset. + */ + if (!DMU_OBJECT_IS_SPECIAL(object)) + list_insert_head(&os->os_dnodes, dn); membar_producer(); + /* - * Everything else must be valid before assigning dn_objset makes the - * dnode eligible for dnode_move(). + * Everything else must be valid before assigning dn_objset + * makes the dnode eligible for dnode_move(). */ dn->dn_objset = os; + + dnh->dnh_dnode = dn; mutex_exit(&os->os_lock); arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } /* * Caller must be holding the dnode handle, which is released upon return. */ static void dnode_destroy(dnode_t *dn) { objset_t *os = dn->dn_objset; + boolean_t complete_os_eviction = B_FALSE; ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); mutex_enter(&os->os_lock); POINTER_INVALIDATE(&dn->dn_objset); - list_remove(&os->os_dnodes, dn); + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + list_remove(&os->os_dnodes, dn); + complete_os_eviction = + list_is_empty(&os->os_dnodes) && + list_link_active(&os->os_evicting_node); + } mutex_exit(&os->os_lock); /* the dnode can no longer move, so we can release the handle */ zrl_remove(&dn->dn_handle->dnh_zrlock); dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_assigned_txg = 0; dn->dn_dirtyctx = 0; if (dn->dn_dirtyctx_firstset != NULL) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } if (dn->dn_bonus != NULL) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_evict(dn->dn_bonus); dn->dn_bonus = NULL; } dn->dn_zio = NULL; dn->dn_have_spill = B_FALSE; dn->dn_oldused = 0; dn->dn_oldflags = 0; dn->dn_olduid = 0; dn->dn_oldgid = 0; dn->dn_newuid = 0; dn->dn_newgid = 0; dn->dn_id_flags = 0; dn->dn_unlisted_l0_blkid = 0; dmu_zfetch_rele(&dn->dn_zfetch); kmem_cache_free(dnode_cache, dn); arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); + + if (complete_os_eviction) + dmu_objset_evict_done(os); } void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { int i; ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (blocksize == 0) blocksize = 1 << zfs_default_bs; else blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); if (ibs == 0) ibs = zfs_default_ibs; ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d\n", dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); ASSERT(ot != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(ot)); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype == DMU_OT_SA && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); ASSERT(dn->dn_type == DMU_OT_NONE); ASSERT0(dn->dn_maxblkid); ASSERT0(dn->dn_allocated_txg); ASSERT0(dn->dn_assigned_txg); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); ASSERT3U(refcount_count(&dn->dn_holds), <=, 1); ASSERT(avl_is_empty(&dn->dn_dbufs)); for (i = 0; i < TXG_SIZE; i++) { ASSERT0(dn->dn_next_nblkptr[i]); ASSERT0(dn->dn_next_nlevels[i]); ASSERT0(dn->dn_next_indblkshift[i]); ASSERT0(dn->dn_next_bonuslen[i]); ASSERT0(dn->dn_next_bonustype[i]); ASSERT0(dn->dn_rm_spillblk[i]); ASSERT0(dn->dn_next_blksz[i]); ASSERT(!list_link_active(&dn->dn_dirty_link[i])); ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); ASSERT3P(dn->dn_free_ranges[i], ==, NULL); } dn->dn_type = ot; dnode_setdblksz(dn, blocksize); dn->dn_indblkshift = ibs; dn->dn_nlevels = 1; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ dn->dn_nblkptr = 1; else dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; dn->dn_dirtyctx = 0; dn->dn_free_txg = 0; if (dn->dn_dirtyctx_firstset) { kmem_free(dn->dn_dirtyctx_firstset, 1); dn->dn_dirtyctx_firstset = NULL; } dn->dn_allocated_txg = tx->tx_txg; dn->dn_id_flags = 0; dnode_setdirty(dn, tx); dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; } void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); ASSERT0(blocksize % SPA_MINBLOCKSIZE); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); ASSERT(tx->tx_txg != 0); ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || (bonustype != DMU_OT_NONE && bonuslen != 0) || (bonustype == DMU_OT_SA && bonuslen == 0)); ASSERT(DMU_OT_IS_VALID(bonustype)); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); dn->dn_id_flags = 0; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); dnode_setdirty(dn, tx); if (dn->dn_datablksz != blocksize) { /* change blocksize */ ASSERT(dn->dn_maxblkid == 0 && (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || dnode_block_freed(dn, 0))); dnode_setdblksz(dn, blocksize); dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; } if (dn->dn_bonuslen != bonuslen) dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ nblkptr = 1; else nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); if (dn->dn_bonustype != bonustype) dn->dn_next_bonustype[tx->tx_txg&TXG_MASK] = bonustype; if (dn->dn_nblkptr != nblkptr) dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { dbuf_rm_spill(dn, tx); dnode_rm_spill(dn, tx); } rw_exit(&dn->dn_struct_rwlock); /* change type */ dn->dn_type = ot; /* change bonus size and type */ mutex_enter(&dn->dn_mtx); dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); /* fix up the bonus db_size */ if (dn->dn_bonus) { dn->dn_bonus->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); } dn->dn_allocated_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); } #ifdef DNODE_STATS static struct { uint64_t dms_dnode_invalid; uint64_t dms_dnode_recheck1; uint64_t dms_dnode_recheck2; uint64_t dms_dnode_special; uint64_t dms_dnode_handle; uint64_t dms_dnode_rwlock; uint64_t dms_dnode_active; } dnode_move_stats; #endif /* DNODE_STATS */ static void dnode_move_impl(dnode_t *odn, dnode_t *ndn) { int i; ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); ASSERT(!RW_LOCK_HELD(&odn->dn_zfetch.zf_rwlock)); /* Copy fields. */ ndn->dn_objset = odn->dn_objset; ndn->dn_object = odn->dn_object; ndn->dn_dbuf = odn->dn_dbuf; ndn->dn_handle = odn->dn_handle; ndn->dn_phys = odn->dn_phys; ndn->dn_type = odn->dn_type; ndn->dn_bonuslen = odn->dn_bonuslen; ndn->dn_bonustype = odn->dn_bonustype; ndn->dn_nblkptr = odn->dn_nblkptr; ndn->dn_checksum = odn->dn_checksum; ndn->dn_compress = odn->dn_compress; ndn->dn_nlevels = odn->dn_nlevels; ndn->dn_indblkshift = odn->dn_indblkshift; ndn->dn_datablkshift = odn->dn_datablkshift; ndn->dn_datablkszsec = odn->dn_datablkszsec; ndn->dn_datablksz = odn->dn_datablksz; ndn->dn_maxblkid = odn->dn_maxblkid; bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], sizeof (odn->dn_next_nblkptr)); bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], sizeof (odn->dn_next_nlevels)); bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], sizeof (odn->dn_next_indblkshift)); bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], sizeof (odn->dn_next_bonustype)); bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], sizeof (odn->dn_rm_spillblk)); bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], sizeof (odn->dn_next_bonuslen)); bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], sizeof (odn->dn_next_blksz)); for (i = 0; i < TXG_SIZE; i++) { list_move_tail(&ndn->dn_dirty_records[i], &odn->dn_dirty_records[i]); } bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], sizeof (odn->dn_free_ranges)); ndn->dn_allocated_txg = odn->dn_allocated_txg; ndn->dn_free_txg = odn->dn_free_txg; ndn->dn_assigned_txg = odn->dn_assigned_txg; ndn->dn_dirtyctx = odn->dn_dirtyctx; ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; ASSERT(refcount_count(&odn->dn_tx_holds) == 0); refcount_transfer(&ndn->dn_holds, &odn->dn_holds); ASSERT(avl_is_empty(&ndn->dn_dbufs)); avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); ndn->dn_dbufs_count = odn->dn_dbufs_count; ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid; ndn->dn_bonus = odn->dn_bonus; ndn->dn_have_spill = odn->dn_have_spill; ndn->dn_zio = odn->dn_zio; ndn->dn_oldused = odn->dn_oldused; ndn->dn_oldflags = odn->dn_oldflags; ndn->dn_olduid = odn->dn_olduid; ndn->dn_oldgid = odn->dn_oldgid; ndn->dn_newuid = odn->dn_newuid; ndn->dn_newgid = odn->dn_newgid; ndn->dn_id_flags = odn->dn_id_flags; dmu_zfetch_init(&ndn->dn_zfetch, NULL); list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; ndn->dn_zfetch.zf_stream_cnt = odn->dn_zfetch.zf_stream_cnt; ndn->dn_zfetch.zf_alloc_fail = odn->dn_zfetch.zf_alloc_fail; /* * Update back pointers. Updating the handle fixes the back pointer of * every descendant dbuf as well as the bonus dbuf. */ ASSERT(ndn->dn_handle->dnh_dnode == odn); ndn->dn_handle->dnh_dnode = ndn; if (ndn->dn_zfetch.zf_dnode == odn) { ndn->dn_zfetch.zf_dnode = ndn; } /* * Invalidate the original dnode by clearing all of its back pointers. */ odn->dn_dbuf = NULL; odn->dn_handle = NULL; avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); odn->dn_dbufs_count = 0; odn->dn_unlisted_l0_blkid = 0; odn->dn_bonus = NULL; odn->dn_zfetch.zf_dnode = NULL; /* * Set the low bit of the objset pointer to ensure that dnode_move() * recognizes the dnode as invalid in any subsequent callback. */ POINTER_INVALIDATE(&odn->dn_objset); /* * Satisfy the destructor. */ for (i = 0; i < TXG_SIZE; i++) { list_create(&odn->dn_dirty_records[i], sizeof (dbuf_dirty_record_t), offsetof(dbuf_dirty_record_t, dr_dirty_node)); odn->dn_free_ranges[i] = NULL; odn->dn_next_nlevels[i] = 0; odn->dn_next_indblkshift[i] = 0; odn->dn_next_bonustype[i] = 0; odn->dn_rm_spillblk[i] = 0; odn->dn_next_bonuslen[i] = 0; odn->dn_next_blksz[i] = 0; } odn->dn_allocated_txg = 0; odn->dn_free_txg = 0; odn->dn_assigned_txg = 0; odn->dn_dirtyctx = 0; odn->dn_dirtyctx_firstset = NULL; odn->dn_have_spill = B_FALSE; odn->dn_zio = NULL; odn->dn_oldused = 0; odn->dn_oldflags = 0; odn->dn_olduid = 0; odn->dn_oldgid = 0; odn->dn_newuid = 0; odn->dn_newgid = 0; odn->dn_id_flags = 0; /* * Mark the dnode. */ ndn->dn_moved = 1; odn->dn_moved = (uint8_t)-1; } #ifdef illumos #ifdef _KERNEL /*ARGSUSED*/ static kmem_cbrc_t dnode_move(void *buf, void *newbuf, size_t size, void *arg) { dnode_t *odn = buf, *ndn = newbuf; objset_t *os; int64_t refcount; uint32_t dbufs; /* * The dnode is on the objset's list of known dnodes if the objset * pointer is valid. We set the low bit of the objset pointer when * freeing the dnode to invalidate it, and the memory patterns written * by kmem (baddcafe and deadbeef) set at least one of the two low bits. * A newly created dnode sets the objset pointer last of all to indicate * that the dnode is known and in a valid state to be moved by this * function. */ os = odn->dn_objset; if (!POINTER_IS_VALID(os)) { DNODE_STAT_ADD(dnode_move_stats.dms_dnode_invalid); return (KMEM_CBRC_DONT_KNOW); } /* * Ensure that the objset does not go away during the move. */ rw_enter(&os_lock, RW_WRITER); if (os != odn->dn_objset) { rw_exit(&os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck1); return (KMEM_CBRC_DONT_KNOW); } /* * If the dnode is still valid, then so is the objset. We know that no * valid objset can be freed while we hold os_lock, so we can safely * ensure that the objset remains in use. */ mutex_enter(&os->os_lock); /* * Recheck the objset pointer in case the dnode was removed just before * acquiring the lock. */ if (os != odn->dn_objset) { mutex_exit(&os->os_lock); rw_exit(&os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_recheck2); return (KMEM_CBRC_DONT_KNOW); } /* * At this point we know that as long as we hold os->os_lock, the dnode * cannot be freed and fields within the dnode can be safely accessed. * The objset listing this dnode cannot go away as long as this dnode is * on its list. */ rw_exit(&os_lock); if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_special); return (KMEM_CBRC_NO); } ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ /* * Lock the dnode handle to prevent the dnode from obtaining any new * holds. This also prevents the descendant dbufs and the bonus dbuf * from accessing the dnode, so that we can discount their holds. The * handle is safe to access because we know that while the dnode cannot * go away, neither can its handle. Once we hold dnh_zrlock, we can * safely move any dnode referenced only by dbufs. */ if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_handle); return (KMEM_CBRC_LATER); } /* * Ensure a consistent view of the dnode's holds and the dnode's dbufs. * We need to guarantee that there is a hold for every dbuf in order to * determine whether the dnode is actively referenced. Falsely matching * a dbuf to an active hold would lead to an unsafe move. It's possible * that a thread already having an active dnode hold is about to add a * dbuf, and we can't compare hold and dbuf counts while the add is in * progress. */ if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_rwlock); return (KMEM_CBRC_LATER); } /* * A dbuf may be removed (evicted) without an active dnode hold. In that * case, the dbuf count is decremented under the handle lock before the * dbuf's hold is released. This order ensures that if we count the hold * after the dbuf is removed but before its hold is released, we will * treat the unmatched hold as active and exit safely. If we count the * hold before the dbuf is removed, the hold is discounted, and the * removal is blocked until the move completes. */ refcount = refcount_count(&odn->dn_holds); ASSERT(refcount >= 0); dbufs = odn->dn_dbufs_count; /* We can't have more dbufs than dnode holds. */ ASSERT3U(dbufs, <=, refcount); DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, uint32_t, dbufs); if (refcount > dbufs) { rw_exit(&odn->dn_struct_rwlock); zrl_exit(&odn->dn_handle->dnh_zrlock); mutex_exit(&os->os_lock); DNODE_STAT_ADD(dnode_move_stats.dms_dnode_active); return (KMEM_CBRC_LATER); } rw_exit(&odn->dn_struct_rwlock); /* * At this point we know that anyone with a hold on the dnode is not * actively referencing it. The dnode is known and in a valid state to * move. We're holding the locks needed to execute the critical section. */ dnode_move_impl(odn, ndn); list_link_replace(&odn->dn_link, &ndn->dn_link); /* If the dnode was safe to move, the refcount cannot have changed. */ ASSERT(refcount == refcount_count(&ndn->dn_holds)); ASSERT(dbufs == ndn->dn_dbufs_count); zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ mutex_exit(&os->os_lock); return (KMEM_CBRC_YES); } #endif /* _KERNEL */ #endif /* illumos */ void dnode_special_close(dnode_handle_t *dnh) { dnode_t *dn = dnh->dnh_dnode; /* * Wait for final references to the dnode to clear. This can * only happen if the arc is asyncronously evicting state that * has a hold on this dnode while we are trying to evict this * dnode. */ while (refcount_count(&dn->dn_holds) > 0) delay(1); + ASSERT(dn->dn_dbuf == NULL || + dmu_buf_get_user(&dn->dn_dbuf->db) == NULL); zrl_add(&dnh->dnh_zrlock); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } -dnode_t * +void dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh) { - dnode_t *dn = dnode_create(os, dnp, NULL, object, dnh); - dnh->dnh_dnode = dn; + dnode_t *dn; + + dn = dnode_create(os, dnp, NULL, object, dnh); zrl_init(&dnh->dnh_zrlock); DNODE_VERIFY(dn); - return (dn); } static void -dnode_buf_pageout(dmu_buf_t *db, void *arg) +dnode_buf_pageout(void *dbu) { - dnode_children_t *children_dnodes = arg; + dnode_children_t *children_dnodes = dbu; int i; - int epb = db->db_size >> DNODE_SHIFT; - ASSERT(epb == children_dnodes->dnc_count); - - for (i = 0; i < epb; i++) { + for (i = 0; i < children_dnodes->dnc_count; i++) { dnode_handle_t *dnh = &children_dnodes->dnc_children[i]; dnode_t *dn; /* * The dnode handle lock guards against the dnode moving to * another valid address, so there is no need here to guard * against changes to or from NULL. */ if (dnh->dnh_dnode == NULL) { zrl_destroy(&dnh->dnh_zrlock); continue; } zrl_add(&dnh->dnh_zrlock); dn = dnh->dnh_dnode; /* * If there are holds on this dnode, then there should * be holds on the dnode's containing dbuf as well; thus * it wouldn't be eligible for eviction and this function * would not have been called. */ ASSERT(refcount_is_zero(&dn->dn_holds)); ASSERT(refcount_is_zero(&dn->dn_tx_holds)); dnode_destroy(dn); /* implicit zrl_remove() */ zrl_destroy(&dnh->dnh_zrlock); dnh->dnh_dnode = NULL; } kmem_free(children_dnodes, sizeof (dnode_children_t) + - epb * sizeof (dnode_handle_t)); + children_dnodes->dnc_count * sizeof (dnode_handle_t)); } /* * errors: * EINVAL - invalid object number. * EIO - i/o error. * succeeds even for free dnodes. */ int dnode_hold_impl(objset_t *os, uint64_t object, int flag, void *tag, dnode_t **dnp) { int epb, idx, err; int drop_struct_lock = FALSE; int type; uint64_t blk; dnode_t *mdn, *dn; dmu_buf_impl_t *db; dnode_children_t *children_dnodes; dnode_handle_t *dnh; /* * If you are holding the spa config lock as writer, you shouldn't * be asking the DMU to do *anything* unless it's the root pool * which may require us to read from the root filesystem while * holding some (not all) of the locks as writer. */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || (spa_is_root(os->os_spa) && spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { dn = (object == DMU_USERUSED_OBJECT) ? DMU_USERUSED_DNODE(os) : DMU_GROUPUSED_DNODE(os); if (dn == NULL) return (SET_ERROR(ENOENT)); type = dn->dn_type; if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) return (SET_ERROR(ENOENT)); if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) return (SET_ERROR(EEXIST)); DNODE_VERIFY(dn); (void) refcount_add(&dn->dn_holds, tag); *dnp = dn; return (0); } if (object == 0 || object >= DN_MAX_OBJECT) return (SET_ERROR(EINVAL)); mdn = DMU_META_DNODE(os); ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); DNODE_VERIFY(mdn); if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { rw_enter(&mdn->dn_struct_rwlock, RW_READER); drop_struct_lock = TRUE; } blk = dbuf_whichblock(mdn, object * sizeof (dnode_phys_t)); db = dbuf_hold(mdn, blk, FTAG); if (drop_struct_lock) rw_exit(&mdn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); err = dbuf_read(db, NULL, DB_RF_CANFAIL); if (err) { dbuf_rele(db, FTAG); return (err); } ASSERT3U(db->db.db_size, >=, 1<db.db_size >> DNODE_SHIFT; idx = object & (epb-1); ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE); children_dnodes = dmu_buf_get_user(&db->db); if (children_dnodes == NULL) { int i; dnode_children_t *winner; children_dnodes = kmem_zalloc(sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t), KM_SLEEP); children_dnodes->dnc_count = epb; dnh = &children_dnodes->dnc_children[0]; for (i = 0; i < epb; i++) { zrl_init(&dnh[i].dnh_zrlock); - dnh[i].dnh_dnode = NULL; } - if (winner = dmu_buf_set_user(&db->db, children_dnodes, - dnode_buf_pageout)) { + dmu_buf_init_user(&children_dnodes->dnc_dbu, + dnode_buf_pageout, NULL); + winner = dmu_buf_set_user(&db->db, &children_dnodes->dnc_dbu); + if (winner != NULL) { for (i = 0; i < epb; i++) { zrl_destroy(&dnh[i].dnh_zrlock); } kmem_free(children_dnodes, sizeof (dnode_children_t) + epb * sizeof (dnode_handle_t)); children_dnodes = winner; } } ASSERT(children_dnodes->dnc_count == epb); dnh = &children_dnodes->dnc_children[idx]; zrl_add(&dnh->dnh_zrlock); - if ((dn = dnh->dnh_dnode) == NULL) { + dn = dnh->dnh_dnode; + if (dn == NULL) { dnode_phys_t *phys = (dnode_phys_t *)db->db.db_data+idx; - dnode_t *winner; dn = dnode_create(os, phys, db, object, dnh); - winner = atomic_cas_ptr(&dnh->dnh_dnode, NULL, dn); - if (winner != NULL) { - zrl_add(&dnh->dnh_zrlock); - dnode_destroy(dn); /* implicit zrl_remove() */ - dn = winner; - } } mutex_enter(&dn->dn_mtx); type = dn->dn_type; if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || ((flag & DNODE_MUST_BE_FREE) && (type != DMU_OT_NONE || !refcount_is_zero(&dn->dn_holds)))) { mutex_exit(&dn->dn_mtx); zrl_remove(&dnh->dnh_zrlock); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); } - mutex_exit(&dn->dn_mtx); - if (refcount_add(&dn->dn_holds, tag) == 1) dbuf_add_ref(db, dnh); + mutex_exit(&dn->dn_mtx); + /* Now we can rely on the hold to prevent the dnode from moving. */ zrl_remove(&dnh->dnh_zrlock); DNODE_VERIFY(dn); ASSERT3P(dn->dn_dbuf, ==, db); ASSERT3U(dn->dn_object, ==, object); dbuf_rele(db, FTAG); *dnp = dn; return (0); } /* * Return held dnode if the object is allocated, NULL if not. */ int dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } /* * Can only add a reference if there is already at least one * reference on the dnode. Returns FALSE if unable to add a * new reference. */ boolean_t dnode_add_ref(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); if (refcount_is_zero(&dn->dn_holds)) { mutex_exit(&dn->dn_mtx); return (FALSE); } VERIFY(1 < refcount_add(&dn->dn_holds, tag)); mutex_exit(&dn->dn_mtx); return (TRUE); } void dnode_rele(dnode_t *dn, void *tag) { mutex_enter(&dn->dn_mtx); dnode_rele_and_unlock(dn, tag); } void dnode_rele_and_unlock(dnode_t *dn, void *tag) { uint64_t refs; /* Get while the hold prevents the dnode from moving. */ dmu_buf_impl_t *db = dn->dn_dbuf; dnode_handle_t *dnh = dn->dn_handle; refs = refcount_remove(&dn->dn_holds, tag); mutex_exit(&dn->dn_mtx); /* * It's unsafe to release the last hold on a dnode by dnode_rele() or * indirectly by dbuf_rele() while relying on the dnode handle to * prevent the dnode from moving, since releasing the last hold could * result in the dnode's parent dbuf evicting its dnode handles. For * that reason anyone calling dnode_rele() or dbuf_rele() without some * other direct or indirect hold on the dnode must first drop the dnode * handle. */ ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ if (refs == 0 && db != NULL) { /* * Another thread could add a hold to the dnode handle in * dnode_hold_impl() while holding the parent dbuf. Since the * hold on the parent dbuf prevents the handle from being * destroyed, the hold on the handle is OK. We can't yet assert * that the handle has zero references, but that will be * asserted anyway when the handle gets destroyed. */ dbuf_rele(db, dnh); } } void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { dsl_dataset_dirty(os->os_dsl_dataset, tx); return; } DNODE_VERIFY(dn); #ifdef ZFS_DEBUG mutex_enter(&dn->dn_mtx); ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); mutex_exit(&dn->dn_mtx); #endif /* * Determine old uid/gid when necessary */ dmu_objset_userquota_get_ids(dn, B_TRUE, tx); mutex_enter(&os->os_lock); /* * If we are already marked dirty, we're done. */ if (list_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { mutex_exit(&os->os_lock); return; } ASSERT(!refcount_is_zero(&dn->dn_holds) || !avl_is_empty(&dn->dn_dbufs)); ASSERT(dn->dn_datablksz != 0); ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]); ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]); ASSERT0(dn->dn_next_bonustype[txg&TXG_MASK]); dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", dn->dn_object, txg); if (dn->dn_free_txg > 0 && dn->dn_free_txg <= txg) { list_insert_tail(&os->os_free_dnodes[txg&TXG_MASK], dn); } else { list_insert_tail(&os->os_dirty_dnodes[txg&TXG_MASK], dn); } mutex_exit(&os->os_lock); /* * The dnode maintains a hold on its containing dbuf as * long as there are holds on it. Each instantiated child * dbuf maintains a hold on the dnode. When the last child * drops its hold, the dnode will drop its hold on the * containing dbuf. We add a "dirty hold" here so that the * dnode will hang around after we finish processing its * children. */ VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); (void) dbuf_dirty(dn->dn_dbuf, tx); dsl_dataset_dirty(os->os_dsl_dataset, tx); } void dnode_free(dnode_t *dn, dmu_tx_t *tx) { int txgoff = tx->tx_txg & TXG_MASK; dprintf("dn=%p txg=%llu\n", dn, tx->tx_txg); /* we should be the only holder... hopefully */ /* ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ mutex_enter(&dn->dn_mtx); if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { mutex_exit(&dn->dn_mtx); return; } dn->dn_free_txg = tx->tx_txg; mutex_exit(&dn->dn_mtx); /* * If the dnode is already dirty, it needs to be moved from * the dirty list to the free list. */ mutex_enter(&dn->dn_objset->os_lock); if (list_link_active(&dn->dn_dirty_link[txgoff])) { list_remove(&dn->dn_objset->os_dirty_dnodes[txgoff], dn); list_insert_tail(&dn->dn_objset->os_free_dnodes[txgoff], dn); mutex_exit(&dn->dn_objset->os_lock); } else { mutex_exit(&dn->dn_objset->os_lock); dnode_setdirty(dn, tx); } } /* * Try to change the block size for the indicated dnode. This can only * succeed if there are no blocks allocated or dirty beyond first block */ int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) { dmu_buf_impl_t *db; int err; ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); if (size == 0) size = SPA_MINBLOCKSIZE; else size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); if (ibs == dn->dn_indblkshift) ibs = 0; if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) return (0); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* Check for any allocated blocks beyond the first */ if (dn->dn_maxblkid != 0) goto fail; mutex_enter(&dn->dn_dbufs_mtx); for (db = avl_first(&dn->dn_dbufs); db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && db->db_blkid != DMU_SPILL_BLKID) { mutex_exit(&dn->dn_dbufs_mtx); goto fail; } } mutex_exit(&dn->dn_dbufs_mtx); if (ibs && dn->dn_nlevels != 1) goto fail; /* resize the old block */ err = dbuf_hold_impl(dn, 0, 0, TRUE, FTAG, &db); if (err == 0) dbuf_new_size(db, size, tx); else if (err != ENOENT) goto fail; dnode_setdblksz(dn, size); dnode_setdirty(dn, tx); dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; if (ibs) { dn->dn_indblkshift = ibs; dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; } /* rele after we have fixed the blocksize in the dnode */ if (db) dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); return (0); fail: rw_exit(&dn->dn_struct_rwlock); return (SET_ERROR(ENOTSUP)); } /* read-holding callers must not rely on the lock being continuously held */ void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read) { uint64_t txgoff = tx->tx_txg & TXG_MASK; int epbs, new_nlevels; uint64_t sz; ASSERT(blkid != DMU_BONUS_BLKID); ASSERT(have_read ? RW_READ_HELD(&dn->dn_struct_rwlock) : RW_WRITE_HELD(&dn->dn_struct_rwlock)); /* * if we have a read-lock, check to see if we need to do any work * before upgrading to a write-lock. */ if (have_read) { if (blkid <= dn->dn_maxblkid) return; if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { rw_exit(&dn->dn_struct_rwlock); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); } } if (blkid <= dn->dn_maxblkid) goto out; dn->dn_maxblkid = blkid; /* * Compute the number of levels necessary to support the new maxblkid. */ new_nlevels = 1; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (sz = dn->dn_nblkptr; sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) new_nlevels++; if (new_nlevels > dn->dn_nlevels) { int old_nlevels = dn->dn_nlevels; dmu_buf_impl_t *db; list_t *list; dbuf_dirty_record_t *new, *dr, *dr_next; dn->dn_nlevels = new_nlevels; ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); dn->dn_next_nlevels[txgoff] = new_nlevels; /* dirty the left indirects */ db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); ASSERT(db != NULL); new = dbuf_dirty(db, tx); dbuf_rele(db, FTAG); /* transfer the dirty records to the new indirect */ mutex_enter(&dn->dn_mtx); mutex_enter(&new->dt.di.dr_mtx); list = &dn->dn_dirty_records[txgoff]; for (dr = list_head(list); dr; dr = dr_next) { dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); if (dr->dr_dbuf->db_level != new_nlevels-1 && dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); list_remove(&dn->dn_dirty_records[txgoff], dr); list_insert_tail(&new->dt.di.dr_children, dr); dr->dr_parent = new; } } mutex_exit(&new->dt.di.dr_mtx); mutex_exit(&dn->dn_mtx); } out: if (have_read) rw_downgrade(&dn->dn_struct_rwlock); } static void dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) { dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); if (db != NULL) { dmu_buf_will_dirty(&db->db, tx); dbuf_rele(db, FTAG); } } void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { dmu_buf_impl_t *db; uint64_t blkoff, blkid, nblks; int blksz, blkshift, head, tail; int trunc = FALSE; int epbs; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); blksz = dn->dn_datablksz; blkshift = dn->dn_datablkshift; epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; if (len == DMU_OBJECT_END) { len = UINT64_MAX - off; trunc = TRUE; } /* * First, block align the region to free: */ if (ISP2(blksz)) { head = P2NPHASE(off, blksz); blkoff = P2PHASE(off, blksz); if ((off >> blkshift) > dn->dn_maxblkid) goto out; } else { ASSERT(dn->dn_maxblkid == 0); if (off == 0 && len >= blksz) { /* * Freeing the whole block; fast-track this request. * Note that we won't dirty any indirect blocks, * which is fine because we will be freeing the entire * file and thus all indirect blocks will be freed * by free_children(). */ blkid = 0; nblks = 1; goto done; } else if (off >= blksz) { /* Freeing past end-of-data */ goto out; } else { /* Freeing part of the block. */ head = blksz - off; ASSERT3U(head, >, 0); } blkoff = off; } /* zero out any partial block data at the start of the range */ if (head) { ASSERT3U(blkoff + head, ==, blksz); if (len < head) head = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off), TRUE, FTAG, &db) == 0) { caddr_t data; /* don't dirty if it isn't on disk and isn't dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); data = db->db.db_data; bzero(data + blkoff, head); } dbuf_rele(db, FTAG); } off += head; len -= head; } /* If the range was less than one block, we're done */ if (len == 0) goto out; /* If the remaining range is past end of file, we're done */ if ((off >> blkshift) > dn->dn_maxblkid) goto out; ASSERT(ISP2(blksz)); if (trunc) tail = 0; else tail = P2PHASE(len, blksz); ASSERT0(P2PHASE(off, blksz)); /* zero out any partial block data at the end of the range */ if (tail) { if (len < tail) tail = len; if (dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, off+len), TRUE, FTAG, &db) == 0) { /* don't dirty if not on disk and not dirty */ if (db->db_last_dirty || (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr))) { rw_exit(&dn->dn_struct_rwlock); dmu_buf_will_dirty(&db->db, tx); rw_enter(&dn->dn_struct_rwlock, RW_WRITER); bzero(db->db.db_data, tail); } dbuf_rele(db, FTAG); } len -= tail; } /* If the range did not include a full block, we are done */ if (len == 0) goto out; ASSERT(IS_P2ALIGNED(off, blksz)); ASSERT(trunc || IS_P2ALIGNED(len, blksz)); blkid = off >> blkshift; nblks = len >> blkshift; if (trunc) nblks += 1; /* * Dirty all the indirect blocks in this range. Note that only * the first and last indirect blocks can actually be written * (if they were partially freed) -- they must be dirtied, even if * they do not exist on disk yet. The interior blocks will * be freed by free_children(), so they will not actually be written. * Even though these interior blocks will not be written, we * dirty them for two reasons: * * - It ensures that the indirect blocks remain in memory until * syncing context. (They have already been prefetched by * dmu_tx_hold_free(), so we don't have to worry about reading * them serially here.) * * - The dirty space accounting will put pressure on the txg sync * mechanism to begin syncing, and to delay transactions if there * is a large amount of freeing. Even though these indirect * blocks will not be written, we could need to write the same * amount of space if we copy the freed BPs into deadlists. */ if (dn->dn_nlevels > 1) { uint64_t first, last; first = blkid >> epbs; dnode_dirty_l1(dn, first, tx); if (trunc) last = dn->dn_maxblkid >> epbs; else last = (blkid + nblks - 1) >> epbs; if (last != first) dnode_dirty_l1(dn, last, tx); int shift = dn->dn_datablkshift + dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (uint64_t i = first + 1; i < last; i++) { /* * Set i to the blockid of the next non-hole * level-1 indirect block at or after i. Note * that dnode_next_offset() operates in terms of * level-0-equivalent bytes. */ uint64_t ibyte = i << shift; int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, &ibyte, 2, 1, 0); i = ibyte >> shift; if (i >= last) break; /* * Normally we should not see an error, either * from dnode_next_offset() or dbuf_hold_level() * (except for ESRCH from dnode_next_offset). * If there is an i/o error, then when we read * this block in syncing context, it will use * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according * to the "failmode" property. dnode_next_offset() * doesn't have a flag to indicate MUSTSUCCEED. */ if (err != 0) break; dnode_dirty_l1(dn, i, tx); } } done: /* * Add this range to the dnode range list. * We will finish up this free operation in the syncing phase. */ mutex_enter(&dn->dn_mtx); int txgoff = tx->tx_txg & TXG_MASK; if (dn->dn_free_ranges[txgoff] == NULL) { dn->dn_free_ranges[txgoff] = range_tree_create(NULL, NULL, &dn->dn_mtx); } range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", blkid, nblks, tx->tx_txg); mutex_exit(&dn->dn_mtx); dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); dnode_setdirty(dn, tx); out: rw_exit(&dn->dn_struct_rwlock); } static boolean_t dnode_spill_freed(dnode_t *dn) { int i; mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) break; } mutex_exit(&dn->dn_mtx); return (i < TXG_SIZE); } /* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid) { void *dp = spa_get_dsl(dn->dn_objset->os_spa); int i; if (blkid == DMU_BONUS_BLKID) return (FALSE); /* * If we're in the process of opening the pool, dp will not be * set yet, but there shouldn't be anything dirty. */ if (dp == NULL) return (FALSE); if (dn->dn_free_txg) return (TRUE); if (blkid == DMU_SPILL_BLKID) return (dnode_spill_freed(dn)); mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { if (dn->dn_free_ranges[i] != NULL && range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) break; } mutex_exit(&dn->dn_mtx); return (i < TXG_SIZE); } /* call from syncing context when we actually write/free space for this dnode */ void dnode_diduse_space(dnode_t *dn, int64_t delta) { uint64_t space; dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", dn, dn->dn_phys, (u_longlong_t)dn->dn_phys->dn_used, (longlong_t)delta); mutex_enter(&dn->dn_mtx); space = DN_USED_BYTES(dn->dn_phys); if (delta > 0) { ASSERT3U(space + delta, >=, space); /* no overflow */ } else { ASSERT3U(space, >=, -delta); /* no underflow */ } space += delta; if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); ASSERT0(P2PHASE(space, 1<dn_phys->dn_used = space >> DEV_BSHIFT; } else { dn->dn_phys->dn_used = space; dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; } mutex_exit(&dn->dn_mtx); } /* * Call when we think we're going to write/free space in open context to track * the amount of memory in use by the currently open txg. */ void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) { objset_t *os = dn->dn_objset; dsl_dataset_t *ds = os->os_dsl_dataset; int64_t aspace = spa_get_asize(os->os_spa, space); if (ds != NULL) { dsl_dir_willuse_space(ds->ds_dir, aspace, tx); dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); } dmu_tx_willuse_space(tx, aspace); } /* * Scans a block at the indicated "level" looking for a hole or data, * depending on 'flags'. * * If level > 0, then we are scanning an indirect block looking at its * pointers. If level == 0, then we are looking at a block of dnodes. * * If we don't find what we are looking for in the block, we return ESRCH. * Otherwise, return with *offset pointing to the beginning (if searching * forwards) or end (if searching backwards) of the range covered by the * block pointer we matched on (or dnode). * * The basic search algorithm used below by dnode_next_offset() is to * use this function to search up the block tree (widen the search) until * we find something (i.e., we don't return ESRCH) and then search back * down the tree (narrow the search) until we reach our original search * level. */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) { dmu_buf_impl_t *db = NULL; void *data = NULL; uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; uint64_t epb = 1ULL << epbs; uint64_t minfill, maxfill; boolean_t hole; int i, inc, error, span; dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); if (lvl == dn->dn_phys->dn_nlevels) { error = 0; epb = dn->dn_phys->dn_nblkptr; data = dn->dn_phys->dn_blkptr; } else { uint64_t blkid = dbuf_whichblock(dn, *offset) >> (epbs * lvl); error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FTAG, &db); if (error) { if (error != ENOENT) return (error); if (hole) return (0); /* * This can only happen when we are searching up * the block tree for data. We don't really need to * adjust the offset, as we will just end up looking * at the pointer to this block in its parent, and its * going to be unallocated, so we will skip over it. */ return (SET_ERROR(ESRCH)); } error = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_HAVESTRUCT); if (error) { dbuf_rele(db, FTAG); return (error); } data = db->db.db_data; } if (db != NULL && txg != 0 && (db->db_blkptr == NULL || db->db_blkptr->blk_birth <= txg || BP_IS_HOLE(db->db_blkptr))) { /* * This can only happen when we are searching up the tree * and these conditions mean that we need to keep climbing. */ error = SET_ERROR(ESRCH); } else if (lvl == 0) { dnode_phys_t *dnp = data; span = DNODE_SHIFT; ASSERT(dn->dn_type == DMU_OT_DNODE); for (i = (*offset >> span) & (blkfill - 1); i >= 0 && i < blkfill; i += inc) { if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; *offset += (1ULL << span) * inc; } if (i < 0 || i == blkfill) error = SET_ERROR(ESRCH); } else { blkptr_t *bp = data; uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); if (hole) maxfill--; else minfill++; *offset = *offset >> span; for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { if (BP_GET_FILL(&bp[i]) >= minfill && BP_GET_FILL(&bp[i]) <= maxfill && (hole || bp[i].blk_birth > txg)) break; if (inc > 0 || *offset > 0) *offset += inc; } *offset = *offset << span; if (inc < 0) { /* traversing backwards; position offset at the end */ ASSERT3U(*offset, <=, start); *offset = MIN(*offset + (1ULL << span) - 1, start); } else if (*offset < start) { *offset = start; } if (i < 0 || i >= epb) error = SET_ERROR(ESRCH); } if (db) dbuf_rele(db, FTAG); return (error); } /* * Find the next hole, data, or sparse region at or after *offset. * The value 'blkfill' tells us how many items we expect to find * in an L0 data block; this value is 1 for normal objects, * DNODES_PER_BLOCK for the meta dnode, and some fraction of * DNODES_PER_BLOCK when searching for sparse regions thereof. * * Examples: * * dnode_next_offset(dn, flags, offset, 1, 1, 0); * Finds the next/previous hole/data in a file. * Used in dmu_offset_next(). * * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); * Finds the next free/allocated dnode an objset's meta-dnode. * Only finds objects that have new contents since txg (ie. * bonus buffer changes and content removal are ignored). * Used in dmu_object_next(). * * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); * Finds the next L2 meta-dnode bp that's at most 1/4 full. * Used in dmu_object_alloc(). */ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, int minlvl, uint64_t blkfill, uint64_t txg) { uint64_t initial_offset = *offset; int lvl, maxlvl; int error = 0; if (!(flags & DNODE_FIND_HAVELOCK)) rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_phys->dn_nlevels == 0) { error = SET_ERROR(ESRCH); goto out; } if (dn->dn_datablkshift == 0) { if (*offset < dn->dn_datablksz) { if (flags & DNODE_FIND_HOLE) *offset = dn->dn_datablksz; } else { error = SET_ERROR(ESRCH); } goto out; } maxlvl = dn->dn_phys->dn_nlevels; for (lvl = minlvl; lvl <= maxlvl; lvl++) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); if (error != ESRCH) break; } while (error == 0 && --lvl >= minlvl) { error = dnode_next_offset_level(dn, flags, offset, lvl, blkfill, txg); } /* * There's always a "virtual hole" at the end of the object, even * if all BP's which physically exist are non-holes. */ if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { error = 0; } if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? initial_offset < *offset : initial_offset > *offset)) error = SET_ERROR(ESRCH); out: if (!(flags & DNODE_FIND_HAVELOCK)) rw_exit(&dn->dn_struct_rwlock); return (error); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c (revision 286575) @@ -1,734 +1,722 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include static void dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) { dmu_buf_impl_t *db; int txgoff = tx->tx_txg & TXG_MASK; int nblkptr = dn->dn_phys->dn_nblkptr; int old_toplvl = dn->dn_phys->dn_nlevels - 1; int new_level = dn->dn_next_nlevels[txgoff]; int i; rw_enter(&dn->dn_struct_rwlock, RW_WRITER); /* this dnode can't be paged out because it's dirty */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); ASSERT(db != NULL); dn->dn_phys->dn_nlevels = new_level; dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, dn->dn_object, dn->dn_phys->dn_nlevels); /* check for existing blkptrs in the dnode */ for (i = 0; i < nblkptr; i++) if (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[i])) break; if (i != nblkptr) { /* transfer dnode's block pointers to new indirect block */ (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); ASSERT(db->db.db_data); ASSERT(arc_released(db->db_buf)); ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, sizeof (blkptr_t) * nblkptr); arc_buf_freeze(db->db_buf); } /* set dbuf's parent pointers to new indirect buf */ for (i = 0; i < nblkptr; i++) { dmu_buf_impl_t *child = dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); if (child == NULL) continue; #ifdef DEBUG DB_DNODE_ENTER(child); ASSERT3P(DB_DNODE(child), ==, dn); DB_DNODE_EXIT(child); #endif /* DEBUG */ if (child->db_parent && child->db_parent != dn->dn_dbuf) { ASSERT(child->db_parent->db_level == db->db_level); ASSERT(child->db_blkptr != &dn->dn_phys->dn_blkptr[child->db_blkid]); mutex_exit(&child->db_mtx); continue; } ASSERT(child->db_parent == NULL || child->db_parent == dn->dn_dbuf); child->db_parent = db; dbuf_add_ref(db, child); if (db->db.db_data) child->db_blkptr = (blkptr_t *)db->db.db_data + i; else child->db_blkptr = NULL; dprintf_dbuf_bp(child, child->db_blkptr, "changed db_blkptr to new indirect %s", ""); mutex_exit(&child->db_mtx); } bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); dbuf_rele(db, FTAG); rw_exit(&dn->dn_struct_rwlock); } static void free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) { dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint64_t bytesfreed = 0; dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); for (int i = 0; i < num; i++, bp++) { if (BP_IS_HOLE(bp)) continue; bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); /* * Save some useful information on the holes being * punched, including logical size, type, and indirection * level. Retaining birth time enables detection of when * holes are punched for reducing the number of free * records transmitted during a zfs send. */ uint64_t lsize = BP_GET_LSIZE(bp); dmu_object_type_t type = BP_GET_TYPE(bp); uint64_t lvl = BP_GET_LEVEL(bp); bzero(bp, sizeof (blkptr_t)); if (spa_feature_is_active(dn->dn_objset->os_spa, SPA_FEATURE_HOLE_BIRTH)) { BP_SET_LSIZE(bp, lsize); BP_SET_TYPE(bp, type); BP_SET_LEVEL(bp, lvl); BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0); } } dnode_diduse_space(dn, -bytesfreed); } #ifdef ZFS_DEBUG static void free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) { int off, num; int i, err, epbs; uint64_t txg = tx->tx_txg; dnode_t *dn; DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; off = start - (db->db_blkid * 1<=, 0); ASSERT3U(num, >=, 0); ASSERT3U(db->db_level, >, 0); ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); ASSERT(db->db_blkptr != NULL); for (i = off; i < off+num; i++) { uint64_t *buf; dmu_buf_impl_t *child; dbuf_dirty_record_t *dr; int j; ASSERT(db->db_level == 1); rw_enter(&dn->dn_struct_rwlock, RW_READER); err = dbuf_hold_impl(dn, db->db_level-1, (db->db_blkid << epbs) + i, TRUE, FTAG, &child); rw_exit(&dn->dn_struct_rwlock); if (err == ENOENT) continue; ASSERT(err == 0); ASSERT(child->db_level == 0); dr = child->db_last_dirty; while (dr && dr->dr_txg > txg) dr = dr->dr_next; ASSERT(dr == NULL || dr->dr_txg == txg); /* data_old better be zeroed */ if (dr) { buf = dr->dt.dl.dr_data->b_data; for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", (void *)child, i, off, num); } } } /* * db_data better be zeroed unless it's dirty in a * future txg. */ mutex_enter(&child->db_mtx); buf = child->db.db_data; if (buf != NULL && child->db_state != DB_FILL && child->db_last_dirty == NULL) { for (j = 0; j < child->db.db_size >> 3; j++) { if (buf[j] != 0) { panic("freed data not zero: " "child=%p i=%d off=%d num=%d\n", (void *)child, i, off, num); } } } mutex_exit(&child->db_mtx); dbuf_rele(child, FTAG); } DB_DNODE_EXIT(db); } #endif static void free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) { dnode_t *dn; blkptr_t *bp; dmu_buf_impl_t *subdb; uint64_t start, end, dbstart, dbend, i; int epbs, shift; /* * There is a small possibility that this block will not be cached: * 1 - if level > 1 and there are no children with level <= 1 * 2 - if this block was evicted since we read it from * dmu_tx_hold_free(). */ if (db->db_state != DB_CACHED) (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); dbuf_release_bp(db); bp = db->db.db_data; DB_DNODE_ENTER(db); dn = DB_DNODE(db); epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; shift = (db->db_level - 1) * epbs; dbstart = db->db_blkid << epbs; start = blkid >> shift; if (dbstart < start) { bp += start - dbstart; } else { start = dbstart; } dbend = ((db->db_blkid + 1) << epbs) - 1; end = (blkid + nblks - 1) >> shift; if (dbend <= end) end = dbend; ASSERT3U(start, <=, end); if (db->db_level == 1) { FREE_VERIFY(db, start, end, tx); free_blocks(dn, bp, end-start+1, tx); } else { for (i = start; i <= end; i++, bp++) { if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, i, B_TRUE, FTAG, &subdb)); rw_exit(&dn->dn_struct_rwlock); ASSERT3P(bp, ==, subdb->db_blkptr); free_children(subdb, blkid, nblks, tx); dbuf_rele(subdb, FTAG); } } /* If this whole block is free, free ourself too. */ for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) { if (!BP_IS_HOLE(bp)) break; } if (i == 1 << epbs) { /* didn't find any non-holes */ bzero(db->db.db_data, db->db.db_size); free_blocks(dn, db->db_blkptr, 1, tx); } else { /* * Partial block free; must be marked dirty so that it * will be written out. */ ASSERT(db->db_dirtycnt > 0); } DB_DNODE_EXIT(db); arc_buf_freeze(db->db_buf); } /* * Traverse the indicated range of the provided file * and "free" all the blocks contained there. */ static void dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) { blkptr_t *bp = dn->dn_phys->dn_blkptr; int dnlevel = dn->dn_phys->dn_nlevels; boolean_t trunc = B_FALSE; if (blkid > dn->dn_phys->dn_maxblkid) return; ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); if (blkid + nblks > dn->dn_phys->dn_maxblkid) { nblks = dn->dn_phys->dn_maxblkid - blkid + 1; trunc = B_TRUE; } /* There are no indirect blocks in the object */ if (dnlevel == 1) { if (blkid >= dn->dn_phys->dn_nblkptr) { /* this range was never made persistent */ return; } ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); free_blocks(dn, bp + blkid, nblks, tx); } else { int shift = (dnlevel - 1) * (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); int start = blkid >> shift; int end = (blkid + nblks - 1) >> shift; dmu_buf_impl_t *db; ASSERT(start < dn->dn_phys->dn_nblkptr); bp += start; for (int i = start; i <= end; i++, bp++) { if (BP_IS_HOLE(bp)) continue; rw_enter(&dn->dn_struct_rwlock, RW_READER); VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, TRUE, FTAG, &db)); rw_exit(&dn->dn_struct_rwlock); free_children(db, blkid, nblks, tx); dbuf_rele(db, FTAG); } } if (trunc) { dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; uint64_t off = (dn->dn_phys->dn_maxblkid + 1) * (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT(off < dn->dn_phys->dn_maxblkid || dn->dn_phys->dn_maxblkid == 0 || dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); } } typedef struct dnode_sync_free_range_arg { dnode_t *dsfra_dnode; dmu_tx_t *dsfra_tx; } dnode_sync_free_range_arg_t; static void dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) { dnode_sync_free_range_arg_t *dsfra = arg; dnode_t *dn = dsfra->dsfra_dnode; mutex_exit(&dn->dn_mtx); dnode_sync_free_range_impl(dn, blkid, nblks, dsfra->dsfra_tx); mutex_enter(&dn->dn_mtx); } /* * Try to kick all the dnode's dbufs out of the cache... */ void dnode_evict_dbufs(dnode_t *dn) { - int progress; - int pass = 0; + dmu_buf_impl_t db_marker; + dmu_buf_impl_t *db, *db_next; - do { - dmu_buf_impl_t *db, *db_next; - int evicting = FALSE; + mutex_enter(&dn->dn_dbufs_mtx); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { - progress = FALSE; - mutex_enter(&dn->dn_dbufs_mtx); - for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { - db_next = AVL_NEXT(&dn->dn_dbufs, db); #ifdef DEBUG - DB_DNODE_ENTER(db); - ASSERT3P(DB_DNODE(db), ==, dn); - DB_DNODE_EXIT(db); + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); #endif /* DEBUG */ - mutex_enter(&db->db_mtx); - if (db->db_state == DB_EVICTING) { - progress = TRUE; - evicting = TRUE; - mutex_exit(&db->db_mtx); - } else if (refcount_is_zero(&db->db_holds)) { - progress = TRUE; - dbuf_clear(db); /* exits db_mtx for us */ - } else { - mutex_exit(&db->db_mtx); - } + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING && + refcount_is_zero(&db->db_holds)) { + db_marker.db_level = db->db_level; + db_marker.db_blkid = db->db_blkid; + db_marker.db_state = DB_SEARCH; + avl_insert_here(&dn->dn_dbufs, &db_marker, db, + AVL_BEFORE); + dbuf_clear(db); + + db_next = AVL_NEXT(&dn->dn_dbufs, &db_marker); + avl_remove(&dn->dn_dbufs, &db_marker); + } else { + mutex_exit(&db->db_mtx); + db_next = AVL_NEXT(&dn->dn_dbufs, db); } - /* - * NB: we need to drop dn_dbufs_mtx between passes so - * that any DB_EVICTING dbufs can make progress. - * Ideally, we would have some cv we could wait on, but - * since we don't, just wait a bit to give the other - * thread a chance to run. - */ - mutex_exit(&dn->dn_dbufs_mtx); - if (evicting) - delay(1); - pass++; - ASSERT(pass < 100); /* sanity check */ - } while (progress); + } + mutex_exit(&dn->dn_dbufs_mtx); dnode_evict_bonus(dn); } void dnode_evict_bonus(dnode_t *dn) { rw_enter(&dn->dn_struct_rwlock, RW_WRITER); if (dn->dn_bonus && refcount_is_zero(&dn->dn_bonus->db_holds)) { mutex_enter(&dn->dn_bonus->db_mtx); dbuf_evict(dn->dn_bonus); dn->dn_bonus = NULL; } rw_exit(&dn->dn_struct_rwlock); } static void dnode_undirty_dbufs(list_t *list) { dbuf_dirty_record_t *dr; while (dr = list_head(list)) { dmu_buf_impl_t *db = dr->dr_dbuf; uint64_t txg = dr->dr_txg; if (db->db_level != 0) dnode_undirty_dbufs(&dr->dt.di.dr_children); mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? */ list_remove(list, dr); ASSERT(db->db_last_dirty == dr); db->db_last_dirty = NULL; db->db_dirtycnt -= 1; if (db->db_level == 0) { ASSERT(db->db_blkid == DMU_BONUS_BLKID || dr->dt.dl.dr_data == db->db_buf); dbuf_unoverride(dr); } else { mutex_destroy(&dr->dt.di.dr_mtx); list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); } } static void dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) { int txgoff = tx->tx_txg & TXG_MASK; ASSERT(dmu_tx_is_syncing(tx)); /* * Our contents should have been freed in dnode_sync() by the * free range record inserted by the caller of dnode_free(). */ ASSERT0(DN_USED_BYTES(dn->dn_phys)); ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); dnode_evict_dbufs(dn); ASSERT(avl_is_empty(&dn->dn_dbufs)); - ASSERT3P(dn->dn_bonus, ==, NULL); /* * XXX - It would be nice to assert this, but we may still * have residual holds from async evictions from the arc... * * zfs_obj_to_path() also depends on this being * commented out. * * ASSERT3U(refcount_count(&dn->dn_holds), ==, 1); */ /* Undirty next bits */ dn->dn_next_nlevels[txgoff] = 0; dn->dn_next_indblkshift[txgoff] = 0; dn->dn_next_blksz[txgoff] = 0; /* ASSERT(blkptrs are zero); */ ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); ASSERT(dn->dn_type != DMU_OT_NONE); ASSERT(dn->dn_free_txg > 0); if (dn->dn_allocated_txg != dn->dn_free_txg) dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); bzero(dn->dn_phys, sizeof (dnode_phys_t)); mutex_enter(&dn->dn_mtx); dn->dn_type = DMU_OT_NONE; dn->dn_maxblkid = 0; dn->dn_allocated_txg = 0; dn->dn_free_txg = 0; dn->dn_have_spill = B_FALSE; mutex_exit(&dn->dn_mtx); ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); /* * Now that we've released our hold, the dnode may * be evicted, so we musn't access it. */ } /* * Write out the dnode's dirty buffers. */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) { dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; static const dnode_phys_t zerodn = { 0 }; boolean_t kill_spill = B_FALSE; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); ASSERT(dnp->dn_type != DMU_OT_NONE || bcmp(dnp, &zerodn, DNODE_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); if (dmu_objset_userused_enabled(dn->dn_objset) && !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { mutex_enter(&dn->dn_mtx); dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); dn->dn_oldflags = dn->dn_phys->dn_flags; dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; mutex_exit(&dn->dn_mtx); dmu_objset_userquota_get_ids(dn, B_FALSE, tx); } else { /* Once we account for it, we should always account for it. */ ASSERT(!(dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED)); } mutex_enter(&dn->dn_mtx); if (dn->dn_allocated_txg == tx->tx_txg) { /* The dnode is newly allocated or reallocated */ if (dnp->dn_type == DMU_OT_NONE) { /* this is a first alloc, not a realloc */ dnp->dn_nlevels = 1; dnp->dn_nblkptr = dn->dn_nblkptr; } dnp->dn_type = dn->dn_type; dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; } ASSERT(dnp->dn_nlevels > 1 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || BP_GET_LSIZE(&dnp->dn_blkptr[0]) == dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); ASSERT(dnp->dn_nlevels < 2 || BP_IS_HOLE(&dnp->dn_blkptr[0]) || BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift); if (dn->dn_next_type[txgoff] != 0) { dnp->dn_type = dn->dn_type; dn->dn_next_type[txgoff] = 0; } if (dn->dn_next_blksz[txgoff] != 0) { ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], SPA_MINBLOCKSIZE) == 0); ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || dn->dn_maxblkid == 0 || list_head(list) != NULL || dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT == dnp->dn_datablkszsec || range_tree_space(dn->dn_free_ranges[txgoff]) != 0); dnp->dn_datablkszsec = dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; dn->dn_next_blksz[txgoff] = 0; } if (dn->dn_next_bonuslen[txgoff] != 0) { if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) dnp->dn_bonuslen = 0; else dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; ASSERT(dnp->dn_bonuslen <= DN_MAX_BONUSLEN); dn->dn_next_bonuslen[txgoff] = 0; } if (dn->dn_next_bonustype[txgoff] != 0) { ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; dn->dn_next_bonustype[txgoff] = 0; } boolean_t freeing_dnode = dn->dn_free_txg > 0 && dn->dn_free_txg <= tx->tx_txg; /* * Remove the spill block if we have been explicitly asked to * remove it, or if the object is being removed. */ if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) { if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) kill_spill = B_TRUE; dn->dn_rm_spillblk[txgoff] = 0; } if (dn->dn_next_indblkshift[txgoff] != 0) { ASSERT(dnp->dn_nlevels == 1); dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; dn->dn_next_indblkshift[txgoff] = 0; } /* * Just take the live (open-context) values for checksum and compress. * Strictly speaking it's a future leak, but nothing bad happens if we * start using the new checksum or compress algorithm a little early. */ dnp->dn_checksum = dn->dn_checksum; dnp->dn_compress = dn->dn_compress; mutex_exit(&dn->dn_mtx); if (kill_spill) { free_blocks(dn, &dn->dn_phys->dn_spill, 1, tx); mutex_enter(&dn->dn_mtx); dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; mutex_exit(&dn->dn_mtx); } /* process all the "freed" ranges in the file */ if (dn->dn_free_ranges[txgoff] != NULL) { dnode_sync_free_range_arg_t dsfra; dsfra.dsfra_dnode = dn; dsfra.dsfra_tx = tx; mutex_enter(&dn->dn_mtx); range_tree_vacate(dn->dn_free_ranges[txgoff], dnode_sync_free_range, &dsfra); range_tree_destroy(dn->dn_free_ranges[txgoff]); dn->dn_free_ranges[txgoff] = NULL; mutex_exit(&dn->dn_mtx); } if (freeing_dnode) { dnode_sync_free(dn, tx); return; } if (dn->dn_next_nlevels[txgoff]) { dnode_increase_indirection(dn, tx); dn->dn_next_nlevels[txgoff] = 0; } if (dn->dn_next_nblkptr[txgoff]) { /* this should only happen on a realloc */ ASSERT(dn->dn_allocated_txg == tx->tx_txg); if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { /* zero the new blkptrs we are gaining */ bzero(dnp->dn_blkptr + dnp->dn_nblkptr, sizeof (blkptr_t) * (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); #ifdef ZFS_DEBUG } else { int i; ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); /* the blkptrs we are losing better be unallocated */ for (i = dn->dn_next_nblkptr[txgoff]; i < dnp->dn_nblkptr; i++) ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); #endif } mutex_enter(&dn->dn_mtx); dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; dn->dn_next_nblkptr[txgoff] = 0; mutex_exit(&dn->dn_mtx); } dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ASSERT3P(list_head(list), ==, NULL); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } /* * Although we have dropped our reference to the dnode, it * can't be evicted until its written, and we haven't yet * initiated the IO for the dnode's dbuf. */ } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_bookmark.c (revision 286575) @@ -1,457 +1,457 @@ /* * CDDL HEADER START * * This file and its contents are supplied under the terms of the * Common Development and Distribution License ("CDDL"), version 1.0. * You may only use this file in accordance with the terms of version * 1.0 of the CDDL. * * A full copy of the text of the CDDL should have accompanied this * source. A copy of the CDDL is also available via the Internet at * http://www.illumos.org/license/CDDL. * * CDDL HEADER END */ /* * Copyright (c) 2013, 2014 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include static int dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, dsl_dataset_t **dsp, void *tag, char **shortnamep) { char buf[MAXNAMELEN]; char *hashp; if (strlen(fullname) >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); hashp = strchr(fullname, '#'); if (hashp == NULL) return (SET_ERROR(EINVAL)); *shortnamep = hashp + 1; if (zfs_component_namecheck(*shortnamep, NULL, NULL)) return (SET_ERROR(EINVAL)); (void) strlcpy(buf, fullname, hashp - fullname + 1); return (dsl_dataset_hold(dp, buf, tag, dsp)); } /* * Returns ESRCH if bookmark is not found. */ static int dsl_dataset_bmark_lookup(dsl_dataset_t *ds, const char *shortname, zfs_bookmark_phys_t *bmark_phys) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t bmark_zapobj = ds->ds_bookmarks; matchtype_t mt; int err; if (bmark_zapobj == 0) return (SET_ERROR(ESRCH)); if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0, NULL); return (err == ENOENT ? ESRCH : err); } /* * If later_ds is non-NULL, this will return EXDEV if the the specified bookmark * does not represents an earlier point in later_ds's timeline. * * Returns ENOENT if the dataset containing the bookmark does not exist. * Returns ESRCH if the dataset exists but the bookmark was not found in it. */ int dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp) { char *shortname; dsl_dataset_t *ds; int error; error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname); if (error != 0) return (error); error = dsl_dataset_bmark_lookup(ds, shortname, bmp); if (error == 0 && later_ds != NULL) { if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg)) error = SET_ERROR(EXDEV); } dsl_dataset_rele(ds, FTAG); return (error); } typedef struct dsl_bookmark_create_arg { nvlist_t *dbca_bmarks; nvlist_t *dbca_errors; } dsl_bookmark_create_arg_t; static int dsl_bookmark_create_check_impl(dsl_dataset_t *snapds, const char *bookmark_name, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *bmark_fs; char *shortname; int error; zfs_bookmark_phys_t bmark_phys; - if (!dsl_dataset_is_snapshot(snapds)) + if (!snapds->ds_is_snapshot) return (SET_ERROR(EINVAL)); error = dsl_bookmark_hold_ds(dp, bookmark_name, &bmark_fs, FTAG, &shortname); if (error != 0) return (error); if (!dsl_dataset_is_before(bmark_fs, snapds, 0)) { dsl_dataset_rele(bmark_fs, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_bmark_lookup(bmark_fs, shortname, &bmark_phys); dsl_dataset_rele(bmark_fs, FTAG); if (error == 0) return (SET_ERROR(EEXIST)); if (error == ESRCH) return (0); return (error); } static int dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) { dsl_bookmark_create_arg_t *dbca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); int rv = 0; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) return (SET_ERROR(ENOTSUP)); for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { dsl_dataset_t *snapds; int error; /* note: validity of nvlist checked by ioctl layer */ error = dsl_dataset_hold(dp, fnvpair_value_string(pair), FTAG, &snapds); if (error == 0) { error = dsl_bookmark_create_check_impl(snapds, nvpair_name(pair), tx); dsl_dataset_rele(snapds, FTAG); } if (error != 0) { fnvlist_add_int32(dbca->dbca_errors, nvpair_name(pair), error); rv = error; } } return (rv); } static void dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) { dsl_bookmark_create_arg_t *dbca = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)); for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { dsl_dataset_t *snapds, *bmark_fs; zfs_bookmark_phys_t bmark_phys; char *shortname; VERIFY0(dsl_dataset_hold(dp, fnvpair_value_string(pair), FTAG, &snapds)); VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), &bmark_fs, FTAG, &shortname)); if (bmark_fs->ds_bookmarks == 0) { bmark_fs->ds_bookmarks = zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); dsl_dataset_zapify(bmark_fs, tx); VERIFY0(zap_add(mos, bmark_fs->ds_object, DS_FIELD_BOOKMARK_NAMES, sizeof (bmark_fs->ds_bookmarks), 1, &bmark_fs->ds_bookmarks, tx)); } bmark_phys.zbm_guid = dsl_dataset_phys(snapds)->ds_guid; bmark_phys.zbm_creation_txg = dsl_dataset_phys(snapds)->ds_creation_txg; bmark_phys.zbm_creation_time = dsl_dataset_phys(snapds)->ds_creation_time; VERIFY0(zap_add(mos, bmark_fs->ds_bookmarks, shortname, sizeof (uint64_t), sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), &bmark_phys, tx)); spa_history_log_internal_ds(bmark_fs, "bookmark", tx, "name=%s creation_txg=%llu target_snap=%llu", shortname, (longlong_t)bmark_phys.zbm_creation_txg, (longlong_t)snapds->ds_object); dsl_dataset_rele(bmark_fs, FTAG); dsl_dataset_rele(snapds, FTAG); } } /* * The bookmarks must all be in the same pool. */ int dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors) { nvpair_t *pair; dsl_bookmark_create_arg_t dbca; pair = nvlist_next_nvpair(bmarks, NULL); if (pair == NULL) return (0); dbca.dbca_bmarks = bmarks; dbca.dbca_errors = errors; return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check, dsl_bookmark_create_sync, &dbca, fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL)); } int dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) { int err = 0; zap_cursor_t zc; zap_attribute_t attr; dsl_pool_t *dp = ds->ds_dir->dd_pool; uint64_t bmark_zapobj = ds->ds_bookmarks; if (bmark_zapobj == 0) return (0); for (zap_cursor_init(&zc, dp->dp_meta_objset, bmark_zapobj); zap_cursor_retrieve(&zc, &attr) == 0; zap_cursor_advance(&zc)) { char *bmark_name = attr.za_name; zfs_bookmark_phys_t bmark_phys; err = dsl_dataset_bmark_lookup(ds, bmark_name, &bmark_phys); ASSERT3U(err, !=, ENOENT); if (err != 0) break; nvlist_t *out_props = fnvlist_alloc(); if (nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_GUID))) { dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_GUID, bmark_phys.zbm_guid); } if (nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_CREATETXG))) { dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_CREATETXG, bmark_phys.zbm_creation_txg); } if (nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_CREATION))) { dsl_prop_nvlist_add_uint64(out_props, ZFS_PROP_CREATION, bmark_phys.zbm_creation_time); } fnvlist_add_nvlist(outnvl, bmark_name, out_props); fnvlist_free(out_props); } zap_cursor_fini(&zc); return (err); } /* * Retrieve the bookmarks that exist in the specified dataset, and the * requested properties of each bookmark. * * The "props" nvlist specifies which properties are requested. * See lzc_get_bookmarks() for the list of valid properties. */ int dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(dsname, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_hold(dp, dsname, FTAG, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } err = dsl_get_bookmarks_impl(ds, props, outnvl); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (err); } typedef struct dsl_bookmark_destroy_arg { nvlist_t *dbda_bmarks; nvlist_t *dbda_success; nvlist_t *dbda_errors; } dsl_bookmark_destroy_arg_t; static int dsl_dataset_bookmark_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t bmark_zapobj = ds->ds_bookmarks; matchtype_t mt; if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; return (zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); } static int dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) { dsl_bookmark_destroy_arg_t *dbda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); int rv = 0; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) return (0); for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) { const char *fullname = nvpair_name(pair); dsl_dataset_t *ds; zfs_bookmark_phys_t bm; int error; char *shortname; error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname); if (error == ENOENT) { /* ignore it; the bookmark is "already destroyed" */ continue; } if (error == 0) { error = dsl_dataset_bmark_lookup(ds, shortname, &bm); dsl_dataset_rele(ds, FTAG); if (error == ESRCH) { /* * ignore it; the bookmark is * "already destroyed" */ continue; } } if (error == 0) { fnvlist_add_boolean(dbda->dbda_success, fullname); } else { fnvlist_add_int32(dbda->dbda_errors, fullname, error); rv = error; } } return (rv); } static void dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx) { dsl_bookmark_destroy_arg_t *dbda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL); pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) { dsl_dataset_t *ds; char *shortname; uint64_t zap_cnt; VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), &ds, FTAG, &shortname)); VERIFY0(dsl_dataset_bookmark_remove(ds, shortname, tx)); /* * If all of this dataset's bookmarks have been destroyed, * free the zap object and decrement the feature's use count. */ VERIFY0(zap_count(mos, ds->ds_bookmarks, &zap_cnt)); if (zap_cnt == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); ds->ds_bookmarks = 0; spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); VERIFY0(zap_remove(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, tx)); } spa_history_log_internal_ds(ds, "remove bookmark", tx, "name=%s", shortname); dsl_dataset_rele(ds, FTAG); } } /* * The bookmarks must all be in the same pool. */ int dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors) { int rv; dsl_bookmark_destroy_arg_t dbda; nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); if (pair == NULL) return (0); dbda.dbda_bmarks = bmarks; dbda.dbda_errors = errors; dbda.dbda_success = fnvlist_alloc(); rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check, dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_RESERVED); fnvlist_free(dbda.dbda_success); return (rv); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dataset.c (revision 286575) @@ -1,3454 +1,3459 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright (c) 2011 Martin Matuska * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2014 RackTop Systems. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include SYSCTL_DECL(_vfs_zfs); /* * The SPA supports block sizes up to 16MB. However, very large blocks * can have an impact on i/o latency (e.g. tying up a spinning disk for * ~300ms), and also potentially on the memory allocator. Therefore, * we do not allow the recordsize to be set larger than zfs_max_recordsize * (default 1MB). Larger blocks can be created by changing this tunable, * and pools with larger blocks can always be imported and used, regardless * of this setting. */ int zfs_max_recordsize = 1 * 1024 * 1024; SYSCTL_INT(_vfs_zfs, OID_AUTO, max_recordsize, CTLFLAG_RWTUN, &zfs_max_recordsize, 0, "Maximum block size. Expect dragons when tuning this."); #define SWITCH64(x, y) \ { \ uint64_t __tmp = (x); \ (x) = (y); \ (y) = __tmp; \ } #define DS_REF_MAX (1ULL << 62) extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds); -extern inline boolean_t dsl_dataset_is_snapshot(dsl_dataset_t *ds); /* * Figure out how much of this delta should be propogated to the dsl_dir * layer. If there's a refreservation, that space has already been * partially accounted for in our ancestors. */ static int64_t parent_delta(dsl_dataset_t *ds, int64_t delta) { dsl_dataset_phys_t *ds_phys; uint64_t old_bytes, new_bytes; if (ds->ds_reserved == 0) return (delta); ds_phys = dsl_dataset_phys(ds); old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved); new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved); ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); return (new_bytes - old_bytes); } void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; dprintf_bp(bp, "ds=%p", ds); ASSERT(dmu_tx_is_syncing(tx)); /* It could have been compressed away to nothing */ if (BP_IS_HOLE(bp)) return; ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); if (ds == NULL) { dsl_pool_mos_diduse_space(tx->tx_pool, used, compressed, uncompressed); return; } dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_lock); delta = parent_delta(ds, used); dsl_dataset_phys(ds)->ds_referenced_bytes += used; dsl_dataset_phys(ds)->ds_compressed_bytes += compressed; dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; dsl_dataset_phys(ds)->ds_unique_bytes += used; if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) ds->ds_need_large_blocks = B_TRUE; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, compressed, uncompressed, tx); dsl_dir_transfer_space(ds->ds_dir, used - delta, DD_USED_REFRSRV, DD_USED_HEAD, NULL); } int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async) { int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); if (BP_IS_HOLE(bp)) return (0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(bp->blk_birth <= tx->tx_txg); if (ds == NULL) { dsl_free(tx->tx_pool, tx->tx_txg, bp); dsl_pool_mos_diduse_space(tx->tx_pool, -used, -compressed, -uncompressed); return (used); } ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); - ASSERT(!dsl_dataset_is_snapshot(ds)); + ASSERT(!ds->ds_is_snapshot); dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { int64_t delta; dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_lock); ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || !DS_UNIQUE_IS_ACCURATE(ds)); delta = parent_delta(ds, -used); dsl_dataset_phys(ds)->ds_unique_bytes -= used; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, -compressed, -uncompressed, tx); dsl_dir_transfer_space(ds->ds_dir, -used - delta, DD_USED_REFRSRV, DD_USED_HEAD, NULL); } else { dprintf_bp(bp, "putting on dead list: %s", ""); if (async) { /* * We are here as part of zio's write done callback, * which means we're a zio interrupt thread. We can't * call dsl_deadlist_insert() now because it may block * waiting for I/O. Instead, put bp on the deferred * queue and let dsl_pool_sync() finish the job. */ bplist_append(&ds->ds_pending_deadlist, bp); } else { dsl_deadlist_insert(&ds->ds_deadlist, bp, tx); } ASSERT3U(ds->ds_prev->ds_object, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj); ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object && bp->blk_birth > dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); mutex_enter(&ds->ds_prev->ds_lock); dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; mutex_exit(&ds->ds_prev->ds_lock); } if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { dsl_dir_transfer_space(ds->ds_dir, used, DD_USED_HEAD, DD_USED_SNAP, tx); } } mutex_enter(&ds->ds_lock); ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used); dsl_dataset_phys(ds)->ds_referenced_bytes -= used; ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed); dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed; ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed); dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed; mutex_exit(&ds->ds_lock); return (used); } uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) { uint64_t trysnap = 0; if (ds == NULL) return (0); /* * The snapshot creation could fail, but that would cause an * incorrect FALSE return, which would only result in an * overestimation of the amount of space that an operation would * consume, which is OK. * * There's also a small window where we could miss a pending * snapshot, because we could set the sync task in the quiescing * phase. So this should only be used as a guess. */ if (ds->ds_trysnap_txg > spa_last_synced_txg(ds->ds_dir->dd_pool->dp_spa)) trysnap = ds->ds_trysnap_txg; return (MAX(dsl_dataset_phys(ds)->ds_prev_snap_txg, trysnap)); } boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, uint64_t blk_birth) { if (blk_birth <= dsl_dataset_prev_snap_txg(ds) || (bp != NULL && BP_IS_HOLE(bp))) return (B_FALSE); ddt_prefetch(dsl_dataset_get_spa(ds), bp); return (B_TRUE); } -/* ARGSUSED */ static void -dsl_dataset_evict(dmu_buf_t *db, void *dsv) +dsl_dataset_evict(void *dbu) { - dsl_dataset_t *ds = dsv; + dsl_dataset_t *ds = dbu; ASSERT(ds->ds_owner == NULL); + ds->ds_dbuf = NULL; + unique_remove(ds->ds_fsid_guid); if (ds->ds_objset != NULL) dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { dsl_dataset_rele(ds->ds_prev, ds); ds->ds_prev = NULL; } bplist_destroy(&ds->ds_pending_deadlist); - if (dsl_dataset_phys(ds)->ds_deadlist_obj != 0) + if (ds->ds_deadlist.dl_os != NULL) dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_dir) - dsl_dir_rele(ds->ds_dir, ds); + dsl_dir_async_rele(ds->ds_dir, ds); ASSERT(!list_link_active(&ds->ds_synced_link)); if (mutex_owned(&ds->ds_lock)) mutex_exit(&ds->ds_lock); mutex_destroy(&ds->ds_lock); if (mutex_owned(&ds->ds_opening_lock)) mutex_exit(&ds->ds_opening_lock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_sendstream_lock); refcount_destroy(&ds->ds_longholds); kmem_free(ds, sizeof (dsl_dataset_t)); } int dsl_dataset_get_snapname(dsl_dataset_t *ds) { dsl_dataset_phys_t *headphys; int err; dmu_buf_t *headdbuf; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; if (ds->ds_snapname[0]) return (0); if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) return (0); err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &headdbuf); if (err != 0) return (err); headphys = headdbuf->db_data; err = zap_value_search(dp->dp_meta_objset, headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); dmu_buf_rele(headdbuf, FTAG); return (err); } int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; matchtype_t mt; int err; if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; err = zap_lookup_norm(mos, snapobj, name, 8, 1, value, mt, NULL, 0, NULL); if (err == ENOTSUP && mt == MT_FIRST) err = zap_lookup(mos, snapobj, name, 8, 1, value); return (err); } int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, boolean_t adj_cnt) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; matchtype_t mt; int err; dsl_dir_snap_cmtime_update(ds->ds_dir); if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else mt = MT_EXACT; err = zap_remove_norm(mos, snapobj, name, mt, tx); if (err == ENOTSUP && mt == MT_FIRST) err = zap_remove(mos, snapobj, name, tx); if (err == 0 && adj_cnt) dsl_fs_ss_count_adjust(ds->ds_dir, -1, DD_FIELD_SNAPSHOT_COUNT, tx); return (err); } boolean_t dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) { dmu_buf_t *dbuf = ds->ds_dbuf; boolean_t result = B_FALSE; if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset, ds->ds_object, DMU_BONUS_BLKID, tag)) { if (ds == dmu_buf_get_user(dbuf)) result = B_TRUE; else dmu_buf_rele(dbuf, tag); } return (result); } int dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_t **dsp) { objset_t *mos = dp->dp_meta_objset; dmu_buf_t *dbuf; dsl_dataset_t *ds; int err; dmu_object_info_t doi; ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); if (err != 0) return (err); /* Make sure dsobj has the correct object type. */ dmu_object_info_from_db(dbuf, &doi); if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) { dmu_buf_rele(dbuf, tag); return (SET_ERROR(EINVAL)); } ds = dmu_buf_get_user(dbuf); if (ds == NULL) { dsl_dataset_t *winner = NULL; ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); ds->ds_dbuf = dbuf; ds->ds_object = dsobj; + ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); refcount_create(&ds->ds_longholds); bplist_create(&ds->ds_pending_deadlist); dsl_deadlist_open(&ds->ds_deadlist, mos, dsl_dataset_phys(ds)->ds_deadlist_obj); list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t), offsetof(dmu_sendarg_t, dsa_link)); if (doi.doi_type == DMU_OTN_ZAP_METADATA) { int zaperr = zap_contains(mos, dsobj, DS_FIELD_LARGE_BLOCKS); if (zaperr != ENOENT) { VERIFY0(zaperr); ds->ds_large_blocks = B_TRUE; } } if (err == 0) { err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj, NULL, ds, &ds->ds_dir); } if (err != 0) { mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_sendstream_lock); refcount_destroy(&ds->ds_longholds); bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); } - if (!dsl_dataset_is_snapshot(ds)) { + if (!ds->ds_is_snapshot) { ds->ds_snapname[0] = '\0'; if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev); } if (doi.doi_type == DMU_OTN_ZAP_METADATA) { int zaperr = zap_lookup(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, sizeof (ds->ds_bookmarks), 1, &ds->ds_bookmarks); if (zaperr != ENOENT) VERIFY0(zaperr); } } else { if (zfs_flags & ZFS_DEBUG_SNAPNAMES) err = dsl_dataset_get_snapname(ds); if (err == 0 && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { err = zap_count( ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_userrefs_obj, &ds->ds_userrefs); } } - if (err == 0 && !dsl_dataset_is_snapshot(ds)) { + if (err == 0 && !ds->ds_is_snapshot) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &ds->ds_reserved); if (err == 0) { err = dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFQUOTA), &ds->ds_quota); } } else { ds->ds_reserved = ds->ds_quota = 0; } - if (err != 0 || (winner = dmu_buf_set_user_ie(dbuf, ds, - dsl_dataset_evict)) != NULL) { + dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict, &ds->ds_dbuf); + if (err == 0) + winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); + + if (err != 0 || winner != NULL) { bplist_destroy(&ds->ds_pending_deadlist); dsl_deadlist_close(&ds->ds_deadlist); if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); dsl_dir_rele(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); mutex_destroy(&ds->ds_opening_lock); mutex_destroy(&ds->ds_sendstream_lock); refcount_destroy(&ds->ds_longholds); kmem_free(ds, sizeof (dsl_dataset_t)); if (err != 0) { dmu_buf_rele(dbuf, tag); return (err); } ds = winner; } else { ds->ds_fsid_guid = unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid); } } ASSERT3P(ds->ds_dbuf, ==, dbuf); ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); *dsp = ds; return (0); } int dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dataset_t **dsp) { dsl_dir_t *dd; const char *snapname; uint64_t obj; int err = 0; err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); if (err != 0) return (err); ASSERT(dsl_pool_config_held(dp)); obj = dsl_dir_phys(dd)->dd_head_dataset_obj; if (obj != 0) err = dsl_dataset_hold_obj(dp, obj, tag, dsp); else err = SET_ERROR(ENOENT); /* we may be looking for a snapshot */ if (err == 0 && snapname != NULL) { dsl_dataset_t *ds; if (*snapname++ != '@') { dsl_dataset_rele(*dsp, tag); dsl_dir_rele(dd, FTAG); return (SET_ERROR(ENOENT)); } dprintf("looking for snapshot '%s'\n", snapname); err = dsl_dataset_snap_lookup(*dsp, snapname, &obj); if (err == 0) err = dsl_dataset_hold_obj(dp, obj, tag, &ds); dsl_dataset_rele(*dsp, tag); if (err == 0) { mutex_enter(&ds->ds_lock); if (ds->ds_snapname[0] == 0) (void) strlcpy(ds->ds_snapname, snapname, sizeof (ds->ds_snapname)); mutex_exit(&ds->ds_lock); *dsp = ds; } } dsl_dir_rele(dd, FTAG); return (err); } int dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_t **dsp) { int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); if (err != 0) return (err); if (!dsl_dataset_tryown(*dsp, tag)) { dsl_dataset_rele(*dsp, tag); *dsp = NULL; return (SET_ERROR(EBUSY)); } return (0); } int dsl_dataset_own(dsl_pool_t *dp, const char *name, void *tag, dsl_dataset_t **dsp) { int err = dsl_dataset_hold(dp, name, tag, dsp); if (err != 0) return (err); if (!dsl_dataset_tryown(*dsp, tag)) { dsl_dataset_rele(*dsp, tag); return (SET_ERROR(EBUSY)); } return (0); } /* * See the comment above dsl_pool_hold() for details. In summary, a long * hold is used to prevent destruction of a dataset while the pool hold * is dropped, allowing other concurrent operations (e.g. spa_sync()). * * The dataset and pool must be held when this function is called. After it * is called, the pool hold may be released while the dataset is still held * and accessed. */ void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) { ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); (void) refcount_add(&ds->ds_longholds, tag); } void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) { (void) refcount_remove(&ds->ds_longholds, tag); } /* Return B_TRUE if there are any long holds on this dataset. */ boolean_t dsl_dataset_long_held(dsl_dataset_t *ds) { return (!refcount_is_zero(&ds->ds_longholds)); } void dsl_dataset_name(dsl_dataset_t *ds, char *name) { if (ds == NULL) { (void) strcpy(name, "mos"); } else { dsl_dir_name(ds->ds_dir, name); VERIFY0(dsl_dataset_get_snapname(ds)); if (ds->ds_snapname[0]) { (void) strcat(name, "@"); /* * We use a "recursive" mutex so that we * can call dprintf_ds() with ds_lock held. */ if (!MUTEX_HELD(&ds->ds_lock)) { mutex_enter(&ds->ds_lock); (void) strcat(name, ds->ds_snapname); mutex_exit(&ds->ds_lock); } else { (void) strcat(name, ds->ds_snapname); } } } } void dsl_dataset_rele(dsl_dataset_t *ds, void *tag) { dmu_buf_rele(ds->ds_dbuf, tag); } void dsl_dataset_disown(dsl_dataset_t *ds, void *tag) { ASSERT3P(ds->ds_owner, ==, tag); ASSERT(ds->ds_dbuf != NULL); mutex_enter(&ds->ds_lock); ds->ds_owner = NULL; mutex_exit(&ds->ds_lock); dsl_dataset_long_rele(ds, tag); dsl_dataset_rele(ds, tag); } boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag) { boolean_t gotit = FALSE; mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && !DS_IS_INCONSISTENT(ds)) { ds->ds_owner = tag; dsl_dataset_long_hold(ds, tag); gotit = TRUE; } mutex_exit(&ds->ds_lock); return (gotit); } uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; uint64_t dsobj; objset_t *mos = dp->dp_meta_objset; if (origin == NULL) origin = dp->dp_origin_snap; ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = dd->dd_object; dsphys->ds_flags = flags; dsphys->ds_fsid_guid = unique_create(); do { (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); } while (dsphys->ds_guid == 0); dsphys->ds_snapnames_zapobj = zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, DMU_OT_NONE, 0, tx); dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 1 : tx->tx_txg; if (origin == NULL) { dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); } else { dsl_dataset_t *ohds; /* head of the origin snapshot */ dsphys->ds_prev_snap_obj = origin->ds_object; dsphys->ds_prev_snap_txg = dsl_dataset_phys(origin)->ds_creation_txg; dsphys->ds_referenced_bytes = dsl_dataset_phys(origin)->ds_referenced_bytes; dsphys->ds_compressed_bytes = dsl_dataset_phys(origin)->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = dsl_dataset_phys(origin)->ds_uncompressed_bytes; dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp; /* * Inherit flags that describe the dataset's contents * (INCONSISTENT) or properties (Case Insensitive). */ dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); if (origin->ds_large_blocks) dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); dmu_buf_will_dirty(origin->ds_dbuf, tx); dsl_dataset_phys(origin)->ds_num_children++; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj, FTAG, &ohds)); dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); dsl_dataset_rele(ohds, FTAG); if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) { dsl_dataset_phys(origin)->ds_next_clones_obj = zap_create(mos, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(mos, dsl_dataset_phys(origin)->ds_next_clones_obj, dsobj, tx)); } dmu_buf_will_dirty(dd->dd_dbuf, tx); dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object; if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); dsl_dir_phys(origin->ds_dir)->dd_clones = zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(mos, dsl_dir_phys(origin->ds_dir)->dd_clones, dsobj, tx)); } } if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; dmu_buf_rele(dbuf, FTAG); dmu_buf_will_dirty(dd->dd_dbuf, tx); dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj; return (dsobj); } static void dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) { objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); bzero(&os->os_zil_header, sizeof (os->os_zil_header)); dsl_dataset_dirty(ds, tx); } uint64_t dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *cr, dmu_tx_t *tx) { dsl_pool_t *dp = pdd->dd_pool; uint64_t dsobj, ddobj; dsl_dir_t *dd; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(lastname[0] != '@'); ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); dsobj = dsl_dataset_create_sync_dd(dd, origin, flags & ~DS_CREATE_FLAG_NODIRTY, tx); dsl_deleg_set_create_perms(dd, tx, cr); /* * Since we're creating a new node we know it's a leaf, so we can * initialize the counts if the limit feature is active. */ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { uint64_t cnt = 0; objset_t *os = dd->dd_pool->dp_meta_objset; dsl_dir_zapify(dd, tx); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (cnt), 1, &cnt, tx)); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (cnt), 1, &cnt, tx)); } dsl_dir_rele(dd, FTAG); /* * If we are creating a clone, make sure we zero out any stale * data from the origin snapshots zil header. */ if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); dsl_dataset_zero_zil(ds, tx); dsl_dataset_rele(ds, FTAG); } return (dsobj); } #ifdef __FreeBSD__ /* FreeBSD ioctl compat begin */ struct destroyarg { nvlist_t *nvl; const char *snapname; }; static int dsl_check_snap_cb(const char *name, void *arg) { struct destroyarg *da = arg; dsl_dataset_t *ds; char *dsname; dsname = kmem_asprintf("%s@%s", name, da->snapname); fnvlist_add_boolean(da->nvl, dsname); kmem_free(dsname, strlen(dsname) + 1); return (0); } int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, nvlist_t *snaps) { struct destroyarg *da; int err; da = kmem_zalloc(sizeof (struct destroyarg), KM_SLEEP); da->nvl = snaps; da->snapname = snapname; err = dmu_objset_find(fsname, dsl_check_snap_cb, da, DS_FIND_CHILDREN); kmem_free(da, sizeof (struct destroyarg)); return (err); } /* FreeBSD ioctl compat end */ #endif /* __FreeBSD__ */ /* * The unique space in the head dataset can be calculated by subtracting * the space used in the most recent snapshot, that is still being used * in this file system, from the space currently in use. To figure out * the space in the most recent snapshot still in use, we need to take * the total space used in the snapshot and subtract out the space that * has been freed up since the snapshot was taken. */ void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) { uint64_t mrs_used; uint64_t dlused, dlcomp, dluncomp; - ASSERT(!dsl_dataset_is_snapshot(ds)); + ASSERT(!ds->ds_is_snapshot); if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; else mrs_used = 0; dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); ASSERT3U(dlused, <=, mrs_used); dsl_dataset_phys(ds)->ds_unique_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused); if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; } void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t count; int err; ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2); err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, obj, tx); /* * The err should not be ENOENT, but a bug in a previous version * of the code could cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a missing entry. * If we knew that the pool was created after * SPA_VERSION_NEXT_CLONES, we could assert that it isn't * ENOENT. However, at least we can check that we don't have * too many entries in the next_clones_obj even after failing to * remove this one. */ if (err != ENOENT) VERIFY0(err); ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, &count)); ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2); } blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { return (&dsl_dataset_phys(ds)->ds_bp); } void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); /* If it's the meta-objset, set dp_meta_rootbp */ if (ds == NULL) { tx->tx_pool->dp_meta_rootbp = *bp; } else { dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_bp = *bp; } } spa_t * dsl_dataset_get_spa(dsl_dataset_t *ds) { return (ds->ds_dir->dd_pool->dp_spa); } void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp; if (ds == NULL) /* this is the meta-objset */ return; ASSERT(ds->ds_objset != NULL); if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) panic("dirtying snapshot!"); dp = ds->ds_dir->dd_pool; if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, ds); } } boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds) { for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, ds, t)) return (B_TRUE); } return (B_FALSE); } static int dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) { uint64_t asize; if (!dmu_tx_is_syncing(tx)) return (0); /* * If there's an fs-only reservation, any blocks that might become * owned by the snapshot dataset must be accommodated by space * outside of the reservation. */ ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) return (SET_ERROR(ENOSPC)); /* * Propagate any reserved space for this snapshot to other * snapshot checks in this sync group. */ if (asize > 0) dsl_dir_willuse_space(ds->ds_dir, asize, tx); return (0); } typedef struct dsl_dataset_snapshot_arg { nvlist_t *ddsa_snaps; nvlist_t *ddsa_props; nvlist_t *ddsa_errors; cred_t *ddsa_cr; } dsl_dataset_snapshot_arg_t; int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr) { int error; uint64_t value; ds->ds_trysnap_txg = tx->tx_txg; if (!dmu_tx_is_syncing(tx)) return (0); /* * We don't allow multiple snapshots of the same txg. If there * is already one, try again. */ if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) return (SET_ERROR(EAGAIN)); /* * Check for conflicting snapshot name. */ error = dsl_dataset_snap_lookup(ds, snapname, &value); if (error == 0) return (SET_ERROR(EEXIST)); if (error != ENOENT) return (error); /* * We don't allow taking snapshots of inconsistent datasets, such as * those into which we are currently receiving. However, if we are * creating this snapshot as part of a receive, this check will be * executed atomically with respect to the completion of the receive * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this * case we ignore this, knowing it will be fixed up for us shortly in * dmu_recv_end_sync(). */ if (!recv && DS_IS_INCONSISTENT(ds)) return (SET_ERROR(EBUSY)); /* * Skip the check for temporary snapshots or if we have already checked * the counts in dsl_dataset_snapshot_check. This means we really only * check the count here when we're receiving a stream. */ if (cnt != 0 && cr != NULL) { error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr); if (error != 0) return (error); } error = dsl_dataset_snapshot_reserve_space(ds, tx); if (error != 0) return (error); return (0); } static int dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_arg_t *ddsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); nvpair_t *pair; int rv = 0; /* * Pre-compute how many total new snapshots will be created for each * level in the tree and below. This is needed for validating the * snapshot limit when either taking a recursive snapshot or when * taking multiple snapshots. * * The problem is that the counts are not actually adjusted when * we are checking, only when we finally sync. For a single snapshot, * this is easy, the count will increase by 1 at each node up the tree, * but its more complicated for the recursive/multiple snapshot case. * * The dsl_fs_ss_limit_check function does recursively check the count * at each level up the tree but since it is validating each snapshot * independently we need to be sure that we are validating the complete * count for the entire set of snapshots. We do this by rolling up the * counts for each component of the name into an nvlist and then * checking each of those cases with the aggregated count. * * This approach properly handles not only the recursive snapshot * case (where we get all of those on the ddsa_snaps list) but also * the sibling case (e.g. snapshot a/b and a/c so that we will also * validate the limit on 'a' using a count of 2). * * We validate the snapshot names in the third loop and only report * name errors once. */ if (dmu_tx_is_syncing(tx)) { nvlist_t *cnt_track = NULL; cnt_track = fnvlist_alloc(); /* Rollup aggregated counts into the cnt_track list */ for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { char *pdelim; uint64_t val; char nm[MAXPATHLEN]; (void) strlcpy(nm, nvpair_name(pair), sizeof (nm)); pdelim = strchr(nm, '@'); if (pdelim == NULL) continue; *pdelim = '\0'; do { if (nvlist_lookup_uint64(cnt_track, nm, &val) == 0) { /* update existing entry */ fnvlist_add_uint64(cnt_track, nm, val + 1); } else { /* add to list */ fnvlist_add_uint64(cnt_track, nm, 1); } pdelim = strrchr(nm, '/'); if (pdelim != NULL) *pdelim = '\0'; } while (pdelim != NULL); } /* Check aggregated counts at each level */ for (pair = nvlist_next_nvpair(cnt_track, NULL); pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { int error = 0; char *name; uint64_t cnt = 0; dsl_dataset_t *ds; name = nvpair_name(pair); cnt = fnvpair_value_uint64(pair); ASSERT(cnt > 0); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error == 0) { error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, ZFS_PROP_SNAPSHOT_LIMIT, NULL, ddsa->ddsa_cr); dsl_dataset_rele(ds, FTAG); } if (error != 0) { if (ddsa->ddsa_errors != NULL) fnvlist_add_int32(ddsa->ddsa_errors, name, error); rv = error; /* only report one error for this check */ break; } } nvlist_free(cnt_track); } for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { int error = 0; dsl_dataset_t *ds; char *name, *atp; char dsname[MAXNAMELEN]; name = nvpair_name(pair); if (strlen(name) >= MAXNAMELEN) error = SET_ERROR(ENAMETOOLONG); if (error == 0) { atp = strchr(name, '@'); if (atp == NULL) error = SET_ERROR(EINVAL); if (error == 0) (void) strlcpy(dsname, name, atp - name + 1); } if (error == 0) error = dsl_dataset_hold(dp, dsname, FTAG, &ds); if (error == 0) { /* passing 0/NULL skips dsl_fs_ss_limit_check */ error = dsl_dataset_snapshot_check_impl(ds, atp + 1, tx, B_FALSE, 0, NULL); dsl_dataset_rele(ds, FTAG); } if (error != 0) { if (ddsa->ddsa_errors != NULL) { fnvlist_add_int32(ddsa->ddsa_errors, name, error); } rv = error; } } return (rv); } void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx) { static zil_header_t zero_zil; dsl_pool_t *dp = ds->ds_dir->dd_pool; dmu_buf_t *dbuf; dsl_dataset_phys_t *dsphys; uint64_t dsobj, crtxg; objset_t *mos = dp->dp_meta_objset; objset_t *os; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); /* * If we are on an old pool, the zil must not be active, in which * case it will be zeroed. Usually zil_suspend() accomplishes this. */ ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || dmu_objset_from_ds(ds, &os) != 0 || bcmp(&os->os_phys->os_zil_header, &zero_zil, sizeof (zero_zil)) == 0); dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); /* * The origin's ds_creation_txg has to be < TXG_INITIAL */ if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) crtxg = 1; else crtxg = tx->tx_txg; dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); dsphys = dbuf->db_data; bzero(dsphys, sizeof (dsl_dataset_phys_t)); dsphys->ds_dir_obj = ds->ds_dir->dd_object; dsphys->ds_fsid_guid = unique_create(); do { (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, sizeof (dsphys->ds_guid)); } while (dsphys->ds_guid == 0); dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; dsphys->ds_next_snap_obj = ds->ds_object; dsphys->ds_num_children = 1; dsphys->ds_creation_time = gethrestime_sec(); dsphys->ds_creation_txg = crtxg; dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes; dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes; dsphys->ds_uncompressed_bytes = dsl_dataset_phys(ds)->ds_uncompressed_bytes; dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags; dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; dmu_buf_rele(dbuf, FTAG); if (ds->ds_large_blocks) dsl_dataset_activate_large_blocks_sync_impl(dsobj, tx); ASSERT3U(ds->ds_prev != 0, ==, dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); if (ds->ds_prev) { uint64_t next_clones_obj = dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj; ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object || dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1); if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) { dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, dsl_dataset_phys(ds->ds_prev)->ds_creation_txg); dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { dsl_dataset_remove_from_next_clones(ds->ds_prev, dsphys->ds_next_snap_obj, tx); VERIFY0(zap_add_int(mos, next_clones_obj, dsobj, tx)); } } /* * If we have a reference-reservation on this dataset, we will * need to increase the amount of refreservation being charged * since our unique space is going to zero. */ if (ds->ds_reserved) { int64_t delta; ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); } dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_deadlist_obj = dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX, dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_open(&ds->ds_deadlist, mos, dsl_dataset_phys(ds)->ds_deadlist_obj); dsl_deadlist_add_key(&ds->ds_deadlist, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; dsl_dataset_phys(ds)->ds_unique_bytes = 0; if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, snapname, 8, 1, &dsobj, tx)); if (ds->ds_prev) dsl_dataset_rele(ds->ds_prev, ds); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); dsl_scan_ds_snapshotted(ds, tx); dsl_dir_snap_cmtime_update(ds->ds_dir); spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, ""); } static void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_arg_t *ddsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); nvpair_t *pair; for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { dsl_dataset_t *ds; char *name, *atp; char dsname[MAXNAMELEN]; name = nvpair_name(pair); atp = strchr(name, '@'); (void) strlcpy(dsname, name, atp - name + 1); VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); if (ddsa->ddsa_props != NULL) { dsl_props_set_sync_impl(ds->ds_prev, ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); } dsl_dataset_rele(ds, FTAG); } } /* * The snapshots must all be in the same pool. * All-or-nothing: if there are any failures, nothing will be modified. */ int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) { dsl_dataset_snapshot_arg_t ddsa; nvpair_t *pair; boolean_t needsuspend; int error; spa_t *spa; char *firstname; nvlist_t *suspended = NULL; pair = nvlist_next_nvpair(snaps, NULL); if (pair == NULL) return (0); firstname = nvpair_name(pair); error = spa_open(firstname, &spa, FTAG); if (error != 0) return (error); needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); spa_close(spa, FTAG); if (needsuspend) { suspended = fnvlist_alloc(); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char fsname[MAXNAMELEN]; char *snapname = nvpair_name(pair); char *atp; void *cookie; atp = strchr(snapname, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } (void) strlcpy(fsname, snapname, atp - snapname + 1); error = zil_suspend(fsname, &cookie); if (error != 0) break; fnvlist_add_uint64(suspended, fsname, (uintptr_t)cookie); } } ddsa.ddsa_snaps = snaps; ddsa.ddsa_props = props; ddsa.ddsa_errors = errors; ddsa.ddsa_cr = CRED(); if (error == 0) { error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, dsl_dataset_snapshot_sync, &ddsa, fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL); } if (suspended != NULL) { for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; pair = nvlist_next_nvpair(suspended, pair)) { zil_resume((void *)(uintptr_t) fnvpair_value_uint64(pair)); } fnvlist_free(suspended); } #ifdef __FreeBSD__ #ifdef _KERNEL if (error == 0) { for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *snapname = nvpair_name(pair); zvol_create_minors(snapname); } } #endif #endif return (error); } typedef struct dsl_dataset_snapshot_tmp_arg { const char *ddsta_fsname; const char *ddsta_snapname; minor_t ddsta_cleanup_minor; const char *ddsta_htag; } dsl_dataset_snapshot_tmp_arg_t; static int dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); if (error != 0) return (error); /* NULL cred means no limit check for tmp snapshot */ error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, tx, B_FALSE, 0, NULL); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, B_TRUE, tx); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, minor_t cleanup_minor, const char *htag) { dsl_dataset_snapshot_tmp_arg_t ddsta; int error; spa_t *spa; boolean_t needsuspend; void *cookie; ddsta.ddsta_fsname = fsname; ddsta.ddsta_snapname = snapname; ddsta.ddsta_cleanup_minor = cleanup_minor; ddsta.ddsta_htag = htag; error = spa_open(fsname, &spa, FTAG); if (error != 0) return (error); needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); spa_close(spa, FTAG); if (needsuspend) { error = zil_suspend(fsname, &cookie); if (error != 0) return (error); } error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED); if (needsuspend) zil_resume(cookie); return (error); } void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(ds->ds_objset != NULL); ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0); /* * in case we had to change ds_fsid_guid when we opened it, * sync it out now. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; dmu_objset_sync(ds->ds_objset, zio, tx); if (ds->ds_need_large_blocks && !ds->ds_large_blocks) { dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx); ds->ds_large_blocks = B_TRUE; } } static void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv) { uint64_t count = 0; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; zap_cursor_t zc; zap_attribute_t za; nvlist_t *propval = fnvlist_alloc(); nvlist_t *val = fnvlist_alloc(); ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); /* * There may be missing entries in ds_next_clones_obj * due to a bug in a previous version of the code. * Only trust it if it has the right number of entries. */ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, &count)); } if (count != dsl_dataset_phys(ds)->ds_num_children - 1) goto fail; for (zap_cursor_init(&zc, mos, dsl_dataset_phys(ds)->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; char buf[ZFS_MAXNAMELEN]; VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, za.za_first_integer, FTAG, &clone)); dsl_dir_name(clone->ds_dir, buf); fnvlist_add_boolean(val, buf); dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(&zc); fnvlist_add_nvlist(propval, ZPROP_VALUE, val); fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES), propval); fail: nvlist_free(val); nvlist_free(propval); } void dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) { dsl_pool_t *dp = ds->ds_dir->dd_pool; uint64_t refd, avail, uobjs, aobjs, ratio; ASSERT(dsl_pool_config_held(dp)); ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 : (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 / dsl_dataset_phys(ds)->ds_compressed_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, ratio); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, dsl_dataset_phys(ds)->ds_uncompressed_bytes); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, ratio); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dsl_dataset_phys(ds)->ds_unique_bytes); get_clones_stat(ds, nv); } else { if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) { char buf[MAXNAMELEN]; dsl_dataset_name(ds->ds_prev, buf); dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, buf); } dsl_dir_stats(ds->ds_dir, nv); } dsl_dataset_space(ds, &refd, &avail, &uobjs, &aobjs); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, avail); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, refd); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, dsl_dataset_phys(ds)->ds_creation_time); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, dsl_dataset_phys(ds)->ds_creation_txg); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, ds->ds_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, ds->ds_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, dsl_dataset_phys(ds)->ds_guid); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, dsl_dataset_phys(ds)->ds_unique_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, ds->ds_object); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, ds->ds_userrefs); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, DS_IS_DEFER_DESTROY(ds) ? 1 : 0); if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { uint64_t written, comp, uncomp; dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_dataset_t *prev; int err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err == 0) { err = dsl_dataset_space_written(prev, ds, &written, &comp, &uncomp); dsl_dataset_rele(prev, FTAG); if (err == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, written); } } } } void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) { dsl_pool_t *dp = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); stat->dds_creation_txg = dsl_dataset_phys(ds)->ds_creation_txg; stat->dds_inconsistent = dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT; stat->dds_guid = dsl_dataset_phys(ds)->ds_guid; stat->dds_origin[0] = '\0'; - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = dsl_dataset_phys(ds)->ds_num_children - 1; } else { stat->dds_is_snapshot = B_FALSE; stat->dds_num_clones = 0; if (dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_t *ods; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods)); dsl_dataset_name(ods, stat->dds_origin); dsl_dataset_rele(ods, FTAG); } } } uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds) { return (ds->ds_fsid_guid); } void dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes; *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) *availbytesp += ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; if (ds->ds_quota != 0) { /* * Adjust available bytes according to refquota */ if (*refdbytesp < ds->ds_quota) *availbytesp = MIN(*availbytesp, ds->ds_quota - *refdbytesp); else *availbytesp = 0; } *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp); *availobjsp = DN_MAX_OBJECT - *usedobjsp; } boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) { dsl_pool_t *dp = ds->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); if (snap == NULL) return (B_FALSE); if (dsl_dataset_phys(ds)->ds_bp.blk_birth > dsl_dataset_phys(snap)->ds_creation_txg) { objset_t *os, *os_snap; /* * It may be that only the ZIL differs, because it was * reset in the head. Don't count that as being * modified. */ if (dmu_objset_from_ds(ds, &os) != 0) return (B_TRUE); if (dmu_objset_from_ds(snap, &os_snap) != 0) return (B_TRUE); return (bcmp(&os->os_phys->os_meta_dnode, &os_snap->os_phys->os_meta_dnode, sizeof (os->os_phys->os_meta_dnode)) != 0); } return (B_FALSE); } typedef struct dsl_dataset_rename_snapshot_arg { const char *ddrsa_fsname; const char *ddrsa_oldsnapname; const char *ddrsa_newsnapname; boolean_t ddrsa_recursive; dmu_tx_t *ddrsa_tx; } dsl_dataset_rename_snapshot_arg_t; /* ARGSUSED */ static int dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; int error; uint64_t val; error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); if (error != 0) { /* ignore nonexistent snapshots */ return (error == ENOENT ? 0 : error); } /* new name should not exist */ error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); if (error == 0) error = SET_ERROR(EEXIST); else if (error == ENOENT) error = 0; /* dataset name + 1 for the "@" + the new snapshot name must fit */ if (dsl_dir_namelen(hds->ds_dir) + 1 + strlen(ddrsa->ddrsa_newsnapname) >= MAXNAMELEN) error = SET_ERROR(ENAMETOOLONG); return (error); } static int dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; int error; error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); if (error != 0) return (error); if (ddrsa->ddrsa_recursive) { error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, dsl_dataset_rename_snapshot_check_impl, ddrsa, DS_FIND_CHILDREN); } else { error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); } dsl_dataset_rele(hds, FTAG); return (error); } static int dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { #ifdef __FreeBSD__ #ifdef _KERNEL char *oldname, *newname; #endif #endif dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; dsl_dataset_t *ds; uint64_t val; dmu_tx_t *tx = ddrsa->ddrsa_tx; int error; error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); ASSERT(error == 0 || error == ENOENT); if (error == ENOENT) { /* ignore nonexistent snapshots */ return (0); } VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); /* log before we change the name */ spa_history_log_internal_ds(ds, "rename", tx, "-> @%s", ddrsa->ddrsa_newsnapname); VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx, B_FALSE)); mutex_enter(&ds->ds_lock); (void) strcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname); mutex_exit(&ds->ds_lock); VERIFY0(zap_add(dp->dp_meta_objset, dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); #ifdef __FreeBSD__ #ifdef _KERNEL oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP); newname = kmem_alloc(MAXPATHLEN, KM_SLEEP); snprintf(oldname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname, ddrsa->ddrsa_oldsnapname); snprintf(newname, MAXPATHLEN, "%s@%s", ddrsa->ddrsa_fsname, ddrsa->ddrsa_newsnapname); zfsvfs_update_fromname(oldname, newname); zvol_rename_minors(oldname, newname); kmem_free(newname, MAXPATHLEN); kmem_free(oldname, MAXPATHLEN); #endif #endif dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); ddrsa->ddrsa_tx = tx; if (ddrsa->ddrsa_recursive) { VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, dsl_dataset_rename_snapshot_sync_impl, ddrsa, DS_FIND_CHILDREN)); } else { VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); } dsl_dataset_rele(hds, FTAG); } int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive) { dsl_dataset_rename_snapshot_arg_t ddrsa; ddrsa.ddrsa_fsname = fsname; ddrsa.ddrsa_oldsnapname = oldsnapname; ddrsa.ddrsa_newsnapname = newsnapname; ddrsa.ddrsa_recursive = recursive; return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, dsl_dataset_rename_snapshot_sync, &ddrsa, 1, ZFS_SPACE_CHECK_RESERVED)); } /* * If we're doing an ownership handoff, we need to make sure that there is * only one long hold on the dataset. We're not allowed to change anything here * so we don't permanently release the long hold or regular hold here. We want * to do this only when syncing to avoid the dataset unexpectedly going away * when we release the long hold. */ static int dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx) { boolean_t held; if (!dmu_tx_is_syncing(tx)) return (0); if (owner != NULL) { VERIFY3P(ds->ds_owner, ==, owner); dsl_dataset_long_rele(ds, owner); } held = dsl_dataset_long_held(ds); if (owner != NULL) dsl_dataset_long_hold(ds, owner); if (held) return (SET_ERROR(EBUSY)); return (0); } typedef struct dsl_dataset_rollback_arg { const char *ddra_fsname; void *ddra_owner; nvlist_t *ddra_result; } dsl_dataset_rollback_arg_t; static int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx) { dsl_dataset_rollback_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int64_t unused_refres_delta; int error; error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds); if (error != 0) return (error); /* must not be a snapshot */ - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* must have a most recent snapshot */ if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } /* must not have any bookmarks after the most recent snapshot */ nvlist_t *proprequest = fnvlist_alloc(); fnvlist_add_boolean(proprequest, zfs_prop_to_name(ZFS_PROP_CREATETXG)); nvlist_t *bookmarks = fnvlist_alloc(); error = dsl_get_bookmarks_impl(ds, proprequest, bookmarks); fnvlist_free(proprequest); if (error != 0) return (error); for (nvpair_t *pair = nvlist_next_nvpair(bookmarks, NULL); pair != NULL; pair = nvlist_next_nvpair(bookmarks, pair)) { nvlist_t *valuenv = fnvlist_lookup_nvlist(fnvpair_value_nvlist(pair), zfs_prop_to_name(ZFS_PROP_CREATETXG)); uint64_t createtxg = fnvlist_lookup_uint64(valuenv, "value"); if (createtxg > dsl_dataset_phys(ds)->ds_prev_snap_txg) { fnvlist_free(bookmarks); dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EEXIST)); } } fnvlist_free(bookmarks); error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } /* * Check if the snap we are rolling back to uses more than * the refquota. */ if (ds->ds_quota != 0 && dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EDQUOT)); } /* * When we do the clone swap, we will temporarily use more space * due to the refreservation (the head will no longer have any * unique space, so the entire amount of the refreservation will need * to be free). We will immediately destroy the clone, freeing * this space, but the freeing happens over many txg's. */ unused_refres_delta = (int64_t)MIN(ds->ds_reserved, dsl_dataset_phys(ds)->ds_unique_bytes); if (unused_refres_delta > 0 && unused_refres_delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOSPC)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_rollback_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds, *clone; uint64_t cloneobj; char namebuf[ZFS_MAXNAMELEN]; VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); dsl_dataset_name(ds->ds_prev, namebuf); fnvlist_add_string(ddra->ddra_result, "target", namebuf); cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, tx); VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); dsl_dataset_clone_swap_sync_impl(clone, ds, tx); dsl_dataset_zero_zil(ds, tx); dsl_destroy_head_sync_impl(clone, tx); dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(ds, FTAG); } /* * Rolls back the given filesystem or volume to the most recent snapshot. * The name of the most recent snapshot will be returned under key "target" * in the result nvlist. * * If owner != NULL: * - The existing dataset MUST be owned by the specified owner at entry * - Upon return, dataset will still be held by the same owner, whether we * succeed or not. * * This mode is required any time the existing filesystem is mounted. See * notes above zfs_suspend_fs() for further details. */ int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result) { dsl_dataset_rollback_arg_t ddra; ddra.ddra_fsname = fsname; ddra.ddra_owner = owner; ddra.ddra_result = result; return (dsl_sync_task(fsname, dsl_dataset_rollback_check, dsl_dataset_rollback_sync, &ddra, 1, ZFS_SPACE_CHECK_RESERVED)); } struct promotenode { list_node_t link; dsl_dataset_t *ds; }; typedef struct dsl_dataset_promote_arg { const char *ddpa_clonename; dsl_dataset_t *ddpa_clone; list_t shared_snaps, origin_snaps, clone_snaps; dsl_dataset_t *origin_origin; /* origin of the origin */ uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; char *err_ds; cred_t *cr; } dsl_dataset_promote_arg_t; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag); static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag); static int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx) { dsl_dataset_promote_arg_t *ddpa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; struct promotenode *snap; dsl_dataset_t *origin_ds; int err; uint64_t unused; uint64_t ss_mv_cnt; size_t max_snap_len; err = promote_hold(ddpa, dp, FTAG); if (err != 0) return (err); hds = ddpa->ddpa_clone; max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1; if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) { promote_rele(ddpa, FTAG); return (SET_ERROR(EXDEV)); } /* * Compute and check the amount of space to transfer. Since this is * so expensive, don't do the preliminary check. */ if (!dmu_tx_is_syncing(tx)) { promote_rele(ddpa, FTAG); return (0); } snap = list_head(&ddpa->shared_snaps); origin_ds = snap->ds; /* compute origin's new unique space */ snap = list_tail(&ddpa->clone_snaps); ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, origin_ds->ds_object); dsl_deadlist_space_range(&snap->ds->ds_deadlist, dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX, &ddpa->unique, &unused, &unused); /* * Walk the snapshots that we are moving * * Compute space to transfer. Consider the incremental changes * to used by each snapshot: * (my used) = (prev's used) + (blocks born) - (blocks killed) * So each snapshot gave birth to: * (blocks born) = (my used) - (prev's used) + (blocks killed) * So a sequence would look like: * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0) * Which simplifies to: * uN + kN + kN-1 + ... + k1 + k0 * Note however, if we stop before we reach the ORIGIN we get: * uN + kN + kN-1 + ... + kM - uM-1 */ ss_mv_cnt = 0; ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes; ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes; ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes; for (snap = list_head(&ddpa->shared_snaps); snap; snap = list_next(&ddpa->shared_snaps, snap)) { uint64_t val, dlused, dlcomp, dluncomp; dsl_dataset_t *ds = snap->ds; ss_mv_cnt++; /* * If there are long holds, we won't be able to evict * the objset. */ if (dsl_dataset_long_held(ds)) { err = SET_ERROR(EBUSY); goto out; } /* Check that the snapshot name does not conflict */ VERIFY0(dsl_dataset_get_snapname(ds)); if (strlen(ds->ds_snapname) >= max_snap_len) { err = SET_ERROR(ENAMETOOLONG); goto out; } err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); if (err == 0) { (void) strcpy(ddpa->err_ds, snap->ds->ds_snapname); err = SET_ERROR(EEXIST); goto out; } if (err != ENOENT) goto out; /* The very first snapshot does not have a deadlist */ if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0) continue; dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); ddpa->used += dlused; ddpa->comp += dlcomp; ddpa->uncomp += dluncomp; } /* * If we are a clone of a clone then we never reached ORIGIN, * so we need to subtract out the clone origin's used space. */ if (ddpa->origin_origin) { ddpa->used -= dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes; ddpa->comp -= dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes; ddpa->uncomp -= dsl_dataset_phys(ddpa->origin_origin)-> ds_uncompressed_bytes; } /* Check that there is enough space and limit headroom here */ err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, 0, ss_mv_cnt, ddpa->used, ddpa->cr); if (err != 0) goto out; /* * Compute the amounts of space that will be used by snapshots * after the promotion (for both origin and clone). For each, * it is the amount of space that will be on all of their * deadlists (that was not born before their new origin). */ if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) { uint64_t space; /* * Note, typically this will not be a clone of a clone, * so dd_origin_txg will be < TXG_INITIAL, so * these snaplist_space() -> dsl_deadlist_space_range() * calls will be fast because they do not have to * iterate over all bps. */ snap = list_head(&ddpa->origin_snaps); err = snaplist_space(&ddpa->shared_snaps, snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); if (err != 0) goto out; err = snaplist_space(&ddpa->clone_snaps, snap->ds->ds_dir->dd_origin_txg, &space); if (err != 0) goto out; ddpa->cloneusedsnap += space; } if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) { err = snaplist_space(&ddpa->origin_snaps, dsl_dataset_phys(origin_ds)->ds_creation_txg, &ddpa->originusedsnap); if (err != 0) goto out; } out: promote_rele(ddpa, FTAG); return (err); } static void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_promote_arg_t *ddpa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *hds; struct promotenode *snap; dsl_dataset_t *origin_ds; dsl_dataset_t *origin_head; dsl_dir_t *dd; dsl_dir_t *odd = NULL; uint64_t oldnext_obj; int64_t delta; #if defined(__FreeBSD__) && defined(_KERNEL) char *oldname, *newname; #endif VERIFY0(promote_hold(ddpa, dp, FTAG)); hds = ddpa->ddpa_clone; ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE); snap = list_head(&ddpa->shared_snaps); origin_ds = snap->ds; dd = hds->ds_dir; snap = list_head(&ddpa->origin_snaps); origin_head = snap->ds; /* * We need to explicitly open odd, since origin_ds's dd will be * changing. */ VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, NULL, FTAG, &odd)); /* change origin's next snap */ dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj; snap = list_tail(&ddpa->clone_snaps); ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, origin_ds->ds_object); dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object; /* change the origin's next clone */ if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) { dsl_dataset_remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dataset_phys(origin_ds)->ds_next_clones_obj, oldnext_obj, tx)); } /* change origin */ dmu_buf_will_dirty(dd->dd_dbuf, tx); ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object); dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj; dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; dmu_buf_will_dirty(odd->dd_dbuf, tx); dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object; origin_head->ds_dir->dd_origin_txg = dsl_dataset_phys(origin_ds)->ds_creation_txg; /* change dd_clone entries */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { VERIFY0(zap_remove_int(dp->dp_meta_objset, dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx)); VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, hds->ds_object, tx)); VERIFY0(zap_remove_int(dp->dp_meta_objset, dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, origin_head->ds_object, tx)); if (dsl_dir_phys(dd)->dd_clones == 0) { dsl_dir_phys(dd)->dd_clones = zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx)); } #if defined(__FreeBSD__) && defined(_KERNEL) /* Take the spa_namespace_lock early so zvol renames don't deadlock. */ mutex_enter(&spa_namespace_lock); oldname = kmem_alloc(MAXPATHLEN, KM_SLEEP); newname = kmem_alloc(MAXPATHLEN, KM_SLEEP); #endif /* move snapshots to this dir */ for (snap = list_head(&ddpa->shared_snaps); snap; snap = list_next(&ddpa->shared_snaps, snap)) { dsl_dataset_t *ds = snap->ds; /* * Property callbacks are registered to a particular * dsl_dir. Since ours is changing, evict the objset * so that they will be unregistered from the old dsl_dir. */ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } /* move snap name entry */ VERIFY0(dsl_dataset_get_snapname(ds)); VERIFY0(dsl_dataset_snap_remove(origin_head, ds->ds_snapname, tx, B_TRUE)); VERIFY0(zap_add(dp->dp_meta_objset, dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, 8, 1, &ds->ds_object, tx)); dsl_fs_ss_count_adjust(hds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); /* change containing dsl_dir */ dmu_buf_will_dirty(ds->ds_dbuf, tx); ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object); dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object; ASSERT3P(ds->ds_dir, ==, odd); dsl_dir_rele(ds->ds_dir, ds); VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, NULL, ds, &ds->ds_dir)); #if defined(__FreeBSD__) && defined(_KERNEL) dsl_dataset_name(ds, newname); zfsvfs_update_fromname(oldname, newname); zvol_rename_minors(oldname, newname); #endif /* move any clone references */ if (dsl_dataset_phys(ds)->ds_next_clones_obj && spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { zap_cursor_t zc; zap_attribute_t za; for (zap_cursor_init(&zc, dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *cnds; uint64_t o; if (za.za_first_integer == oldnext_obj) { /* * We've already moved the * origin's reference. */ continue; } VERIFY0(dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &cnds)); o = dsl_dir_phys(cnds->ds_dir)-> dd_head_dataset_obj; VERIFY0(zap_remove_int(dp->dp_meta_objset, dsl_dir_phys(odd)->dd_clones, o, tx)); VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_clones, o, tx)); dsl_dataset_rele(cnds, FTAG); } zap_cursor_fini(&zc); } ASSERT(!dsl_prop_hascb(ds)); } #if defined(__FreeBSD__) && defined(_KERNEL) mutex_exit(&spa_namespace_lock); kmem_free(newname, MAXPATHLEN); kmem_free(oldname, MAXPATHLEN); #endif /* * Change space accounting. * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either * both be valid, or both be 0 (resulting in delta == 0). This * is true for each of {clone,origin} independently. */ delta = ddpa->cloneusedsnap - dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, >=, 0); ASSERT3U(ddpa->used, >=, delta); dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(dd, DD_USED_HEAD, ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); delta = ddpa->originusedsnap - dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP]; ASSERT3S(delta, <=, 0); ASSERT3U(ddpa->used, >=, -delta); dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); dsl_dir_diduse_space(odd, DD_USED_HEAD, -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; /* log history record */ spa_history_log_internal_ds(hds, "promote", tx, ""); dsl_dir_rele(odd, FTAG); promote_rele(ddpa, FTAG); } /* * Make a list of dsl_dataset_t's for the snapshots between first_obj * (exclusive) and last_obj (inclusive). The list will be in reverse * order (last_obj will be the list_head()). If first_obj == 0, do all * snapshots back to this dataset's origin. */ static int snaplist_make(dsl_pool_t *dp, uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) { uint64_t obj = last_obj; list_create(l, sizeof (struct promotenode), offsetof(struct promotenode, link)); while (obj != first_obj) { dsl_dataset_t *ds; struct promotenode *snap; int err; err = dsl_dataset_hold_obj(dp, obj, tag, &ds); ASSERT(err != ENOENT); if (err != 0) return (err); if (first_obj == 0) first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj; snap = kmem_alloc(sizeof (*snap), KM_SLEEP); snap->ds = ds; list_insert_tail(l, snap); obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; } return (0); } static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) { struct promotenode *snap; *spacep = 0; for (snap = list_head(l); snap; snap = list_next(l, snap)) { uint64_t used, comp, uncomp; dsl_deadlist_space_range(&snap->ds->ds_deadlist, mintxg, UINT64_MAX, &used, &comp, &uncomp); *spacep += used; } return (0); } static void snaplist_destroy(list_t *l, void *tag) { struct promotenode *snap; if (l == NULL || !list_link_active(&l->list_head)) return; while ((snap = list_tail(l)) != NULL) { list_remove(l, snap); dsl_dataset_rele(snap->ds, tag); kmem_free(snap, sizeof (*snap)); } list_destroy(l); } static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) { int error; dsl_dir_t *dd; struct promotenode *snap; error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, &ddpa->ddpa_clone); if (error != 0) return (error); dd = ddpa->ddpa_clone->ds_dir; - if (dsl_dataset_is_snapshot(ddpa->ddpa_clone) || + if (ddpa->ddpa_clone->ds_is_snapshot || !dsl_dir_is_clone(dd)) { dsl_dataset_rele(ddpa->ddpa_clone, tag); return (SET_ERROR(EINVAL)); } error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj, &ddpa->shared_snaps, tag); if (error != 0) goto out; error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, &ddpa->clone_snaps, tag); if (error != 0) goto out; snap = list_head(&ddpa->shared_snaps); ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj); error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj, dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj, &ddpa->origin_snaps, tag); if (error != 0) goto out; if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) { error = dsl_dataset_hold_obj(dp, dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj, tag, &ddpa->origin_origin); if (error != 0) goto out; } out: if (error != 0) promote_rele(ddpa, tag); return (error); } static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) { snaplist_destroy(&ddpa->shared_snaps, tag); snaplist_destroy(&ddpa->clone_snaps, tag); snaplist_destroy(&ddpa->origin_snaps, tag); if (ddpa->origin_origin != NULL) dsl_dataset_rele(ddpa->origin_origin, tag); dsl_dataset_rele(ddpa->ddpa_clone, tag); } /* * Promote a clone. * * If it fails due to a conflicting snapshot name, "conflsnap" will be filled * in with the name. (It must be at least MAXNAMELEN bytes long.) */ int dsl_dataset_promote(const char *name, char *conflsnap) { dsl_dataset_promote_arg_t ddpa = { 0 }; uint64_t numsnaps; int error; objset_t *os; /* * We will modify space proportional to the number of * snapshots. Compute numsnaps. */ error = dmu_objset_hold(name, FTAG, &os); if (error != 0) return (error); error = zap_count(dmu_objset_pool(os)->dp_meta_objset, dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj, &numsnaps); dmu_objset_rele(os, FTAG); if (error != 0) return (error); ddpa.ddpa_clonename = name; ddpa.err_ds = conflsnap; ddpa.cr = CRED(); return (dsl_sync_task(name, dsl_dataset_promote_check, dsl_dataset_promote_sync, &ddpa, 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED)); } int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) { int64_t unused_refres_delta; /* they should both be heads */ - if (dsl_dataset_is_snapshot(clone) || - dsl_dataset_is_snapshot(origin_head)) + if (clone->ds_is_snapshot || + origin_head->ds_is_snapshot) return (SET_ERROR(EINVAL)); /* if we are not forcing, the branch point should be just before them */ if (!force && clone->ds_prev != origin_head->ds_prev) return (SET_ERROR(EINVAL)); /* clone should be the clone (unless they are unrelated) */ if (clone->ds_prev != NULL && clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && origin_head->ds_dir != clone->ds_prev->ds_dir) return (SET_ERROR(EINVAL)); /* the clone should be a child of the origin */ if (clone->ds_dir->dd_parent != origin_head->ds_dir) return (SET_ERROR(EINVAL)); /* origin_head shouldn't be modified unless 'force' */ if (!force && dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev)) return (SET_ERROR(ETXTBSY)); /* origin_head should have no long holds (e.g. is not mounted) */ if (dsl_dataset_handoff_check(origin_head, owner, tx)) return (SET_ERROR(EBUSY)); /* check amount of any unconsumed refreservation */ unused_refres_delta = (int64_t)MIN(origin_head->ds_reserved, dsl_dataset_phys(origin_head)->ds_unique_bytes) - (int64_t)MIN(origin_head->ds_reserved, dsl_dataset_phys(clone)->ds_unique_bytes); if (unused_refres_delta > 0 && unused_refres_delta > dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) return (SET_ERROR(ENOSPC)); /* clone can't be over the head's refquota */ if (origin_head->ds_quota != 0 && dsl_dataset_phys(clone)->ds_referenced_bytes > origin_head->ds_quota) return (SET_ERROR(EDQUOT)); return (0); } void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); int64_t unused_refres_delta; ASSERT(clone->ds_reserved == 0); ASSERT(origin_head->ds_quota == 0 || dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota); ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); dmu_buf_will_dirty(clone->ds_dbuf, tx); dmu_buf_will_dirty(origin_head->ds_dbuf, tx); if (clone->ds_objset != NULL) { dmu_objset_evict(clone->ds_objset); clone->ds_objset = NULL; } if (origin_head->ds_objset != NULL) { dmu_objset_evict(origin_head->ds_objset); origin_head->ds_objset = NULL; } unused_refres_delta = (int64_t)MIN(origin_head->ds_reserved, dsl_dataset_phys(origin_head)->ds_unique_bytes) - (int64_t)MIN(origin_head->ds_reserved, dsl_dataset_phys(clone)->ds_unique_bytes); /* * Reset origin's unique bytes, if it exists. */ if (clone->ds_prev) { dsl_dataset_t *origin = clone->ds_prev; uint64_t comp, uncomp; dmu_buf_will_dirty(origin->ds_dbuf, tx); dsl_deadlist_space_range(&clone->ds_deadlist, dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX, &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp); } /* swap blkptrs */ { blkptr_t tmp; tmp = dsl_dataset_phys(origin_head)->ds_bp; dsl_dataset_phys(origin_head)->ds_bp = dsl_dataset_phys(clone)->ds_bp; dsl_dataset_phys(clone)->ds_bp = tmp; } /* set dd_*_bytes */ { int64_t dused, dcomp, duncomp; uint64_t cdl_used, cdl_comp, cdl_uncomp; uint64_t odl_used, odl_comp, odl_uncomp; ASSERT3U(dsl_dir_phys(clone->ds_dir)-> dd_used_breakdown[DD_USED_SNAP], ==, 0); dsl_deadlist_space(&clone->ds_deadlist, &cdl_used, &cdl_comp, &cdl_uncomp); dsl_deadlist_space(&origin_head->ds_deadlist, &odl_used, &odl_comp, &odl_uncomp); dused = dsl_dataset_phys(clone)->ds_referenced_bytes + cdl_used - (dsl_dataset_phys(origin_head)->ds_referenced_bytes + odl_used); dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes + cdl_comp - (dsl_dataset_phys(origin_head)->ds_compressed_bytes + odl_comp); duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes + cdl_uncomp - (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes + odl_uncomp); dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, dused, dcomp, duncomp, tx); dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, -dused, -dcomp, -duncomp, tx); /* * The difference in the space used by snapshots is the * difference in snapshot space due to the head's * deadlist (since that's the only thing that's * changing that affects the snapused). */ dsl_deadlist_space_range(&clone->ds_deadlist, origin_head->ds_dir->dd_origin_txg, UINT64_MAX, &cdl_used, &cdl_comp, &cdl_uncomp); dsl_deadlist_space_range(&origin_head->ds_deadlist, origin_head->ds_dir->dd_origin_txg, UINT64_MAX, &odl_used, &odl_comp, &odl_uncomp); dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, DD_USED_HEAD, DD_USED_SNAP, NULL); } /* swap ds_*_bytes */ SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes, dsl_dataset_phys(clone)->ds_referenced_bytes); SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes, dsl_dataset_phys(clone)->ds_compressed_bytes); SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes, dsl_dataset_phys(clone)->ds_uncompressed_bytes); SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes, dsl_dataset_phys(clone)->ds_unique_bytes); /* apply any parent delta for change in unconsumed refreservation */ dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, unused_refres_delta, 0, 0, tx); /* * Swap deadlists. */ dsl_deadlist_close(&clone->ds_deadlist); dsl_deadlist_close(&origin_head->ds_deadlist); SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj, dsl_dataset_phys(clone)->ds_deadlist_obj); dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, dsl_dataset_phys(clone)->ds_deadlist_obj); dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, dsl_dataset_phys(origin_head)->ds_deadlist_obj); dsl_scan_ds_clone_swapped(origin_head, clone, tx); spa_history_log_internal_ds(clone, "clone swap", tx, "parent=%s", origin_head->ds_dir->dd_myname); } /* * Given a pool name and a dataset object number in that pool, * return the name of that dataset. */ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) { dsl_pool_t *dp; dsl_dataset_t *ds; int error; error = dsl_pool_hold(pname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); if (error == 0) { dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) { int error = 0; ASSERT3S(asize, >, 0); /* * *ref_rsrv is the portion of asize that will come from any * unconsumed refreservation space. */ *ref_rsrv = 0; mutex_enter(&ds->ds_lock); /* * Make a space adjustment for reserved bytes. */ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { ASSERT3U(*used, >=, ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); *used -= (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); *ref_rsrv = asize - MIN(asize, parent_delta(ds, asize + inflight)); } if (!check_quota || ds->ds_quota == 0) { mutex_exit(&ds->ds_lock); return (0); } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). */ if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >= ds->ds_quota) { if (inflight > 0 || dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota) error = SET_ERROR(ERESTART); else error = SET_ERROR(EDQUOT); } mutex_exit(&ds->ds_lock); return (error); } typedef struct dsl_dataset_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; uint64_t ddsqra_value; } dsl_dataset_set_qr_arg_t; /* ARGSUSED */ static int dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) { dsl_dataset_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t newval; if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) return (SET_ERROR(ENOTSUP)); error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_REFQUOTA), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (newval == 0) { dsl_dataset_rele(ds, FTAG); return (0); } if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes || newval < ds->ds_reserved) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOSPC)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFQUOTA), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); if (ds->ds_quota != newval) { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_quota = newval; } dsl_dataset_rele(ds, FTAG); } int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, uint64_t refquota) { dsl_dataset_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = dsname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = refquota; return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, dsl_dataset_set_refquota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); } static int dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) { dsl_dataset_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t newval, unique; if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) return (SET_ERROR(ENOTSUP)); error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. */ if (!dmu_tx_is_syncing(tx)) { dsl_dataset_rele(ds, FTAG); return (0); } mutex_enter(&ds->ds_lock); if (!DS_UNIQUE_IS_ACCURATE(ds)) dsl_dataset_recalc_head_uniq(ds); unique = dsl_dataset_phys(ds)->ds_unique_bytes; mutex_exit(&ds->ds_lock); if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { uint64_t delta = MAX(unique, newval) - MAX(unique, ds->ds_reserved); if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || (ds->ds_quota > 0 && newval > ds->ds_quota)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOSPC)); } } dsl_dataset_rele(ds, FTAG); return (0); } void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, zprop_source_t source, uint64_t value, dmu_tx_t *tx) { uint64_t newval; uint64_t unique; int64_t delta; dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), source, sizeof (value), 1, &value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); unique = dsl_dataset_phys(ds)->ds_unique_bytes; delta = MAX(0, (int64_t)(newval - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); ds->ds_reserved = newval; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); } static void dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); dsl_dataset_set_refreservation_sync_impl(ds, ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, uint64_t refreservation) { dsl_dataset_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = dsname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = refreservation; return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, dsl_dataset_set_refreservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); } /* * Return (in *usedp) the amount of space written in new that is not * present in oldsnap. New may be a snapshot or the head. Old must be * a snapshot before new, in new's filesystem (or its origin). If not then * fail and return EINVAL. * * The written space is calculated by considering two components: First, we * ignore any freed space, and calculate the written as new's used space * minus old's used space. Next, we add in the amount of space that was freed * between the two snapshots, thus reducing new's used space relative to old's. * Specifically, this is the space that was born before old->ds_creation_txg, * and freed before new (ie. on new's deadlist or a previous deadlist). * * space freed [---------------------] * snapshots ---O-------O--------O-------O------ * oldsnap new */ int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { int err = 0; uint64_t snapobj; dsl_pool_t *dp = new->ds_dir->dd_pool; ASSERT(dsl_pool_config_held(dp)); *usedp = 0; *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; *usedp -= dsl_dataset_phys(oldsnap)->ds_referenced_bytes; *compp = 0; *compp += dsl_dataset_phys(new)->ds_compressed_bytes; *compp -= dsl_dataset_phys(oldsnap)->ds_compressed_bytes; *uncompp = 0; *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; *uncompp -= dsl_dataset_phys(oldsnap)->ds_uncompressed_bytes; snapobj = new->ds_object; while (snapobj != oldsnap->ds_object) { dsl_dataset_t *snap; uint64_t used, comp, uncomp; if (snapobj == new->ds_object) { snap = new; } else { err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); if (err != 0) break; } if (dsl_dataset_phys(snap)->ds_prev_snap_txg == dsl_dataset_phys(oldsnap)->ds_creation_txg) { /* * The blocks in the deadlist can not be born after * ds_prev_snap_txg, so get the whole deadlist space, * which is more efficient (especially for old-format * deadlists). Unfortunately the deadlist code * doesn't have enough information to make this * optimization itself. */ dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp); } else { dsl_deadlist_space_range(&snap->ds_deadlist, 0, dsl_dataset_phys(oldsnap)->ds_creation_txg, &used, &comp, &uncomp); } *usedp += used; *compp += comp; *uncompp += uncomp; /* * If we get to the beginning of the chain of snapshots * (ds_prev_snap_obj == 0) before oldsnap, then oldsnap * was not a snapshot of/before new. */ snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; if (snap != new) dsl_dataset_rele(snap, FTAG); if (snapobj == 0) { err = SET_ERROR(EINVAL); break; } } return (err); } /* * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, * lastsnap, and all snapshots in between are deleted. * * blocks that would be freed [---------------------------] * snapshots ---O-------O--------O-------O--------O * firstsnap lastsnap * * This is the set of blocks that were born after the snap before firstsnap, * (birth > firstsnap->prev_snap_txg) and died before the snap after the * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). * We calculate this by iterating over the relevant deadlists (from the snap * after lastsnap, backward to the snap after firstsnap), summing up the * space on the deadlist that was born after the snap before firstsnap. */ int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *lastsnap, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { int err = 0; uint64_t snapobj; dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; - ASSERT(dsl_dataset_is_snapshot(firstsnap)); - ASSERT(dsl_dataset_is_snapshot(lastsnap)); + ASSERT(firstsnap->ds_is_snapshot); + ASSERT(lastsnap->ds_is_snapshot); /* * Check that the snapshots are in the same dsl_dir, and firstsnap * is before lastsnap. */ if (firstsnap->ds_dir != lastsnap->ds_dir || dsl_dataset_phys(firstsnap)->ds_creation_txg > dsl_dataset_phys(lastsnap)->ds_creation_txg) return (SET_ERROR(EINVAL)); *usedp = *compp = *uncompp = 0; snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj; while (snapobj != firstsnap->ds_object) { dsl_dataset_t *ds; uint64_t used, comp, uncomp; err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); if (err != 0) break; dsl_deadlist_space_range(&ds->ds_deadlist, dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX, &used, &comp, &uncomp); *usedp += used; *compp += comp; *uncompp += uncomp; snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; ASSERT3U(snapobj, !=, 0); dsl_dataset_rele(ds, FTAG); } return (err); } static int dsl_dataset_activate_large_blocks_check(void *arg, dmu_tx_t *tx) { const char *dsname = arg; dsl_dataset_t *ds; dsl_pool_t *dp = dmu_tx_pool(tx); int error = 0; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS)) return (SET_ERROR(ENOTSUP)); ASSERT(spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)); error = dsl_dataset_hold(dp, dsname, FTAG, &ds); if (error != 0) return (error); if (ds->ds_large_blocks) error = EALREADY; dsl_dataset_rele(ds, FTAG); return (error); } void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; uint64_t zero = 0; spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx); dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); VERIFY0(zap_add(mos, dsobj, DS_FIELD_LARGE_BLOCKS, sizeof (zero), 1, &zero, tx)); } static void dsl_dataset_activate_large_blocks_sync(void *arg, dmu_tx_t *tx) { const char *dsname = arg; dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), dsname, FTAG, &ds)); dsl_dataset_activate_large_blocks_sync_impl(ds->ds_object, tx); ASSERT(!ds->ds_large_blocks); ds->ds_large_blocks = B_TRUE; dsl_dataset_rele(ds, FTAG); } int dsl_dataset_activate_large_blocks(const char *dsname) { int error; error = dsl_sync_task(dsname, dsl_dataset_activate_large_blocks_check, dsl_dataset_activate_large_blocks_sync, (void *)dsname, 1, ZFS_SPACE_CHECK_RESERVED); /* * EALREADY indicates that this dataset already supports large blocks. */ if (error == EALREADY) error = 0; return (error); } /* * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. * For example, they could both be snapshots of the same filesystem, and * 'earlier' is before 'later'. Or 'earlier' could be the origin of * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's * filesystem. Or 'earlier' could be the origin's origin. * * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg. */ boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, uint64_t earlier_txg) { dsl_pool_t *dp = later->ds_dir->dd_pool; int error; boolean_t ret; ASSERT(dsl_pool_config_held(dp)); - ASSERT(dsl_dataset_is_snapshot(earlier) || earlier_txg != 0); + ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); if (earlier_txg == 0) earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; - if (dsl_dataset_is_snapshot(later) && + if (later->ds_is_snapshot && earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) return (B_FALSE); if (later->ds_dir == earlier->ds_dir) return (B_TRUE); if (!dsl_dir_is_clone(later->ds_dir)) return (B_FALSE); if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == earlier->ds_object) return (B_TRUE); dsl_dataset_t *origin; error = dsl_dataset_hold_obj(dp, dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); if (error != 0) return (B_FALSE); ret = dsl_dataset_is_before(origin, earlier, earlier_txg); dsl_dataset_rele(origin, FTAG); return (ret); } void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deadlist.c (revision 286575) @@ -1,535 +1,538 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include /* * Deadlist concurrency: * * Deadlists can only be modified from the syncing thread. * * Except for dsl_deadlist_insert(), it can only be modified with the * dp_config_rwlock held with RW_WRITER. * * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can * be called concurrently, from open context, with the dl_config_rwlock held * with RW_READER. * * Therefore, we only need to provide locking between dsl_deadlist_insert() and * the accessors, protecting: * dl_phys->dl_used,comp,uncomp * and protecting the dl_tree from being loaded. * The locking is provided by dl_lock. Note that locking on the bpobj_t * provides its own locking, and dl_oldfmt is immutable. */ static int dsl_deadlist_compare(const void *arg1, const void *arg2) { const dsl_deadlist_entry_t *dle1 = arg1; const dsl_deadlist_entry_t *dle2 = arg2; if (dle1->dle_mintxg < dle2->dle_mintxg) return (-1); else if (dle1->dle_mintxg > dle2->dle_mintxg) return (+1); else return (0); } static void dsl_deadlist_load_tree(dsl_deadlist_t *dl) { zap_cursor_t zc; zap_attribute_t za; ASSERT(!dl->dl_oldfmt); if (dl->dl_havetree) return; avl_create(&dl->dl_tree, dsl_deadlist_compare, sizeof (dsl_deadlist_entry_t), offsetof(dsl_deadlist_entry_t, dle_node)); for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = strtonum(za.za_name, NULL); VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, za.za_first_integer)); avl_add(&dl->dl_tree, dle); } zap_cursor_fini(&zc); dl->dl_havetree = B_TRUE; } void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) { dmu_object_info_t doi; mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); dl->dl_os = os; dl->dl_object = object; VERIFY3U(0, ==, dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); dmu_object_info_from_db(dl->dl_dbuf, &doi); if (doi.doi_type == DMU_OT_BPOBJ) { dmu_buf_rele(dl->dl_dbuf, dl); dl->dl_dbuf = NULL; dl->dl_oldfmt = B_TRUE; VERIFY3U(0, ==, bpobj_open(&dl->dl_bpobj, os, object)); return; } dl->dl_oldfmt = B_FALSE; dl->dl_phys = dl->dl_dbuf->db_data; dl->dl_havetree = B_FALSE; } void dsl_deadlist_close(dsl_deadlist_t *dl) { void *cookie = NULL; dsl_deadlist_entry_t *dle; + + dl->dl_os = NULL; if (dl->dl_oldfmt) { dl->dl_oldfmt = B_FALSE; bpobj_close(&dl->dl_bpobj); return; } if (dl->dl_havetree) { while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); } avl_destroy(&dl->dl_tree); } dmu_buf_rele(dl->dl_dbuf, dl); mutex_destroy(&dl->dl_lock); dl->dl_dbuf = NULL; dl->dl_phys = NULL; } uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) { if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, sizeof (dsl_deadlist_phys_t), tx)); } void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) { dmu_object_info_t doi; zap_cursor_t zc; zap_attribute_t za; VERIFY3U(0, ==, dmu_object_info(os, dlobj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { bpobj_free(os, dlobj, tx); return; } for (zap_cursor_init(&zc, os, dlobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t obj = za.za_first_integer; if (obj == dmu_objset_pool(os)->dp_empty_bpobj) bpobj_decr_empty(os, tx); else bpobj_free(os, obj, tx); } zap_cursor_fini(&zc); VERIFY3U(0, ==, dmu_object_free(os, dlobj, tx)); } static void dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, const blkptr_t *bp, dmu_tx_t *tx) { if (dle->dle_bpobj.bpo_object == dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } bpobj_enqueue(&dle->dle_bpobj, bp, tx); } static void dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, uint64_t obj, dmu_tx_t *tx) { if (dle->dle_bpobj.bpo_object != dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); } else { bpobj_close(&dle->dle_bpobj); bpobj_decr_empty(dl->dl_os, tx); VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); VERIFY3U(0, ==, zap_update_int_key(dl->dl_os, dl->dl_object, dle->dle_mintxg, obj, tx)); } } void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; if (dl->dl_oldfmt) { bpobj_enqueue(&dl->dl_bpobj, bp, tx); return; } dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); mutex_enter(&dl->dl_lock); dl->dl_phys->dl_used += bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); dl->dl_phys->dl_comp += BP_GET_PSIZE(bp); dl->dl_phys->dl_uncomp += BP_GET_UCSIZE(bp); mutex_exit(&dl->dl_lock); dle_tofind.dle_mintxg = bp->blk_birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); else dle = AVL_PREV(&dl->dl_tree, dle); dle_enqueue(dl, dle, bp, tx); } /* * Insert new key in deadlist, which must be > all current entries. * mintxg is not inclusive. */ void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) { uint64_t obj; dsl_deadlist_entry_t *dle; if (dl->dl_oldfmt) return; dsl_deadlist_load_tree(dl); dle = kmem_alloc(sizeof (*dle), KM_SLEEP); dle->dle_mintxg = mintxg; obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY3U(0, ==, bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); avl_add(&dl->dl_tree, dle); VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, dl->dl_object, mintxg, obj, tx)); } /* * Remove this key, merging its entries into the previous key. */ void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle, *dle_prev; if (dl->dl_oldfmt) return; dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); dle_prev = AVL_PREV(&dl->dl_tree, dle); dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); avl_remove(&dl->dl_tree, dle); bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); } /* * Walk ds's snapshots to regenerate generate ZAP & AVL. */ static void dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj, uint64_t mrs_obj, dmu_tx_t *tx) { dsl_deadlist_t dl; dsl_pool_t *dp = dmu_objset_pool(os); dsl_deadlist_open(&dl, os, dlobj); if (dl.dl_oldfmt) { dsl_deadlist_close(&dl); return; } while (mrs_obj != 0) { dsl_dataset_t *ds; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds)); dsl_deadlist_add_key(&dl, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; dsl_dataset_rele(ds, FTAG); } dsl_deadlist_close(&dl); } uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, uint64_t mrs_obj, dmu_tx_t *tx) { dsl_deadlist_entry_t *dle; uint64_t newobj; newobj = dsl_deadlist_alloc(dl->dl_os, tx); if (dl->dl_oldfmt) { dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx); return (newobj); } dsl_deadlist_load_tree(dl); for (dle = avl_first(&dl->dl_tree); dle; dle = AVL_NEXT(&dl->dl_tree, dle)) { uint64_t obj; if (dle->dle_mintxg >= maxtxg) break; obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY3U(0, ==, zap_add_int_key(dl->dl_os, newobj, dle->dle_mintxg, obj, tx)); } return (newobj); } void dsl_deadlist_space(dsl_deadlist_t *dl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { if (dl->dl_oldfmt) { VERIFY3U(0, ==, bpobj_space(&dl->dl_bpobj, usedp, compp, uncompp)); return; } mutex_enter(&dl->dl_lock); *usedp = dl->dl_phys->dl_used; *compp = dl->dl_phys->dl_comp; *uncompp = dl->dl_phys->dl_uncomp; mutex_exit(&dl->dl_lock); } /* * return space used in the range (mintxg, maxtxg]. * Includes maxtxg, does not include mintxg. * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is * larger than any bp in the deadlist (eg. UINT64_MAX)). */ void dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) { dsl_deadlist_entry_t *dle; dsl_deadlist_entry_t dle_tofind; avl_index_t where; if (dl->dl_oldfmt) { VERIFY3U(0, ==, bpobj_space_range(&dl->dl_bpobj, mintxg, maxtxg, usedp, compp, uncompp)); return; } *usedp = *compp = *uncompp = 0; mutex_enter(&dl->dl_lock); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); /* * If we don't find this mintxg, there shouldn't be anything * after it either. */ ASSERT(dle != NULL || avl_nearest(&dl->dl_tree, where, AVL_AFTER) == NULL); for (; dle && dle->dle_mintxg < maxtxg; dle = AVL_NEXT(&dl->dl_tree, dle)) { uint64_t used, comp, uncomp; VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); *usedp += used; *compp += comp; *uncompp += uncomp; } mutex_exit(&dl->dl_lock); } static void dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; uint64_t used, comp, uncomp; bpobj_t bpo; VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); VERIFY3U(0, ==, bpobj_space(&bpo, &used, &comp, &uncomp)); bpobj_close(&bpo); dsl_deadlist_load_tree(dl); dmu_buf_will_dirty(dl->dl_dbuf, tx); mutex_enter(&dl->dl_lock); dl->dl_phys->dl_used += used; dl->dl_phys->dl_comp += comp; dl->dl_phys->dl_uncomp += uncomp; mutex_exit(&dl->dl_lock); dle_tofind.dle_mintxg = birth; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); dle_enqueue_subobj(dl, dle, obj, tx); } static int dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; dsl_deadlist_insert(dl, bp, tx); return (0); } /* * Merge the deadlist pointed to by 'obj' into dl. obj will be left as * an empty deadlist. */ void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx) { zap_cursor_t zc; zap_attribute_t za; dmu_buf_t *bonus; dsl_deadlist_phys_t *dlp; dmu_object_info_t doi; VERIFY3U(0, ==, dmu_object_info(dl->dl_os, obj, &doi)); if (doi.doi_type == DMU_OT_BPOBJ) { bpobj_t bpo; VERIFY3U(0, ==, bpobj_open(&bpo, dl->dl_os, obj)); VERIFY3U(0, ==, bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx)); bpobj_close(&bpo); return; } for (zap_cursor_init(&zc, dl->dl_os, obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t mintxg = strtonum(za.za_name, NULL); dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx); VERIFY3U(0, ==, zap_remove_int(dl->dl_os, obj, mintxg, tx)); } zap_cursor_fini(&zc); VERIFY3U(0, ==, dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus)); dlp = bonus->db_data; dmu_buf_will_dirty(bonus, tx); bzero(dlp, sizeof (*dlp)); dmu_buf_rele(bonus, FTAG); } /* * Remove entries on dl that are >= mintxg, and put them on the bpobj. */ void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, dmu_tx_t *tx) { dsl_deadlist_entry_t dle_tofind; dsl_deadlist_entry_t *dle; avl_index_t where; ASSERT(!dl->dl_oldfmt); dmu_buf_will_dirty(dl->dl_dbuf, tx); dsl_deadlist_load_tree(dl); dle_tofind.dle_mintxg = mintxg; dle = avl_find(&dl->dl_tree, &dle_tofind, &where); if (dle == NULL) dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER); while (dle) { uint64_t used, comp, uncomp; dsl_deadlist_entry_t *dle_next; bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx); VERIFY3U(0, ==, bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); mutex_enter(&dl->dl_lock); ASSERT3U(dl->dl_phys->dl_used, >=, used); ASSERT3U(dl->dl_phys->dl_comp, >=, comp); ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp); dl->dl_phys->dl_used -= used; dl->dl_phys->dl_comp -= comp; dl->dl_phys->dl_uncomp -= uncomp; mutex_exit(&dl->dl_lock); VERIFY3U(0, ==, zap_remove_int(dl->dl_os, dl->dl_object, dle->dle_mintxg, tx)); dle_next = AVL_NEXT(&dl->dl_tree, dle); avl_remove(&dl->dl_tree, dle); bpobj_close(&dle->dle_bpobj); kmem_free(dle, sizeof (*dle)); dle = dle_next; } } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_deleg.c (revision 286575) @@ -1,761 +1,761 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ /* * DSL permissions are stored in a two level zap attribute * mechanism. The first level identifies the "class" of * entry. The class is identified by the first 2 letters of * the attribute. The second letter "l" or "d" identifies whether * it is a local or descendent permission. The first letter * identifies the type of entry. * * ul$ identifies permissions granted locally for this userid. * ud$ identifies permissions granted on descendent datasets for * this userid. * Ul$ identifies permission sets granted locally for this userid. * Ud$ identifies permission sets granted on descendent datasets for * this userid. * gl$ identifies permissions granted locally for this groupid. * gd$ identifies permissions granted on descendent datasets for * this groupid. * Gl$ identifies permission sets granted locally for this groupid. * Gd$ identifies permission sets granted on descendent datasets for * this groupid. * el$ identifies permissions granted locally for everyone. * ed$ identifies permissions granted on descendent datasets * for everyone. * El$ identifies permission sets granted locally for everyone. * Ed$ identifies permission sets granted to descendent datasets for * everyone. * c-$ identifies permission to create at dataset creation time. * C-$ identifies permission sets to grant locally at dataset creation * time. * s-$@ permissions defined in specified set @ * S-$@ Sets defined in named set @ * * Each of the above entities points to another zap attribute that contains one * attribute for each allowed permission, such as create, destroy,... * All of the "upper" case class types will specify permission set names * rather than permissions. * * Basically it looks something like this: * ul$12 -> ZAP OBJ -> permissions... * * The ZAP OBJ is referred to as the jump object. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_deleg.h" /* * Validate that user is allowed to delegate specified permissions. * * In order to delegate "create" you must have "create" * and "allow". */ int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr) { nvpair_t *whopair = NULL; int error; if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) return (error); while (whopair = nvlist_next_nvpair(nvp, whopair)) { nvlist_t *perms; nvpair_t *permpair = NULL; VERIFY(nvpair_value_nvlist(whopair, &perms) == 0); while (permpair = nvlist_next_nvpair(perms, permpair)) { const char *perm = nvpair_name(permpair); if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0) return (SET_ERROR(EPERM)); if ((error = dsl_deleg_access(ddname, perm, cr)) != 0) return (error); } } return (0); } /* * Validate that user is allowed to unallow specified permissions. They * must have the 'allow' permission, and even then can only unallow * perms for their uid. */ int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr) { nvpair_t *whopair = NULL; int error; char idstr[32]; if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0) return (error); (void) snprintf(idstr, sizeof (idstr), "%lld", (longlong_t)crgetuid(cr)); while (whopair = nvlist_next_nvpair(nvp, whopair)) { zfs_deleg_who_type_t type = nvpair_name(whopair)[0]; if (type != ZFS_DELEG_USER && type != ZFS_DELEG_USER_SETS) return (SET_ERROR(EPERM)); if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0) return (SET_ERROR(EPERM)); } return (0); } typedef struct dsl_deleg_arg { const char *dda_name; nvlist_t *dda_nvlist; } dsl_deleg_arg_t; static void dsl_deleg_set_sync(void *arg, dmu_tx_t *tx) { dsl_deleg_arg_t *dda = arg; dsl_dir_t *dd; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; nvpair_t *whopair = NULL; uint64_t zapobj; VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (zapobj == 0) { dmu_buf_will_dirty(dd->dd_dbuf, tx); zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); } while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { const char *whokey = nvpair_name(whopair); nvlist_t *perms; nvpair_t *permpair = NULL; uint64_t jumpobj; perms = fnvpair_value_nvlist(whopair); if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) { jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS, zapobj, whokey, tx); } while (permpair = nvlist_next_nvpair(perms, permpair)) { const char *perm = nvpair_name(permpair); uint64_t n = 0; VERIFY(zap_update(mos, jumpobj, perm, 8, 1, &n, tx) == 0); spa_history_log_internal_dd(dd, "permission update", tx, "%s %s", whokey, perm); } } dsl_dir_rele(dd, FTAG); } static void dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx) { dsl_deleg_arg_t *dda = arg; dsl_dir_t *dd; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; nvpair_t *whopair = NULL; uint64_t zapobj; VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL)); zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (zapobj == 0) { dsl_dir_rele(dd, FTAG); return; } while (whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair)) { const char *whokey = nvpair_name(whopair); nvlist_t *perms; nvpair_t *permpair = NULL; uint64_t jumpobj; if (nvpair_value_nvlist(whopair, &perms) != 0) { if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == 0) { (void) zap_remove(mos, zapobj, whokey, tx); VERIFY(0 == zap_destroy(mos, jumpobj, tx)); } spa_history_log_internal_dd(dd, "permission who remove", tx, "%s", whokey); continue; } if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) continue; while (permpair = nvlist_next_nvpair(perms, permpair)) { const char *perm = nvpair_name(permpair); uint64_t n = 0; (void) zap_remove(mos, jumpobj, perm, tx); if (zap_count(mos, jumpobj, &n) == 0 && n == 0) { (void) zap_remove(mos, zapobj, whokey, tx); VERIFY(0 == zap_destroy(mos, jumpobj, tx)); } spa_history_log_internal_dd(dd, "permission remove", tx, "%s %s", whokey, perm); } } dsl_dir_rele(dd, FTAG); } static int dsl_deleg_check(void *arg, dmu_tx_t *tx) { dsl_deleg_arg_t *dda = arg; dsl_dir_t *dd; int error; if (spa_version(dmu_tx_pool(tx)->dp_spa) < SPA_VERSION_DELEGATED_PERMS) { return (SET_ERROR(ENOTSUP)); } error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL); if (error == 0) dsl_dir_rele(dd, FTAG); return (error); } int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset) { dsl_deleg_arg_t dda; /* nvp must already have been verified to be valid */ dda.dda_name = ddname; dda.dda_nvlist = nvp; return (dsl_sync_task(ddname, dsl_deleg_check, unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync, &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED)); } /* * Find all 'allow' permissions from a given point and then continue * traversing up to the root. * * This function constructs an nvlist of nvlists. * each setpoint is an nvlist composed of an nvlist of an nvlist * of the individual * users/groups/everyone/create * permissions. * * The nvlist will look like this. * * { source fsname -> { whokeys { permissions,...}, ...}} * * The fsname nvpairs will be arranged in a bottom up order. For example, * if we have the following structure a/b/c then the nvpairs for the fsnames * will be ordered a/b/c, a/b, a. */ int dsl_deleg_get(const char *ddname, nvlist_t **nvp) { dsl_dir_t *dd, *startdd; dsl_pool_t *dp; int error; objset_t *mos; error = dsl_pool_hold(ddname, FTAG, &dp); if (error != 0) return (error); error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dp = startdd->dd_pool; mos = dp->dp_meta_objset; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (dd = startdd; dd != NULL; dd = dd->dd_parent) { zap_cursor_t basezc; zap_attribute_t baseza; nvlist_t *sp_nvp; uint64_t n; char source[MAXNAMELEN]; if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 || zap_count(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0) continue; sp_nvp = fnvlist_alloc(); for (zap_cursor_init(&basezc, mos, dsl_dir_phys(dd)->dd_deleg_zapobj); zap_cursor_retrieve(&basezc, &baseza) == 0; zap_cursor_advance(&basezc)) { zap_cursor_t zc; zap_attribute_t za; nvlist_t *perms_nvp; ASSERT(baseza.za_integer_length == 8); ASSERT(baseza.za_num_integers == 1); perms_nvp = fnvlist_alloc(); for (zap_cursor_init(&zc, mos, baseza.za_first_integer); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { fnvlist_add_boolean(perms_nvp, za.za_name); } zap_cursor_fini(&zc); fnvlist_add_nvlist(sp_nvp, baseza.za_name, perms_nvp); fnvlist_free(perms_nvp); } zap_cursor_fini(&basezc); dsl_dir_name(dd, source); fnvlist_add_nvlist(*nvp, source, sp_nvp); nvlist_free(sp_nvp); } dsl_dir_rele(startdd, FTAG); dsl_pool_rele(dp, FTAG); return (0); } /* * Routines for dsl_deleg_access() -- access checking. */ typedef struct perm_set { avl_node_t p_node; boolean_t p_matched; char p_setname[ZFS_MAX_DELEG_NAME]; } perm_set_t; static int perm_set_compare(const void *arg1, const void *arg2) { const perm_set_t *node1 = arg1; const perm_set_t *node2 = arg2; int val; val = strcmp(node1->p_setname, node2->p_setname); if (val == 0) return (0); return (val > 0 ? 1 : -1); } /* * Determine whether a specified permission exists. * * First the base attribute has to be retrieved. i.e. ul$12 * Once the base object has been retrieved the actual permission * is lookup up in the zap object the base object points to. * * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if * there is no perm in that jumpobj. */ static int dsl_check_access(objset_t *mos, uint64_t zapobj, char type, char checkflag, void *valp, const char *perm) { int error; uint64_t jumpobj, zero; char whokey[ZFS_MAX_DELEG_NAME]; zfs_deleg_whokey(whokey, type, checkflag, valp); error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); if (error == 0) { error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero); if (error == ENOENT) error = SET_ERROR(EPERM); } return (error); } /* * check a specified user/group for a requested permission */ static int dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm, int checkflag, cred_t *cr) { const gid_t *gids; int ngids; int i; uint64_t id; /* check for user */ id = crgetuid(cr); if (dsl_check_access(mos, zapobj, ZFS_DELEG_USER, checkflag, &id, perm) == 0) return (0); /* check for users primary group */ id = crgetgid(cr); if (dsl_check_access(mos, zapobj, ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) return (0); /* check for everyone entry */ id = -1; if (dsl_check_access(mos, zapobj, ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0) return (0); /* check each supplemental group user is a member of */ ngids = crgetngroups(cr); gids = crgetgroups(cr); for (i = 0; i != ngids; i++) { id = gids[i]; if (dsl_check_access(mos, zapobj, ZFS_DELEG_GROUP, checkflag, &id, perm) == 0) return (0); } return (SET_ERROR(EPERM)); } /* * Iterate over the sets specified in the specified zapobj * and load them into the permsets avl tree. */ static int dsl_load_sets(objset_t *mos, uint64_t zapobj, char type, char checkflag, void *valp, avl_tree_t *avl) { zap_cursor_t zc; zap_attribute_t za; perm_set_t *permnode; avl_index_t idx; uint64_t jumpobj; int error; char whokey[ZFS_MAX_DELEG_NAME]; zfs_deleg_whokey(whokey, type, checkflag, valp); error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj); if (error != 0) return (error); for (zap_cursor_init(&zc, mos, jumpobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP); (void) strlcpy(permnode->p_setname, za.za_name, sizeof (permnode->p_setname)); permnode->p_matched = B_FALSE; if (avl_find(avl, permnode, &idx) == NULL) { avl_insert(avl, permnode, idx); } else { kmem_free(permnode, sizeof (perm_set_t)); } } zap_cursor_fini(&zc); return (0); } /* * Load all permissions user based on cred belongs to. */ static void dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl, char checkflag, cred_t *cr) { const gid_t *gids; int ngids, i; uint64_t id; id = crgetuid(cr); (void) dsl_load_sets(mos, zapobj, ZFS_DELEG_USER_SETS, checkflag, &id, avl); id = crgetgid(cr); (void) dsl_load_sets(mos, zapobj, ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); (void) dsl_load_sets(mos, zapobj, ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl); ngids = crgetngroups(cr); gids = crgetgroups(cr); for (i = 0; i != ngids; i++) { id = gids[i]; (void) dsl_load_sets(mos, zapobj, ZFS_DELEG_GROUP_SETS, checkflag, &id, avl); } } /* * Check if user has requested permission. */ int dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) { dsl_dir_t *dd; dsl_pool_t *dp; void *cookie; int error; char checkflag; objset_t *mos; avl_tree_t permsets; perm_set_t *setnode; dp = ds->ds_dir->dd_pool; mos = dp->dp_meta_objset; if (dsl_delegation_on(mos) == B_FALSE) return (SET_ERROR(ECANCELED)); if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < SPA_VERSION_DELEGATED_PERMS) return (SET_ERROR(EPERM)); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { /* * Snapshots are treated as descendents only, * local permissions do not apply. */ checkflag = ZFS_DELEG_DESCENDENT; } else { checkflag = ZFS_DELEG_LOCAL; } avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), offsetof(perm_set_t, p_node)); ASSERT(dsl_pool_config_held(dp)); for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, checkflag = ZFS_DELEG_DESCENDENT) { uint64_t zapobj; boolean_t expanded; /* * If not in global zone then make sure * the zoned property is set */ if (!INGLOBALZONE(curthread)) { uint64_t zoned; if (dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_ZONED), 8, 1, &zoned, NULL, B_FALSE) != 0) break; if (!zoned) break; } zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (zapobj == 0) continue; dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); again: expanded = B_FALSE; for (setnode = avl_first(&permsets); setnode; setnode = AVL_NEXT(&permsets, setnode)) { if (setnode->p_matched == B_TRUE) continue; /* See if this set directly grants this permission */ error = dsl_check_access(mos, zapobj, ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); if (error == 0) goto success; if (error == EPERM) setnode->p_matched = B_TRUE; /* See if this set includes other sets */ error = dsl_load_sets(mos, zapobj, ZFS_DELEG_NAMED_SET_SETS, 0, setnode->p_setname, &permsets); if (error == 0) setnode->p_matched = expanded = B_TRUE; } /* * If we expanded any sets, that will define more sets, * which we need to check. */ if (expanded) goto again; error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); if (error == 0) goto success; } error = SET_ERROR(EPERM); success: cookie = NULL; while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) kmem_free(setnode, sizeof (perm_set_t)); return (error); } int dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *ds; int error; error = dsl_pool_hold(dsname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, dsname, FTAG, &ds); if (error == 0) { error = dsl_deleg_access_impl(ds, perm, cr); dsl_dataset_rele(ds, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } /* * Other routines. */ static void copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, boolean_t dosets, uint64_t uid, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; uint64_t jumpobj, pjumpobj; uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; zap_cursor_t zc; zap_attribute_t za; char whokey[ZFS_MAX_DELEG_NAME]; zfs_deleg_whokey(whokey, dosets ? ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE, ZFS_DELEG_LOCAL, NULL); if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0) return; if (zapobj == 0) { dmu_buf_will_dirty(dd->dd_dbuf, tx); zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); } zfs_deleg_whokey(whokey, dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER, ZFS_DELEG_LOCAL, &uid); if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); } for (zap_cursor_init(&zc, mos, pjumpobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { uint64_t zero = 0; ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); VERIFY(zap_update(mos, jumpobj, za.za_name, 8, 1, &zero, tx) == 0); } zap_cursor_fini(&zc); } /* * set all create time permission on new dataset. */ void dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) { dsl_dir_t *dd; uint64_t uid = crgetuid(cr); if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) < SPA_VERSION_DELEGATED_PERMS) return; for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; if (pzapobj == 0) continue; copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx); copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx); } } int dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) { zap_cursor_t zc; zap_attribute_t za; if (zapobj == 0) return (0); for (zap_cursor_init(&zc, mos, zapobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx)); } zap_cursor_fini(&zc); VERIFY(0 == zap_destroy(mos, zapobj, tx)); return (0); } boolean_t dsl_delegation_on(objset_t *os) { return (!!spa_delegation(os->os_spa)); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_destroy.c (revision 286575) @@ -1,971 +1,971 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2013 by Joyent, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include typedef struct dmu_snapshots_destroy_arg { nvlist_t *dsda_snaps; nvlist_t *dsda_successful_snaps; boolean_t dsda_defer; nvlist_t *dsda_errlist; } dmu_snapshots_destroy_arg_t; int dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer) { - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); if (dsl_dataset_long_held(ds)) return (SET_ERROR(EBUSY)); /* * Only allow deferred destroy on pools that support it. * NOTE: deferred destroy is only supported on snapshots. */ if (defer) { if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) return (SET_ERROR(ENOTSUP)); return (0); } /* * If this snapshot has an elevated user reference count, * we can't destroy it yet. */ if (ds->ds_userrefs > 0) return (SET_ERROR(EBUSY)); /* * Can't delete a branch point. */ if (dsl_dataset_phys(ds)->ds_num_children > 1) return (SET_ERROR(EEXIST)); return (0); } static int dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx) { dmu_snapshots_destroy_arg_t *dsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); nvpair_t *pair; int error = 0; if (!dmu_tx_is_syncing(tx)) return (0); for (pair = nvlist_next_nvpair(dsda->dsda_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_snaps, pair)) { dsl_dataset_t *ds; error = dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds); /* * If the snapshot does not exist, silently ignore it * (it's "already destroyed"). */ if (error == ENOENT) continue; if (error == 0) { error = dsl_destroy_snapshot_check_impl(ds, dsda->dsda_defer); dsl_dataset_rele(ds, FTAG); } if (error == 0) { fnvlist_add_boolean(dsda->dsda_successful_snaps, nvpair_name(pair)); } else { fnvlist_add_int32(dsda->dsda_errlist, nvpair_name(pair), error); } } pair = nvlist_next_nvpair(dsda->dsda_errlist, NULL); if (pair != NULL) return (fnvpair_value_int32(pair)); return (0); } struct process_old_arg { dsl_dataset_t *ds; dsl_dataset_t *ds_prev; boolean_t after_branch_point; zio_t *pio; uint64_t used, comp, uncomp; }; static int process_old_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { struct process_old_arg *poa = arg; dsl_pool_t *dp = poa->ds->ds_dir->dd_pool; ASSERT(!BP_IS_HOLE(bp)); if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) { dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, tx); if (poa->ds_prev && !poa->after_branch_point && bp->blk_birth > dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) { dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes += bp_get_dsize_sync(dp->dp_spa, bp); } } else { poa->used += bp_get_dsize_sync(dp->dp_spa, bp); poa->comp += BP_GET_PSIZE(bp); poa->uncomp += BP_GET_UCSIZE(bp); dsl_free_sync(poa->pio, dp, tx->tx_txg, bp); } return (0); } static void process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev, dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx) { struct process_old_arg poa = { 0 }; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; uint64_t deadlist_obj; ASSERT(ds->ds_deadlist.dl_oldfmt); ASSERT(ds_next->ds_deadlist.dl_oldfmt); poa.ds = ds; poa.ds_prev = ds_prev; poa.after_branch_point = after_branch_point; poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj, process_old_cb, &poa, tx)); VERIFY0(zio_wait(poa.pio)); ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes); /* change snapused */ dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -poa.used, -poa.comp, -poa.uncomp, tx); /* swap next's deadlist to our deadlist */ dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_close(&ds_next->ds_deadlist); deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; dsl_dataset_phys(ds)->ds_deadlist_obj = dsl_dataset_phys(ds_next)->ds_deadlist_obj; dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj; dsl_deadlist_open(&ds->ds_deadlist, mos, dsl_dataset_phys(ds)->ds_deadlist_obj); dsl_deadlist_open(&ds_next->ds_deadlist, mos, dsl_dataset_phys(ds_next)->ds_deadlist_obj); } static void dsl_dataset_remove_clones_key(dsl_dataset_t *ds, uint64_t mintxg, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; zap_cursor_t zc; zap_attribute_t za; /* * If it is the old version, dd_clones doesn't exist so we can't * find the clones, but dsl_deadlist_remove_key() is a no-op so it * doesn't matter. */ if (dsl_dir_phys(ds->ds_dir)->dd_clones == 0) return; for (zap_cursor_init(&zc, mos, dsl_dir_phys(ds->ds_dir)->dd_clones); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { dsl_dataset_t *clone; VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool, za.za_first_integer, FTAG, &clone)); if (clone->ds_dir->dd_origin_txg > mintxg) { dsl_deadlist_remove_key(&clone->ds_deadlist, mintxg, tx); dsl_dataset_remove_clones_key(clone, mintxg, tx); } dsl_dataset_rele(clone, FTAG); } zap_cursor_fini(&zc); } void dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) { int err; int after_branch_point = FALSE; dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; dsl_dataset_t *ds_prev = NULL; uint64_t obj; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); ASSERT(refcount_is_zero(&ds->ds_longholds)); if (defer && (ds->ds_userrefs > 0 || dsl_dataset_phys(ds)->ds_num_children > 1)) { ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; spa_history_log_internal_ds(ds, "defer_destroy", tx, ""); return; } ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); /* We need to log before removing it from the namespace. */ spa_history_log_internal_ds(ds, "destroy", tx, ""); dsl_scan_ds_destroyed(ds, tx); obj = ds->ds_object; if (ds->ds_large_blocks) { ASSERT0(zap_contains(mos, obj, DS_FIELD_LARGE_BLOCKS)); spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx); } if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { ASSERT3P(ds->ds_prev, ==, NULL); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev)); after_branch_point = (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj); dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) { dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { VERIFY0(zap_add_int(mos, dsl_dataset_phys(ds_prev)-> ds_next_clones_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, tx)); } } if (!after_branch_point) { dsl_dataset_phys(ds_prev)->ds_next_snap_obj = dsl_dataset_phys(ds)->ds_next_snap_obj; } } dsl_dataset_t *ds_next; uint64_t old_unique; uint64_t used = 0, comp = 0, uncomp = 0; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next)); ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj); old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes; dmu_buf_will_dirty(ds_next->ds_dbuf, tx); dsl_dataset_phys(ds_next)->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; dsl_dataset_phys(ds_next)->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0); if (ds_next->ds_deadlist.dl_oldfmt) { process_old_deadlist(ds, ds_prev, ds_next, after_branch_point, tx); } else { /* Adjust prev's unique space. */ if (ds_prev && !after_branch_point) { dsl_deadlist_space_range(&ds_next->ds_deadlist, dsl_dataset_phys(ds_prev)->ds_prev_snap_txg, dsl_dataset_phys(ds)->ds_prev_snap_txg, &used, &comp, &uncomp); dsl_dataset_phys(ds_prev)->ds_unique_bytes += used; } /* Adjust snapused. */ dsl_deadlist_space_range(&ds_next->ds_deadlist, dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX, &used, &comp, &uncomp); dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, -used, -comp, -uncomp, tx); /* Move blocks to be freed to pool's free list. */ dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); /* Merge our deadlist into next's and free it. */ dsl_deadlist_merge(&ds_next->ds_deadlist, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); } dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_deadlist_obj = 0; /* Collapse range in clone heads */ dsl_dataset_remove_clones_key(ds, dsl_dataset_phys(ds)->ds_creation_txg, tx); - if (dsl_dataset_is_snapshot(ds_next)) { + if (ds_next->ds_is_snapshot) { dsl_dataset_t *ds_nextnext; /* * Update next's unique to include blocks which * were previously shared by only this snapshot * and it. Those blocks will be born after the * prev snap and before this snap, and will have * died after the next snap and before the one * after that (ie. be on the snap after next's * deadlist). */ VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds_next)->ds_next_snap_obj, FTAG, &ds_nextnext)); dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, dsl_dataset_phys(ds)->ds_prev_snap_txg, dsl_dataset_phys(ds)->ds_creation_txg, &used, &comp, &uncomp); dsl_dataset_phys(ds_next)->ds_unique_bytes += used; dsl_dataset_rele(ds_nextnext, FTAG); ASSERT3P(ds_next->ds_prev, ==, NULL); /* Collapse range in this head. */ dsl_dataset_t *hds; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &hds)); dsl_deadlist_remove_key(&hds->ds_deadlist, dsl_dataset_phys(ds)->ds_creation_txg, tx); dsl_dataset_rele(hds, FTAG); } else { ASSERT3P(ds_next->ds_prev, ==, ds); dsl_dataset_rele(ds_next->ds_prev, ds_next); ds_next->ds_prev = NULL; if (ds_prev) { VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds_next, &ds_next->ds_prev)); } dsl_dataset_recalc_head_uniq(ds_next); /* * Reduce the amount of our unconsumed refreservation * being charged to our parent by the amount of * new unique data we have gained. */ if (old_unique < ds_next->ds_reserved) { int64_t mrsdelta; uint64_t new_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes; ASSERT(old_unique <= new_unique); mrsdelta = MIN(new_unique - old_unique, ds_next->ds_reserved - old_unique); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); } } dsl_dataset_rele(ds_next, FTAG); /* * This must be done after the dsl_traverse(), because it will * re-open the objset. */ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } /* remove from snapshot namespace */ dsl_dataset_t *ds_head; ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head)); VERIFY0(dsl_dataset_get_snapname(ds)); #ifdef ZFS_DEBUG { uint64_t val; err = dsl_dataset_snap_lookup(ds_head, ds->ds_snapname, &val); ASSERT0(err); ASSERT3U(val, ==, obj); } #endif VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE)); dsl_dataset_rele(ds_head, FTAG); if (ds_prev != NULL) dsl_dataset_rele(ds_prev, FTAG); spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { uint64_t count; ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, &count) && count == 0); VERIFY0(dmu_object_free(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, tx)); } if (dsl_dataset_phys(ds)->ds_props_obj != 0) VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj, tx)); if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, tx)); dsl_dir_rele(ds->ds_dir, ds); ds->ds_dir = NULL; dmu_object_free_zapified(mos, obj, tx); } static void dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) { dmu_snapshots_destroy_arg_t *dsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); nvpair_t *pair; for (pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(dsda->dsda_successful_snaps, pair)) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); dsl_destroy_snapshot_sync_impl(ds, dsda->dsda_defer, tx); dsl_dataset_rele(ds, FTAG); } } /* * The semantics of this function are described in the comment above * lzc_destroy_snaps(). To summarize: * * The snapshots must all be in the same pool. * * Snapshots that don't exist will be silently ignored (considered to be * "already deleted"). * * On success, all snaps will be destroyed and this will return 0. * On failure, no snaps will be destroyed, the errlist will be filled in, * and this will return an errno. */ int dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, nvlist_t *errlist) { dmu_snapshots_destroy_arg_t dsda; int error; nvpair_t *pair; pair = nvlist_next_nvpair(snaps, NULL); if (pair == NULL) return (0); dsda.dsda_snaps = snaps; dsda.dsda_successful_snaps = fnvlist_alloc(); dsda.dsda_defer = defer; dsda.dsda_errlist = errlist; error = dsl_sync_task(nvpair_name(pair), dsl_destroy_snapshot_check, dsl_destroy_snapshot_sync, &dsda, 0, ZFS_SPACE_CHECK_NONE); fnvlist_free(dsda.dsda_successful_snaps); return (error); } int dsl_destroy_snapshot(const char *name, boolean_t defer) { int error; nvlist_t *nvl = fnvlist_alloc(); nvlist_t *errlist = fnvlist_alloc(); fnvlist_add_boolean(nvl, name); error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); fnvlist_free(errlist); fnvlist_free(nvl); return (error); } struct killarg { dsl_dataset_t *ds; dmu_tx_t *tx; }; /* ARGSUSED */ static int kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; dmu_tx_t *tx = ka->tx; if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); if (zb->zb_level == ZB_ZIL_LEVEL) { ASSERT(zilog != NULL); /* * It's a block in the intent log. It has no * accounting, so just free it. */ dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { ASSERT(zilog == NULL); ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } return (0); } static void old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) { struct killarg ka; /* * Free everything that we point to (that's born after * the previous snapshot, if we are a clone) * * NB: this should be very quick, because we already * freed all the objects in open context. */ ka.ds = ds; ka.tx = tx; VERIFY0(traverse_dataset(ds, dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST, kill_blkptr, &ka)); ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || dsl_dataset_phys(ds)->ds_unique_bytes == 0); } typedef struct dsl_destroy_head_arg { const char *ddha_name; } dsl_destroy_head_arg_t; int dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) { int error; uint64_t count; objset_t *mos; - ASSERT(!dsl_dataset_is_snapshot(ds)); - if (dsl_dataset_is_snapshot(ds)) + ASSERT(!ds->ds_is_snapshot); + if (ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); if (refcount_count(&ds->ds_longholds) != expected_holds) return (SET_ERROR(EBUSY)); mos = ds->ds_dir->dd_pool->dp_meta_objset; /* * Can't delete a head dataset if there are snapshots of it. * (Except if the only snapshots are from the branch we cloned * from.) */ if (ds->ds_prev != NULL && dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) return (SET_ERROR(EBUSY)); /* * Can't delete if there are children of this fs. */ error = zap_count(mos, dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count); if (error != 0) return (error); if (count != 0) return (SET_ERROR(EEXIST)); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && ds->ds_prev->ds_userrefs == 0) { /* We need to remove the origin snapshot as well. */ if (!refcount_is_zero(&ds->ds_prev->ds_longholds)) return (SET_ERROR(EBUSY)); } return (0); } static int dsl_destroy_head_check(void *arg, dmu_tx_t *tx) { dsl_destroy_head_arg_t *ddha = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); if (error != 0) return (error); error = dsl_destroy_head_check_impl(ds, 0); dsl_dataset_rele(ds, FTAG); return (error); } static void dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) { dsl_dir_t *dd; dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; dd_used_t t; ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj); /* * Decrement the filesystem count for all parent filesystems. * * When we receive an incremental stream into a filesystem that already * exists, a temporary clone is created. We never count this temporary * clone, whose name begins with a '%'. */ if (dd->dd_myname[0] != '%' && dd->dd_parent != NULL) dsl_fs_ss_count_adjust(dd->dd_parent, -1, DD_FIELD_FILESYSTEM_COUNT, tx); /* * Remove our reservation. The impl() routine avoids setting the * actual property, which would require the (already destroyed) ds. */ dsl_dir_set_reservation_sync_impl(dd, 0, tx); ASSERT0(dsl_dir_phys(dd)->dd_used_bytes); ASSERT0(dsl_dir_phys(dd)->dd_reserved); for (t = 0; t < DD_USED_NUM; t++) ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]); VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx)); VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx)); VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx)); VERIFY0(zap_remove(mos, dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, dd->dd_myname, tx)); dsl_dir_rele(dd, FTAG); dmu_object_free_zapified(mos, ddobj, tx); } void dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; uint64_t obj, ddobj, prevobj = 0; boolean_t rmorigin; ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); ASSERT(ds->ds_prev == NULL || dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); /* We need to log before removing it from the namespace. */ spa_history_log_internal_ds(ds, "destroy", tx, ""); rmorigin = (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && ds->ds_prev->ds_userrefs == 0); /* Remove our reservation. */ if (ds->ds_reserved != 0) { dsl_dataset_set_refreservation_sync_impl(ds, (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), 0, tx); ASSERT0(ds->ds_reserved); } if (ds->ds_large_blocks) spa_feature_decr(dp->dp_spa, SPA_FEATURE_LARGE_BLOCKS, tx); dsl_scan_ds_destroyed(ds, tx); obj = ds->ds_object; if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { /* This is a clone */ ASSERT(ds->ds_prev != NULL); ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=, obj); ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj); dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) { dsl_dataset_remove_from_next_clones(ds->ds_prev, obj, tx); } ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1); dsl_dataset_phys(ds->ds_prev)->ds_num_children--; } /* * Destroy the deadlist. Unless it's a clone, the * deadlist should be empty. (If it's a clone, it's * safe to ignore the deadlist contents.) */ dsl_deadlist_close(&ds->ds_deadlist); dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_deadlist_obj = 0; objset_t *os; VERIFY0(dmu_objset_from_ds(ds, &os)); if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { old_synchronous_dataset_destroy(ds, tx); } else { /* * Move the bptree into the pool's list of trees to * clean up and update space accounting information. */ uint64_t used, comp, uncomp; zil_destroy_sync(dmu_objset_zil(os), tx); if (!spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { dsl_scan_t *scn = dp->dp_scan; spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, tx); dp->dp_bptree_obj = bptree_alloc(mos, tx); VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, &dp->dp_bptree_obj, tx)); ASSERT(!scn->scn_async_destroying); scn->scn_async_destroying = B_TRUE; } used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || dsl_dataset_phys(ds)->ds_unique_bytes == used); bptree_add(mos, dp->dp_bptree_obj, &dsl_dataset_phys(ds)->ds_bp, dsl_dataset_phys(ds)->ds_prev_snap_txg, used, comp, uncomp, tx); dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, -used, -comp, -uncomp, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, used, comp, uncomp, tx); } if (ds->ds_prev != NULL) { if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { VERIFY0(zap_remove_int(mos, dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones, ds->ds_object, tx)); } prevobj = ds->ds_prev->ds_object; dsl_dataset_rele(ds->ds_prev, ds); ds->ds_prev = NULL; } /* * This must be done after the dsl_traverse(), because it will * re-open the objset. */ if (ds->ds_objset) { dmu_objset_evict(ds->ds_objset); ds->ds_objset = NULL; } /* Erase the link in the dir */ dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0; ddobj = ds->ds_dir->dd_object; ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0); VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx)); if (ds->ds_bookmarks != 0) { VERIFY0(zap_destroy(mos, ds->ds_bookmarks, tx)); spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); } spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj); ASSERT0(dsl_dataset_phys(ds)->ds_props_obj); ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj); dsl_dir_rele(ds->ds_dir, ds); ds->ds_dir = NULL; dmu_object_free_zapified(mos, obj, tx); dsl_dir_destroy_sync(ddobj, tx); if (rmorigin) { dsl_dataset_t *prev; VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev)); dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); dsl_dataset_rele(prev, FTAG); } } static void dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) { dsl_destroy_head_arg_t *ddha = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); dsl_destroy_head_sync_impl(ds, tx); dsl_dataset_rele(ds, FTAG); } static void dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) { dsl_destroy_head_arg_t *ddha = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); /* Mark it as inconsistent on-disk, in case we crash */ dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; spa_history_log_internal_ds(ds, "destroy begin", tx, ""); dsl_dataset_rele(ds, FTAG); } int dsl_destroy_head(const char *name) { dsl_destroy_head_arg_t ddha; int error; spa_t *spa; boolean_t isenabled; #ifdef _KERNEL zfs_destroy_unmount_origin(name); #endif error = spa_open(name, &spa, FTAG); if (error != 0) return (error); isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY); spa_close(spa, FTAG); ddha.ddha_name = name; if (!isenabled) { objset_t *os; error = dsl_sync_task(name, dsl_destroy_head_check, dsl_destroy_head_begin_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE); if (error != 0) return (error); /* * Head deletion is processed in one txg on old pools; * remove the objects from open context so that the txg sync * is not too long. */ error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, FTAG, &os); if (error == 0) { uint64_t prev_snap_txg = dsl_dataset_phys(dmu_objset_ds(os))-> ds_prev_snap_txg; for (uint64_t obj = 0; error == 0; error = dmu_object_next(os, &obj, FALSE, prev_snap_txg)) (void) dmu_free_long_object(os, obj); /* sync out all frees */ txg_wait_synced(dmu_objset_pool(os), 0); dmu_objset_disown(os, FTAG); } } return (dsl_sync_task(name, dsl_destroy_head_check, dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_NONE)); } /* * Note, this function is used as the callback for dmu_objset_find(). We * always return 0 so that we will continue to find and process * inconsistent datasets, even if we encounter an error trying to * process one of them. */ /* ARGSUSED */ int dsl_destroy_inconsistent(const char *dsname, void *arg) { objset_t *os; if (dmu_objset_hold(dsname, FTAG, &os) == 0) { boolean_t inconsistent = DS_IS_INCONSISTENT(dmu_objset_ds(os)); dmu_objset_rele(os, FTAG); if (inconsistent) (void) dsl_destroy_head(dsname); } return (0); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_dir.c (revision 286575) @@ -1,1984 +1,2002 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2014 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" /* * Filesystem and Snapshot Limits * ------------------------------ * * These limits are used to restrict the number of filesystems and/or snapshots * that can be created at a given level in the tree or below. A typical * use-case is with a delegated dataset where the administrator wants to ensure * that a user within the zone is not creating too many additional filesystems * or snapshots, even though they're not exceeding their space quota. * * The filesystem and snapshot counts are stored as extensible properties. This * capability is controlled by a feature flag and must be enabled to be used. * Once enabled, the feature is not active until the first limit is set. At * that point, future operations to create/destroy filesystems or snapshots * will validate and update the counts. * * Because the count properties will not exist before the feature is active, * the counts are updated when a limit is first set on an uninitialized * dsl_dir node in the tree (The filesystem/snapshot count on a node includes * all of the nested filesystems/snapshots. Thus, a new leaf node has a * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and * snapshot count properties on a node indicate uninitialized counts on that * node.) When first setting a limit on an uninitialized node, the code starts * at the filesystem with the new limit and descends into all sub-filesystems * to add the count properties. * * In practice this is lightweight since a limit is typically set when the * filesystem is created and thus has no children. Once valid, changing the * limit value won't require a re-traversal since the counts are already valid. * When recursively fixing the counts, if a node with a limit is encountered * during the descent, the counts are known to be valid and there is no need to * descend into that filesystem's children. The counts on filesystems above the * one with the new limit will still be uninitialized, unless a limit is * eventually set on one of those filesystems. The counts are always recursively * updated when a limit is set on a dataset, unless there is already a limit. * When a new limit value is set on a filesystem with an existing limit, it is * possible for the new limit to be less than the current count at that level * since a user who can change the limit is also allowed to exceed the limit. * * Once the feature is active, then whenever a filesystem or snapshot is * created, the code recurses up the tree, validating the new count against the * limit at each initialized level. In practice, most levels will not have a * limit set. If there is a limit at any initialized level up the tree, the * check must pass or the creation will fail. Likewise, when a filesystem or * snapshot is destroyed, the counts are recursively adjusted all the way up * the initizized nodes in the tree. Renaming a filesystem into different point * in the tree will first validate, then update the counts on each branch up to * the common ancestor. A receive will also validate the counts and then update * them. * * An exception to the above behavior is that the limit is not enforced if the * user has permission to modify the limit. This is primarily so that * recursive snapshots in the global zone always work. We want to prevent a * denial-of-service in which a lower level delegated dataset could max out its * limit and thus block recursive snapshots from being taken in the global zone. * Because of this, it is possible for the snapshot count to be over the limit * and snapshots taken in the global zone could cause a lower level dataset to * hit or exceed its limit. The administrator taking the global zone recursive * snapshot should be aware of this side-effect and behave accordingly. * For consistency, the filesystem limit is also not enforced if the user can * modify the limit. * * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check() * and updated by dsl_fs_ss_count_adjust(). A new limit value is setup in * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by * dsl_dir_init_fs_ss_count(). * * There is a special case when we receive a filesystem that already exists. In * this case a temporary clone name of %X is created (see dmu_recv_begin). We * never update the filesystem counts for temporary clones. * * Likewise, we do not update the snapshot counts for temporary snapshots, * such as those created by zfs diff. */ extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); -/* ARGSUSED */ static void -dsl_dir_evict(dmu_buf_t *db, void *arg) +dsl_dir_evict(void *dbu) { - dsl_dir_t *dd = arg; + dsl_dir_t *dd = dbu; dsl_pool_t *dp = dd->dd_pool; int t; + dd->dd_dbuf = NULL; + for (t = 0; t < TXG_SIZE; t++) { ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); ASSERT(dd->dd_tempreserved[t] == 0); ASSERT(dd->dd_space_towrite[t] == 0); } if (dd->dd_parent) - dsl_dir_rele(dd->dd_parent, dd); + dsl_dir_async_rele(dd->dd_parent, dd); - spa_close(dd->dd_pool->dp_spa, dd); + spa_async_close(dd->dd_pool->dp_spa, dd); /* * The props callback list should have been cleaned up by * objset_evict(). */ list_destroy(&dd->dd_prop_cbs); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); } int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **ddp) { dmu_buf_t *dbuf; dsl_dir_t *dd; int err; ASSERT(dsl_pool_config_held(dp)); err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); if (err != 0) return (err); dd = dmu_buf_get_user(dbuf); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(dbuf, &doi); ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); } #endif if (dd == NULL) { dsl_dir_t *winner; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; dd->dd_dbuf = dbuf; dd->dd_pool = dp; mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_node)); dsl_dir_snap_cmtime_update(dd); if (dsl_dir_phys(dd)->dd_parent_obj) { err = dsl_dir_hold_obj(dp, dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, &dd->dd_parent); if (err != 0) goto errout; if (tail) { #ifdef ZFS_DEBUG uint64_t foundobj; err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, tail, sizeof (foundobj), 1, &foundobj); ASSERT(err || foundobj == ddobj); #endif (void) strcpy(dd->dd_myname, tail); } else { err = zap_value_search(dp->dp_meta_objset, dsl_dir_phys(dd->dd_parent)-> dd_child_dir_zapobj, ddobj, 0, dd->dd_myname); } if (err != 0) goto errout; } else { (void) strcpy(dd->dd_myname, spa_name(dp->dp_spa)); } if (dsl_dir_is_clone(dd)) { dmu_buf_t *origin_bonus; dsl_dataset_phys_t *origin_phys; /* * We can't open the origin dataset, because * that would require opening this dsl_dir. * Just look at its phys directly instead. */ err = dmu_bonus_hold(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_bonus); if (err != 0) goto errout; origin_phys = origin_bonus->db_data; dd->dd_origin_txg = origin_phys->ds_creation_txg; dmu_buf_rele(origin_bonus, FTAG); } - winner = dmu_buf_set_user_ie(dbuf, dd, dsl_dir_evict); - if (winner) { + dmu_buf_init_user(&dd->dd_dbu, dsl_dir_evict, &dd->dd_dbuf); + winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); + if (winner != NULL) { if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dd = winner; } else { spa_open_ref(dp->dp_spa, dd); } } /* * The dsl_dir_t has both open-to-close and instantiate-to-evict * holds on the spa. We need the open-to-close holds because * otherwise the spa_refcnt wouldn't change when we open a * dir which the spa also has open, so we could incorrectly * think it was OK to unload/export/destroy the pool. We need * the instantiate-to-evict hold because the dsl_dir_t has a * pointer to the dd_pool, which has a pointer to the spa_t. */ spa_open_ref(dp->dp_spa, tag); ASSERT3P(dd->dd_pool, ==, dp); ASSERT3U(dd->dd_object, ==, ddobj); ASSERT3P(dd->dd_dbuf, ==, dbuf); *ddp = dd; return (0); errout: if (dd->dd_parent) dsl_dir_rele(dd->dd_parent, dd); mutex_destroy(&dd->dd_lock); kmem_free(dd, sizeof (dsl_dir_t)); dmu_buf_rele(dbuf, tag); return (err); } void dsl_dir_rele(dsl_dir_t *dd, void *tag) { dprintf_dd(dd, "%s\n", ""); spa_close(dd->dd_pool->dp_spa, tag); dmu_buf_rele(dd->dd_dbuf, tag); } +/* + * Remove a reference to the given dsl dir that is being asynchronously + * released. Async releases occur from a taskq performing eviction of + * dsl datasets and dirs. This process is identical to a normal release + * with the exception of using the async API for releasing the reference on + * the spa. + */ +void +dsl_dir_async_rele(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_async_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele(dd->dd_dbuf, tag); +} + /* buf must be long enough (MAXNAMELEN + strlen(MOS_DIR_NAME) + 1 should do) */ void dsl_dir_name(dsl_dir_t *dd, char *buf) { if (dd->dd_parent) { dsl_dir_name(dd->dd_parent, buf); (void) strcat(buf, "/"); } else { buf[0] = '\0'; } if (!MUTEX_HELD(&dd->dd_lock)) { /* * recursive mutex so that we can use * dprintf_dd() with dd_lock held */ mutex_enter(&dd->dd_lock); (void) strcat(buf, dd->dd_myname); mutex_exit(&dd->dd_lock); } else { (void) strcat(buf, dd->dd_myname); } } /* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ int dsl_dir_namelen(dsl_dir_t *dd) { int result = 0; if (dd->dd_parent) { /* parent's name + 1 for the "/" */ result = dsl_dir_namelen(dd->dd_parent) + 1; } if (!MUTEX_HELD(&dd->dd_lock)) { /* see dsl_dir_name */ mutex_enter(&dd->dd_lock); result += strlen(dd->dd_myname); mutex_exit(&dd->dd_lock); } else { result += strlen(dd->dd_myname); } return (result); } static int getcomponent(const char *path, char *component, const char **nextp) { char *p; if ((path == NULL) || (path[0] == '\0')) return (SET_ERROR(ENOENT)); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); if (p && (p[1] == '/' || p[1] == '@')) { /* two separators in a row */ return (SET_ERROR(EINVAL)); } if (p == NULL || p == path) { /* * if the first thing is an @ or /, it had better be an * @ and it had better not have any more ats or slashes, * and it had better have something after the @. */ if (p != NULL && (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) return (SET_ERROR(EINVAL)); if (strlen(path) >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strcpy(component, path); p = NULL; } else if (p[0] == '/') { if (p - path >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; p++; } else if (p[0] == '@') { /* * if the next separator is an @, there better not be * any more slashes. */ if (strchr(path, '/')) return (SET_ERROR(EINVAL)); if (p - path >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); (void) strncpy(component, path, p - path); component[p - path] = '\0'; } else { panic("invalid p=%p", (void *)p); } *nextp = p; return (0); } /* * Return the dsl_dir_t, and possibly the last component which couldn't * be found in *tail. The name must be in the specified dsl_pool_t. This * thread must hold the dp_config_rwlock for the pool. Returns NULL if the * path is bogus, or if tail==NULL and we couldn't parse the whole name. * (*tail)[0] == '@' means that the last component is a snapshot. */ int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **ddp, const char **tailp) { char buf[MAXNAMELEN]; const char *spaname, *next, *nextnext = NULL; int err; dsl_dir_t *dd; uint64_t ddobj; err = getcomponent(name, buf, &next); if (err != 0) return (err); /* Make sure the name is in the specified pool. */ spaname = spa_name(dp->dp_spa); if (strcmp(buf, spaname) != 0) return (SET_ERROR(EXDEV)); ASSERT(dsl_pool_config_held(dp)); err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); if (err != 0) { return (err); } while (next != NULL) { - dsl_dir_t *child_ds; + dsl_dir_t *child_dd; err = getcomponent(next, buf, &nextnext); if (err != 0) break; ASSERT(next[0] != '\0'); if (next[0] == '@') break; dprintf("looking up %s in obj%lld\n", buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dd)->dd_child_dir_zapobj, buf, sizeof (ddobj), 1, &ddobj); if (err != 0) { if (err == ENOENT) err = 0; break; } - err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_ds); + err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); if (err != 0) break; dsl_dir_rele(dd, tag); - dd = child_ds; + dd = child_dd; next = nextnext; } if (err != 0) { dsl_dir_rele(dd, tag); return (err); } /* * It's an error if there's more than one component left, or * tailp==NULL and there's any component left. */ if (next != NULL && (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { /* bad path name */ dsl_dir_rele(dd, tag); dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); err = SET_ERROR(ENOENT); } if (tailp != NULL) *tailp = next; *ddp = dd; return (err); } /* * If the counts are already initialized for this filesystem and its * descendants then do nothing, otherwise initialize the counts. * * The counts on this filesystem, and those below, may be uninitialized due to * either the use of a pre-existing pool which did not support the * filesystem/snapshot limit feature, or one in which the feature had not yet * been enabled. * * Recursively descend the filesystem tree and update the filesystem/snapshot * counts on each filesystem below, then update the cumulative count on the * current filesystem. If the filesystem already has a count set on it, * then we know that its counts, and the counts on the filesystems below it, * are already correct, so we don't have to update this filesystem. */ static void dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) { uint64_t my_fs_cnt = 0; uint64_t my_ss_cnt = 0; dsl_pool_t *dp = dd->dd_pool; objset_t *os = dp->dp_meta_objset; zap_cursor_t *zc; zap_attribute_t *za; dsl_dataset_t *ds; ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); ASSERT(dsl_pool_config_held(dp)); ASSERT(dmu_tx_is_syncing(tx)); dsl_dir_zapify(dd, tx); /* * If the filesystem count has already been initialized then we * don't need to recurse down any further. */ if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) return; zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); /* Iterate my child dirs */ for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dir_t *chld_dd; uint64_t count; VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, &chld_dd)); /* * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and * temporary datasets. */ if (chld_dd->dd_myname[0] == '$' || chld_dd->dd_myname[0] == '%') { dsl_dir_rele(chld_dd, FTAG); continue; } my_fs_cnt++; /* count this child */ dsl_dir_init_fs_ss_count(chld_dd, tx); VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); my_fs_cnt += count; VERIFY0(zap_lookup(os, chld_dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); my_ss_cnt += count; dsl_dir_rele(chld_dd, FTAG); } zap_cursor_fini(zc); /* Count my snapshots (we counted children's snapshots above) */ VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { /* Don't count temporary snapshots */ if (za->za_name[0] != '%') my_ss_cnt++; } zap_cursor_fini(zc); dsl_dataset_rele(ds, FTAG); kmem_free(zc, sizeof (zap_cursor_t)); kmem_free(za, sizeof (zap_attribute_t)); /* we're in a sync task, update counts */ dmu_buf_will_dirty(dd->dd_dbuf, tx); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); } static int dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; int error; error = dsl_dataset_hold(dp, ddname, FTAG, &ds); if (error != 0) return (error); if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } dd = ds->ds_dir; if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && dsl_dir_is_zapified(dd) && zap_contains(dp->dp_meta_objset, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(EALREADY)); } dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) { char *ddname = (char *)arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; spa_t *spa; VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); spa = dsl_dataset_get_spa(ds); if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Since the feature was not active and we're now setting a * limit, increment the feature-active counter so that the * feature becomes active for the first time. * * We are already in a sync task so we can update the MOS. */ spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); } /* * Since we are now setting a non-UINT64_MAX limit on the filesystem, * we need to ensure the counts are correct. Descend down the tree from * this point and update all of the counts to be accurate. */ dsl_dir_init_fs_ss_count(ds->ds_dir, tx); dsl_dataset_rele(ds, FTAG); } /* * Make sure the feature is enabled and activate it if necessary. * Since we're setting a limit, ensure the on-disk counts are valid. * This is only called by the ioctl path when setting a limit value. * * We do not need to validate the new limit, since users who can change the * limit are also allowed to exceed the limit. */ int dsl_dir_activate_fs_ss_limit(const char *ddname) { int error; error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, ZFS_SPACE_CHECK_RESERVED); if (error == EALREADY) error = 0; return (error); } /* * Used to determine if the filesystem_limit or snapshot_limit should be * enforced. We allow the limit to be exceeded if the user has permission to * write the property value. We pass in the creds that we got in the open * context since we will always be the GZ root in syncing context. We also have * to handle the case where we are allowed to change the limit on the current * dataset, but there may be another limit in the tree above. * * We can never modify these two properties within a non-global zone. In * addition, the other checks are modeled on zfs_secpolicy_write_perms. We * can't use that function since we are already holding the dp_config_rwlock. * In addition, we already have the dd and dealing with snapshots is simplified * in this code. */ typedef enum { ENFORCE_ALWAYS, ENFORCE_NEVER, ENFORCE_ABOVE } enforce_res_t; static enforce_res_t dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, cred_t *cr) { enforce_res_t enforce = ENFORCE_ALWAYS; uint64_t obj; dsl_dataset_t *ds; uint64_t zoned; ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); #ifdef _KERNEL #ifdef __FreeBSD__ if (jailed(cr)) #else if (crgetzoneid(cr) != GLOBAL_ZONEID) #endif return (ENFORCE_ALWAYS); if (secpolicy_zfs(cr) == 0) return (ENFORCE_NEVER); #endif if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) return (ENFORCE_ALWAYS); ASSERT(dsl_pool_config_held(dd->dd_pool)); if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) return (ENFORCE_ALWAYS); if (dsl_prop_get_ds(ds, "zoned", 8, 1, &zoned, NULL) || zoned) { /* Only root can access zoned fs's from the GZ */ enforce = ENFORCE_ALWAYS; } else { if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) enforce = ENFORCE_ABOVE; } dsl_dataset_rele(ds, FTAG); return (enforce); } /* * Check if adding additional child filesystem(s) would exceed any filesystem * limits or adding additional snapshot(s) would exceed any snapshot limits. * The prop argument indicates which limit to check. * * Note that all filesystem limits up to the root (or the highest * initialized) filesystem or the given ancestor must be satisfied. */ int dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, dsl_dir_t *ancestor, cred_t *cr) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t limit, count; char *count_prop; enforce_res_t enforce; int err = 0; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || prop == ZFS_PROP_SNAPSHOT_LIMIT); /* * If we're allowed to change the limit, don't enforce the limit * e.g. this can happen if a snapshot is taken by an administrative * user in the global zone (i.e. a recursive snapshot by root). * However, we must handle the case of delegated permissions where we * are allowed to change the limit on the current dataset, but there * is another limit in the tree above. */ enforce = dsl_enforce_ds_ss_limits(dd, prop, cr); if (enforce == ENFORCE_NEVER) return (0); /* * e.g. if renaming a dataset with no snapshots, count adjustment * is 0. */ if (delta == 0) return (0); if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { /* * We don't enforce the limit for temporary snapshots. This is * indicated by a NULL cred_t argument. */ if (cr == NULL) return (0); count_prop = DD_FIELD_SNAPSHOT_COUNT; } else { count_prop = DD_FIELD_FILESYSTEM_COUNT; } /* * If an ancestor has been provided, stop checking the limit once we * hit that dir. We need this during rename so that we don't overcount * the check once we recurse up to the common ancestor. */ if (ancestor == dd) return (0); /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know there is no limit here (or above). The counts are * not valid on this node and we know we won't touch this node's counts. */ if (!dsl_dir_is_zapified(dd) || zap_lookup(os, dd->dd_object, count_prop, sizeof (count), 1, &count) == ENOENT) return (0); err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, B_FALSE); if (err != 0) return (err); /* Is there a limit which we've hit? */ if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) return (SET_ERROR(EDQUOT)); if (dd->dd_parent != NULL) err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, ancestor, cr); return (err); } /* * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all * parents. When a new filesystem/snapshot is created, increment the count on * all parents, and when a filesystem/snapshot is destroyed, decrement the * count. */ void dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, dmu_tx_t *tx) { int err; objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t count; ASSERT(dsl_pool_config_held(dd->dd_pool)); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); /* * When we receive an incremental stream into a filesystem that already * exists, a temporary clone is created. We don't count this temporary * clone, whose name begins with a '%'. We also ignore hidden ($FREE, * $MOS & $ORIGIN) objsets. */ if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) return; /* * e.g. if renaming a dataset with no snapshots, count adjustment is 0 */ if (delta == 0) return; /* * If we hit an uninitialized node while recursing up the tree, we can * stop since we know the counts are not valid on this node and we * know we shouldn't touch this node's counts. An uninitialized count * on the node indicates that either the feature has not yet been * activated or there are no limits on this part of the tree. */ if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, prop, sizeof (count), 1, &count)) == ENOENT) return; VERIFY0(err); count += delta; /* Use a signed verify to make sure we're not neg. */ VERIFY3S(count, >=, 0); VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, tx)); /* Roll up this additional count into our ancestors */ if (dd->dd_parent != NULL) dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); } uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; uint64_t ddobj; dsl_dir_phys_t *ddphys; dmu_buf_t *dbuf; ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); if (pds) { VERIFY(0 == zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, name, sizeof (uint64_t), 1, &ddobj, tx)); } else { /* it's the root dir */ VERIFY(0 == zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); } VERIFY(0 == dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); dmu_buf_will_dirty(dbuf, tx); ddphys = dbuf->db_data; ddphys->dd_creation_time = gethrestime_sec(); if (pds) { ddphys->dd_parent_obj = pds->dd_object; /* update the filesystem counts */ dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); } ddphys->dd_props_zapobj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); ddphys->dd_child_dir_zapobj = zap_create(mos, DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; dmu_buf_rele(dbuf, FTAG); return (ddobj); } boolean_t dsl_dir_is_clone(dsl_dir_t *dd) { return (dsl_dir_phys(dd)->dd_origin_obj && (dd->dd_pool->dp_origin_snap == NULL || dsl_dir_phys(dd)->dd_origin_obj != dd->dd_pool->dp_origin_snap->ds_object)); } void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) { mutex_enter(&dd->dd_lock); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, dsl_dir_phys(dd)->dd_used_bytes); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, dsl_dir_phys(dd)->dd_quota); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, dsl_dir_phys(dd)->dd_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 100 : (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / dsl_dir_phys(dd)->dd_compressed_bytes)); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, dsl_dir_phys(dd)->dd_uncompressed_bytes); if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); } mutex_exit(&dd->dd_lock); if (dsl_dir_is_zapified(dd)) { uint64_t count; objset_t *os = dd->dd_pool->dp_meta_objset; if (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, count); } if (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count) == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, count); } } if (dsl_dir_is_clone(dd)) { dsl_dataset_t *ds; char buf[MAXNAMELEN]; VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); dsl_dataset_name(ds, buf); dsl_dataset_rele(ds, FTAG); dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); } } void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) { dsl_pool_t *dp = dd->dd_pool; ASSERT(dsl_dir_phys(dd)); if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(dd->dd_dbuf, dd); } } static int64_t parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) { uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); uint64_t new_accounted = MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); return (new_accounted - old_accounted); } void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); mutex_enter(&dd->dd_lock); ASSERT0(dd->dd_tempreserved[tx->tx_txg&TXG_MASK]); dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, dd->dd_space_towrite[tx->tx_txg&TXG_MASK] / 1024); dd->dd_space_towrite[tx->tx_txg&TXG_MASK] = 0; mutex_exit(&dd->dd_lock); /* release the hold from dsl_dir_dirty */ dmu_buf_rele(dd->dd_dbuf, dd); } static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd) { uint64_t space = 0; int i; ASSERT(MUTEX_HELD(&dd->dd_lock)); for (i = 0; i < TXG_SIZE; i++) { space += dd->dd_space_towrite[i&TXG_MASK]; ASSERT3U(dd->dd_space_towrite[i&TXG_MASK], >=, 0); } return (space); } /* * How much space would dd have available if ancestor had delta applied * to it? If ondiskonly is set, we're only interested in what's * on-disk, not estimated pending changes. */ uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly) { uint64_t parentspace, myspace, quota, used; /* * If there are no restrictions otherwise, assume we have * unlimited space available. */ quota = UINT64_MAX; parentspace = UINT64_MAX; if (dd->dd_parent != NULL) { parentspace = dsl_dir_space_available(dd->dd_parent, ancestor, delta, ondiskonly); } mutex_enter(&dd->dd_lock); if (dsl_dir_phys(dd)->dd_quota != 0) quota = dsl_dir_phys(dd)->dd_quota; used = dsl_dir_phys(dd)->dd_used_bytes; if (!ondiskonly) used += dsl_dir_space_towrite(dd); if (dd->dd_parent == NULL) { uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, FALSE); quota = MIN(quota, poolsize); } if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { /* * We have some space reserved, in addition to what our * parent gave us. */ parentspace += dsl_dir_phys(dd)->dd_reserved - used; } if (dd == ancestor) { ASSERT(delta <= 0); ASSERT(used >= -delta); used += delta; if (parentspace != UINT64_MAX) parentspace -= delta; } if (used > quota) { /* over quota */ myspace = 0; } else { /* * the lesser of the space provided by our parent and * the space left in our quota */ myspace = MIN(parentspace, quota - used); } mutex_exit(&dd->dd_lock); return (myspace); } struct tempreserve { list_node_t tr_node; dsl_dir_t *tr_ds; uint64_t tr_size; }; static int dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, boolean_t ignorequota, boolean_t checkrefquota, list_t *tr_list, dmu_tx_t *tx, boolean_t first) { uint64_t txg = tx->tx_txg; uint64_t est_inflight, used_on_disk, quota, parent_rsrv; uint64_t deferred = 0; struct tempreserve *tr; int retval = EDQUOT; int txgidx = txg & TXG_MASK; int i; uint64_t ref_rsrv = 0; ASSERT3U(txg, !=, 0); ASSERT3S(asize, >, 0); mutex_enter(&dd->dd_lock); /* * Check against the dsl_dir's quota. We don't add in the delta * when checking for over-quota because they get one free hit. */ est_inflight = dsl_dir_space_towrite(dd); for (i = 0; i < TXG_SIZE; i++) est_inflight += dd->dd_tempreserved[i]; used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; /* * On the first iteration, fetch the dataset's used-on-disk and * refreservation values. Also, if checkrefquota is set, test if * allocating this space would exceed the dataset's refquota. */ if (first && tx->tx_objset) { int error; dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, checkrefquota, asize, est_inflight, &used_on_disk, &ref_rsrv); if (error) { mutex_exit(&dd->dd_lock); return (error); } } /* * If this transaction will result in a net free of space, * we want to let it through. */ if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) quota = UINT64_MAX; else quota = dsl_dir_phys(dd)->dd_quota; /* * Adjust the quota against the actual pool size at the root * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, * but still within the pool's allocation slop. In cases where * we're very close to full, this will allow a steady trickle of * removes to get through. */ if (dd->dd_parent == NULL) { spa_t *spa = dd->dd_pool->dp_spa; uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); deferred = metaslab_class_get_deferred(spa_normal_class(spa)); if (poolsize - deferred < quota) { quota = poolsize - deferred; retval = ENOSPC; } } /* * If they are requesting more space, and our current estimate * is over quota, they get to try again unless the actual * on-disk is over quota and there are no pending changes (which * may free up space for us). */ if (used_on_disk + est_inflight >= quota) { if (est_inflight > 0 || used_on_disk < quota || (retval == ENOSPC && used_on_disk < quota + deferred)) retval = ERESTART; dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", used_on_disk>>10, est_inflight>>10, quota>>10, asize>>10, retval); mutex_exit(&dd->dd_lock); return (SET_ERROR(retval)); } /* We need to up our estimated delta before dropping dd_lock */ dd->dd_tempreserved[txgidx] += asize; parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, asize - ref_rsrv); mutex_exit(&dd->dd_lock); tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_ds = dd; tr->tr_size = asize; list_insert_tail(tr_list, tr); /* see if it's OK with our parent */ if (dd->dd_parent && parent_rsrv) { boolean_t ismos = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); return (dsl_dir_tempreserve_impl(dd->dd_parent, parent_rsrv, netfree, ismos, TRUE, tr_list, tx, FALSE)); } else { return (0); } } /* * Reserve space in this dsl_dir, to be used in this tx's txg. * After the space has been dirtied (and dsl_dir_willuse_space() * has been called), the reservation should be canceled, using * dsl_dir_tempreserve_clear(). */ int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx) { int err; list_t *tr_list; if (asize == 0) { *tr_cookiep = NULL; return (0); } tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); list_create(tr_list, sizeof (struct tempreserve), offsetof(struct tempreserve, tr_node)); ASSERT3S(asize, >, 0); ASSERT3S(fsize, >=, 0); err = arc_tempreserve_space(lsize, tx->tx_txg); if (err == 0) { struct tempreserve *tr; tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); tr->tr_size = lsize; list_insert_tail(tr_list, tr); } else { if (err == EAGAIN) { /* * If arc_memory_throttle() detected that pageout * is running and we are low on memory, we delay new * non-pageout transactions to give pageout an * advantage. * * It is unfortunate to be delaying while the caller's * locks are held. */ txg_delay(dd->dd_pool, tx->tx_txg, MSEC2NSEC(10), MSEC2NSEC(10)); err = SET_ERROR(ERESTART); } } if (err == 0) { err = dsl_dir_tempreserve_impl(dd, asize, fsize >= asize, FALSE, asize > usize, tr_list, tx, TRUE); } if (err != 0) dsl_dir_tempreserve_clear(tr_list, tx); else *tr_cookiep = tr_list; return (err); } /* * Clear a temporary reservation that we previously made with * dsl_dir_tempreserve_space(). */ void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) { int txgidx = tx->tx_txg & TXG_MASK; list_t *tr_list = tr_cookie; struct tempreserve *tr; ASSERT3U(tx->tx_txg, !=, 0); if (tr_cookie == NULL) return; while ((tr = list_head(tr_list)) != NULL) { if (tr->tr_ds) { mutex_enter(&tr->tr_ds->dd_lock); ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, tr->tr_size); tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; mutex_exit(&tr->tr_ds->dd_lock); } else { arc_tempreserve_clear(tr->tr_size); } list_remove(tr_list, tr); kmem_free(tr, sizeof (struct tempreserve)); } kmem_free(tr_list, sizeof (list_t)); } /* * This should be called from open context when we think we're going to write * or free space, for example when dirtying data. Be conservative; it's okay * to write less space or free more, but we don't want to write more or free * less than the amount specified. */ void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) { int64_t parent_space; uint64_t est_used; mutex_enter(&dd->dd_lock); if (space > 0) dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; est_used = dsl_dir_space_towrite(dd) + dsl_dir_phys(dd)->dd_used_bytes; parent_space = parent_delta(dd, est_used, space); mutex_exit(&dd->dd_lock); /* Make sure that we clean up dd_space_to* */ dsl_dir_dirty(dd, tx); /* XXX this is potentially expensive and unnecessary... */ if (parent_space && dd->dd_parent) dsl_dir_willuse_space(dd->dd_parent, parent_space, tx); } /* call from syncing context when we actually write/free space for this dd */ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) { int64_t accounted_delta; /* * dsl_dataset_set_refreservation_sync_impl() calls this with * dd_lock held, so that it can atomically update * ds->ds_reserved and the dsl_dir accounting, so that * dsl_dataset_check_quota() can see dataset and dir accounting * consistently. */ boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); ASSERT(dmu_tx_is_syncing(tx)); ASSERT(type < DD_USED_NUM); dmu_buf_will_dirty(dd->dd_dbuf, tx); if (needlock) mutex_enter(&dd->dd_lock); accounted_delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); ASSERT(compressed >= 0 || dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); ASSERT(uncompressed >= 0 || dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); dsl_dir_phys(dd)->dd_used_bytes += used; dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; dsl_dir_phys(dd)->dd_compressed_bytes += compressed; if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { ASSERT(used > 0 || dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); dsl_dir_phys(dd)->dd_used_breakdown[type] += used; #ifdef DEBUG dd_used_t t; uint64_t u = 0; for (t = 0; t < DD_USED_NUM; t++) u += dsl_dir_phys(dd)->dd_used_breakdown[t]; ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); #endif } if (needlock) mutex_exit(&dd->dd_lock); if (dd->dd_parent != NULL) { dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, accounted_delta, compressed, uncompressed, tx); dsl_dir_transfer_space(dd->dd_parent, used - accounted_delta, DD_USED_CHILD_RSRV, DD_USED_CHILD, NULL); } } void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) { ASSERT(tx == NULL || dmu_tx_is_syncing(tx)); ASSERT(oldtype < DD_USED_NUM); ASSERT(newtype < DD_USED_NUM); if (delta == 0 || !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) return; if (tx != NULL) dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); ASSERT(delta > 0 ? dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; mutex_exit(&dd->dd_lock); } typedef struct dsl_dir_set_qr_arg { const char *ddsqra_name; zprop_source_t ddsqra_source; uint64_t ddsqra_value; } dsl_dir_set_qr_arg_t; static int dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; int error; uint64_t towrite, newval; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); error = dsl_prop_predict(ds->ds_dir, "quota", ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } if (newval == 0) { dsl_dataset_rele(ds, FTAG); return (0); } mutex_enter(&ds->ds_dir->dd_lock); /* * If we are doing the preliminary check in open context, and * there are pending changes, then don't fail it, since the * pending changes could under-estimate the amount of space to be * freed up. */ towrite = dsl_dir_space_towrite(ds->ds_dir); if ((dmu_tx_is_syncing(tx) || towrite == 0) && (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { error = SET_ERROR(ENOSPC); } mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); return (error); } static void dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); } dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); dsl_dir_phys(ds->ds_dir)->dd_quota = newval; mutex_exit(&ds->ds_dir->dd_lock); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = quota; return (dsl_sync_task(ddname, dsl_dir_set_quota_check, dsl_dir_set_quota_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); } int dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; dsl_dir_t *dd; uint64_t newval, used, avail; int error; error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); if (error != 0) return (error); dd = ds->ds_dir; /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. */ if (!dmu_tx_is_syncing(tx)) { dsl_dataset_rele(ds, FTAG); return (0); } error = dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); if (error != 0) { dsl_dataset_rele(ds, FTAG); return (error); } mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; mutex_exit(&dd->dd_lock); if (dd->dd_parent) { avail = dsl_dir_space_available(dd->dd_parent, NULL, 0, FALSE); } else { avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { uint64_t delta = MAX(used, newval) - MAX(used, dsl_dir_phys(dd)->dd_reserved); if (delta > avail || (dsl_dir_phys(dd)->dd_quota > 0 && newval > dsl_dir_phys(dd)->dd_quota)) error = SET_ERROR(ENOSPC); } dsl_dataset_rele(ds, FTAG); return (error); } void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) { uint64_t used; int64_t delta; dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dsl_dir_phys(dd)->dd_used_bytes; delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); dsl_dir_phys(dd)->dd_reserved = value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, delta, 0, 0, tx); } mutex_exit(&dd->dd_lock); } static void dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) { dsl_dir_set_qr_arg_t *ddsqra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t newval; VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, &ddsqra->ddsqra_value, tx); VERIFY0(dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); } else { newval = ddsqra->ddsqra_value; spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", zfs_prop_to_name(ZFS_PROP_RESERVATION), (longlong_t)newval); } dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); dsl_dataset_rele(ds, FTAG); } int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation) { dsl_dir_set_qr_arg_t ddsqra; ddsqra.ddsqra_name = ddname; ddsqra.ddsqra_source = source; ddsqra.ddsqra_value = reservation; return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, dsl_dir_set_reservation_sync, &ddsqra, 0, ZFS_SPACE_CHECK_NONE)); } static dsl_dir_t * closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) { for (; ds1; ds1 = ds1->dd_parent) { dsl_dir_t *dd; for (dd = ds2; dd; dd = dd->dd_parent) { if (ds1 == dd) return (dd); } } return (NULL); } /* * If delta is applied to dd, how much of that delta would be applied to * ancestor? Syncing context only. */ static int64_t would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) { if (dd == ancestor) return (delta); mutex_enter(&dd->dd_lock); delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); mutex_exit(&dd->dd_lock); return (would_change(dd->dd_parent, delta, ancestor)); } typedef struct dsl_dir_rename_arg { const char *ddra_oldname; const char *ddra_newname; cred_t *ddra_cred; } dsl_dir_rename_arg_t; /* ARGSUSED */ static int dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { int *deltap = arg; char namebuf[MAXNAMELEN]; dsl_dataset_name(ds, namebuf); if (strlen(namebuf) + *deltap >= MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int dsl_dir_rename_check(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; int error; int delta = strlen(ddra->ddra_newname) - strlen(ddra->ddra_oldname); /* target dir should exist */ error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); if (error != 0) return (error); /* new parent should exist */ error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname); if (error != 0) { dsl_dir_rele(dd, FTAG); return (error); } /* can't rename to different pool */ if (dd->dd_pool != newparent->dd_pool) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EXDEV)); } /* new name should not already exist */ if (mynewname == NULL) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EEXIST)); } /* if the name length is growing, validate child name lengths */ if (delta > 0) { error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } if (dmu_tx_is_syncing(tx)) { if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { /* * Although this is the check function and we don't * normally make on-disk changes in check functions, * we need to do that here. * * Ensure this portion of the tree's counts have been * initialized in case the new parent has limits set. */ dsl_dir_init_fs_ss_count(dd, tx); } } if (newparent != dd->dd_parent) { /* is there enough space? */ uint64_t myspace = MAX(dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_reserved); objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; if (dsl_dir_is_zapified(dd)) { int err; err = zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } /* * have to add 1 for the filesystem itself that we're * moving */ fs_cnt++; err = zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt); if (err != ENOENT && err != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (err); } } /* no rename into our descendant */ if (closest_common_ancestor(dd, newparent) == dd) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dir_transfer_possible(dd->dd_parent, newparent, fs_cnt, ss_cnt, myspace, ddra->ddra_cred); if (error != 0) { dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (error); } } dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); return (0); } static void dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) { dsl_dir_rename_arg_t *ddra = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd, *newparent; const char *mynewname; int error; objset_t *mos = dp->dp_meta_objset; VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, &mynewname)); /* Log this before we change the name. */ spa_history_log_internal_dd(dd, "rename", tx, "-> %s", ddra->ddra_newname); if (newparent != dd->dd_parent) { objset_t *os = dd->dd_pool->dp_meta_objset; uint64_t fs_cnt = 0; uint64_t ss_cnt = 0; /* * We already made sure the dd counts were initialized in the * check function. */ if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, &fs_cnt)); /* add 1 for the filesystem itself that we're moving */ fs_cnt++; VERIFY0(zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, &ss_cnt)); } dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(newparent, fs_cnt, DD_FIELD_FILESYSTEM_COUNT, tx); dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_fs_ss_count_adjust(newparent, ss_cnt, DD_FIELD_SNAPSHOT_COUNT, tx); dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, -dsl_dir_phys(dd)->dd_used_bytes, -dsl_dir_phys(dd)->dd_compressed_bytes, -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD, dsl_dir_phys(dd)->dd_used_bytes, dsl_dir_phys(dd)->dd_compressed_bytes, dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); if (dsl_dir_phys(dd)->dd_reserved > dsl_dir_phys(dd)->dd_used_bytes) { uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - dsl_dir_phys(dd)->dd_used_bytes; dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, -unused_rsrv, 0, 0, tx); dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, unused_rsrv, 0, 0, tx); } } dmu_buf_will_dirty(dd->dd_dbuf, tx); /* remove from old parent zapobj */ error = zap_remove(mos, dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, dd->dd_myname, tx); ASSERT0(error); (void) strcpy(dd->dd_myname, mynewname); dsl_dir_rele(dd->dd_parent, dd); dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; VERIFY0(dsl_dir_hold_obj(dp, newparent->dd_object, NULL, dd, &dd->dd_parent)); /* add to new parent zapobj */ VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, dd->dd_myname, 8, 1, &dd->dd_object, tx)); #ifdef __FreeBSD__ #ifdef _KERNEL zfsvfs_update_fromname(ddra->ddra_oldname, ddra->ddra_newname); zvol_rename_minors(ddra->ddra_oldname, ddra->ddra_newname); #endif #endif dsl_prop_notify_all(dd); dsl_dir_rele(newparent, FTAG); dsl_dir_rele(dd, FTAG); } int dsl_dir_rename(const char *oldname, const char *newname) { dsl_dir_rename_arg_t ddra; ddra.ddra_oldname = oldname; ddra.ddra_newname = newname; ddra.ddra_cred = CRED(); return (dsl_sync_task(oldname, dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, 3, ZFS_SPACE_CHECK_RESERVED)); } int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *cr) { dsl_dir_t *ancestor; int64_t adelta; uint64_t avail; int err; ancestor = closest_common_ancestor(sdd, tdd); adelta = would_change(sdd, -space, ancestor); avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); if (avail < space) return (SET_ERROR(ENOSPC)); err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, ancestor, cr); if (err != 0) return (err); err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, ancestor, cr); if (err != 0) return (err); return (0); } timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd) { timestruc_t t; mutex_enter(&dd->dd_lock); t = dd->dd_snap_cmtime; mutex_exit(&dd->dd_lock); return (t); } void dsl_dir_snap_cmtime_update(dsl_dir_t *dd) { timestruc_t t; gethrestime(&t); mutex_enter(&dd->dd_lock); dd->dd_snap_cmtime = t; mutex_exit(&dd->dd_lock); } void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); } boolean_t dsl_dir_is_zapified(dsl_dir_t *dd) { dmu_object_info_t doi; dmu_object_info_from_db(dd->dd_dbuf, &doi); return (doi.doi_type == DMU_OTN_ZAP_METADATA); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_pool.c (revision 286575) @@ -1,1148 +1,1151 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __FreeBSD__ #include #include #endif /* * ZFS Write Throttle * ------------------ * * ZFS must limit the rate of incoming writes to the rate at which it is able * to sync data modifications to the backend storage. Throttling by too much * creates an artificial limit; throttling by too little can only be sustained * for short periods and would lead to highly lumpy performance. On a per-pool * basis, ZFS tracks the amount of modified (dirty) data. As operations change * data, the amount of dirty data increases; as ZFS syncs out data, the amount * of dirty data decreases. When the amount of dirty data exceeds a * predetermined threshold further modifications are blocked until the amount * of dirty data decreases (as data is synced out). * * The limit on dirty data is tunable, and should be adjusted according to * both the IO capacity and available memory of the system. The larger the * window, the more ZFS is able to aggregate and amortize metadata (and data) * changes. However, memory is a limited resource, and allowing for more dirty * data comes at the cost of keeping other useful data in memory (for example * ZFS data cached by the ARC). * * Implementation * * As buffers are modified dsl_pool_willuse_space() increments both the per- * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of * dirty space used; dsl_pool_dirty_space() decrements those values as data * is synced out from dsl_pool_sync(). While only the poolwide value is * relevant, the per-txg value is useful for debugging. The tunable * zfs_dirty_data_max determines the dirty space limit. Once that value is * exceeded, new writes are halted until space frees up. * * The zfs_dirty_data_sync tunable dictates the threshold at which we * ensure that there is a txg syncing (see the comment in txg.c for a full * description of transaction group stages). * * The IO scheduler uses both the dirty space limit and current amount of * dirty data as inputs. Those values affect the number of concurrent IOs ZFS * issues. See the comment in vdev_queue.c for details of the IO scheduler. * * The delay is also calculated based on the amount of dirty data. See the * comment above dmu_tx_delay() for details. */ /* * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, * capped at zfs_dirty_data_max_max. It can also be overridden in /etc/system. */ uint64_t zfs_dirty_data_max; uint64_t zfs_dirty_data_max_max = 4ULL * 1024 * 1024 * 1024; int zfs_dirty_data_max_percent = 10; /* * If there is at least this much dirty data, push out a txg. */ uint64_t zfs_dirty_data_sync = 64 * 1024 * 1024; /* * Once there is this amount of dirty data, the dmu_tx_delay() will kick in * and delay each transaction. * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. */ int zfs_delay_min_dirty_percent = 60; /* * This controls how quickly the delay approaches infinity. * Larger values cause it to delay more for a given amount of dirty data. * Therefore larger values will cause there to be less dirty data for a * given throughput. * * For the smoothest delay, this value should be about 1 billion divided * by the maximum number of operations per second. This will smoothly * handle between 10x and 1/10th this number. * * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the * multiply in dmu_tx_delay(). */ uint64_t zfs_delay_scale = 1000 * 1000 * 1000 / 2000; #ifdef __FreeBSD__ extern int zfs_vdev_async_write_active_max_dirty_percent; SYSCTL_DECL(_vfs_zfs); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max, CTLFLAG_RWTUN, &zfs_dirty_data_max, 0, "The maximum amount of dirty data in bytes after which new writes are " "halted until space becomes available"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_max_max, CTLFLAG_RDTUN, &zfs_dirty_data_max_max, 0, "The absolute cap on dirty_data_max when auto calculating"); static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS); SYSCTL_PROC(_vfs_zfs, OID_AUTO, dirty_data_max_percent, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_zfs_dirty_data_max_percent, "I", "The percent of physical memory used to auto calculate dirty_data_max"); SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, dirty_data_sync, CTLFLAG_RWTUN, &zfs_dirty_data_sync, 0, "Force a txg if the number of dirty buffer bytes exceed this value"); static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS); /* No zfs_delay_min_dirty_percent tunable due to limit requirements */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_min_dirty_percent, CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(int), sysctl_zfs_delay_min_dirty_percent, "I", "The limit of outstanding dirty data before transations are delayed"); static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS); /* No zfs_delay_scale tunable due to limit requirements */ SYSCTL_PROC(_vfs_zfs, OID_AUTO, delay_scale, CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), sysctl_zfs_delay_scale, "QU", "Controls how quickly the delay approaches infinity"); static int sysctl_zfs_dirty_data_max_percent(SYSCTL_HANDLER_ARGS) { int val, err; val = zfs_dirty_data_max_percent; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < 0 || val > 100) return (EINVAL); zfs_dirty_data_max_percent = val; return (0); } static int sysctl_zfs_delay_min_dirty_percent(SYSCTL_HANDLER_ARGS) { int val, err; val = zfs_delay_min_dirty_percent; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val < zfs_vdev_async_write_active_max_dirty_percent) return (EINVAL); zfs_delay_min_dirty_percent = val; return (0); } static int sysctl_zfs_delay_scale(SYSCTL_HANDLER_ARGS) { uint64_t val; int err; val = zfs_delay_scale; err = sysctl_handle_64(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); if (val > UINT64_MAX / zfs_dirty_data_max) return (EINVAL); zfs_delay_scale = val; return (0); } #endif hrtime_t zfs_throttle_delay = MSEC2NSEC(10); hrtime_t zfs_throttle_resolution = MSEC2NSEC(10); int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) { uint64_t obj; int err; err = zap_lookup(dp->dp_meta_objset, dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj, name, sizeof (obj), 1, &obj); if (err) return (err); return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); } static dsl_pool_t * dsl_pool_open_impl(spa_t *spa, uint64_t txg) { dsl_pool_t *dp; blkptr_t *bp = spa_get_rootblkptr(spa); dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); dp->dp_spa = spa; dp->dp_meta_rootbp = *bp; rrw_init(&dp->dp_config_rwlock, B_TRUE); txg_init(dp, txg); txg_list_create(&dp->dp_dirty_datasets, offsetof(dsl_dataset_t, ds_dirty_link)); txg_list_create(&dp->dp_dirty_zilogs, offsetof(zilog_t, zl_dirty_link)); txg_list_create(&dp->dp_dirty_dirs, offsetof(dsl_dir_t, dd_dirty_link)); txg_list_create(&dp->dp_sync_tasks, offsetof(dsl_sync_task_t, dst_node)); mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, 1, 4, 0); return (dp); } int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &dp->dp_meta_objset); if (err != 0) dsl_pool_close(dp); else *dpp = dp; return (err); } int dsl_pool_open(dsl_pool_t *dp) { int err; dsl_dir_t *dd; dsl_dataset_t *ds; uint64_t obj; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &dp->dp_root_dir_obj); if (err) goto out; err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir); if (err) goto out; err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); if (err) goto out; if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); if (err) goto out; err = dsl_dataset_hold_obj(dp, dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds); if (err == 0) { err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap); dsl_dataset_rele(ds, FTAG); } dsl_dir_rele(dd, dp); if (err) goto out; } if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir); if (err) goto out; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); if (err) goto out; VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } /* * Note: errors ignored, because the leak dir will not exist if we * have not encountered a leak yet. */ (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir); if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, &dp->dp_bptree_obj); if (err != 0) goto out; } if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, &dp->dp_empty_bpobj); if (err != 0) goto out; } err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj); if (err == ENOENT) err = 0; if (err) goto out; err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); out: rrw_exit(&dp->dp_config_rwlock, FTAG); return (err); } void dsl_pool_close(dsl_pool_t *dp) { /* * Drop our references from dsl_pool_open(). * * Since we held the origin_snap from "syncing" context (which * includes pool-opening context), it actually only got a "ref" * and not a hold, so just drop that here. */ if (dp->dp_origin_snap) dsl_dataset_rele(dp->dp_origin_snap, dp); if (dp->dp_mos_dir) dsl_dir_rele(dp->dp_mos_dir, dp); if (dp->dp_free_dir) dsl_dir_rele(dp->dp_free_dir, dp); if (dp->dp_leak_dir) dsl_dir_rele(dp->dp_leak_dir, dp); if (dp->dp_root_dir) dsl_dir_rele(dp->dp_root_dir, dp); bpobj_close(&dp->dp_free_bpobj); /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ if (dp->dp_meta_objset) dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_zilogs); txg_list_destroy(&dp->dp_sync_tasks); txg_list_destroy(&dp->dp_dirty_dirs); arc_flush(dp->dp_spa); txg_fini(dp); dsl_scan_fini(dp); + dmu_buf_user_evict_wait(); + rrw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); taskq_destroy(dp->dp_vnrele_taskq); if (dp->dp_blkstats) kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); kmem_free(dp, sizeof (dsl_pool_t)); } dsl_pool_t * dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) { int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); objset_t *os; dsl_dataset_t *ds; uint64_t obj; rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); /* create and open the MOS (meta-objset) */ dp->dp_meta_objset = dmu_objset_create_impl(spa, NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); /* create the pool directory */ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); ASSERT0(err); /* Initialize scan structures */ VERIFY0(dsl_scan_init(dp, txg)); /* create and open the root dir */ dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, dp, &dp->dp_root_dir)); /* create and open the meta-objset dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir)); if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { /* create and open the free dir */ (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* create and open the free_bplist */ obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); } if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) dsl_pool_create_origin(dp, tx); /* create the root dataset */ obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx); /* create the root objset */ VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, &ds)); os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); #ifdef _KERNEL zfs_create_fs(os, kcred, zplprops, tx); #endif dsl_dataset_rele(ds, FTAG); dmu_tx_commit(tx); rrw_exit(&dp->dp_config_rwlock, FTAG); return (dp); } /* * Account for the meta-objset space in its placeholder dsl_dir. */ void dsl_pool_mos_diduse_space(dsl_pool_t *dp, int64_t used, int64_t comp, int64_t uncomp) { ASSERT3U(comp, ==, uncomp); /* it's all metadata */ mutex_enter(&dp->dp_lock); dp->dp_mos_used_delta += used; dp->dp_mos_compressed_delta += comp; dp->dp_mos_uncompressed_delta += uncomp; mutex_exit(&dp->dp_lock); } static int deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_deadlist_t *dl = arg; dsl_deadlist_insert(dl, bp, tx); return (0); } static void dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) { zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); dmu_objset_sync(dp->dp_meta_objset, zio, tx); VERIFY0(zio_wait(zio)); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); } static void dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) { ASSERT(MUTEX_HELD(&dp->dp_lock)); if (delta < 0) ASSERT3U(-delta, <=, dp->dp_dirty_total); dp->dp_dirty_total += delta; /* * Note: we signal even when increasing dp_dirty_total. * This ensures forward progress -- each thread wakes the next waiter. */ if (dp->dp_dirty_total <= zfs_dirty_data_max) cv_signal(&dp->dp_spaceavail_cv); } void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) { zio_t *zio; dmu_tx_t *tx; dsl_dir_t *dd; dsl_dataset_t *ds; objset_t *mos = dp->dp_meta_objset; list_t synced_datasets; list_create(&synced_datasets, sizeof (dsl_dataset_t), offsetof(dsl_dataset_t, ds_synced_link)); tx = dmu_tx_create_assigned(dp, txg); /* * Write out all dirty blocks of dirty datasets. */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { /* * We must not sync any non-MOS datasets twice, because * we may have taken a snapshot of them. However, we * may sync newly-created datasets on pass 2. */ ASSERT(!list_link_active(&ds->ds_synced_link)); list_insert_tail(&synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } VERIFY0(zio_wait(zio)); /* * We have written all of the accounted dirty data, so our * dp_space_towrite should now be zero. However, some seldom-used * code paths do not adhere to this (e.g. dbuf_undirty(), also * rounding error in dbuf_write_physdone). * Shore up the accounting of any dirtied space now. */ dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); /* * After the data blocks have been written (ensured by the zio_wait() * above), update the user/group space accounting. */ for (ds = list_head(&synced_datasets); ds != NULL; ds = list_next(&synced_datasets, ds)) { dmu_objset_do_userquota_updates(ds->ds_objset, tx); } /* * Sync the datasets again to push out the changes due to * userspace updates. This must be done before we process the * sync tasks, so that any snapshots will have the correct * user accounting information (and we won't get confused * about which blocks are part of the snapshot). */ zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { ASSERT(list_link_active(&ds->ds_synced_link)); dmu_buf_rele(ds->ds_dbuf, ds); dsl_dataset_sync(ds, zio, tx); } VERIFY0(zio_wait(zio)); /* * Now that the datasets have been completely synced, we can * clean up our in-memory structures accumulated while syncing: * * - move dead blocks from the pending deadlist to the on-disk deadlist * - release hold from dsl_dataset_dirty() */ while ((ds = list_remove_head(&synced_datasets)) != NULL) { objset_t *os = ds->ds_objset; bplist_iterate(&ds->ds_pending_deadlist, deadlist_enqueue_cb, &ds->ds_deadlist, tx); ASSERT(!dmu_objset_is_dirty(os, txg)); dmu_buf_rele(ds->ds_dbuf, ds); } while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { dsl_dir_sync(dd, tx); } /* * The MOS's space is accounted for in the pool/$MOS * (dp_mos_dir). We can't modify the mos while we're syncing * it, so we remember the deltas and apply them here. */ if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || dp->dp_mos_uncompressed_delta != 0) { dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, dp->dp_mos_used_delta, dp->dp_mos_compressed_delta, dp->dp_mos_uncompressed_delta, tx); dp->dp_mos_used_delta = 0; dp->dp_mos_compressed_delta = 0; dp->dp_mos_uncompressed_delta = 0; } if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { dsl_pool_sync_mos(dp, tx); } /* * If we modify a dataset in the same txg that we want to destroy it, * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. * dsl_dir_destroy_check() will fail if there are unexpected holds. * Therefore, we want to sync the MOS (thus syncing the dd_dbuf * and clearing the hold on it) before we process the sync_tasks. * The MOS data dirtied by the sync_tasks will be synced on the next * pass. */ if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { dsl_sync_task_t *dst; /* * No more sync tasks should have been added while we * were syncing. */ ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) dsl_sync_task_sync(dst, tx); } dmu_tx_commit(tx); DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); } void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { zilog_t *zilog; while (zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg)) { dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); zil_clean(zilog, txg); ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); dmu_buf_rele(ds->ds_dbuf, zilog); } ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } /* * TRUE if the current thread is the tx_sync_thread or if we * are being called from SPA context during pool initialization. */ int dsl_pool_sync_context(dsl_pool_t *dp) { return (curthread == dp->dp_tx.tx_sync_thread || spa_is_initializing(dp->dp_spa)); } uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree) { uint64_t space, resv; /* * If we're trying to assess whether it's OK to do a free, * cut the reservation in half to allow forward progress * (e.g. make it possible to rm(1) files from a full pool). */ space = spa_get_dspace(dp->dp_spa); resv = spa_get_slop_space(dp->dp_spa); if (netfree) resv >>= 1; return (space - resv); } boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp) { uint64_t delay_min_bytes = zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; boolean_t rv; mutex_enter(&dp->dp_lock); if (dp->dp_dirty_total > zfs_dirty_data_sync) txg_kick(dp); rv = (dp->dp_dirty_total > delay_min_bytes); mutex_exit(&dp->dp_lock); return (rv); } void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) { if (space > 0) { mutex_enter(&dp->dp_lock); dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; dsl_pool_dirty_delta(dp, space); mutex_exit(&dp->dp_lock); } } void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) { ASSERT3S(space, >=, 0); if (space == 0) return; mutex_enter(&dp->dp_lock); if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { /* XXX writing something we didn't dirty? */ space = dp->dp_dirty_pertxg[txg & TXG_MASK]; } ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; ASSERT3U(dp->dp_dirty_total, >=, space); dsl_pool_dirty_delta(dp, -space); mutex_exit(&dp->dp_lock); } /* ARGSUSED */ static int upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds, *prev = NULL; int err; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) break; dsl_dataset_rele(ds, FTAG); ds = prev; prev = NULL; } if (prev == NULL) { prev = dp->dp_origin_snap; /* * The $ORIGIN can't have any data, or the accounting * will be wrong. */ ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); /* The origin doesn't get attached to itself */ if (ds->ds_object == prev->ds_object) { dsl_dataset_rele(ds, FTAG); return (0); } dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object; dsl_dataset_phys(ds)->ds_prev_snap_txg = dsl_dataset_phys(prev)->ds_creation_txg; dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object; dmu_buf_will_dirty(prev->ds_dbuf, tx); dsl_dataset_phys(prev)->ds_num_children++; if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { ASSERT(ds->ds_prev == NULL); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); } } ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object); ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object); if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) { dmu_buf_will_dirty(prev->ds_dbuf, tx); dsl_dataset_phys(prev)->ds_next_clones_obj = zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx)); dsl_dataset_rele(ds, FTAG); if (prev != dp->dp_origin_snap) dsl_dataset_rele(prev, FTAG); return (0); } void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap != NULL); VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, tx, DS_FIND_CHILDREN)); } /* ARGSUSED */ static int upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { dmu_tx_t *tx = arg; objset_t *mos = dp->dp_meta_objset; if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) { dsl_dataset_t *origin; VERIFY0(dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin)); if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); dsl_dir_phys(origin->ds_dir)->dd_clones = zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); } VERIFY0(zap_add_int(dp->dp_meta_objset, dsl_dir_phys(origin->ds_dir)->dd_clones, ds->ds_object, tx)); dsl_dataset_rele(origin, FTAG); } return (0); } void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); uint64_t obj; (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, FREE_DIR_NAME, &dp->dp_free_dir)); /* * We can't use bpobj_alloc(), because spa_version() still * returns the old version, and we need a new-version bpobj with * subobj support. So call dmu_object_alloc() directly. */ obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN)); } void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) { uint64_t dsobj; dsl_dataset_t *ds; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap == NULL); ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); /* create the origin dir, ds, & snap-ds */ dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, NULL, 0, kcred, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, &dp->dp_origin_snap)); dsl_dataset_rele(ds, FTAG); } taskq_t * dsl_pool_vnrele_taskq(dsl_pool_t *dp) { return (dp->dp_vnrele_taskq); } /* * Walk through the pool-wide zap object of temporary snapshot user holds * and release them. */ void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) { zap_attribute_t za; zap_cursor_t zc; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; nvlist_t *holds; if (zapobj == 0) return; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); holds = fnvlist_alloc(); for (zap_cursor_init(&zc, mos, zapobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { char *htag; nvlist_t *tags; htag = strchr(za.za_name, '-'); *htag = '\0'; ++htag; if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { tags = fnvlist_alloc(); fnvlist_add_boolean(tags, htag); fnvlist_add_nvlist(holds, za.za_name, tags); fnvlist_free(tags); } else { fnvlist_add_boolean(tags, htag); } } dsl_dataset_user_release_tmp(dp, holds); fnvlist_free(holds); zap_cursor_fini(&zc); } /* * Create the pool-wide zap object for storing temporary snapshot holds. */ void dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) { objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; char *name; int error; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); ASSERT(dmu_tx_is_syncing(tx)); /* * If the pool was created prior to SPA_VERSION_USERREFS, the * zap object for temporary holds might not exist yet. */ if (zapobj == 0) { if (holding) { dsl_pool_user_hold_create_obj(dp, tx); zapobj = dp->dp_tmp_userrefs_obj; } else { return (SET_ERROR(ENOENT)); } } name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); if (holding) error = zap_add(mos, zapobj, name, 8, 1, &now, tx); else error = zap_remove(mos, zapobj, name, tx); strfree(name); return (error); } /* * Add a temporary hold for the given dataset object and tag. */ int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t now, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); } /* * Release a temporary hold for the given dataset object and tag. */ int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, dmu_tx_t *tx) { return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, tx, B_FALSE)); } /* * DSL Pool Configuration Lock * * The dp_config_rwlock protects against changes to DSL state (e.g. dataset * creation / destruction / rename / property setting). It must be held for * read to hold a dataset or dsl_dir. I.e. you must call * dsl_pool_config_enter() or dsl_pool_hold() before calling * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock * must be held continuously until all datasets and dsl_dirs are released. * * The only exception to this rule is that if a "long hold" is placed on * a dataset, then the dp_config_rwlock may be dropped while the dataset * is still held. The long hold will prevent the dataset from being * destroyed -- the destroy will fail with EBUSY. A long hold can be * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset * (by calling dsl_{dataset,objset}_{try}own{_obj}). * * Legitimate long-holders (including owners) should be long-running, cancelable * tasks that should cause "zfs destroy" to fail. This includes DMU * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), * "zfs send", and "zfs diff". There are several other long-holders whose * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). * * The usual formula for long-holding would be: * dsl_pool_hold() * dsl_dataset_hold() * ... perform checks ... * dsl_dataset_long_hold() * dsl_pool_rele() * ... perform long-running task ... * dsl_dataset_long_rele() * dsl_dataset_rele() * * Note that when the long hold is released, the dataset is still held but * the pool is not held. The dataset may change arbitrarily during this time * (e.g. it could be destroyed). Therefore you shouldn't do anything to the * dataset except release it. * * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only * or modifying operations. * * Modifying operations should generally use dsl_sync_task(). The synctask * infrastructure enforces proper locking strategy with respect to the * dp_config_rwlock. See the comment above dsl_sync_task() for details. * * Read-only operations will manually hold the pool, then the dataset, obtain * information from the dataset, then release the pool and dataset. * dmu_objset_{hold,rele}() are convenience routines that also do the pool * hold/rele. */ int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) { spa_t *spa; int error; error = spa_open(name, &spa, tag); if (error == 0) { *dp = spa_get_dsl(spa); dsl_pool_config_enter(*dp, tag); } return (error); } void dsl_pool_rele(dsl_pool_t *dp, void *tag) { dsl_pool_config_exit(dp, tag); spa_close(dp->dp_spa, tag); } void dsl_pool_config_enter(dsl_pool_t *dp, void *tag) { /* * We use a "reentrant" reader-writer lock, but not reentrantly. * * The rrwlock can (with the track_all flag) track all reading threads, * which is very useful for debugging which code path failed to release * the lock, and for verifying that the *current* thread does hold * the lock. * * (Unlike a rwlock, which knows that N threads hold it for * read, but not *which* threads, so rw_held(RW_READER) returns TRUE * if any thread holds it for read, even if this thread doesn't). */ ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); } void dsl_pool_config_exit(dsl_pool_t *dp, void *tag) { rrw_exit(&dp->dp_config_rwlock, tag); } boolean_t dsl_pool_config_held(dsl_pool_t *dp) { return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_prop.c (revision 286575) @@ -1,1157 +1,1155 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #define ZPROP_INHERIT_SUFFIX "$inherit" #define ZPROP_RECVD_SUFFIX "$recvd" static int dodefault(const char *propname, int intsz, int numints, void *buf) { zfs_prop_t prop; /* * The setonce properties are read-only, BUT they still * have a default value that can be used as the initial * value. */ if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL || (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) return (SET_ERROR(ENOENT)); if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { if (intsz != 1) return (SET_ERROR(EOVERFLOW)); (void) strncpy(buf, zfs_prop_default_string(prop), numints); } else { if (intsz != 8 || numints < 1) return (SET_ERROR(EOVERFLOW)); *(uint64_t *)buf = zfs_prop_default_numeric(prop); } return (0); } int dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) { int err = ENOENT; dsl_dir_t *target = dd; objset_t *mos = dd->dd_pool->dp_meta_objset; zfs_prop_t prop; boolean_t inheritable; boolean_t inheriting = B_FALSE; char *inheritstr; char *recvdstr; ASSERT(dsl_pool_config_held(dd->dd_pool)); if (setpoint) setpoint[0] = '\0'; prop = zfs_name_to_prop(propname); inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); /* * Note: dd may become NULL, therefore we shouldn't dereference it * after this loop. */ for (; dd != NULL; dd = dd->dd_parent) { if (dd != target || snapshot) { if (!inheritable) break; inheriting = B_TRUE; } /* Check for a local value. */ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, propname, intsz, numints, buf); if (err != ENOENT) { if (setpoint != NULL && err == 0) dsl_dir_name(dd, setpoint); break; } /* * Skip the check for a received value if there is an explicit * inheritance entry. */ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, inheritstr); if (err != 0 && err != ENOENT) break; if (err == ENOENT) { /* Check for a received value. */ err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, recvdstr, intsz, numints, buf); if (err != ENOENT) { if (setpoint != NULL && err == 0) { if (inheriting) { dsl_dir_name(dd, setpoint); } else { (void) strcpy(setpoint, ZPROP_SOURCE_VAL_RECVD); } } break; } } /* * If we found an explicit inheritance entry, err is zero even * though we haven't yet found the value, so reinitializing err * at the end of the loop (instead of at the beginning) ensures * that err has a valid post-loop value. */ err = SET_ERROR(ENOENT); } if (err == ENOENT) err = dodefault(propname, intsz, numints, buf); strfree(inheritstr); strfree(recvdstr); return (err); } int dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, int intsz, int numints, void *buf, char *setpoint) { zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t inheritable; - boolean_t snapshot; uint64_t zapobj; ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); - snapshot = dsl_dataset_is_snapshot(ds); zapobj = dsl_dataset_phys(ds)->ds_props_obj; if (zapobj != 0) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; int err; - ASSERT(snapshot); + ASSERT(ds->ds_is_snapshot); /* Check for a local value. */ err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); if (err != ENOENT) { if (setpoint != NULL && err == 0) dsl_dataset_name(ds, setpoint); return (err); } /* * Skip the check for a received value if there is an explicit * inheritance entry. */ if (inheritable) { char *inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); err = zap_contains(mos, zapobj, inheritstr); strfree(inheritstr); if (err != 0 && err != ENOENT) return (err); } if (err == ENOENT) { /* Check for a received value. */ char *recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); err = zap_lookup(mos, zapobj, recvdstr, intsz, numints, buf); strfree(recvdstr); if (err != ENOENT) { if (setpoint != NULL && err == 0) (void) strcpy(setpoint, ZPROP_SOURCE_VAL_RECVD); return (err); } } } return (dsl_prop_get_dd(ds->ds_dir, propname, - intsz, numints, buf, setpoint, snapshot)); + intsz, numints, buf, setpoint, ds->ds_is_snapshot)); } /* * Register interest in the named property. We'll call the callback * once to notify it of the current property value, and again each time * the property changes, until this callback is unregistered. * * Return 0 on success, errno if the prop is not an integer value. */ int dsl_prop_register(dsl_dataset_t *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg) { dsl_dir_t *dd = ds->ds_dir; dsl_pool_t *dp = dd->dd_pool; uint64_t value; dsl_prop_cb_record_t *cbr; int err; ASSERT(dsl_pool_config_held(dp)); err = dsl_prop_get_int_ds(ds, propname, &value); if (err != 0) return (err); cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); cbr->cbr_ds = ds; cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP); (void) strcpy((char *)cbr->cbr_propname, propname); cbr->cbr_func = callback; cbr->cbr_arg = cbarg; mutex_enter(&dd->dd_lock); list_insert_head(&dd->dd_prop_cbs, cbr); mutex_exit(&dd->dd_lock); cbr->cbr_func(cbr->cbr_arg, value); return (0); } int dsl_prop_get(const char *dsname, const char *propname, int intsz, int numints, void *buf, char *setpoint) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = dsl_prop_get_ds(dmu_objset_ds(os), propname, intsz, numints, buf, setpoint); dmu_objset_rele(os, FTAG); return (error); } /* * Get the current property value. It may have changed by the time this * function returns, so it is NOT safe to follow up with * dsl_prop_register() and assume that the value has not changed in * between. * * Return 0 on success, ENOENT if ddname is invalid. */ int dsl_prop_get_integer(const char *ddname, const char *propname, uint64_t *valuep, char *setpoint) { return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); } int dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, uint64_t *valuep) { return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); } /* * Predict the effective value of the given special property if it were set with * the given value and source. This is not a general purpose function. It exists * only to handle the special requirements of the quota and reservation * properties. The fact that these properties are non-inheritable greatly * simplifies the prediction logic. * * Returns 0 on success, a positive error code on failure, or -1 if called with * a property not handled by this function. */ int dsl_prop_predict(dsl_dir_t *dd, const char *propname, zprop_source_t source, uint64_t value, uint64_t *newvalp) { zfs_prop_t prop = zfs_name_to_prop(propname); objset_t *mos; uint64_t zapobj; uint64_t version; char *recvdstr; int err = 0; switch (prop) { case ZFS_PROP_QUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFQUOTA: case ZFS_PROP_REFRESERVATION: break; default: return (-1); } mos = dd->dd_pool->dp_meta_objset; zapobj = dsl_dir_phys(dd)->dd_props_zapobj; recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); version = spa_version(dd->dd_pool->dp_spa); if (version < SPA_VERSION_RECVD_PROPS) { if (source & ZPROP_SRC_NONE) source = ZPROP_SRC_NONE; else if (source & ZPROP_SRC_RECEIVED) source = ZPROP_SRC_LOCAL; } switch (source) { case ZPROP_SRC_NONE: /* Revert to the received value, if any. */ err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); if (err == ENOENT) *newvalp = 0; break; case ZPROP_SRC_LOCAL: *newvalp = value; break; case ZPROP_SRC_RECEIVED: /* * If there's no local setting, then the new received value will * be the effective value. */ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); if (err == ENOENT) *newvalp = value; break; case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): /* * We're clearing the received value, so the local setting (if * it exists) remains the effective value. */ err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); if (err == ENOENT) *newvalp = 0; break; default: panic("unexpected property source: %d", source); } strfree(recvdstr); if (err == ENOENT) return (0); return (err); } /* * Unregister this callback. Return 0 on success, ENOENT if ddname is * invalid, or ENOMSG if no matching callback registered. */ int dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg) { dsl_dir_t *dd = ds->ds_dir; dsl_prop_cb_record_t *cbr; mutex_enter(&dd->dd_lock); for (cbr = list_head(&dd->dd_prop_cbs); cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { if (cbr->cbr_ds == ds && cbr->cbr_func == callback && cbr->cbr_arg == cbarg && strcmp(cbr->cbr_propname, propname) == 0) break; } if (cbr == NULL) { mutex_exit(&dd->dd_lock); return (SET_ERROR(ENOMSG)); } list_remove(&dd->dd_prop_cbs, cbr); mutex_exit(&dd->dd_lock); kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1); kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); return (0); } boolean_t dsl_prop_hascb(dsl_dataset_t *ds) { dsl_dir_t *dd = ds->ds_dir; boolean_t rv = B_FALSE; dsl_prop_cb_record_t *cbr; mutex_enter(&dd->dd_lock); for (cbr = list_head(&dd->dd_prop_cbs); cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { if (cbr->cbr_ds == ds) { rv = B_TRUE; break; } } mutex_exit(&dd->dd_lock); return (rv); } /* ARGSUSED */ static int dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { dsl_dir_t *dd = ds->ds_dir; dsl_prop_cb_record_t *cbr; mutex_enter(&dd->dd_lock); for (cbr = list_head(&dd->dd_prop_cbs); cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { uint64_t value; /* * Callback entries do not have holds on their datasets * so that datasets with registered callbacks are still * eligible for eviction. Unlike operations on callbacks * for a single dataset, we are performing a recursive * descent of related datasets and the calling context * for this iteration only has a dataset hold on the root. * Without a hold, the callback's pointer to the dataset * could be invalidated by eviction at any time. * * Use dsl_dataset_try_add_ref() to verify that the * dataset has not begun eviction processing and to * prevent eviction from occurring for the duration * of the callback. If the hold attempt fails, this * object is already being evicted and the callback can * be safely ignored. */ if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) continue; if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_propname, sizeof (value), 1, &value, NULL) == 0) cbr->cbr_func(cbr->cbr_arg, value); dsl_dataset_rele(cbr->cbr_ds, FTAG); } mutex_exit(&dd->dd_lock); return (0); } /* * Update all property values for ddobj & its descendants. This is used * when renaming the dir. */ void dsl_prop_notify_all(dsl_dir_t *dd) { dsl_pool_t *dp = dd->dd_pool; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, NULL, DS_FIND_CHILDREN); } static void dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, const char *propname, uint64_t value, int first) { dsl_dir_t *dd; dsl_prop_cb_record_t *cbr; objset_t *mos = dp->dp_meta_objset; zap_cursor_t zc; zap_attribute_t *za; int err; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); if (err) return; if (!first) { /* * If the prop is set here, then this change is not * being inherited here or below; stop the recursion. */ err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, propname); if (err == 0) { dsl_dir_rele(dd, FTAG); return; } ASSERT3U(err, ==, ENOENT); } mutex_enter(&dd->dd_lock); for (cbr = list_head(&dd->dd_prop_cbs); cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) { uint64_t propobj; /* * cbr->cbf_ds may be invalidated due to eviction, * requiring the use of dsl_dataset_try_add_ref(). * See comment block in dsl_prop_notify_all_cb() * for details. */ if (strcmp(cbr->cbr_propname, propname) != 0 || !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) continue; propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj; /* * If the property is not set on this ds, then it is * inherited here; call the callback. */ if (propobj == 0 || zap_contains(mos, propobj, propname) != 0) cbr->cbr_func(cbr->cbr_arg, value); dsl_dataset_rele(cbr->cbr_ds, FTAG); } mutex_exit(&dd->dd_lock); za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, mos, dsl_dir_phys(dd)->dd_child_dir_zapobj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { dsl_prop_changed_notify(dp, za->za_first_integer, propname, value, FALSE); } kmem_free(za, sizeof (zap_attribute_t)); zap_cursor_fini(&zc); dsl_dir_rele(dd, FTAG); } void dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, zprop_source_t source, int intsz, int numints, const void *value, dmu_tx_t *tx) { objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; uint64_t zapobj, intval, dummy; int isint; char valbuf[32]; const char *valstr = NULL; char *inheritstr; char *recvdstr; char *tbuf = NULL; int err; uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); isint = (dodefault(propname, 8, 1, &intval) == 0); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { ASSERT(version >= SPA_VERSION_SNAP_PROPS); if (dsl_dataset_phys(ds)->ds_props_obj == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_dataset_phys(ds)->ds_props_obj = zap_create(mos, DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); } zapobj = dsl_dataset_phys(ds)->ds_props_obj; } else { zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj; } if (version < SPA_VERSION_RECVD_PROPS) { if (source & ZPROP_SRC_NONE) source = ZPROP_SRC_NONE; else if (source & ZPROP_SRC_RECEIVED) source = ZPROP_SRC_LOCAL; } inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); switch (source) { case ZPROP_SRC_NONE: /* * revert to received value, if any (inherit -S) * - remove propname * - remove propname$inherit */ err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); break; case ZPROP_SRC_LOCAL: /* * remove propname$inherit * set propname -> value */ err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); VERIFY0(zap_update(mos, zapobj, propname, intsz, numints, value, tx)); break; case ZPROP_SRC_INHERITED: /* * explicitly inherit * - remove propname * - set propname$inherit */ err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); if (version >= SPA_VERSION_RECVD_PROPS && dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { dummy = 0; VERIFY0(zap_update(mos, zapobj, inheritstr, 8, 1, &dummy, tx)); } break; case ZPROP_SRC_RECEIVED: /* * set propname$recvd -> value */ err = zap_update(mos, zapobj, recvdstr, intsz, numints, value, tx); ASSERT(err == 0); break; case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): /* * clear local and received settings * - remove propname * - remove propname$inherit * - remove propname$recvd */ err = zap_remove(mos, zapobj, propname, tx); ASSERT(err == 0 || err == ENOENT); err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); /* FALLTHRU */ case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): /* * remove propname$recvd */ err = zap_remove(mos, zapobj, recvdstr, tx); ASSERT(err == 0 || err == ENOENT); break; default: cmn_err(CE_PANIC, "unexpected property source: %d", source); } strfree(inheritstr); strfree(recvdstr); if (isint) { VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { dsl_prop_cb_record_t *cbr; /* * It's a snapshot; nothing can inherit this * property, so just look for callbacks on this * ds here. */ mutex_enter(&ds->ds_dir->dd_lock); for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr; cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) { if (cbr->cbr_ds == ds && strcmp(cbr->cbr_propname, propname) == 0) cbr->cbr_func(cbr->cbr_arg, intval); } mutex_exit(&ds->ds_dir->dd_lock); } else { dsl_prop_changed_notify(ds->ds_dir->dd_pool, ds->ds_dir->dd_object, propname, intval, TRUE); } (void) snprintf(valbuf, sizeof (valbuf), "%lld", (longlong_t)intval); valstr = valbuf; } else { if (source == ZPROP_SRC_LOCAL) { valstr = value; } else { tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); if (dsl_prop_get_ds(ds, propname, 1, ZAP_MAXVALUELEN, tbuf, NULL) == 0) valstr = tbuf; } } spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE || source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx, "%s=%s", propname, (valstr == NULL ? "" : valstr)); if (tbuf != NULL) kmem_free(tbuf, ZAP_MAXVALUELEN); } int dsl_prop_set_int(const char *dsname, const char *propname, zprop_source_t source, uint64_t value) { nvlist_t *nvl = fnvlist_alloc(); int error; fnvlist_add_uint64(nvl, propname, value); error = dsl_props_set(dsname, source, nvl); fnvlist_free(nvl); return (error); } int dsl_prop_set_string(const char *dsname, const char *propname, zprop_source_t source, const char *value) { nvlist_t *nvl = fnvlist_alloc(); int error; fnvlist_add_string(nvl, propname, value); error = dsl_props_set(dsname, source, nvl); fnvlist_free(nvl); return (error); } int dsl_prop_inherit(const char *dsname, const char *propname, zprop_source_t source) { nvlist_t *nvl = fnvlist_alloc(); int error; fnvlist_add_boolean(nvl, propname); error = dsl_props_set(dsname, source, nvl); fnvlist_free(nvl); return (error); } typedef struct dsl_props_set_arg { const char *dpsa_dsname; zprop_source_t dpsa_source; nvlist_t *dpsa_props; } dsl_props_set_arg_t; static int dsl_props_set_check(void *arg, dmu_tx_t *tx) { dsl_props_set_arg_t *dpsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; uint64_t version; nvpair_t *elem = NULL; int err; err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); if (err != 0) return (err); version = spa_version(ds->ds_dir->dd_pool->dp_spa); while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if (nvpair_type(elem) == DATA_TYPE_STRING) { char *valstr = fnvpair_value_string(elem); if (strlen(valstr) >= (version < SPA_VERSION_STMF_PROP ? ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { dsl_dataset_rele(ds, FTAG); return (E2BIG); } } } - if (dsl_dataset_is_snapshot(ds) && version < SPA_VERSION_SNAP_PROPS) { + if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (SET_ERROR(ENOTSUP)); } dsl_dataset_rele(ds, FTAG); return (0); } void dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, nvlist_t *props, dmu_tx_t *tx) { nvpair_t *elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { nvpair_t *pair = elem; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs = fnvpair_value_nvlist(pair); pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(pair) == DATA_TYPE_STRING) { const char *value = fnvpair_value_string(pair); dsl_prop_set_sync_impl(ds, nvpair_name(pair), source, 1, strlen(value) + 1, value, tx); } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { uint64_t intval = fnvpair_value_uint64(pair); dsl_prop_set_sync_impl(ds, nvpair_name(pair), source, sizeof (intval), 1, &intval, tx); } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { dsl_prop_set_sync_impl(ds, nvpair_name(pair), source, 0, 0, NULL, tx); } else { panic("invalid nvpair type"); } } } static void dsl_props_set_sync(void *arg, dmu_tx_t *tx) { dsl_props_set_arg_t *dpsa = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); dsl_dataset_rele(ds, FTAG); } /* * All-or-nothing; if any prop can't be set, nothing will be modified. */ int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) { dsl_props_set_arg_t dpsa; int nblks = 0; dpsa.dpsa_dsname = dsname; dpsa.dpsa_source = source; dpsa.dpsa_props = props; /* * If the source includes NONE, then we will only be removing entries * from the ZAP object. In that case don't check for ENOSPC. */ if ((source & ZPROP_SRC_NONE) == 0) nblks = 2 * fnvlist_num_pairs(props); return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED)); } typedef enum dsl_prop_getflags { DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ DSL_PROP_GET_LOCAL = 0x4, /* local properties */ DSL_PROP_GET_RECEIVED = 0x8 /* received properties */ } dsl_prop_getflags_t; static int dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv) { zap_cursor_t zc; zap_attribute_t za; int err = 0; for (zap_cursor_init(&zc, mos, propobj); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { nvlist_t *propval; zfs_prop_t prop; char buf[ZAP_MAXNAMELEN]; char *valstr; const char *suffix; const char *propname; const char *source; suffix = strchr(za.za_name, '$'); if (suffix == NULL) { /* * Skip local properties if we only want received * properties. */ if (flags & DSL_PROP_GET_RECEIVED) continue; propname = za.za_name; source = setpoint; } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { /* Skip explicitly inherited entries. */ continue; } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) { if (flags & DSL_PROP_GET_LOCAL) continue; (void) strncpy(buf, za.za_name, (suffix - za.za_name)); buf[suffix - za.za_name] = '\0'; propname = buf; if (!(flags & DSL_PROP_GET_RECEIVED)) { /* Skip if locally overridden. */ err = zap_contains(mos, propobj, propname); if (err == 0) continue; if (err != ENOENT) break; /* Skip if explicitly inherited. */ valstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); err = zap_contains(mos, propobj, valstr); strfree(valstr); if (err == 0) continue; if (err != ENOENT) break; } source = ((flags & DSL_PROP_GET_INHERITING) ? setpoint : ZPROP_SOURCE_VAL_RECVD); } else { /* * For backward compatibility, skip suffixes we don't * recognize. */ continue; } prop = zfs_name_to_prop(propname); /* Skip non-inheritable properties. */ if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) continue; /* Skip properties not valid for this type. */ if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) continue; /* Skip properties already defined. */ if (nvlist_exists(nv, propname)) continue; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (za.za_integer_length == 1) { /* * String property */ char *tmp = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, propobj, za.za_name, 1, za.za_num_integers, tmp); if (err != 0) { kmem_free(tmp, za.za_num_integers); break; } VERIFY(nvlist_add_string(propval, ZPROP_VALUE, tmp) == 0); kmem_free(tmp, za.za_num_integers); } else { /* * Integer property */ ASSERT(za.za_integer_length == 8); (void) nvlist_add_uint64(propval, ZPROP_VALUE, za.za_first_integer); } VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } zap_cursor_fini(&zc); if (err == ENOENT) err = 0; return (err); } /* * Iterate over all properties for this dataset and return them in an nvlist. */ static int dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, dsl_prop_getflags_t flags) { dsl_dir_t *dd = ds->ds_dir; dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; int err = 0; char setpoint[MAXNAMELEN]; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - if (dsl_dataset_is_snapshot(ds)) + if (ds->ds_is_snapshot) flags |= DSL_PROP_GET_SNAPSHOT; ASSERT(dsl_pool_config_held(dp)); if (dsl_dataset_phys(ds)->ds_props_obj != 0) { ASSERT(flags & DSL_PROP_GET_SNAPSHOT); dsl_dataset_name(ds, setpoint); err = dsl_prop_get_all_impl(mos, dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp); if (err) goto out; } for (; dd != NULL; dd = dd->dd_parent) { if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) { if (flags & (DSL_PROP_GET_LOCAL | DSL_PROP_GET_RECEIVED)) break; flags |= DSL_PROP_GET_INHERITING; } dsl_dir_name(dd, setpoint); err = dsl_prop_get_all_impl(mos, dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp); if (err) break; } out: return (err); } boolean_t dsl_prop_get_hasrecvd(const char *dsname) { uint64_t dummy; return (0 == dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); } static int dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) { uint64_t version; spa_t *spa; int error = 0; VERIFY0(spa_open(dsname, &spa, FTAG)); version = spa_version(spa); spa_close(spa, FTAG); if (version >= SPA_VERSION_RECVD_PROPS) error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); return (error); } /* * Call after successfully receiving properties to ensure that only the first * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. */ int dsl_prop_set_hasrecvd(const char *dsname) { int error = 0; if (!dsl_prop_get_hasrecvd(dsname)) error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); return (error); } void dsl_prop_unset_hasrecvd(const char *dsname) { VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); } int dsl_prop_get_all(objset_t *os, nvlist_t **nvp) { return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0)); } int dsl_prop_get_received(const char *dsname, nvlist_t **nvp) { objset_t *os; int error; /* * Received properties are not distinguishable from local properties * until the dataset has received properties on or after * SPA_VERSION_RECVD_PROPS. */ dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); dmu_objset_rele(os, FTAG); return (error); } void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) { nvlist_t *propval; const char *propname = zfs_prop_to_name(prop); uint64_t default_value; if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); return; } VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); /* Indicate the default source if we can. */ if (dodefault(propname, 8, 1, &default_value) == 0 && value == default_value) { VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); } VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } void dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) { nvlist_t *propval; const char *propname = zfs_prop_to_name(prop); if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); return; } VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_scan.c (revision 286575) @@ -1,1835 +1,1835 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; static void dsl_scan_cancel_sync(void *, dmu_tx_t *); static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *tx); unsigned int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ unsigned int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ unsigned int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ SYSCTL_DECL(_vfs_zfs); SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, &zfs_top_maxinflight, 0, "Maximum I/Os per top-level vdev"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_delay, CTLFLAG_RWTUN, &zfs_resilver_delay, 0, "Number of ticks to delay resilver"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scrub_delay, CTLFLAG_RWTUN, &zfs_scrub_delay, 0, "Number of ticks to delay scrub"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_idle, CTLFLAG_RWTUN, &zfs_scan_idle, 0, "Idle scan window in clock ticks"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, scan_min_time_ms, CTLFLAG_RWTUN, &zfs_scan_min_time_ms, 0, "Min millisecs to scrub per txg"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, free_min_time_ms, CTLFLAG_RWTUN, &zfs_free_min_time_ms, 0, "Min millisecs to free per txg"); SYSCTL_UINT(_vfs_zfs, OID_AUTO, resilver_min_time_ms, CTLFLAG_RWTUN, &zfs_resilver_min_time_ms, 0, "Min millisecs to resilver per txg"); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_io, CTLFLAG_RWTUN, &zfs_no_scrub_io, 0, "Disable scrub I/O"); SYSCTL_INT(_vfs_zfs, OID_AUTO, no_scrub_prefetch, CTLFLAG_RWTUN, &zfs_no_scrub_prefetch, 0, "Disable scrub prefetching"); enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ uint64_t zfs_free_max_blocks = UINT64_MAX; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, free_max_blocks, CTLFLAG_RWTUN, &zfs_free_max_blocks, 0, "Maximum number of blocks to free in one TXG"); #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) extern int zfs_txg_timeout; /* the order has to match pool_scan_type */ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { NULL, dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { int err; dsl_scan_t *scn; spa_t *spa = dp->dp_spa; uint64_t f; scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); scn->scn_dp = dp; /* * It's possible that we're resuming a scan after a reboot so * make sure that the scan_async_destroying flag is initialized * appropriately. */ ASSERT(!scn->scn_async_destroying); scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { /* * There was an old-style scrub in progress. Restart a * new-style scrub from the beginning. */ scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress; " "restarting new-style scrub in txg %llu", scn->scn_restart_txg); /* * Load the queue obj from the old location so that it * can be freed by dsl_scan_done(). */ (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_queue", sizeof (uint64_t), 1, &scn->scn_phys.scn_queue_obj); } else { err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys); if (err == ENOENT) return (0); else if (err) return (err); if (scn->scn_phys.scn_state == DSS_SCANNING && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old * pool, and the pool was accessed by old * software. Restart from the beginning, since * the old software may have changed the pool in * the meantime. */ scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", scn->scn_restart_txg); } } spa_scan_stat_init(spa); return (0); } void dsl_scan_fini(dsl_pool_t *dp) { if (dp->dp_scan) { kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } /* ARGSUSED */ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (scn->scn_phys.scn_state == DSS_SCANNING) return (SET_ERROR(EBUSY)); return (0); } static void dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; pool_scan_func_t *funcp = arg; dmu_object_type_t ot = 0; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; scn->scn_phys.scn_state = DSS_SCANNING; scn->scn_phys.scn_min_txg = 0; scn->scn_phys.scn_max_txg = tx->tx_txg; scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; scn->scn_restart_txg = 0; scn->scn_done_txg = 0; spa_scan_stat_init(spa); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; /* rewrite all disk labels */ vdev_config_dirty(spa->spa_root_vdev); if (vdev_resilver_needed(spa->spa_root_vdev, &scn->scn_phys.scn_min_txg, &scn->scn_phys.scn_max_txg)) { spa_event_notify(spa, NULL, ESC_ZFS_RESILVER_START); } else { spa_event_notify(spa, NULL, ESC_ZFS_SCRUB_START); } spa->spa_scrub_started = B_TRUE; /* * If this is an incremental scrub, limit the DDT scrub phase * to just the auto-ditto class (for correctness); the rest * of the scrub should go faster using top-down pruning. */ if (scn->scn_phys.scn_min_txg > TXG_INITIAL) scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; } /* back to the generic stuff */ if (dp->dp_blkstats == NULL) { dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); } bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); dsl_scan_sync_state(scn, tx); spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); } /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) { static const char *old_names[] = { "scrub_bookmark", "scrub_ddt_bookmark", "scrub_ddt_class_max", "scrub_queue", "scrub_min_txg", "scrub_max_txg", "scrub_func", "scrub_errors", NULL }; dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; int i; /* Remove any remnants of an old-style scrub. */ for (i = 0; old_names[i]; i++) { (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); } if (scn->scn_phys.scn_queue_obj != 0) { VERIFY(0 == dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } /* * If we were "restarted" from a stopped state, don't bother * with anything else. */ if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (complete) scn->scn_phys.scn_state = DSS_FINISHED; else scn->scn_phys.scn_state = DSS_CANCELED; spa_history_log_internal(spa, "scan done", tx, "complete=%u", complete); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > 0) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } mutex_exit(&spa->spa_scrub_lock); spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; /* * If the scrub/resilver completed, update all DTLs to * reflect this. Whether it succeeded or not, vacate * all temporary scrub DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, complete ? scn->scn_phys.scn_max_txg : 0, B_TRUE); if (complete) { spa_event_notify(spa, NULL, scn->scn_phys.scn_min_txg ? ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); } spa_errlog_rotate(spa); /* * We may have finished replacing a device. * Let the async thread assess this and handle the detach. */ spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); } scn->scn_phys.scn_end_time = gethrestime_sec(); } /* ARGSUSED */ static int dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; if (scn->scn_phys.scn_state != DSS_SCANNING) return (SET_ERROR(ENOENT)); return (0); } /* ARGSUSED */ static void dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); dsl_scan_sync_state(scn, tx); } int dsl_scan_cancel(dsl_pool_t *dp) { return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx); static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { zio_free(dp->dp_spa, txg, bp); } void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { ASSERT(dsl_pool_sync_context(dp)); zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, BP_GET_PSIZE(bpp), pio->io_flags)); } static uint64_t dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; - if (dsl_dataset_is_snapshot(ds)) + if (ds->ds_is_snapshot) return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); return (smt); } static void dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) { VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, &scn->scn_phys, tx)); } extern int zfs_vdev_async_write_active_min_dirty_percent; static boolean_t dsl_scan_check_pause(dsl_scan_t *scn, const zbookmark_phys_t *zb) { /* we never skip user/group accounting objects */ if (zb && (int64_t)zb->zb_object < 0) return (B_FALSE); if (scn->scn_pausing) return (B_TRUE); /* we're already pausing */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ if (zb && zb->zb_level != 0) return (B_FALSE); /* * We pause if: * - we have scanned for the maximum time: an entire txg * timeout (default 5 sec) * or * - we have scanned for at least the minimum time (default 1 sec * for scrub, 3 sec for resilver), and either we have sufficient * dirty data that we are starting to write more quickly * (default 30%), or someone is explicitly waiting for this txg * to complete. * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. */ int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? zfs_resilver_min_time_ms : zfs_scan_min_time_ms; uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > mintime && (txg_sync_waiting(scn->scn_dp) || dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) || spa_shutting_down(scn->scn_dp->dp_spa)) { if (zb) { dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; } dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); scn->scn_pausing = B_TRUE; return (B_TRUE); } return (B_FALSE); } typedef struct zil_scan_arg { dsl_pool_t *zsa_dp; zil_header_t *zsa_zh; } zil_scan_arg_t; /* ARGSUSED */ static int dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; zbookmark_phys_t zb; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * One block ("stubby") can be allocated a long time ago; we * want to visit that one because it has been allocated * (on-disk) even if it hasn't been claimed (even though for * scrub there's nothing to do to it). */ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); return (0); } /* ARGSUSED */ static int dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { if (lrc->lrc_txtype == TX_WRITE) { zil_scan_arg_t *zsa = arg; dsl_pool_t *dp = zsa->zsa_dp; dsl_scan_t *scn = dp->dp_scan; zil_header_t *zh = zsa->zsa_zh; lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; zbookmark_phys_t zb; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return (0); /* * birth can be < claim_txg if this record's txg is * already txg sync'ed (but this log block contains * other records that are not synced) */ if (claim_txg == 0 || bp->blk_birth < claim_txg) return (0); SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); } return (0); } static void dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) { uint64_t claim_txg = zh->zh_claim_txg; zil_scan_arg_t zsa = { dp, zh }; zilog_t *zilog; /* * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). */ if (claim_txg == 0 && spa_writeable(dp->dp_spa)) return; zilog = zil_alloc(dp->dp_meta_objset, zh); (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, claim_txg); zil_free(zilog); } /* ARGSUSED */ static void dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, uint64_t object, uint64_t blkid) { zbookmark_phys_t czb; arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; if (zfs_no_scrub_prefetch) return; if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) return; SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); } static boolean_t dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, const zbookmark_phys_t *zb) { /* * We never skip over user/group accounting objects (obj<0) */ if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && (int64_t)zb->zb_object >= 0) { /* * If we already visited this bp & everything below (in * a prior txg sync), don't bother doing it again. */ if (zbookmark_is_before(dnp, zb, &scn->scn_phys.scn_bookmark)) return (B_TRUE); /* * If we found the block we're trying to resume from, or * we went past it to a different object, zero it out to * indicate that it's OK to start checking for pausing * again. */ if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { dprintf("resuming at %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); } } return (B_FALSE); } /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. */ static int dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_phys_t *zb, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; int err; if (BP_GET_LEVEL(bp) > 0) { arc_flags_t flags = ARC_FLAG_WAIT; int i; blkptr_t *cbp; int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_object, zb->zb_blkid * epb + i); } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, zb->zb_level - 1, zb->zb_blkid * epb + i); dsl_scan_visitbp(cbp, &czb, dnp, ds, scn, ostype, tx); } (void) arc_buf_remove_ref(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; int i, j; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { for (j = 0; j < cdnp->dn_nblkptr; j++) { blkptr_t *cbp = &cdnp->dn_blkptr[j]; dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, zb->zb_blkid * epb + i, j); } } for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { dsl_scan_visitdnode(scn, ds, ostype, cdnp, zb->zb_blkid * epb + i, tx); } (void) arc_buf_remove_ref(buf, &buf); } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { arc_flags_t flags = ARC_FLAG_WAIT; objset_phys_t *osp; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } osp = buf->b_data; dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); if (OBJSET_BUF_HAS_USERUSED(buf)) { /* * We also always visit user/group accounting * objects, and never skip them, even if we are * pausing. This is necessary so that the space * deltas from this txg get integrated. */ dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_groupused_dnode, DMU_GROUPUSED_OBJECT, tx); dsl_scan_visitdnode(scn, ds, osp->os_type, &osp->os_userused_dnode, DMU_USERUSED_OBJECT, tx); } (void) arc_buf_remove_ref(buf, &buf); } return (0); } static void dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx) { int j; for (j = 0; j < dnp->dn_nblkptr; j++) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, dnp->dn_nlevels - 1, j); dsl_scan_visitbp(&dnp->dn_blkptr[j], &czb, dnp, ds, scn, ostype, tx); } if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { zbookmark_phys_t czb; SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, 0, DMU_SPILL_BLKID); dsl_scan_visitbp(&dnp->dn_spill, &czb, dnp, ds, scn, ostype, tx); } } /* * The arguments are in this order because mdb can only print the * first 5; we want them to be useful. */ static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; arc_buf_t *buf = NULL; blkptr_t bp_toread = *bp; /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ if (dsl_scan_check_pause(scn, zb)) return; if (dsl_scan_check_resume(scn, dnp, zb)) return; if (BP_IS_HOLE(bp)) return; scn->scn_visited_this_txg++; dprintf_bp(bp, "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", ds, ds ? ds->ds_object : 0, zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, bp); if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) return; if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0) return; /* * If dsl_scan_ddt() has aready visited this block, it will have * already done any translations or scrubbing, so don't call the * callback again. */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { ASSERT(buf == NULL); return; } /* * If this block is from the future (after cur_max_txg), then we * are doing this on behalf of a deleted snapshot, and we will * revisit the future block on the next pass of this dataset. * Don't scan it now unless we need to because something * under it was modified. */ if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) { scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); } } static void dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); dprintf_ds(ds, "finished scan%s", ""); } void dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { /* Note, scn_cur_{min,max}_txg stays the same. */ scn->scn_phys.scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; } else { SET_BOOKMARK(&scn->scn_phys.scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object); } } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); - if (dsl_dataset_is_snapshot(ds)) { + if (ds->ds_is_snapshot) { /* * We keep the same mintxg; it could be > * ds_creation_txg if the previous snapshot was * deleted too. */ VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("destroying ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); } else { zfs_dbgmsg("destroying ds %llu; in queue; removing", (u_longlong_t)ds->ds_object); } } else { zfs_dbgmsg("destroying ds %llu; ignoring", (u_longlong_t)ds->ds_object); } /* * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here. */ dsl_scan_sync_state(scn, tx); } void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_prev_snap_obj; zfs_dbgmsg("snapshotting ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); zfs_dbgmsg("snapshotting ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } dsl_scan_sync_state(scn, tx); } void dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) { dsl_pool_t *dp = ds1->ds_dir->dd_pool; dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg) == 0) { int err; ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); err = zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg, tx); VERIFY(err == 0 || err == EEXIST); if (err == EEXIST) { /* Both were there to begin with */ VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); } zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); } else if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); VERIFY(0 == zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg, tx)); zfs_dbgmsg("clone_swap ds %llu; in queue; " "replacing with %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } dsl_scan_sync_state(scn, tx); } struct enqueue_clones_arg { dmu_tx_t *tx; uint64_t originobj; }; /* ARGSUSED */ static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { struct enqueue_clones_arg *eca = arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj) return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); dsl_dataset_rele(ds, FTAG); if (err) return (err); ds = prev; } VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } static void dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; dsl_dataset_t *ds; objset_t *os; VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (dmu_objset_from_ds(ds, &os)) goto out; /* * Only the ZIL in the head (non-snapshot) is valid. Even though * snapshots can have ZIL block pointers (which may be the same * BP as in the head), they must be ignored. So we traverse the * ZIL here, rather than in scan_recurse(), because the regular * snapshot block-sharing rules don't apply to it. */ - if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !dsl_dataset_is_snapshot(ds)) + if (DSL_SCAN_IS_SCRUB_RESILVER(scn) && !ds->ds_is_snapshot) dsl_scan_zil(dp, &os->os_zil_header); /* * Iterate over the bps in this ds. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); char *dsname = kmem_alloc(ZFS_MAXNAMELEN, KM_SLEEP); dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " "pausing=%u", (longlong_t)dsobj, dsname, (longlong_t)scn->scn_phys.scn_cur_min_txg, (longlong_t)scn->scn_phys.scn_cur_max_txg, (int)scn->scn_pausing); kmem_free(dsname, ZFS_MAXNAMELEN); if (scn->scn_pausing) goto out; /* * We've finished this pass over this dataset. */ /* * If we did not completely visit this dataset, do another pass. */ if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass; visiting again"); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, scn->scn_phys.scn_cur_max_txg, tx) == 0); goto out; } /* * Add descendent datasets to work queue. */ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_next_snap_obj, dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0); } if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { uint64_t count; /* * A bug in a previous version of the code could * cause upgrade_clones_cb() to not set * ds_next_snap_obj when it should, leading to a * missing entry. Therefore we can only use the * next_clones_obj when its count is correct. */ int err = zap_count(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj, &count); if (err == 0 && count == dsl_dataset_phys(ds)->ds_num_children - 1) usenext = B_TRUE; } if (usenext) { VERIFY0(zap_join_key(dp->dp_meta_objset, dsl_dataset_phys(ds)->ds_next_clones_obj, scn->scn_phys.scn_queue_obj, dsl_dataset_phys(ds)->ds_creation_txg, tx)); } else { struct enqueue_clones_arg eca; eca.tx = tx; eca.originobj = ds->ds_object; VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); } } out: dsl_dataset_rele(ds, FTAG); } /* ARGSUSED */ static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { dmu_tx_t *tx = arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); if (err) { dsl_dataset_rele(ds, FTAG); return (err); } /* * If this is a clone, we don't need to worry about it for now. */ if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { dsl_dataset_rele(ds, FTAG); dsl_dataset_rele(prev, FTAG); return (0); } dsl_dataset_rele(ds, FTAG); ds = prev; } VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0); dsl_dataset_rele(ds, FTAG); return (0); } /* * Scrub/dedup interaction. * * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * * We leverage the fact that the dde's replication class (enum ddt_class) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. * * To prevent excess scrubbing, the scrub begins by walking the DDT * to find all blocks with refcnt > 1, and scrubs each of these once. * Since there are two replication classes which contain blocks with * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. * * There would be nothing more to say if a block's refcnt couldn't change * during a scrub, but of course it can so we must account for changes * in a block's replication class. * * Here's an example of what can occur: * * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 * when visited during the top-down scrub phase, it will be scrubbed twice. * This negates our scrub optimization, but is otherwise harmless. * * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 * on each visit during the top-down scrub phase, it will never be scrubbed. * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 * while a scrub is in progress, it scrubs the block right then. */ static void dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) { ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark; ddt_entry_t dde = { 0 }; int error; uint64_t n = 0; while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) { ddt_t *ddt; if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max) break; dprintf("visiting ddb=%llu/%llu/%llu/%llx\n", (longlong_t)ddb->ddb_class, (longlong_t)ddb->ddb_type, (longlong_t)ddb->ddb_checksum, (longlong_t)ddb->ddb_cursor); /* There should be no pending changes to the dedup table */ ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum]; ASSERT(avl_first(&ddt->ddt_tree) == NULL); dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx); n++; if (dsl_scan_check_pause(scn, NULL)) break; } zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; pausing=%u", (longlong_t)n, (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_pausing); ASSERT(error == 0 || error == ENOENT); ASSERT(error != ENOENT || ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); } /* ARGSUSED */ void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, ddt_entry_t *dde, dmu_tx_t *tx) { const ddt_key_t *ddk = &dde->dde_key; ddt_phys_t *ddp = dde->dde_phys; blkptr_t bp; zbookmark_phys_t zb = { 0 }; if (scn->scn_phys.scn_state != DSS_SCANNING) return; for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0 || ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) continue; ddt_bp_create(checksum, ddk, ddp, &bp); scn->scn_visited_this_txg++; scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); } } static void dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; zap_cursor_t zc; zap_attribute_t za; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_ddt(scn, tx); if (scn->scn_pausing) return; } if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) { /* First do the MOS & ORIGIN */ scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg; scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg; dsl_scan_visit_rootbp(scn, NULL, &dp->dp_meta_rootbp, tx); spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); if (scn->scn_pausing) return; if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, enqueue_cb, tx, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); } ASSERT(!scn->scn_pausing); } else if (scn->scn_phys.scn_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { /* * If we were paused, continue from here. Note if the * ds we were paused on was deleted, the zb_objset may * be -1, so we will skip this and find a new objset * below. */ dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); if (scn->scn_pausing) return; } /* * In case we were paused right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); /* keep pulling things out of the zap-object-as-queue */ while (zap_cursor_init(&zc, dp->dp_meta_objset, scn->scn_phys.scn_queue_obj), zap_cursor_retrieve(&zc, &za) == 0) { dsl_dataset_t *ds; uint64_t dsobj; dsobj = strtonum(za.za_name, NULL); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, dsobj, tx)); /* Set up min/max txg */ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); if (za.za_first_integer != 0) { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, za.za_first_integer); } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, dsl_dataset_phys(ds)->ds_prev_snap_txg); } scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); dsl_dataset_rele(ds, FTAG); dsl_scan_visitds(scn, dsobj, tx); zap_cursor_fini(&zc); if (scn->scn_pausing) return; } zap_cursor_fini(&zc); } static boolean_t dsl_scan_free_should_pause(dsl_scan_t *scn) { uint64_t elapsed_nanosecs; if (zfs_recover) return (B_FALSE); if (scn->scn_visited_this_txg >= zfs_free_max_blocks) return (B_TRUE); elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || (NSEC2MSEC(elapsed_nanosecs) > zfs_free_min_time_ms && txg_sync_waiting(scn->scn_dp)) || spa_shutting_down(scn->scn_dp->dp_spa)); } static int dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { dsl_scan_t *scn = arg; if (!scn->scn_is_bptree || (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { if (dsl_scan_free_should_pause(scn)) return (SET_ERROR(ERESTART)); } zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), 0)); dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); scn->scn_visited_this_txg++; return (0); } boolean_t dsl_scan_active(dsl_scan_t *scn) { spa_t *spa = scn->scn_dp->dp_spa; uint64_t used = 0, comp, uncomp; if (spa->spa_load_state != SPA_LOAD_NONE) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); if (scn->scn_phys.scn_state == DSS_SCANNING || (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, &used, &comp, &uncomp); } return (used != 0); } void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; int err = 0; /* * Check for scn_restart_txg before checking spa_load_state, so * that we can restart an old-style scan while the pool is being * imported (see dsl_scan_init). */ if (scn->scn_restart_txg != 0 && scn->scn_restart_txg <= tx->tx_txg) { pool_scan_func_t func = POOL_SCAN_SCRUB; dsl_scan_done(scn, B_FALSE, tx); if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", func, tx->tx_txg); dsl_scan_setup_sync(&func, tx); } /* * If the scan is inactive due to a stalled async destroy, try again. */ if ((!scn->scn_async_stalled && !dsl_scan_active(scn)) || spa_sync_pass(dp->dp_spa) > 1) return; scn->scn_visited_this_txg = 0; scn->scn_pausing = B_FALSE; scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; /* * First process the async destroys. If we pause, don't do * any scrubbing or resilvering. This ensures that there are no * async destroys while we are scanning, so the scan code doesn't * have to worry about traversing it. It is also faster to free the * blocks than to scrub them. */ if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, dsl_scan_free_block_cb, scn, tx); VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); } if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { ASSERT(scn->scn_async_destroying); scn->scn_is_bptree = B_TRUE; scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); if (err == EIO || err == ECKSUM) { err = 0; } else if (err != 0 && err != ERESTART) { zfs_panic_recover("error %u from " "traverse_dataset_destroyed()", err); } if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { /* finished; deactivate async destroy feature */ spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); ASSERT(!spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)); VERIFY0(zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_BPTREE_OBJ, tx)); VERIFY0(bptree_free(dp->dp_meta_objset, dp->dp_bptree_obj, tx)); dp->dp_bptree_obj = 0; scn->scn_async_destroying = B_FALSE; scn->scn_async_stalled = B_FALSE; } else { /* * If we didn't make progress, mark the async * destroy as stalled, so that we will not initiate * a spa_sync() on its behalf. Note that we only * check this if we are not finished, because if the * bptree had no blocks for us to visit, we can * finish without "making progress". */ scn->scn_async_stalled = (scn->scn_visited_this_txg == 0); } } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " "free_bpobj/bptree txg %llu; err=%d", (longlong_t)scn->scn_visited_this_txg, (longlong_t) NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), (longlong_t)tx->tx_txg, err); scn->scn_visited_this_txg = 0; /* * Write out changes to the DDT that may be required as a * result of the blocks freed. This ensures that the DDT * is clean when a scrub/resilver runs. */ ddt_sync(spa, tx->tx_txg); } if (err != 0) return; if (!scn->scn_async_destroying && zfs_free_leak_on_eio && (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { /* * We have finished background destroying, but there is still * some space left in the dp_free_dir. Transfer this leaked * space to the dp_leak_dir. */ if (dp->dp_leak_dir == NULL) { rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); (void) dsl_dir_create_sync(dp, dp->dp_root_dir, LEAK_DIR_NAME, tx); VERIFY0(dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, &dp->dp_leak_dir)); rrw_exit(&dp->dp_config_rwlock, FTAG); } dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); } if (!scn->scn_async_destroying) { /* finished; verify that space accounting went to zero */ ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); } if (scn->scn_phys.scn_state != DSS_SCANNING) return; if (scn->scn_done_txg == tx->tx_txg) { ASSERT(!scn->scn_pausing); /* finished with scan. */ zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); dsl_scan_done(scn, B_TRUE, tx); ASSERT3U(spa->spa_scrub_inflight, ==, 0); dsl_scan_sync_state(scn, tx); return; } if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { zfs_dbgmsg("doing scan sync txg %llu; " "ddt bm=%llu/%llu/%llu/%llx", (longlong_t)tx->tx_txg, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); } else { zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", (longlong_t)tx->tx_txg, (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, (longlong_t)scn->scn_phys.scn_bookmark.zb_object, (longlong_t)scn->scn_phys.scn_bookmark.zb_level, (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); } scn->scn_zio_root = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_CANFAIL); dsl_pool_config_enter(dp, FTAG); dsl_scan_visit(scn, tx); dsl_pool_config_exit(dp, FTAG); (void) zio_wait(scn->scn_zio_root); scn->scn_zio_root = NULL; zfs_dbgmsg("visited %llu blocks in %llums", (longlong_t)scn->scn_visited_this_txg, (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time)); if (!scn->scn_pausing) { scn->scn_done_txg = tx->tx_txg + 1; zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", tx->tx_txg, scn->scn_done_txg); } if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight > 0) { cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); } mutex_exit(&spa->spa_scrub_lock); } dsl_scan_sync_state(scn, tx); } /* * This will start a new scan, or restart an existing one. */ void dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) { if (txg == 0) { dmu_tx_t *tx; tx = dmu_tx_create_dd(dp->dp_mos_dir); VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); txg = dmu_tx_get_txg(tx); dp->dp_scan->scn_restart_txg = txg; dmu_tx_commit(tx); } else { dp->dp_scan->scn_restart_txg = txg; } zfs_dbgmsg("restarting resilver txg=%llu", txg); } boolean_t dsl_scan_resilvering(dsl_pool_t *dp) { return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); } /* * scrub consumers */ static void count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. */ if (zab == NULL) return; for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? BP_GET_TYPE(bp) : DMU_OT_TOTAL; if (t & DMU_OT_NEWTYPE) t = DMU_OT_OTHER; zfs_blkstat_t *zb = &zab->zab_type[l][t]; int equal; zb->zb_count++; zb->zb_asize += BP_GET_ASIZE(bp); zb->zb_lsize += BP_GET_LSIZE(bp); zb->zb_psize += BP_GET_PSIZE(bp); zb->zb_gangs += BP_COUNT_GANG(bp); switch (BP_GET_NDVAS(bp)) { case 2: if (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) zb->zb_ditto_2_of_2_samevdev++; break; case 3: equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[1])) + (DVA_GET_VDEV(&bp->blk_dva[0]) == DVA_GET_VDEV(&bp->blk_dva[2])) + (DVA_GET_VDEV(&bp->blk_dva[1]) == DVA_GET_VDEV(&bp->blk_dva[2])); if (equal == 1) zb->zb_ditto_2_of_3_samevdev++; else if (equal == 3) zb->zb_ditto_3_of_3_samevdev++; break; } } } static void dsl_scan_scrub_done(zio_t *zio) { spa_t *spa = zio->io_spa; zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; cv_broadcast(&spa->spa_scrub_io_cv); if (zio->io_error && (zio->io_error != ECKSUM || !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; } mutex_exit(&spa->spa_scrub_lock); } static int dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); boolean_t needs_io; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; unsigned int scan_delay = 0; if (phys_birth <= scn->scn_phys.scn_min_txg || phys_birth >= scn->scn_phys.scn_max_txg) return (0); count_block(dp->dp_blkstats, bp); if (BP_IS_EMBEDDED(bp)) return (0); ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; needs_io = B_TRUE; scan_delay = zfs_scrub_delay; } else { ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; needs_io = B_FALSE; scan_delay = zfs_resilver_delay; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; for (int d = 0; d < BP_GET_NDVAS(bp); d++) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[d])); /* * Keep track of how much data we've examined so that * zpool(1M) status can make useful progress reports. */ scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); /* if it's a resilver, this may not be in the target range */ if (!needs_io) { if (DVA_GET_GANG(&bp->blk_dva[d])) { /* * Gang members may be spread across multiple * vdevs, so the best estimate we have is the * scrub range, which has already been checked. * XXX -- it would be better to change our * allocation policy to ensure that all * gang members reside on the same vdev. */ needs_io = B_TRUE; } else { needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1); } } } if (needs_io && !zfs_no_scrub_io) { vdev_t *rvd = spa->spa_root_vdev; uint64_t maxinflight = rvd->vdev_children * MAX(zfs_top_maxinflight, 1); void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); /* * If we're seeing recent (zfs_scan_idle) "important" I/Os * then throttle our workload to limit the impact of a scan. */ if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) delay(MAX((int)scan_delay, 0)); zio_nowait(zio_read(NULL, spa, bp, data, size, dsl_scan_scrub_done, NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); } /* do not relocate this block */ return (0); } int dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) { spa_t *spa = dp->dp_spa; /* * Purge all vdev caches and probe all devices. We do this here * rather than in sync context because this requires a writer lock * on the spa_config lock, which we can't do from sync context. The * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ spa_vdev_state_enter(spa, SCL_NONE); spa->spa_scrub_reopen = B_TRUE; vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_NONE)); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dsl_userhold.c (revision 286575) @@ -1,666 +1,666 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include typedef struct dsl_dataset_user_hold_arg { nvlist_t *dduha_holds; nvlist_t *dduha_chkholds; nvlist_t *dduha_errlist; minor_t dduha_minor; } dsl_dataset_user_hold_arg_t; /* * If you add new checks here, you may need to add additional checks to the * "temporary" case in snapshot_check() in dmu_objset.c. */ int dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, boolean_t temphold, dmu_tx_t *tx) { dsl_pool_t *dp = dmu_tx_pool(tx); objset_t *mos = dp->dp_meta_objset; int error = 0; ASSERT(dsl_pool_config_held(dp)); if (strlen(htag) > MAXNAMELEN) return (SET_ERROR(E2BIG)); /* Tempholds have a more restricted length */ if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) return (SET_ERROR(E2BIG)); /* tags must be unique (if ds already exists) */ if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { uint64_t value; error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, htag, 8, 1, &value); if (error == 0) error = SET_ERROR(EEXIST); else if (error == ENOENT) error = 0; } return (error); } static int dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) { dsl_dataset_user_hold_arg_t *dduha = arg; dsl_pool_t *dp = dmu_tx_pool(tx); if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) return (SET_ERROR(ENOTSUP)); if (!dmu_tx_is_syncing(tx)) return (0); for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { dsl_dataset_t *ds; int error = 0; char *htag, *name; /* must be a snapshot */ name = nvpair_name(pair); if (strchr(name, '@') == NULL) error = SET_ERROR(EINVAL); if (error == 0) error = nvpair_value_string(pair, &htag); if (error == 0) error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error == 0) { error = dsl_dataset_user_hold_check_one(ds, htag, dduha->dduha_minor != 0, tx); dsl_dataset_rele(ds, FTAG); } if (error == 0) { fnvlist_add_string(dduha->dduha_chkholds, name, htag); } else { /* * We register ENOENT errors so they can be correctly * reported if needed, such as when all holds fail. */ fnvlist_add_int32(dduha->dduha_errlist, name, error); if (error != ENOENT) return (error); } } return (0); } static void dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj; ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) { /* * This is the first user hold for this dataset. Create * the userrefs zap object. */ dmu_buf_will_dirty(ds->ds_dbuf, tx); zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); } else { zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; } ds->ds_userrefs++; VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); if (minor != 0) { char name[MAXNAMELEN]; nvlist_t *tags; VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, htag, now, tx)); (void) snprintf(name, sizeof (name), "%llx", (u_longlong_t)ds->ds_object); if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) { tags = fnvlist_alloc(); fnvlist_add_boolean(tags, htag); fnvlist_add_nvlist(tmpholds, name, tags); fnvlist_free(tags); } else { fnvlist_add_boolean(tags, htag); } } spa_history_log_internal_ds(ds, "hold", tx, "tag=%s temp=%d refs=%llu", htag, minor != 0, ds->ds_userrefs); } typedef struct zfs_hold_cleanup_arg { char zhca_spaname[MAXNAMELEN]; uint64_t zhca_spa_load_guid; nvlist_t *zhca_holds; } zfs_hold_cleanup_arg_t; static void dsl_dataset_user_release_onexit(void *arg) { zfs_hold_cleanup_arg_t *ca = arg; spa_t *spa; int error; error = spa_open(ca->zhca_spaname, &spa, FTAG); if (error != 0) { zfs_dbgmsg("couldn't release holds on pool=%s " "because pool is no longer loaded", ca->zhca_spaname); return; } if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { zfs_dbgmsg("couldn't release holds on pool=%s " "because pool is no longer loaded (guid doesn't match)", ca->zhca_spaname); spa_close(spa, FTAG); return; } (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds); fnvlist_free(ca->zhca_holds); kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); spa_close(spa, FTAG); } static void dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor) { zfs_hold_cleanup_arg_t *ca; if (minor == 0 || nvlist_empty(holds)) { fnvlist_free(holds); return; } ASSERT(spa != NULL); ca = kmem_alloc(sizeof (*ca), KM_SLEEP); (void) strlcpy(ca->zhca_spaname, spa_name(spa), sizeof (ca->zhca_spaname)); ca->zhca_spa_load_guid = spa_load_guid(spa); ca->zhca_holds = holds; VERIFY0(zfs_onexit_add_cb(minor, dsl_dataset_user_release_onexit, ca, NULL)); } void dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx) { nvlist_t *tmpholds; if (minor != 0) tmpholds = fnvlist_alloc(); else tmpholds = NULL; dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx); dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor); } static void dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_user_hold_arg_t *dduha = arg; dsl_pool_t *dp = dmu_tx_pool(tx); nvlist_t *tmpholds; uint64_t now = gethrestime_sec(); if (dduha->dduha_minor != 0) tmpholds = fnvlist_alloc(); else tmpholds = NULL; for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL); pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) { dsl_dataset_t *ds; VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, fnvpair_value_string(pair), dduha->dduha_minor, now, tx); dsl_dataset_rele(ds, FTAG); } dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor); } /* * The full semantics of this function are described in the comment above * lzc_hold(). * * To summarize: * holds is nvl of snapname -> holdname * errlist will be filled in with snapname -> error * * The snaphosts must all be in the same pool. * * Holds for snapshots that don't exist will be skipped. * * If none of the snapshots for requested holds exist then ENOENT will be * returned. * * If cleanup_minor is not 0, the holds will be temporary, which will be cleaned * up when the process exits. * * On success all the holds, for snapshots that existed, will be created and 0 * will be returned. * * On failure no holds will be created, the errlist will be filled in, * and an errno will returned. * * In all cases the errlist will contain entries for holds where the snapshot * didn't exist. */ int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) { dsl_dataset_user_hold_arg_t dduha; nvpair_t *pair; int ret; pair = nvlist_next_nvpair(holds, NULL); if (pair == NULL) return (0); dduha.dduha_holds = holds; dduha.dduha_chkholds = fnvlist_alloc(); dduha.dduha_errlist = errlist; dduha.dduha_minor = cleanup_minor; ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED); fnvlist_free(dduha.dduha_chkholds); return (ret); } typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag, dsl_dataset_t **dsp); typedef struct dsl_dataset_user_release_arg { dsl_holdfunc_t *ddura_holdfunc; nvlist_t *ddura_holds; nvlist_t *ddura_todelete; nvlist_t *ddura_errlist; nvlist_t *ddura_chkholds; } dsl_dataset_user_release_arg_t; /* Place a dataset hold on the snapshot identified by passed dsobj string */ static int dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag, dsl_dataset_t **dsp) { return (dsl_dataset_hold_obj(dp, strtonum(dsobj, NULL), tag, dsp)); } static int dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, dsl_dataset_t *ds, nvlist_t *holds, const char *snapname) { uint64_t zapobj; nvlist_t *holds_found; objset_t *mos; int numholds; - if (!dsl_dataset_is_snapshot(ds)) + if (!ds->ds_is_snapshot) return (SET_ERROR(EINVAL)); if (nvlist_empty(holds)) return (0); numholds = 0; mos = ds->ds_dir->dd_pool->dp_meta_objset; zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; holds_found = fnvlist_alloc(); for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { uint64_t tmp; int error; const char *holdname = nvpair_name(pair); if (zapobj != 0) error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp); else error = SET_ERROR(ENOENT); /* * Non-existent holds are put on the errlist, but don't * cause an overall failure. */ if (error == ENOENT) { if (ddura->ddura_errlist != NULL) { char *errtag = kmem_asprintf("%s#%s", snapname, holdname); fnvlist_add_int32(ddura->ddura_errlist, errtag, ENOENT); strfree(errtag); } continue; } if (error != 0) { fnvlist_free(holds_found); return (error); } fnvlist_add_boolean(holds_found, holdname); numholds++; } if (DS_IS_DEFER_DESTROY(ds) && dsl_dataset_phys(ds)->ds_num_children == 1 && ds->ds_userrefs == numholds) { /* we need to destroy the snapshot as well */ if (dsl_dataset_long_held(ds)) { fnvlist_free(holds_found); return (SET_ERROR(EBUSY)); } fnvlist_add_boolean(ddura->ddura_todelete, snapname); } if (numholds != 0) { fnvlist_add_nvlist(ddura->ddura_chkholds, snapname, holds_found); } fnvlist_free(holds_found); return (0); } static int dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) { dsl_dataset_user_release_arg_t *ddura; dsl_holdfunc_t *holdfunc; dsl_pool_t *dp; if (!dmu_tx_is_syncing(tx)) return (0); dp = dmu_tx_pool(tx); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); ddura = arg; holdfunc = ddura->ddura_holdfunc; for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { int error; dsl_dataset_t *ds; nvlist_t *holds; const char *snapname = nvpair_name(pair); error = nvpair_value_nvlist(pair, &holds); if (error != 0) error = (SET_ERROR(EINVAL)); else error = holdfunc(dp, snapname, FTAG, &ds); if (error == 0) { error = dsl_dataset_user_release_check_one(ddura, ds, holds, snapname); dsl_dataset_rele(ds, FTAG); } if (error != 0) { if (ddura->ddura_errlist != NULL) { fnvlist_add_int32(ddura->ddura_errlist, snapname, error); } /* * Non-existent snapshots are put on the errlist, * but don't cause an overall failure. */ if (error != ENOENT) return (error); } } return (0); } static void dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, dmu_tx_t *tx) { dsl_pool_t *dp = ds->ds_dir->dd_pool; objset_t *mos = dp->dp_meta_objset; for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { int error; const char *holdname = nvpair_name(pair); /* Remove temporary hold if one exists. */ error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx); VERIFY(error == 0 || error == ENOENT); VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, holdname, tx)); ds->ds_userrefs--; spa_history_log_internal_ds(ds, "release", tx, "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs); } } static void dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) { dsl_dataset_user_release_arg_t *ddura = arg; dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc; dsl_pool_t *dp = dmu_tx_pool(tx); ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL); pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds, pair)) { dsl_dataset_t *ds; const char *name = nvpair_name(pair); VERIFY0(holdfunc(dp, name, FTAG, &ds)); dsl_dataset_user_release_sync_one(ds, fnvpair_value_nvlist(pair), tx); if (nvlist_exists(ddura->ddura_todelete, name)) { ASSERT(ds->ds_userrefs == 0 && dsl_dataset_phys(ds)->ds_num_children == 1 && DS_IS_DEFER_DESTROY(ds)); dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); } dsl_dataset_rele(ds, FTAG); } } /* * The full semantics of this function are described in the comment above * lzc_release(). * * To summarize: * Releases holds specified in the nvl holds. * * holds is nvl of snapname -> { holdname, ... } * errlist will be filled in with snapname -> error * * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots, * otherwise they should be the names of shapshots. * * As a release may cause snapshots to be destroyed this trys to ensure they * aren't mounted. * * The release of non-existent holds are skipped. * * At least one hold must have been released for the this function to succeed * and return 0. */ static int dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, dsl_pool_t *tmpdp) { dsl_dataset_user_release_arg_t ddura; nvpair_t *pair; char *pool; int error; pair = nvlist_next_nvpair(holds, NULL); if (pair == NULL) return (0); /* * The release may cause snapshots to be destroyed; make sure they * are not mounted. */ if (tmpdp != NULL) { /* Temporary holds are specified by dsobj string. */ ddura.ddura_holdfunc = dsl_dataset_hold_obj_string; pool = spa_name(tmpdp->dp_spa); #ifdef _KERNEL for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { dsl_dataset_t *ds; dsl_pool_config_enter(tmpdp, FTAG); error = dsl_dataset_hold_obj_string(tmpdp, nvpair_name(pair), FTAG, &ds); if (error == 0) { char name[MAXNAMELEN]; dsl_dataset_name(ds, name); dsl_pool_config_exit(tmpdp, FTAG); dsl_dataset_rele(ds, FTAG); (void) zfs_unmount_snap(name); } else { dsl_pool_config_exit(tmpdp, FTAG); } } #endif } else { /* Non-temporary holds are specified by name. */ ddura.ddura_holdfunc = dsl_dataset_hold; pool = nvpair_name(pair); #ifdef _KERNEL for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { (void) zfs_unmount_snap(nvpair_name(pair)); } #endif } ddura.ddura_holds = holds; ddura.ddura_errlist = errlist; ddura.ddura_todelete = fnvlist_alloc(); ddura.ddura_chkholds = fnvlist_alloc(); error = dsl_sync_task(pool, dsl_dataset_user_release_check, dsl_dataset_user_release_sync, &ddura, 0, ZFS_SPACE_CHECK_NONE); fnvlist_free(ddura.ddura_todelete); fnvlist_free(ddura.ddura_chkholds); return (error); } /* * holds is nvl of snapname -> { holdname, ... } * errlist will be filled in with snapname -> error */ int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) { return (dsl_dataset_user_release_impl(holds, errlist, NULL)); } /* * holds is nvl of snapdsobj -> { holdname, ... } */ void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds) { ASSERT(dp != NULL); (void) dsl_dataset_user_release_impl(holds, NULL, dp); } int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) { dsl_pool_t *dp; dsl_dataset_t *ds; int err; err = dsl_pool_hold(dsname, FTAG, &dp); if (err != 0) return (err); err = dsl_dataset_hold(dp, dsname, FTAG, &ds); if (err != 0) { dsl_pool_rele(dp, FTAG); return (err); } if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { zap_attribute_t *za; zap_cursor_t zc; za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, dsl_dataset_phys(ds)->ds_userrefs_obj); zap_cursor_retrieve(&zc, za) == 0; zap_cursor_advance(&zc)) { fnvlist_add_uint64(nvl, za->za_name, za->za_first_integer); } zap_cursor_fini(&zc); kmem_free(za, sizeof (zap_attribute_t)); } dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (0); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sa.c (revision 286575) @@ -1,1999 +1,2000 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 iXsystems, Inc * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * ZFS System attributes: * * A generic mechanism to allow for arbitrary attributes * to be stored in a dnode. The data will be stored in the bonus buffer of * the dnode and if necessary a special "spill" block will be used to handle * overflow situations. The spill block will be sized to fit the data * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the * spill block is stored at the end of the current bonus buffer. Any * attributes that would be in the way of the blkptr_t will be relocated * into the spill block. * * Attribute registration: * * Stored persistently on a per dataset basis * a mapping between attribute "string" names and their actual attribute * numeric values, length, and byteswap function. The names are only used * during registration. All attributes are known by their unique attribute * id value. If an attribute can have a variable size then the value * 0 will be used to indicate this. * * Attribute Layout: * * Attribute layouts are a way to compactly store multiple attributes, but * without taking the overhead associated with managing each attribute * individually. Since you will typically have the same set of attributes * stored in the same order a single table will be used to represent that * layout. The ZPL for example will usually have only about 10 different * layouts (regular files, device files, symlinks, * regular files + scanstamp, files/dir with extended attributes, and then * you have the possibility of all of those minus ACL, because it would * be kicked out into the spill block) * * Layouts are simply an array of the attributes and their * ordering i.e. [0, 1, 4, 5, 2] * * Each distinct layout is given a unique layout number and that is whats * stored in the header at the beginning of the SA data buffer. * * A layout only covers a single dbuf (bonus or spill). If a set of * attributes is split up between the bonus buffer and a spill buffer then * two different layouts will be used. This allows us to byteswap the * spill without looking at the bonus buffer and keeps the on disk format of * the bonus and spill buffer the same. * * Adding a single attribute will cause the entire set of attributes to * be rewritten and could result in a new layout number being constructed * as part of the rewrite if no such layout exists for the new set of * attribues. The new attribute will be appended to the end of the already * existing attributes. * * Both the attribute registration and attribute layout information are * stored in normal ZAP attributes. Their should be a small number of * known layouts and the set of attributes is assumed to typically be quite * small. * * The registered attributes and layout "table" information is maintained * in core and a special "sa_os_t" is attached to the objset_t. * * A special interface is provided to allow for quickly applying * a large set of attributes at once. sa_replace_all_by_template() is * used to set an array of attributes. This is used by the ZPL when * creating a brand new file. The template that is passed into the function * specifies the attribute, size for variable length attributes, location of * data and special "data locator" function if the data isn't in a contiguous * location. * * Byteswap implications: * * Since the SA attributes are not entirely self describing we can't do * the normal byteswap processing. The special ZAP layout attribute and * attribute registration attributes define the byteswap function and the * size of the attributes, unless it is variable sized. * The normal ZFS byteswapping infrastructure assumes you don't need * to read any objects in order to do the necessary byteswapping. Whereas * SA attributes can only be properly byteswapped if the dataset is opened * and the layout/attribute ZAP attributes are available. Because of this * the SA attributes will be byteswapped when they are first accessed by * the SA code that will read the SA data. */ typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, uint16_t length, int length_idx, boolean_t, void *userp); static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); static void *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data); static void sa_idx_tab_rele(objset_t *os, void *arg); static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, int buflen); static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx); arc_byteswap_func_t *sa_bswap_table[] = { byteswap_uint64_array, byteswap_uint32_array, byteswap_uint16_array, byteswap_uint8_array, zfs_acl_byteswap, }; #define SA_COPY_DATA(f, s, t, l) \ { \ if (f == NULL) { \ if (l == 8) { \ *(uint64_t *)t = *(uint64_t *)s; \ } else if (l == 16) { \ *(uint64_t *)t = *(uint64_t *)s; \ *(uint64_t *)((uintptr_t)t + 8) = \ *(uint64_t *)((uintptr_t)s + 8); \ } else { \ bcopy(s, t, l); \ } \ } else \ sa_copy_data(f, s, t, l); \ } /* * This table is fixed and cannot be changed. Its purpose is to * allow the SA code to work with both old/new ZPL file systems. * It contains the list of legacy attributes. These attributes aren't * stored in the "attribute" registry zap objects, since older ZPL file systems * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will * use this static table. */ sa_attr_reg_t sa_legacy_attrs[] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, }; /* * This is only used for objects of type DMU_OT_ZNODE */ sa_attr_type_t sa_legacy_zpl_layout[] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; /* * Special dummy layout used for buffers with no attributes. */ sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; static int sa_legacy_attr_count = 16; static kmem_cache_t *sa_cache = NULL; /*ARGSUSED*/ static int sa_cache_constructor(void *buf, void *unused, int kmflag) { sa_handle_t *hdl = buf; mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } /*ARGSUSED*/ static void sa_cache_destructor(void *buf, void *unused) { sa_handle_t *hdl = buf; + hdl->sa_dbu.dbu_evict_func = NULL; mutex_destroy(&hdl->sa_lock); } void sa_cache_init(void) { sa_cache = kmem_cache_create("sa_cache", sizeof (sa_handle_t), 0, sa_cache_constructor, sa_cache_destructor, NULL, NULL, NULL, 0); } void sa_cache_fini(void) { if (sa_cache) kmem_cache_destroy(sa_cache); } static int layout_num_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = arg1; const sa_lot_t *node2 = arg2; if (node1->lot_num > node2->lot_num) return (1); else if (node1->lot_num < node2->lot_num) return (-1); return (0); } static int layout_hash_compare(const void *arg1, const void *arg2) { const sa_lot_t *node1 = arg1; const sa_lot_t *node2 = arg2; if (node1->lot_hash > node2->lot_hash) return (1); if (node1->lot_hash < node2->lot_hash) return (-1); if (node1->lot_instance > node2->lot_instance) return (1); if (node1->lot_instance < node2->lot_instance) return (-1); return (0); } boolean_t sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) { int i; if (count != tbf->lot_attr_count) return (1); for (i = 0; i != count; i++) { if (attrs[i] != tbf->lot_attrs[i]) return (1); } return (0); } #define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) static uint64_t sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) { int i; uint64_t crc = -1ULL; for (i = 0; i != attr_count; i++) crc ^= SA_ATTR_HASH(attrs[i]); return (crc); } static int sa_get_spill(sa_handle_t *hdl) { int rc; if (hdl->sa_spill == NULL) { if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, &hdl->sa_spill)) == 0) VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } else { rc = 0; } return (rc); } /* * Main attribute lookup/update function * returns 0 for success or non zero for failures * * Operates on bulk array, first failure will abort further processing */ int sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, sa_data_op_t data_op, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; int i; int error = 0; sa_buf_type_t buftypes; buftypes = 0; ASSERT(count > 0); for (i = 0; i != count; i++) { ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); bulk[i].sa_addr = NULL; /* First check the bonus buffer */ if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_bonus_tab, SA_GET_HDR(hdl, SA_BONUS), bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); if (tx && !(buftypes & SA_BONUS)) { dmu_buf_will_dirty(hdl->sa_bonus, tx); buftypes |= SA_BONUS; } } if (bulk[i].sa_addr == NULL && ((error = sa_get_spill(hdl)) == 0)) { if (TOC_ATTR_PRESENT( hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { SA_ATTR_INFO(sa, hdl->sa_spill_tab, SA_GET_HDR(hdl, SA_SPILL), bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); if (tx && !(buftypes & SA_SPILL) && bulk[i].sa_size == bulk[i].sa_length) { dmu_buf_will_dirty(hdl->sa_spill, tx); buftypes |= SA_SPILL; } } } if (error && error != ENOENT) { return ((error == ECKSUM) ? EIO : error); } switch (data_op) { case SA_LOOKUP: if (bulk[i].sa_addr == NULL) return (SET_ERROR(ENOENT)); if (bulk[i].sa_data) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_addr, bulk[i].sa_data, bulk[i].sa_size); } continue; case SA_UPDATE: /* existing rewrite of attr */ if (bulk[i].sa_addr && bulk[i].sa_size == bulk[i].sa_length) { SA_COPY_DATA(bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_addr, bulk[i].sa_length); continue; } else if (bulk[i].sa_addr) { /* attr size change */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_REPLACE, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } else { /* adding new attribute */ error = sa_modify_attrs(hdl, bulk[i].sa_attr, SA_ADD, bulk[i].sa_data_func, bulk[i].sa_data, bulk[i].sa_length, tx); } if (error) return (error); break; } } return (error); } static sa_lot_t * sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) { sa_os_t *sa = os->os_sa; sa_lot_t *tb, *findtb; int i; avl_index_t loc; ASSERT(MUTEX_HELD(&sa->sa_lock)); tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); tb->lot_attr_count = attr_count; tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); tb->lot_num = lot_num; tb->lot_hash = hash; tb->lot_instance = 0; if (zapadd) { char attr_name[8]; if (sa->sa_layout_attr_obj == 0) { sa->sa_layout_attr_obj = zap_create_link(os, DMU_OT_SA_ATTR_LAYOUTS, sa->sa_master_obj, SA_LAYOUTS, tx); } (void) snprintf(attr_name, sizeof (attr_name), "%d", (int)lot_num); VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, attr_name, 2, attr_count, attrs, tx)); } list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), offsetof(sa_idx_tab_t, sa_next)); for (i = 0; i != attr_count; i++) { if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) tb->lot_var_sizes++; } avl_add(&sa->sa_layout_num_tree, tb); /* verify we don't have a hash collision */ if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { for (; findtb && findtb->lot_hash == hash; findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { if (findtb->lot_instance != tb->lot_instance) break; tb->lot_instance++; } } avl_add(&sa->sa_layout_hash_tree, tb); return (tb); } static void sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, int count, dmu_tx_t *tx, sa_lot_t **lot) { sa_lot_t *tb, tbsearch; avl_index_t loc; sa_os_t *sa = os->os_sa; boolean_t found = B_FALSE; mutex_enter(&sa->sa_lock); tbsearch.lot_hash = hash; tbsearch.lot_instance = 0; tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); if (tb) { for (; tb && tb->lot_hash == hash; tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { if (sa_layout_equal(tb, attrs, count) == 0) { found = B_TRUE; break; } } } if (!found) { tb = sa_add_layout_entry(os, attrs, count, avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); } mutex_exit(&sa->sa_lock); *lot = tb; } static int sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) { int error; uint32_t blocksize; if (size == 0) { blocksize = SPA_MINBLOCKSIZE; } else if (size > SPA_OLD_MAXBLOCKSIZE) { ASSERT(0); return (SET_ERROR(EFBIG)); } else { blocksize = P2ROUNDUP_TYPED(size, SPA_MINBLOCKSIZE, uint32_t); } error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); ASSERT(error == 0); return (error); } static void sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) { if (func == NULL) { bcopy(datastart, target, buflen); } else { boolean_t start; int bytes; void *dataptr; void *saptr = target; uint32_t length; start = B_TRUE; bytes = 0; while (bytes < buflen) { func(&dataptr, &length, buflen, start, datastart); bcopy(dataptr, saptr, length); saptr = (void *)((caddr_t)saptr + length); bytes += length; start = B_FALSE; } } } /* * Determine several different sizes * first the sa header size * the number of bytes to be stored * if spill would occur the index in the attribute array is returned * * the boolean will_spill will be set when spilling is necessary. It * is only set when the buftype is SA_BONUS */ static int sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, dmu_buf_t *db, sa_buf_type_t buftype, int *index, int *total, boolean_t *will_spill) { int var_size = 0; int i; int j = -1; int full_space; int hdrsize; boolean_t done = B_FALSE; if (buftype == SA_BONUS && sa->sa_force_spill) { *total = 0; *index = 0; *will_spill = B_TRUE; return (0); } *index = -1; *total = 0; if (buftype == SA_BONUS) *will_spill = B_FALSE; hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : sizeof (sa_hdr_phys_t); full_space = (buftype == SA_BONUS) ? DN_MAX_BONUSLEN : db->db_size; ASSERT(IS_P2ALIGNED(full_space, 8)); for (i = 0; i != attr_count; i++) { boolean_t is_var_sz; *total = P2ROUNDUP(*total, 8); *total += attr_desc[i].sa_length; if (done) goto next; is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); if (is_var_sz) { var_size++; } if (is_var_sz && var_size > 1) { if (P2ROUNDUP(hdrsize + sizeof (uint16_t), 8) + *total < full_space) { /* * Account for header space used by array of * optional sizes of variable-length attributes. * Record the index in case this increase needs * to be reversed due to spill-over. */ hdrsize += sizeof (uint16_t); j = i; } else { done = B_TRUE; *index = i; if (buftype == SA_BONUS) *will_spill = B_TRUE; continue; } } /* * find index of where spill *could* occur. * Then continue to count of remainder attribute * space. The sum is used later for sizing bonus * and spill buffer. */ if (buftype == SA_BONUS && *index == -1 && (*total + P2ROUNDUP(hdrsize, 8)) > (full_space - sizeof (blkptr_t))) { *index = i; done = B_TRUE; } next: if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && buftype == SA_BONUS) *will_spill = B_TRUE; } /* * j holds the index of the last variable-sized attribute for * which hdrsize was increased. Reverse the increase if that * attribute will be relocated to the spill block. */ if (*will_spill && j == *index) hdrsize -= sizeof (uint16_t); hdrsize = P2ROUNDUP(hdrsize, 8); return (hdrsize); } #define BUF_SPACE_NEEDED(total, header) (total + header) /* * Find layout that corresponds to ordering of attributes * If not found a new layout number is created and added to * persistent layout tables. */ static int sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; uint64_t hash; sa_buf_type_t buftype; sa_hdr_phys_t *sahdr; void *data_start; int buf_space; sa_attr_type_t *attrs, *attrs_start; int i, lot_count; int hdrsize; int spillhdrsize = 0; int used; dmu_object_type_t bonustype; sa_lot_t *lot; int len_idx; int spill_used; boolean_t spilling; dmu_buf_will_dirty(hdl->sa_bonus, tx); bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); /* first determine bonus header size and sum of all attributes */ hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, SA_BONUS, &i, &used, &spilling); if (used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); VERIFY(0 == dmu_set_bonus(hdl->sa_bonus, spilling ? MIN(DN_MAX_BONUSLEN - sizeof (blkptr_t), used + hdrsize) : used + hdrsize, tx)); ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || bonustype == DMU_OT_SA); /* setup and size spill buffer when needed */ if (spilling) { boolean_t dummy; if (hdl->sa_spill == NULL) { VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, NULL, &hdl->sa_spill) == 0); } dmu_buf_will_dirty(hdl->sa_spill, tx); spillhdrsize = sa_find_sizes(sa, &attr_desc[i], attr_count - i, hdl->sa_spill, SA_SPILL, &i, &spill_used, &dummy); if (spill_used > SPA_OLD_MAXBLOCKSIZE) return (SET_ERROR(EFBIG)); buf_space = hdl->sa_spill->db_size - spillhdrsize; if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > hdl->sa_spill->db_size) VERIFY(0 == sa_resize_spill(hdl, BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); } /* setup starting pointers to lay down data */ data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; buftype = SA_BONUS; if (spilling) buf_space = (sa->sa_force_spill) ? 0 : SA_BLKPTR_SPACE - hdrsize; else buf_space = hdl->sa_bonus->db_size - hdrsize; attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, KM_SLEEP); lot_count = 0; for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { uint16_t length; ASSERT(IS_P2ALIGNED(data_start, 8)); ASSERT(IS_P2ALIGNED(buf_space, 8)); attrs[i] = attr_desc[i].sa_attr; length = SA_REGISTERED_LEN(sa, attrs[i]); if (length == 0) length = attr_desc[i].sa_length; else VERIFY(length == attr_desc[i].sa_length); if (buf_space < length) { /* switch to spill buffer */ VERIFY(spilling); VERIFY(bonustype == DMU_OT_SA); if (buftype == SA_BONUS && !sa->sa_force_spill) { sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); SA_SET_HDR(sahdr, lot->lot_num, hdrsize); } buftype = SA_SPILL; hash = -1ULL; len_idx = 0; sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; sahdr->sa_magic = SA_MAGIC; data_start = (void *)((uintptr_t)sahdr + spillhdrsize); attrs_start = &attrs[i]; buf_space = hdl->sa_spill->db_size - spillhdrsize; lot_count = 0; } hash ^= SA_ATTR_HASH(attrs[i]); attr_desc[i].sa_addr = data_start; attr_desc[i].sa_size = length; SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, data_start, length); if (sa->sa_attr_table[attrs[i]].sa_length == 0) { sahdr->sa_lengths[len_idx++] = length; } VERIFY((uintptr_t)data_start % 8 == 0); data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + length), 8); buf_space -= P2ROUNDUP(length, 8); lot_count++; } sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); /* * Verify that old znodes always have layout number 0. * Must be DMU_OT_SA for arbitrary layouts */ VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || (bonustype == DMU_OT_SA && lot->lot_num > 1)); if (bonustype == DMU_OT_SA) { SA_SET_HDR(sahdr, lot->lot_num, buftype == SA_BONUS ? hdrsize : spillhdrsize); } kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); if (hdl->sa_bonus_tab) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); hdl->sa_bonus_tab = NULL; } if (!sa->sa_force_spill) VERIFY(0 == sa_build_index(hdl, SA_BONUS)); if (hdl->sa_spill) { sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); if (!spilling) { /* * remove spill block that is no longer needed. */ dmu_buf_rele(hdl->sa_spill, NULL); hdl->sa_spill = NULL; hdl->sa_spill_tab = NULL; VERIFY(0 == dmu_rm_spill(hdl->sa_os, sa_handle_object(hdl), tx)); } else { VERIFY(0 == sa_build_index(hdl, SA_SPILL)); } } return (0); } static void sa_free_attr_table(sa_os_t *sa) { int i; if (sa->sa_attr_table == NULL) return; for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_name) kmem_free(sa->sa_attr_table[i].sa_name, strlen(sa->sa_attr_table[i].sa_name) + 1); } kmem_free(sa->sa_attr_table, sizeof (sa_attr_table_t) * sa->sa_num_attrs); sa->sa_attr_table = NULL; } static int sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) { sa_os_t *sa = os->os_sa; uint64_t sa_attr_count = 0; uint64_t sa_reg_count = 0; int error = 0; uint64_t attr_value; sa_attr_table_t *tb; zap_cursor_t zc; zap_attribute_t za; int registered_count = 0; int i; dmu_objset_type_t ostype = dmu_objset_type(os); sa->sa_user_table = kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); if (sa->sa_reg_attr_obj != 0) { error = zap_count(os, sa->sa_reg_attr_obj, &sa_attr_count); /* * Make sure we retrieved a count and that it isn't zero */ if (error || (error == 0 && sa_attr_count == 0)) { if (error == 0) error = SET_ERROR(EINVAL); goto bail; } sa_reg_count = sa_attr_count; } if (ostype == DMU_OST_ZFS && sa_attr_count == 0) sa_attr_count += sa_legacy_attr_count; /* Allocate attribute numbers for attributes that aren't registered */ for (i = 0; i != count; i++) { boolean_t found = B_FALSE; int j; if (ostype == DMU_OST_ZFS) { for (j = 0; j != sa_legacy_attr_count; j++) { if (strcmp(reg_attrs[i].sa_name, sa_legacy_attrs[j].sa_name) == 0) { sa->sa_user_table[i] = sa_legacy_attrs[j].sa_attr; found = B_TRUE; } } } if (found) continue; if (sa->sa_reg_attr_obj) error = zap_lookup(os, sa->sa_reg_attr_obj, reg_attrs[i].sa_name, 8, 1, &attr_value); else error = SET_ERROR(ENOENT); switch (error) { case ENOENT: sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; sa_attr_count++; break; case 0: sa->sa_user_table[i] = ATTR_NUM(attr_value); break; default: goto bail; } } sa->sa_num_attrs = sa_attr_count; tb = sa->sa_attr_table = kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); /* * Attribute table is constructed from requested attribute list, * previously foreign registered attributes, and also the legacy * ZPL set of attributes. */ if (sa->sa_reg_attr_obj) { for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t value; value = za.za_first_integer; registered_count++; tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); tb[ATTR_NUM(value)].sa_registered = B_TRUE; if (tb[ATTR_NUM(value)].sa_name) { continue; } tb[ATTR_NUM(value)].sa_name = kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, strlen(za.za_name) +1); } zap_cursor_fini(&zc); /* * Make sure we processed the correct number of registered * attributes */ if (registered_count != sa_reg_count) { ASSERT(error != 0); goto bail; } } if (ostype == DMU_OST_ZFS) { for (i = 0; i != sa_legacy_attr_count; i++) { if (tb[i].sa_name) continue; tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; tb[i].sa_length = sa_legacy_attrs[i].sa_length; tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; tb[i].sa_registered = B_FALSE; tb[i].sa_name = kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, KM_SLEEP); (void) strlcpy(tb[i].sa_name, sa_legacy_attrs[i].sa_name, strlen(sa_legacy_attrs[i].sa_name) + 1); } } for (i = 0; i != count; i++) { sa_attr_type_t attr_id; attr_id = sa->sa_user_table[i]; if (tb[attr_id].sa_name) continue; tb[attr_id].sa_length = reg_attrs[i].sa_length; tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; tb[attr_id].sa_attr = attr_id; tb[attr_id].sa_name = kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, strlen(reg_attrs[i].sa_name) + 1); } sa->sa_need_attr_registration = (sa_attr_count != registered_count); return (0); bail: kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); sa->sa_user_table = NULL; sa_free_attr_table(sa); return ((error != 0) ? error : EINVAL); } int sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, sa_attr_type_t **user_table) { zap_cursor_t zc; zap_attribute_t za; sa_os_t *sa; dmu_objset_type_t ostype = dmu_objset_type(os); sa_attr_type_t *tb; int error; mutex_enter(&os->os_user_ptr_lock); if (os->os_sa) { mutex_enter(&os->os_sa->sa_lock); mutex_exit(&os->os_user_ptr_lock); tb = os->os_sa->sa_user_table; mutex_exit(&os->os_sa->sa_lock); *user_table = tb; return (0); } sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); mutex_init(&sa->sa_lock, NULL, MUTEX_DEFAULT, NULL); sa->sa_master_obj = sa_obj; os->os_sa = sa; mutex_enter(&sa->sa_lock); mutex_exit(&os->os_user_ptr_lock); avl_create(&sa->sa_layout_num_tree, layout_num_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); if (sa_obj) { error = zap_lookup(os, sa_obj, SA_LAYOUTS, 8, 1, &sa->sa_layout_attr_obj); if (error != 0 && error != ENOENT) goto fail; error = zap_lookup(os, sa_obj, SA_REGISTRY, 8, 1, &sa->sa_reg_attr_obj); if (error != 0 && error != ENOENT) goto fail; } if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) goto fail; if (sa->sa_layout_attr_obj != 0) { uint64_t layout_count; error = zap_count(os, sa->sa_layout_attr_obj, &layout_count); /* * Layout number count should be > 0 */ if (error || (error == 0 && layout_count == 0)) { if (error == 0) error = SET_ERROR(EINVAL); goto fail; } for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); (error = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { sa_attr_type_t *lot_attrs; uint64_t lot_num; lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * za.za_num_integers, KM_SLEEP); if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, za.za_name, 2, za.za_num_integers, lot_attrs))) != 0) { kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); break; } VERIFY(ddi_strtoull(za.za_name, NULL, 10, (unsigned long long *)&lot_num) == 0); (void) sa_add_layout_entry(os, lot_attrs, za.za_num_integers, lot_num, sa_layout_info_hash(lot_attrs, za.za_num_integers), B_FALSE, NULL); kmem_free(lot_attrs, sizeof (sa_attr_type_t) * za.za_num_integers); } zap_cursor_fini(&zc); /* * Make sure layout count matches number of entries added * to AVL tree */ if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { ASSERT(error != 0); goto fail; } } /* Add special layout number for old ZNODES */ if (ostype == DMU_OST_ZFS) { (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, sa_legacy_attr_count, 0, sa_layout_info_hash(sa_legacy_zpl_layout, sa_legacy_attr_count), B_FALSE, NULL); (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, 0, B_FALSE, NULL); } *user_table = os->os_sa->sa_user_table; mutex_exit(&sa->sa_lock); return (0); fail: os->os_sa = NULL; sa_free_attr_table(sa); if (sa->sa_user_table) kmem_free(sa->sa_user_table, sa->sa_user_table_sz); mutex_exit(&sa->sa_lock); avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); return ((error == ECKSUM) ? EIO : error); } void sa_tear_down(objset_t *os) { sa_os_t *sa = os->os_sa; sa_lot_t *layout; void *cookie; kmem_free(sa->sa_user_table, sa->sa_user_table_sz); /* Free up attr table */ sa_free_attr_table(sa); cookie = NULL; while (layout = avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie)) { sa_idx_tab_t *tab; while (tab = list_head(&layout->lot_idx_tab)) { ASSERT(refcount_count(&tab->sa_refcount)); sa_idx_tab_rele(os, tab); } } cookie = NULL; while (layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie)) { kmem_free(layout->lot_attrs, sizeof (sa_attr_type_t) * layout->lot_attr_count); kmem_free(layout, sizeof (sa_lot_t)); } avl_destroy(&sa->sa_layout_hash_tree); avl_destroy(&sa->sa_layout_num_tree); mutex_destroy(&sa->sa_lock); kmem_free(sa, sizeof (sa_os_t)); os->os_sa = NULL; } void sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t var_length, void *userp) { sa_idx_tab_t *idx_tab = userp; if (var_length) { ASSERT(idx_tab->sa_variable_lengths); idx_tab->sa_variable_lengths[length_idx] = length; } TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); } static void sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, sa_iterfunc_t func, sa_lot_t *tab, void *userp) { void *data_start; sa_lot_t *tb = tab; sa_lot_t search; avl_index_t loc; sa_os_t *sa = os->os_sa; int i; uint16_t *length_start = NULL; uint8_t length_idx = 0; if (tab == NULL) { search.lot_num = SA_LAYOUT_NUM(hdr, type); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); ASSERT(tb); } if (IS_SA_BONUSTYPE(type)) { data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + offsetof(sa_hdr_phys_t, sa_lengths) + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); length_start = hdr->sa_lengths; } else { data_start = hdr; } for (i = 0; i != tb->lot_attr_count; i++) { int attr_length, reg_length; uint8_t idx_len; reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; if (reg_length) { attr_length = reg_length; idx_len = 0; } else { attr_length = length_start[length_idx]; idx_len = length_idx++; } func(hdr, data_start, tb->lot_attrs[i], attr_length, idx_len, reg_length == 0 ? B_TRUE : B_FALSE, userp); data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + attr_length), 8); } } /*ARGSUSED*/ void sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, uint16_t length, int length_idx, boolean_t variable_length, void *userp) { sa_handle_t *hdl = userp; sa_os_t *sa = hdl->sa_os->os_sa; sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); } void sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); dmu_buf_impl_t *db; sa_os_t *sa = hdl->sa_os->os_sa; int num_lengths = 1; int i; ASSERT(MUTEX_HELD(&sa->sa_lock)); if (sa_hdr_phys->sa_magic == SA_MAGIC) return; db = SA_GET_DB(hdl, buftype); if (buftype == SA_SPILL) { arc_release(db->db_buf, NULL); arc_buf_thaw(db->db_buf); } sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); /* * Determine number of variable lenghts in header * The standard 8 byte header has one for free and a * 16 byte header would have 4 + 1; */ if (SA_HDR_SIZE(sa_hdr_phys) > 8) num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; for (i = 0; i != num_lengths; i++) sa_hdr_phys->sa_lengths[i] = BSWAP_16(sa_hdr_phys->sa_lengths[i]); sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, sa_byteswap_cb, NULL, hdl); if (buftype == SA_SPILL) arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); } static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) { sa_hdr_phys_t *sa_hdr_phys; dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); sa_os_t *sa = hdl->sa_os->os_sa; sa_idx_tab_t *idx_tab; sa_hdr_phys = SA_GET_HDR(hdl, buftype); mutex_enter(&sa->sa_lock); /* Do we need to byteswap? */ /* only check if not old znode */ if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && sa_hdr_phys->sa_magic != 0) { VERIFY(BSWAP_32(sa_hdr_phys->sa_magic) == SA_MAGIC); sa_byteswap(hdl, buftype); } idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); if (buftype == SA_BONUS) hdl->sa_bonus_tab = idx_tab; else hdl->sa_spill_tab = idx_tab; mutex_exit(&sa->sa_lock); return (0); } /*ARGSUSED*/ -void -sa_evict(dmu_buf_t *db, void *sap) +static void +sa_evict(void *dbu) { - panic("evicting sa dbuf %p\n", (void *)db); + panic("evicting sa dbuf\n"); } static void sa_idx_tab_rele(objset_t *os, void *arg) { sa_os_t *sa = os->os_sa; sa_idx_tab_t *idx_tab = arg; if (idx_tab == NULL) return; mutex_enter(&sa->sa_lock); if (refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); if (idx_tab->sa_variable_lengths) kmem_free(idx_tab->sa_variable_lengths, sizeof (uint16_t) * idx_tab->sa_layout->lot_var_sizes); refcount_destroy(&idx_tab->sa_refcount); kmem_free(idx_tab->sa_idx_tab, sizeof (uint32_t) * sa->sa_num_attrs); kmem_free(idx_tab, sizeof (sa_idx_tab_t)); } mutex_exit(&sa->sa_lock); } static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) { sa_os_t *sa = os->os_sa; ASSERT(MUTEX_HELD(&sa->sa_lock)); (void) refcount_add(&idx_tab->sa_refcount, NULL); } void sa_handle_destroy(sa_handle_t *hdl) { + dmu_buf_t *db = hdl->sa_bonus; + mutex_enter(&hdl->sa_lock); - (void) dmu_buf_update_user((dmu_buf_t *)hdl->sa_bonus, hdl, - NULL, NULL); + (void) dmu_buf_remove_user(db, &hdl->sa_dbu); if (hdl->sa_bonus_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); if (hdl->sa_spill_tab) sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); dmu_buf_rele(hdl->sa_bonus, NULL); if (hdl->sa_spill) dmu_buf_rele((dmu_buf_t *)hdl->sa_spill, NULL); mutex_exit(&hdl->sa_lock); kmem_cache_free(sa_cache, hdl); } int sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { int error = 0; dmu_object_info_t doi; - sa_handle_t *handle; + sa_handle_t *handle = NULL; #ifdef ZFS_DEBUG dmu_object_info_from_db(db, &doi); ASSERT(doi.doi_bonus_type == DMU_OT_SA || doi.doi_bonus_type == DMU_OT_ZNODE); #endif /* find handle, if it exists */ /* if one doesn't exist then create a new one, and initialize it */ - handle = (hdl_type == SA_HDL_SHARED) ? dmu_buf_get_user(db) : NULL; + if (hdl_type == SA_HDL_SHARED) + handle = dmu_buf_get_user(db); + if (handle == NULL) { - sa_handle_t *newhandle; + sa_handle_t *winner = NULL; + handle = kmem_cache_alloc(sa_cache, KM_SLEEP); handle->sa_userp = userp; handle->sa_bonus = db; handle->sa_os = os; handle->sa_spill = NULL; handle->sa_bonus_tab = NULL; handle->sa_spill_tab = NULL; error = sa_build_index(handle, SA_BONUS); - newhandle = (hdl_type == SA_HDL_SHARED) ? - dmu_buf_set_user_ie(db, handle, sa_evict) : NULL; - if (newhandle != NULL) { + if (hdl_type == SA_HDL_SHARED) { + dmu_buf_init_user(&handle->sa_dbu, sa_evict, NULL); + winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); + } + + if (winner != NULL) { kmem_cache_free(sa_cache, handle); - handle = newhandle; + handle = winner; } } *handlepp = handle; return (error); } int sa_handle_get(objset_t *objset, uint64_t objid, void *userp, sa_handle_type_t hdl_type, sa_handle_t **handlepp) { dmu_buf_t *db; int error; if (error = dmu_bonus_hold(objset, objid, NULL, &db)) return (error); return (sa_handle_get_from_db(objset, db, userp, hdl_type, handlepp)); } int sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) { return (dmu_bonus_hold(objset, obj_num, tag, db)); } void sa_buf_rele(dmu_buf_t *db, void *tag) { dmu_buf_rele(db, tag); } int sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); } int sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) { int error; sa_bulk_attr_t bulk; bulk.sa_attr = attr; bulk.sa_data = buf; bulk.sa_length = buflen; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_lookup_impl(hdl, &bulk, 1); mutex_exit(&hdl->sa_lock); return (error); } #ifdef _KERNEL int sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) { int error; sa_bulk_attr_t bulk; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, uio->uio_resid), UIO_READ, uio); } mutex_exit(&hdl->sa_lock); return (error); } #endif void * sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, void *data) { sa_idx_tab_t *idx_tab; sa_hdr_phys_t *hdr = (sa_hdr_phys_t *)data; sa_os_t *sa = os->os_sa; sa_lot_t *tb, search; avl_index_t loc; /* * Deterimine layout number. If SA node and header == 0 then * force the index table to the dummy "1" empty layout. * * The layout number would only be zero for a newly created file * that has not added any attributes yet, or with crypto enabled which * doesn't write any attributes to the bonus buffer. */ search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); /* Verify header size is consistent with layout information */ ASSERT(tb); ASSERT(IS_SA_BONUSTYPE(bonustype) && SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) || !IS_SA_BONUSTYPE(bonustype) || (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); /* * See if any of the already existing TOC entries can be reused? */ for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { boolean_t valid_idx = B_TRUE; int i; if (tb->lot_var_sizes != 0 && idx_tab->sa_variable_lengths != NULL) { for (i = 0; i != tb->lot_var_sizes; i++) { if (hdr->sa_lengths[i] != idx_tab->sa_variable_lengths[i]) { valid_idx = B_FALSE; break; } } } if (valid_idx) { sa_idx_tab_hold(os, idx_tab); return (idx_tab); } } /* No such luck, create a new entry */ idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); idx_tab->sa_idx_tab = kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); idx_tab->sa_layout = tb; refcount_create(&idx_tab->sa_refcount); if (tb->lot_var_sizes) idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * tb->lot_var_sizes, KM_SLEEP); sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, tb, idx_tab); sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ sa_idx_tab_hold(os, idx_tab); /* one for layout */ list_insert_tail(&tb->lot_idx_tab, idx_tab); return (idx_tab); } void sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, boolean_t start, void *userdata) { ASSERT(start); *dataptr = userdata; *len = total_len; } static void sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) { uint64_t attr_value = 0; sa_os_t *sa = hdl->sa_os->os_sa; sa_attr_table_t *tb = sa->sa_attr_table; int i; mutex_enter(&sa->sa_lock); if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { mutex_exit(&sa->sa_lock); return; } if (sa->sa_reg_attr_obj == 0) { sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, DMU_OT_SA_ATTR_REGISTRATION, sa->sa_master_obj, SA_REGISTRY, tx); } for (i = 0; i != sa->sa_num_attrs; i++) { if (sa->sa_attr_table[i].sa_registered) continue; ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, tb[i].sa_byteswap); VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, tb[i].sa_name, 8, 1, &attr_value, tx)); tb[i].sa_registered = B_TRUE; } sa->sa_need_attr_registration = B_FALSE; mutex_exit(&sa->sa_lock); } /* * Replace all attributes with attributes specified in template. * If dnode had a spill buffer then those attributes will be * also be replaced, possibly with just an empty spill block * * This interface is intended to only be used for bulk adding of * attributes for a new file. It will also be used by the ZPL * when converting and old formatted znode to native SA support. */ int sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); } int sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_replace_all_by_template_locked(hdl, attr_desc, attr_count, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Add/remove a single attribute or replace a variable-sized attribute value * with a value of a different size, and then rewrite the entire set * of attributes. * Same-length attribute value replacement (including fixed-length attributes) * is handled more efficiently by the upper layers. */ static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, sa_data_op_t action, sa_data_locator_t *locator, void *datastart, uint16_t buflen, dmu_tx_t *tx) { sa_os_t *sa = hdl->sa_os->os_sa; dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; dnode_t *dn; sa_bulk_attr_t *attr_desc; void *old_data[2]; int bonus_attr_count = 0; int bonus_data_size = 0; int spill_data_size = 0; int spill_attr_count = 0; int error; uint16_t length; int i, j, k, length_idx; sa_hdr_phys_t *hdr; sa_idx_tab_t *idx_tab; int attr_count; int count; ASSERT(MUTEX_HELD(&hdl->sa_lock)); /* First make of copy of the old data */ DB_DNODE_ENTER(db); dn = DB_DNODE(db); if (dn->dn_bonuslen != 0) { bonus_data_size = hdl->sa_bonus->db_size; old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); bcopy(hdl->sa_bonus->db_data, old_data[0], hdl->sa_bonus->db_size); bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; } else { old_data[0] = NULL; } DB_DNODE_EXIT(db); /* Bring spill buffer online if it isn't currently */ if ((error = sa_get_spill(hdl)) == 0) { spill_data_size = hdl->sa_spill->db_size; old_data[1] = kmem_alloc(spill_data_size, KM_SLEEP); bcopy(hdl->sa_spill->db_data, old_data[1], hdl->sa_spill->db_size); spill_attr_count = hdl->sa_spill_tab->sa_layout->lot_attr_count; } else if (error && error != ENOENT) { if (old_data[0]) kmem_free(old_data[0], bonus_data_size); return (error); } else { old_data[1] = NULL; } /* build descriptor of all attributes */ attr_count = bonus_attr_count + spill_attr_count; if (action == SA_ADD) attr_count++; else if (action == SA_REMOVE) attr_count--; attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); /* * loop through bonus and spill buffer if it exists, and * build up new attr_descriptor to reset the attributes */ k = j = 0; count = bonus_attr_count; hdr = SA_GET_HDR(hdl, SA_BONUS); idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); for (; k != 2; k++) { /* iterate over each attribute in layout */ for (i = 0, length_idx = 0; i != count; i++) { sa_attr_type_t attr; attr = idx_tab->sa_layout->lot_attrs[i]; if (attr == newattr) { /* duplicate attributes are not allowed */ ASSERT(action == SA_REPLACE || action == SA_REMOVE); /* must be variable-sized to be replaced here */ if (action == SA_REPLACE) { ASSERT(SA_REGISTERED_LEN(sa, attr) == 0); SA_ADD_BULK_ATTR(attr_desc, j, attr, locator, datastart, buflen); } } else { length = SA_REGISTERED_LEN(sa, attr); if (length == 0) { length = hdr->sa_lengths[length_idx]; } SA_ADD_BULK_ATTR(attr_desc, j, attr, NULL, (void *) (TOC_OFF(idx_tab->sa_idx_tab[attr]) + (uintptr_t)old_data[k]), length); } if (SA_REGISTERED_LEN(sa, attr) == 0) length_idx++; } if (k == 0 && hdl->sa_spill) { hdr = SA_GET_HDR(hdl, SA_SPILL); idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); count = spill_attr_count; } else { break; } } if (action == SA_ADD) { length = SA_REGISTERED_LEN(sa, newattr); if (length == 0) { length = buflen; } SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, datastart, buflen); } ASSERT3U(j, ==, attr_count); error = sa_build_layouts(hdl, attr_desc, attr_count, tx); if (old_data[0]) kmem_free(old_data[0], bonus_data_size); if (old_data[1]) kmem_free(old_data[1], spill_data_size); kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); return (error); } static int sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, dmu_tx_t *tx) { int error; sa_os_t *sa = hdl->sa_os->os_sa; dmu_object_type_t bonustype; bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); /* sync out registration table if necessary */ if (sa->sa_need_attr_registration) sa_attr_register_sync(hdl, tx); error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) sa->sa_update_cb(hdl, tx); return (error); } /* * update or add new attribute */ int sa_update(sa_handle_t *hdl, sa_attr_type_t type, void *buf, uint32_t buflen, dmu_tx_t *tx) { int error; sa_bulk_attr_t bulk; bulk.sa_attr = type; bulk.sa_data_func = NULL; bulk.sa_length = buflen; bulk.sa_data = buf; mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, &bulk, 1, tx); mutex_exit(&hdl->sa_lock); return (error); } int sa_update_from_cb(sa_handle_t *hdl, sa_attr_type_t attr, uint32_t buflen, sa_data_locator_t *locator, void *userdata, dmu_tx_t *tx) { int error; sa_bulk_attr_t bulk; bulk.sa_attr = attr; bulk.sa_data = userdata; bulk.sa_data_func = locator; bulk.sa_length = buflen; mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, &bulk, 1, tx); mutex_exit(&hdl->sa_lock); return (error); } /* * Return size of an attribute */ int sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) { sa_bulk_attr_t bulk; int error; bulk.sa_data = NULL; bulk.sa_attr = attr; bulk.sa_data_func = NULL; ASSERT(hdl); mutex_enter(&hdl->sa_lock); if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { mutex_exit(&hdl->sa_lock); return (error); } *size = bulk.sa_size; mutex_exit(&hdl->sa_lock); return (0); } int sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { ASSERT(hdl); ASSERT(MUTEX_HELD(&hdl->sa_lock)); return (sa_lookup_impl(hdl, attrs, count)); } int sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_lookup_locked(hdl, attrs, count); mutex_exit(&hdl->sa_lock); return (error); } int sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) { int error; ASSERT(hdl); mutex_enter(&hdl->sa_lock); error = sa_bulk_update_impl(hdl, attrs, count, tx); mutex_exit(&hdl->sa_lock); return (error); } int sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) { int error; mutex_enter(&hdl->sa_lock); error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, NULL, 0, tx); mutex_exit(&hdl->sa_lock); return (error); } void sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) { dmu_object_info_from_db((dmu_buf_t *)hdl->sa_bonus, doi); } void sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) { dmu_object_size_from_db((dmu_buf_t *)hdl->sa_bonus, blksize, nblocks); -} - -void -sa_update_user(sa_handle_t *newhdl, sa_handle_t *oldhdl) -{ - (void) dmu_buf_update_user((dmu_buf_t *)newhdl->sa_bonus, - oldhdl, newhdl, sa_evict); - oldhdl->sa_bonus = NULL; } void sa_set_userp(sa_handle_t *hdl, void *ptr) { hdl->sa_userp = ptr; } dmu_buf_t * sa_get_db(sa_handle_t *hdl) { return ((dmu_buf_t *)hdl->sa_bonus); } void * sa_get_userdata(sa_handle_t *hdl) { return (hdl->sa_userp); } void sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) { ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); os->os_sa->sa_update_cb = func; } void sa_register_update_callback(objset_t *os, sa_update_cb_t *func) { mutex_enter(&os->os_sa->sa_lock); sa_register_update_callback_locked(os, func); mutex_exit(&os->os_sa->sa_lock); } uint64_t sa_handle_object(sa_handle_t *hdl) { return (hdl->sa_bonus->db_object); } boolean_t sa_enabled(objset_t *os) { return (os->os_sa == NULL); } int sa_set_sa_object(objset_t *os, uint64_t sa_object) { sa_os_t *sa = os->os_sa; if (sa->sa_master_obj) return (1); sa->sa_master_obj = sa_object; return (0); } int sa_hdrsize(void *arg) { sa_hdr_phys_t *hdr = arg; return (SA_HDR_SIZE(hdr)); } void sa_handle_lock(sa_handle_t *hdl) { ASSERT(hdl); mutex_enter(&hdl->sa_lock); } void sa_handle_unlock(sa_handle_t *hdl) { ASSERT(hdl); mutex_exit(&hdl->sa_lock); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c (revision 286575) @@ -1,6997 +1,7014 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013, 2014, Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2013 Martin Matuska . All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* * SPA: Storage Pool Allocator * * This file contains all the routines used when modifying on-disk SPA state. * This includes opening, importing, destroying, exporting a pool, and syncing a * pool. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #include #include #endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" /* Check hostid on import? */ static int check_hostid = 1; /* * The interval, in seconds, at which failed configuration cache file writes * should be retried. */ static int zfs_ccw_retry_interval = 300; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, check_hostid, CTLFLAG_RWTUN, &check_hostid, 0, "Check hostid on import?"); TUNABLE_INT("vfs.zfs.ccw_retry_interval", &zfs_ccw_retry_interval); SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RW, &zfs_ccw_retry_interval, 0, "Configuration cache file write, retry after failure, interval (seconds)"); typedef enum zti_modes { ZTI_MODE_FIXED, /* value is # of threads (min 1) */ ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ ZTI_MODE_NULL, /* don't create a taskq */ ZTI_NMODES } zti_modes_t; #define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } #define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } #define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } #define ZTI_N(n) ZTI_P(n, 1) #define ZTI_ONE ZTI_N(1) typedef struct zio_taskq_info { zti_modes_t zti_mode; uint_t zti_value; uint_t zti_count; } zio_taskq_info_t; static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { "issue", "issue_high", "intr", "intr_high" }; /* * This table defines the taskq settings for each ZFS I/O type. When * initializing a pool, we use this table to create an appropriately sized * taskq. Some operations are low volume and therefore have a small, static * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE * macros. Other operations process a large amount of data; the ZTI_BATCH * macro causes us to create a taskq oriented for throughput. Some operations * are so high frequency and short-lived that the taskq itself can become a a * point of lock contention. The ZTI_P(#, #) macro indicates that we need an * additional degree of parallelism specified by the number of threads per- * taskq and the number of taskqs; when dispatching an event in this case, the * particular taskq is chosen at random. * * The different taskq priorities are to handle the different contexts (issue * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ { ZTI_BATCH, ZTI_N(5), ZTI_N(8), ZTI_N(5) }, /* WRITE */ { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ }; static void spa_sync_version(void *arg, dmu_tx_t *tx); static void spa_sync_props(void *arg, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, char **ereport); static void spa_vdev_resilver_done(spa_t *spa); uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ #ifdef PSRSET_BIND id_t zio_taskq_psrset_bind = PS_NONE; #endif #ifdef SYSDC boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ #endif uint_t zio_taskq_basedc = 80; /* base duty cycle */ boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ extern int zfs_sync_pass_deferred_free; #ifndef illumos extern void spa_deadman(void *arg); #endif /* * This (illegal) pool name is used when temporarily importing a spa_t in order * to get the vdev stats associated with the imported devices. */ #define TRYIMPORT_NAME "$import" /* * ========================================================================== * SPA properties routines * ========================================================================== */ /* * Add a (source=src, propname=propval) list to an nvlist. */ static void spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, uint64_t intval, zprop_source_t src) { const char *propname = zpool_prop_to_name(prop); nvlist_t *propval; VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); if (strval != NULL) VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); else VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); nvlist_free(propval); } /* * Get property values from the spa configuration. */ static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; uint64_t size, alloc, cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); if (rvd != NULL) { alloc = metaslab_class_get_alloc(spa_normal_class(spa)); size = metaslab_class_get_space(spa_normal_class(spa)); spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, metaslab_class_fragmentation(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == FREAD), src); cap = (size == 0) ? 0 : (alloc * 100 / size); spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, ddt_get_pool_dedup_ratio(spa), src); spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, rvd->vdev_state, src); version = spa_version(spa); if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) src = ZPROP_SRC_DEFAULT; else src = ZPROP_SRC_LOCAL; spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); } if (pool != NULL) { /* * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, * when opening pools before this version freedir will be NULL. */ if (pool->dp_free_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, 0, src); } if (pool->dp_leak_dir != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, src); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, 0, src); } } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); if (spa->spa_comment != NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, 0, ZPROP_SRC_LOCAL); } if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, 0, ZPROP_SRC_LOCAL); if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); } else { spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); } if ((dp = list_head(&spa->spa_config_list)) != NULL) { if (dp->scd_path == NULL) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, "none", 0, ZPROP_SRC_LOCAL); } else if (strcmp(dp->scd_path, spa_config_path) != 0) { spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, dp->scd_path, 0, ZPROP_SRC_LOCAL); } } } /* * Get zpool property values. */ int spa_prop_get(spa_t *spa, nvlist_t **nvp) { objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; int err; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); mutex_enter(&spa->spa_props_lock); /* * Get properties from the spa config. */ spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ if (mos == NULL || spa->spa_pool_props_object == 0) { mutex_exit(&spa->spa_props_lock); return (0); } /* * Get properties from the MOS pool property object. */ for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); (err = zap_cursor_retrieve(&zc, &za)) == 0; zap_cursor_advance(&zc)) { uint64_t intval = 0; char *strval = NULL; zprop_source_t src = ZPROP_SRC_DEFAULT; zpool_prop_t prop; if ((prop = zpool_name_to_prop(za.za_name)) == ZPROP_INVAL) continue; switch (za.za_integer_length) { case 8: /* integer property */ if (za.za_first_integer != zpool_prop_default_numeric(prop)) src = ZPROP_SRC_LOCAL; if (prop == ZPOOL_PROP_BOOTFS) { dsl_pool_t *dp; dsl_dataset_t *ds = NULL; dp = spa_get_dsl(spa); dsl_pool_config_enter(dp, FTAG); if (err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &ds)) { dsl_pool_config_exit(dp, FTAG); break; } strval = kmem_alloc( MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, KM_SLEEP); dsl_dataset_name(ds, strval); dsl_dataset_rele(ds, FTAG); dsl_pool_config_exit(dp, FTAG); } else { strval = NULL; intval = za.za_first_integer; } spa_prop_add_list(*nvp, prop, strval, intval, src); if (strval != NULL) kmem_free(strval, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); break; case 1: /* string property */ strval = kmem_alloc(za.za_num_integers, KM_SLEEP); err = zap_lookup(mos, spa->spa_pool_props_object, za.za_name, 1, za.za_num_integers, strval); if (err) { kmem_free(strval, za.za_num_integers); break; } spa_prop_add_list(*nvp, prop, strval, 0, src); kmem_free(strval, za.za_num_integers); break; default: break; } } zap_cursor_fini(&zc); mutex_exit(&spa->spa_props_lock); out: if (err && err != ENOENT) { nvlist_free(*nvp); *nvp = NULL; return (err); } return (0); } /* * Validate the given pool properties nvlist and modify the list * for the property values to be set. */ static int spa_prop_validate(spa_t *spa, nvlist_t *props) { nvpair_t *elem; int error = 0, reset_bootfs = 0; uint64_t objnum = 0; boolean_t has_feature = B_FALSE; elem = NULL; while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { uint64_t intval; char *strval, *slash, *check, *fname; const char *propname = nvpair_name(elem); zpool_prop_t prop = zpool_name_to_prop(propname); switch (prop) { case ZPROP_INVAL: if (!zpool_prop_feature(propname)) { error = SET_ERROR(EINVAL); break; } /* * Sanitize the input. */ if (nvpair_type(elem) != DATA_TYPE_UINT64) { error = SET_ERROR(EINVAL); break; } if (nvpair_value_uint64(elem, &intval) != 0) { error = SET_ERROR(EINVAL); break; } if (intval != 0) { error = SET_ERROR(EINVAL); break; } fname = strchr(propname, '@') + 1; if (zfeature_lookup_name(fname, NULL) != 0) { error = SET_ERROR(EINVAL); break; } has_feature = B_TRUE; break; case ZPOOL_PROP_VERSION: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < spa_version(spa) || intval > SPA_VERSION_BEFORE_FEATURES || has_feature)) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_BOOTFS: /* * If the pool version is less than SPA_VERSION_BOOTFS, * or the pool is still being created (version == 0), * the bootfs property cannot be set. */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = SET_ERROR(ENOTSUP); break; } /* * Make sure the vdev config is bootable */ if (!vdev_is_bootable(spa->spa_root_vdev)) { error = SET_ERROR(ENOTSUP); break; } reset_bootfs = 1; error = nvpair_value_string(elem, &strval); if (!error) { objset_t *os; uint64_t propval; if (strval == NULL || strval[0] == '\0') { objnum = zpool_prop_default_numeric( ZPOOL_PROP_BOOTFS); break; } if (error = dmu_objset_hold(strval, FTAG, &os)) break; /* * Must be ZPL, and its property settings * must be supported by GRUB (compression * is not gzip, and large blocks are not used). */ if (dmu_objset_type(os) != DMU_OST_ZFS) { error = SET_ERROR(ENOTSUP); } else if ((error = dsl_prop_get_int_ds(dmu_objset_ds(os), zfs_prop_to_name(ZFS_PROP_COMPRESSION), &propval)) == 0 && !BOOTFS_COMPRESS_VALID(propval)) { error = SET_ERROR(ENOTSUP); } else if ((error = dsl_prop_get_int_ds(dmu_objset_ds(os), zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &propval)) == 0 && propval > SPA_OLD_MAXBLOCKSIZE) { error = SET_ERROR(ENOTSUP); } else { objnum = dmu_objset_id(os); } dmu_objset_rele(os, FTAG); } break; case ZPOOL_PROP_FAILUREMODE: error = nvpair_value_uint64(elem, &intval); if (!error && (intval < ZIO_FAILURE_MODE_WAIT || intval > ZIO_FAILURE_MODE_PANIC)) error = SET_ERROR(EINVAL); /* * This is a special case which only occurs when * the pool has completely failed. This allows * the user to change the in-core failmode property * without syncing it out to disk (I/Os might * currently be blocked). We do this by returning * EIO to the caller (spa_prop_set) to trick it * into thinking we encountered a property validation * error. */ if (!error && spa_suspended(spa)) { spa->spa_failmode = intval; error = SET_ERROR(EIO); } break; case ZPOOL_PROP_CACHEFILE: if ((error = nvpair_value_string(elem, &strval)) != 0) break; if (strval[0] == '\0') break; if (strcmp(strval, "none") == 0) break; if (strval[0] != '/') { error = SET_ERROR(EINVAL); break; } slash = strrchr(strval, '/'); ASSERT(slash != NULL); if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || strcmp(slash, "/..") == 0) error = SET_ERROR(EINVAL); break; case ZPOOL_PROP_COMMENT: if ((error = nvpair_value_string(elem, &strval)) != 0) break; for (check = strval; *check != '\0'; check++) { /* * The kernel doesn't have an easy isprint() * check. For this kernel check, we merely * check ASCII apart from DEL. Fix this if * there is an easy-to-use kernel isprint(). */ if (*check >= 0x7f) { error = SET_ERROR(EINVAL); break; } check++; } if (strlen(strval) > ZPROP_MAX_COMMENT) error = E2BIG; break; case ZPOOL_PROP_DEDUPDITTO: if (spa_version(spa) < SPA_VERSION_DEDUP) error = SET_ERROR(ENOTSUP); else error = nvpair_value_uint64(elem, &intval); if (error == 0 && intval != 0 && intval < ZIO_DEDUPDITTO_MIN) error = SET_ERROR(EINVAL); break; } if (error) break; } if (!error && reset_bootfs) { error = nvlist_remove(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); if (!error) { error = nvlist_add_uint64(props, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); } } return (error); } void spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) { char *cachefile; spa_config_dirent_t *dp; if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), &cachefile) != 0) return; dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP); if (cachefile[0] == '\0') dp->scd_path = spa_strdup(spa_config_path); else if (strcmp(cachefile, "none") == 0) dp->scd_path = NULL; else dp->scd_path = spa_strdup(cachefile); list_insert_head(&spa->spa_config_list, dp); if (need_sync) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); } int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; nvpair_t *elem = NULL; boolean_t need_sync = B_FALSE; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT || prop == ZPOOL_PROP_READONLY) continue; if (prop == ZPOOL_PROP_VERSION || prop == ZPROP_INVAL) { uint64_t ver; if (prop == ZPOOL_PROP_VERSION) { VERIFY(nvpair_value_uint64(elem, &ver) == 0); } else { ASSERT(zpool_prop_feature(nvpair_name(elem))); ver = SPA_VERSION_FEATURES; need_sync = B_TRUE; } /* Save time if the version is already set. */ if (ver == spa_version(spa)) continue; /* * In addition to the pool directory object, we might * create the pool properties object, the features for * read object, the features for write object, or the * feature descriptions object. */ error = dsl_sync_task(spa->spa_name, NULL, spa_sync_version, &ver, 6, ZFS_SPACE_CHECK_RESERVED); if (error) return (error); continue; } need_sync = B_TRUE; break; } if (need_sync) { return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, nvp, 6, ZFS_SPACE_CHECK_RESERVED)); } return (0); } /* * If the bootfs property value is dsobj, clear it. */ void spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) { if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { VERIFY(zap_remove(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); spa->spa_bootfs = 0; } } /*ARGSUSED*/ static int spa_change_guid_check(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; vdev_t *rvd = spa->spa_root_vdev; uint64_t vdev_state; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); vdev_state = rvd->vdev_state; spa_config_exit(spa, SCL_STATE, FTAG); if (vdev_state != VDEV_STATE_HEALTHY) return (SET_ERROR(ENXIO)); ASSERT3U(spa_guid(spa), !=, *newguid); return (0); } static void spa_change_guid_sync(void *arg, dmu_tx_t *tx) { uint64_t *newguid = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; uint64_t oldguid; vdev_t *rvd = spa->spa_root_vdev; oldguid = spa_guid(spa); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); rvd->vdev_guid = *newguid; rvd->vdev_guid_sum += (*newguid - oldguid); vdev_config_dirty(rvd); spa_config_exit(spa, SCL_STATE, FTAG); spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", oldguid, *newguid); } /* * Change the GUID for the pool. This is done so that we can later * re-import a pool built from a clone of our own vdevs. We will modify * the root vdev's guid, our own pool guid, and then mark all of our * vdevs dirty. Note that we must make sure that all our vdevs are * online when we do this, or else any vdevs that weren't present * would be orphaned from our pool. We are also going to issue a * sysevent to update any watchers. */ int spa_change_guid(spa_t *spa) { int error; uint64_t guid; mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); guid = spa_generate_guid(NULL); error = dsl_sync_task(spa->spa_name, spa_change_guid_check, spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); if (error == 0) { spa_config_sync(spa, B_FALSE, B_TRUE); spa_event_notify(spa, NULL, ESC_ZFS_POOL_REGUID); } mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * ========================================================================== * SPA state manipulation (open/create/destroy/import/export) * ========================================================================== */ static int spa_error_entry_compare(const void *a, const void *b) { spa_error_entry_t *sa = (spa_error_entry_t *)a; spa_error_entry_t *sb = (spa_error_entry_t *)b; int ret; ret = bcmp(&sa->se_bookmark, &sb->se_bookmark, sizeof (zbookmark_phys_t)); if (ret < 0) return (-1); else if (ret > 0) return (1); else return (0); } /* * Utility function which retrieves copies of the current logs and * re-initializes them in the process. */ void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) { ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } static void spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; enum zti_modes mode = ztip->zti_mode; uint_t value = ztip->zti_value; uint_t count = ztip->zti_count; spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; char name[32]; uint_t flags = 0; boolean_t batch = B_FALSE; if (mode == ZTI_MODE_NULL) { tqs->stqs_count = 0; tqs->stqs_taskq = NULL; return; } ASSERT3U(count, >, 0); tqs->stqs_count = count; tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); switch (mode) { case ZTI_MODE_FIXED: ASSERT3U(value, >=, 1); value = MAX(value, 1); break; case ZTI_MODE_BATCH: batch = B_TRUE; flags |= TASKQ_THREADS_CPU_PCT; value = zio_taskq_batch_pct; break; default: panic("unrecognized mode for %s_%s taskq (%u:%u) in " "spa_activate()", zio_type_name[t], zio_taskq_types[q], mode, value); break; } for (uint_t i = 0; i < count; i++) { taskq_t *tq; if (count > 1) { (void) snprintf(name, sizeof (name), "%s_%s_%u", zio_type_name[t], zio_taskq_types[q], i); } else { (void) snprintf(name, sizeof (name), "%s_%s", zio_type_name[t], zio_taskq_types[q]); } #ifdef SYSDC if (zio_taskq_sysdc && spa->spa_proc != &p0) { if (batch) flags |= TASKQ_DC_BATCH; tq = taskq_create_sysdc(name, value, 50, INT_MAX, spa->spa_proc, zio_taskq_basedc, flags); } else { #endif pri_t pri = maxclsyspri; /* * The write issue taskq can be extremely CPU * intensive. Run it at slightly lower priority * than the other taskqs. */ if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) pri--; tq = taskq_create_proc(name, value, pri, 50, INT_MAX, spa->spa_proc, flags); #ifdef SYSDC } #endif tqs->stqs_taskq[i] = tq; } } static void spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; if (tqs->stqs_taskq == NULL) { ASSERT0(tqs->stqs_count); return; } for (uint_t i = 0; i < tqs->stqs_count; i++) { ASSERT3P(tqs->stqs_taskq[i], !=, NULL); taskq_destroy(tqs->stqs_taskq[i]); } kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); tqs->stqs_taskq = NULL; } /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. * Note that a type may have multiple discrete taskqs to avoid lock contention * on the taskq itself. In that case we choose which taskq at random by using * the low bits of gethrtime(). */ void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) { spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; taskq_t *tq; ASSERT3P(tqs->stqs_taskq, !=, NULL); ASSERT3U(tqs->stqs_count, !=, 0); if (tqs->stqs_count == 1) { tq = tqs->stqs_taskq[0]; } else { #ifdef _KERNEL tq = tqs->stqs_taskq[cpu_ticks() % tqs->stqs_count]; #else tq = tqs->stqs_taskq[gethrtime() % tqs->stqs_count]; #endif } taskq_dispatch_ent(tq, func, arg, flags, ent); } static void spa_create_zio_taskqs(spa_t *spa) { for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_init(spa, t, q); } } } #ifdef _KERNEL #ifdef SPA_PROCESS static void spa_thread(void *arg) { callb_cpr_t cprinfo; spa_t *spa = arg; user_t *pu = PTOU(curproc); CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, spa->spa_name); ASSERT(curproc != &p0); (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), "zpool-%s", spa->spa_name); (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); #ifdef PSRSET_BIND /* bind this thread to the requested psrset */ if (zio_taskq_psrset_bind != PS_NONE) { pool_lock(); mutex_enter(&cpu_lock); mutex_enter(&pidlock); mutex_enter(&curproc->p_lock); if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, 0, NULL, NULL) == 0) { curthread->t_bind_pset = zio_taskq_psrset_bind; } else { cmn_err(CE_WARN, "Couldn't bind process for zfs pool \"%s\" to " "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); } mutex_exit(&curproc->p_lock); mutex_exit(&pidlock); mutex_exit(&cpu_lock); pool_unlock(); } #endif #ifdef SYSDC if (zio_taskq_sysdc) { sysdc_thread_enter(curthread, 100, 0); } #endif spa->spa_proc = curproc; spa->spa_did = curthread->t_did; spa_create_zio_taskqs(spa); mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); spa->spa_proc_state = SPA_PROC_ACTIVE; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_SAFE_BEGIN(&cprinfo); while (spa->spa_proc_state == SPA_PROC_ACTIVE) cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); spa->spa_proc_state = SPA_PROC_GONE; spa->spa_proc = &p0; cv_broadcast(&spa->spa_proc_cv); CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ mutex_enter(&curproc->p_lock); lwp_exit(); } #endif /* SPA_PROCESS */ #endif /* * Activate an uninitialized pool. */ static void spa_activate(spa_t *spa, int mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; spa->spa_mode = mode; spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); /* Try to create a covering process */ mutex_enter(&spa->spa_proc_lock); ASSERT(spa->spa_proc_state == SPA_PROC_NONE); ASSERT(spa->spa_proc == &p0); spa->spa_did = 0; #ifdef SPA_PROCESS /* Only create a process if we're going to be around a while. */ if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, NULL, 0) == 0) { spa->spa_proc_state = SPA_PROC_CREATED; while (spa->spa_proc_state == SPA_PROC_CREATED) { cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); ASSERT(spa->spa_proc != &p0); ASSERT(spa->spa_did != 0); } else { #ifdef _KERNEL cmn_err(CE_WARN, "Couldn't create process for zfs pool \"%s\"\n", spa->spa_name); #endif } } #endif /* SPA_PROCESS */ mutex_exit(&spa->spa_proc_lock); /* If we didn't create a process, we need to create our taskqs. */ ASSERT(spa->spa_proc == &p0); if (spa->spa_proc == &p0) { spa_create_zio_taskqs(spa); } /* * Start TRIM thread. */ trim_thread_create(spa); list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); + list_create(&spa->spa_evicting_os_list, sizeof (objset_t), + offsetof(objset_t, os_evicting_node)); list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_state_dirty_node)); txg_list_create(&spa->spa_vdev_txg_list, offsetof(struct vdev, vdev_txg_node)); avl_create(&spa->spa_errlist_scrub, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); avl_create(&spa->spa_errlist_last, spa_error_entry_compare, sizeof (spa_error_entry_t), offsetof(spa_error_entry_t, se_avl)); } /* * Opposite of spa_activate(). */ static void spa_deactivate(spa_t *spa) { ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); /* * Stop TRIM thread in case spa_unload() wasn't called directly * before spa_deactivate(). */ trim_thread_destroy(spa); + spa_evicting_os_wait(spa); + txg_list_destroy(&spa->spa_vdev_txg_list); list_destroy(&spa->spa_config_dirty_list); + list_destroy(&spa->spa_evicting_os_list); list_destroy(&spa->spa_state_dirty_list); for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { spa_taskqs_fini(spa, t, q); } } metaslab_class_destroy(spa->spa_normal_class); spa->spa_normal_class = NULL; metaslab_class_destroy(spa->spa_log_class); spa->spa_log_class = NULL; /* * If this was part of an import or the open otherwise failed, we may * still have errors left in the queues. Empty them just in case. */ spa_errlog_drain(spa); avl_destroy(&spa->spa_errlist_scrub); avl_destroy(&spa->spa_errlist_last); spa->spa_state = POOL_STATE_UNINITIALIZED; mutex_enter(&spa->spa_proc_lock); if (spa->spa_proc_state != SPA_PROC_NONE) { ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); spa->spa_proc_state = SPA_PROC_DEACTIVATE; cv_broadcast(&spa->spa_proc_cv); while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { ASSERT(spa->spa_proc != &p0); cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); } ASSERT(spa->spa_proc_state == SPA_PROC_GONE); spa->spa_proc_state = SPA_PROC_NONE; } ASSERT(spa->spa_proc == &p0); mutex_exit(&spa->spa_proc_lock); #ifdef SPA_PROCESS /* * We want to make sure spa_thread() has actually exited the ZFS * module, so that the module can't be unloaded out from underneath * it. */ if (spa->spa_did != 0) { thread_join(spa->spa_did); spa->spa_did = 0; } #endif /* SPA_PROCESS */ } /* * Verify a pool configuration, and construct the vdev tree appropriately. This * will create all the necessary vdevs in the appropriate layout, with each vdev * in the CLOSED state. This will prep the pool before open/creation/import. * All vdev validation is done by the vdev_alloc() routine. */ static int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) return (error); if ((*vdp)->vdev_ops->vdev_op_leaf) return (0); error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children); if (error == ENOENT) return (0); if (error) { vdev_free(*vdp); *vdp = NULL; return (SET_ERROR(EINVAL)); } for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { vdev_free(*vdp); *vdp = NULL; return (error); } } ASSERT(*vdp != NULL); return (0); } /* * Opposite of spa_load(). */ static void spa_unload(spa_t *spa) { int i; ASSERT(MUTEX_HELD(&spa_namespace_lock)); /* * Stop TRIM thread. */ trim_thread_destroy(spa); /* * Stop async tasks. */ spa_async_suspend(spa); /* * Stop syncing. */ if (spa->spa_sync_on) { txg_sync_stop(spa->spa_dsl_pool); spa->spa_sync_on = B_FALSE; } /* * Wait for any outstanding async I/O to complete. */ if (spa->spa_async_zio_root != NULL) { for (int i = 0; i < max_ncpus; i++) (void) zio_wait(spa->spa_async_zio_root[i]); kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); spa->spa_async_zio_root = NULL; } bpobj_close(&spa->spa_deferred_bpobj); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Close all vdevs. */ if (spa->spa_root_vdev) vdev_free(spa->spa_root_vdev); ASSERT(spa->spa_root_vdev == NULL); /* * Close the dsl pool. */ if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; spa->spa_meta_objset = NULL; } ddt_unload(spa); /* * Drop and purge level 2 cache */ spa_l2cache_drop(spa); for (i = 0; i < spa->spa_spares.sav_count; i++) vdev_free(spa->spa_spares.sav_vdevs[i]); if (spa->spa_spares.sav_vdevs) { kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); spa->spa_spares.sav_vdevs = NULL; } if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; } spa->spa_spares.sav_count = 0; for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); vdev_free(spa->spa_l2cache.sav_vdevs[i]); } if (spa->spa_l2cache.sav_vdevs) { kmem_free(spa->spa_l2cache.sav_vdevs, spa->spa_l2cache.sav_count * sizeof (void *)); spa->spa_l2cache.sav_vdevs = NULL; } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; } spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; if (spa->spa_comment != NULL) { spa_strfree(spa->spa_comment); spa->spa_comment = NULL; } spa_config_exit(spa, SCL_ALL, FTAG); } /* * Load (or re-load) the current list of vdevs describing the active spares for * this pool. When this is called, we have some form of basic information in * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. */ static void spa_load_spares(spa_t *spa) { nvlist_t **spares; uint_t nspares; int i; vdev_t *vd, *tvd; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * First, close and free any existing spare vdevs. */ for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; /* Undo the call to spa_activate() below */ if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL && tvd->vdev_isspare) spa_spare_remove(tvd); vdev_close(vd); vdev_free(vd); } if (spa->spa_spares.sav_vdevs) kmem_free(spa->spa_spares.sav_vdevs, spa->spa_spares.sav_count * sizeof (void *)); if (spa->spa_spares.sav_config == NULL) nspares = 0; else VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); spa->spa_spares.sav_count = (int)nspares; spa->spa_spares.sav_vdevs = NULL; if (nspares == 0) return; /* * Construct the array of vdevs, opening them to get status in the * process. For each spare, there is potentially two different vdev_t * structures associated with it: one in the list of spares (used only * for basic validation purposes) and one in the active vdev * configuration (if it's spared in). During this phase we open and * validate each vdev on the spare list. If the vdev also exists in the * active configuration, then we also mark this vdev as an active spare. */ spa->spa_spares.sav_vdevs = kmem_alloc(nspares * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) { VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, VDEV_ALLOC_SPARE) == 0); ASSERT(vd != NULL); spa->spa_spares.sav_vdevs[i] = vd; if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, B_FALSE)) != NULL) { if (!tvd->vdev_isspare) spa_spare_add(tvd); /* * We only mark the spare active if we were successfully * able to load the vdev. Otherwise, importing a pool * with a bad active spare would result in strange * behavior, because multiple pool would think the spare * is actively in use. * * There is a vulnerability here to an equally bizarre * circumstance, where a dead active spare is later * brought back to life (onlined or otherwise). Given * the rarity of this scenario, and the extra complexity * it adds, we ignore the possibility. */ if (!vdev_is_dead(tvd)) spa_spare_activate(tvd); } vd->vdev_top = vd; vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; if (vdev_validate_aux(vd) == 0) spa_spare_add(vd); } /* * Recompute the stashed list of spares, with status information * this time. */ VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < spa->spa_spares.sav_count; i++) spares[i] = vdev_config_generate(spa, spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); for (i = 0; i < spa->spa_spares.sav_count; i++) nvlist_free(spares[i]); kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); } /* * Load (or re-load) the current list of vdevs describing the active l2cache for * this pool. When this is called, we have some form of basic information in * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and * then re-generate a more complete list including status information. * Devices which are already active have their details maintained, and are * not re-opened. */ static void spa_load_l2cache(spa_t *spa) { nvlist_t **l2cache; uint_t nl2cache; int i, j, oldnvdevs; uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if (sav->sav_config != NULL) { VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); } else { nl2cache = 0; newvdevs = NULL; } oldvdevs = sav->sav_vdevs; oldnvdevs = sav->sav_count; sav->sav_vdevs = NULL; sav->sav_count = 0; /* * Process new nvlist of vdevs. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); newvdevs[i] = NULL; for (j = 0; j < oldnvdevs; j++) { vd = oldvdevs[j]; if (vd != NULL && guid == vd->vdev_guid) { /* * Retain previous vdev for add/remove ops. */ newvdevs[i] = vd; oldvdevs[j] = NULL; break; } } if (newvdevs[i] == NULL) { /* * Create new vdev */ VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, VDEV_ALLOC_L2CACHE) == 0); ASSERT(vd != NULL); newvdevs[i] = vd; /* * Commit this vdev as an l2cache device, * even if it fails to open. */ spa_l2cache_add(vd); vd->vdev_top = vd; vd->vdev_aux = sav; spa_l2cache_activate(vd); if (vdev_open(vd) != 0) continue; (void) vdev_validate_aux(vd); if (!vdev_is_dead(vd)) l2arc_add_vdev(spa, vd); } } /* * Purge vdevs that were dropped */ for (i = 0; i < oldnvdevs; i++) { uint64_t pool; vd = oldvdevs[i]; if (vd != NULL) { ASSERT(vd->vdev_isl2cache); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); vdev_clear_stats(vd); vdev_free(vd); } } if (oldvdevs) kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); if (sav->sav_config == NULL) goto out; sav->sav_vdevs = newvdevs; sav->sav_count = (int)nl2cache; /* * Recompute the stashed list of l2cache devices, with status * information this time. */ VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); l2cache = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) l2cache[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(sav->sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); out: for (i = 0; i < sav->sav_count; i++) nvlist_free(l2cache[i]); if (sav->sav_count) kmem_free(l2cache, sav->sav_count * sizeof (void *)); } static int load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) { dmu_buf_t *db; char *packed = NULL; size_t nvsize = 0; int error; *value = NULL; error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); if (error != 0) return (error); nvsize = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); return (error); } /* * Checks to see if the given vdev could not be opened, in which case we post a * sysevent to notify the autoreplace code that the device has been removed. */ static void spa_check_removed(vdev_t *vd) { for (int c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && !vd->vdev_ishole) { zfs_post_autoreplace(vd->vdev_spa, vd); spa_event_notify(vd->vdev_spa, vd, ESC_ZFS_VDEV_CHECK); } } /* * Validate the current config against the MOS config */ static boolean_t spa_config_valid(spa_t *spa, nvlist_t *config) { vdev_t *mrvd, *rvd = spa->spa_root_vdev; nvlist_t *nv; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nv) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); ASSERT3U(rvd->vdev_children, ==, mrvd->vdev_children); /* * If we're doing a normal import, then build up any additional * diagnostic information about missing devices in this config. * We'll pass this up to the user for further processing. */ if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { nvlist_t **child, *nv; uint64_t idx = 0; child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t **), KM_SLEEP); VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; vdev_t *mtvd = mrvd->vdev_child[c]; if (tvd->vdev_ops == &vdev_missing_ops && mtvd->vdev_ops != &vdev_missing_ops && mtvd->vdev_islog) child[idx++] = vdev_config_generate(spa, mtvd, B_FALSE, 0); } if (idx) { VERIFY(nvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, child, idx) == 0); VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_MISSING_DEVICES, nv) == 0); for (int i = 0; i < idx; i++) nvlist_free(child[i]); } nvlist_free(nv); kmem_free(child, rvd->vdev_children * sizeof (char **)); } /* * Compare the root vdev tree with the information we have * from the MOS config (mrvd). Check each top-level vdev * with the corresponding MOS config top-level (mtvd). */ for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; vdev_t *mtvd = mrvd->vdev_child[c]; /* * Resolve any "missing" vdevs in the current configuration. * If we find that the MOS config has more accurate information * about the top-level vdev then use that vdev instead. */ if (tvd->vdev_ops == &vdev_missing_ops && mtvd->vdev_ops != &vdev_missing_ops) { if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) continue; /* * Device specific actions. */ if (mtvd->vdev_islog) { spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * XXX - once we have 'readonly' pool * support we should be able to handle * missing data devices by transitioning * the pool to readonly. */ continue; } /* * Swap the missing vdev with the data we were * able to obtain from the MOS config. */ vdev_remove_child(rvd, tvd); vdev_remove_child(mrvd, mtvd); vdev_add_child(rvd, mtvd); vdev_add_child(mrvd, tvd); spa_config_exit(spa, SCL_ALL, FTAG); vdev_load(mtvd); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_reopen(rvd); } else if (mtvd->vdev_islog) { /* * Load the slog device's state from the MOS config * since it's possible that the label does not * contain the most up-to-date information. */ vdev_load_log_state(tvd, mtvd); vdev_reopen(tvd); } } vdev_free(mrvd); spa_config_exit(spa, SCL_ALL, FTAG); /* * Ensure we were able to validate the config. */ return (rvd->vdev_guid_sum == spa->spa_uberblock.ub_guid_sum); } /* * Check for missing log devices */ static boolean_t spa_check_logs(spa_t *spa) { boolean_t rv = B_FALSE; switch (spa->spa_log_state) { case SPA_LOG_MISSING: /* need to recheck in case slog has been restored */ case SPA_LOG_UNKNOWN: rv = (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); if (rv) spa_set_log_state(spa, SPA_LOG_MISSING); break; } return (rv); } static boolean_t spa_passivate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; boolean_t slog_found = B_FALSE; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); if (!spa_has_slogs(spa)) return (B_FALSE); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) { metaslab_group_passivate(mg); slog_found = B_TRUE; } } return (slog_found); } static void spa_activate_log(spa_t *spa) { vdev_t *rvd = spa->spa_root_vdev; ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); for (int c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; metaslab_group_t *mg = tvd->vdev_mg; if (tvd->vdev_islog) metaslab_group_activate(mg); } } int spa_offline_log(spa_t *spa) { int error; error = dmu_objset_find(spa_name(spa), zil_vdev_offline, NULL, DS_FIND_CHILDREN); if (error == 0) { /* * We successfully offlined the log device, sync out the * current txg so that the "stubby" block can be removed * by zil_sync(). */ txg_wait_synced(spa->spa_dsl_pool, 0); } return (error); } static void spa_aux_check_removed(spa_aux_vdev_t *sav) { int i; for (i = 0; i < sav->sav_count; i++) spa_check_removed(sav->sav_vdevs[i]); } void spa_claim_notify(zio_t *zio) { spa_t *spa = zio->io_spa; if (zio->io_error) return; mutex_enter(&spa->spa_props_lock); /* any mutex will do */ if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) spa->spa_claim_max_txg = zio->io_bp->blk_birth; mutex_exit(&spa->spa_props_lock); } typedef struct spa_load_error { uint64_t sle_meta_count; uint64_t sle_data_count; } spa_load_error_t; static void spa_load_verify_done(zio_t *zio) { blkptr_t *bp = zio->io_bp; spa_load_error_t *sle = zio->io_private; dmu_object_type_t type = BP_GET_TYPE(bp); int error = zio->io_error; spa_t *spa = zio->io_spa; if (error) { if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && type != DMU_OT_INTENT_LOG) atomic_inc_64(&sle->sle_meta_count); else atomic_inc_64(&sle->sle_data_count); } zio_data_buf_free(zio->io_data, zio->io_size); mutex_enter(&spa->spa_scrub_lock); spa->spa_scrub_inflight--; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } /* * Maximum number of concurrent scrub i/os to create while verifying * a pool while importing it. */ int spa_load_verify_maxinflight = 10000; boolean_t spa_load_verify_metadata = B_TRUE; boolean_t spa_load_verify_data = B_TRUE; SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_maxinflight, CTLFLAG_RWTUN, &spa_load_verify_maxinflight, 0, "Maximum number of concurrent scrub I/Os to create while verifying a " "pool while importing it"); SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_metadata, CTLFLAG_RWTUN, &spa_load_verify_metadata, 0, "Check metadata on import?"); SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_load_verify_data, CTLFLAG_RWTUN, &spa_load_verify_data, 0, "Check user data on import?"); /*ARGSUSED*/ static int spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) return (0); /* * Note: normally this routine will not be called if * spa_load_verify_metadata is not set. However, it may be useful * to manually set the flag after the traversal has begun. */ if (!spa_load_verify_metadata) return (0); if (BP_GET_BUFC_TYPE(bp) == ARC_BUFC_DATA && !spa_load_verify_data) return (0); zio_t *rio = arg; size_t size = BP_GET_PSIZE(bp); void *data = zio_data_buf_alloc(size); mutex_enter(&spa->spa_scrub_lock); while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); spa->spa_scrub_inflight++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, data, size, spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); return (0); } static int spa_load_verify(spa_t *spa) { zio_t *rio; spa_load_error_t sle = { 0 }; zpool_rewind_policy_t policy; boolean_t verify_ok = B_FALSE; int error = 0; zpool_get_rewind_policy(spa->spa_config, &policy); if (policy.zrp_request & ZPOOL_NEVER_REWIND) return (0); rio = zio_root(spa, NULL, &sle, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); if (spa_load_verify_metadata) { error = traverse_pool(spa, spa->spa_verify_min_txg, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, spa_load_verify_cb, rio); } (void) zio_wait(rio); spa->spa_load_meta_errors = sle.sle_meta_count; spa->spa_load_data_errors = sle.sle_data_count; if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && sle.sle_data_count <= policy.zrp_maxdata) { int64_t loss = 0; verify_ok = B_TRUE; spa->spa_load_txg = spa->spa_uberblock.ub_txg; spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); VERIFY(nvlist_add_int64(spa->spa_load_info, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); VERIFY(nvlist_add_uint64(spa->spa_load_info, ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); } else { spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; } if (error) { if (error != ENXIO && error != EIO) error = SET_ERROR(EIO); return (error); } return (verify_ok ? 0 : EIO); } /* * Find a value in the pool props object. */ static void spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) { (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); } /* * Find a value in the pool directory object. */ static int spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) { return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, name, sizeof (uint64_t), 1, val)); } static int spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) { vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); return (err); } /* * Fix up config after a partly-completed split. This is done with the * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off * pool have that entry in their config, but only the splitting one contains * a list of all the guids of the vdevs that are being split off. * * This function determines what to do with that list: either rejoin * all the disks to the pool, or complete the splitting process. To attempt * the rejoin, each disk that is offlined is marked online again, and * we do a reopen() call. If the vdev label for every disk that was * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) * then we call vdev_split() on each disk, and complete the split. * * Otherwise we leave the config alone, with all the vdevs in place in * the original pool. */ static void spa_try_repair(spa_t *spa, nvlist_t *config) { uint_t extracted; uint64_t *glist; uint_t i, gcount; nvlist_t *nvl; vdev_t **vd; boolean_t attempt_reopen; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) return; /* check that the config is complete */ if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, &glist, &gcount) != 0) return; vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); /* attempt to online all the vdevs & validate */ attempt_reopen = B_TRUE; for (i = 0; i < gcount; i++) { if (glist[i] == 0) /* vdev is hole */ continue; vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); if (vd[i] == NULL) { /* * Don't bother attempting to reopen the disks; * just do the split. */ attempt_reopen = B_FALSE; } else { /* attempt to re-online it */ vd[i]->vdev_offline = B_FALSE; } } if (attempt_reopen) { vdev_reopen(spa->spa_root_vdev); /* check each device to see what state it's in */ for (extracted = 0, i = 0; i < gcount; i++) { if (vd[i] != NULL && vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) break; ++extracted; } } /* * If every disk has been moved to the new pool, or if we never * even attempted to look at them, then we split them off for * good. */ if (!attempt_reopen || gcount == extracted) { for (i = 0; i < gcount; i++) if (vd[i] != NULL) vdev_split(vd[i]); vdev_reopen(spa->spa_root_vdev); } kmem_free(vd, gcount * sizeof (vdev_t *)); } static int spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig) { nvlist_t *config = spa->spa_config; char *ereport = FM_EREPORT_ZFS_POOL; char *comment; int error; uint64_t pool_guid; nvlist_t *nvl; if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) return (SET_ERROR(EINVAL)); ASSERT(spa->spa_comment == NULL); if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) spa->spa_comment = spa_strdup(comment); /* * Versioning wasn't explicitly added to the label until later, so if * it's not present treat it as the initial version. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &spa->spa_config_txg); if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && spa_guid_exists(pool_guid, 0)) { error = SET_ERROR(EEXIST); } else { spa->spa_config_guid = pool_guid; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) { VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, KM_SLEEP) == 0); } nvlist_free(spa->spa_load_info); spa->spa_load_info = fnvlist_alloc(); gethrestime(&spa->spa_loaded_ts); error = spa_load_impl(spa, pool_guid, config, state, type, mosconfig, &ereport); } + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); if (error) { if (error != EEXIST) { spa->spa_loaded_ts.tv_sec = 0; spa->spa_loaded_ts.tv_nsec = 0; } if (error != EBADF) { zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); } } spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; spa->spa_ena = 0; return (error); } /* * Load an existing storage pool, using the pool's builtin spa_config as a * source of configuration information. */ static int spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, char **ereport) { int error = 0; nvlist_t *nvroot = NULL; nvlist_t *label; vdev_t *rvd; uberblock_t *ub = &spa->spa_uberblock; uint64_t children, config_cache_txg = spa->spa_config_txg; int orig_mode = spa->spa_mode; int parse; uint64_t obj; boolean_t missing_feat_write = B_FALSE; /* * If this is an untrusted config, access the pool in read-only mode. * This prevents things like resilvering recently removed devices. */ if (!mosconfig) spa->spa_mode = FREAD; ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa->spa_load_state = state; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) return (SET_ERROR(EINVAL)); parse = (type == SPA_IMPORT_EXISTING ? VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Parse the configuration into a vdev tree. We explicitly set the * value that will be returned by spa_version() since parsing the * configuration requires knowing the version number. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) return (error); ASSERT(spa->spa_root_vdev == rvd); ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); if (type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_guid(spa) == pool_guid); } /* * Try to open all vdevs, loading each label in the process. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_open(rvd); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) return (error); /* * We need to validate the vdev labels against the configuration that * we have in hand, which is dependent on the setting of mosconfig. If * mosconfig is true then we're validating the vdev labels based on * that config. Otherwise, we're validating against the cached config * (zpool.cache) that was read when we loaded the zfs module, and then * later we will recursively call spa_load() and validate against * the vdev config. * * If we're assembling a new pool that's been split off from an * existing pool, the labels haven't yet been updated so we skip * validation for now. */ if (type != SPA_IMPORT_ASSEMBLE) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = vdev_validate(rvd, mosconfig); spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) return (error); if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) return (SET_ERROR(ENXIO)); } /* * Find the best uberblock. */ vdev_uberblock_load(rvd, ub, &label); /* * If we weren't able to find a single valid uberblock, return failure. */ if (ub->ub_txg == 0) { nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * If the pool has an unsupported version we can't open it. */ if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); } if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *features; /* * If we weren't able to find what's necessary for reading the * MOS in the label, return failure. */ if (label == NULL || nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) != 0) { nvlist_free(label); return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); } /* * Update our in-core representation with the definitive values * from the label. */ nvlist_free(spa->spa_label_features); VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } nvlist_free(label); /* * Look through entries in the label nvlist's features_for_read. If * there is a feature listed there which we don't understand then we * cannot open a pool. */ if (ub->ub_version >= SPA_VERSION_FEATURES) { nvlist_t *unsup_feat; VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, NULL); nvp != NULL; nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { if (!zfeature_is_supported(nvpair_name(nvp))) { VERIFY(nvlist_add_string(unsup_feat, nvpair_name(nvp), "") == 0); } } if (!nvlist_empty(unsup_feat)) { VERIFY(nvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); nvlist_free(unsup_feat); return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } nvlist_free(unsup_feat); } /* * If the vdev guid sum doesn't match the uberblock, we have an * incomplete configuration. We first check to see if the pool * is aware of the complete config (i.e ZPOOL_CONFIG_VDEV_CHILDREN). * If it is, defer the vdev_guid_sum check till later so we * can handle missing vdevs. */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, &children) != 0 && mosconfig && type != SPA_IMPORT_ASSEMBLE && rvd->vdev_guid_sum != ub->ub_guid_sum) return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_try_repair(spa, config); spa_config_exit(spa, SCL_ALL, FTAG); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; } /* * Initialize internal SPA structures. */ spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; spa->spa_verify_min_txg = spa->spa_extreme_rewind ? TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; spa->spa_first_txg = spa->spa_last_ubsync_txg ? spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; spa->spa_claim_max_txg = spa->spa_first_txg; spa->spa_prev_software_version = ub->ub_software_version; error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); if (error) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (spa_version(spa) >= SPA_VERSION_FEATURES) { boolean_t missing_feat_read = B_FALSE; nvlist_t *unsup_feat, *enabled_feat; if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ, &spa->spa_feat_for_read_obj) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE, &spa->spa_feat_for_write_obj) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS, &spa->spa_feat_desc_obj) != 0) { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } enabled_feat = fnvlist_alloc(); unsup_feat = fnvlist_alloc(); if (!spa_features_check(spa, B_FALSE, unsup_feat, enabled_feat)) missing_feat_read = B_TRUE; if (spa_writeable(spa) || state == SPA_LOAD_TRYIMPORT) { if (!spa_features_check(spa, B_TRUE, unsup_feat, enabled_feat)) { missing_feat_write = B_TRUE; } } fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat); if (!nvlist_empty(unsup_feat)) { fnvlist_add_nvlist(spa->spa_load_info, ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat); } fnvlist_free(enabled_feat); fnvlist_free(unsup_feat); if (!missing_feat_read) { fnvlist_add_boolean(spa->spa_load_info, ZPOOL_CONFIG_CAN_RDONLY); } /* * If the state is SPA_LOAD_TRYIMPORT, our objective is * twofold: to determine whether the pool is available for * import in read-write mode and (if it is not) whether the * pool is available for import in read-only mode. If the pool * is available for import in read-write mode, it is displayed * as available in userland; if it is not available for import * in read-only mode, it is displayed as unavailable in * userland. If the pool is available for import in read-only * mode but not read-write mode, it is displayed as unavailable * in userland with a special note that the pool is actually * available for open in read-only mode. * * As a result, if the state is SPA_LOAD_TRYIMPORT and we are * missing a feature for write, we must first determine whether * the pool can be opened read-only before returning to * userland in order to know whether to display the * abovementioned note. */ if (missing_feat_read || (missing_feat_write && spa_writeable(spa))) { return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * Load refcounts for ZFS features from disk into an in-memory * cache during SPA initialization. */ for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { uint64_t refcount; error = feature_get_refcount_from_disk(spa, &spa_feature_table[i], &refcount); if (error == 0) { spa->spa_feat_refcount_cache[i] = refcount; } else if (error == ENOTSUP) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } else { return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } } } if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, &spa->spa_feat_enabled_txg_obj) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); } spa->spa_is_initializing = B_TRUE; error = dsl_pool_open(spa->spa_dsl_pool); spa->spa_is_initializing = B_FALSE; if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!mosconfig) { uint64_t hostid; nvlist_t *policy = NULL, *nvconfig; if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; VERIFY(nvlist_lookup_string(nvconfig, ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); #ifdef _KERNEL myhostid = zone_get_hostid(NULL); #else /* _KERNEL */ /* * We're emulating the system's hostid in userland, so * we can't use zone_get_hostid(). */ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); #endif /* _KERNEL */ if (check_hostid && hostid != 0 && myhostid != 0 && hostid != myhostid) { nvlist_free(nvconfig); cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%lx). " "See: http://illumos.org/msg/ZFS-8000-EY", spa_name(spa), hostname, (unsigned long)hostid); return (SET_ERROR(EBADF)); } } if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_REWIND_POLICY, &policy) == 0) VERIFY(nvlist_add_nvlist(nvconfig, ZPOOL_REWIND_POLICY, policy) == 0); spa_config_set(spa, nvconfig); spa_unload(spa); spa_deactivate(spa); spa_activate(spa, orig_mode); return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); } if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, &spa->spa_creation_version); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, &spa->spa_errlog_scrub); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. */ error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * If we're assembling the pool from the split-off vdevs of * an existing pool, we don't want to attach the spares & cache * devices. */ /* * Load any hot spares for this pool. */ error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, &spa->spa_spares.sav_config) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object); if (error != 0 && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, &spa->spa_l2cache.sav_config) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); } else if (error == 0) { spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); if (error && error != ENOENT) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { uint64_t autoreplace; spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, &spa->spa_dedup_ditto); spa->spa_autoreplace = (autoreplace != 0); } /* * If the 'autoreplace' property is set, then post a resource notifying * the ZFS DE that it should not issue any faults for unopenable * devices. We also iterate over the vdevs, and post a sysevent for any * unopenable vdevs so that the normal autoreplace handler can take * over. */ if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); /* * For the import case, this is done in spa_import(), because * at this point we're using the spare definitions from * the MOS config, not necessarily from the userland config. */ if (state != SPA_LOAD_IMPORT) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } } /* * Load the vdev state for all toplevel vdevs. */ vdev_load(rvd); /* * Propagate the leaf DTLs we just loaded all the way up the tree. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_dtl_reassess(rvd, 0, 0, B_FALSE); spa_config_exit(spa, SCL_ALL, FTAG); /* * Load the DDTs (dedup tables). */ error = ddt_load(spa); if (error != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_update_dspace(spa); /* * Validate the config, using the MOS config to fill in any * information which might be missing. If we fail to validate * the config then declare the pool unfit for use. If we're * assembling a pool from a split, the log is not transferred * over. */ if (type != SPA_IMPORT_ASSEMBLE) { nvlist_t *nvconfig; if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!spa_config_valid(spa, nvconfig)) { nvlist_free(nvconfig); return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); } nvlist_free(nvconfig); /* * Now that we've validated the config, check the state of the * root vdev. If it can't be opened, it indicates one or * more toplevel vdevs are faulted. */ if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) return (SET_ERROR(ENXIO)); if (spa_check_logs(spa)) { *ereport = FM_EREPORT_ZFS_LOG_REPLAY; return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); } } if (missing_feat_write) { ASSERT(state == SPA_LOAD_TRYIMPORT); /* * At this point, we know that we can open the pool in * read-only mode but not read-write mode. We now have enough * information and can return to userland. */ return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, ENOTSUP)); } /* * We've successfully opened the pool, verify that we're ready * to start pushing transactions. */ if (state != SPA_LOAD_TRYIMPORT) { if (error = spa_load_verify(spa)) return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); } if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || spa->spa_load_max_txg == UINT64_MAX)) { dmu_tx_t *tx; int need_update = B_FALSE; ASSERT(state != SPA_LOAD_TRYIMPORT); /* * Claim log blocks that haven't been committed yet. * This must all happen in a single txg. * Note: spa_claim_max_txg is updated by spa_claim_notify(), * invoked from zil_claim_log_block()'s i/o done callback. * Price of rollback is that we abandon the log. */ spa->spa_claiming = B_TRUE; tx = dmu_tx_create_assigned(spa_get_dsl(spa), spa_first_txg(spa)); (void) dmu_objset_find(spa_name(spa), zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); spa->spa_claiming = B_FALSE; spa_set_log_state(spa, SPA_LOG_GOOD); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* * Wait for all claims to sync. We sync up to the highest * claimed log block birth time so that claimed log blocks * don't appear to be from the future. spa_claim_max_txg * will have been set for us by either zil_check_log_chain() * (invoked from spa_check_logs()) or zil_claim() above. */ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. * * If this is a verbatim import, trust the current * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || state == SPA_LOAD_IMPORT || state == SPA_LOAD_RECOVER || (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) need_update = B_TRUE; for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; /* * Update the config cache asychronously in case we're the * root pool, in which case the config cache isn't writable yet. */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); /* * Check all DTLs to see if anything needs resilvering. */ if (!dsl_scan_resilvering(spa->spa_dsl_pool) && vdev_resilver_needed(rvd, NULL, NULL)) spa_async_request(spa, SPA_ASYNC_RESILVER); /* * Log the fact that we booted up (so that we can detect if * we rebooted in the middle of an operation). */ spa_history_log_version(spa, "open"); /* * Delete any inconsistent datasets. */ (void) dmu_objset_find(spa_name(spa), dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); /* * Clean up any stale temporary dataset userrefs. */ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); } return (0); } static int spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) { int mode = spa->spa_mode; spa_unload(spa); spa_deactivate(spa); spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1; spa_activate(spa, mode); spa_async_suspend(spa); return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); } /* * If spa_load() fails this function will try loading prior txg's. If * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this * function will not rewind the pool and will return the same error as * spa_load(). */ static int spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, uint64_t max_request, int rewind_flags) { nvlist_t *loadinfo = NULL; nvlist_t *config = NULL; int load_error, rewind_error; uint64_t safe_rewind_txg; uint64_t min_txg; if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { spa->spa_load_max_txg = spa->spa_load_txg; spa_set_log_state(spa, SPA_LOG_CLEAR); } else { spa->spa_load_max_txg = max_request; if (max_request != UINT64_MAX) spa->spa_extreme_rewind = B_TRUE; } load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig); if (load_error == 0) return (0); if (spa->spa_root_vdev != NULL) config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; if (rewind_flags & ZPOOL_NEVER_REWIND) { nvlist_free(config); return (load_error); } if (state == SPA_LOAD_RECOVER) { /* Price of rolling back is discarding txgs, including log */ spa_set_log_state(spa, SPA_LOG_CLEAR); } else { /* * If we aren't rolling back save the load info from our first * import attempt so that we can restore it after attempting * to rewind. */ loadinfo = spa->spa_load_info; spa->spa_load_info = fnvlist_alloc(); } spa->spa_load_max_txg = spa->spa_last_ubsync_txg; safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? TXG_INITIAL : safe_rewind_txg; /* * Continue as long as we're finding errors, we're still within * the acceptable rewind range, and we're still finding uberblocks */ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { if (spa->spa_load_max_txg < safe_rewind_txg) spa->spa_extreme_rewind = B_TRUE; rewind_error = spa_load_retry(spa, state, mosconfig); } spa->spa_extreme_rewind = B_FALSE; spa->spa_load_max_txg = UINT64_MAX; if (config && (rewind_error || state != SPA_LOAD_RECOVER)) spa_config_set(spa, config); if (state == SPA_LOAD_RECOVER) { ASSERT3P(loadinfo, ==, NULL); return (rewind_error); } else { /* Store the rewind info as part of the initial load info */ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO, spa->spa_load_info); /* Restore the initial load info */ fnvlist_free(spa->spa_load_info); spa->spa_load_info = loadinfo; return (load_error); } } /* * Pool Open/Import * * The import case is identical to an open except that the configuration is sent * down from userland, instead of grabbed from the configuration cache. For the * case of an open, the pool configuration will exist in the * POOL_STATE_UNINITIALIZED state. * * The stats information (gen/count/ustats) is used to gather vdev statistics at * the same time open the pool, without having to keep around the spa_t in some * ambiguous state. */ static int spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, nvlist_t **config) { spa_t *spa; spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; int firstopen = B_FALSE; *spapp = NULL; /* * As disgusting as this is, we need to support recursive calls to this * function because dsl_dir_open() is called during spa_load(), and ends * up calling spa_open() again. The real fix is to figure out how to * avoid dsl_dir_open() calling this in the first place. */ if (mutex_owner(&spa_namespace_lock) != curthread) { mutex_enter(&spa_namespace_lock); locked = B_TRUE; } if ((spa = spa_lookup(pool)) == NULL) { if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (spa->spa_state == POOL_STATE_UNINITIALIZED) { zpool_rewind_policy_t policy; firstopen = B_TRUE; zpool_get_rewind_policy(nvpolicy ? nvpolicy : spa->spa_config, &policy); if (policy.zrp_request & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; spa_activate(spa, spa_mode_global); if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, policy.zrp_request); if (error == EBADF) { /* * If vdev_validate() returns failure (indicated by * EBADF), it indicates that one of the vdevs indicates * that the pool has been exported or destroyed. If * this is the case, the config cache is out of sync and * we should remove the pool from the namespace. */ spa_unload(spa); spa_deactivate(spa); spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); if (locked) mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } if (error) { /* * We can't open the pool, but we still have useful * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ if (config != NULL && spa->spa_config) { VERIFY(nvlist_dup(spa->spa_config, config, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } spa_unload(spa); spa_deactivate(spa); spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); } } spa_open_ref(spa, tag); if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); /* * If we've recovered the pool, pass back any information we * gathered while doing the load. */ if (state == SPA_LOAD_RECOVER) { VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); } if (locked) { spa->spa_last_open_failed = 0; spa->spa_last_ubsync_txg = 0; spa->spa_load_txg = 0; mutex_exit(&spa_namespace_lock); #ifdef __FreeBSD__ #ifdef _KERNEL if (firstopen) zvol_create_minors(spa->spa_name); #endif #endif } *spapp = spa; return (0); } int spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, nvlist_t **config) { return (spa_open_common(name, spapp, tag, policy, config)); } int spa_open(const char *name, spa_t **spapp, void *tag) { return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* * Lookup the given spa_t, incrementing the inject count in the process, * preventing it from being exported or destroyed. */ spa_t * spa_inject_addref(char *name) { spa_t *spa; mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(name)) == NULL) { mutex_exit(&spa_namespace_lock); return (NULL); } spa->spa_inject_ref++; mutex_exit(&spa_namespace_lock); return (spa); } void spa_inject_delref(spa_t *spa) { mutex_enter(&spa_namespace_lock); spa->spa_inject_ref--; mutex_exit(&spa_namespace_lock); } /* * Add spares device information to the nvlist. */ static void spa_add_spares(spa_t *spa, nvlist_t *config) { nvlist_t **spares; uint_t i, nspares; nvlist_t *nvroot; uint64_t guid; vdev_stat_t *vs; uint_t vsc; uint64_t pool; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_spares.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); if (nspares != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); /* * Go through and find any spares which have since been * repurposed as an active spare. If this is the case, update * their status appropriately. */ for (i = 0; i < nspares; i++) { VERIFY(nvlist_lookup_uint64(spares[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (spa_spare_exists(guid, &pool, NULL) && pool != 0ULL) { VERIFY(nvlist_lookup_uint64_array( spares[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vs->vs_state = VDEV_STATE_CANT_OPEN; vs->vs_aux = VDEV_AUX_SPARED; } } } } /* * Add l2cache device information to the nvlist, including vdev stats. */ static void spa_add_l2cache(spa_t *spa, nvlist_t *config) { nvlist_t **l2cache; uint_t i, j, nl2cache; nvlist_t *nvroot; uint64_t guid; vdev_t *vd; vdev_stat_t *vs; uint_t vsc; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); if (spa->spa_l2cache.sav_count == 0) return; VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); if (nl2cache != 0) { VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); VERIFY(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); /* * Update level 2 cache device stats. */ for (i = 0; i < nl2cache; i++) { VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, &guid) == 0); vd = NULL; for (j = 0; j < spa->spa_l2cache.sav_count; j++) { if (guid == spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { vd = spa->spa_l2cache.sav_vdevs[j]; break; } } ASSERT(vd != NULL); VERIFY(nvlist_lookup_uint64_array(l2cache[i], ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) == 0); vdev_get_stats(vd, vs); } } } static void spa_add_feature_stats(spa_t *spa, nvlist_t *config) { nvlist_t *features; zap_cursor_t zc; zap_attribute_t za; ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); VERIFY(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP) == 0); /* We may be unable to read features if pool is suspended. */ if (spa_suspended(spa)) goto out; if (spa->spa_feat_for_read_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_read_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } if (spa->spa_feat_for_write_obj != 0) { for (zap_cursor_init(&zc, spa->spa_meta_objset, spa->spa_feat_for_write_obj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { ASSERT(za.za_integer_length == sizeof (uint64_t) && za.za_num_integers == 1); VERIFY3U(0, ==, nvlist_add_uint64(features, za.za_name, za.za_first_integer)); } zap_cursor_fini(&zc); } out: VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, features) == 0); nvlist_free(features); } int spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) { int error; spa_t *spa; *config = NULL; error = spa_open_common(name, &spa, FTAG, NULL, config); if (spa != NULL) { /* * This still leaves a window of inconsistency where the spares * or l2cache devices could change and the config would be * self-inconsistent. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); if (*config != NULL) { uint64_t loadtimes[2]; loadtimes[0] = spa->spa_loaded_ts.tv_sec; loadtimes[1] = spa->spa_loaded_ts.tv_nsec; VERIFY(nvlist_add_uint64_array(*config, ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0); VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, spa_get_errlog_size(spa)) == 0); if (spa_suspended(spa)) VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); spa_add_spares(spa, *config); spa_add_l2cache(spa, *config); spa_add_feature_stats(spa, *config); } } /* * We want to get the alternate root even for faulted pools, so we cheat * and call spa_lookup() directly. */ if (altroot) { if (spa == NULL) { mutex_enter(&spa_namespace_lock); spa = spa_lookup(name); if (spa) spa_altroot(spa, altroot, buflen); else altroot[0] = '\0'; spa = NULL; mutex_exit(&spa_namespace_lock); } else { spa_altroot(spa, altroot, buflen); } } if (spa != NULL) { spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); } return (error); } /* * Validate that the auxiliary device array is well formed. We must have an * array of nvlists, each which describes a valid leaf vdev. If this is an * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be * specified, as long as they are well-formed. */ static int spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode, spa_aux_vdev_t *sav, const char *config, uint64_t version, vdev_labeltype_t label) { nvlist_t **dev; uint_t i, ndev; vdev_t *vd; int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); /* * It's acceptable to have no devs specified. */ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0) return (0); if (ndev == 0) return (SET_ERROR(EINVAL)); /* * Make sure the pool is formatted with a version that supports this * device type. */ if (spa_version(spa) < version) return (SET_ERROR(ENOTSUP)); /* * Set the pending device list so we correctly handle device in-use * checking. */ sav->sav_pending = dev; sav->sav_npending = ndev; for (i = 0; i < ndev; i++) { if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, mode)) != 0) goto out; if (!vd->vdev_ops->vdev_op_leaf) { vdev_free(vd); error = SET_ERROR(EINVAL); goto out; } /* * The L2ARC currently only supports disk devices in * kernel context. For user-level testing, we allow it. */ #ifdef _KERNEL if ((strcmp(config, ZPOOL_CONFIG_L2CACHE) == 0) && strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_DISK) != 0) { error = SET_ERROR(ENOTBLK); vdev_free(vd); goto out; } #endif vd->vdev_top = vd; if ((error = vdev_open(vd)) == 0 && (error = vdev_label_init(vd, crtxg, label)) == 0) { VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } vdev_free(vd); if (error && (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) goto out; else error = 0; } out: sav->sav_pending = NULL; sav->sav_npending = 0; return (error); } static int spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) { int error; ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, VDEV_LABEL_SPARE)) != 0) { return (error); } return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, VDEV_LABEL_L2CACHE)); } static void spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, const char *config) { int i; if (sav->sav_config != NULL) { nvlist_t **olddevs; uint_t oldndevs; nvlist_t **newdevs; /* * Generate new dev list by concatentating with the * current dev list. */ VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, &olddevs, &oldndevs) == 0); newdevs = kmem_alloc(sizeof (void *) * (ndevs + oldndevs), KM_SLEEP); for (i = 0; i < oldndevs; i++) VERIFY(nvlist_dup(olddevs[i], &newdevs[i], KM_SLEEP) == 0); for (i = 0; i < ndevs; i++) VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], KM_SLEEP) == 0); VERIFY(nvlist_remove(sav->sav_config, config, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, newdevs, ndevs + oldndevs) == 0); for (i = 0; i < oldndevs + ndevs; i++) nvlist_free(newdevs[i]); kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); } else { /* * Generate a new dev list. */ VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, devs, ndevs) == 0); } } /* * Stop and drop level 2 ARC devices */ void spa_l2cache_drop(spa_t *spa) { vdev_t *vd; int i; spa_aux_vdev_t *sav = &spa->spa_l2cache; for (i = 0; i < sav->sav_count; i++) { uint64_t pool; vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); if (spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); } } /* * Pool Creation */ int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, nvlist_t *zplprops) { spa_t *spa; char *altroot = NULL; vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; uint64_t version, obj; boolean_t has_features; /* * If this pool already exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Allocate a new spa_t structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); spa = spa_add(pool, NULL, altroot); spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } has_features = B_FALSE; for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); elem != NULL; elem = nvlist_next_nvpair(props, elem)) { if (zpool_prop_feature(nvpair_name(elem))) has_features = B_TRUE; } if (has_features || nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { version = SPA_VERSION; } ASSERT(SPA_VERSION_IS_SUPPORTED(version)); spa->spa_first_txg = txg; spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; /* * Create "The Godfather" zio to hold all async IOs */ spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), KM_SLEEP); for (int i = 0; i < max_ncpus; i++) { spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); } /* * Create the root vdev. */ spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); ASSERT(error != 0 || rvd != NULL); ASSERT(error != 0 || spa->spa_root_vdev == rvd); if (error == 0 && !zfs_allocatable_devs(nvroot)) error = SET_ERROR(EINVAL); if (error == 0 && (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (int c = 0; c < rvd->vdev_children; c++) { vdev_ashift_optimize(rvd->vdev_child[c]); vdev_metaslab_set_size(rvd->vdev_child[c]); vdev_expand(rvd->vdev_child[c], txg); } } spa_config_exit(spa, SCL_ALL, FTAG); if (error != 0) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } /* * Get the list of spares, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } /* * Get the list of level 2 cache devices, if specified. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } spa->spa_is_initializing = B_TRUE; spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; spa->spa_is_initializing = B_FALSE; /* * Create DDTs (dedup tables). */ ddt_create(spa); spa_update_dspace(spa); tx = dmu_tx_create_assigned(dp, txg); /* * Create the pool config object. */ spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool config"); } if (spa_version(spa) >= SPA_VERSION_FEATURES) spa_feature_create_zap_objects(spa, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, sizeof (uint64_t), 1, &version, tx) != 0) { cmn_err(CE_PANIC, "failed to add pool version"); } /* Newly created pools with the right version are always deflated. */ if (version >= SPA_VERSION_RAIDZ_DEFLATE) { spa->spa_deflate = TRUE; if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { cmn_err(CE_PANIC, "failed to add deflate"); } } /* * Create the deferred-free bpobj. Turn off compression * because sync-to-convergence takes longer if the blocksize * keeps changing. */ obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); dmu_object_set_compress(spa->spa_meta_objset, obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, sizeof (uint64_t), 1, &obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bpobj"); } VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj)); /* * Create the pool's history object. */ if (version >= SPA_VERSION_ZPOOL_HISTORY) spa_history_create_obj(spa, tx); /* * Set pool properties. */ spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); if (props != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_sync_props(props, tx); } dmu_tx_commit(tx); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* * We explicitly wait for the first transaction to complete so that our * bean counters are appropriately updated. */ txg_wait_synced(spa->spa_dsl_pool, txg); spa_config_sync(spa, B_FALSE, B_TRUE); spa_history_log_version(spa, "create"); + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); spa->spa_minref = refcount_count(&spa->spa_refcount); mutex_exit(&spa_namespace_lock); return (0); } #ifdef _KERNEL #ifdef illumos /* * Get the root pool information from the root disk, then import the root pool * during the system boot up time. */ extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); static nvlist_t * spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) { nvlist_t *config; nvlist_t *nvtop, *nvroot; uint64_t pgid; if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) return (NULL); /* * Add this top-level vdev to the child array. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); /* * Put this pool's top-level vdevs into a root vdev. */ VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &nvtop, 1) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); return (config); } /* * Walk the vdev tree and see if we can find a device with "better" * configuration. A configuration is "better" if the label on that * device has a more recent txg. */ static void spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) { for (int c = 0; c < vd->vdev_children; c++) spa_alt_rootvdev(vd->vdev_child[c], avd, txg); if (vd->vdev_ops->vdev_op_leaf) { nvlist_t *label; uint64_t label_txg; if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, &label) != 0) return; VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg) == 0); /* * Do we have a better boot device? */ if (label_txg > *txg) { *txg = label_txg; *avd = vd; } nvlist_free(label); } } /* * Import a root pool. * * For x86. devpath_list will consist of devid and/or physpath name of * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). * The GRUB "findroot" command will return the vdev we should boot. * * For Sparc, devpath_list consists the physpath name of the booting device * no matter the rootpool is a single device pool or a mirrored pool. * e.g. * "/pci@1f,0/ide@d/disk@0,0:a" */ int spa_import_rootpool(char *devpath, char *devid) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; nvlist_t *config, *nvtop; uint64_t guid, txg; char *pname; int error; /* * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(devpath, devid, &guid); #if defined(_OBP) && defined(_KERNEL) if (config == NULL) { if (strstr(devpath, "/iscsi/ssd") != NULL) { /* iscsi boot */ get_iscsi_bootpath_phy(devpath); config = spa_generate_rootconf(devpath, devid, &guid); } } #endif if (config == NULL) { cmn_err(CE_NOTE, "Cannot read the pool label from '%s'", devpath); return (SET_ERROR(EIO)); } VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pname)) != NULL) { /* * Remove the existing root pool from the namespace so that we * can replace it with the correct config we just read in. */ spa_remove(spa); } spa = spa_add(pname, config, NULL); spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", pname); return (error); } /* * Get the boot vdev. */ if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", (u_longlong_t)guid); error = SET_ERROR(ENOENT); goto out; } /* * Determine if there is a better boot device. */ avd = bvd; spa_alt_rootvdev(rvd, &avd, &txg); if (avd != bvd) { cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " "try booting from '%s'", avd->vdev_path); error = SET_ERROR(EINVAL); goto out; } /* * If the boot device is part of a spare vdev then ensure that * we're booting off the active spare. */ if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && !bvd->vdev_isspare) { cmn_err(CE_NOTE, "The boot device is currently spared. Please " "try booting from '%s'", bvd->vdev_parent-> vdev_child[bvd->vdev_parent->vdev_children - 1]->vdev_path); error = SET_ERROR(EINVAL); goto out; } error = 0; out: spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); nvlist_free(config); return (error); } #else /* !illumos */ extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, uint64_t *count); static nvlist_t * spa_generate_rootconf(const char *name) { nvlist_t **configs, **tops; nvlist_t *config; nvlist_t *best_cfg, *nvtop, *nvroot; uint64_t *holes; uint64_t best_txg; uint64_t nchildren; uint64_t pgid; uint64_t count; uint64_t i; uint_t nholes; if (vdev_geom_read_pool_label(name, &configs, &count) != 0) return (NULL); ASSERT3U(count, !=, 0); best_txg = 0; for (i = 0; i < count; i++) { uint64_t txg; VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if (txg > best_txg) { best_txg = txg; best_cfg = configs[i]; } } nchildren = 1; nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); holes = NULL; nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, &holes, &nholes); tops = kmem_zalloc(nchildren * sizeof(void *), KM_SLEEP); for (i = 0; i < nchildren; i++) { if (i >= count) break; if (configs[i] == NULL) continue; VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); nvlist_dup(nvtop, &tops[i], KM_SLEEP); } for (i = 0; holes != NULL && i < nholes; i++) { if (i >= nchildren) continue; if (tops[holes[i]] != NULL) continue; nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, holes[i]) == 0); VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, 0) == 0); } for (i = 0; i < nchildren; i++) { if (tops[i] != NULL) continue; nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, VDEV_TYPE_MISSING) == 0); VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, i) == 0); VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, 0) == 0); } /* * Create pool config based on the best vdev config. */ nvlist_dup(best_cfg, &config, KM_SLEEP); /* * Put this pool's top-level vdevs into a root vdev. */ VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) == 0); VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, tops, nchildren) == 0); /* * Replace the existing vdev_tree with the new root vdev in * this pool's configuration (remove the old, add the new). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); /* * Drop vdev config elements that should not be present at pool level. */ nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); for (i = 0; i < count; i++) nvlist_free(configs[i]); kmem_free(configs, count * sizeof(void *)); for (i = 0; i < nchildren; i++) nvlist_free(tops[i]); kmem_free(tops, nchildren * sizeof(void *)); nvlist_free(nvroot); return (config); } int spa_import_rootpool(const char *name) { spa_t *spa; vdev_t *rvd, *bvd, *avd = NULL; nvlist_t *config, *nvtop; uint64_t txg; char *pname; int error; /* * Read the label from the boot device and generate a configuration. */ config = spa_generate_rootconf(name); mutex_enter(&spa_namespace_lock); if (config != NULL) { VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0 && strcmp(name, pname) == 0); VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); if ((spa = spa_lookup(pname)) != NULL) { /* * Remove the existing root pool from the namespace so * that we can replace it with the correct config * we just read in. */ spa_remove(spa); } spa = spa_add(pname, config, NULL); /* * Set spa_ubsync.ub_version as it can be used in vdev_alloc() * via spa_version(). */ if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &spa->spa_ubsync.ub_version) != 0) spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; } else if ((spa = spa_lookup(name)) == NULL) { cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", name); return (EIO); } else { VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); } spa->spa_is_root = B_TRUE; spa->spa_import_flags = ZFS_IMPORT_VERBATIM; /* * Build up a vdev tree based on the boot device's label config. */ VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, VDEV_ALLOC_ROOTPOOL); spa_config_exit(spa, SCL_ALL, FTAG); if (error) { mutex_exit(&spa_namespace_lock); nvlist_free(config); cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", pname); return (error); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); vdev_free(rvd); spa_config_exit(spa, SCL_ALL, FTAG); mutex_exit(&spa_namespace_lock); nvlist_free(config); return (0); } #endif /* illumos */ #endif /* _KERNEL */ /* * Import a non-root pool into the system. */ int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) { spa_t *spa; char *altroot = NULL; spa_load_state_t state = SPA_LOAD_IMPORT; zpool_rewind_policy_t policy; uint64_t mode = spa_mode_global; uint64_t readonly = B_FALSE; int error; nvlist_t *nvroot; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; /* * If a pool with this name exists, return failure. */ mutex_enter(&spa_namespace_lock); if (spa_lookup(pool) != NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EEXIST)); } /* * Create and initialize the spa structure. */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); if (readonly) mode = FREAD; spa = spa_add(pool, config, altroot); spa->spa_import_flags = flags; /* * Verbatim import - Take a pool and insert it into the namespace * as if it had been loaded at boot. */ if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { if (props != NULL) spa_configfile_set(spa, props, B_FALSE); spa_config_sync(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); return (0); } spa_activate(spa, mode); /* * Don't start async tasks until we know everything is healthy. */ spa_async_suspend(spa); zpool_get_rewind_policy(config, &policy); if (policy.zrp_request & ZPOOL_DO_REWIND) state = SPA_LOAD_RECOVER; /* * Pass off the heavy lifting to spa_load(). Pass TRUE for mosconfig * because the user-supplied config is actually the one to trust when * doing an import. */ if (state != SPA_LOAD_RECOVER) spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, policy.zrp_request); /* * Propagate anything learned while loading the pool and pass it * back to caller (i.e. rewind info, missing devices, etc). */ VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * Toss any existing sparelist, as it doesn't have any validity * anymore, and conflicts with spa_has_spare(). */ if (spa->spa_spares.sav_config) { nvlist_free(spa->spa_spares.sav_config); spa->spa_spares.sav_config = NULL; spa_load_spares(spa); } if (spa->spa_l2cache.sav_config) { nvlist_free(spa->spa_l2cache.sav_config); spa->spa_l2cache.sav_config = NULL; spa_load_l2cache(spa); } VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); if (error == 0) error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_L2CACHE); spa_config_exit(spa, SCL_ALL, FTAG); if (props != NULL) spa_configfile_set(spa, props, B_FALSE); if (error != 0 || (props && spa_writeable(spa) && (error = spa_prop_set(spa, props)))) { spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (error); } spa_async_resume(spa); /* * Override any spares and level 2 cache devices as specified by * the user, as these may have correct device names/devids, etc. */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0) { if (spa->spa_spares.sav_config) VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_spares.sav_sync = B_TRUE; } if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) { if (spa->spa_l2cache.sav_config) VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); else VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); spa->spa_l2cache.sav_sync = B_TRUE; } /* * Check for any removed devices. */ if (spa->spa_autoreplace) { spa_aux_check_removed(&spa->spa_spares); spa_aux_check_removed(&spa->spa_l2cache); } if (spa_writeable(spa)) { /* * Update the config cache to include the newly-imported pool. */ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); } /* * It's possible that the pool was expanded while it was exported. * We kick off an async task to handle this for us. */ spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); mutex_exit(&spa_namespace_lock); spa_history_log_version(spa, "import"); #ifdef __FreeBSD__ #ifdef _KERNEL zvol_create_minors(pool); #endif #endif return (0); } nvlist_t * spa_tryimport(nvlist_t *tryconfig) { nvlist_t *config = NULL; char *poolname; spa_t *spa; uint64_t state; int error; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) return (NULL); /* * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); spa_activate(spa, FREAD); /* * Pass off the heavy lifting to spa_load(). * Pass TRUE for mosconfig because the user-supplied config * is actually the one to trust when doing an import. */ error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. */ if (spa->spa_root_vdev != NULL) { config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, poolname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, state) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, spa->spa_uberblock.ub_timestamp) == 0); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, spa->spa_load_info) == 0); /* * If the bootfs property exists on this pool then we * copy it out so that external consumers can tell which * pools are bootable. */ if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* * We have to play games with the name since the * pool was opened as TRYIMPORT_NAME. */ if (dsl_dsobj_to_dsname(spa_name(spa), spa->spa_bootfs, tmpname) == 0) { char *cp; char *dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); cp = strchr(tmpname, '/'); if (cp == NULL) { (void) strlcpy(dsname, tmpname, MAXPATHLEN); } else { (void) snprintf(dsname, MAXPATHLEN, "%s/%s", poolname, ++cp); } VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_BOOTFS, dsname) == 0); kmem_free(dsname, MAXPATHLEN); } kmem_free(tmpname, MAXPATHLEN); } /* * Add the list of hot spares and level 2 cache devices. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); return (config); } /* * Pool export/destroy * * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the * configuration from the cache afterwards. If the 'hardforce' flag is set, then * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; if (!(spa_mode_global & FWRITE)) return (SET_ERROR(EROFS)); mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(pool)) == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(ENOENT)); } /* * Put a hold on the pool, drop the namespace lock, stop async tasks, * reacquire the namespace lock, and see if we can export. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); /* * The pool will be in core if it's openable, * in which case we can modify its state. */ if (spa->spa_state != POOL_STATE_UNINITIALIZED && spa->spa_sync_on) { /* * Objsets may be open only because they're dirty, so we * have to force it to sync before checking spa_refcnt. */ txg_wait_synced(spa->spa_dsl_pool, 0); + spa_evicting_os_wait(spa); /* * A pool cannot be exported or destroyed if there are active * references. If we are resetting a pool, allow references by * fault injection handlers. */ if (!spa_refcount_zero(spa) || (spa->spa_inject_ref != 0 && new_state != POOL_STATE_UNINITIALIZED)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EBUSY)); } /* * A pool cannot be exported if it has an active shared spare. * This is to prevent other pools stealing the active spare * from an exported pool. At user's own will, such pool can * be forcedly exported. */ if (!force && new_state == POOL_STATE_EXPORTED && spa_has_active_shared_spare(spa)) { spa_async_resume(spa); mutex_exit(&spa_namespace_lock); return (SET_ERROR(EXDEV)); } /* * We want this to be reflected on every label, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + TXG_DEFER_SIZE + 1; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); } } spa_event_notify(spa, NULL, ESC_ZFS_POOL_DESTROY); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } if (oldconfig && spa->spa_config) VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { if (!hardforce) spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); return (0); } /* * Destroy a storage pool. */ int spa_destroy(char *pool) { return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE, B_FALSE)); } /* * Export a storage pool. */ int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce) { return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force, hardforce)); } /* * Similar to spa_export(), this unloads the spa_t without actually removing it * from the namespace in any way. */ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, B_FALSE, B_FALSE)); } /* * ========================================================================== * Device manipulation * ========================================================================== */ /* * Add a device to a storage pool. */ int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { uint64_t txg, id; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, NULL, txg, error)); spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, &nspares) != 0) nspares = 0; if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) != 0) nl2cache = 0; if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) return (spa_vdev_exit(spa, vd, txg, EINVAL)); if (vd->vdev_children != 0 && (error = vdev_create(vd, txg, B_FALSE)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * We must validate the spares and l2cache devices after checking the * children. Otherwise, vdev_inuse() will blindly overwrite the spare. */ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0) return (spa_vdev_exit(spa, vd, txg, error)); /* * Transfer each new top-level vdev from vd to rvd. */ for (int c = 0; c < vd->vdev_children; c++) { /* * Set the vdev id to the first hole, if one exists. */ for (id = 0; id < rvd->vdev_children; id++) { if (rvd->vdev_child[id]->vdev_ishole) { vdev_free(rvd->vdev_child[id]); break; } } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } if (nspares != 0) { spa_set_aux_vdevs(&spa->spa_spares, spares, nspares, ZPOOL_CONFIG_SPARES); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } if (nl2cache != 0) { spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache, ZPOOL_CONFIG_L2CACHE); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } /* * We have to be careful when adding new vdevs to an existing pool. * If other threads start allocating from these vdevs before we * sync the config cache, and we lose power, then upon reboot we may * fail to open the pool because there are DVAs that the config cache * can't translate. Therefore, we first add the vdevs without * initializing metaslabs; sync the config cache (via spa_vdev_exit()); * and then let spa_config_update() initialize the new metaslabs. * * spa_load() checks for added-but-not-initialized vdevs, so that * if we lose power at any point in this sequence, the remaining * steps will be completed the next time we load the pool. */ (void) spa_vdev_exit(spa, vd, txg, 0); mutex_enter(&spa_namespace_lock); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); mutex_exit(&spa_namespace_lock); return (0); } /* * Attach a device to a mirror. The arguments are the path to any device * in the mirror, and the nvroot for the new device. If the path specifies * a device that is not mirrored, we automatically insert the mirror vdev. * * If 'replacing' is specified, the new device is intended to replace the * existing device; in this case the two devices are made into their own * mirror using the 'replacing' vdev, which is functionally identical to * the mirror vdev (it actually reuses all the same ops) but has a few * extra rules: you can't attach to it after it's been created, and upon * completion of resilvering, the first disk (the one being replaced) * is automatically detached. */ int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) { uint64_t txg, dtl_max_txg; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; char *oldvdpath, *newvdpath; int newvd_isspare; int error; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); if (oldvd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!oldvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = oldvd->vdev_parent; if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, VDEV_ALLOC_ATTACH)) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); if (newrootvd->vdev_children != 1) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); newvd = newrootvd->vdev_child[0]; if (!newvd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); if ((error = vdev_create(newrootvd, txg, replacing)) != 0) return (spa_vdev_exit(spa, newrootvd, txg, error)); /* * Spares can't replace logs */ if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); if (!replacing) { /* * For attach, the only allowable parent is a mirror or the root * vdev. */ if (pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_root_ops) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); pvops = &vdev_mirror_ops; } else { /* * Active hot spares can only be replaced by inactive hot * spares. */ if (pvd->vdev_ops == &vdev_spare_ops && oldvd->vdev_isspare && !spa_has_spare(spa, newvd->vdev_guid)) return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); /* * If the source is a hot spare, and the parent isn't already a * spare, then we want to create a new hot spare. Otherwise, we * want to create a replacing vdev. The user is not allowed to * attach to a spared vdev child unless the 'isspare' state is * the same (spare replaces spare, non-spare replaces * non-spare). */ if (pvd->vdev_ops == &vdev_replacing_ops && spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } else if (pvd->vdev_ops == &vdev_spare_ops && newvd->vdev_isspare != oldvd->vdev_isspare) { return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); } if (newvd->vdev_isspare) pvops = &vdev_spare_ops; else pvops = &vdev_replacing_ops; } /* * Make sure the new device is big enough. */ if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* * The new device cannot have a higher alignment requirement * than the top-level vdev. */ if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) return (spa_vdev_exit(spa, newrootvd, txg, EDOM)); /* * If this is an in-place replacement, update oldvd's path and devid * to make it distinguishable from newvd, and unopenable from now on. */ if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { spa_strfree(oldvd->vdev_path); oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, KM_SLEEP); (void) sprintf(oldvd->vdev_path, "%s/%s", newvd->vdev_path, "old"); if (oldvd->vdev_devid != NULL) { spa_strfree(oldvd->vdev_devid); oldvd->vdev_devid = NULL; } } /* mark the device being resilvered */ newvd->vdev_resilver_txg = txg; /* * If the parent is not a mirror, or if we're replacing, insert the new * mirror/replacing/spare vdev above oldvd. */ if (pvd->vdev_ops != pvops) pvd = vdev_add_parent(oldvd, pvops); ASSERT(pvd->vdev_top->vdev_parent == rvd); ASSERT(pvd->vdev_ops == pvops); ASSERT(oldvd->vdev_parent == pvd); /* * Extract the new device from its root and add it to pvd. */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); vdev_config_dirty(tvd); /* * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account * for any dmu_sync-ed blocks. It will propagate upward when * spa_vdev_exit() calls vdev_dtl_reassess(). */ dtl_max_txg = txg + TXG_CONCURRENT_STATES; vdev_dtl_dirty(newvd, DTL_MISSING, TXG_INITIAL, dtl_max_txg - TXG_INITIAL); if (newvd->vdev_isspare) { spa_spare_activate(newvd); spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); } oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; /* * Mark newvd's DTL dirty in this txg. */ vdev_dirty(tvd, VDD_DTL, newvd, txg); /* * Schedule the resilver to restart in the future. We do this to * ensure that dmu_sync-ed blocks have been stitched into the * respective datasets. */ dsl_resilver_restart(spa->spa_dsl_pool, dtl_max_txg); /* * Commit the config */ (void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0); spa_history_log_internal(spa, "vdev attach", NULL, "%s vdev=%s %s vdev=%s", replacing && newvd_isspare ? "spare in" : replacing ? "replace" : "attach", newvdpath, replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); if (spa->spa_bootfs) spa_event_notify(spa, newvd, ESC_ZFS_BOOTFS_VDEV_ATTACH); return (0); } /* * Detach a device from a mirror or replacing vdev. * * If 'replace_done' is specified, only detach if the parent * is a replacing vdev. */ int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid = 0; char *vdpath; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (vd == NULL) return (spa_vdev_exit(spa, NULL, txg, ENODEV)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); pvd = vd->vdev_parent; /* * If the parent/child relationship is not as expected, don't do it. * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing * vdev that's replacing B with C. The user's intent in replacing * is to go from M(A,B) to M(A,C). If the user decides to cancel * the replace by detaching C, the expected behavior is to end up * M(A,B). But suppose that right after deciding to detach C, * the replacement of B completes. We would have M(A,C), and then * ask to detach C, which would leave us with just A -- not what * the user wanted. To prevent this, we make sure that the * parent/child relationship hasn't changed -- in this example, * that C's parent is still the replacing vdev R. */ if (pvd->vdev_guid != pguid && pguid != 0) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); /* * Only 'replacing' or 'spare' vdevs can be replaced. */ if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); ASSERT(pvd->vdev_ops != &vdev_spare_ops || spa_version(spa) >= SPA_VERSION_SPARES); /* * Only mirror, replacing, and spare vdevs support detach. */ if (pvd->vdev_ops != &vdev_replacing_ops && pvd->vdev_ops != &vdev_mirror_ops && pvd->vdev_ops != &vdev_spare_ops) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* * If this device has the only valid copy of some data, * we cannot safely detach it. */ if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then * check to see if we changed the original vdev's path to have "/old" * at the end in spa_vdev_attach(). If so, undo that change now. */ if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && vd->vdev_path != NULL) { size_t len = strlen(vd->vdev_path); for (int c = 0; c < pvd->vdev_children; c++) { cvd = pvd->vdev_child[c]; if (cvd == vd || cvd->vdev_path == NULL) continue; if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && strcmp(cvd->vdev_path + len, "/old") == 0) { spa_strfree(cvd->vdev_path); cvd->vdev_path = spa_strdup(vd->vdev_path); break; } } } /* * If we are detaching the original disk from a spare, then it implies * that the spare should become a real disk, and be removed from the * active spare list for the pool. */ if (pvd->vdev_ops == &vdev_spare_ops && vd->vdev_id == 0 && pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) unspare = B_TRUE; /* * Erase the disk labels so the disk can be used for other things. * This must be done after all other error cases are handled, * but before we disembowel vd (so we can still do I/O to it). * But if we can't do it, don't treat the error as fatal -- * it may be that the unwritability of the disk is the reason * it's being detached! */ error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); /* * Remove vd from its parent and compact the parent's children. */ vdev_remove_child(pvd, vd); vdev_compact_children(pvd); /* * Remember one of the remaining children so we can get tvd below. */ cvd = pvd->vdev_child[pvd->vdev_children - 1]; /* * If we need to remove the remaining child from the list of hot spares, * do it now, marking the vdev as no longer a spare in the process. * We must do this before vdev_remove_parent(), because that can * change the GUID if it creates a new toplevel GUID. For a similar * reason, we must remove the spare now, in the same txg as the detach; * otherwise someone could attach a new sibling, change the GUID, and * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); cvd->vdev_unspare = B_TRUE; } /* * If the parent mirror/replacing vdev only has one child, * the parent is no longer needed. Remove it from the tree. */ if (pvd->vdev_children == 1) { if (pvd->vdev_ops == &vdev_spare_ops) cvd->vdev_unspare = B_FALSE; vdev_remove_parent(cvd); } /* * We don't set tvd until now because the parent we just removed * may have been the previous top-level vdev. */ tvd = cvd->vdev_top; ASSERT(tvd->vdev_parent == rvd); /* * Reevaluate the parent vdev state. */ vdev_propagate_state(cvd); /* * If the 'autoexpand' property is set on the pool then automatically * try to expand the size of the pool. For example if the device we * just detached was smaller than the others, it may be possible to * add metaslabs (i.e. grow the pool). We need to reopen the vdev * first so that we can obtain the updated sizes of the leaf vdevs. */ if (spa->spa_autoexpand) { vdev_reopen(tvd); vdev_expand(tvd, txg); } vdev_config_dirty(tvd); /* * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that * vd->vdev_detached is set and free vd's DTL object in syncing context. * But first make sure we're not on any *other* txg's DTL list, to * prevent vd from being accessed after it's freed. */ vdpath = spa_strdup(vd->vdev_path); for (int t = 0; t < TXG_SIZE; t++) (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); vd->vdev_detached = B_TRUE; vdev_dirty(tvd, VDD_DTL, vd, txg); spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); /* hang on to the spa before we release the lock */ spa_open_ref(spa, FTAG); error = spa_vdev_exit(spa, vd, txg, 0); spa_history_log_internal(spa, "detach", NULL, "vdev=%s", vdpath); spa_strfree(vdpath); /* * If this was the removal of the original device in a hot spare vdev, * then we want to go through and remove the device from the hot spare * list of every other pool. */ if (unspare) { spa_t *altspa = NULL; mutex_enter(&spa_namespace_lock); while ((altspa = spa_next(altspa)) != NULL) { if (altspa->spa_state != POOL_STATE_ACTIVE || altspa == spa) continue; spa_open_ref(altspa, FTAG); mutex_exit(&spa_namespace_lock); (void) spa_vdev_remove(altspa, unspare_guid, B_TRUE); mutex_enter(&spa_namespace_lock); spa_close(altspa, FTAG); } mutex_exit(&spa_namespace_lock); /* search the rest of the vdevs for spares to remove */ spa_vdev_resilver_done(spa); } /* all done with the spa; OK to release */ mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (error); } /* * Split a set of devices from their mirrors, and create a new pool from them. */ int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp) { int error = 0; uint64_t txg, *glist; spa_t *newspa; uint_t c, children, lastlog; nvlist_t **child, *nvl, *tmp; dmu_tx_t *tx; char *altroot = NULL; vdev_t *rvd, **vml = NULL; /* vdev modify list */ boolean_t activate_slog; ASSERT(spa_writeable(spa)); txg = spa_vdev_enter(spa); /* clear the log and flush everything up to now */ activate_slog = spa_passivate_log(spa); (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); error = spa_offline_log(spa); txg = spa_vdev_config_enter(spa); if (activate_slog) spa_activate_log(spa); if (error != 0) return (spa_vdev_exit(spa, NULL, txg, error)); /* check new spa name before going any further */ if (spa_lookup(newname) != NULL) return (spa_vdev_exit(spa, NULL, txg, EEXIST)); /* * scan through all the children to ensure they're all mirrors */ if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* first, check to ensure we've got the right child count */ rvd = spa->spa_root_vdev; lastlog = 0; for (c = 0; c < rvd->vdev_children; c++) { vdev_t *vd = rvd->vdev_child[c]; /* don't count the holes & logs as children */ if (vd->vdev_islog || vd->vdev_ishole) { if (lastlog == 0) lastlog = c; continue; } lastlog = 0; } if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); /* next, ensure no spare or cache devices are part of the split */ if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) return (spa_vdev_exit(spa, NULL, txg, EINVAL)); vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); /* then, loop over each vdev and validate it */ for (c = 0; c < children; c++) { uint64_t is_hole = 0; (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, &is_hole); if (is_hole != 0) { if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || spa->spa_root_vdev->vdev_child[c]->vdev_islog) { continue; } else { error = SET_ERROR(EINVAL); break; } } /* which disk is going to be split? */ if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, &glist[c]) != 0) { error = SET_ERROR(EINVAL); break; } /* look it up in the spa */ vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); if (vml[c] == NULL) { error = SET_ERROR(ENODEV); break; } /* make sure there's nothing stopping the split */ if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || vml[c]->vdev_islog || vml[c]->vdev_ishole || vml[c]->vdev_isspare || vml[c]->vdev_isl2cache || !vdev_writeable(vml[c]) || vml[c]->vdev_children != 0 || vml[c]->vdev_state != VDEV_STATE_HEALTHY || c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { error = SET_ERROR(EINVAL); break; } if (vdev_dtl_required(vml[c])) { error = SET_ERROR(EBUSY); break; } /* we need certain info from the top level */ VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, vml[c]->vdev_top->vdev_ms_array) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, vml[c]->vdev_top->vdev_ms_shift) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, vml[c]->vdev_top->vdev_asize) == 0); VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, vml[c]->vdev_top->vdev_ashift) == 0); } if (error != 0) { kmem_free(vml, children * sizeof (vdev_t *)); kmem_free(glist, children * sizeof (uint64_t)); return (spa_vdev_exit(spa, NULL, txg, error)); } /* stop writers from using the disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_TRUE; } vdev_reopen(spa->spa_root_vdev); /* * Temporarily record the splitting vdevs in the spa config. This * will disappear once the config is regenerated. */ VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, glist, children) == 0); kmem_free(glist, children * sizeof (uint64_t)); mutex_enter(&spa->spa_props_lock); VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, nvl) == 0); mutex_exit(&spa->spa_props_lock); spa->spa_config_splitting = nvl; vdev_config_dirty(spa->spa_root_vdev); /* configure and create the new pool */ VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, spa->spa_config_txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_generate_guid(NULL)) == 0); (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); /* add the new pool to the namespace */ newspa = spa_add(newname, config, altroot); newspa->spa_config_txg = spa->spa_config_txg; spa_set_log_state(newspa, SPA_LOG_CLEAR); /* release the spa config lock, retaining the namespace lock */ spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 1); spa_activate(newspa, spa_mode_global); spa_async_suspend(newspa); #ifndef illumos /* mark that we are creating new spa by splitting */ newspa->spa_splitting_newspa = B_TRUE; #endif /* create the new pool from the disks of the original pool */ error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); #ifndef illumos newspa->spa_splitting_newspa = B_FALSE; #endif if (error) goto out; /* if that worked, generate a real config for the new pool */ if (newspa->spa_root_vdev != NULL) { VERIFY(nvlist_alloc(&newspa->spa_config_splitting, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, B_TRUE)); } /* set the props */ if (props != NULL) { spa_configfile_set(newspa, props, B_FALSE); error = spa_prop_set(newspa, props); if (error) goto out; } /* flush everything */ txg = spa_vdev_config_enter(newspa); vdev_config_dirty(newspa->spa_root_vdev); (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 2); spa_async_resume(newspa); /* finally, update the original pool's config */ txg = spa_vdev_config_enter(spa); tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) dmu_tx_abort(tx); for (c = 0; c < children; c++) { if (vml[c] != NULL) { vdev_split(vml[c]); if (error == 0) spa_history_log_internal(spa, "detach", tx, "vdev=%s", vml[c]->vdev_path); vdev_free(vml[c]); } } vdev_config_dirty(spa->spa_root_vdev); spa->spa_config_splitting = NULL; nvlist_free(nvl); if (error == 0) dmu_tx_commit(tx); (void) spa_vdev_exit(spa, NULL, txg, 0); if (zio_injection_enabled) zio_handle_panic_injection(spa, FTAG, 3); /* split is complete; log a history record */ spa_history_log_internal(newspa, "split", NULL, "from pool %s", spa_name(spa)); kmem_free(vml, children * sizeof (vdev_t *)); /* if we're not going to mount the filesystems in userland, export */ if (exp) error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, B_FALSE, B_FALSE); return (error); out: spa_unload(newspa); spa_deactivate(newspa); spa_remove(newspa); txg = spa_vdev_config_enter(spa); /* re-online all offlined disks */ for (c = 0; c < children; c++) { if (vml[c] != NULL) vml[c]->vdev_offline = B_FALSE; } vdev_reopen(spa->spa_root_vdev); nvlist_free(spa->spa_config_splitting); spa->spa_config_splitting = NULL; (void) spa_vdev_exit(spa, NULL, txg, error); kmem_free(vml, children * sizeof (vdev_t *)); return (error); } static nvlist_t * spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) { for (int i = 0; i < count; i++) { uint64_t guid; VERIFY(nvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID, &guid) == 0); if (guid == target_guid) return (nvpp[i]); } return (NULL); } static void spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, nvlist_t *dev_to_remove) { nvlist_t **newdev = NULL; if (count > 1) newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); for (int i = 0, j = 0; i < count; i++) { if (dev[i] == dev_to_remove) continue; VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); } VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); for (int i = 0; i < count - 1; i++) nvlist_free(newdev[i]); if (count > 1) kmem_free(newdev, (count - 1) * sizeof (void *)); } /* * Evacuate the device. */ static int spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) { uint64_t txg; int error = 0; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); ASSERT(vd == vd->vdev_top); /* * Evacuate the device. We don't hold the config lock as writer * since we need to do I/O but we do keep the * spa_namespace_lock held. Once this completes the device * should no longer have any blocks allocated on it. */ if (vd->vdev_islog) { if (vd->vdev_stat.vs_alloc != 0) error = spa_offline_log(spa); } else { error = SET_ERROR(ENOTSUP); } if (error) return (error); /* * The evacuation succeeded. Remove any remaining MOS metadata * associated with this vdev, and wait for these changes to sync. */ ASSERT0(vd->vdev_stat.vs_alloc); txg = spa_vdev_config_enter(spa); vd->vdev_removing = B_TRUE; vdev_dirty_leaves(vd, VDD_DTL, txg); vdev_config_dirty(vd); spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); return (0); } /* * Complete the removal by cleaning up the namespace. */ static void spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) { vdev_t *rvd = spa->spa_root_vdev; uint64_t id = vd->vdev_id; boolean_t last_vdev = (id == (rvd->vdev_children - 1)); ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); ASSERT(vd == vd->vdev_top); /* * Only remove any devices which are empty. */ if (vd->vdev_stat.vs_alloc != 0) return; (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); if (list_link_active(&vd->vdev_state_dirty_node)) vdev_state_clean(vd); if (list_link_active(&vd->vdev_config_dirty_node)) vdev_config_clean(vd); vdev_free(vd); if (last_vdev) { vdev_compact_children(rvd); } else { vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); vdev_add_child(rvd, vd); } vdev_config_dirty(rvd); /* * Reassess the health of our root vdev. */ vdev_reopen(rvd); } /* * Remove a device from the pool - * * Removing a device from the vdev namespace requires several steps * and can take a significant amount of time. As a result we use * the spa_vdev_config_[enter/exit] functions which allow us to * grab and release the spa_config_lock while still holding the namespace * lock. During each step the configuration is synced out. * * Currently, this supports removing only hot spares, slogs, and level 2 ARC * devices. */ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; metaslab_group_t *mg; nvlist_t **spares, **l2cache, *nv; uint64_t txg = 0; uint_t nspares, nl2cache; int error = 0; boolean_t locked = MUTEX_HELD(&spa_namespace_lock); ASSERT(spa_writeable(spa)); if (!locked) txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); if (spa->spa_spares.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { /* * Only remove the hot spare if it's not currently in use * in this pool. */ if (vd == NULL || unspare) { spa_vdev_remove_aux(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, spares, nspares, nv); spa_load_spares(spa); spa->spa_spares.sav_sync = B_TRUE; } else { error = SET_ERROR(EBUSY); } } else if (spa->spa_l2cache.sav_vdevs != NULL && nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { /* * Cache devices can always be removed. */ spa_vdev_remove_aux(spa->spa_l2cache.sav_config, ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; } else if (vd != NULL && vd->vdev_islog) { ASSERT(!locked); ASSERT(vd == vd->vdev_top); mg = vd->vdev_mg; /* * Stop allocating from this vdev. */ metaslab_group_passivate(mg); /* * Wait for the youngest allocations and frees to sync, * and then wait for the deferral of those frees to finish. */ spa_vdev_config_exit(spa, NULL, txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); /* * Attempt to evacuate the vdev. */ error = spa_vdev_remove_evacuate(spa, vd); txg = spa_vdev_config_enter(spa); /* * If we couldn't evacuate the vdev, unwind. */ if (error) { metaslab_group_activate(mg); return (spa_vdev_exit(spa, NULL, txg, error)); } /* * Clean up the vdev namespace. */ spa_vdev_remove_from_namespace(spa, vd); } else if (vd != NULL) { /* * Normal vdevs cannot be removed (yet). */ error = SET_ERROR(ENOTSUP); } else { /* * There is no vdev of any kind with the specified guid. */ error = SET_ERROR(ENOENT); } if (!locked) return (spa_vdev_exit(spa, NULL, txg, error)); return (error); } /* * Find any device that's done replacing, or a vdev marked 'unspare' that's * currently spared, so we can detach it. */ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); } /* * Check for a completed replacement. We always consider the first * vdev in the list to be the oldest vdev, and the last one to be * the newest (see spa_vdev_attach() for how that works). In * the case where the newest vdev is faulted, we will not automatically * remove it after a resilver completes. This is OK as it will require * user intervention to determine which disk the admin wishes to keep. */ if (vd->vdev_ops == &vdev_replacing_ops) { ASSERT(vd->vdev_children > 1); newvd = vd->vdev_child[vd->vdev_children - 1]; oldvd = vd->vdev_child[0]; if (vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); } /* * Check for a completed resilver with the 'unspare' flag set. */ if (vd->vdev_ops == &vdev_spare_ops) { vdev_t *first = vd->vdev_child[0]; vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; if (last->vdev_unspare) { oldvd = first; newvd = last; } else if (first->vdev_unspare) { oldvd = last; newvd = first; } else { oldvd = NULL; } if (oldvd != NULL && vdev_dtl_empty(newvd, DTL_MISSING) && vdev_dtl_empty(newvd, DTL_OUTAGE) && !vdev_dtl_required(oldvd)) return (oldvd); /* * If there are more than two spares attached to a disk, * and those spares are not required, then we want to * attempt to free them up now so that they can be used * by other pools. Once we're back down to a single * disk+spare, we stop removing them. */ if (vd->vdev_children > 2) { newvd = vd->vdev_child[1]; if (newvd->vdev_isspare && last->vdev_isspare && vdev_dtl_empty(last, DTL_MISSING) && vdev_dtl_empty(last, DTL_OUTAGE) && !vdev_dtl_required(newvd)) return (newvd); } } return (NULL); } static void spa_vdev_resilver_done(spa_t *spa) { vdev_t *vd, *pvd, *ppvd; uint64_t guid, sguid, pguid, ppguid; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { pvd = vd->vdev_parent; ppvd = pvd->vdev_parent; guid = vd->vdev_guid; pguid = pvd->vdev_guid; ppguid = ppvd->vdev_guid; sguid = 0; /* * If we have just finished replacing a hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && ppvd->vdev_children == 2) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); sguid = ppvd->vdev_child[1]->vdev_guid; } ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); spa_config_exit(spa, SCL_ALL, FTAG); if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } spa_config_exit(spa, SCL_ALL, FTAG); } /* * Update the stored path or FRU for this vdev. */ int spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, boolean_t ispath) { vdev_t *vd; boolean_t sync = B_FALSE; ASSERT(spa_writeable(spa)); spa_vdev_state_enter(spa, SCL_ALL); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); if (ispath) { if (strcmp(value, vd->vdev_path) != 0) { spa_strfree(vd->vdev_path); vd->vdev_path = spa_strdup(value); sync = B_TRUE; } } else { if (vd->vdev_fru == NULL) { vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } else if (strcmp(value, vd->vdev_fru) != 0) { spa_strfree(vd->vdev_fru); vd->vdev_fru = spa_strdup(value); sync = B_TRUE; } } return (spa_vdev_state_exit(spa, sync ? vd : NULL, 0)); } int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) { return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); } int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) { return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* * ========================================================================== * SPA Scanning * ========================================================================== */ int spa_scan_stop(spa_t *spa) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (dsl_scan_resilvering(spa->spa_dsl_pool)) return (SET_ERROR(EBUSY)); return (dsl_scan_cancel(spa->spa_dsl_pool)); } int spa_scan(spa_t *spa, pool_scan_func_t func) { ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) return (SET_ERROR(ENOTSUP)); /* * If a resilver was requested, but there is no DTL on a * writeable leaf device, we have nothing to do. */ if (func == POOL_SCAN_RESILVER && !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); return (0); } return (dsl_scan(spa->spa_dsl_pool, func)); } /* * ========================================================================== * SPA async task processing * ========================================================================== */ static void spa_async_remove(spa_t *spa, vdev_t *vd) { if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = B_FALSE; vd->vdev_delayed_close = B_FALSE; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); /* * We want to clear the stats, but we don't want to do a full * vdev_clear() as that will cause us to throw away * degraded/faulted state as well as attempt to reopen the * device, all of which is a waste. */ vd->vdev_stat.vs_read_errors = 0; vd->vdev_stat.vs_write_errors = 0; vd->vdev_stat.vs_checksum_errors = 0; vdev_state_dirty(vd->vdev_top); } for (int c = 0; c < vd->vdev_children; c++) spa_async_remove(spa, vd->vdev_child[c]); } static void spa_async_probe(spa_t *spa, vdev_t *vd) { if (vd->vdev_probe_wanted) { vd->vdev_probe_wanted = B_FALSE; vdev_reopen(vd); /* vdev_open() does the actual probe */ } for (int c = 0; c < vd->vdev_children; c++) spa_async_probe(spa, vd->vdev_child[c]); } static void spa_async_autoexpand(spa_t *spa, vdev_t *vd) { sysevent_id_t eid; nvlist_t *attr; char *physpath; if (!spa->spa_autoexpand) return; for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; spa_async_autoexpand(spa, cvd); } if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) return; physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, ESC_ZFS_VDEV_AUTOEXPAND, attr, &eid, DDI_SLEEP); nvlist_free(attr); kmem_free(physpath, MAXPATHLEN); } static void spa_async_thread(void *arg) { spa_t *spa = arg; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; spa->spa_async_tasks &= SPA_ASYNC_REMOVE; mutex_exit(&spa->spa_async_lock); /* * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { uint64_t old_space, new_space; mutex_enter(&spa_namespace_lock); old_space = metaslab_class_get_space(spa_normal_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); new_space = metaslab_class_get_space(spa_normal_class(spa)); mutex_exit(&spa_namespace_lock); /* * If the pool grew as a result of the config update, * then log an internal history event. */ if (new_space != old_space) { spa_history_log_internal(spa, "vdev online", NULL, "pool '%s' size: %llu(+%llu)", spa_name(spa), new_space, new_space - old_space); } } if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_async_autoexpand(spa, spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } /* * See if any devices need to be probed. */ if (tasks & SPA_ASYNC_PROBE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_probe(spa, spa->spa_root_vdev); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * If any devices are done replacing, detach them. */ if (tasks & SPA_ASYNC_RESILVER_DONE) spa_vdev_resilver_done(spa); /* * Kick off a resilver. */ if (tasks & SPA_ASYNC_RESILVER) dsl_resilver_restart(spa->spa_dsl_pool, 0); /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); spa->spa_async_thread = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } static void spa_async_thread_vd(void *arg) { spa_t *spa = arg; int tasks; ASSERT(spa->spa_sync_on); mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; retry: spa->spa_async_tasks &= ~SPA_ASYNC_REMOVE; mutex_exit(&spa->spa_async_lock); /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); for (int i = 0; i < spa->spa_spares.sav_count; i++) spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); (void) spa_vdev_state_exit(spa, NULL, 0); } /* * Let the world know that we're done. */ mutex_enter(&spa->spa_async_lock); tasks = spa->spa_async_tasks; if ((tasks & SPA_ASYNC_REMOVE) != 0) goto retry; spa->spa_async_thread_vd = NULL; cv_broadcast(&spa->spa_async_cv); mutex_exit(&spa->spa_async_lock); thread_exit(); } void spa_async_suspend(spa_t *spa) { mutex_enter(&spa->spa_async_lock); spa->spa_async_suspended++; while (spa->spa_async_thread != NULL && spa->spa_async_thread_vd != NULL) cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); mutex_exit(&spa->spa_async_lock); } void spa_async_resume(spa_t *spa) { mutex_enter(&spa->spa_async_lock); ASSERT(spa->spa_async_suspended != 0); spa->spa_async_suspended--; mutex_exit(&spa->spa_async_lock); } static boolean_t spa_async_tasks_pending(spa_t *spa) { uint_t non_config_tasks; uint_t config_task; boolean_t config_task_suspended; non_config_tasks = spa->spa_async_tasks & ~(SPA_ASYNC_CONFIG_UPDATE | SPA_ASYNC_REMOVE); config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; if (spa->spa_ccw_fail_time == 0) { config_task_suspended = B_FALSE; } else { config_task_suspended = (gethrtime() - spa->spa_ccw_fail_time) < (zfs_ccw_retry_interval * NANOSEC); } return (non_config_tasks || (config_task && !config_task_suspended)); } static void spa_async_dispatch(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if (spa_async_tasks_pending(spa) && !spa->spa_async_suspended && spa->spa_async_thread == NULL && rootdir != NULL) spa->spa_async_thread = thread_create(NULL, 0, spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } static void spa_async_dispatch_vd(spa_t *spa) { mutex_enter(&spa->spa_async_lock); if ((spa->spa_async_tasks & SPA_ASYNC_REMOVE) != 0 && !spa->spa_async_suspended && spa->spa_async_thread_vd == NULL && rootdir != NULL) spa->spa_async_thread_vd = thread_create(NULL, 0, spa_async_thread_vd, spa, 0, &p0, TS_RUN, maxclsyspri); mutex_exit(&spa->spa_async_lock); } void spa_async_request(spa_t *spa, int task) { zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); mutex_enter(&spa->spa_async_lock); spa->spa_async_tasks |= task; mutex_exit(&spa->spa_async_lock); spa_async_dispatch_vd(spa); } /* * ========================================================================== * SPA syncing routines * ========================================================================== */ static int bpobj_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { bpobj_t *bpo = arg; bpobj_enqueue(bpo, bp, tx); return (0); } static int spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { zio_t *zio = arg; zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp, BP_GET_PSIZE(bp), zio->io_flags)); return (0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing frees. */ static void spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); bplist_iterate(bpl, spa_free_sync_cb, zio, tx); VERIFY(zio_wait(zio) == 0); } /* * Note: this simple function is not inlined to make it easier to dtrace the * amount of time spent syncing deferred frees. */ static void spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) { zio_t *zio = zio_root(spa, NULL, NULL, 0); VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, spa_free_sync_cb, zio, tx), ==, 0); VERIFY0(zio_wait(zio)); } static void spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) { char *packed = NULL; size_t bufsize; size_t nvsize = 0; dmu_buf_t *db; VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); /* * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration * information. This avoids the dmu_buf_will_dirty() path and * saves us a pre-read to get data we don't actually care about. */ bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); packed = kmem_alloc(bufsize, KM_SLEEP); VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, KM_SLEEP) == 0); bzero(packed + nvsize, bufsize - nvsize); dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); kmem_free(packed, bufsize); VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); dmu_buf_will_dirty(db, tx); *(uint64_t *)db->db_data = nvsize; dmu_buf_rele(db, FTAG); } static void spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, const char *config, const char *entry) { nvlist_t *nvroot; nvlist_t **list; int i; if (!sav->sav_sync) return; /* * Update the MOS nvlist describing the list of available devices. * spa_validate_aux() will have already made sure this nvlist is * valid and the vdevs are labeled appropriately. */ if (sav->sav_object == 0) { sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); VERIFY(zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, &sav->sav_object, tx) == 0); } VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (sav->sav_count == 0) { VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); } else { list = kmem_alloc(sav->sav_count * sizeof (void *), KM_SLEEP); for (i = 0; i < sav->sav_count; i++) list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], B_FALSE, VDEV_CONFIG_L2CACHE); VERIFY(nvlist_add_nvlist_array(nvroot, config, list, sav->sav_count) == 0); for (i = 0; i < sav->sav_count; i++) nvlist_free(list[i]); kmem_free(list, sav->sav_count * sizeof (void *)); } spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); nvlist_free(nvroot); sav->sav_sync = B_FALSE; } static void spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) { nvlist_t *config; if (list_is_empty(&spa->spa_config_dirty_list)) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); config = spa_config_generate(spa, spa->spa_root_vdev, dmu_tx_get_txg(tx), B_FALSE); /* * If we're upgrading the spa version then make sure that * the config object gets updated with the correct version. */ if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa->spa_uberblock.ub_version); spa_config_exit(spa, SCL_STATE, FTAG); if (spa->spa_config_syncing) nvlist_free(spa->spa_config_syncing); spa->spa_config_syncing = config; spa_sync_nvlist(spa, spa->spa_config_object, config, tx); } static void spa_sync_version(void *arg, dmu_tx_t *tx) { uint64_t *versionp = arg; uint64_t version = *versionp; spa_t *spa = dmu_tx_pool(tx)->dp_spa; /* * Setting the version is special cased when first creating the pool. */ ASSERT(tx->tx_txg != TXG_INITIAL); ASSERT(SPA_VERSION_IS_SUPPORTED(version)); ASSERT(version >= spa_version(spa)); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "version=%lld", version); } /* * Set zpool properties. */ static void spa_sync_props(void *arg, dmu_tx_t *tx) { nvlist_t *nvp = arg; spa_t *spa = dmu_tx_pool(tx)->dp_spa; objset_t *mos = spa->spa_meta_objset; nvpair_t *elem = NULL; mutex_enter(&spa->spa_props_lock); while ((elem = nvlist_next_nvpair(nvp, elem))) { uint64_t intval; char *strval, *fname; zpool_prop_t prop; const char *propname; zprop_type_t proptype; spa_feature_t fid; switch (prop = zpool_name_to_prop(nvpair_name(elem))) { case ZPROP_INVAL: /* * We checked this earlier in spa_prop_validate(). */ ASSERT(zpool_prop_feature(nvpair_name(elem))); fname = strchr(nvpair_name(elem), '@') + 1; VERIFY0(zfeature_lookup_name(fname, &fid)); spa_feature_enable(spa, fid, tx); spa_history_log_internal(spa, "set", tx, "%s=enabled", nvpair_name(elem)); break; case ZPOOL_PROP_VERSION: intval = fnvpair_value_uint64(elem); /* * The version is synced seperatly before other * properties and should be correct by now. */ ASSERT3U(spa_version(spa), >=, intval); break; case ZPOOL_PROP_ALTROOT: /* * 'altroot' is a non-persistent property. It should * have been set temporarily at creation or import time. */ ASSERT(spa->spa_root != NULL); break; case ZPOOL_PROP_READONLY: case ZPOOL_PROP_CACHEFILE: /* * 'readonly' and 'cachefile' are also non-persisitent * properties. */ break; case ZPOOL_PROP_COMMENT: strval = fnvpair_value_string(elem); if (spa->spa_comment != NULL) spa_strfree(spa->spa_comment); spa->spa_comment = spa_strdup(strval); /* * We need to dirty the configuration on all the vdevs * so that their labels get updated. It's unnecessary * to do this for pool creation since the vdev's * configuratoin has already been dirtied. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); break; default: /* * Set pool property values in the poolprops mos object. */ if (spa->spa_pool_props_object == 0) { spa->spa_pool_props_object = zap_create_link(mos, DMU_OT_POOL_PROPS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, tx); } /* normalize the property name */ propname = zpool_prop_to_name(prop); proptype = zpool_prop_get_type(prop); if (nvpair_type(elem) == DATA_TYPE_STRING) { ASSERT(proptype == PROP_TYPE_STRING); strval = fnvpair_value_string(elem); VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 1, strlen(strval) + 1, strval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%s", nvpair_name(elem), strval); } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { intval = fnvpair_value_uint64(elem); if (proptype == PROP_TYPE_INDEX) { const char *unused; VERIFY0(zpool_prop_index_to_string( prop, intval, &unused)); } VERIFY0(zap_update(mos, spa->spa_pool_props_object, propname, 8, 1, &intval, tx)); spa_history_log_internal(spa, "set", tx, "%s=%lld", nvpair_name(elem), intval); } else { ASSERT(0); /* not allowed */ } switch (prop) { case ZPOOL_PROP_DELEGATION: spa->spa_delegation = intval; break; case ZPOOL_PROP_BOOTFS: spa->spa_bootfs = intval; break; case ZPOOL_PROP_FAILUREMODE: spa->spa_failmode = intval; break; case ZPOOL_PROP_AUTOEXPAND: spa->spa_autoexpand = intval; if (tx->tx_txg != TXG_INITIAL) spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); break; case ZPOOL_PROP_DEDUPDITTO: spa->spa_dedup_ditto = intval; break; default: break; } } } mutex_exit(&spa->spa_props_lock); } /* * Perform one-time upgrade on-disk changes. spa_version() does not * reflect the new version this txg, so there must be no changes this * txg to anything that the upgrade code depends on after it executes. * Therefore this must be called after dsl_pool_sync() does the sync * tasks. */ static void spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx) { dsl_pool_t *dp = spa->spa_dsl_pool; ASSERT(spa->spa_sync_pass == 1); rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN && spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) { dsl_pool_create_origin(dp, tx); /* Keeping the origin open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) { dsl_pool_upgrade_clones(dp, tx); } if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES && spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) { dsl_pool_upgrade_dir_clones(dp, tx); /* Keeping the freedir open increases spa_minref */ spa->spa_minref += 3; } if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES && spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { spa_feature_create_zap_objects(spa, tx); } /* * LZ4_COMPRESS feature's behaviour was changed to activate_on_enable * when possibility to use lz4 compression for metadata was added * Old pools that have this feature enabled must be upgraded to have * this feature active */ if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) { boolean_t lz4_en = spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS); boolean_t lz4_ac = spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS); if (lz4_en && !lz4_ac) spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx); } rrw_exit(&dp->dp_config_rwlock, FTAG); } /* * Sync the specified transaction group. New blocks may be dirtied as * part of the process, so we iterate until it converges. */ void spa_sync(spa_t *spa, uint64_t txg) { dsl_pool_t *dp = spa->spa_dsl_pool; objset_t *mos = spa->spa_meta_objset; bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd; dmu_tx_t *tx; int error; VERIFY(spa_writeable(spa)); /* * Lock out configuration changes. */ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa->spa_syncing_txg = txg; spa->spa_sync_pass = 0; /* * If there are any pending vdev state changes, convert them * into config changes that go out with this transaction group. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); while (list_head(&spa->spa_state_dirty_list) != NULL) { /* * We need the write lock here because, for aux vdevs, * calling vdev_config_dirty() modifies sav_config. * This is ugly and will become unnecessary when we * eliminate the aux vdev wart by integrating all vdevs * into the root vdev tree. */ spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { vdev_state_clean(vd); vdev_config_dirty(vd); } spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); tx = dmu_tx_create_assigned(dp, txg); spa->spa_sync_starttime = gethrtime(); #ifdef illumos VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, spa->spa_sync_starttime + spa->spa_deadman_synctime)); #else /* !illumos */ #ifdef _KERNEL callout_reset(&spa->spa_deadman_cycid, hz * spa->spa_deadman_synctime / NANOSEC, spa_deadman, spa); #endif #endif /* illumos */ /* * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, * set spa_deflate if we have no raid-z vdevs. */ if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { int i; for (i = 0; i < rvd->vdev_children; i++) { vd = rvd->vdev_child[i]; if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) break; } if (i == rvd->vdev_children) { spa->spa_deflate = TRUE; VERIFY(0 == zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, sizeof (uint64_t), 1, &spa->spa_deflate, tx)); } } /* * Iterate to convergence. */ do { int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); if (pass < zfs_sync_pass_deferred_free) { spa_sync_frees(spa, free_bpl, tx); } else { /* * We can not defer frees in pass 1, because * we sync the deferred frees later in pass 1. */ ASSERT3U(pass, >, 1); bplist_iterate(free_bpl, bpobj_enqueue_cb, &spa->spa_deferred_bpobj, tx); } ddt_sync(spa, txg); dsl_scan_sync(dp, tx); while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) vdev_sync(vd, txg); if (pass == 1) { spa_sync_upgrades(spa, tx); ASSERT3U(txg, >=, spa->spa_uberblock.ub_rootbp.blk_birth); /* * Note: We need to check if the MOS is dirty * because we could have marked the MOS dirty * without updating the uberblock (e.g. if we * have sync tasks but no dirty user data). We * need to check the uberblock's rootbp because * it is updated if we have synced out dirty * data (though in this case the MOS will most * likely also be dirty due to second order * effects, we don't want to rely on that here). */ if (spa->spa_uberblock.ub_rootbp.blk_birth < txg && !dmu_objset_is_dirty(mos, txg)) { /* * Nothing changed on the first pass, * therefore this TXG is a no-op. Avoid * syncing deferred frees, so that we * can keep this TXG as a no-op. */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); break; } spa_sync_deferred_frees(spa, tx); } } while (dmu_objset_is_dirty(mos, txg)); /* * Rewrite the vdev configuration (which includes the uberblock) * to commit the transaction group. * * If there are no dirty vdevs, we sync the uberblock to a few * random top-level vdevs that are known to be visible in the * config cache (see spa_vdev_add() for a complete description). * If there *are* dirty vdevs, sync the uberblock to all vdevs. */ for (;;) { /* * We hold SCL_STATE to prevent vdev open/close/etc. * while we're attempting to write the vdev labels. */ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); if (list_is_empty(&spa->spa_config_dirty_list)) { vdev_t *svd[SPA_DVAS_PER_BP]; int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; if (vd->vdev_ms_array == 0 || vd->vdev_islog) continue; svd[svdcount++] = vd; if (svdcount == SPA_DVAS_PER_BP) break; } error = vdev_config_sync(svd, svdcount, txg, B_FALSE); if (error != 0) error = vdev_config_sync(svd, svdcount, txg, B_TRUE); } else { error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg, B_FALSE); if (error != 0) error = vdev_config_sync(rvd->vdev_child, rvd->vdev_children, txg, B_TRUE); } if (error == 0) spa->spa_last_synced_guid = rvd->vdev_guid; spa_config_exit(spa, SCL_STATE, FTAG); if (error == 0) break; zio_suspend(spa, NULL); zio_resume_wait(spa); } dmu_tx_commit(tx); #ifdef illumos VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); #else /* !illumos */ #ifdef _KERNEL callout_drain(&spa->spa_deadman_cycid); #endif #endif /* illumos */ /* * Clear the dirty config list. */ while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) vdev_config_clean(vd); /* * Now that the new config has synced transactionally, * let it become visible to the config cache. */ if (spa->spa_config_syncing != NULL) { spa_config_set(spa, spa->spa_config_syncing); spa->spa_config_txg = txg; spa->spa_config_syncing = NULL; } spa->spa_ubsync = spa->spa_uberblock; dsl_pool_sync_done(dp, txg); /* * Update usable space statistics. */ while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) vdev_sync_done(vd, txg); spa_update_dspace(spa); /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). */ ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); spa->spa_sync_pass = 0; spa_config_exit(spa, SCL_CONFIG, FTAG); spa_handle_ignored_writes(spa); /* * If any async tasks have been requested, kick them off. */ spa_async_dispatch(spa); spa_async_dispatch_vd(spa); } /* * Sync all pools. We don't want to hold the namespace lock across these * operations, so we take a reference on the spa_t and drop the lock during the * sync. */ void spa_sync_allpools(void) { spa_t *spa = NULL; mutex_enter(&spa_namespace_lock); while ((spa = spa_next(spa)) != NULL) { if (spa_state(spa) != POOL_STATE_ACTIVE || !spa_writeable(spa) || spa_suspended(spa)) continue; spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); txg_wait_synced(spa_get_dsl(spa), 0); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); } mutex_exit(&spa_namespace_lock); } /* * ========================================================================== * Miscellaneous routines * ========================================================================== */ /* * Remove all pools in the system. */ void spa_evict_all(void) { spa_t *spa; /* * Remove all cached state. All pools should be closed now, * so every spa in the AVL tree should be unreferenced. */ mutex_enter(&spa_namespace_lock); while ((spa = spa_next(NULL)) != NULL) { /* * Stop async tasks. The async thread may need to detach * a device that's been replaced, which requires grabbing * spa_namespace_lock, so we must drop it here. */ spa_open_ref(spa, FTAG); mutex_exit(&spa_namespace_lock); spa_async_suspend(spa); mutex_enter(&spa_namespace_lock); spa_close(spa, FTAG); if (spa->spa_state != POOL_STATE_UNINITIALIZED) { spa_unload(spa); spa_deactivate(spa); } spa_remove(spa); } mutex_exit(&spa_namespace_lock); } vdev_t * spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } for (i = 0; i < spa->spa_spares.sav_count; i++) { vd = spa->spa_spares.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } } return (NULL); } void spa_upgrade(spa_t *spa, uint64_t version) { ASSERT(spa_writeable(spa)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* * This should only be called for a non-faulted pool, and since a * future version would result in an unopenable pool, this shouldn't be * possible. */ ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); ASSERT3U(version, >=, spa->spa_uberblock.ub_version); spa->spa_uberblock.ub_version = version; vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa_get_dsl(spa), 0); } boolean_t spa_has_spare(spa_t *spa, uint64_t guid) { int i; uint64_t spareguid; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) if (sav->sav_vdevs[i]->vdev_guid == guid) return (B_TRUE); for (i = 0; i < sav->sav_npending; i++) { if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, &spareguid) == 0 && spareguid == guid) return (B_TRUE); } return (B_FALSE); } /* * Check if a pool has an active shared spare device. * Note: reference count of an active spare is 2, as a spare and as a replace */ static boolean_t spa_has_active_shared_spare(spa_t *spa) { int i, refcnt; uint64_t pool; spa_aux_vdev_t *sav = &spa->spa_spares; for (i = 0; i < sav->sav_count; i++) { if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, &refcnt) && pool != 0ULL && pool == spa_guid(spa) && refcnt > 2) return (B_TRUE); } return (B_FALSE); } /* * Post a sysevent corresponding to the given event. The 'name' must be one of * the event definitions in sys/sysevent/eventdefs.h. The payload will be * filled in from the spa and (optionally) the vdev. This doesn't do anything * in the userland libzpool, as we don't want consumers to misinterpret ztest * or zdb as real changes. */ void spa_event_notify(spa_t *spa, vdev_t *vd, const char *name) { #ifdef _KERNEL sysevent_t *ev; sysevent_attr_list_t *attr = NULL; sysevent_value_t value; sysevent_id_t eid; ev = sysevent_alloc(EC_ZFS, (char *)name, SUNW_KERN_PUB "zfs", SE_SLEEP); value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = spa_name(spa); if (sysevent_add_attr(&attr, ZFS_EV_POOL_NAME, &value, SE_SLEEP) != 0) goto done; value.value_type = SE_DATA_TYPE_UINT64; value.value.sv_uint64 = spa_guid(spa); if (sysevent_add_attr(&attr, ZFS_EV_POOL_GUID, &value, SE_SLEEP) != 0) goto done; if (vd) { value.value_type = SE_DATA_TYPE_UINT64; value.value.sv_uint64 = vd->vdev_guid; if (sysevent_add_attr(&attr, ZFS_EV_VDEV_GUID, &value, SE_SLEEP) != 0) goto done; if (vd->vdev_path) { value.value_type = SE_DATA_TYPE_STRING; value.value.sv_string = vd->vdev_path; if (sysevent_add_attr(&attr, ZFS_EV_VDEV_PATH, &value, SE_SLEEP) != 0) goto done; } } if (sysevent_attach_attributes(ev, attr) != 0) goto done; attr = NULL; (void) log_sysevent(ev, SE_SLEEP, &eid); done: if (attr) sysevent_free_attr(attr); sysevent_free(ev); #endif } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_misc.c (revision 286575) @@ -1,2065 +1,2113 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfeature_common.h" /* * SPA locking * * There are four basic locks for managing spa_t structures: * * spa_namespace_lock (global mutex) * * This lock must be acquired to do any of the following: * * - Lookup a spa_t by name * - Add or remove a spa_t from the namespace * - Increase spa_refcount from non-zero * - Check if spa_refcount is zero * - Rename a spa_t * - add/remove/attach/detach devices * - Held for the duration of create/destroy/import/export * * It does not need to handle recursion. A create or destroy may * reference objects (files or zvols) in other pools, but by * definition they must have an existing reference, and will never need * to lookup a spa_t by name. * * spa_refcount (per-spa refcount_t protected by mutex) * * This reference count keep track of any active users of the spa_t. The * spa_t cannot be destroyed or freed while this is non-zero. Internally, * the refcount is never really 'zero' - opening a pool implicitly keeps * some references in the DMU. Internally we check against spa_minref, but * present the image of a zero/non-zero value to consumers. * * spa_config_lock[] (per-spa array of rwlocks) * * This protects the spa_t from config changes, and must be held in * the following circumstances: * * - RW_READER to perform I/O to the spa * - RW_WRITER to change the vdev config * * The locking order is fairly straightforward: * * spa_namespace_lock -> spa_refcount * * The namespace lock must be acquired to increase the refcount from 0 * or to check if it is zero. * * spa_refcount -> spa_config_lock[] * * There must be at least one valid reference on the spa_t to acquire * the config lock. * * spa_namespace_lock -> spa_config_lock[] * * The namespace lock must always be taken before the config lock. * * * The spa_namespace_lock can be acquired directly and is globally visible. * * The namespace is manipulated using the following functions, all of which * require the spa_namespace_lock to be held. * * spa_lookup() Lookup a spa_t by name. * * spa_add() Create a new spa_t in the namespace. * * spa_remove() Remove a spa_t from the namespace. This also * frees up any memory associated with the spa_t. * * spa_next() Returns the next spa_t in the system, or the * first if NULL is passed. * * spa_evict_all() Shutdown and remove all spa_t structures in * the system. * * spa_guid_exists() Determine whether a pool/device guid exists. * * The spa_refcount is manipulated using the following functions: * * spa_open_ref() Adds a reference to the given spa_t. Must be * called with spa_namespace_lock held if the * refcount is currently zero. * * spa_close() Remove a reference from the spa_t. This will * not free the spa_t or remove it from the * namespace. No locking is required. * * spa_refcount_zero() Returns true if the refcount is currently * zero. Must be called with spa_namespace_lock * held. * * The spa_config_lock[] is an array of rwlocks, ordered as follows: * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}(). * * To read the configuration, it suffices to hold one of these locks as reader. * To modify the configuration, you must hold all locks as writer. To modify * vdev state without altering the vdev tree's topology (e.g. online/offline), * you must hold SCL_STATE and SCL_ZIO as writer. * * We use these distinct config locks to avoid recursive lock entry. * For example, spa_sync() (which holds SCL_CONFIG as reader) induces * block allocations (SCL_ALLOC), which may require reading space maps * from disk (dmu_read() -> zio_read() -> SCL_ZIO). * * The spa config locks cannot be normal rwlocks because we need the * ability to hand off ownership. For example, SCL_ZIO is acquired * by the issuing thread and later released by an interrupt thread. * They do, however, obey the usual write-wanted semantics to prevent * writer (i.e. system administrator) starvation. * * The lock acquisition rules are as follows: * * SCL_CONFIG * Protects changes to the vdev tree topology, such as vdev * add/remove/attach/detach. Protects the dirty config list * (spa_config_dirty_list) and the set of spares and l2arc devices. * * SCL_STATE * Protects changes to pool state and vdev state, such as vdev * online/offline/fault/degrade/clear. Protects the dirty state list * (spa_state_dirty_list) and global pool state (spa_state). * * SCL_ALLOC * Protects changes to metaslab groups and classes. * Held as reader by metaslab_alloc() and metaslab_claim(). * * SCL_ZIO * Held by bp-level zios (those which have no io_vd upon entry) * to prevent changes to the vdev tree. The bp-level zio implicitly * protects all of its vdev child zios, which do not hold SCL_ZIO. * * SCL_FREE * Protects changes to metaslab groups and classes. * Held as reader by metaslab_free(). SCL_FREE is distinct from * SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free * blocks in zio_done() while another i/o that holds either * SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete. * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. * * In addition, the following rules apply: * * (a) spa_props_lock protects pool properties, spa_config and spa_config_list. * The lock ordering is SCL_CONFIG > spa_props_lock. * * (b) I/O operations on leaf vdevs. For any zio operation that takes * an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(), * or zio_write_phys() -- the caller must ensure that the config cannot * cannot change in the interim, and that the vdev cannot be reopened. * SCL_STATE as reader suffices for both. * * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit(). * * spa_vdev_enter() Acquire the namespace lock and the config lock * for writing. * * spa_vdev_exit() Release the config lock, wait for all I/O * to complete, sync the updated configs to the * cache, and release the namespace lock. * * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit(). * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual * locking is, always, based on spa_namespace_lock and spa_config_lock[]. * * spa_rename() is also implemented within this file since it requires * manipulation of the namespace. */ static avl_tree_t spa_namespace_avl; kmutex_t spa_namespace_lock; static kcondvar_t spa_namespace_cv; static int spa_active_count; int spa_max_replication_override = SPA_DVAS_PER_BP; static kmutex_t spa_spare_lock; static avl_tree_t spa_spare_avl; static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; int spa_mode_global; #ifdef ZFS_DEBUG /* Everything except dprintf and spa is on by default in debug builds */ int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SPA); #else int zfs_flags = 0; #endif /* * zfs_recover can be set to nonzero to attempt to recover from * otherwise-fatal errors, typically caused by on-disk corruption. When * set, calls to zfs_panic_recover() will turn into warning messages. * This should only be used as a last resort, as it typically results * in leaked space, or worse. */ boolean_t zfs_recover = B_FALSE; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, recover, CTLFLAG_RDTUN, &zfs_recover, 0, "Try to recover from otherwise-fatal errors."); static int sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) { int err, val; val = zfs_flags; err = sysctl_handle_int(oidp, &val, 0, req); if (err != 0 || req->newptr == NULL) return (err); /* * ZFS_DEBUG_MODIFY must be enabled prior to boot so all * arc buffers in the system have the necessary additional * checksum data. However, it is safe to disable at any * time. */ if (!(zfs_flags & ZFS_DEBUG_MODIFY)) val &= ~ZFS_DEBUG_MODIFY; zfs_flags = val; return (0); } SYSCTL_PROC(_vfs_zfs, OID_AUTO, debug_flags, CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, 0, sizeof(int), sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); /* * If destroy encounters an EIO while reading metadata (e.g. indirect * blocks), space referenced by the missing metadata can not be freed. * Normally this causes the background destroy to become "stalled", as * it is unable to make forward progress. While in this stalled state, * all remaining space to free from the error-encountering filesystem is * "temporarily leaked". Set this flag to cause it to ignore the EIO, * permanently leak the space from indirect blocks that can not be read, * and continue to free everything else that it can. * * The default, "stalling" behavior is useful if the storage partially * fails (i.e. some but not all i/os fail), and then later recovers. In * this case, we will be able to continue pool operations while it is * partially failed, and when it recovers, we can continue to free the * space, with no leaks. However, note that this case is actually * fairly rare. * * Typically pools either (a) fail completely (but perhaps temporarily, * e.g. a top-level vdev going offline), or (b) have localized, * permanent errors (e.g. disk returns the wrong data due to bit flip or * firmware bug). In case (a), this setting does not matter because the * pool will be suspended and the sync thread will not be able to make * forward progress regardless. In case (b), because the error is * permanent, the best we can do is leak the minimum amount of space, * which is what setting this flag will do. Therefore, it is reasonable * for this flag to normally be set, but we chose the more conservative * approach of not setting it, so that there is no possibility of * leaking space in the "partial temporary" failure case. */ boolean_t zfs_free_leak_on_eio = B_FALSE; /* * Expiration time in milliseconds. This value has two meanings. First it is * used to determine when the spa_deadman() logic should fire. By default the * spa_deadman() will fire if spa_sync() has not completed in 1000 seconds. * Secondly, the value determines if an I/O is considered "hung". Any I/O that * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting * in a system panic. */ uint64_t zfs_deadman_synctime_ms = 1000000ULL; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_synctime_ms, CTLFLAG_RDTUN, &zfs_deadman_synctime_ms, 0, "Stalled ZFS I/O expiration time in milliseconds"); /* * Check time in milliseconds. This defines the frequency at which we check * for hung I/O. */ uint64_t zfs_deadman_checktime_ms = 5000ULL; SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, deadman_checktime_ms, CTLFLAG_RDTUN, &zfs_deadman_checktime_ms, 0, "Period of checks for stalled ZFS I/O in milliseconds"); /* * Default value of -1 for zfs_deadman_enabled is resolved in * zfs_deadman_init() */ int zfs_deadman_enabled = -1; SYSCTL_INT(_vfs_zfs, OID_AUTO, deadman_enabled, CTLFLAG_RDTUN, &zfs_deadman_enabled, 0, "Kernel panic on stalled ZFS I/O"); /* * The worst case is single-sector max-parity RAID-Z blocks, in which * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) * times the size; so just assume that. Add to this the fact that * we can have up to 3 DVAs per bp, and one more factor of 2 because * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, * the worst case is: * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 */ int spa_asize_inflation = 24; SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_asize_inflation, CTLFLAG_RWTUN, &spa_asize_inflation, 0, "Worst case inflation factor for single sector writes"); #ifndef illumos #ifdef _KERNEL static void zfs_deadman_init() { /* * If we are not i386 or amd64 or in a virtual machine, * disable ZFS deadman thread by default */ if (zfs_deadman_enabled == -1) { #if defined(__amd64__) || defined(__i386__) zfs_deadman_enabled = (vm_guest == VM_GUEST_NO) ? 1 : 0; #else zfs_deadman_enabled = 0; #endif } } #endif /* _KERNEL */ #endif /* !illumos */ /* * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in * the pool to be consumed. This ensures that we don't run the pool * completely out of space, due to unaccounted changes (e.g. to the MOS). * It also limits the worst-case time to allocate space. If we have * less than this amount of free space, most ZPL operations (e.g. write, * create) will return ENOSPC. * * Certain operations (e.g. file removal, most administrative actions) can * use half the slop space. They will only return ENOSPC if less than half * the slop space is free. Typically, once the pool has less than the slop * space free, the user will use these operations to free up space in the pool. * These are the operations that call dsl_pool_adjustedsize() with the netfree * argument set to TRUE. * * A very restricted set of operations are always permitted, regardless of * the amount of free space. These are the operations that call * dsl_sync_task(ZFS_SPACE_CHECK_NONE), e.g. "zfs destroy". If these * operations result in a net increase in the amount of space used, * it is possible to run the pool completely out of space, causing it to * be permanently read-only. * * See also the comments in zfs_space_check_t. */ int spa_slop_shift = 5; SYSCTL_INT(_vfs_zfs, OID_AUTO, spa_slop_shift, CTLFLAG_RWTUN, &spa_slop_shift, 0, "Shift value of reserved space (1/(2^spa_slop_shift))."); /* * ========================================================================== * SPA config locking * ========================================================================== */ static void spa_config_lock_init(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); refcount_create_untracked(&scl->scl_count); scl->scl_writer = NULL; scl->scl_write_wanted = 0; } } static void spa_config_lock_destroy(spa_t *spa) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; mutex_destroy(&scl->scl_lock); cv_destroy(&scl->scl_cv); refcount_destroy(&scl->scl_count); ASSERT(scl->scl_writer == NULL); ASSERT(scl->scl_write_wanted == 0); } } int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) { for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { if (scl->scl_writer || scl->scl_write_wanted) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks ^ (1 << i), tag); return (0); } } else { ASSERT(scl->scl_writer != curthread); if (!refcount_is_zero(&scl->scl_count)) { mutex_exit(&scl->scl_lock); spa_config_exit(spa, locks ^ (1 << i), tag); return (0); } scl->scl_writer = curthread; } (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } return (1); } void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) { int wlocks_held = 0; ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (scl->scl_writer == curthread) wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); if (rw == RW_READER) { while (scl->scl_writer || scl->scl_write_wanted) { cv_wait(&scl->scl_cv, &scl->scl_lock); } } else { ASSERT(scl->scl_writer != curthread); while (!refcount_is_zero(&scl->scl_count)) { scl->scl_write_wanted++; cv_wait(&scl->scl_cv, &scl->scl_lock); scl->scl_write_wanted--; } scl->scl_writer = curthread; } (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } ASSERT(wlocks_held <= locks); } void spa_config_exit(spa_t *spa, int locks, void *tag) { for (int i = SCL_LOCKS - 1; i >= 0; i--) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); ASSERT(!refcount_is_zero(&scl->scl_count)); if (refcount_remove(&scl->scl_count, tag) == 0) { ASSERT(scl->scl_writer == NULL || scl->scl_writer == curthread); scl->scl_writer = NULL; /* OK in either case */ cv_broadcast(&scl->scl_cv); } mutex_exit(&scl->scl_lock); } } int spa_config_held(spa_t *spa, int locks, krw_t rw) { int locks_held = 0; for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; if (!(locks & (1 << i))) continue; if ((rw == RW_READER && !refcount_is_zero(&scl->scl_count)) || (rw == RW_WRITER && scl->scl_writer == curthread)) locks_held |= 1 << i; } return (locks_held); } /* * ========================================================================== * SPA namespace functions * ========================================================================== */ /* * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. * Returns NULL if no matching spa_t is found. */ spa_t * spa_lookup(const char *name) { static spa_t search; /* spa_t is large; don't allocate on stack */ spa_t *spa; avl_index_t where; char *cp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); /* * If it's a full dataset name, figure out the pool name and * just use that. */ cp = strpbrk(search.spa_name, "/@#"); if (cp != NULL) *cp = '\0'; spa = avl_find(&spa_namespace_avl, &search, &where); return (spa); } /* * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. * If the zfs_deadman_enabled flag is set then it inspects all vdev queues * looking for potentially hung I/Os. */ void spa_deadman(void *arg) { spa_t *spa = arg; /* * Disable the deadman timer if the pool is suspended. */ if (spa_suspended(spa)) { #ifdef illumos VERIFY(cyclic_reprogram(spa->spa_deadman_cycid, CY_INFINITY)); #else /* Nothing. just don't schedule any future callouts. */ #endif return; } zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", (gethrtime() - spa->spa_sync_starttime) / NANOSEC, ++spa->spa_deadman_calls); if (zfs_deadman_enabled) vdev_deadman(spa->spa_root_vdev); #ifdef __FreeBSD__ #ifdef _KERNEL callout_schedule(&spa->spa_deadman_cycid, hz * zfs_deadman_checktime_ms / MILLISEC); #endif #endif } /* * Create an uninitialized spa_t with the given name. Requires * spa_namespace_lock. The caller must ensure that the spa_t doesn't already * exist by calling spa_lookup() first. */ spa_t * spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; #ifdef illumos cyc_handler_t hdlr; cyc_time_t when; #endif ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); for (int t = 0; t < TXG_SIZE; t++) bplist_create(&spa->spa_free_bplist[t]); (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; spa->spa_load_max_txg = UINT64_MAX; spa->spa_proc = &p0; spa->spa_proc_state = SPA_PROC_NONE; #ifdef illumos hdlr.cyh_func = spa_deadman; hdlr.cyh_arg = spa; hdlr.cyh_level = CY_LOW_LEVEL; #endif spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); #ifdef illumos /* * This determines how often we need to check for hung I/Os after * the cyclic has already fired. Since checking for hung I/Os is * an expensive operation we don't want to check too frequently. * Instead wait for 5 seconds before checking again. */ when.cyt_interval = MSEC2NSEC(zfs_deadman_checktime_ms); when.cyt_when = CY_INFINITY; mutex_enter(&cpu_lock); spa->spa_deadman_cycid = cyclic_add(&hdlr, &when); mutex_exit(&cpu_lock); #else /* !illumos */ #ifdef _KERNEL callout_init(&spa->spa_deadman_cycid, 1); #endif #endif refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); avl_add(&spa_namespace_avl, spa); /* * Set the alternate root, if there is one. */ if (altroot) { spa->spa_root = spa_strdup(altroot); spa_active_count++; } /* * Every pool starts with the default cachefile */ list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), offsetof(spa_config_dirent_t, scd_link)); dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); dp->scd_path = altroot ? NULL : spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (config != NULL) { nvlist_t *features; if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, &features) == 0) { VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); } VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); } if (spa->spa_label_features == NULL) { VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, KM_SLEEP) == 0); } spa->spa_debug = ((zfs_flags & ZFS_DEBUG_SPA) != 0); spa->spa_min_ashift = INT_MAX; spa->spa_max_ashift = 0; /* * As a pool is being created, treat all features as disabled by * setting SPA_FEATURE_DISABLED for all entries in the feature * refcount cache. */ for (int i = 0; i < SPA_FEATURES; i++) { spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; } return (spa); } /* * Removes a spa_t from the namespace, freeing up any memory used. Requires * spa_namespace_lock. This is called only after the spa_t has been closed and * deactivated. */ void spa_remove(spa_t *spa) { spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + ASSERT3U(refcount_count(&spa->spa_refcount), ==, 0); nvlist_free(spa->spa_config_splitting); avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); if (spa->spa_root) { spa_strfree(spa->spa_root); spa_active_count--; } while ((dp = list_head(&spa->spa_config_list)) != NULL) { list_remove(&spa->spa_config_list, dp); if (dp->scd_path != NULL) spa_strfree(dp->scd_path); kmem_free(dp, sizeof (spa_config_dirent_t)); } list_destroy(&spa->spa_config_list); nvlist_free(spa->spa_label_features); nvlist_free(spa->spa_load_info); spa_config_set(spa, NULL); #ifdef illumos mutex_enter(&cpu_lock); if (spa->spa_deadman_cycid != CYCLIC_NONE) cyclic_remove(spa->spa_deadman_cycid); mutex_exit(&cpu_lock); spa->spa_deadman_cycid = CYCLIC_NONE; #else /* !illumos */ #ifdef _KERNEL callout_drain(&spa->spa_deadman_cycid); #endif #endif refcount_destroy(&spa->spa_refcount); spa_config_lock_destroy(spa); for (int t = 0; t < TXG_SIZE; t++) bplist_destroy(&spa->spa_free_bplist[t]); cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_evicting_os_cv); cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); mutex_destroy(&spa->spa_async_lock); mutex_destroy(&spa->spa_errlist_lock); mutex_destroy(&spa->spa_errlog_lock); + mutex_destroy(&spa->spa_evicting_os_lock); mutex_destroy(&spa->spa_history_lock); mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); mutex_destroy(&spa->spa_vdev_top_lock); kmem_free(spa, sizeof (spa_t)); } /* * Given a pool, return the next pool in the namespace, or NULL if there is * none. If 'prev' is NULL, return the first pool. */ spa_t * spa_next(spa_t *prev) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); if (prev) return (AVL_NEXT(&spa_namespace_avl, prev)); else return (avl_first(&spa_namespace_avl)); } /* * ========================================================================== * SPA refcount functions * ========================================================================== */ /* * Add a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_open_ref(spa_t *spa, void *tag) { ASSERT(refcount_count(&spa->spa_refcount) >= spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) refcount_add(&spa->spa_refcount, tag); } /* * Remove a reference to the given spa_t. Must have at least one reference, or * have the namespace lock held. */ void spa_close(spa_t *spa, void *tag) { ASSERT(refcount_count(&spa->spa_refcount) > spa->spa_minref || MUTEX_HELD(&spa_namespace_lock)); (void) refcount_remove(&spa->spa_refcount, tag); } /* + * Remove a reference to the given spa_t held by a dsl dir that is + * being asynchronously released. Async releases occur from a taskq + * performing eviction of dsl datasets and dirs. The namespace lock + * isn't held and the hold by the object being evicted may contribute to + * spa_minref (e.g. dataset or directory released during pool export), + * so the asserts in spa_close() do not apply. + */ +void +spa_async_close(spa_t *spa, void *tag) +{ + (void) refcount_remove(&spa->spa_refcount, tag); +} + +/* * Check to see if the spa refcount is zero. Must be called with * spa_namespace_lock held. We really compare against spa_minref, which is the * number of references acquired when opening a pool */ boolean_t spa_refcount_zero(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); return (refcount_count(&spa->spa_refcount) == spa->spa_minref); } /* * ========================================================================== * SPA spare and l2cache tracking * ========================================================================== */ /* * Hot spares and cache devices are tracked using the same code below, * for 'auxiliary' devices. */ typedef struct spa_aux { uint64_t aux_guid; uint64_t aux_pool; avl_node_t aux_avl; int aux_count; } spa_aux_t; static int spa_aux_compare(const void *a, const void *b) { const spa_aux_t *sa = a; const spa_aux_t *sb = b; if (sa->aux_guid < sb->aux_guid) return (-1); else if (sa->aux_guid > sb->aux_guid) return (1); else return (0); } void spa_aux_add(vdev_t *vd, avl_tree_t *avl) { avl_index_t where; spa_aux_t search; spa_aux_t *aux; search.aux_guid = vd->vdev_guid; if ((aux = avl_find(avl, &search, &where)) != NULL) { aux->aux_count++; } else { aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP); aux->aux_guid = vd->vdev_guid; aux->aux_count = 1; avl_insert(avl, aux, where); } } void spa_aux_remove(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search; spa_aux_t *aux; avl_index_t where; search.aux_guid = vd->vdev_guid; aux = avl_find(avl, &search, &where); ASSERT(aux != NULL); if (--aux->aux_count == 0) { avl_remove(avl, aux); kmem_free(aux, sizeof (spa_aux_t)); } else if (aux->aux_pool == spa_guid(vd->vdev_spa)) { aux->aux_pool = 0ULL; } } boolean_t spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl) { spa_aux_t search, *found; search.aux_guid = guid; found = avl_find(avl, &search, NULL); if (pool) { if (found) *pool = found->aux_pool; else *pool = 0ULL; } if (refcnt) { if (found) *refcnt = found->aux_count; else *refcnt = 0; } return (found != NULL); } void spa_aux_activate(vdev_t *vd, avl_tree_t *avl) { spa_aux_t search, *found; avl_index_t where; search.aux_guid = vd->vdev_guid; found = avl_find(avl, &search, &where); ASSERT(found != NULL); ASSERT(found->aux_pool == 0ULL); found->aux_pool = spa_guid(vd->vdev_spa); } /* * Spares are tracked globally due to the following constraints: * * - A spare may be part of multiple pools. * - A spare may be added to a pool even if it's actively in use within * another pool. * - A spare in use in any pool can only be the source of a replacement if * the target is a spare in the same pool. * * We keep track of all spares on the system through the use of a reference * counted AVL tree. When a vdev is added as a spare, or used as a replacement * spare, then we bump the reference count in the AVL tree. In addition, we set * the 'vdev_isspare' member to indicate that the device is a spare (active or * inactive). When a spare is made active (used to replace a device in the * pool), we also keep track of which pool its been made a part of. * * The 'spa_spare_lock' protects the AVL tree. These functions are normally * called under the spa_namespace lock as part of vdev reconfiguration. The * separate spare lock exists for the status query path, which does not need to * be completely consistent with respect to other vdev configuration changes. */ static int spa_spare_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_spare_add(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(!vd->vdev_isspare); spa_aux_add(vd, &spa_spare_avl); vd->vdev_isspare = B_TRUE; mutex_exit(&spa_spare_lock); } void spa_spare_remove(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_remove(vd, &spa_spare_avl); vd->vdev_isspare = B_FALSE; mutex_exit(&spa_spare_lock); } boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) { boolean_t found; mutex_enter(&spa_spare_lock); found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); mutex_exit(&spa_spare_lock); return (found); } void spa_spare_activate(vdev_t *vd) { mutex_enter(&spa_spare_lock); ASSERT(vd->vdev_isspare); spa_aux_activate(vd, &spa_spare_avl); mutex_exit(&spa_spare_lock); } /* * Level 2 ARC devices are tracked globally for the same reasons as spares. * Cache devices currently only support one pool per cache device, and so * for these devices the aux reference count is currently unused beyond 1. */ static int spa_l2cache_compare(const void *a, const void *b) { return (spa_aux_compare(a, b)); } void spa_l2cache_add(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(!vd->vdev_isl2cache); spa_aux_add(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_TRUE; mutex_exit(&spa_l2cache_lock); } void spa_l2cache_remove(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_remove(vd, &spa_l2cache_avl); vd->vdev_isl2cache = B_FALSE; mutex_exit(&spa_l2cache_lock); } boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool) { boolean_t found; mutex_enter(&spa_l2cache_lock); found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); return (found); } void spa_l2cache_activate(vdev_t *vd) { mutex_enter(&spa_l2cache_lock); ASSERT(vd->vdev_isl2cache); spa_aux_activate(vd, &spa_l2cache_avl); mutex_exit(&spa_l2cache_lock); } /* * ========================================================================== * SPA vdev locking * ========================================================================== */ /* * Lock the given spa_t for the purpose of adding or removing a vdev. * Grabs the global spa_namespace_lock plus the spa config lock for writing. * It returns the next transaction group for the spa_t. */ uint64_t spa_vdev_enter(spa_t *spa) { mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); return (spa_vdev_config_enter(spa)); } /* * Internal implementation for spa_vdev_enter(). Used when a vdev * operation requires multiple syncs (i.e. removing a device) while * keeping the spa_namespace_lock held. */ uint64_t spa_vdev_config_enter(spa_t *spa) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); return (spa_last_synced_txg(spa) + 1); } /* * Used in combination with spa_vdev_config_enter() to allow the syncing * of multiple transactions without releasing the spa_namespace_lock. */ void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); spa->spa_pending_vdev = NULL; /* * Reassess the DTLs. */ vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE); if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { config_changed = B_TRUE; spa->spa_config_generation++; } /* * Verify the metaslab classes. */ ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); spa_config_exit(spa, SCL_ALL, spa); /* * Panic the system if the specified tag requires it. This * is useful for ensuring that configurations are updated * transactionally. */ if (zio_injection_enabled) zio_handle_panic_injection(spa, tag, 0); /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. * This allows us to use the txg as the generation number. */ if (error == 0) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); spa_config_exit(spa, SCL_ALL, spa); } /* * If the config changed, update the config cache. */ if (config_changed) spa_config_sync(spa, B_FALSE, B_TRUE); } /* * Unlock the spa_t after adding or removing a vdev. Besides undoing the * locking of spa_vdev_enter(), we also want make sure the transactions have * synced to disk, and then update the global configuration cache with the new * information. */ int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) { spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); mutex_exit(&spa->spa_vdev_top_lock); return (error); } /* * Lock the given spa_t for the purpose of changing vdev state. */ void spa_vdev_state_enter(spa_t *spa, int oplocks) { int locks = SCL_STATE_ALL | oplocks; /* * Root pools may need to read of the underlying devfs filesystem * when opening up a vdev. Unfortunately if we're holding the * SCL_ZIO lock it will result in a deadlock when we try to issue * the read from the root filesystem. Instead we "prefetch" * the associated vnodes that we need prior to opening the * underlying devices and cache them so that we can prevent * any I/O when we are doing the actual open. */ if (spa_is_root(spa)) { int low = locks & ~(SCL_ZIO - 1); int high = locks & ~low; spa_config_enter(spa, high, spa, RW_WRITER); vdev_hold(spa->spa_root_vdev); spa_config_enter(spa, low, spa, RW_WRITER); } else { spa_config_enter(spa, locks, spa, RW_WRITER); } spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { boolean_t config_changed = B_FALSE; if (vd != NULL || error == 0) vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, 0, 0, B_FALSE); if (vd != NULL) { vdev_state_dirty(vd->vdev_top); config_changed = B_TRUE; spa->spa_config_generation++; } if (spa_is_root(spa)) vdev_rele(spa->spa_root_vdev); ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); spa_config_exit(spa, spa->spa_vdev_locks, spa); /* * If anything changed, wait for it to sync. This ensures that, * from the system administrator's perspective, zpool(1M) commands * are synchronous. This is important for things like zpool offline: * when the command completes, you expect no further I/O from ZFS. */ if (vd != NULL) txg_wait_synced(spa->spa_dsl_pool, 0); /* * If the config changed, update the config cache. */ if (config_changed) { mutex_enter(&spa_namespace_lock); spa_config_sync(spa, B_FALSE, B_TRUE); mutex_exit(&spa_namespace_lock); } return (error); } /* * ========================================================================== * Miscellaneous functions * ========================================================================== */ void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) { if (!nvlist_exists(spa->spa_label_features, feature)) { fnvlist_add_boolean(spa->spa_label_features, feature); /* * When we are creating the pool (tx_txg==TXG_INITIAL), we can't * dirty the vdev config because lock SCL_CONFIG is not held. * Thankfully, in this case we don't need to dirty the config * because it will be written out anyway when we finish * creating the pool. */ if (tx->tx_txg != TXG_INITIAL) vdev_config_dirty(spa->spa_root_vdev); } } void spa_deactivate_mos_feature(spa_t *spa, const char *feature) { if (nvlist_remove_all(spa->spa_label_features, feature) == 0) vdev_config_dirty(spa->spa_root_vdev); } /* * Rename a spa_t. */ int spa_rename(const char *name, const char *newname) { spa_t *spa; int err; /* * Lookup the spa_t and grab the config lock for writing. We need to * actually open the pool so that we can sync out the necessary labels. * It's OK to call spa_open() with the namespace lock held because we * allow recursive calls for other reasons. */ mutex_enter(&spa_namespace_lock); if ((err = spa_open(name, &spa, FTAG)) != 0) { mutex_exit(&spa_namespace_lock); return (err); } spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); avl_remove(&spa_namespace_avl, spa); (void) strlcpy(spa->spa_name, newname, sizeof (spa->spa_name)); avl_add(&spa_namespace_avl, spa); /* * Sync all labels to disk with the new names by marking the root vdev * dirty and waiting for it to sync. It will pick up the new pool name * during the sync. */ vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_ALL, FTAG); txg_wait_synced(spa->spa_dsl_pool, 0); /* * Sync the updated config cache. */ spa_config_sync(spa, B_FALSE, B_TRUE); spa_close(spa, FTAG); mutex_exit(&spa_namespace_lock); return (0); } /* * Return the spa_t associated with given pool_guid, if it exists. If * device_guid is non-zero, determine whether the pool exists *and* contains * a device with the specified device_guid. */ spa_t * spa_by_guid(uint64_t pool_guid, uint64_t device_guid) { spa_t *spa; avl_tree_t *t = &spa_namespace_avl; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { if (spa->spa_state == POOL_STATE_UNINITIALIZED) continue; if (spa->spa_root_vdev == NULL) continue; if (spa_guid(spa) == pool_guid) { if (device_guid == 0) break; if (vdev_lookup_by_guid(spa->spa_root_vdev, device_guid) != NULL) break; /* * Check any devices we may be in the process of adding. */ if (spa->spa_pending_vdev) { if (vdev_lookup_by_guid(spa->spa_pending_vdev, device_guid) != NULL) break; } } } return (spa); } /* * Determine whether a pool with the given pool_guid exists. */ boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) { return (spa_by_guid(pool_guid, device_guid) != NULL); } char * spa_strdup(const char *s) { size_t len; char *new; len = strlen(s); new = kmem_alloc(len + 1, KM_SLEEP); bcopy(s, new, len); new[len] = '\0'; return (new); } void spa_strfree(char *s) { kmem_free(s, strlen(s) + 1); } uint64_t spa_get_random(uint64_t range) { uint64_t r; ASSERT(range != 0); (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); return (r % range); } uint64_t spa_generate_guid(spa_t *spa) { uint64_t guid = spa_get_random(-1ULL); if (spa != NULL) { while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) guid = spa_get_random(-1ULL); } else { while (guid == 0 || spa_guid_exists(guid, 0)) guid = spa_get_random(-1ULL); } return (guid); } void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) { char type[256]; char *checksum = NULL; char *compress = NULL; if (bp != NULL) { if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { dmu_object_byteswap_t bswap = DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); (void) snprintf(type, sizeof (type), "bswap %s %s", DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? "metadata" : "data", dmu_ot_byteswap[bswap].ob_name); } else { (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, sizeof (type)); } if (!BP_IS_EMBEDDED(bp)) { checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; } compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; } SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, compress); } void spa_freeze(spa_t *spa) { uint64_t freeze_txg = 0; spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); if (spa->spa_freeze_txg == UINT64_MAX) { freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; spa->spa_freeze_txg = freeze_txg; } spa_config_exit(spa, SCL_ALL, FTAG); if (freeze_txg != 0) txg_wait_synced(spa_get_dsl(spa), freeze_txg); } void zfs_panic_recover(const char *fmt, ...) { va_list adx; va_start(adx, fmt); vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); va_end(adx); } /* * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexadecimal numbers that don't overflow. */ uint64_t zfs_strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; int digit; while ((c = *str) != '\0') { if (c >= '0' && c <= '9') digit = c - '0'; else if (c >= 'a' && c <= 'f') digit = 10 + c - 'a'; else break; val *= 16; val += digit; str++; } if (nptr) *nptr = (char *)str; return (val); } /* * ========================================================================== * Accessor functions * ========================================================================== */ boolean_t spa_shutting_down(spa_t *spa) { return (spa->spa_async_suspended); } dsl_pool_t * spa_get_dsl(spa_t *spa) { return (spa->spa_dsl_pool); } boolean_t spa_is_initializing(spa_t *spa) { return (spa->spa_is_initializing); } blkptr_t * spa_get_rootblkptr(spa_t *spa) { return (&spa->spa_ubsync.ub_rootbp); } void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) { spa->spa_uberblock.ub_rootbp = *bp; } void spa_altroot(spa_t *spa, char *buf, size_t buflen) { if (spa->spa_root == NULL) buf[0] = '\0'; else (void) strncpy(buf, spa->spa_root, buflen); } int spa_sync_pass(spa_t *spa) { return (spa->spa_sync_pass); } char * spa_name(spa_t *spa) { return (spa->spa_name); } uint64_t spa_guid(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); uint64_t guid; /* * If we fail to parse the config during spa_load(), we can go through * the error path (which posts an ereport) and end up here with no root * vdev. We stash the original pool guid in 'spa_config_guid' to handle * this case. */ if (spa->spa_root_vdev == NULL) return (spa->spa_config_guid); guid = spa->spa_last_synced_guid != 0 ? spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; /* * Return the most recently synced out guid unless we're * in syncing context. */ if (dp && dsl_pool_sync_context(dp)) return (spa->spa_root_vdev->vdev_guid); else return (guid); } uint64_t spa_load_guid(spa_t *spa) { /* * This is a GUID that exists solely as a reference for the * purposes of the arc. It is generated at load time, and * is never written to persistent storage. */ return (spa->spa_load_guid); } uint64_t spa_last_synced_txg(spa_t *spa) { return (spa->spa_ubsync.ub_txg); } uint64_t spa_first_txg(spa_t *spa) { return (spa->spa_first_txg); } uint64_t spa_syncing_txg(spa_t *spa) { return (spa->spa_syncing_txg); } pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } spa_load_state_t spa_load_state(spa_t *spa) { return (spa->spa_load_state); } uint64_t spa_freeze_txg(spa_t *spa) { return (spa->spa_freeze_txg); } /* ARGSUSED */ uint64_t spa_get_asize(spa_t *spa, uint64_t lsize) { return (lsize * spa_asize_inflation); } /* * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), * or at least 32MB. * * See the comment above spa_slop_shift for details. */ uint64_t spa_get_slop_space(spa_t *spa) { uint64_t space = spa_get_dspace(spa); return (MAX(space >> spa_slop_shift, SPA_MINDEVSIZE >> 1)); } uint64_t spa_get_dspace(spa_t *spa) { return (spa->spa_dspace); } void spa_update_dspace(spa_t *spa) { spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + ddt_get_dedup_dspace(spa); } /* * Return the failure mode that has been set to this pool. The default * behavior will be to block all I/Os when a complete failure occurs. */ uint8_t spa_get_failmode(spa_t *spa) { return (spa->spa_failmode); } boolean_t spa_suspended(spa_t *spa) { return (spa->spa_suspended); } uint64_t spa_version(spa_t *spa) { return (spa->spa_ubsync.ub_version); } boolean_t spa_deflate(spa_t *spa) { return (spa->spa_deflate); } metaslab_class_t * spa_normal_class(spa_t *spa) { return (spa->spa_normal_class); } metaslab_class_t * spa_log_class(spa_t *spa) { return (spa->spa_log_class); +} + +void +spa_evicting_os_register(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_insert_head(&spa->spa_evicting_os_list, os); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_deregister(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_remove(&spa->spa_evicting_os_list, os); + cv_broadcast(&spa->spa_evicting_os_cv); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_wait(spa_t *spa) +{ + mutex_enter(&spa->spa_evicting_os_lock); + while (!list_is_empty(&spa->spa_evicting_os_list)) + cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); + mutex_exit(&spa->spa_evicting_os_lock); + + dmu_buf_user_evict_wait(); } int spa_max_replication(spa_t *spa) { /* * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to * handle BPs with more than one DVA allocated. Set our max * replication level accordingly. */ if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) return (1); return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); } int spa_prev_software_version(spa_t *spa) { return (spa->spa_prev_software_version); } uint64_t spa_deadman_synctime(spa_t *spa) { return (spa->spa_deadman_synctime); } uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { uint64_t asize = DVA_GET_ASIZE(dva); uint64_t dsize = asize; ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); if (asize != 0 && spa->spa_deflate) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } return (dsize); } uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); return (dsize); } uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp) { uint64_t dsize = 0; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); for (int d = 0; d < BP_GET_NDVAS(bp); d++) dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); spa_config_exit(spa, SCL_VDEV, FTAG); return (dsize); } /* * ========================================================================== * Initialization and Termination * ========================================================================== */ static int spa_name_compare(const void *a1, const void *a2) { const spa_t *s1 = a1; const spa_t *s2 = a2; int s; s = strcmp(s1->spa_name, s2->spa_name); if (s > 0) return (1); if (s < 0) return (-1); return (0); } int spa_busy(void) { return (spa_active_count); } void spa_boot_init() { spa_config_load(); } #ifdef _KERNEL EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); #endif void spa_init(int mode) { mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), offsetof(spa_t, spa_avl)); avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); spa_mode_global = mode; #ifdef illumos #ifdef _KERNEL spa_arch_init(); #else if (spa_mode_global != FREAD && dprintf_find_string("watch")) { arc_procfd = open("/proc/self/ctl", O_WRONLY); if (arc_procfd == -1) { perror("could not enable watchpoints: " "opening /proc/self/ctl failed: "); } else { arc_watch = B_TRUE; } } #endif #endif /* illumos */ refcount_sysinit(); unique_init(); range_tree_init(); zio_init(); lz4_init(); dmu_init(); zil_init(); vdev_cache_stat_init(); zfs_prop_init(); zpool_prop_init(); zpool_feature_init(); spa_config_load(); l2arc_start(); #ifndef illumos #ifdef _KERNEL zfs_deadman_init(); #endif #endif /* !illumos */ } void spa_fini(void) { l2arc_stop(); spa_evict_all(); vdev_cache_stat_fini(); zil_fini(); dmu_fini(); lz4_fini(); zio_fini(); range_tree_fini(); unique_fini(); refcount_fini(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); avl_destroy(&spa_l2cache_avl); cv_destroy(&spa_namespace_cv); mutex_destroy(&spa_namespace_lock); mutex_destroy(&spa_spare_lock); mutex_destroy(&spa_l2cache_lock); } /* * Return whether this pool has slogs. No locking needed. * It's not a problem if the wrong answer is returned as it's only for * performance and not correctness */ boolean_t spa_has_slogs(spa_t *spa) { return (spa->spa_log_class->mc_rotor != NULL); } spa_log_state_t spa_get_log_state(spa_t *spa) { return (spa->spa_log_state); } void spa_set_log_state(spa_t *spa, spa_log_state_t state) { spa->spa_log_state = state; } boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } boolean_t spa_writeable(spa_t *spa) { return (!!(spa->spa_mode & FWRITE)); } /* * Returns true if there is a pending sync task in any of the current * syncing txg, the current quiescing txg, or the current open txg. */ boolean_t spa_has_pending_synctask(spa_t *spa) { return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks)); } int spa_mode(spa_t *spa) { return (spa->spa_mode); } uint64_t spa_bootfs(spa_t *spa) { return (spa->spa_bootfs); } uint64_t spa_delegation(spa_t *spa) { return (spa->spa_delegation); } objset_t * spa_meta_objset(spa_t *spa) { return (spa->spa_meta_objset); } enum zio_checksum spa_dedup_checksum(spa_t *spa) { return (spa->spa_dedup_checksum); } /* * Reset pool scan stat per scan pass (or reboot). */ void spa_scan_stat_init(spa_t *spa) { /* data not stored on disk */ spa->spa_scan_pass_start = gethrestime_sec(); spa->spa_scan_pass_exam = 0; vdev_scan_stat_init(spa->spa_root_vdev); } /* * Get scan stats for zpool status reports */ int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) { dsl_scan_t *scn = spa->spa_dsl_pool ? spa->spa_dsl_pool->dp_scan : NULL; if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) return (SET_ERROR(ENOENT)); bzero(ps, sizeof (pool_scan_stat_t)); /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_to_process = scn->scn_phys.scn_to_process; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; ps->pss_state = scn->scn_phys.scn_state; /* data not stored on disk */ ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_exam = spa->spa_scan_pass_exam; return (0); } boolean_t spa_debug_enabled(spa_t *spa) { return (spa->spa_debug); } int spa_maxblocksize(spa_t *spa) { if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) return (SPA_MAXBLOCKSIZE); else return (SPA_OLD_MAXBLOCKSIZE); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h (revision 286575) @@ -1,376 +1,376 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DBUF_H #define _SYS_DBUF_H #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif #define IN_DMU_SYNC 2 /* * define flags for dbuf_read */ #define DB_RF_MUST_SUCCEED (1 << 0) #define DB_RF_CANFAIL (1 << 1) #define DB_RF_HAVESTRUCT (1 << 2) #define DB_RF_NOPREFETCH (1 << 3) #define DB_RF_NEVERWAIT (1 << 4) #define DB_RF_CACHED (1 << 5) /* * The simplified state transition diagram for dbufs looks like: * * +----> READ ----+ * | | * | V * (alloc)-->UNCACHED CACHED-->EVICTING-->(free) * | ^ ^ * | | | * +----> FILL ----+ | * | | * | | * +--------> NOFILL -------+ * * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range * to find all dbufs in a range of a dnode and must be less than any other * dbuf_states_t (see comment on dn_dbufs in dnode.h). */ typedef enum dbuf_states { DB_SEARCH = -1, DB_UNCACHED, DB_FILL, DB_NOFILL, DB_READ, DB_CACHED, DB_EVICTING } dbuf_states_t; struct dnode; struct dmu_tx; /* * level = 0 means the user data * level = 1 means the single indirect block * etc. */ struct dmu_buf_impl; typedef enum override_states { DR_NOT_OVERRIDDEN, DR_IN_DMU_SYNC, DR_OVERRIDDEN } override_states_t; typedef struct dbuf_dirty_record { /* link on our parents dirty list */ list_node_t dr_dirty_node; /* transaction group this data will sync in */ uint64_t dr_txg; /* zio of outstanding write IO */ zio_t *dr_zio; /* pointer back to our dbuf */ struct dmu_buf_impl *dr_dbuf; /* pointer to next dirty record */ struct dbuf_dirty_record *dr_next; /* pointer to parent dirty record */ struct dbuf_dirty_record *dr_parent; /* How much space was changed to dsl_pool_dirty_space() for this? */ unsigned int dr_accounted; union dirty_types { struct dirty_indirect { /* protect access to list */ kmutex_t dr_mtx; /* Our list of dirty children */ list_t dr_children; } di; struct dirty_leaf { /* * dr_data is set when we dirty the buffer * so that we can retain the pointer even if it * gets COW'd in a subsequent transaction group. */ arc_buf_t *dr_data; blkptr_t dr_overridden_by; override_states_t dr_override_state; uint8_t dr_copies; boolean_t dr_nopwrite; } dl; } dt; } dbuf_dirty_record_t; typedef struct dmu_buf_impl { /* * The following members are immutable, with the exception of * db.db_data, which is protected by db_mtx. */ /* the publicly visible structure */ dmu_buf_t db; /* the objset we belong to */ struct objset *db_objset; /* * handle to safely access the dnode we belong to (NULL when evicted) */ struct dnode_handle *db_dnode_handle; /* * our parent buffer; if the dnode points to us directly, * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf * only accessed by sync thread ??? * (NULL when evicted) * May change from NULL to non-NULL under the protection of db_mtx * (see dbuf_check_blkptr()) */ struct dmu_buf_impl *db_parent; /* * link for hash table of all dmu_buf_impl_t's */ struct dmu_buf_impl *db_hash_next; /* our block number */ uint64_t db_blkid; /* * Pointer to the blkptr_t which points to us. May be NULL if we * don't have one yet. (NULL when evicted) */ blkptr_t *db_blkptr; /* * Our indirection level. Data buffers have db_level==0. * Indirect buffers which point to data buffers have * db_level==1. etc. Buffers which contain dnodes have * db_level==0, since the dnodes are stored in a file. */ uint8_t db_level; /* db_mtx protects the members below */ kmutex_t db_mtx; /* * Current state of the buffer */ dbuf_states_t db_state; /* * Refcount accessed by dmu_buf_{hold,rele}. * If nonzero, the buffer can't be destroyed. * Protected by db_mtx. */ refcount_t db_holds; /* buffer holding our data */ arc_buf_t *db_buf; kcondvar_t db_changed; dbuf_dirty_record_t *db_data_pending; /* pointer to most recent dirty record for this buffer */ dbuf_dirty_record_t *db_last_dirty; /* * Our link on the owner dnodes's dn_dbufs list. * Protected by its dn_dbufs_mtx. */ avl_node_t db_link; /* Data which is unique to data (leaf) blocks: */ - /* stuff we store for the user (see dmu_buf_set_user) */ - void *db_user_ptr; - dmu_buf_evict_func_t *db_evict_func; + /* User callback information. */ + dmu_buf_user_t *db_user; uint8_t db_immediate_evict; uint8_t db_freed_in_flight; uint8_t db_dirtycnt; } dmu_buf_impl_t; /* Note: the dbuf hash table is exposed only for the mdb module */ #define DBUF_MUTEXES 256 #define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) typedef struct dbuf_hash_table { uint64_t hash_table_mask; dmu_buf_impl_t **hash_table; kmutex_t hash_mutexes[DBUF_MUTEXES]; } dbuf_hash_table_t; uint64_t dbuf_whichblock(struct dnode *di, uint64_t offset); dmu_buf_impl_t *dbuf_create_tlib(struct dnode *dn, char *data); void dbuf_create_bonus(struct dnode *dn); int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); void dbuf_spill_hold(struct dnode *dn, dmu_buf_impl_t **dbp, void *tag); void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, void *tag); int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, int create, void *tag, dmu_buf_impl_t **dbp); void dbuf_prefetch(struct dnode *dn, uint64_t blkid, zio_priority_t prio); void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, uint64_t blkid, void *tag); uint64_t dbuf_refcount(dmu_buf_impl_t *db); void dbuf_rele(dmu_buf_impl_t *db, void *tag); void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, uint64_t blkid); int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, bp_embedded_type_t etype, enum zio_compress comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); void dbuf_clear(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); void dbuf_release_bp(dmu_buf_impl_t *db); void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, struct dmu_tx *); void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); #define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) #define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) #define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) #define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) #define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) void dbuf_init(void); void dbuf_fini(void); boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); #define DBUF_GET_BUFC_TYPE(_db) \ (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) #define DBUF_IS_CACHEABLE(_db) \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) #define DBUF_IS_L2CACHEABLE(_db) \ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ (dbuf_is_metadata(_db) && \ ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) #define DBUF_IS_L2COMPRESSIBLE(_db) \ ((_db)->db_objset->os_compress != ZIO_COMPRESS_OFF || \ (dbuf_is_metadata(_db) && zfs_mdcomp_disable == B_FALSE)) #ifdef ZFS_DEBUG /* * There should be a ## between the string literal and fmt, to make it * clear that we're joining two strings together, but gcc does not * support that preprocessor token. */ #define dprintf_dbuf(dbuf, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char __db_buf[32]; \ uint64_t __db_obj = (dbuf)->db.db_object; \ if (__db_obj == DMU_META_DNODE_OBJECT) \ (void) strcpy(__db_buf, "mdn"); \ else \ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ (u_longlong_t)__db_obj); \ dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ "obj=%s lvl=%u blkid=%lld " fmt, \ __db_buf, (dbuf)->db_level, \ (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ } \ _NOTE(CONSTCOND) } while (0) #define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ _NOTE(CONSTCOND) } while (0) #define DBUF_VERIFY(db) dbuf_verify(db) #else #define dprintf_dbuf(db, fmt, ...) #define dprintf_dbuf_bp(db, bp, fmt, ...) #define DBUF_VERIFY(db) #endif #ifdef __cplusplus } #endif #endif /* _SYS_DBUF_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h (revision 286575) @@ -1,851 +1,938 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, Joyent, Inc. All rights reserved. * Copyright 2013 DEY Storage Systems, Inc. * Copyright 2014 HybridCluster. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_H #define _SYS_DMU_H /* * This file describes the interface that the DMU provides for its * consumers. * * The DMU also interacts with the SPA. That interface is described in * dmu_spa.h. */ -#include -#include +#include #include -#include #include #ifdef __cplusplus extern "C" { #endif struct uio; struct xuio; struct page; struct vnode; struct spa; struct zilog; struct zio; struct blkptr; struct zap_cursor; struct dsl_dataset; struct dsl_pool; struct dnode; struct drr_begin; struct drr_end; struct zbookmark_phys; struct spa; struct nvlist; struct arc_buf; struct zio_prop; struct sa_handle; struct file; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; typedef struct dsl_dir dsl_dir_t; typedef enum dmu_object_byteswap { DMU_BSWAP_UINT8, DMU_BSWAP_UINT16, DMU_BSWAP_UINT32, DMU_BSWAP_UINT64, DMU_BSWAP_ZAP, DMU_BSWAP_DNODE, DMU_BSWAP_OBJSET, DMU_BSWAP_ZNODE, DMU_BSWAP_OLDACL, DMU_BSWAP_ACL, /* * Allocating a new byteswap type number makes the on-disk format * incompatible with any other format that uses the same number. * * Data can usually be structured to work with one of the * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. */ DMU_BSWAP_NUMFUNCS } dmu_object_byteswap_t; #define DMU_OT_NEWTYPE 0x80 #define DMU_OT_METADATA 0x40 #define DMU_OT_BYTESWAP_MASK 0x3f /* * Defines a uint8_t object type. Object types specify if the data * in the object is metadata (boolean) and how to byteswap the data * (dmu_object_byteswap_t). */ #define DMU_OT(byteswap, metadata) \ (DMU_OT_NEWTYPE | \ ((metadata) ? DMU_OT_METADATA : 0) | \ ((byteswap) & DMU_OT_BYTESWAP_MASK)) #define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ (ot) < DMU_OT_NUMTYPES) #define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_METADATA) : \ dmu_ot[(ot)].ot_metadata) /* * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill * is repurposed for embedded BPs. */ #define DMU_OT_HAS_FILL(ot) \ ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) #define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ ((ot) & DMU_OT_BYTESWAP_MASK) : \ dmu_ot[(ot)].ot_byteswap) typedef enum dmu_object_type { DMU_OT_NONE, /* general: */ DMU_OT_OBJECT_DIRECTORY, /* ZAP */ DMU_OT_OBJECT_ARRAY, /* UINT64 */ DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ DMU_OT_BPOBJ, /* UINT64 */ DMU_OT_BPOBJ_HDR, /* UINT64 */ /* spa: */ DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ DMU_OT_SPACE_MAP, /* UINT64 */ /* zil: */ DMU_OT_INTENT_LOG, /* UINT64 */ /* dmu: */ DMU_OT_DNODE, /* DNODE */ DMU_OT_OBJSET, /* OBJSET */ /* dsl: */ DMU_OT_DSL_DIR, /* UINT64 */ DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ DMU_OT_DSL_PROPS, /* ZAP */ DMU_OT_DSL_DATASET, /* UINT64 */ /* zpl: */ DMU_OT_ZNODE, /* ZNODE */ DMU_OT_OLDACL, /* Old ACL */ DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ DMU_OT_MASTER_NODE, /* ZAP */ DMU_OT_UNLINKED_SET, /* ZAP */ /* zvol: */ DMU_OT_ZVOL, /* UINT8 */ DMU_OT_ZVOL_PROP, /* ZAP */ /* other; for testing only! */ DMU_OT_PLAIN_OTHER, /* UINT8 */ DMU_OT_UINT64_OTHER, /* UINT64 */ DMU_OT_ZAP_OTHER, /* ZAP */ /* new object types: */ DMU_OT_ERROR_LOG, /* ZAP */ DMU_OT_SPA_HISTORY, /* UINT8 */ DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ DMU_OT_POOL_PROPS, /* ZAP */ DMU_OT_DSL_PERMS, /* ZAP */ DMU_OT_ACL, /* ACL */ DMU_OT_SYSACL, /* SYSACL */ DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCAN_QUEUE, /* ZAP */ DMU_OT_USERGROUP_USED, /* ZAP */ DMU_OT_USERGROUP_QUOTA, /* ZAP */ DMU_OT_USERREFS, /* ZAP */ DMU_OT_DDT_ZAP, /* ZAP */ DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_SA, /* System attr */ DMU_OT_SA_MASTER_NODE, /* ZAP */ DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ DMU_OT_SCAN_XLATE, /* ZAP */ DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ DMU_OT_DEADLIST, /* ZAP */ DMU_OT_DEADLIST_HDR, /* UINT64 */ DMU_OT_DSL_CLONES, /* ZAP */ DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ /* * Do not allocate new object types here. Doing so makes the on-disk * format incompatible with any other format that uses the same object * type number. * * When creating an object which does not have one of the above types * use the DMU_OTN_* type with the correct byteswap and metadata * values. * * The DMU_OTN_* types do not have entries in the dmu_ot table, * use the DMU_OT_IS_METDATA() and DMU_OT_BYTESWAP() macros instead * of indexing into dmu_ot directly (this works for both DMU_OT_* types * and DMU_OTN_* types). */ DMU_OT_NUMTYPES, /* * Names for valid types declared with DMU_OT(). */ DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE), DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE), DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE), DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE), DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE), DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE), DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE), DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE), DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE), DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE), } dmu_object_type_t; typedef enum txg_how { TXG_WAIT = 1, TXG_NOWAIT, TXG_WAITED, } txg_how_t; void byteswap_uint64_array(void *buf, size_t size); void byteswap_uint32_array(void *buf, size_t size); void byteswap_uint16_array(void *buf, size_t size); void byteswap_uint8_array(void *buf, size_t size); void zap_byteswap(void *buf, size_t size); void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) /* * The maximum number of bytes that can be accessed as part of one * operation, including metadata. */ #define DMU_MAX_ACCESS (32 * 1024 * 1024) /* 32MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ #define DMU_USERUSED_OBJECT (-1ULL) #define DMU_GROUPUSED_OBJECT (-2ULL) /* * artificial blkids for bonus buffer and spill blocks */ #define DMU_BONUS_BLKID (-1ULL) #define DMU_SPILL_BLKID (-2ULL) /* * Public routines to create, destroy, open, and close objsets. */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_evict_dbufs(objset_t *os); int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); int dmu_get_recursive_snaps_nvl(char *fsname, const char *snapname, struct nvlist *snaps); int dmu_objset_clone(const char *name, const char *origin); int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, struct nvlist *errlist); int dmu_objset_snapshot_one(const char *fsname, const char *snapname); int dmu_objset_snapshot_tmp(const char *, const char *, int); int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); typedef struct dmu_buf { uint64_t db_object; /* object that this buffer is part of */ uint64_t db_offset; /* byte offset in this object */ uint64_t db_size; /* size of buffer in bytes */ void *db_data; /* data in buffer */ } dmu_buf_t; -typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); - /* * The names of zap entries in the DIRECTORY_OBJECT of the MOS. */ #define DMU_POOL_DIRECTORY_OBJECT 1 #define DMU_POOL_CONFIG "config" #define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" #define DMU_POOL_FEATURES_FOR_READ "features_for_read" #define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" #define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" #define DMU_POOL_ROOT_DATASET "root_dataset" #define DMU_POOL_SYNC_BPOBJ "sync_bplist" #define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" #define DMU_POOL_ERRLOG_LAST "errlog_last" #define DMU_POOL_SPARES "spares" #define DMU_POOL_DEFLATE "deflate" #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" #define DMU_POOL_TMP_USERREFS "tmp_userrefs" #define DMU_POOL_DDT "DDT-%s-%s-%s" #define DMU_POOL_DDT_STATS "DDT-statistics" #define DMU_POOL_CREATION_VERSION "creation_version" #define DMU_POOL_SCAN "scan" #define DMU_POOL_FREE_BPOBJ "free_bpobj" #define DMU_POOL_BPTREE_OBJ "bptree_obj" #define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" /* * Allocate an object from this objset. The range of object numbers * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. * * The transaction must be assigned to a txg. The newly allocated * object will be "held" in the transaction (ie. you can modify the * newly allocated object in this transaction). * * dmu_object_alloc() chooses an object and returns it in *objectp. * * dmu_object_claim() allocates a specific object number. If that * number is already allocated, it fails and returns EEXIST. * * Return 0 on success, or ENOSPC or EEXIST as specified above. */ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp); /* * Free an object from this objset. * * The object's data will be freed as well (ie. you don't need to call * dmu_free(object, 0, -1, tx)). * * The object need not be held in the transaction. * * If there are any holds on this object's buffers (via dmu_buf_hold()), * or tx holds on the object (via dmu_tx_hold_object()), you can not * free it; it fails and returns EBUSY. * * If the object is not allocated, it fails and returns ENOENT. * * Return 0 on success, or EBUSY or ENOENT as specified above. */ int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx); /* * Find the next allocated or free object. * * The objectp parameter is in-out. It will be updated to be the next * object which is allocated. Ignore objects which have not been * modified since txg. * * XXX Can only be called on a objset with no dirty data. * * Returns 0 on success, or ENOENT if there are no more objects. */ int dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg); /* * Set the data blocksize for an object. * * The object cannot have any blocks allcated beyond the first. If * the first block is allocated already, the new size must be greater * than the current block size. If these conditions are not met, * ENOTSUP will be returned. * * Returns 0 on success, or EBUSY if there are any holds on the object * contents, or ENOTSUP as described above. */ int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dmu_tx_t *tx); /* * Set the checksum property on a dnode. The new checksum algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dmu_tx_t *tx); /* * Set the compress property on a dnode. The new compression algorithm will * apply to all newly written blocks; existing blocks will not be affected. */ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, void *data, uint8_t etype, uint8_t comp, int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); /* * Decide how to write a block: checksum, compression, number of copies, etc. */ #define WP_NOFILL 0x1 #define WP_DMU_SYNC 0x2 #define WP_SPILL 0x4 void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. * You must dmu_bonus_hold() to get the buffer, which will give you a * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus * data. As with any normal buffer, you must call dmu_buf_read() to * read db_data, dmu_buf_will_dirty() before modifying it, and the * object must be held in an assigned transaction before calling * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus * buffer as well. You must release your hold with dmu_buf_rele(). * * Returns ENOENT, EIO, or 0. */ int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **); int dmu_bonus_max(void); int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); /* * Special spill buffer support used by "SA" framework */ int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_by_dnode(struct dnode *dn, uint32_t flags, void *tag, dmu_buf_t **dbp); int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); /* * Obtain the DMU buffer from the specified object which contains the * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so * that it will remain in memory. You must release the hold with * dmu_buf_rele(). You musn't access the dmu_buf_t after releasing your * hold. You must have a hold on any dmu_buf_t* you pass to the DMU. * * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill * on the returned buffer before reading or writing the buffer's * db_data. The comments for those routines describe what particular * operations are valid after calling them. * * The object number must be a valid, allocated object number. */ int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, void *tag, dmu_buf_t **, int flags); /* * Add a reference to a dmu buffer that has already been held via * dmu_buf_hold() in the current context. */ void dmu_buf_add_ref(dmu_buf_t *db, void* tag); /* * Attempt to add a reference to a dmu buffer that is in an unknown state, * using a pointer that may have been invalidated by eviction processing. * The request will succeed if the passed in dbuf still represents the * same os/object/blkid, is ineligible for eviction, and has at least * one hold by a user other than the syncer. */ boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, uint64_t blkid, void *tag); void dmu_buf_rele(dmu_buf_t *db, void *tag); uint64_t dmu_buf_refcount(dmu_buf_t *db); /* * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a * range of an object. A pointer to an array of dmu_buf_t*'s is * returned (in *dbpp). * * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and * frees the array. The hold on the array of buffers MUST be released * with dmu_buf_rele_array. You can NOT release the hold on each buffer * individually with dmu_buf_rele. */ int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp); void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); +typedef void dmu_buf_evict_func_t(void *user_ptr); + /* - * Returns NULL on success, or the existing user ptr if it's already - * been set. + * A DMU buffer user object may be associated with a dbuf for the + * duration of its lifetime. This allows the user of a dbuf (client) + * to attach private data to a dbuf (e.g. in-core only data such as a + * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified + * when that dbuf has been evicted. Clients typically respond to the + * eviction notification by freeing their private data, thus ensuring + * the same lifetime for both dbuf and private data. * - * user_ptr is for use by the user and can be obtained via dmu_buf_get_user(). + * The mapping from a dmu_buf_user_t to any client private data is the + * client's responsibility. All current consumers of the API with private + * data embed a dmu_buf_user_t as the first member of the structure for + * their private data. This allows conversions between the two types + * with a simple cast. Since the DMU buf user API never needs access + * to the private data, other strategies can be employed if necessary + * or convenient for the client (e.g. using container_of() to do the + * conversion for private data that cannot have the dmu_buf_user_t as + * its first member). * - * If non-NULL, pageout func will be called when this buffer is being - * excised from the cache, so that you can clean up the data structure - * pointed to by user_ptr. + * Eviction callbacks are executed without the dbuf mutex held or any + * other type of mechanism to guarantee that the dbuf is still available. + * For this reason, users must assume the dbuf has already been freed + * and not reference the dbuf from the callback context. * - * dmu_evict_user() will call the pageout func for all buffers in a - * objset with a given pageout func. + * Users requesting "immediate eviction" are notified as soon as the dbuf + * is only referenced by dirty records (dirties == holds). Otherwise the + * notification occurs after eviction processing for the dbuf begins. */ -void *dmu_buf_set_user(dmu_buf_t *db, void *user_ptr, - dmu_buf_evict_func_t *pageout_func); +typedef struct dmu_buf_user { + /* + * Asynchronous user eviction callback state. + */ + taskq_ent_t dbu_tqent; + + /* This instance's eviction function pointer. */ + dmu_buf_evict_func_t *dbu_evict_func; +#ifdef ZFS_DEBUG + /* + * Pointer to user's dbuf pointer. NULL for clients that do + * not associate a dbuf with their user data. + * + * The dbuf pointer is cleared upon eviction so as to catch + * use-after-evict bugs in clients. + */ + dmu_buf_t **dbu_clear_on_evict_dbufp; +#endif +} dmu_buf_user_t; + /* - * set_user_ie is the same as set_user, but request immediate eviction - * when hold count goes to zero. + * Initialize the given dmu_buf_user_t instance with the eviction function + * evict_func, to be called when the user is evicted. + * + * NOTE: This function should only be called once on a given dmu_buf_user_t. + * To allow enforcement of this, dbu must already be zeroed on entry. */ -void *dmu_buf_set_user_ie(dmu_buf_t *db, void *user_ptr, - dmu_buf_evict_func_t *pageout_func); -void *dmu_buf_update_user(dmu_buf_t *db_fake, void *old_user_ptr, - void *user_ptr, dmu_buf_evict_func_t *pageout_func); -void dmu_evict_user(objset_t *os, dmu_buf_evict_func_t *func); +#ifdef __lint +/* Very ugly, but it beats issuing suppression directives in many Makefiles. */ +extern void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, + dmu_buf_t **clear_on_evict_dbufp); +#else /* __lint */ +inline void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func, + dmu_buf_t **clear_on_evict_dbufp) +{ + ASSERT(dbu->dbu_evict_func == NULL); + ASSERT(evict_func != NULL); + dbu->dbu_evict_func = evict_func; +#ifdef ZFS_DEBUG + dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; +#endif +} +#endif /* __lint */ /* - * Returns the user_ptr set with dmu_buf_set_user(), or NULL if not set. + * Attach user data to a dbuf and mark it for normal (when the dbuf's + * data is cleared or its reference count goes to zero) eviction processing. + * + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. */ +void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Attach user data to a dbuf and mark it for immediate (its dirty and + * reference counts are equal) eviction processing. + * + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Replace the current user of a dbuf. + * + * If given the current user of a dbuf, replaces the dbuf's user with + * "new_user" and returns the user data pointer that was replaced. + * Otherwise returns the current, and unmodified, dbuf user pointer. + */ +void *dmu_buf_replace_user(dmu_buf_t *db, + dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); + +/* + * Remove the specified user data for a DMU buffer. + * + * Returns the user that was removed on success, or the current user if + * another user currently owns the buffer. + */ +void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Returns the user data (dmu_buf_user_t *) associated with this dbuf. + */ void *dmu_buf_get_user(dmu_buf_t *db); + +/* Block until any in-progress dmu buf user evictions complete. */ +void dmu_buf_user_evict_wait(void); /* * Returns the blkptr associated with this dbuf, or NULL if not set. */ struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); /* * Indicate that you are going to modify the buffer's data (db_data). * * The transaction (tx) must be assigned to a txg (ie. you've called * dmu_tx_assign()). The buffer's object must be held in the tx * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); /* * Tells if the given dbuf is freeable. */ boolean_t dmu_buf_freeable(dmu_buf_t *); /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign * the transaction to a transaction group. Once the transaction has * been assigned, you can modify buffers which belong to held objects as * part of this transaction. You can't modify buffers before the * transaction has been assigned; you can't modify buffers which don't * belong to objects which this transaction holds; you can't hold * objects once the transaction has been assigned. You may hold an * object which you are going to free (with dmu_object_free()), but you * don't have to. * * You can abort the transaction before it has been assigned. * * Note that you may hold buffers (with dmu_buf_hold) at any time, * regardless of transaction state. */ #define DMU_NEW_OBJECT (-1ULL) #define DMU_OBJECT_END (-1ULL) dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object); void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow); void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, enum txg_how txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); void dmu_tx_mark_netfree(dmu_tx_t *tx); /* * To register a commit callback, dmu_tx_callback_register() must be called. * * dcb_data is a pointer to caller private data that is passed on as a * callback parameter. The caller is responsible for properly allocating and * freeing it. * * When registering a callback, the transaction must be already created, but * it cannot be committed or aborted. It can be assigned to a txg or not. * * The callback will be called after the transaction has been safely written * to stable storage and will also be called if the dmu_tx is aborted. * If there is any error which prevents the transaction from being committed to * disk, the callback will be called with a value of error != 0. */ typedef void dmu_tx_callback_func_t(void *dcb_data, int error); void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, void *dcb_data); /* * Free up the data blocks for a defined range of a file. If size is * -1, the range from offset to end-of-file is freed. */ int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size); int dmu_free_long_object(objset_t *os, uint64_t object); /* * Convenience functions. * * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. */ #define DMU_READ_PREFETCH 0 /* prefetch */ #define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx); int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, dmu_tx_t *tx); #ifdef _KERNEL #ifdef illumos int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); #else int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); #endif #endif struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); void dmu_return_arcbuf(struct arc_buf *buf); void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, dmu_tx_t *tx); int dmu_xuio_init(struct xuio *uio, int niov); void dmu_xuio_fini(struct xuio *uio); int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, size_t n); int dmu_xuio_cnt(struct xuio *uio); struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); void dmu_xuio_clear(struct xuio *uio, int i); void xuio_stat_wbuf_copied(); void xuio_stat_wbuf_nocopy(); extern int zfs_prefetch_disable; extern int zfs_max_recordsize; /* * Asynchronously try to read in the data. */ void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len); typedef struct dmu_object_info { /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_nblkptr; uint8_t doi_pad[4]; uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ uint64_t doi_max_offset; uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void arc_byteswap_func_t(void *buf, size_t size); typedef struct dmu_object_type_info { dmu_object_byteswap_t ot_byteswap; boolean_t ot_metadata; char *ot_name; } dmu_object_type_info_t; typedef struct dmu_object_byteswap_info { arc_byteswap_func_t *ob_func; char *ob_name; } dmu_object_byteswap_info_t; extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; /* * Get information on a DMU object. * * Return 0 on success or ENOENT if object is not allocated. * * If doi is NULL, just indicates whether the object exists. */ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dnode in hand. */ void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); /* Like dmu_object_info, but faster if you have a held dbuf in hand. */ void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); /* * Like dmu_object_info_from_db, but faster still when you only care about * the size. This is specifically optimized for zfs_getattr(). */ void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, u_longlong_t *nblk512); typedef struct dmu_objset_stats { uint64_t dds_num_clones; /* number of clones of this */ uint64_t dds_creation_txg; uint64_t dds_guid; dmu_objset_type_t dds_type; uint8_t dds_is_snapshot; uint8_t dds_inconsistent; char dds_origin[MAXNAMELEN]; } dmu_objset_stats_t; /* * Get stats on a dataset. */ void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); /* * Add entries to the nvlist for all the objset's properties. See * zfs_prop_table[] and zfs(1m) for details on the properties. */ void dmu_objset_stats(objset_t *os, struct nvlist *nv); /* * Get the space usage statistics for statvfs(). * * refdbytes is the amount of space "referenced" by this objset. * availbytes is the amount of space available to this objset, taking * into account quotas & reservations, assuming that no other objsets * use the space first. These values correspond to the 'referenced' and * 'available' properties, described in the zfs(1m) manpage. * * usedobjs and availobjs are the number of objects currently allocated, * and available. */ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); /* * The fsid_guid is a 56-bit ID that can change to avoid collisions. * (Contrast with the ds_guid which is a 64-bit ID that will never * change, so there is a small probability that it will collide.) */ uint64_t dmu_objset_fsid_guid(objset_t *os); /* * Get the [cm]time for an objset's snapshot dir */ timestruc_t dmu_objset_snap_cmtime(objset_t *os); int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); extern struct zilog *dmu_objset_zil(objset_t *os); extern struct dsl_pool *dmu_objset_pool(objset_t *os); extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); typedef int objset_used_cb_t(dmu_object_type_t bonustype, void *bonus, uint64_t *userp, uint64_t *groupp); extern void dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); /* * Return the txg number for the given assigned transaction. */ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); /* * Synchronous write. * If a parent zio is provided this function initiates a write on the * provided buffer as a child of the parent zio. * In the absence of a parent zio, the write is completed synchronously. * At write completion, blk is filled with the bp of the written block. * Note that while the data covered by this function will be on stable * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ /* * {zfs,zvol,ztest}_get_done() args */ typedef struct zgd { struct zilog *zgd_zilog; struct blkptr *zgd_bp; dmu_buf_t *zgd_db; struct rl *zgd_rl; void *zgd_private; } zgd_t; typedef void dmu_sync_cb_t(zgd_t *arg, int error); int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off * Return found offset in *off. Return ESRCH for end of file. */ int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off); /* * Initial setup and final teardown. */ extern void dmu_init(void); extern void dmu_fini(void); typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, uint64_t object, uint64_t offset, int len); void dmu_traverse_objset(objset_t *os, uint64_t txg_start, dmu_traverse_cb_t cb, void *arg); int dmu_diff(const char *tosnap_name, const char *fromsnap_name, struct file *fp, offset_t *offp); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ extern uint64_t zfs_crc64_table[256]; extern int zfs_mdcomp_disable; #ifdef __cplusplus } #endif #endif /* _SYS_DMU_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu_objset.h (revision 286575) @@ -1,180 +1,186 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #ifndef _SYS_DMU_OBJSET_H #define _SYS_DMU_OBJSET_H #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif extern krwlock_t os_lock; struct dsl_pool; struct dsl_dataset; struct dmu_tx; #define OBJSET_PHYS_SIZE 2048 #define OBJSET_OLD_PHYS_SIZE 1024 #define OBJSET_BUF_HAS_USERUSED(buf) \ (arc_buf_size(buf) > OBJSET_OLD_PHYS_SIZE) #define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) typedef struct objset_phys { dnode_phys_t os_meta_dnode; zil_header_t os_zil_header; uint64_t os_type; uint64_t os_flags; char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - sizeof (zil_header_t) - sizeof (uint64_t)*2]; dnode_phys_t os_userused_dnode; dnode_phys_t os_groupused_dnode; } objset_phys_t; struct objset { /* Immutable: */ struct dsl_dataset *os_dsl_dataset; spa_t *os_spa; arc_buf_t *os_phys_buf; objset_phys_t *os_phys; /* - * The following "special" dnodes have no parent and are exempt from - * dnode_move(), but they root their descendents in this objset using - * handles anyway, so that all access to dnodes from dbufs consistently - * uses handles. + * The following "special" dnodes have no parent, are exempt + * from dnode_move(), and are not recorded in os_dnodes, but they + * root their descendents in this objset using handles anyway, so + * that all access to dnodes from dbufs consistently uses handles. */ dnode_handle_t os_meta_dnode; dnode_handle_t os_userused_dnode; dnode_handle_t os_groupused_dnode; zilog_t *os_zil; + list_node_t os_evicting_node; + /* can change, under dsl_dir's locks: */ enum zio_checksum os_checksum; enum zio_compress os_compress; uint8_t os_copies; enum zio_checksum os_dedup_checksum; boolean_t os_dedup_verify; + boolean_t os_evicting; zfs_logbias_op_t os_logbias; zfs_cache_type_t os_primary_cache; zfs_cache_type_t os_secondary_cache; zfs_sync_type_t os_sync; zfs_redundant_metadata_type_t os_redundant_metadata; int os_recordsize; /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ blkptr_t *os_rootbp; zil_header_t os_zil_header; list_t os_synced_dnodes; uint64_t os_flags; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; uint64_t os_obj_next; /* Protected by os_lock */ kmutex_t os_lock; list_t os_dirty_dnodes[TXG_SIZE]; list_t os_free_dnodes[TXG_SIZE]; list_t os_dnodes; list_t os_downgraded_dbufs; /* stuff we store for the user */ kmutex_t os_user_ptr_lock; void *os_user_ptr; sa_os_t *os_sa; }; #define DMU_META_OBJSET 0 #define DMU_META_DNODE_OBJECT 0 #define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) #define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) #define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) #define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) #define DMU_OS_IS_L2CACHEABLE(os) \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ (os)->os_secondary_cache == ZFS_CACHE_METADATA) #define DMU_OS_IS_L2COMPRESSIBLE(os) (zfs_mdcomp_disable == B_FALSE) /* called from zpl */ int dmu_objset_hold(const char *name, void *tag, objset_t **osp); int dmu_objset_own(const char *name, dmu_objset_type_t type, boolean_t readonly, void *tag, objset_t **osp); void dmu_objset_refresh_ownership(objset_t *os, void *tag); void dmu_objset_rele(objset_t *os, void *tag); void dmu_objset_disown(objset_t *os, void *tag); int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dmu_objset_fsid_guid(objset_t *os); int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, int func(struct dsl_pool *, struct dsl_dataset *, void *), void *arg, int flags); int dmu_objset_prefetch(const char *name, void *arg); void dmu_objset_evict_dbufs(objset_t *os); timestruc_t dmu_objset_snap_cmtime(objset_t *os); /* called from dsl */ void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, objset_t **osp); void dmu_objset_evict(objset_t *os); void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); boolean_t dmu_objset_userused_enabled(objset_t *os); int dmu_objset_userspace_upgrade(objset_t *os); boolean_t dmu_objset_userspace_present(objset_t *os); int dmu_fsname(const char *snapname, char *buf); + +void dmu_objset_evict_done(objset_t *os); void dmu_objset_init(void); void dmu_objset_fini(void); #ifdef __cplusplus } #endif #endif /* _SYS_DMU_OBJSET_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dnode.h (revision 286575) @@ -1,342 +1,344 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DNODE_H #define _SYS_DNODE_H #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * dnode_hold() flags. */ #define DNODE_MUST_BE_ALLOCATED 1 #define DNODE_MUST_BE_FREE 2 /* * dnode_next_offset() flags. */ #define DNODE_FIND_HOLE 1 #define DNODE_FIND_BACKWARDS 2 #define DNODE_FIND_HAVELOCK 4 /* * Fixed constants. */ #define DNODE_SHIFT 9 /* 512 bytes */ #define DN_MIN_INDBLKSHIFT 12 /* 4k */ #define DN_MAX_INDBLKSHIFT 14 /* 16k */ #define DNODE_BLOCK_SHIFT 14 /* 16k */ #define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ #define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ #define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ /* * dnode id flags * * Note: a file will never ever have its * ids moved from bonus->spill * and only in a crypto environment would it be on spill */ #define DN_ID_CHKED_BONUS 0x1 #define DN_ID_CHKED_SPILL 0x2 #define DN_ID_OLD_EXIST 0x4 #define DN_ID_NEW_EXIST 0x8 /* * Derived constants. */ #define DNODE_SIZE (1 << DNODE_SHIFT) #define DN_MAX_NBLKPTR ((DNODE_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) #define DN_MAX_BONUSLEN (DNODE_SIZE - DNODE_CORE_SIZE - (1 << SPA_BLKPTRSHIFT)) #define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) #define DN_ZERO_BONUSLEN (DN_MAX_BONUSLEN + 1) #define DN_KILL_SPILLBLK (1) #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) #define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) #define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) /* The +2 here is a cheesy way to round up */ #define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ (DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT))) #define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) #define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) #define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) struct dmu_buf_impl; struct objset; struct zio; enum dnode_dirtycontext { DN_UNDIRTIED, DN_DIRTY_OPEN, DN_DIRTY_SYNC }; /* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ #define DNODE_FLAG_USED_BYTES (1<<0) #define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) /* Does dnode have a SA spill blkptr in bonus? */ #define DNODE_FLAG_SPILL_BLKPTR (1<<2) typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ uint8_t dn_indblkshift; /* ln2(indirect block size) */ uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ uint8_t dn_nblkptr; /* length of dn_blkptr */ uint8_t dn_bonustype; /* type of data in bonus buffer */ uint8_t dn_checksum; /* ZIO_CHECKSUM type */ uint8_t dn_compress; /* ZIO_COMPRESS type */ uint8_t dn_flags; /* DNODE_FLAG_* */ uint16_t dn_datablkszsec; /* data block size in 512b sectors */ uint16_t dn_bonuslen; /* length of dn_bonus */ uint8_t dn_pad2[4]; /* accounting is protected by dn_dirty_mtx */ uint64_t dn_maxblkid; /* largest allocated block ID */ uint64_t dn_used; /* bytes (or sectors) of disk space */ uint64_t dn_pad3[4]; blkptr_t dn_blkptr[1]; uint8_t dn_bonus[DN_MAX_BONUSLEN - sizeof (blkptr_t)]; blkptr_t dn_spill; } dnode_phys_t; typedef struct dnode { /* * Protects the structure of the dnode, including the number of levels * of indirection (dn_nlevels), dn_maxblkid, and dn_next_* */ krwlock_t dn_struct_rwlock; /* Our link on dn_objset->os_dnodes list; protected by os_lock. */ list_node_t dn_link; /* immutable: */ struct objset *dn_objset; uint64_t dn_object; struct dmu_buf_impl *dn_dbuf; struct dnode_handle *dn_handle; dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ /* * Copies of stuff in dn_phys. They're valid in the open * context (eg. even before the dnode is first synced). * Where necessary, these are protected by dn_struct_rwlock. */ dmu_object_type_t dn_type; /* object type */ uint16_t dn_bonuslen; /* bonus length */ uint8_t dn_bonustype; /* bonus type */ uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ uint8_t dn_checksum; /* ZIO_CHECKSUM type */ uint8_t dn_compress; /* ZIO_COMPRESS type */ uint8_t dn_nlevels; uint8_t dn_indblkshift; uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ uint8_t dn_moved; /* Has this dnode been moved? */ uint16_t dn_datablkszsec; /* in 512b sectors */ uint32_t dn_datablksz; /* in bytes */ uint64_t dn_maxblkid; uint8_t dn_next_type[TXG_SIZE]; uint8_t dn_next_nblkptr[TXG_SIZE]; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; uint8_t dn_next_bonustype[TXG_SIZE]; uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ uint16_t dn_next_bonuslen[TXG_SIZE]; uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ uint32_t dn_dbufs_count; /* count of dn_dbufs */ /* There are no level-0 blocks of this blkid or higher in dn_dbufs */ uint64_t dn_unlisted_l0_blkid; /* protected by os_lock: */ list_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ /* protected by dn_mtx: */ kmutex_t dn_mtx; list_t dn_dirty_records[TXG_SIZE]; struct range_tree *dn_free_ranges[TXG_SIZE]; uint64_t dn_allocated_txg; uint64_t dn_free_txg; uint64_t dn_assigned_txg; kcondvar_t dn_notxholds; enum dnode_dirtycontext dn_dirtyctx; uint8_t *dn_dirtyctx_firstset; /* dbg: contents meaningless */ /* protected by own devices */ refcount_t dn_tx_holds; refcount_t dn_holds; kmutex_t dn_dbufs_mtx; /* * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs * can contain multiple dbufs of the same (level, blkid) when a * dbuf is marked DB_EVICTING without being removed from * dn_dbufs. To maintain the avl invariant that there cannot be * duplicate entries, we order the dbufs by an arbitrary value - * their address in memory. This means that dn_dbufs cannot be used to * directly look up a dbuf. Instead, callers must use avl_walk, have * a reference to the dbuf, or look up a non-existant node with * db_state = DB_SEARCH (see dbuf_free_range for an example). */ avl_tree_t dn_dbufs; /* protected by dn_struct_rwlock */ struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ boolean_t dn_have_spill; /* have spill or are spilling */ /* parent IO for current sync write */ zio_t *dn_zio; /* used in syncing context */ uint64_t dn_oldused; /* old phys used bytes */ uint64_t dn_oldflags; /* old phys dn_flags */ uint64_t dn_olduid, dn_oldgid; uint64_t dn_newuid, dn_newgid; int dn_id_flags; /* holds prefetch structure */ struct zfetch dn_zfetch; } dnode_t; /* * Adds a level of indirection between the dbuf and the dnode to avoid * iterating descendent dbufs in dnode_move(). Handles are not allocated * individually, but as an array of child dnodes in dnode_hold_impl(). */ typedef struct dnode_handle { /* Protects dnh_dnode from modification by dnode_move(). */ zrlock_t dnh_zrlock; dnode_t *dnh_dnode; } dnode_handle_t; typedef struct dnode_children { + dmu_buf_user_t dnc_dbu; /* User evict data */ size_t dnc_count; /* number of children */ dnode_handle_t dnc_children[]; /* sized dynamically */ } dnode_children_t; typedef struct free_range { avl_node_t fr_node; uint64_t fr_blkid; uint64_t fr_nblks; } free_range_t; -dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, +void dnode_special_open(struct objset *dd, dnode_phys_t *dnp, uint64_t object, dnode_handle_t *dnh); void dnode_special_close(dnode_handle_t *dnh); void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); int dnode_hold(struct objset *dd, uint64_t object, void *ref, dnode_t **dnp); int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, void *ref, dnode_t **dnp); boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); void dnode_rele_and_unlock(dnode_t *dn, void *tag); void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); void dnode_sync(dnode_t *dn, dmu_tx_t *tx); void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); void dnode_free(dnode_t *dn, dmu_tx_t *tx); void dnode_byteswap(dnode_phys_t *dnp); void dnode_buf_byteswap(void *buf, size_t size); void dnode_verify(dnode_t *dn); int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); void dnode_diduse_space(dnode_t *dn, int64_t space); void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx); void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t); uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); void dnode_init(void); void dnode_fini(void); int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, int minlvl, uint64_t blkfill, uint64_t txg); void dnode_evict_dbufs(dnode_t *dn); void dnode_evict_bonus(dnode_t *dn); #ifdef ZFS_DEBUG /* * There should be a ## between the string literal and fmt, to make it * clear that we're joining two strings together, but that piece of shit * gcc doesn't support that preprocessor token. */ #define dprintf_dnode(dn, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char __db_buf[32]; \ uint64_t __db_obj = (dn)->dn_object; \ if (__db_obj == DMU_META_DNODE_OBJECT) \ (void) strcpy(__db_buf, "mdn"); \ else \ (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ (u_longlong_t)__db_obj);\ dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ __db_buf, __VA_ARGS__); \ } \ _NOTE(CONSTCOND) } while (0) #define DNODE_VERIFY(dn) dnode_verify(dn) #define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) #else #define dprintf_dnode(db, fmt, ...) #define DNODE_VERIFY(dn) #define FREE_VERIFY(db, start, end, tx) #endif #ifdef __cplusplus } #endif #endif /* _SYS_DNODE_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dataset.h (revision 286575) @@ -1,324 +1,322 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DSL_DATASET_H #define _SYS_DSL_DATASET_H #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct dsl_dataset; struct dsl_dir; struct dsl_pool; #define DS_FLAG_INCONSISTENT (1ULL<<0) #define DS_IS_INCONSISTENT(ds) \ (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) /* * Do not allow this dataset to be promoted. */ #define DS_FLAG_NOPROMOTE (1ULL<<1) /* * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE, * refquota/refreservations). */ #define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) /* * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called * on a dataset. This allows the dataset to be destroyed using 'zfs release'. */ #define DS_FLAG_DEFER_DESTROY (1ULL<<3) #define DS_IS_DEFER_DESTROY(ds) \ (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY) /* * DS_FIELD_* are strings that are used in the "extensified" dataset zap object. * They should be of the format :. */ /* * This field's value is the object ID of a zap object which contains the * bookmarks of this dataset. If it is present, then this dataset is counted * in the refcount of the SPA_FEATURES_BOOKMARKS feature. */ #define DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks" /* * This field is present (with value=0) if this dataset may contain large * blocks (>128KB). If it is present, then this dataset * is counted in the refcount of the SPA_FEATURE_LARGE_BLOCKS feature. */ #define DS_FIELD_LARGE_BLOCKS "org.open-zfs:large_blocks" /* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. */ #define DS_FLAG_CI_DATASET (1ULL<<16) #define DS_CREATE_FLAG_NODIRTY (1ULL<<24) typedef struct dsl_dataset_phys { uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ uint64_t ds_prev_snap_txg; uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */ uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ uint64_t ds_num_children; /* clone/snap children; ==0 for head */ uint64_t ds_creation_time; /* seconds since 1970 */ uint64_t ds_creation_txg; uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ /* * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes * include all blocks referenced by this dataset, including those * shared with any other datasets. */ uint64_t ds_referenced_bytes; uint64_t ds_compressed_bytes; uint64_t ds_uncompressed_bytes; uint64_t ds_unique_bytes; /* only relevant to snapshots */ /* * The ds_fsid_guid is a 56-bit ID that can change to avoid * collisions. The ds_guid is a 64-bit ID that will never * change, so there is a small probability that it will collide. */ uint64_t ds_fsid_guid; uint64_t ds_guid; uint64_t ds_flags; /* DS_FLAG_* */ blkptr_t ds_bp; uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */ uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */ } dsl_dataset_phys_t; typedef struct dsl_dataset { + dmu_buf_user_t ds_dbu; + /* Immutable: */ struct dsl_dir *ds_dir; dmu_buf_t *ds_dbuf; uint64_t ds_object; uint64_t ds_fsid_guid; + boolean_t ds_is_snapshot; /* only used in syncing context, only valid for non-snapshots: */ struct dsl_dataset *ds_prev; uint64_t ds_bookmarks; /* DMU_OTN_ZAP_METADATA */ boolean_t ds_large_blocks; boolean_t ds_need_large_blocks; /* has internal locking: */ dsl_deadlist_t ds_deadlist; bplist_t ds_pending_deadlist; /* protected by lock on pool's dp_dirty_datasets list */ txg_node_t ds_dirty_link; list_node_t ds_synced_link; /* * ds_phys->ds_ is also protected by ds_lock. * Protected by ds_lock: */ kmutex_t ds_lock; objset_t *ds_objset; uint64_t ds_userrefs; void *ds_owner; /* * Long holds prevent the ds from being destroyed; they allow the * ds to remain held even after dropping the dp_config_rwlock. * Owning counts as a long hold. See the comments above * dsl_pool_hold() for details. */ refcount_t ds_longholds; /* no locking; only for making guesses */ uint64_t ds_trysnap_txg; /* for objset_open() */ kmutex_t ds_opening_lock; uint64_t ds_reserved; /* cached refreservation */ uint64_t ds_quota; /* cached refquota */ kmutex_t ds_sendstream_lock; list_t ds_sendstreams; /* Protected by ds_lock; keep at end of struct for better locality */ char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; inline dsl_dataset_phys_t * dsl_dataset_phys(dsl_dataset_t *ds) { return (ds->ds_dbuf->db_data); } /* * The max length of a temporary tag prefix is the number of hex digits * required to express UINT64_MAX plus one for the hyphen. */ #define MAX_TAG_PREFIX_LEN 17 - -inline boolean_t -dsl_dataset_is_snapshot(dsl_dataset_t *ds) -{ - return (dsl_dataset_phys(ds)->ds_num_children != 0); -} #define DS_UNIQUE_IS_ACCURATE(ds) \ ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, dsl_dataset_t **dsp); boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds, void *tag); int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, dsl_dataset_t **); void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); int dsl_dataset_own(struct dsl_pool *dp, const char *name, void *tag, dsl_dataset_t **dsp); int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, dsl_dataset_t **dsp); void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); void dsl_dataset_name(dsl_dataset_t *ds, char *name); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx); int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors); int dsl_dataset_promote(const char *name, char *conflsnap); int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force); int dsl_dataset_rename_snapshot(const char *fsname, const char *oldsnapname, const char *newsnapname, boolean_t recursive); int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, minor_t cleanup_minor, const char *htag); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap); void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, boolean_t async); boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, const blkptr_t *bp, uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv); void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat); void dsl_dataset_space(dsl_dataset_t *ds, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); boolean_t dsl_dataset_is_dirty(dsl_dataset_t *ds); int dsl_dataset_activate_large_blocks(const char *dsname); void dsl_dataset_activate_large_blocks_sync_impl(uint64_t dsobj, dmu_tx_t *tx); int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv); int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, uint64_t quota); int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, uint64_t reservation); boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, uint64_t earlier_txg); void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag); void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag); boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx); void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, dsl_dataset_t *origin_head, dmu_tx_t *tx); int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr); void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, dmu_tx_t *tx); void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx); void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds); int dsl_dataset_get_snapname(dsl_dataset_t *ds); int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value); int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, boolean_t adj_cnt); void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, zprop_source_t source, uint64_t value, dmu_tx_t *tx); void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); int dsl_dataset_rollback(const char *fsname, void *owner, nvlist_t *result); #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__ds_name = kmem_alloc(MAXNAMELEN, KM_SLEEP); \ dsl_dataset_name(ds, __ds_name); \ dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ kmem_free(__ds_name, MAXNAMELEN); \ } \ _NOTE(CONSTCOND) } while (0) #else #define dprintf_ds(dd, fmt, ...) #endif #ifdef __cplusplus } #endif #endif /* _SYS_DSL_DATASET_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dsl_dir.h (revision 286575) @@ -1,190 +1,194 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_DSL_DIR_H #define _SYS_DSL_DIR_H #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif struct dsl_dataset; /* * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object. * They should be of the format :. */ #define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" #define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" typedef enum dd_used { DD_USED_HEAD, DD_USED_SNAP, DD_USED_CHILD, DD_USED_CHILD_RSRV, DD_USED_REFRSRV, DD_USED_NUM } dd_used_t; #define DD_FLAG_USED_BREAKDOWN (1<<0) typedef struct dsl_dir_phys { uint64_t dd_creation_time; /* not actually used */ uint64_t dd_head_dataset_obj; uint64_t dd_parent_obj; uint64_t dd_origin_obj; uint64_t dd_child_dir_zapobj; /* * how much space our children are accounting for; for leaf * datasets, == physical space used by fs + snaps */ uint64_t dd_used_bytes; uint64_t dd_compressed_bytes; uint64_t dd_uncompressed_bytes; /* Administrative quota setting */ uint64_t dd_quota; /* Administrative reservation setting */ uint64_t dd_reserved; uint64_t dd_props_zapobj; uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ uint64_t dd_flags; uint64_t dd_used_breakdown[DD_USED_NUM]; uint64_t dd_clones; /* dsl_dir objects */ uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */ } dsl_dir_phys_t; struct dsl_dir { + dmu_buf_user_t dd_dbu; + /* These are immutable; no lock needed: */ uint64_t dd_object; dsl_pool_t *dd_pool; /* Stable until user eviction; no lock needed: */ dmu_buf_t *dd_dbuf; /* protected by lock on pool's dp_dirty_dirs list */ txg_node_t dd_dirty_link; /* protected by dp_config_rwlock */ dsl_dir_t *dd_parent; /* Protected by dd_lock */ kmutex_t dd_lock; list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ uint64_t dd_origin_txg; /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; /* amount of space we expect to write; == amount of dirty data */ int64_t dd_space_towrite[TXG_SIZE]; /* protected by dd_lock; keep at end of struct for better locality */ char dd_myname[MAXNAMELEN]; }; inline dsl_dir_phys_t * dsl_dir_phys(dsl_dir_t *dd) { return (dd->dd_dbuf->db_data); } void dsl_dir_rele(dsl_dir_t *dd, void *tag); +void dsl_dir_async_rele(dsl_dir_t *dd, void *tag); int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, dsl_dir_t **, const char **tail); int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **); void dsl_dir_name(dsl_dir_t *dd, char *buf); int dsl_dir_namelen(dsl_dir_t *dd); uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx); void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); uint64_t dsl_dir_space_available(dsl_dir_t *dd, dsl_dir_t *ancestor, int64_t delta, int ondiskonly); void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, uint64_t asize, uint64_t fsize, uint64_t usize, void **tr_cookiep, dmu_tx_t *tx); void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); int dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota); int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, uint64_t reservation); int dsl_dir_activate_fs_ss_limit(const char *); int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, cred_t *); void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); int dsl_dir_rename(const char *oldname, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *); boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx); void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" #define ORIGIN_DIR_NAME "$ORIGIN" #define XLATION_DIR_NAME "$XLATION" #define FREE_DIR_NAME "$FREE" #define LEAK_DIR_NAME "$LEAK" #ifdef ZFS_DEBUG #define dprintf_dd(dd, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__ds_name = kmem_alloc(MAXNAMELEN + strlen(MOS_DIR_NAME) + 1, \ KM_SLEEP); \ dsl_dir_name(dd, __ds_name); \ dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ kmem_free(__ds_name, MAXNAMELEN + strlen(MOS_DIR_NAME) + 1); \ } \ _NOTE(CONSTCOND) } while (0) #else #define dprintf_dd(dd, fmt, ...) #endif #ifdef __cplusplus } #endif #endif /* _SYS_DSL_DIR_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa.h (revision 286575) @@ -1,171 +1,170 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_SA_H #define _SYS_SA_H #include #include /* * Currently available byteswap functions. * If it all possible new attributes should used * one of the already defined byteswap functions. * If a new byteswap function is added then the * ZPL/Pool version will need to be bumped. */ typedef enum sa_bswap_type { SA_UINT64_ARRAY, SA_UINT32_ARRAY, SA_UINT16_ARRAY, SA_UINT8_ARRAY, SA_ACL, } sa_bswap_type_t; typedef uint16_t sa_attr_type_t; /* * Attribute to register support for. */ typedef struct sa_attr_reg { char *sa_name; /* attribute name */ uint16_t sa_length; sa_bswap_type_t sa_byteswap; /* bswap functon enum */ sa_attr_type_t sa_attr; /* filled in during registration */ } sa_attr_reg_t; typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t, boolean_t, void *userptr); /* * array of attributes to store. * * This array should be treated as opaque/private data. * The SA_BULK_ADD_ATTR() macro should be used for manipulating * the array. * * When sa_replace_all_by_template() is used the attributes * will be stored in the order defined in the array, except that * the attributes may be split between the bonus and the spill buffer * */ typedef struct sa_bulk_attr { void *sa_data; sa_data_locator_t *sa_data_func; uint16_t sa_length; sa_attr_type_t sa_attr; /* the following are private to the sa framework */ void *sa_addr; uint16_t sa_buftype; uint16_t sa_size; } sa_bulk_attr_t; /* * special macro for adding entries for bulk attr support * bulk - sa_bulk_attr_t * count - integer that will be incremented during each add * attr - attribute to manipulate * func - function for accessing data. * data - pointer to data. * len - length of data */ #define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \ { \ b[idx].sa_attr = attr;\ b[idx].sa_data_func = func; \ b[idx].sa_data = data; \ b[idx++].sa_length = len; \ } typedef struct sa_os sa_os_t; typedef enum sa_handle_type { SA_HDL_SHARED, SA_HDL_PRIVATE } sa_handle_type_t; struct sa_handle; typedef void *sa_lookup_tab_t; typedef struct sa_handle sa_handle_t; typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx); int sa_handle_get(objset_t *, uint64_t, void *userp, sa_handle_type_t, sa_handle_t **); int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp, sa_handle_type_t, sa_handle_t **); void sa_handle_destroy(sa_handle_t *); int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **); void sa_buf_rele(dmu_buf_t *, void *); int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen); int sa_update(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen, dmu_tx_t *); int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *); int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count); int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count); int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *); int sa_size(sa_handle_t *, sa_attr_type_t, int *); int sa_update_from_cb(sa_handle_t *, sa_attr_type_t, uint32_t buflen, sa_data_locator_t *, void *userdata, dmu_tx_t *); void sa_object_info(sa_handle_t *, dmu_object_info_t *); void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); -void sa_update_user(sa_handle_t *, sa_handle_t *); void *sa_get_userdata(sa_handle_t *); void sa_set_userp(sa_handle_t *, void *); dmu_buf_t *sa_get_db(sa_handle_t *); uint64_t sa_handle_object(sa_handle_t *); boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size); void sa_register_update_callback(objset_t *, sa_update_cb_t *); int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **); void sa_tear_down(objset_t *); int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *, int, dmu_tx_t *); int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *, int, dmu_tx_t *); boolean_t sa_enabled(objset_t *); void sa_cache_init(); void sa_cache_fini(); int sa_set_sa_object(objset_t *, uint64_t); int sa_hdrsize(void *); void sa_handle_lock(sa_handle_t *); void sa_handle_unlock(sa_handle_t *); #ifdef _KERNEL int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *); #endif #ifdef __cplusplus extern "C" { #endif #ifdef __cplusplus } #endif #endif /* _SYS_SA_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/sa_impl.h (revision 286575) @@ -1,289 +1,291 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_SA_IMPL_H #define _SYS_SA_IMPL_H #include #include #include /* * Array of known attributes and their * various characteristics. */ typedef struct sa_attr_table { sa_attr_type_t sa_attr; uint8_t sa_registered; uint16_t sa_length; sa_bswap_type_t sa_byteswap; char *sa_name; } sa_attr_table_t; /* * Zap attribute format for attribute registration * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * | unused | len | bswap | attr num | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Zap attribute format for layout information. * * layout information is stored as an array of attribute numbers * The name of the attribute is the layout number (0, 1, 2, ...) * * 16 0 * +---- ---+ * | attr # | * +--------+ * | attr # | * +--- ----+ * ...... * */ #define ATTR_BSWAP(x) BF32_GET(x, 16, 8) #define ATTR_LENGTH(x) BF32_GET(x, 24, 16) #define ATTR_NUM(x) BF32_GET(x, 0, 16) #define ATTR_ENCODE(x, attr, length, bswap) \ { \ BF64_SET(x, 24, 16, length); \ BF64_SET(x, 16, 8, bswap); \ BF64_SET(x, 0, 16, attr); \ } #define TOC_OFF(x) BF32_GET(x, 0, 23) #define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1) #define TOC_LEN_IDX(x) BF32_GET(x, 24, 4) #define TOC_ATTR_ENCODE(x, len_idx, offset) \ { \ BF32_SET(x, 31, 1, 1); \ BF32_SET(x, 24, 7, len_idx); \ BF32_SET(x, 0, 24, offset); \ } #define SA_LAYOUTS "LAYOUTS" #define SA_REGISTRY "REGISTRY" /* * Each unique layout will have their own table * sa_lot (layout_table) */ typedef struct sa_lot { avl_node_t lot_num_node; avl_node_t lot_hash_node; uint64_t lot_num; uint64_t lot_hash; sa_attr_type_t *lot_attrs; /* array of attr #'s */ uint32_t lot_var_sizes; /* how many aren't fixed size */ uint32_t lot_attr_count; /* total attr count */ list_t lot_idx_tab; /* should be only a couple of entries */ int lot_instance; /* used with lot_hash to identify entry */ } sa_lot_t; /* index table of offsets */ typedef struct sa_idx_tab { list_node_t sa_next; sa_lot_t *sa_layout; uint16_t *sa_variable_lengths; refcount_t sa_refcount; uint32_t *sa_idx_tab; /* array of offsets */ } sa_idx_tab_t; /* * Since the offset/index information into the actual data * will usually be identical we can share that information with * all handles that have the exact same offsets. * * You would typically only have a large number of different table of * contents if you had a several variable sized attributes. * * Two AVL trees are used to track the attribute layout numbers. * one is keyed by number and will be consulted when a DMU_OT_SA * object is first read. The second tree is keyed by the hash signature * of the attributes and will be consulted when an attribute is added * to determine if we already have an instance of that layout. Both * of these tree's are interconnected. The only difference is that * when an entry is found in the "hash" tree the list of attributes will * need to be compared against the list of attributes you have in hand. * The assumption is that typically attributes will just be updated and * adding a completely new attribute is a very rare operation. */ struct sa_os { kmutex_t sa_lock; boolean_t sa_need_attr_registration; boolean_t sa_force_spill; uint64_t sa_master_obj; uint64_t sa_reg_attr_obj; uint64_t sa_layout_attr_obj; int sa_num_attrs; sa_attr_table_t *sa_attr_table; /* private attr table */ sa_update_cb_t *sa_update_cb; avl_tree_t sa_layout_num_tree; /* keyed by layout number */ avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */ int sa_user_table_sz; sa_attr_type_t *sa_user_table; /* user name->attr mapping table */ }; /* * header for all bonus and spill buffers. * * The header has a fixed portion with a variable number * of "lengths" depending on the number of variable sized * attributes which are determined by the "layout number" */ #define SA_MAGIC 0x2F505A /* ZFS SA */ typedef struct sa_hdr_phys { uint32_t sa_magic; /* BEGIN CSTYLED */ /* * Encoded with hdrsize and layout number as follows: * 16 10 0 * +--------+-------+ * | hdrsz |layout | * +--------+-------+ * * Bits 0-10 are the layout number * Bits 11-16 are the size of the header. * The hdrsize is the number * 8 * * For example. * hdrsz of 1 ==> 8 byte header * 2 ==> 16 byte header * */ /* END CSTYLED */ uint16_t sa_layout_info; uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ /* ... Data follows the lengths. */ } sa_hdr_phys_t; #define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) #define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0) #define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ { \ BF32_SET_SB(x, 10, 6, 3, 0, size); \ BF32_SET(x, 0, 10, num); \ } typedef enum sa_buf_type { SA_BONUS = 1, SA_SPILL = 2 } sa_buf_type_t; typedef enum sa_data_op { SA_LOOKUP, SA_UPDATE, SA_ADD, SA_REPLACE, SA_REMOVE } sa_data_op_t; /* * Opaque handle used for most sa functions * * This needs to be kept as small as possible. */ struct sa_handle { + dmu_buf_user_t sa_dbu; kmutex_t sa_lock; dmu_buf_t *sa_bonus; dmu_buf_t *sa_spill; objset_t *sa_os; - void *sa_userp; + void *sa_userp; sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ }; #define SA_GET_DB(hdl, type) \ (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) #define SA_GET_HDR(hdl, type) \ ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ type))->db.db_data)) #define SA_IDX_TAB_GET(hdl, type) \ (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) #define IS_SA_BONUSTYPE(a) \ ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) #define SA_BONUSTYPE_FROM_DB(db) \ (dmu_get_bonustype((dmu_buf_t *)db)) #define SA_BLKPTR_SPACE (DN_MAX_BONUSLEN - sizeof (blkptr_t)) #define SA_LAYOUT_NUM(x, type) \ ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \ ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 1 : SA_HDR_LAYOUT_NUM(x)))) #define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length #define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\ hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \ SA_REGISTERED_LEN(sa, attr)) #define SA_SET_HDR(hdr, num, size) \ { \ hdr->sa_magic = SA_MAGIC; \ SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \ } #define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \ { \ bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \ bulk.sa_buftype = type; \ bulk.sa_addr = \ (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \ (uintptr_t)hdr); \ } #define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \ (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \ (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \ sizeof (uint16_t), 8) : 0))) int sa_add_impl(sa_handle_t *, sa_attr_type_t, uint32_t, sa_data_locator_t, void *, dmu_tx_t *); void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *); int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *); void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *); int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t, uint16_t *, sa_hdr_phys_t *); #ifdef __cplusplus extern "C" { #endif #ifdef __cplusplus } #endif #endif /* _SYS_SA_IMPL_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h (revision 286575) @@ -1,894 +1,899 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_SPA_H #define _SYS_SPA_H #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Forward references that lots of things need. */ typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; typedef struct metaslab_group metaslab_group_t; typedef struct metaslab_class metaslab_class_t; typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; typedef struct ddt ddt_t; typedef struct ddt_entry ddt_entry_t; struct dsl_pool; struct dsl_dataset; /* * General-purpose 32-bit and 64-bit bitfield encodings. */ #define BF32_DECODE(x, low, len) P2PHASE((x) >> (low), 1U << (len)) #define BF64_DECODE(x, low, len) P2PHASE((x) >> (low), 1ULL << (len)) #define BF32_ENCODE(x, low, len) (P2PHASE((x), 1U << (len)) << (low)) #define BF64_ENCODE(x, low, len) (P2PHASE((x), 1ULL << (len)) << (low)) #define BF32_GET(x, low, len) BF32_DECODE(x, low, len) #define BF64_GET(x, low, len) BF64_DECODE(x, low, len) #define BF32_SET(x, low, len, val) do { \ ASSERT3U(val, <, 1U << (len)); \ ASSERT3U(low + len, <=, 32); \ (x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \ _NOTE(CONSTCOND) } while (0) #define BF64_SET(x, low, len, val) do { \ ASSERT3U(val, <, 1ULL << (len)); \ ASSERT3U(low + len, <=, 64); \ ((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \ _NOTE(CONSTCOND) } while (0) #define BF32_GET_SB(x, low, len, shift, bias) \ ((BF32_GET(x, low, len) + (bias)) << (shift)) #define BF64_GET_SB(x, low, len, shift, bias) \ ((BF64_GET(x, low, len) + (bias)) << (shift)) #define BF32_SET_SB(x, low, len, shift, bias, val) do { \ ASSERT(IS_P2ALIGNED(val, 1U << shift)); \ ASSERT3S((val) >> (shift), >=, bias); \ BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ _NOTE(CONSTCOND) } while (0) #define BF64_SET_SB(x, low, len, shift, bias, val) do { \ ASSERT(IS_P2ALIGNED(val, 1ULL << shift)); \ ASSERT3S((val) >> (shift), >=, bias); \ BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ _NOTE(CONSTCOND) } while (0) /* * We currently support block sizes from 512 bytes to 16MB. * The benefits of larger blocks, and thus larger IO, need to be weighed * against the cost of COWing a giant block to modify one byte, and the * large latency of reading or writing a large block. * * Note that although blocks up to 16MB are supported, the recordsize * property can not be set larger than zfs_max_recordsize (default 1MB). * See the comment near zfs_max_recordsize in dsl_dataset.c for details. * * Note that although the LSIZE field of the blkptr_t can store sizes up * to 32MB, the dnode's dn_datablkszsec can only store sizes up to * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. */ #define SPA_MINBLOCKSHIFT 9 #define SPA_OLD_MAXBLOCKSHIFT 17 #define SPA_MAXBLOCKSHIFT 24 #define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) #define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) #define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) /* * Default maximum supported logical ashift. * * The current 8k allocation block size limit is due to the 8k * aligned/sized operations performed by vdev_probe() on * vdev_label->vl_pad2. Using another "safe region" for these tests * would allow the limit to be raised to 16k, at the expense of * only having 8 available uberblocks in the label area. */ #define SPA_MAXASHIFT 13 /* * Default minimum supported logical ashift. */ #define SPA_MINASHIFT SPA_MINBLOCKSHIFT /* * Size of block to hold the configuration data (a packed nvlist) */ #define SPA_CONFIG_BLOCKSIZE (1ULL << 14) /* * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. * The ASIZE encoding should be at least 64 times larger (6 more bits) * to support up to 4-way RAID-Z mirror mode with worst-case gang block * overhead, three DVAs per bp, plus one more bit in case we do anything * else that expands the ASIZE. */ #define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ #define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ #define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ /* * All SPA data is represented by 128-bit data virtual addresses (DVAs). * The members of the dva_t should be considered opaque outside the SPA. */ typedef struct dva { uint64_t dva_word[2]; } dva_t; /* * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. */ typedef struct zio_cksum { uint64_t zc_word[4]; } zio_cksum_t; /* * Each block is described by its DVAs, time of birth, checksum, etc. * The word-by-word, bit-by-bit layout of the blkptr is as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | vdev1 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 1 |G| offset1 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 2 | vdev2 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 3 |G| offset2 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 4 | vdev3 | GRID | ASIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ * c | checksum[0] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * d | checksum[1] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * e | checksum[2] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * f | checksum[3] | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * vdev virtual device ID * offset offset into virtual device * LSIZE logical size * PSIZE physical size (after compression) * ASIZE allocated size (including RAID-Z parity and gang block headers) * GRID RAID-Z layout information (reserved for future use) * cksum checksum function * comp compression function * G gang block indicator * B byteorder (endianness) * D dedup * X encryption (on version 30, which is not supported) * E blkptr_t contains embedded data (see below) * lvl level of indirection * type DMU object type * phys birth txg of block allocation; zero if same as logical birth txg * log. birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ /* * "Embedded" blkptr_t's don't actually point to a block, instead they * have a data payload embedded in the blkptr_t itself. See the comment * in blkptr.c for more details. * * The blkptr_t is laid out as follows: * * 64 56 48 40 32 24 16 8 0 * +-------+-------+-------+-------+-------+-------+-------+-------+ * 0 | payload | * 1 | payload | * 2 | payload | * 3 | payload | * 4 | payload | * 5 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | payload | * 8 | payload | * 9 | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | payload | * c | payload | * d | payload | * e | payload | * f | payload | * +-------+-------+-------+-------+-------+-------+-------+-------+ * * Legend: * * payload contains the embedded data * B (byteorder) byteorder (endianness) * D (dedup) padding (set to zero) * X encryption (set to zero; see above) * E (embedded) set to one * lvl indirection level * type DMU object type * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) * comp compression function of payload * PSIZE size of payload after compression, in bytes * LSIZE logical size of payload, in bytes * note that 25 bits is enough to store the largest * "normal" BP's LSIZE (2^16 * 2^9) in bytes * log. birth transaction group in which the block was logically born * * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded * bp's they are stored in units of SPA_MINBLOCKSHIFT. * Generally, the generic BP_GET_*() macros can be used on embedded BP's. * The B, D, X, lvl, type, and comp fields are stored the same as with normal * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before * other macros, as they assert that they are only used on BP's of the correct * "embedded-ness". */ #define BPE_GET_ETYPE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET((bp)->blk_prop, 40, 8)) #define BPE_SET_ETYPE(bp, t) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, t); \ _NOTE(CONSTCOND) } while (0) #define BPE_GET_LSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) #define BPE_SET_LSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BPE_GET_PSIZE(bp) \ (ASSERT(BP_IS_EMBEDDED(bp)), \ BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) #define BPE_SET_PSIZE(bp, x) do { \ ASSERT(BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ _NOTE(CONSTCOND) } while (0) typedef enum bp_embedded_type { BP_EMBEDDED_TYPE_DATA, BP_EMBEDDED_TYPE_RESERVED, /* Reserved for an unintegrated feature. */ NUM_BP_EMBEDDED_TYPES = BP_EMBEDDED_TYPE_RESERVED } bp_embedded_type_t; #define BPE_NUM_WORDS 14 #define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) #define BPE_IS_PAYLOADWORD(bp, wp) \ ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ /* * A block is a hole when it has either 1) never been written to, or * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads * without physically allocating disk space. Holes are represented in the * blkptr_t structure by zeroed blk_dva. Correct checking for holes is * done through the BP_IS_HOLE macro. For holes, the logical size, level, * DMU object type, and birth times are all also stored for holes that * were written to at some point (i.e. were punched after having been filled). */ typedef struct blkptr { dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ uint64_t blk_prop; /* size, compression, type, etc */ uint64_t blk_pad[2]; /* Extra space for the future */ uint64_t blk_phys_birth; /* txg when block was allocated */ uint64_t blk_birth; /* transaction group at birth */ uint64_t blk_fill; /* fill count */ zio_cksum_t blk_cksum; /* 256-bit checksum */ } blkptr_t; /* * Macros to get and set fields in a bp or DVA. */ #define DVA_GET_ASIZE(dva) \ BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_ASIZE(dva, x) \ BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) #define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) #define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, 32) #define DVA_SET_VDEV(dva, x) BF64_SET((dva)->dva_word[0], 32, 32, x) #define DVA_GET_OFFSET(dva) \ BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) #define DVA_SET_OFFSET(dva, x) \ BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) #define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? \ (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? BPE_GET_LSIZE(bp) : 0): \ BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_LSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_PSIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) #define BP_SET_PSIZE(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET_SB((bp)->blk_prop, \ 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 7) #define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 7, x) #define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) #define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) #define BP_GET_CHECKSUM(bp) \ (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ BF64_GET((bp)->blk_prop, 40, 8)) #define BP_SET_CHECKSUM(bp, x) do { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ BF64_SET((bp)->blk_prop, 40, 8, x); \ _NOTE(CONSTCOND) } while (0) #define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) #define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) #define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) #define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) #define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) #define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) #define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) #define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) #define BP_PHYSICAL_BIRTH(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) #define BP_SET_BIRTH(bp, logical, physical) \ { \ ASSERT(!BP_IS_EMBEDDED(bp)); \ (bp)->blk_birth = (logical); \ (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ } #define BP_GET_FILL(bp) (BP_IS_EMBEDDED(bp) ? 1 : (bp)->blk_fill) #define BP_GET_ASIZE(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ DVA_GET_ASIZE(&(bp)->blk_dva[2])) #define BP_GET_UCSIZE(bp) \ ((BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) ? \ BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) #define BP_GET_NDVAS(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ !!DVA_GET_ASIZE(&(bp)->blk_dva[2])) #define BP_COUNT_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? 0 : \ (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ DVA_GET_GANG(&(bp)->blk_dva[1]) + \ DVA_GET_GANG(&(bp)->blk_dva[2]))) #define DVA_EQUAL(dva1, dva2) \ ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) #define BP_EQUAL(bp1, bp2) \ (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ (bp1)->blk_birth == (bp2)->blk_birth && \ DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) #define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ ((zc1).zc_word[3] - (zc2).zc_word[3]))) #define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) #define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ { \ (zcp)->zc_word[0] = w0; \ (zcp)->zc_word[1] = w1; \ (zcp)->zc_word[2] = w2; \ (zcp)->zc_word[3] = w3; \ } #define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) #define BP_IS_GANG(bp) \ (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) #define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ (dva)->dva_word[1] == 0ULL) #define BP_IS_HOLE(bp) \ (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) /* BP_IS_RAIDZ(bp) assumes no block compression */ #define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ BP_GET_PSIZE(bp)) #define BP_ZERO(bp) \ { \ (bp)->blk_dva[0].dva_word[0] = 0; \ (bp)->blk_dva[0].dva_word[1] = 0; \ (bp)->blk_dva[1].dva_word[0] = 0; \ (bp)->blk_dva[1].dva_word[1] = 0; \ (bp)->blk_dva[2].dva_word[0] = 0; \ (bp)->blk_dva[2].dva_word[1] = 0; \ (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ (bp)->blk_phys_birth = 0; \ (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } #if BYTE_ORDER == _BIG_ENDIAN #define ZFS_HOST_BYTEORDER (0ULL) #else #define ZFS_HOST_BYTEORDER (1ULL) #endif #define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) #define BP_SPRINTF_LEN 320 /* * This macro allows code sharing between zfs, libzpool, and mdb. * 'func' is either snprintf() or mdb_snprintf(). * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. */ #define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \ { \ static const char *copyname[] = \ { "zero", "single", "double", "triple" }; \ int len = 0; \ int copies = 0; \ \ if (bp == NULL) { \ len += func(buf + len, size - len, ""); \ } else if (BP_IS_HOLE(bp)) { \ len += func(buf + len, size - len, ""); \ if (bp->blk_birth > 0) { \ len += func(buf + len, size - len, \ " birth=%lluL", \ (u_longlong_t)bp->blk_birth); \ } \ } else if (BP_IS_EMBEDDED(bp)) { \ len = func(buf + len, size - len, \ "EMBEDDED [L%llu %s] et=%u %s " \ "size=%llxL/%llxP birth=%lluL", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ (int)BPE_GET_ETYPE(bp), \ compress, \ (u_longlong_t)BPE_GET_LSIZE(bp), \ (u_longlong_t)BPE_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth); \ } else { \ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ const dva_t *dva = &bp->blk_dva[d]; \ if (DVA_IS_VALID(dva)) \ copies++; \ len += func(buf + len, size - len, \ "DVA[%d]=<%llu:%llx:%llx>%c", d, \ (u_longlong_t)DVA_GET_VDEV(dva), \ (u_longlong_t)DVA_GET_OFFSET(dva), \ (u_longlong_t)DVA_GET_ASIZE(dva), \ ws); \ } \ if (BP_IS_GANG(bp) && \ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ copies--; \ len += func(buf + len, size - len, \ "[L%llu %s] %s %s %s %s %s %s%c" \ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ "cksum=%llx:%llx:%llx:%llx", \ (u_longlong_t)BP_GET_LEVEL(bp), \ type, \ checksum, \ compress, \ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ BP_IS_GANG(bp) ? "gang" : "contiguous", \ BP_GET_DEDUP(bp) ? "dedup" : "unique", \ copyname[copies], \ ws, \ (u_longlong_t)BP_GET_LSIZE(bp), \ (u_longlong_t)BP_GET_PSIZE(bp), \ (u_longlong_t)bp->blk_birth, \ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ (u_longlong_t)BP_GET_FILL(bp), \ ws, \ (u_longlong_t)bp->blk_cksum.zc_word[0], \ (u_longlong_t)bp->blk_cksum.zc_word[1], \ (u_longlong_t)bp->blk_cksum.zc_word[2], \ (u_longlong_t)bp->blk_cksum.zc_word[3]); \ } \ ASSERT(len < size); \ } #include #define BP_GET_BUFC_TYPE(bp) \ (((BP_GET_LEVEL(bp) > 0) || (DMU_OT_IS_METADATA(BP_GET_TYPE(bp)))) ? \ ARC_BUFC_METADATA : ARC_BUFC_DATA) typedef enum spa_import_type { SPA_IMPORT_EXISTING, SPA_IMPORT_ASSEMBLE } spa_import_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); extern int spa_open_rewind(const char *pool, spa_t **, void *tag, nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, nvlist_t *zplprops); #ifdef illumos extern int spa_import_rootpool(char *devpath, char *devid); #else extern int spa_import_rootpool(const char *name); #endif extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, boolean_t hardforce); extern int spa_reset(char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); extern void spa_async_suspend(spa_t *spa); extern void spa_async_resume(spa_t *spa); extern spa_t *spa_inject_addref(char *pool); extern void spa_inject_delref(spa_t *spa); extern void spa_scan_stat_init(spa_t *spa); extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); #define SPA_ASYNC_CONFIG_UPDATE 0x01 #define SPA_ASYNC_REMOVE 0x02 #define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 #define SPA_ASYNC_AUTOEXPAND 0x20 #define SPA_ASYNC_REMOVE_DONE 0x40 #define SPA_ASYNC_REMOVE_STOP 0x80 /* * Controls the behavior of spa_vdev_remove(). */ #define SPA_REMOVE_UNSPARE 0x01 #define SPA_REMOVE_DONE 0x02 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing); extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern boolean_t spa_vdev_remove_active(spa_t *spa); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); extern void spa_spare_remove(vdev_t *vd); extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); extern void spa_spare_activate(vdev_t *vd); /* L2ARC state (which is global across all pools) */ extern void spa_l2cache_add(vdev_t *vd); extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); /* scanning */ extern int spa_scan(spa_t *spa, pool_scan_func_t func); extern int spa_scan_stop(spa_t *spa); /* spa syncing */ extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; /* * SPA configuration functions in spa_config.c */ #define SPA_CONFIG_UPDATE_POOL 0 #define SPA_CONFIG_UPDATE_VDEVS 1 extern void spa_config_sync(spa_t *, boolean_t, boolean_t); extern void spa_config_load(void); extern nvlist_t *spa_all_configs(uint64_t *); extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); /* * Miscellaneous SPA routines in spa_misc.c */ /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); /* Refcount functions */ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); +extern void spa_async_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); #define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ #define SCL_ALLOC 0x08 #define SCL_ZIO 0x10 #define SCL_FREE 0x20 #define SCL_VDEV 0x40 #define SCL_LOCKS 7 #define SCL_ALL ((1 << SCL_LOCKS) - 1) #define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) /* Pool configuration locks */ extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw); extern void spa_config_exit(spa_t *spa, int locks, void *tag); extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); extern uint64_t spa_vdev_config_enter(spa_t *spa); extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); /* Log state */ typedef enum spa_log_state { SPA_LOG_UNKNOWN = 0, /* unknown log state */ SPA_LOG_MISSING, /* missing log(s) */ SPA_LOG_CLEAR, /* clear the log(s) */ SPA_LOG_GOOD, /* log(s) are good */ } spa_log_state_t; extern spa_log_state_t spa_get_log_state(spa_t *spa); extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); extern int spa_offline_log(spa_t *spa); /* Log claim callback */ extern void spa_claim_notify(zio_t *zio); /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); extern boolean_t spa_is_initializing(spa_t *spa); extern blkptr_t *spa_get_rootblkptr(spa_t *spa); extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); extern void spa_altroot(spa_t *, char *, size_t); extern int spa_sync_pass(spa_t *spa); extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_load_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_slop_space(spa_t *spa); extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern boolean_t spa_deflate(spa_t *spa); extern metaslab_class_t *spa_normal_class(spa_t *spa); extern metaslab_class_t *spa_log_class(spa_t *spa); +extern void spa_evicting_os_register(spa_t *, objset_t *os); +extern void spa_evicting_os_deregister(spa_t *, objset_t *os); +extern void spa_evicting_os_wait(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_prev_software_version(spa_t *spa); extern int spa_busy(void); extern uint8_t spa_get_failmode(spa_t *spa); extern boolean_t spa_suspended(spa_t *spa); extern uint64_t spa_bootfs(spa_t *spa); extern uint64_t spa_delegation(spa_t *spa); extern objset_t *spa_meta_objset(spa_t *spa); extern uint64_t spa_deadman_synctime(spa_t *spa); /* Miscellaneous support routines */ extern void spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx); extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); extern int spa_rename(const char *oldname, const char *newname); extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); extern uint64_t spa_generate_guid(spa_t *spa); extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern int spa_change_guid(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); extern boolean_t spa_writeable(spa_t *spa); extern boolean_t spa_has_pending_synctask(spa_t *spa); extern int spa_maxblocksize(spa_t *spa); extern void zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp); extern int spa_mode(spa_t *spa); extern uint64_t zfs_strtonum(const char *str, char **nptr); #define strtonum(str, nptr) zfs_strtonum((str), (nptr)) extern char *spa_his_ievent_table[]; extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf); extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); extern void spa_history_log_version(spa_t *spa, const char *operation); extern void spa_history_log_internal(spa_t *spa, const char *operation, dmu_tx_t *tx, const char *fmt, ...); extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, dmu_tx_t *tx, const char *fmt, ...); extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, dmu_tx_t *tx, const char *fmt, ...); /* error handling */ struct zbookmark_phys; extern void spa_log_error(spa_t *spa, zio_t *zio); extern void zfs_ereport_post(const char *cls, spa_t *spa, vdev_t *vd, zio_t *zio, uint64_t stateoroffset, uint64_t length); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); extern void spa_errlog_rotate(spa_t *spa); extern void spa_errlog_drain(spa_t *spa); extern void spa_errlog_sync(spa_t *spa, uint64_t txg); extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); /* vdev cache */ extern void vdev_cache_stat_init(void); extern void vdev_cache_stat_fini(void); /* Initialization and termination */ extern void spa_init(int flags); extern void spa_fini(void); extern void spa_boot_init(); /* properties */ extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); #ifdef ZFS_DEBUG #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ _NOTE(CONSTCOND) } while (0) #else #define dprintf_bp(bp, fmt, ...) #endif extern boolean_t spa_debug_enabled(spa_t *spa); #define spa_dbgmsg(spa, ...) \ { \ if (spa_debug_enabled(spa)) \ zfs_dbgmsg(__VA_ARGS__); \ } extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus } #endif #endif /* _SYS_SPA_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa_impl.h (revision 286575) @@ -1,294 +1,298 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2015 by Delphix. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_SPA_IMPL_H #define _SYS_SPA_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif typedef struct spa_error_entry { zbookmark_phys_t se_bookmark; char *se_name; avl_node_t se_avl; } spa_error_entry_t; typedef struct spa_history_phys { uint64_t sh_pool_create_len; /* ending offset of zpool create */ uint64_t sh_phys_max_off; /* physical EOF */ uint64_t sh_bof; /* logical BOF */ uint64_t sh_eof; /* logical EOF */ uint64_t sh_records_lost; /* num of records overwritten */ } spa_history_phys_t; struct spa_aux_vdev { uint64_t sav_object; /* MOS object for device list */ nvlist_t *sav_config; /* cached device config */ vdev_t **sav_vdevs; /* devices */ int sav_count; /* number devices */ boolean_t sav_sync; /* sync the device list */ nvlist_t **sav_pending; /* pending device additions */ uint_t sav_npending; /* # pending devices */ }; typedef struct spa_config_lock { kmutex_t scl_lock; kthread_t *scl_writer; int scl_write_wanted; kcondvar_t scl_cv; refcount_t scl_count; } spa_config_lock_t; typedef struct spa_config_dirent { list_node_t scd_link; char *scd_path; } spa_config_dirent_t; typedef enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES } zio_taskq_type_t; /* * State machine for the zpool-poolname process. The states transitions * are done as follows: * * From To Routine * PROC_NONE -> PROC_CREATED spa_activate() * PROC_CREATED -> PROC_ACTIVE spa_thread() * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() * PROC_DEACTIVATE -> PROC_GONE spa_thread() * PROC_GONE -> PROC_NONE spa_deactivate() */ typedef enum spa_proc_state { SPA_PROC_NONE, /* spa_proc = &p0, no process created */ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ } spa_proc_state_t; typedef struct spa_taskqs { uint_t stqs_count; taskq_t **stqs_taskq; } spa_taskqs_t; struct spa { /* * Fields protected by spa_namespace_lock. */ char spa_name[MAXNAMELEN]; /* pool name */ char *spa_comment; /* comment */ avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ nvlist_t *spa_config_splitting; /* config for splitting */ nvlist_t *spa_load_info; /* info and errors from load */ uint64_t spa_config_txg; /* txg of last config change */ int spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ uint64_t spa_import_flags; /* import specific flags */ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; boolean_t spa_is_initializing; /* true while opening pool */ metaslab_class_t *spa_normal_class; /* normal data class */ metaslab_class_t *spa_log_class; /* intent log data class */ uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ uint64_t spa_load_max_txg; /* best initial ub_txg */ uint64_t spa_claim_max_txg; /* highest claimed birth txg */ timespec_t spa_loaded_ts; /* 1st successful open time */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ + kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */ + list_t spa_evicting_os_list; /* Objsets being evicted. */ + kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ int spa_min_ashift; /* of vdevs in normal class */ int spa_max_ashift; /* of vdevs in normal class */ uint64_t spa_config_guid; /* config pool guid */ uint64_t spa_load_guid; /* spa_load initialized guid */ uint64_t spa_last_synced_guid; /* last synced guid */ list_t spa_config_dirty_list; /* vdevs with dirty config */ list_t spa_state_dirty_list; /* vdevs with dirty state */ spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ uint64_t spa_last_io; /* lbolt of last non-scan I/O */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended? */ uint8_t spa_scrub_type; /* type of scrub we're doing */ uint8_t spa_scrub_finished; /* indicator to rotate logs */ uint8_t spa_scrub_started; /* started since last boot */ uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ uint64_t spa_scan_pass_start; /* start time per pass/reboot */ uint64_t spa_scan_pass_exam; /* examined bytes per pass */ kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ kthread_t *spa_async_thread_vd; /* thread doing vd async task */ int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ int spa_last_open_failed; /* error if last open failed */ uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ uint64_t spa_load_txg; /* ub txg that loaded */ uint64_t spa_load_txg_ts; /* timestamp from that ub */ uint64_t spa_load_meta_errors; /* verify metadata err count */ uint64_t spa_load_data_errors; /* verify data err count */ uint64_t spa_verify_min_txg; /* start txg of verify scrub */ kmutex_t spa_errlog_lock; /* error log lock */ uint64_t spa_errlog_last; /* last error log object */ uint64_t spa_errlog_scrub; /* scrub error log object */ kmutex_t spa_errlist_lock; /* error list/ereport lock */ avl_tree_t spa_errlist_last; /* last error list */ avl_tree_t spa_errlist_scrub; /* scrub error list */ uint64_t spa_deflate; /* should we deflate? */ uint64_t spa_history; /* history object */ kmutex_t spa_history_lock; /* history lock */ vdev_t *spa_pending_vdev; /* pending vdev additions */ kmutex_t spa_props_lock; /* property lock */ uint64_t spa_pool_props_object; /* object for properties */ uint64_t spa_bootfs; /* default boot filesystem */ uint64_t spa_failmode; /* failure mode for the pool */ uint64_t spa_delegation; /* delegation on/off */ list_t spa_config_list; /* previous cache file(s) */ /* per-CPU array of root of async I/O: */ zio_t **spa_async_zio_root; zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ uint8_t spa_claiming; /* pool is doing zil_claim() */ boolean_t spa_debug; /* debug enabled? */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ int spa_mode; /* FREAD | FWRITE */ spa_log_state_t spa_log_state; /* log state */ uint64_t spa_autoexpand; /* lun expansion on/off */ ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ uint64_t spa_ddt_stat_object; /* DDT statistics */ uint64_t spa_dedup_ditto; /* dedup ditto threshold */ uint64_t spa_dedup_checksum; /* default dedup checksum */ uint64_t spa_dspace; /* dspace in normal class */ kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ kmutex_t spa_proc_lock; /* protects spa_proc* */ kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ spa_proc_state_t spa_proc_state; /* see definition */ struct proc *spa_proc; /* "zpool-poolname" process */ uint64_t spa_did; /* if procp != p0, did of t1 */ kthread_t *spa_trim_thread; /* thread sending TRIM I/Os */ kmutex_t spa_trim_lock; /* protects spa_trim_cv */ kcondvar_t spa_trim_cv; /* used to notify TRIM thread */ boolean_t spa_autoreplace; /* autoreplace set in open */ int spa_vdev_locks; /* locks grabbed */ uint64_t spa_creation_version; /* version at pool creation */ uint64_t spa_prev_software_version; /* See ub_software_version */ uint64_t spa_feat_for_write_obj; /* required to write to pool */ uint64_t spa_feat_for_read_obj; /* required to read from pool */ uint64_t spa_feat_desc_obj; /* Feature descriptions */ uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ /* cache feature refcounts */ uint64_t spa_feat_refcount_cache[SPA_FEATURES]; #ifdef illumos cyclic_id_t spa_deadman_cycid; /* cyclic id */ #else /* !illumos */ #ifdef _KERNEL struct callout spa_deadman_cycid; /* callout id */ #endif #endif /* illumos */ uint64_t spa_deadman_calls; /* number of deadman calls */ hrtime_t spa_sync_starttime; /* starting time fo spa_sync */ uint64_t spa_deadman_synctime; /* deadman expiration timer */ #ifdef illumos /* * spa_iokstat_lock protects spa_iokstat and * spa_queue_stats[]. */ kmutex_t spa_iokstat_lock; struct kstat *spa_iokstat; /* kstat of io to this pool */ struct { int spa_active; int spa_queued; } spa_queue_stats[ZIO_PRIORITY_NUM_QUEUEABLE]; #endif hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ /* * spa_refcount & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. * In order for the MDB module to function correctly, the other * fields must remain in the same location. */ spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ refcount_t spa_refcount; /* number of opens */ #ifndef illumos boolean_t spa_splitting_newspa; /* creating new spa in split */ #endif }; extern const char *spa_config_path; extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); #ifdef __cplusplus } #endif #endif /* _SYS_SPA_IMPL_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_impl.h (revision 286575) @@ -1,236 +1,238 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_ZAP_IMPL_H #define _SYS_ZAP_IMPL_H #include #include #include #ifdef __cplusplus extern "C" { #endif extern int fzap_default_block_shift; #define ZAP_MAGIC 0x2F52AB2ABULL #define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) #define MZAP_ENT_LEN 64 #define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) #define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE #define ZAP_NEED_CD (-1U) typedef struct mzap_ent_phys { uint64_t mze_value; uint32_t mze_cd; uint16_t mze_pad; /* in case we want to chain them someday */ char mze_name[MZAP_NAME_LEN]; } mzap_ent_phys_t; typedef struct mzap_phys { uint64_t mz_block_type; /* ZBT_MICRO */ uint64_t mz_salt; uint64_t mz_normflags; uint64_t mz_pad[5]; mzap_ent_phys_t mz_chunk[1]; /* actually variable size depending on block size */ } mzap_phys_t; typedef struct mzap_ent { avl_node_t mze_node; int mze_chunkid; uint64_t mze_hash; uint32_t mze_cd; /* copy from mze_phys->mze_cd */ } mzap_ent_t; #define MZE_PHYS(zap, mze) \ (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid]) /* * The (fat) zap is stored in one object. It is an array of * 1<= 6] [zap_leaf_t] [ptrtbl] ... * */ struct dmu_buf; struct zap_leaf; #define ZBT_LEAF ((1ULL << 63) + 0) #define ZBT_HEADER ((1ULL << 63) + 1) #define ZBT_MICRO ((1ULL << 63) + 3) /* any other values are ptrtbl blocks */ /* * the embedded pointer table takes up half a block: * block size / entry size (2^3) / 2 */ #define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) /* * The embedded pointer table starts half-way through the block. Since * the pointer table itself is half the block, it starts at (64-bit) * word number (1<zap_dbuf->db_data); } inline mzap_phys_t * zap_m_phys(zap_t *zap) { return (zap->zap_dbuf->db_data); } typedef struct zap_name { zap_t *zn_zap; int zn_key_intlen; const void *zn_key_orig; int zn_key_orig_numints; const void *zn_key_norm; int zn_key_norm_numints; uint64_t zn_hash; matchtype_t zn_matchtype; char zn_normbuf[ZAP_MAXNAMELEN]; } zap_name_t; #define zap_f zap_u.zap_fat #define zap_m zap_u.zap_micro boolean_t zap_match(zap_name_t *zn, const char *matchname); int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp); void zap_unlockdir(zap_t *zap); -void zap_evict(dmu_buf_t *db, void *vmzap); +void zap_evict(void *dbu); zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); void zap_name_free(zap_name_t *zn); int zap_hashbits(zap_t *zap); uint32_t zap_maxcd(zap_t *zap); uint64_t zap_getflags(zap_t *zap); #define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) void fzap_byteswap(void *buf, size_t size); int fzap_count(zap_t *zap, uint64_t *count); int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); void fzap_prefetch(zap_name_t *zn); int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, uint64_t *tooverwrite); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers); int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); void fzap_get_stats(zap_t *zap, zap_stats_t *zs); void zap_put_leaf(struct zap_leaf *l); int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, dmu_tx_t *tx); void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); #ifdef __cplusplus } #endif #endif /* _SYS_ZAP_IMPL_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zap_leaf.h (revision 286575) @@ -1,246 +1,248 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #ifndef _SYS_ZAP_LEAF_H #define _SYS_ZAP_LEAF_H #include #ifdef __cplusplus extern "C" { #endif struct zap; struct zap_name; struct zap_stats; #define ZAP_LEAF_MAGIC 0x2AB1EAF /* chunk size = 24 bytes */ #define ZAP_LEAF_CHUNKSIZE 24 /* * The amount of space available for chunks is: * block size (1<l_bs) - hash entry size (2) * number of hash * entries - header space (2*chunksize) */ #define ZAP_LEAF_NUMCHUNKS(l) \ (((1<<(l)->l_bs) - 2*ZAP_LEAF_HASH_NUMENTRIES(l)) / \ ZAP_LEAF_CHUNKSIZE - 2) /* * The amount of space within the chunk available for the array is: * chunk size - space for type (1) - space for next pointer (2) */ #define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) #define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) /* * Low water mark: when there are only this many chunks free, start * growing the ptrtbl. Ideally, this should be larger than a * "reasonably-sized" entry. 20 chunks is more than enough for the * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), * while still being only around 3% for 16k blocks. */ #define ZAP_LEAF_LOW_WATER (20) /* * The leaf hash table has block size / 2^5 (32) number of entries, * which should be more than enough for the maximum number of entries, * which is less than block size / CHUNKSIZE (24) / minimum number of * chunks per entry (3). */ #define ZAP_LEAF_HASH_SHIFT(l) ((l)->l_bs - 5) #define ZAP_LEAF_HASH_NUMENTRIES(l) (1 << ZAP_LEAF_HASH_SHIFT(l)) /* * The chunks start immediately after the hash table. The end of the * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a * chunk_t. */ #define ZAP_LEAF_CHUNK(l, idx) \ ((zap_leaf_chunk_t *) \ (zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] #define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) typedef enum zap_chunk_type { ZAP_CHUNK_FREE = 253, ZAP_CHUNK_ENTRY = 252, ZAP_CHUNK_ARRAY = 251, ZAP_CHUNK_TYPE_MAX = 250 } zap_chunk_type_t; #define ZLF_ENTRIES_CDSORTED (1<<0) /* * TAKE NOTE: * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. */ typedef struct zap_leaf_phys { struct zap_leaf_header { /* Public to ZAP */ uint64_t lh_block_type; /* ZBT_LEAF */ uint64_t lh_pad1; uint64_t lh_prefix; /* hash prefix of this leaf */ uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ uint16_t lh_nfree; /* number free chunks */ uint16_t lh_nentries; /* number of entries */ uint16_t lh_prefix_len; /* num bits used to id this */ /* Private to zap_leaf */ uint16_t lh_freelist; /* chunk head of free list */ uint8_t lh_flags; /* ZLF_* flags */ uint8_t lh_pad2[11]; } l_hdr; /* 2 24-byte chunks */ /* * The header is followed by a hash table with * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) * zap_leaf_chunk structures. These structures are accessed * with the ZAP_LEAF_CHUNK() macro. */ uint16_t l_hash[1]; } zap_leaf_phys_t; typedef union zap_leaf_chunk { struct zap_leaf_entry { uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ uint8_t le_value_intlen; /* size of value's ints */ uint16_t le_next; /* next entry in hash chain */ uint16_t le_name_chunk; /* first chunk of the name */ uint16_t le_name_numints; /* ints in name (incl null) */ uint16_t le_value_chunk; /* first chunk of the value */ uint16_t le_value_numints; /* value length in ints */ uint32_t le_cd; /* collision differentiator */ uint64_t le_hash; /* hash value of the name */ } l_entry; struct zap_leaf_array { uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; uint16_t la_next; /* next blk or CHAIN_END */ } l_array; struct zap_leaf_free { uint8_t lf_type; /* always ZAP_CHUNK_FREE */ uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; uint16_t lf_next; /* next in free list, or CHAIN_END */ } l_free; } zap_leaf_chunk_t; typedef struct zap_leaf { + dmu_buf_user_t l_dbu; krwlock_t l_rwlock; uint64_t l_blkid; /* 1<l_dbuf->db_data); } typedef struct zap_entry_handle { /* Set by zap_leaf and public to ZAP */ uint64_t zeh_num_integers; uint64_t zeh_hash; uint32_t zeh_cd; uint8_t zeh_integer_size; /* Private to zap_leaf */ uint16_t zeh_fakechunk; uint16_t *zeh_chunkp; zap_leaf_t *zeh_leaf; } zap_entry_handle_t; /* * Return a handle to the named entry, or ENOENT if not found. The hash * value must equal zap_hash(name). */ extern int zap_leaf_lookup(zap_leaf_t *l, struct zap_name *zn, zap_entry_handle_t *zeh); /* * Return a handle to the entry with this hash+cd, or the entry with the * next closest hash+cd. */ extern int zap_leaf_lookup_closest(zap_leaf_t *l, uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh); /* * Read the first num_integers in the attribute. Integer size * conversion will be done without sign extension. Return EINVAL if * integer_size is too small. Return EOVERFLOW if there are more than * num_integers in the attribute. */ extern int zap_entry_read(const zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, void *buf); extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, uint16_t buflen, char *buf); /* * Replace the value of an existing entry. * * May fail if it runs out of space (ENOSPC). */ extern int zap_entry_update(zap_entry_handle_t *zeh, uint8_t integer_size, uint64_t num_integers, const void *buf); /* * Remove an entry. */ extern void zap_entry_remove(zap_entry_handle_t *zeh); /* * Create an entry. An equal entry must not exist, and this entry must * belong in this leaf (according to its hash value). Fills in the * entry handle on success. Returns 0 on success or ENOSPC on failure. */ extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, uint8_t integer_size, uint64_t num_integers, const void *buf, zap_entry_handle_t *zeh); /* Determine whether there is another entry with the same normalized form. */ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, struct zap_name *zn, const char *name, struct zap *zap); /* * Other stuff. */ extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l, struct zap_stats *zs); #ifdef __cplusplus } #endif #endif /* _SYS_ZAP_LEAF_H */ Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap.c (revision 286575) @@ -1,1386 +1,1387 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ /* * This file contains the top half of the zfs directory structure * implementation. The bottom half is in zap_leaf.c. * * The zdir is an extendable hash data structure. There is a table of * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are * each a constant size and hold a variable number of directory entries. * The buckets (aka "leaf nodes") are implemented in zap_leaf.c. * * The pointer table holds a power of 2 number of pointers. * (1<zd_data->zd_phys->zd_prefix_len). The bucket pointed to * by the pointer at index i in the table holds entries whose hash value * has a zd_prefix_len - bit prefix */ #include #include #include #include #include #include #include #include #include int fzap_default_block_shift = 14; /* 16k blocksize */ extern inline zap_phys_t *zap_f_phys(zap_t *zap); -static void zap_leaf_pageout(dmu_buf_t *db, void *vl); static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks); void fzap_byteswap(void *vbuf, size_t size) { uint64_t block_type; block_type = *(uint64_t *)vbuf; if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF)) zap_leaf_byteswap(vbuf, size); else { /* it's a ptrtbl block */ byteswap_uint64_array(vbuf, size); } } void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) { dmu_buf_t *db; zap_leaf_t *l; int i; zap_phys_t *zp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); zap->zap_ismicro = FALSE; - (void) dmu_buf_update_user(zap->zap_dbuf, zap, zap, zap_evict); + zap->zap_dbu.dbu_evict_func = zap_evict; mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1; zp = zap_f_phys(zap); /* * explicitly zero it since it might be coming from an * initialized microzap */ bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size); zp->zap_block_type = ZBT_HEADER; zp->zap_magic = ZAP_MAGIC; zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap); zp->zap_freeblk = 2; /* block 1 will be the first leaf */ zp->zap_num_leafs = 1; zp->zap_num_entries = 0; zp->zap_salt = zap->zap_salt; zp->zap_normflags = zap->zap_normflags; zp->zap_flags = flags; /* block 1 will be the first leaf */ for (i = 0; i < (1<zap_ptrtbl.zt_shift); i++) ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1; /* * set up block 1 - the first leaf */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, 1<l_dbuf = db; zap_leaf_init(l, zp->zap_normflags != 0); kmem_free(l, sizeof (zap_leaf_t)); dmu_buf_rele(db, FTAG); } static int zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx) { if (RW_WRITE_HELD(&zap->zap_rwlock)) return (1); if (rw_tryupgrade(&zap->zap_rwlock)) { dmu_buf_will_dirty(zap->zap_dbuf, tx); return (1); } return (0); } /* * Generic routines for dealing with the pointer & cookie tables. */ static int zap_table_grow(zap_t *zap, zap_table_phys_t *tbl, void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n), dmu_tx_t *tx) { uint64_t b, newblk; dmu_buf_t *db_old, *db_new; int err; int bs = FZAP_BLOCK_SHIFT(zap); int hepb = 1<<(bs-4); /* hepb = half the number of entries in a block */ ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); ASSERT(tbl->zt_numblks > 0); if (tbl->zt_nextblk != 0) { newblk = tbl->zt_nextblk; } else { newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2); tbl->zt_nextblk = newblk; ASSERT0(tbl->zt_blks_copied); dmu_prefetch(zap->zap_objset, zap->zap_object, tbl->zt_blk << bs, tbl->zt_numblks << bs); } /* * Copy the ptrtbl from the old to new location. */ b = tbl->zt_blks_copied; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); if (err) return (err); /* first half of entries in old[b] go to new[2*b+0] */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func(db_old->db_data, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); /* second half of entries in old[b] go to new[2*b+1] */ VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(db_new, tx); transfer_func((uint64_t *)db_old->db_data + hepb, db_new->db_data, hepb); dmu_buf_rele(db_new, FTAG); dmu_buf_rele(db_old, FTAG); tbl->zt_blks_copied++; dprintf("copied block %llu of %llu\n", tbl->zt_blks_copied, tbl->zt_numblks); if (tbl->zt_blks_copied == tbl->zt_numblks) { (void) dmu_free_range(zap->zap_objset, zap->zap_object, tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); tbl->zt_blk = newblk; tbl->zt_numblks *= 2; tbl->zt_shift++; tbl->zt_nextblk = 0; tbl->zt_blks_copied = 0; dprintf("finished; numblocks now %llu (%lluk entries)\n", tbl->zt_numblks, 1<<(tbl->zt_shift-10)); } return (0); } static int zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, dmu_tx_t *tx) { int err; uint64_t blk, off; int bs = FZAP_BLOCK_SHIFT(zap); dmu_buf_t *db; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(tbl->zt_blk != 0); dprintf("storing %llx at index %llx\n", val, idx); blk = idx >> (bs-3); off = idx & ((1<<(bs-3))-1); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err) return (err); dmu_buf_will_dirty(db, tx); if (tbl->zt_nextblk != 0) { uint64_t idx2 = idx * 2; uint64_t blk2 = idx2 >> (bs-3); uint64_t off2 = idx2 & ((1<<(bs-3))-1); dmu_buf_t *db2; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, DMU_READ_NO_PREFETCH); if (err) { dmu_buf_rele(db, FTAG); return (err); } dmu_buf_will_dirty(db2, tx); ((uint64_t *)db2->db_data)[off2] = val; ((uint64_t *)db2->db_data)[off2+1] = val; dmu_buf_rele(db2, FTAG); } ((uint64_t *)db->db_data)[off] = val; dmu_buf_rele(db, FTAG); return (0); } static int zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) { uint64_t blk, off; int err; dmu_buf_t *db; int bs = FZAP_BLOCK_SHIFT(zap); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); blk = idx >> (bs-3); off = idx & ((1<<(bs-3))-1); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err) return (err); *valp = ((uint64_t *)db->db_data)[off]; dmu_buf_rele(db, FTAG); if (tbl->zt_nextblk != 0) { /* * read the nextblk for the sake of i/o error checking, * so that zap_table_load() will catch errors for * zap_table_store. */ blk = (idx*2) >> (bs-3); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (tbl->zt_nextblk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) dmu_buf_rele(db, FTAG); } return (err); } /* * Routines for growing the ptrtbl. */ static void zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) { int i; for (i = 0; i < n; i++) { uint64_t lb = src[i]; dst[2*i+0] = lb; dst[2*i+1] = lb; } } static int zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) { /* * The pointer table should never use more hash bits than we * have (otherwise we'd be using useless zero bits to index it). * If we are within 2 bits of running out, stop growing, since * this is already an aberrant condition. */ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) return (SET_ERROR(ENOSPC)); if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* * We are outgrowing the "embedded" ptrtbl (the one * stored in the header block). Give it its own entire * block, which will double the size of the ptrtbl. */ uint64_t newblk; dmu_buf_t *db_new; int err; ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); newblk = zap_allocate_blocks(zap, 1); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, DMU_READ_NO_PREFETCH); if (err) return (err); dmu_buf_will_dirty(db_new, tx); zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); dmu_buf_rele(db_new, FTAG); zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; zap_f_phys(zap)->zap_ptrtbl.zt_shift++; ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << (FZAP_BLOCK_SHIFT(zap)-3)); return (0); } else { return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, zap_ptrtbl_transfer, tx)); } } static void zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) { dmu_buf_will_dirty(zap->zap_dbuf, tx); mutex_enter(&zap->zap_f.zap_num_entries_mtx); ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); zap_f_phys(zap)->zap_num_entries += delta; mutex_exit(&zap->zap_f.zap_num_entries_mtx); } static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks) { uint64_t newblk; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); newblk = zap_f_phys(zap)->zap_freeblk; zap_f_phys(zap)->zap_freeblk += nblocks; return (newblk); } +static void +zap_leaf_pageout(void *dbu) +{ + zap_leaf_t *l = dbu; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + static zap_leaf_t * zap_create_leaf(zap_t *zap, dmu_tx_t *tx) { void *winner; - zap_leaf_t *l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = zap_allocate_blocks(zap, 1); l->l_dbuf = NULL; VERIFY(0 == dmu_buf_hold(zap->zap_objset, zap->zap_object, l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, DMU_READ_NO_PREFETCH)); - winner = dmu_buf_set_user(l->l_dbuf, l, zap_leaf_pageout); + dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf); + winner = dmu_buf_set_user(l->l_dbuf, &l->l_dbu); ASSERT(winner == NULL); dmu_buf_will_dirty(l->l_dbuf, tx); zap_leaf_init(l, zap->zap_normflags != 0); zap_f_phys(zap)->zap_num_leafs++; return (l); } int fzap_count(zap_t *zap, uint64_t *count) { ASSERT(!zap->zap_ismicro); mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ *count = zap_f_phys(zap)->zap_num_entries; mutex_exit(&zap->zap_f.zap_num_entries_mtx); return (0); } /* * Routines for obtaining zap_leaf_t's */ void zap_put_leaf(zap_leaf_t *l) { rw_exit(&l->l_rwlock); dmu_buf_rele(l->l_dbuf, NULL); } -_NOTE(ARGSUSED(0)) -static void -zap_leaf_pageout(dmu_buf_t *db, void *vl) -{ - zap_leaf_t *l = vl; - - rw_destroy(&l->l_rwlock); - kmem_free(l, sizeof (zap_leaf_t)); -} - static zap_leaf_t * zap_open_leaf(uint64_t blkid, dmu_buf_t *db) { zap_leaf_t *l, *winner; ASSERT(blkid != 0); - l = kmem_alloc(sizeof (zap_leaf_t), KM_SLEEP); + l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); rw_init(&l->l_rwlock, 0, 0, 0); rw_enter(&l->l_rwlock, RW_WRITER); l->l_blkid = blkid; l->l_bs = highbit64(db->db_size) - 1; l->l_dbuf = db; - winner = dmu_buf_set_user(db, l, zap_leaf_pageout); + dmu_buf_init_user(&l->l_dbu, zap_leaf_pageout, &l->l_dbuf); + winner = dmu_buf_set_user(db, &l->l_dbu); rw_exit(&l->l_rwlock); if (winner != NULL) { /* someone else set it first */ - zap_leaf_pageout(NULL, l); + zap_leaf_pageout(&l->l_dbu); l = winner; } /* * lhr_pad was previously used for the next leaf in the leaf * chain. There should be no chained leafs (as we have removed * support for them). */ ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); /* * There should be more hash entries than there can be * chunks to put in the hash table */ ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); /* The chunks should begin at the end of the hash table */ ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); /* The chunks should end at the end of the block */ ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); return (l); } static int zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { dmu_buf_t *db; zap_leaf_t *l; int bs = FZAP_BLOCK_SHIFT(zap); int err; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); err = dmu_buf_hold(zap->zap_objset, zap->zap_object, blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); if (err) return (err); ASSERT3U(db->db_object, ==, zap->zap_object); ASSERT3U(db->db_offset, ==, blkid << bs); ASSERT3U(db->db_size, ==, 1 << bs); ASSERT(blkid != 0); l = dmu_buf_get_user(db); if (l == NULL) l = zap_open_leaf(blkid, db); rw_enter(&l->l_rwlock, lt); /* * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, * causing ASSERT below to fail. */ if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3U(l->l_blkid, ==, blkid); ASSERT3P(l->l_dbuf, ==, db); ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); *lp = l; return (0); } static int zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) { ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { ASSERT3U(idx, <, (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); return (0); } else { return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, valp)); } } static int zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) { ASSERT(tx != NULL); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; return (0); } else { return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, idx, blk, tx)); } } static int zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) { uint64_t idx, blk; int err; ASSERT(zap->zap_dbuf == NULL || zap_f_phys(zap) == zap->zap_dbuf->db_data); ASSERT3U(zap_f_phys(zap)->zap_magic, ==, ZAP_MAGIC); idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); err = zap_idx_to_blk(zap, idx, &blk); if (err != 0) return (err); err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); ASSERT(err || ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == zap_leaf_phys(*lp)->l_hdr.lh_prefix); return (err); } static int zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx, zap_leaf_t **lp) { zap_t *zap = zn->zn_zap; uint64_t hash = zn->zn_hash; zap_leaf_t *nl; int prefix_diff, i, err; uint64_t sibling; int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); if (zap_tryupgradedir(zap, tx) == 0 || old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { /* We failed to upgrade, or need to grow the pointer table */ objset_t *os = zap->zap_objset; uint64_t object = zap->zap_object; zap_put_leaf(l); zap_unlockdir(zap); err = zap_lockdir(os, object, tx, RW_WRITER, FALSE, FALSE, &zn->zn_zap); zap = zn->zn_zap; if (err) return (err); ASSERT(!zap->zap_ismicro); while (old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { err = zap_grow_ptrtbl(zap, tx); if (err) return (err); } err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); if (err) return (err); if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { /* it split while our locks were down */ *lp = l; return (0); } } ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, zap_leaf_phys(l)->l_hdr.lh_prefix); prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - (old_prefix_len + 1); sibling = (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; /* check for i/o errors before doing zap_leaf_split */ for (i = 0; i < (1ULL<l_blkid); } nl = zap_create_leaf(zap, tx); zap_leaf_split(l, nl, zap->zap_normflags != 0); /* set sibling pointers */ for (i = 0; i < (1ULL << prefix_diff); i++) { err = zap_set_idx_to_blk(zap, sibling+i, nl->l_blkid, tx); ASSERT0(err); /* we checked for i/o errors above */ } if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { /* we want the sibling */ zap_put_leaf(l); *lp = nl; } else { zap_put_leaf(nl); *lp = l; } return (0); } static void zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) { zap_t *zap = zn->zn_zap; int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); zap_put_leaf(l); if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { int err; /* * We are in the middle of growing the pointer table, or * this leaf will soon make us grow it. */ if (zap_tryupgradedir(zap, tx) == 0) { objset_t *os = zap->zap_objset; uint64_t zapobj = zap->zap_object; zap_unlockdir(zap); err = zap_lockdir(os, zapobj, tx, RW_WRITER, FALSE, FALSE, &zn->zn_zap); zap = zn->zn_zap; if (err) return; } /* could have finished growing while our locks were down */ if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) (void) zap_grow_ptrtbl(zap, tx); } } static int fzap_checkname(zap_name_t *zn) { if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); return (0); } static int fzap_checksize(uint64_t integer_size, uint64_t num_integers) { /* Only integer sizes supported by C */ switch (integer_size) { case 1: case 2: case 4: case 8: break; default: return (SET_ERROR(EINVAL)); } if (integer_size * num_integers > ZAP_MAXVALUELEN) return (E2BIG); return (0); } static int fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) { int err; if ((err = fzap_checkname(zn)) != 0) return (err); return (fzap_checksize(integer_size, num_integers)); } /* * Routines for manipulating attributes. */ int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *ncp) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; if ((err = fzap_checkname(zn)) != 0) return (err); err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { if ((err = fzap_checksize(integer_size, num_integers)) != 0) { zap_put_leaf(l); return (err); } err = zap_entry_read(&zeh, integer_size, num_integers, buf); (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); if (ncp) { *ncp = zap_entry_normalization_conflict(&zeh, zn, NULL, zn->zn_zap); } } zap_put_leaf(l); return (err); } int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, dmu_tx_t *tx) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); ASSERT(fzap_check(zn, integer_size, num_integers) == 0); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { err = SET_ERROR(EEXIST); goto out; } if (err != ENOENT) goto out; err = zap_entry_create(l, zn, cd, integer_size, num_integers, val, &zeh); if (err == 0) { zap_increment_num_entries(zap, 1, tx); } else if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } out: if (zap != NULL) zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); return (err); } int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { int err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); return (fzap_add_cd(zn, integer_size, num_integers, val, ZAP_NEED_CD, tx)); } int fzap_update(zap_name_t *zn, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_leaf_t *l; int err, create; zap_entry_handle_t zeh; zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); retry: err = zap_leaf_lookup(l, zn, &zeh); create = (err == ENOENT); ASSERT(err == 0 || err == ENOENT); if (create) { err = zap_entry_create(l, zn, ZAP_NEED_CD, integer_size, num_integers, val, &zeh); if (err == 0) zap_increment_num_entries(zap, 1, tx); } else { err = zap_entry_update(&zeh, integer_size, num_integers, val); } if (err == EAGAIN) { err = zap_expand_leaf(zn, l, tx, &l); zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ if (err == 0) goto retry; } if (zap != NULL) zap_put_leaf_maybe_grow_ptrtbl(zn, l, tx); return (err); } int fzap_length(zap_name_t *zn, uint64_t *integer_size, uint64_t *num_integers) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err != 0) goto out; if (integer_size) *integer_size = zeh.zeh_integer_size; if (num_integers) *num_integers = zeh.zeh_num_integers; out: zap_put_leaf(l); return (err); } int fzap_remove(zap_name_t *zn, dmu_tx_t *tx) { zap_leaf_t *l; int err; zap_entry_handle_t zeh; err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { zap_entry_remove(&zeh); zap_increment_num_entries(zn->zn_zap, -1, tx); } zap_put_leaf(l); return (err); } void fzap_prefetch(zap_name_t *zn) { uint64_t idx, blk; zap_t *zap = zn->zn_zap; int bs; idx = ZAP_HASH_IDX(zn->zn_hash, zap_f_phys(zap)->zap_ptrtbl.zt_shift); if (zap_idx_to_blk(zap, idx, &blk) != 0) return; bs = FZAP_BLOCK_SHIFT(zap); dmu_prefetch(zap->zap_objset, zap->zap_object, blk << bs, 1 << bs); } /* * Helper functions for consumers. */ uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, const char *name, dmu_tx_t *tx) { uint64_t new_obj; VERIFY((new_obj = zap_create(os, ot, DMU_OT_NONE, 0, tx)) > 0); VERIFY(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, tx) == 0); return (new_obj); } int zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, char *name) { zap_cursor_t zc; zap_attribute_t *za; int err; if (mask == 0) mask = -1ULL; za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); for (zap_cursor_init(&zc, os, zapobj); (err = zap_cursor_retrieve(&zc, za)) == 0; zap_cursor_advance(&zc)) { if ((za->za_first_integer & mask) == (value & mask)) { (void) strcpy(name, za->za_name); break; } } zap_cursor_fini(&zc); kmem_free(za, sizeof (zap_attribute_t)); return (err); } int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; zap_attribute_t za; int err; err = 0; for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { if (za.za_integer_length != 8 || za.za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_add(os, intoobj, za.za_name, 8, 1, &za.za_first_integer, tx); if (err) break; } zap_cursor_fini(&zc); return (err); } int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, uint64_t value, dmu_tx_t *tx) { zap_cursor_t zc; zap_attribute_t za; int err; err = 0; for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { if (za.za_integer_length != 8 || za.za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_add(os, intoobj, za.za_name, 8, 1, &value, tx); if (err) break; } zap_cursor_fini(&zc); return (err); } int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) { zap_cursor_t zc; zap_attribute_t za; int err; err = 0; for (zap_cursor_init(&zc, os, fromobj); zap_cursor_retrieve(&zc, &za) == 0; (void) zap_cursor_advance(&zc)) { uint64_t delta = 0; if (za.za_integer_length != 8 || za.za_num_integers != 1) { err = SET_ERROR(EINVAL); break; } err = zap_lookup(os, intoobj, za.za_name, 8, 1, &delta); if (err != 0 && err != ENOENT) break; delta += za.za_first_integer; err = zap_update(os, intoobj, za.za_name, 8, 1, &delta, tx); if (err) break; } zap_cursor_fini(&zc); return (err); } int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_add(os, obj, name, 8, 1, &value, tx)); } int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_remove(os, obj, name, tx)); } int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); return (zap_lookup(os, obj, name, 8, 1, &value)); } int zap_add_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_add(os, obj, name, 8, 1, &value, tx)); } int zap_update_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t value, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_update(os, obj, name, 8, 1, &value, tx)); } int zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_lookup(os, obj, name, 8, 1, valuep)); } int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, dmu_tx_t *tx) { uint64_t value = 0; int err; if (delta == 0) return (0); err = zap_lookup(os, obj, name, 8, 1, &value); if (err != 0 && err != ENOENT) return (err); value += delta; if (value == 0) err = zap_remove(os, obj, name, tx); else err = zap_update(os, obj, name, 8, 1, &value, tx); return (err); } int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, dmu_tx_t *tx) { char name[20]; (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); return (zap_increment(os, obj, name, delta, tx)); } /* * Routines for iterating over the attributes. */ int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) { int err = ENOENT; zap_entry_handle_t zeh; zap_leaf_t *l; /* retrieve the next entry at or after zc_hash/zc_cd */ /* if no entry, return ENOENT */ if (zc->zc_leaf && (ZAP_HASH_IDX(zc->zc_hash, zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; } again: if (zc->zc_leaf == NULL) { err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, &zc->zc_leaf); if (err != 0) return (err); } else { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); } l = zc->zc_leaf; err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); if (err == ENOENT) { uint64_t nocare = (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; zc->zc_cd = 0; if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0 || zc->zc_hash == 0) { zc->zc_hash = -1ULL; } else { zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; goto again; } } if (err == 0) { zc->zc_hash = zeh.zeh_hash; zc->zc_cd = zeh.zeh_cd; za->za_integer_length = zeh.zeh_integer_size; za->za_num_integers = zeh.zeh_num_integers; if (zeh.zeh_num_integers == 0) { za->za_first_integer = 0; } else { err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); ASSERT(err == 0 || err == EOVERFLOW); } err = zap_entry_read_name(zap, &zeh, sizeof (za->za_name), za->za_name); ASSERT(err == 0); za->za_normalization_conflict = zap_entry_normalization_conflict(&zeh, NULL, za->za_name, zap); } rw_exit(&zc->zc_leaf->l_rwlock); return (err); } static void zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) { int i, err; uint64_t lastblk = 0; /* * NB: if a leaf has more pointers than an entire ptrtbl block * can hold, then it'll be accounted for more than once, since * we won't have lastblk. */ for (i = 0; i < len; i++) { zap_leaf_t *l; if (tbl[i] == lastblk) continue; lastblk = tbl[i]; err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); if (err == 0) { zap_leaf_stats(zap, l, zs); zap_put_leaf(l); } } } int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) { int err; zap_leaf_t *l; zap_entry_handle_t zeh; if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err != 0) return (err); zc->zc_leaf = l; zc->zc_hash = zeh.zeh_hash; zc->zc_cd = zeh.zeh_cd; return (err); } void fzap_get_stats(zap_t *zap, zap_stats_t *zs) { int bs = FZAP_BLOCK_SHIFT(zap); zs->zs_blocksize = 1ULL << bs; /* * Set zap_phys_t fields */ zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; zs->zs_block_type = zap_f_phys(zap)->zap_block_type; zs->zs_magic = zap_f_phys(zap)->zap_magic; zs->zs_salt = zap_f_phys(zap)->zap_salt; /* * Set zap_ptrtbl fields */ zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; zs->zs_ptrtbl_blks_copied = zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { /* the ptrtbl is entirely in the header block. */ zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); } else { int b; dmu_prefetch(zap->zap_objset, zap->zap_object, zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs); for (b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; b++) { dmu_buf_t *db; int err; err = dmu_buf_hold(zap->zap_objset, zap->zap_object, (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); if (err == 0) { zap_stats_ptrtbl(zap, db->db_data, 1<<(bs-3), zs); dmu_buf_rele(db, FTAG); } } } } int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, uint64_t *tooverwrite) { zap_t *zap = zn->zn_zap; zap_leaf_t *l; int err; /* * Account for the header block of the fatzap. */ if (!add && dmu_buf_freeable(zap->zap_dbuf)) { *tooverwrite += zap->zap_dbuf->db_size; } else { *towrite += zap->zap_dbuf->db_size; } /* * Account for the pointer table blocks. * If we are adding we need to account for the following cases : * - If the pointer table is embedded, this operation could force an * external pointer table. * - If this already has an external pointer table this operation * could extend the table. */ if (add) { if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) *towrite += zap->zap_dbuf->db_size; else *towrite += (zap->zap_dbuf->db_size * 3); } /* * Now, check if the block containing leaf is freeable * and account accordingly. */ err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l); if (err != 0) { return (err); } if (!add && dmu_buf_freeable(l->l_dbuf)) { *tooverwrite += l->l_dbuf->db_size; } else { /* * If this an add operation, the leaf block could split. * Hence, we need to account for an additional leaf block. */ *towrite += (add ? 2 : 1) * l->l_dbuf->db_size; } zap_put_leaf(l); return (0); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zap_micro.c (revision 286575) @@ -1,1468 +1,1469 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef _KERNEL #include #endif extern inline mzap_phys_t *zap_m_phys(zap_t *zap); static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags); uint64_t zap_getflags(zap_t *zap) { if (zap->zap_ismicro) return (0); return (zap_f_phys(zap)->zap_flags); } int zap_hashbits(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return (48); else return (28); } uint32_t zap_maxcd(zap_t *zap) { if (zap_getflags(zap) & ZAP_FLAG_HASH64) return ((1<<16)-1); else return (-1U); } static uint64_t zap_hash(zap_name_t *zn) { zap_t *zap = zn->zn_zap; uint64_t h = 0; if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); h = *(uint64_t *)zn->zn_key_orig; } else { h = zap->zap_salt; ASSERT(h != 0); ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { int i; const uint64_t *wp = zn->zn_key_norm; ASSERT(zn->zn_key_intlen == 8); for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) { int j; uint64_t word = *wp; for (j = 0; j < zn->zn_key_intlen; j++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ word) & 0xFF]; word >>= NBBY; } } } else { int i, len; const uint8_t *cp = zn->zn_key_norm; /* * We previously stored the terminating null on * disk, but didn't hash it, so we need to * continue to not hash it. (The * zn_key_*_numints includes the terminating * null for non-binary keys.) */ len = zn->zn_key_norm_numints - 1; ASSERT(zn->zn_key_intlen == 1); for (i = 0; i < len; cp++, i++) { h = (h >> 8) ^ zfs_crc64_table[(h ^ *cp) & 0xFF]; } } } /* * Don't use all 64 bits, since we need some in the cookie for * the collision differentiator. We MUST use the high bits, * since those are the ones that we first pay attention to when * chosing the bucket. */ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); return (h); } static int zap_normalize(zap_t *zap, const char *name, char *namenorm) { size_t inlen, outlen; int err; ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); inlen = strlen(name) + 1; outlen = ZAP_MAXNAMELEN; err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } boolean_t zap_match(zap_name_t *zn, const char *matchname) { ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); if (zn->zn_matchtype == MT_FIRST) { char norm[ZAP_MAXNAMELEN]; if (zap_normalize(zn->zn_zap, matchname, norm) != 0) return (B_FALSE); return (strcmp(zn->zn_key_norm, norm) == 0); } else { /* MT_BEST or MT_EXACT */ return (strcmp(zn->zn_key_orig, matchname) == 0); } } void zap_name_free(zap_name_t *zn) { kmem_free(zn, sizeof (zap_name_t)); } zap_name_t * zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = key; zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; zn->zn_matchtype = mt; if (zap->zap_normflags) { if (zap_normalize(zap, key, zn->zn_normbuf) != 0) { zap_name_free(zn); return (NULL); } zn->zn_key_norm = zn->zn_normbuf; zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { if (mt != MT_EXACT) { zap_name_free(zn); return (NULL); } zn->zn_key_norm = zn->zn_key_orig; zn->zn_key_norm_numints = zn->zn_key_orig_numints; } zn->zn_hash = zap_hash(zn); return (zn); } zap_name_t * zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); ASSERT(zap->zap_normflags == 0); zn->zn_zap = zap; zn->zn_key_intlen = sizeof (*key); zn->zn_key_orig = zn->zn_key_norm = key; zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; zn->zn_matchtype = MT_EXACT; zn->zn_hash = zap_hash(zn); return (zn); } static void mzap_byteswap(mzap_phys_t *buf, size_t size) { int i, max; buf->mz_block_type = BSWAP_64(buf->mz_block_type); buf->mz_salt = BSWAP_64(buf->mz_salt); buf->mz_normflags = BSWAP_64(buf->mz_normflags); max = (size / MZAP_ENT_LEN) - 1; for (i = 0; i < max; i++) { buf->mz_chunk[i].mze_value = BSWAP_64(buf->mz_chunk[i].mze_value); buf->mz_chunk[i].mze_cd = BSWAP_32(buf->mz_chunk[i].mze_cd); } } void zap_byteswap(void *buf, size_t size) { uint64_t block_type; block_type = *(uint64_t *)buf; if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { /* ASSERT(magic == ZAP_LEAF_MAGIC); */ mzap_byteswap(buf, size); } else { fzap_byteswap(buf, size); } } static int mze_compare(const void *arg1, const void *arg2) { const mzap_ent_t *mze1 = arg1; const mzap_ent_t *mze2 = arg2; if (mze1->mze_hash > mze2->mze_hash) return (+1); if (mze1->mze_hash < mze2->mze_hash) return (-1); if (mze1->mze_cd > mze2->mze_cd) return (+1); if (mze1->mze_cd < mze2->mze_cd) return (-1); return (0); } static int mze_insert(zap_t *zap, int chunkid, uint64_t hash) { mzap_ent_t *mze; avl_index_t idx; ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; mze->mze_hash = hash; mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); if (avl_find(&zap->zap_m.zap_avl, mze, &idx) != NULL) { kmem_free(mze, sizeof (mzap_ent_t)); return (EEXIST); } avl_insert(&zap->zap_m.zap_avl, mze, idx); return (0); } static mzap_ent_t * mze_find(zap_name_t *zn) { mzap_ent_t mze_tofind; mzap_ent_t *mze; avl_index_t idx; avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); mze_tofind.mze_hash = zn->zn_hash; mze_tofind.mze_cd = 0; again: mze = avl_find(avl, &mze_tofind, &idx); if (mze == NULL) mze = avl_nearest(avl, idx, AVL_AFTER); for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) return (mze); } if (zn->zn_matchtype == MT_BEST) { zn->zn_matchtype = MT_FIRST; goto again; } return (NULL); } static uint32_t mze_find_unused_cd(zap_t *zap, uint64_t hash) { mzap_ent_t mze_tofind; mzap_ent_t *mze; avl_index_t idx; avl_tree_t *avl = &zap->zap_m.zap_avl; uint32_t cd; ASSERT(zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); mze_tofind.mze_hash = hash; mze_tofind.mze_cd = 0; cd = 0; for (mze = avl_find(avl, &mze_tofind, &idx); mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { if (mze->mze_cd != cd) break; cd++; } return (cd); } static void mze_remove(zap_t *zap, mzap_ent_t *mze) { ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); avl_remove(&zap->zap_m.zap_avl, mze); kmem_free(mze, sizeof (mzap_ent_t)); } static void mze_destroy(zap_t *zap) { mzap_ent_t *mze; void *avlcookie = NULL; while (mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie)) kmem_free(mze, sizeof (mzap_ent_t)); avl_destroy(&zap->zap_m.zap_avl); } static zap_t * mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) { zap_t *winner; zap_t *zap; int i; ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); rw_init(&zap->zap_rwlock, 0, 0, 0); rw_enter(&zap->zap_rwlock, RW_WRITER); zap->zap_objset = os; zap->zap_object = obj; zap->zap_dbuf = db; if (*(uint64_t *)db->db_data != ZBT_MICRO) { mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, 0, 0); zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; } else { zap->zap_ismicro = TRUE; } /* * Make sure that zap_ismicro is set before we let others see * it, because zap_lockdir() checks zap_ismicro without the lock * held. */ - winner = dmu_buf_set_user(db, zap, zap_evict); + dmu_buf_init_user(&zap->zap_dbu, zap_evict, &zap->zap_dbuf); + winner = dmu_buf_set_user(db, &zap->zap_dbu); if (winner != NULL) { rw_exit(&zap->zap_rwlock); rw_destroy(&zap->zap_rwlock); if (!zap->zap_ismicro) mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); return (winner); } if (zap->zap_ismicro) { zap->zap_salt = zap_m_phys(zap)->mz_salt; zap->zap_normflags = zap_m_phys(zap)->mz_normflags; zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; avl_create(&zap->zap_m.zap_avl, mze_compare, sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node)); for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0]) { zap_name_t *zn; zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); if (mze_insert(zap, i, zn->zn_hash) == 0) zap->zap_m.zap_num_entries++; else { printf("ZFS WARNING: Duplicated ZAP " "entry detected (%s).\n", mze->mze_name); } zap_name_free(zn); } } } else { zap->zap_salt = zap_f_phys(zap)->zap_salt; zap->zap_normflags = zap_f_phys(zap)->zap_normflags; ASSERT3U(sizeof (struct zap_leaf_header), ==, 2*ZAP_LEAF_CHUNKSIZE); /* * The embedded pointer table should not overlap the * other members. */ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >, &zap_f_phys(zap)->zap_salt); /* * The embedded pointer table should end at the end of * the block */ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap, 1<zap_dbuf->db_size); } rw_exit(&zap->zap_rwlock); return (zap); } int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp) { zap_t *zap; dmu_buf_t *db; krw_t lt; int err; *zapp = NULL; err = dmu_buf_hold(os, obj, 0, NULL, &db, DMU_READ_NO_PREFETCH); if (err) return (err); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif zap = dmu_buf_get_user(db); if (zap == NULL) zap = mzap_open(os, obj, db); /* * We're checking zap_ismicro without the lock held, in order to * tell what type of lock we want. Once we have some sort of * lock, see if it really is the right type. In practice this * can only be different if it was upgraded from micro to fat, * and micro wanted WRITER but fat only needs READER. */ lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti; rw_enter(&zap->zap_rwlock, lt); if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) { /* it was upgraded, now we only need reader */ ASSERT(lt == RW_WRITER); ASSERT(RW_READER == (!zap->zap_ismicro && fatreader) ? RW_READER : lti); rw_downgrade(&zap->zap_rwlock); lt = RW_READER; } zap->zap_objset = os; if (lt == RW_WRITER) dmu_buf_will_dirty(db, tx); ASSERT3P(zap->zap_dbuf, ==, db); ASSERT(!zap->zap_ismicro || zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); if (zap->zap_ismicro && tx && adding && zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; if (newsz > MZAP_MAX_BLKSZ) { dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; return (mzap_upgrade(zapp, tx, 0)); } err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); ASSERT0(err); zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1; } *zapp = zap; return (0); } void zap_unlockdir(zap_t *zap) { rw_exit(&zap->zap_rwlock); dmu_buf_rele(zap->zap_dbuf, NULL); } static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) { mzap_phys_t *mzp; int i, sz, nchunks; int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); sz = zap->zap_dbuf->db_size; mzp = zio_buf_alloc(sz); bcopy(zap->zap_dbuf->db_data, mzp, sz); nchunks = zap->zap_m.zap_num_chunks; if (!flags) { err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, 1ULL << fzap_default_block_shift, 0, tx); if (err) { zio_buf_free(mzp, sz); return (err); } } dprintf("upgrading obj=%llu with %u chunks\n", zap->zap_object, nchunks); /* XXX destroy the avl later, so we can use the stored hash value */ mze_destroy(zap); fzap_upgrade(zap, tx, flags); for (i = 0; i < nchunks; i++) { mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; zap_name_t *zn; if (mze->mze_name[0] == 0) continue; dprintf("adding %s=%llu\n", mze->mze_name, mze->mze_value); zn = zap_name_alloc(zap, mze->mze_name, MT_EXACT); err = fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, tx); zap = zn->zn_zap; /* fzap_add_cd() may change zap */ zap_name_free(zn); if (err) break; } zio_buf_free(mzp, sz); *zapp = zap; return (err); } void mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, dmu_tx_t *tx) { dmu_buf_t *db; mzap_phys_t *zp; VERIFY(0 == dmu_buf_hold(os, obj, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); #ifdef ZFS_DEBUG { dmu_object_info_t doi; dmu_object_info_from_db(db, &doi); ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); } #endif dmu_buf_will_dirty(db, tx); zp = db->db_data; zp->mz_block_type = ZBT_MICRO; zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; dmu_buf_rele(db, FTAG); if (flags != 0) { zap_t *zap; /* Only fat zap supports flags; upgrade immediately. */ VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, B_FALSE, B_FALSE, &zap)); VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags)); zap_unlockdir(zap); } } int zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_claim_norm(os, obj, 0, ot, bonustype, bonuslen, tx)); } int zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { int err; err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); if (err != 0) return (err); mzap_create_impl(os, obj, normflags, 0, tx); return (0); } uint64_t zap_create(objset_t *os, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); } uint64_t zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); mzap_create_impl(os, obj, normflags, 0, tx); return (obj); } uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && leaf_blockshift <= SPA_OLD_MAXBLOCKSHIFT && indirect_blockshift >= SPA_MINBLOCKSHIFT && indirect_blockshift <= SPA_OLD_MAXBLOCKSHIFT); VERIFY(dmu_object_set_blocksize(os, obj, 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); mzap_create_impl(os, obj, normflags, flags, tx); return (obj); } int zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) { /* * dmu_object_free will free the object number and free the * data. Freeing the data will cause our pageout function to be * called, which will destroy our data (zap_leaf_t's and zap_t). */ return (dmu_object_free(os, zapobj, tx)); } -_NOTE(ARGSUSED(0)) void -zap_evict(dmu_buf_t *db, void *vzap) +zap_evict(void *dbu) { - zap_t *zap = vzap; + zap_t *zap = dbu; rw_destroy(&zap->zap_rwlock); if (zap->zap_ismicro) mze_destroy(zap); else mutex_destroy(&zap->zap_f.zap_num_entries_mtx); kmem_free(zap, sizeof (zap_t)); } int zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) { zap_t *zap; int err; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); if (!zap->zap_ismicro) { err = fzap_count(zap, count); } else { *count = zap->zap_m.zap_num_entries; } zap_unlockdir(zap); return (err); } /* * zn may be NULL; if not specified, it will be computed if needed. * See also the comment above zap_entry_normalization_conflict(). */ static boolean_t mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) { mzap_ent_t *other; int direction = AVL_BEFORE; boolean_t allocdzn = B_FALSE; if (zap->zap_normflags == 0) return (B_FALSE); again: for (other = avl_walk(&zap->zap_m.zap_avl, mze, direction); other && other->mze_hash == mze->mze_hash; other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { if (zn == NULL) { zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, MT_FIRST); allocdzn = B_TRUE; } if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); } } if (direction == AVL_BEFORE) { direction = AVL_AFTER; goto again; } if (allocdzn) zap_name_free(zn); return (B_FALSE); } /* * Routines for manipulating attributes. */ int zap_lookup(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf) { return (zap_lookup_norm(os, zapobj, name, integer_size, num_integers, buf, MT_EXACT, NULL, 0, NULL)); } int zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *ncp) { zap_t *zap; int err; mzap_ent_t *mze; zap_name_t *zn; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, mt); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_lookup(zn, integer_size, num_integers, buf, realname, rn_len, ncp); } else { mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (num_integers < 1) { err = SET_ERROR(EOVERFLOW); } else if (integer_size != 8) { err = SET_ERROR(EINVAL); } else { *(uint64_t *)buf = MZE_PHYS(zap, mze)->mze_value; (void) strlcpy(realname, MZE_PHYS(zap, mze)->mze_name, rn_len); if (ncp) { *ncp = mzap_normalization_conflict(zap, zn, mze); } } } } zap_name_free(zn); zap_unlockdir(zap); return (err); } int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints) { zap_t *zap; int err; zap_name_t *zn; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } fzap_prefetch(zn); zap_name_free(zn); zap_unlockdir(zap); return (err); } int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) { zap_t *zap; int err; zap_name_t *zn; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } err = fzap_lookup(zn, integer_size, num_integers, buf, NULL, 0, NULL); zap_name_free(zn); zap_unlockdir(zap); return (err); } int zap_contains(objset_t *os, uint64_t zapobj, const char *name) { int err = zap_lookup_norm(os, zapobj, name, 0, 0, NULL, MT_EXACT, NULL, 0, NULL); if (err == EOVERFLOW || err == EINVAL) err = 0; /* found, but skipped reading the value */ return (err); } int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err; mzap_ent_t *mze; zap_name_t *zn; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, MT_EXACT); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_length(zn, integer_size, num_integers); } else { mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { if (integer_size) *integer_size = 8; if (num_integers) *num_integers = 1; } } zap_name_free(zn); zap_unlockdir(zap); return (err); } int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, uint64_t *integer_size, uint64_t *num_integers) { zap_t *zap; int err; zap_name_t *zn; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } err = fzap_length(zn, integer_size, num_integers); zap_name_free(zn); zap_unlockdir(zap); return (err); } static void mzap_addent(zap_name_t *zn, uint64_t value) { int i; zap_t *zap = zn->zn_zap; int start = zap->zap_m.zap_alloc_next; uint32_t cd; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } #endif cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ ASSERT(cd < zap_maxcd(zap)); again: for (i = start; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; (void) strcpy(mze->mze_name, zn->zn_key_orig); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == zap->zap_m.zap_num_chunks) zap->zap_m.zap_alloc_next = 0; VERIFY(0 == mze_insert(zap, i, zn->zn_hash)); return; } } if (start != 0) { start = 0; goto again; } ASSERT(!"out of entries!"); } int zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; mzap_ent_t *mze; const uint64_t *intval = val; zap_name_t *zn; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); zn = zap_name_alloc(zap, key, MT_EXACT); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(key) >= MZAP_NAME_LEN) { err = mzap_upgrade(&zn->zn_zap, tx, 0); if (err == 0) err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else { mze = mze_find(zn); if (mze != NULL) { err = SET_ERROR(EEXIST); } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap); return (err); } int zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; int err; zap_name_t *zn; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_add() failed */ zap_unlockdir(zap); return (err); } int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; mzap_ent_t *mze; uint64_t oldval; const uint64_t *intval = val; zap_name_t *zn; int err; #ifdef ZFS_DEBUG /* * If there is an old value, it shouldn't change across the * lockdir (eg, due to bprewrite's xlation). */ if (integer_size == 8 && num_integers == 1) (void) zap_lookup(os, zapobj, name, 8, 1, &oldval); #endif err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, MT_EXACT); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_update(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ } else if (integer_size != 8 || num_integers != 1 || strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); err = mzap_upgrade(&zn->zn_zap, tx, 0); if (err == 0) err = fzap_update(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ } else { mze = mze_find(zn); if (mze != NULL) { ASSERT3U(MZE_PHYS(zap, mze)->mze_value, ==, oldval); MZE_PHYS(zap, mze)->mze_value = *intval; } else { mzap_addent(zn, *intval); } } ASSERT(zap == zn->zn_zap); zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap); return (err); } int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { zap_t *zap; zap_name_t *zn; int err; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } err = fzap_update(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_update() may change zap */ zap_name_free(zn); if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ zap_unlockdir(zap); return (err); } int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { return (zap_remove_norm(os, zapobj, name, MT_EXACT, tx)); } int zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx) { zap_t *zap; int err; mzap_ent_t *mze; zap_name_t *zn; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc(zap, name, mt); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } if (!zap->zap_ismicro) { err = fzap_remove(zn, tx); } else { mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); } else { zap->zap_m.zap_num_entries--; bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], sizeof (mzap_ent_phys_t)); mze_remove(zap, mze); } } zap_name_free(zn); zap_unlockdir(zap); return (err); } int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, int key_numints, dmu_tx_t *tx) { zap_t *zap; int err; zap_name_t *zn; err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); if (err) return (err); zn = zap_name_alloc_uint64(zap, key, key_numints); if (zn == NULL) { zap_unlockdir(zap); return (SET_ERROR(ENOTSUP)); } err = fzap_remove(zn, tx); zap_name_free(zn); zap_unlockdir(zap); return (err); } /* * Routines for iterating over the attributes. */ void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized) { zc->zc_objset = os; zc->zc_zap = NULL; zc->zc_leaf = NULL; zc->zc_zapobj = zapobj; zc->zc_serialized = serialized; zc->zc_hash = 0; zc->zc_cd = 0; } void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) { zap_cursor_init_serialized(zc, os, zapobj, 0); } void zap_cursor_fini(zap_cursor_t *zc) { if (zc->zc_zap) { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); zap_unlockdir(zc->zc_zap); zc->zc_zap = NULL; } if (zc->zc_leaf) { rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); zap_put_leaf(zc->zc_leaf); zc->zc_leaf = NULL; } zc->zc_objset = NULL; } uint64_t zap_cursor_serialize(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return (-1ULL); if (zc->zc_zap == NULL) return (zc->zc_serialized); ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); /* * We want to keep the high 32 bits of the cursor zero if we can, so * that 32-bit programs can access this. So usually use a small * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits * of the cursor. * * [ collision differentiator | zap_hashbits()-bit hash value ] */ return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); } int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) { int err; avl_index_t idx; mzap_ent_t mze_tofind; mzap_ent_t *mze; if (zc->zc_hash == -1ULL) return (SET_ERROR(ENOENT)); if (zc->zc_zap == NULL) { int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, &zc->zc_zap); if (err) return (err); /* * To support zap_cursor_init_serialized, advance, retrieve, * we must add to the existing zc_cd, which may already * be 1 due to the zap_cursor_advance. */ ASSERT(zc->zc_hash == 0); hb = zap_hashbits(zc->zc_zap); zc->zc_hash = zc->zc_serialized << (64 - hb); zc->zc_cd += zc->zc_serialized >> hb; if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_retrieve(zc->zc_zap, zc, za); } else { mze_tofind.mze_hash = zc->zc_hash; mze_tofind.mze_cd = zc->zc_cd; mze = avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); if (mze == NULL) { mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, idx, AVL_AFTER); } if (mze) { mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); za->za_normalization_conflict = mzap_normalization_conflict(zc->zc_zap, NULL, mze); za->za_integer_length = 8; za->za_num_integers = 1; za->za_first_integer = mzep->mze_value; (void) strcpy(za->za_name, mzep->mze_name); zc->zc_hash = mze->mze_hash; zc->zc_cd = mze->mze_cd; err = 0; } else { zc->zc_hash = -1ULL; err = SET_ERROR(ENOENT); } } rw_exit(&zc->zc_zap->zap_rwlock); return (err); } void zap_cursor_advance(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return; zc->zc_cd++; } int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) { int err = 0; mzap_ent_t *mze; zap_name_t *zn; if (zc->zc_zap == NULL) { err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, &zc->zc_zap); if (err) return (err); } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } zn = zap_name_alloc(zc->zc_zap, name, mt); if (zn == NULL) { rw_exit(&zc->zc_zap->zap_rwlock); return (SET_ERROR(ENOTSUP)); } if (!zc->zc_zap->zap_ismicro) { err = fzap_cursor_move_to_key(zc, zn); } else { mze = mze_find(zn); if (mze == NULL) { err = SET_ERROR(ENOENT); goto out; } zc->zc_hash = mze->mze_hash; zc->zc_cd = mze->mze_cd; } out: zap_name_free(zn); rw_exit(&zc->zc_zap->zap_rwlock); return (err); } int zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) { int err; zap_t *zap; err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); bzero(zs, sizeof (zap_stats_t)); if (zap->zap_ismicro) { zs->zs_blocksize = zap->zap_dbuf->db_size; zs->zs_num_entries = zap->zap_m.zap_num_entries; zs->zs_num_blocks = 1; } else { fzap_get_stats(zap, zs); } zap_unlockdir(zap); return (0); } int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, uint64_t *towrite, uint64_t *tooverwrite) { zap_t *zap; int err = 0; /* * Since, we don't have a name, we cannot figure out which blocks will * be affected in this operation. So, account for the worst case : * - 3 blocks overwritten: target leaf, ptrtbl block, header block * - 4 new blocks written if adding: * - 2 blocks for possibly split leaves, * - 2 grown ptrtbl blocks * * This also accomodates the case where an add operation to a fairly * large microzap results in a promotion to fatzap. */ if (name == NULL) { *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE; return (err); } /* * We lock the zap with adding == FALSE. Because, if we pass * the actual value of add, it could trigger a mzap_upgrade(). * At present we are just evaluating the possibility of this operation * and hence we donot want to trigger an upgrade. */ err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); if (err) return (err); if (!zap->zap_ismicro) { zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT); if (zn) { err = fzap_count_write(zn, add, towrite, tooverwrite); zap_name_free(zn); } else { /* * We treat this case as similar to (name == NULL) */ *towrite += (3 + (add ? 4 : 0)) * SPA_OLD_MAXBLOCKSIZE; } } else { /* * We are here if (name != NULL) and this is a micro-zap. * We account for the header block depending on whether it * is freeable. * * Incase of an add-operation it is hard to find out * if this add will promote this microzap to fatzap. * Hence, we consider the worst case and account for the * blocks assuming this microzap would be promoted to a * fatzap. * * 1 block overwritten : header block * 4 new blocks written : 2 new split leaf, 2 grown * ptrtbl blocks */ if (dmu_buf_freeable(zap->zap_dbuf)) *tooverwrite += MZAP_MAX_BLKSZ; else *towrite += MZAP_MAX_BLKSZ; if (add) { *towrite += 4 * MZAP_MAX_BLKSZ; } } zap_unlockdir(zap); return (err); } Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ioctl.c (revision 286575) @@ -1,6604 +1,6604 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011-2012 Pawel Jakub Dawidek . * All rights reserved. * Copyright 2013 Martin Matuska . All rights reserved. * Copyright 2014 Xin Li . All rights reserved. * Copyright 2015 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * The ioctl numbers can change from release to release, because * the caller (libzfs) must be matched to the kernel. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. */ #ifdef __FreeBSD__ #include "opt_kstack_pages.h" #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include "zfs_ioctl_compat.h" CTASSERT(sizeof(zfs_cmd_t) < IOCPARM_MAX); static struct cdev *zfsdev; extern void zfs_init(void); extern void zfs_fini(void); uint_t zfs_fsyncer_key; extern uint_t rrw_tsd_key; static uint_t zfs_allow_log_key; typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); typedef enum { NO_NAME, POOL_NAME, DATASET_NAME } zfs_ioc_namecheck_t; typedef enum { POOL_CHECK_NONE = 1 << 0, POOL_CHECK_SUSPENDED = 1 << 1, POOL_CHECK_READONLY = 1 << 2, } zfs_ioc_poolcheck_t; typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void zfsdev_close(void *data); static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature); /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void __dprintf(const char *file, const char *func, int line, const char *fmt, ...) { const char *newfile; char buf[512]; va_list adx; /* * Get rid of annoying "../common/" prefix to filename. */ newfile = strrchr(file, '/'); if (newfile != NULL) { newfile = newfile + 1; /* Get rid of leading / */ } else { newfile = file; } va_start(adx, fmt); (void) vsnprintf(buf, sizeof (buf), fmt, adx); va_end(adx); /* * To get this data, use the zfs-dprintf probe as so: * dtrace -q -n 'zfs-dprintf \ * /stringof(arg0) == "dbuf.c"/ \ * {printf("%s: %s", stringof(arg1), stringof(arg3))}' * arg0 = file name * arg1 = function name * arg2 = line number * arg3 = message */ DTRACE_PROBE4(zfs__dprintf, char *, newfile, char *, func, int, line, char *, buf); } static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Check to see if the named dataset is currently defined as bootable */ static boolean_t zfs_is_bootfs(const char *name) { objset_t *os; if (dmu_objset_hold(name, FTAG, &os) == 0) { boolean_t ret; ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); dmu_objset_rele(os, FTAG); return (ret); } return (B_FALSE); } /* * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ /* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ /* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (INGLOBALZONE(curthread) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curthread) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curthread)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, "jailed", &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, "jailed", &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } #ifdef SECLABEL /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) { char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); } #endif /* SECLABEL */ static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curthread)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curthread)) { uint64_t zoned; char setpoint[MAXNAMELEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, "jailed", &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: #ifdef SECLABEL if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } #else return (EOPNOTSUPP); #endif break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } /* ARGSUSED */ static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; error = zfs_dozonecheck(zc->zc_name, cr); if (error != 0) return (error); /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ return (0); } /* ARGSUSED */ static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } /* ARGSUSED */ static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *ds; char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. */ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } /* ARGSUSED */ static int zfs_secpolicy_deleg_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { vnode_t *vp; int error; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); return (SET_ERROR(EPERM)); } VN_RELE(vp); return (dsl_deleg_access(zc->zc_name, ZFS_DELEG_PERM_SHARE, cr)); } int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (!INGLOBALZONE(curthread)) return (SET_ERROR(EPERM)); if (secpolicy_nfs(cr) == 0) { return (0); } else { return (zfs_secpolicy_deleg_share(zc, innvl, cr)); } } int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (!INGLOBALZONE(curthread)) return (SET_ERROR(EPERM)); if (secpolicy_smb(cr) == 0) { return (0); } else { return (zfs_secpolicy_deleg_share(zc, innvl, cr)); } } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strncpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } /* ARGSUSED */ static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ /* ARGSUSED */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[MAXNAMELEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } /* ARGSUSED */ static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char *at = NULL; int error; if ((zc->zc_cookie & 1) != 0) { /* * This is recursive rename, so the starting snapshot might * not exist. Check file system or volume permission instead. */ at = strchr(zc->zc_name, '@'); if (at == NULL) return (EINVAL); *at = '\0'; } error = zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr); if (at != NULL) *at = '@'; return (error); } /* ARGSUSED */ static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[MAXNAMELEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; int error; nvpair_t *pair; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[MAXNAMELEN]; int error; char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ /* ARGSUSED */ static int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ /* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. */ /* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (secpolicy_zinject(cr)); } /* ARGSUSED */ static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else { if (groupmember(zc->zc_guid, cr)) return (0); } } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } /* ARGSUSED */ static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } /* ARGSUSED */ static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; nvlist_t *holds; int error; error = nvlist_lookup_nvlist(innvl, "holds", &holds); if (error != 0) return (SET_ERROR(EINVAL)); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[MAXNAMELEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } /* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[MAXNAMELEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. */ int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr)) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); return (error); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = kmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { kmem_free(packed, size); return (error); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { kmem_free(packed, size); return (error); } kmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { /* * Solaris returns ENOMEM here, because even if an error is * returned from an ioctl(2), new zc_nvlist_dst_size will be * passed to the userland. This is not the case for FreeBSD. * We need to return 0, so the kernel will copy the * zc_nvlist_dst_size back and the userland can discover that a * bigger buffer is needed. */ error = 0; } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } static int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); if (*zfvp) { VFS_HOLD((*zfvp)->z_vfs); } else { error = SET_ERROR(ESRCH); } mutex_exit(&os->os_user_ptr_lock); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_vfs will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all vnode ops from running. */ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, zfvp); if (error == 0) { rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : RW_READER, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ rrm_exit(&(*zfvp)->z_teardown_lock, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { rrm_exit(&zfsvfs->z_teardown_lock, tag); if (zfsvfs->z_vfs) { VFS_RELE(zfsvfs->z_vfs); } else { dmu_objset_disown(zfsvfs->z_os, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; uint64_t version = SPA_VERSION; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) { nvlist_free(config); nvlist_free(props); return (error); } (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; } error = spa_create(zc->zc_name, config, props, zplprops); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) (void) spa_destroy(zc->zc_name); pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); if (error == 0) zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); if (props) nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); if (error == 0) zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); kmem_free(hist_buf, size); return (error); } static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele(os, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config, **l2cache, **spares; uint_t nl2cache = 0, nspares = 0; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_SPARES, &spares, &nspares); #ifdef illumos /* * A root pool with concatenated devices is not supported. * Thus, can not add a device to a root pool. * * Intent log device can not be added to a rootpool because * during mountroot, zil is replayed, a seperated log device * can not be accessed during the mountroot time. * * l2cache and spare devices are ok to be added to a rootpool. */ if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { nvlist_free(config); spa_close(spa, FTAG); return (SET_ERROR(EDOM)); } #endif /* illumos */ if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_nvlist_conf nvlist of devices to remove * zc_cookie to stop the remove? */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; int replacing = zc->zc_cookie; nvlist_t *config; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) return (error); VERIFY0(error); } error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } if (error == ENOMEM) error = 0; return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. */ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if (err = dmu_objset_hold(zc->zc_name, FTAG, &os)) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. */ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } boolean_t dataset_name_hidden(const char *name) { /* * Skip over datasets that are not visible in this zone, * internal datasets (which have a $ in their name), and * temporary datasets (which have a % in their name). */ if (strchr(name, '$') != NULL) return (B_TRUE); if (strchr(name, '%') != NULL) return (B_TRUE); if (!INGLOBALZONE(curthread) && !zone_dataset_visible(name, NULL)) return (B_TRUE); return (B_FALSE); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. */ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * zc_simple when set, only name is requested * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? ESRCH : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == 0 && !zc->zc_simple) { dsl_dataset_t *ds; dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; error = dsl_dataset_hold_obj(dp, zc->zc_obj, FTAG, &ds); if (error == 0) { objset_t *ossnap; error = dmu_objset_from_ds(ds, &ossnap); if (error == 0) error = zfs_ioc_objset_stats_impl(zc, ossnap); dsl_dataset_rele(ds, FTAG); } } else if (error == ENOENT) { error = SET_ERROR(ESRCH); } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *domain; char *dash; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; int err = -1; if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) return (-1); VERIFY(0 == nvpair_value_uint64(pair, &intval)); switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dsname); (void) zfs_ioc_userspace_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; uint64_t intval; char *strval; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); int err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(EINVAL); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (!nvlist_empty(genericnvl) && dsl_props_set(dsname, source, genericnvl) != 0) { /* * If this fails, we still want to set as many properties as we * can, so try setting them individually. */ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); int err = 0; propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } } nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(const char *fsname, nvlist_t *nvl) { nvpair_t *pair = NULL; int error = 0; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_USERPROP, CRED())) return (error); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (E2BIG); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ if (received) { nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ if (prop == ZPROP_INVAL) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: nvlist_free(dummy); return (SET_ERROR(EINVAL)); } pair = nvlist_next_nvpair(dummy, NULL); err = zfs_prop_set_special(zc->zc_name, source, pair); nvlist_free(dummy); if (err != -1) return (err); /* special property already handled */ } else { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } /* property name has been validated by zfs_secpolicy_inherit_prop() */ return (dsl_prop_inherit(zc->zc_name, zc->zc_value, source)); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props)) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. */ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_config_sync(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } /* * Search the vfs list for a specified resource. Returns a pointer to it * or NULL if no suitable entry is found. The caller of this routine * is responsible for releasing the returned vfs pointer. */ static vfs_t * zfs_get_vfs(const char *resource) { vfs_t *vfsp; mtx_lock(&mountlist_mtx); TAILQ_FOREACH(vfsp, &mountlist, mnt_list) { if (strcmp(refstr_value(vfsp->vfs_resource), resource) == 0) { VFS_HOLD(vfsp); break; } } mtx_unlock(&mountlist_mtx); return (vfsp); } /* ARGSUSED */ static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; ASSERT(zplprops != NULL); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED) VERIFY(zfs_get_zplprop(os, ZFS_PROP_CASE, &sense) == 0); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[MAXNAMELEN]; char *cp; spa_t *spa; uint64_t spa_vers; int error; (void) strlcpy(parentname, dataset, sizeof (parentname)); cp = strrchr(parentname, '/'); ASSERT(cp != NULL); cp[0] = '\0'; if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. */ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); int32_t type32; dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; if (nvlist_lookup_int32(innvl, "type", &type32) != 0) return (SET_ERROR(EINVAL)); type = type32; (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize( volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); nvlist_free(zct.zct_zplprops); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } #ifdef __FreeBSD__ if (error == 0 && type == DMU_OST_ZVOL) zvol_create_minors(fsname); #endif return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * } * * outnvl: propname -> error code (int32) */ static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; char *origin_name; if (nvlist_lookup_string(innvl, "origin", &origin_name) != 0) return (SET_ERROR(EINVAL)); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); if (error != 0) return (error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } #ifdef __FreeBSD__ if (error == 0) zvol_create_minors(fsname); #endif return (error); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if ((error = zfs_check_userprops(poolname, props)) != 0) return (error); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* This must be the only snap of this fs. */ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ /* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { char *message; spa_t *spa; int error; char *poolname; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); strfree(poolname); if (error != 0) return (error); if (nvlist_lookup_string(innvl, "message", &message) != 0) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ int zfs_unmount_snap(const char *snapname) { vfs_t *vfsp; zfsvfs_t *zfsvfs; int err; if (strchr(snapname, '@') == NULL) return (0); vfsp = zfs_get_vfs(snapname); if (vfsp == NULL) return (0); zfsvfs = vfsp->vfs_data; ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); err = vn_vfswlock(vfsp->vfs_vnodecovered); VFS_RELE(vfsp); if (err != 0) return (SET_ERROR(err)); /* * Always force the unmount for snapshots. */ #ifdef illumos (void) dounmount(vfsp, MS_FORCE, kcred); #else vfs_ref(vfsp); (void) dounmount(vfsp, MS_FORCE, curthread); #endif return (0); } /* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { return (zfs_unmount_snap(snapname)); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[MAXNAMELEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); (void) zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) * */ /* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; if (nvlist_lookup_nvlist(innvl, "snaps", &snaps) != 0) return (SET_ERROR(EINVAL)); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); error = zfs_unmount_snap(name); if (error != 0) return (error); #if defined(__FreeBSD__) zvol_remove_minors(name); #endif } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. Bookmark names are of the form #. * All bookmarks must be in the same pool. * * innvl: { * bookmark1 -> snapshot1, bookmark2 -> snapshot2 * } * * outnvl: bookmark -> error code (int32) * */ /* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *snap_name; /* * Verify the snapshot argument. */ if (nvpair_value_string(pair, &snap_name) != 0) return (SET_ERROR(EINVAL)); /* Verify that the keys (bookmarks) are unique */ for (nvpair_t *pair2 = nvlist_next_nvpair(innvl, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(innvl, pair2)) { if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) return (SET_ERROR(EINVAL)); } } return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } /* * inputs: * zc_name name of dataset to destroy * zc_objset_type type of objset * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { int err; if (zc->zc_objset_type == DMU_OST_ZFS) { err = zfs_unmount_snap(zc->zc_name); if (err != 0) return (err); } if (strchr(zc->zc_name, '@')) err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); else err = dsl_destroy_head(zc->zc_name); if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) #ifdef __FreeBSD__ zvol_remove_minors(zc->zc_name); #else (void) zvol_remove_minor(zc->zc_name); #endif return (err); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl is not used. * * outnvl: "target" -> name of most recent snapshot * } */ /* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *args, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; int error; if (getzfsvfs(fsname, &zfsvfs) == 0) { error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, fsname); error = error ? error : resume_err; } VFS_RELE(zfsvfs->z_vfs); } else { error = dsl_dataset_rollback(fsname, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char fullname[MAXNAMELEN]; (void) snprintf(fullname, sizeof (fullname), "%s@%s", fsname, snapname); return (zfs_unmount_snap(fullname)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { boolean_t recursive = zc->zc_cookie & 1; char *at; boolean_t allow_mounted = B_TRUE; #ifdef __FreeBSD__ allow_mounted = (zc->zc_cookie & 2) != 0; #endif zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (zc->zc_objset_type == DMU_OST_ZFS && allow_mounted) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { #ifdef illumos if (zc->zc_objset_type == DMU_OST_ZVOL) (void) zvol_remove_minor(zc->zc_name); #endif return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval; int err; if (prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr)) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else { /* USERUSED and GROUPUSED are read-only */ return (SET_ERROR(EINVAL)); } if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. */ if (nvpair_value_uint64(pair, &intval) == 0) { if (intval >= ZIO_COMPRESS_GZIP_1 && intval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (intval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (intval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } /* * If this is a bootable dataset then * verify that the compression algorithm * is supported for booting. We must return * something other than ENOTSUP since it * implies a downrev pool version. */ if (zfs_is_bootfs(dsname) && !BOOTFS_COMPRESS_VALID(intval)) { return (SET_ERROR(ERANGE)); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_DEDUP: if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * If this is a bootable dataset then * the we don't allow large (>128K) blocks, * because GRUB doesn't support them. */ if (zfs_is_bootfs(dsname) && intval > SPA_OLD_MAXBLOCKSIZE) { return (SET_ERROR(EDOM)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(EDOM)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Checks for a race condition to make sure we don't increment a feature flag * multiple times. */ static int zfs_prop_activate_feature_check(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_feature_t *featurep = arg; if (!spa_feature_is_active(spa, *featurep)) return (0); else return (SET_ERROR(EBUSY)); } /* * The callback invoked on feature activation in the sync task caused by * zfs_prop_activate_feature. */ static void zfs_prop_activate_feature_sync(void *arg, dmu_tx_t *tx) { spa_t *spa = dmu_tx_pool(tx)->dp_spa; spa_feature_t *featurep = arg; spa_feature_incr(spa, *featurep, tx); } /* * Activates a feature on a pool in response to a property setting. This * creates a new sync task which modifies the pool to reflect the feature * as being active. */ static int zfs_prop_activate_feature(spa_t *spa, spa_feature_t feature) { int err; /* EBUSY here indicates that the feature is already active */ err = dsl_sync_task(spa_name(spa), zfs_prop_activate_feature_check, zfs_prop_activate_feature_sync, &feature, 2, ZFS_SPACE_CHECK_RESERVED); if (err != 0 && err != EBUSY) return (err); else return (0); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dataset); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strcpy(zc->zc_value, nvpair_name(pair)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } #ifdef DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * inputs: * zc_name name of containing filesystem * zc_nvlist_src{_size} nvlist of properties to apply * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * zc_cleanup_fd cleanup-on-exit file descriptor * zc_action_handle handle for this guid/ds mapping (or zero on first call) * * outputs: * zc_cookie number of bytes read * zc_nvlist_dst{_size} error for each unapplied received property * zc_obj zprop_errflags_t * zc_action_handle handle for this guid/ds mapping */ static int zfs_ioc_recv(zfs_cmd_t *zc) { file_t *fp; dmu_recv_cookie_t drc; boolean_t force = (boolean_t)zc->zc_guid; int fd; int error = 0; int props_error = 0; nvlist_t *errors; offset_t off; nvlist_t *props = NULL; /* sent properties */ nvlist_t *origprops = NULL; /* existing properties */ char *origin = NULL; char *tosnap; char tofs[ZFS_MAXNAMELEN]; cap_rights_t rights; boolean_t first_recvd_props = B_FALSE; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strcpy(tofs, zc->zc_value); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props)) != 0) return (error); fd = zc->zc_cookie; #ifdef illumos fp = getf(fd); #else fget_read(curthread, fd, cap_rights_init(&rights, CAP_PREAD), &fp); #endif if (fp == NULL) { nvlist_free(props); return (SET_ERROR(EBADF)); } VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); if (zc->zc_string[0]) origin = zc->zc_string; error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, force, origin, &drc); if (error != 0) goto out; /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (props != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, so stash * away the existing ones. */ if (dsl_prop_get_received(tofs, &origprops) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. */ if (!first_recvd_props) props_reduce(props, origprops); if (zfs_check_clearable(tofs, origprops, &errlist) != 0) (void) nvlist_merge(errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origprops, first_recvd_props ? NULL : props) != 0) zc->zc_obj |= ZPROP_ERR_NOCLEAR; } else { zc->zc_obj |= ZPROP_ERR_NOCLEAR; } } if (props != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, props, errors); } } if (zc->zc_nvlist_dst_size != 0 && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. */ props_error = SET_ERROR(EINVAL); } off = fp->f_offset; error = dmu_recv_stream(&drc, fp, &off, zc->zc_cleanup_fd, &zc->zc_action_handle); if (error == 0) { zfsvfs_t *zfsvfs = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ int end_err; error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); if (error == 0) error = zfs_resume_fs(zfsvfs, tofs); error = error ? error : end_err; VFS_RELE(zfsvfs->z_vfs); } else { error = dmu_recv_end(&drc, NULL); } } zc->zc_cookie = off - fp->f_offset; if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; #ifdef DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif #ifdef __FreeBSD__ if (error == 0) zvol_create_minors(tofs); #endif /* * On error, restore the original props. */ if (error != 0 && props != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, props, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origprops == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explictly if we're restoring local properties cleared in the * first new-style receive. */ if (origprops != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origprops, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ zc->zc_obj |= ZPROP_ERR_NORESTORE; } } out: nvlist_free(props); nvlist_free(origprops); nvlist_free(errors); releasef(fd); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set */ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate(tosnap, fromsnap, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { file_t *fp; cap_rights_t rights; #ifdef illumos fp = getf(zc->zc_cookie); #else fget_write(curthread, zc->zc_cookie, cap_rights_init(&rights, CAP_WRITE), &fp); #endif if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, #ifdef illumos zc->zc_cookie, fp->f_vnode, &off); #else zc->zc_cookie, fp, &off); #endif if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; releasef(zc->zc_cookie); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendarg_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dsa_outfd == zc->zc_cookie && dsp->dsa_proc == curproc) break; } if (dsp != NULL) zc->zc_cookie = *(dsp->dsa_off); else error = SET_ERROR(ENOENT); mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; size_t count = (size_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &count); if (error == 0) zc->zc_nvlist_dst_size = count; else zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { (void) spa_vdev_state_exit(spa, NULL, ENODEV); spa_close(spa, FTAG); return (SET_ERROR(ENODEV)); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, NULL, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_reopen(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If a resilver is already in progress then set the * spa_scrub_reopen flag to B_TRUE so that we don't restart * the scan as a side effect of the reopen. Otherwise, let * vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = dsl_scan_resilvering(spa->spa_dsl_pool); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * zc_value name of origin snapshot * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { char *cp; /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(zc->zc_value, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(zc->zc_value, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. "S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = kmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = ddi_copyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size, zc->zc_iflags); } kmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { objset_t *os; int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(zfsvfs->z_os, zfsvfs); error = zfs_resume_fs(zfsvfs, zc->zc_name); } } if (error == 0) error = dmu_objset_userspace_upgrade(zfsvfs->z_os); VFS_RELE(zfsvfs->z_vfs); } else { /* XXX kind of reading contents without owning */ error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_objset_userspace_upgrade(os); dmu_objset_rele(os, FTAG); } return (error); } #ifdef illumos /* * We don't want to have a hard dependency * against some special symbols in sharefs * nfs, and smbsrv. Determine them if needed when * the first file system is shared. * Neither sharefs, nfs or smbsrv are unloadable modules. */ int (*znfsexport_fs)(void *arg); int (*zshare_fs)(enum sharefs_sys_op, share_t *, uint32_t); int (*zsmbexport_fs)(void *arg, boolean_t add_share); int zfs_nfsshare_inited; int zfs_smbshare_inited; ddi_modhandle_t nfs_mod; ddi_modhandle_t sharefs_mod; ddi_modhandle_t smbsrv_mod; #endif /* illumos */ kmutex_t zfs_share_lock; #ifdef illumos static int zfs_init_sharefs() { int error; ASSERT(MUTEX_HELD(&zfs_share_lock)); /* Both NFS and SMB shares also require sharetab support. */ if (sharefs_mod == NULL && ((sharefs_mod = ddi_modopen("fs/sharefs", KRTLD_MODE_FIRST, &error)) == NULL)) { return (SET_ERROR(ENOSYS)); } if (zshare_fs == NULL && ((zshare_fs = (int (*)(enum sharefs_sys_op, share_t *, uint32_t)) ddi_modsym(sharefs_mod, "sharefs_impl", &error)) == NULL)) { return (SET_ERROR(ENOSYS)); } return (0); } #endif /* illumos */ static int zfs_ioc_share(zfs_cmd_t *zc) { #ifdef illumos int error; int opcode; switch (zc->zc_share.z_sharetype) { case ZFS_SHARE_NFS: case ZFS_UNSHARE_NFS: if (zfs_nfsshare_inited == 0) { mutex_enter(&zfs_share_lock); if (nfs_mod == NULL && ((nfs_mod = ddi_modopen("fs/nfs", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } if (znfsexport_fs == NULL && ((znfsexport_fs = (int (*)(void *)) ddi_modsym(nfs_mod, "nfs_export", &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } error = zfs_init_sharefs(); if (error != 0) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } zfs_nfsshare_inited = 1; mutex_exit(&zfs_share_lock); } break; case ZFS_SHARE_SMB: case ZFS_UNSHARE_SMB: if (zfs_smbshare_inited == 0) { mutex_enter(&zfs_share_lock); if (smbsrv_mod == NULL && ((smbsrv_mod = ddi_modopen("drv/smbsrv", KRTLD_MODE_FIRST, &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } if (zsmbexport_fs == NULL && ((zsmbexport_fs = (int (*)(void *, boolean_t))ddi_modsym(smbsrv_mod, "smb_server_share", &error)) == NULL)) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } error = zfs_init_sharefs(); if (error != 0) { mutex_exit(&zfs_share_lock); return (SET_ERROR(ENOSYS)); } zfs_smbshare_inited = 1; mutex_exit(&zfs_share_lock); } break; default: return (SET_ERROR(EINVAL)); } switch (zc->zc_share.z_sharetype) { case ZFS_SHARE_NFS: case ZFS_UNSHARE_NFS: if (error = znfsexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata)) return (error); break; case ZFS_SHARE_SMB: case ZFS_UNSHARE_SMB: if (error = zsmbexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata, zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? B_TRUE: B_FALSE)) { return (error); } break; } opcode = (zc->zc_share.z_sharetype == ZFS_SHARE_NFS || zc->zc_share.z_sharetype == ZFS_SHARE_SMB) ? SHAREFS_ADD : SHAREFS_REMOVE; /* * Add or remove share from sharetab */ error = zshare_fs(opcode, (void *)(uintptr_t)zc->zc_share.z_sharedata, zc->zc_share.z_sharemax); return (error); #else /* !illumos */ return (ENOSYS); #endif /* illumos */ } ace_t full_access[] = { {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} }; /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, dsl_dataset_phys(os->os_dsl_dataset)->ds_prev_snap_txg); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; int error; minor_t minor; error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (error != 0) return (error); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strcpy(zc->zc_value, snap_name); strfree(snap_name); strfree(hold_name); zfs_onexit_fd_rele(zc->zc_cleanup_fd); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { file_t *fp; cap_rights_t rights; offset_t off; int error; #ifdef illumos fp = getf(zc->zc_cookie); #else fget_write(curthread, zc->zc_cookie, cap_rights_init(&rights, CAP_WRITE), &fp); #endif if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; #ifdef illumos error = dmu_diff(zc->zc_name, zc->zc_value, fp->f_vnode, &off); #else error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); #endif if (off >= 0 && off <= MAXOFFSET_T) fp->f_offset = off; releasef(zc->zc_cookie); return (error); } #ifdef illumos /* * Remove all ACL files in shares dir */ static int zfs_smb_acl_purge(znode_t *dzp) { zap_cursor_t zc; zap_attribute_t zap; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int error; for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); (error = zap_cursor_retrieve(&zc, &zap)) == 0; zap_cursor_advance(&zc)) { if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, NULL, 0)) != 0) break; } zap_cursor_fini(&zc); return (error); } #endif /* illumos */ static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { #ifdef illumos vnode_t *vp; znode_t *dzp; vnode_t *resourcevp = NULL; znode_t *sharedir; zfsvfs_t *zfsvfs; nvlist_t *nvlist; char *src, *target; vattr_t vattr; vsecattr_t vsec; int error = 0; if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp)) != 0) return (error); /* Now make sure mntpnt and dataset are ZFS */ if (strcmp(vp->v_vfsp->mnt_stat.f_fstypename, "zfs") != 0 || (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), zc->zc_name) != 0)) { VN_RELE(vp); return (SET_ERROR(EINVAL)); } dzp = VTOZ(vp); zfsvfs = dzp->z_zfsvfs; ZFS_ENTER(zfsvfs); /* * Create share dir if its missing. */ mutex_enter(&zfsvfs->z_lock); if (zfsvfs->z_shares_dir == 0) { dmu_tx_t *tx; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, ZFS_SHARES_DIR); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { error = zfs_create_share_dir(zfsvfs, tx); dmu_tx_commit(tx); } if (error != 0) { mutex_exit(&zfsvfs->z_lock); VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } } mutex_exit(&zfsvfs->z_lock); ASSERT(zfsvfs->z_shares_dir); if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } switch (zc->zc_cookie) { case ZFS_SMB_ACL_ADD: vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; vattr.va_type = VREG; vattr.va_mode = S_IFREG|0777; vattr.va_uid = 0; vattr.va_gid = 0; vsec.vsa_mask = VSA_ACE; vsec.vsa_aclentp = &full_access; vsec.vsa_aclentsz = sizeof (full_access); vsec.vsa_aclcnt = 1; error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); if (resourcevp) VN_RELE(resourcevp); break; case ZFS_SMB_ACL_REMOVE: error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, NULL, 0); break; case ZFS_SMB_ACL_RENAME: if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { VN_RELE(vp); ZFS_EXIT(zfsvfs); return (error); } if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, &target)) { VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); nvlist_free(nvlist); return (error); } error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, kcred, NULL, 0); nvlist_free(nvlist); break; case ZFS_SMB_ACL_PURGE: error = zfs_smb_acl_purge(sharedir); break; default: error = SET_ERROR(EINVAL); break; } VN_RELE(vp); VN_RELE(ZTOV(sharedir)); ZFS_EXIT(zfsvfs); return (error); #else /* !illumos */ return (EOPNOTSUPP); #endif /* illumos */ } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ /* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; error = nvlist_lookup_nvlist(args, "holds", &holds); if (error != 0) return (SET_ERROR(EINVAL)); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { error = zfs_onexit_fd_hold(cleanup_fd, &minor); if (error != 0) return (error); } error = dsl_dataset_user_hold(holds, minor, errlist); if (minor != 0) zfs_onexit_fd_rele(cleanup_fd); return (error); } /* * innvl is not used. * * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ /* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ /* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_name name of new filesystem or snapshot * zc_value full name of old snapshot * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; if (nvlist_lookup_string(innvl, "firstsnap", &firstsnap) != 0) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); - if (error == 0 && !dsl_dataset_is_snapshot(new)) { + if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); - if (error == 0 && !dsl_dataset_is_snapshot(old)) { + if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } static int zfs_ioc_jail(zfs_cmd_t *zc) { return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, (int)zc->zc_jailid)); } static int zfs_ioc_unjail(zfs_cmd_t *zc) { return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, (int)zc->zc_jailid)); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * } * * outnvl is unused */ /* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { cap_rights_t rights; file_t *fp; int error; offset_t off; char *fromname = NULL; int fd; boolean_t largeblockok; boolean_t embedok; error = nvlist_lookup_int32(innvl, "fd", &fd); if (error != 0) return (SET_ERROR(EINVAL)); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); #ifdef illumos file_t *fp = getf(fd); #else fget_write(curthread, fd, cap_rights_init(&rights, CAP_WRITE), &fp); #endif if (fp == NULL) return (SET_ERROR(EBADF)); off = fp->f_offset; error = dmu_send(snapname, fromname, embedok, largeblockok, #ifdef illumos fd, fp->f_vnode, &off); #else fd, fp, &off); #endif #ifdef illumos if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; #else fp->f_offset = off; #endif releasef(fd); return (error); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "fromsnap" -> full snap name to send an incremental from * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *fromsnap = NULL; dsl_dataset_t *tosnap; int error; char *fromname; uint64_t space; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = nvlist_lookup_string(innvl, "fromsnap", &fromname); if (error == 0) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate(tosnap, fromsnap, &space); fnvlist_add_uint64(outnvl, "space", space); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of this file for details on * each argument to this function. */ static void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } static void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, B_TRUE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); #ifdef __FreeBSD__ zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, zfs_secpolicy_config, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, zfs_secpolicy_config, POOL_CHECK_NONE); #endif } int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } /* * Find a free minor number. */ minor_t zfsdev_minor_alloc(void) { static minor_t last_minor; minor_t m; ASSERT(MUTEX_HELD(&spa_namespace_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (ddi_get_soft_state(zfsdev_state, m) == NULL) { last_minor = m; return (m); } } return (0); } static int zfs_ctldev_init(struct cdev *devp) { minor_t minor; zfs_soft_state_t *zs; ASSERT(MUTEX_HELD(&spa_namespace_lock)); minor = zfsdev_minor_alloc(); if (minor == 0) return (SET_ERROR(ENXIO)); if (ddi_soft_state_zalloc(zfsdev_state, minor) != DDI_SUCCESS) return (SET_ERROR(EAGAIN)); devfs_set_cdevpriv((void *)(uintptr_t)minor, zfsdev_close); zs = ddi_get_soft_state(zfsdev_state, minor); zs->zss_type = ZSST_CTLDEV; zfs_onexit_init((zfs_onexit_t **)&zs->zss_data); return (0); } static void zfs_ctldev_destroy(zfs_onexit_t *zo, minor_t minor) { ASSERT(MUTEX_HELD(&spa_namespace_lock)); zfs_onexit_destroy(zo); ddi_soft_state_free(zfsdev_state, minor); } void * zfsdev_get_soft_state(minor_t minor, enum zfs_soft_state_type which) { zfs_soft_state_t *zp; zp = ddi_get_soft_state(zfsdev_state, minor); if (zp == NULL || zp->zss_type != which) return (NULL); return (zp->zss_data); } static int zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td) { int error = 0; #ifdef illumos if (getminor(*devp) != 0) return (zvol_open(devp, flag, otyp, cr)); #endif /* This is the control device. Allocate a new minor if requested. */ if (flag & FEXCL) { mutex_enter(&spa_namespace_lock); error = zfs_ctldev_init(devp); mutex_exit(&spa_namespace_lock); } return (error); } static void zfsdev_close(void *data) { zfs_onexit_t *zo; minor_t minor = (minor_t)(uintptr_t)data; if (minor == 0) return; mutex_enter(&spa_namespace_lock); zo = zfsdev_get_soft_state(minor, ZSST_CTLDEV); if (zo == NULL) { mutex_exit(&spa_namespace_lock); return; } zfs_ctldev_destroy(zo, minor); mutex_exit(&spa_namespace_lock); } static int zfsdev_ioctl(struct cdev *dev, u_long zcmd, caddr_t arg, int flag, struct thread *td) { zfs_cmd_t *zc; uint_t vecnum; int error, rc, len; #ifdef illumos minor_t minor = getminor(dev); #else zfs_iocparm_t *zc_iocparm; int cflag, cmd, oldvecnum; boolean_t newioc, compat; void *compat_zc = NULL; cred_t *cr = td->td_ucred; #endif const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; nvlist_t *innvl = NULL; cflag = ZFS_CMD_COMPAT_NONE; compat = B_FALSE; newioc = B_TRUE; /* "new" style (zfs_iocparm_t) ioctl */ len = IOCPARM_LEN(zcmd); vecnum = cmd = zcmd & 0xff; /* * Check if we are talking to supported older binaries * and translate zfs_cmd if necessary */ if (len != sizeof(zfs_iocparm_t)) { newioc = B_FALSE; compat = B_TRUE; vecnum = cmd; switch (len) { case sizeof(zfs_cmd_zcmd_t): cflag = ZFS_CMD_COMPAT_LZC; break; case sizeof(zfs_cmd_deadman_t): cflag = ZFS_CMD_COMPAT_DEADMAN; break; case sizeof(zfs_cmd_v28_t): cflag = ZFS_CMD_COMPAT_V28; break; case sizeof(zfs_cmd_v15_t): cflag = ZFS_CMD_COMPAT_V15; vecnum = zfs_ioctl_v15_to_v28[cmd]; /* * Return without further handling * if the command is blacklisted. */ if (vecnum == ZFS_IOC_COMPAT_PASS) return (0); else if (vecnum == ZFS_IOC_COMPAT_FAIL) return (ENOTSUP); break; default: return (EINVAL); } } #ifdef illumos vecnum = cmd - ZFS_IOC_FIRST; ASSERT3U(getmajor(dev), ==, ddi_driver_major(zfs_dip)); #endif if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(EINVAL)); vec = &zfs_ioc_vec[vecnum]; zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); #ifdef illumos error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; } #else /* !illumos */ bzero(zc, sizeof(zfs_cmd_t)); if (newioc) { zc_iocparm = (void *)arg; switch (zc_iocparm->zfs_ioctl_version) { case ZFS_IOCVER_CURRENT: if (zc_iocparm->zfs_cmd_size != sizeof(zfs_cmd_t)) { error = SET_ERROR(EINVAL); goto out; } break; case ZFS_IOCVER_ZCMD: if (zc_iocparm->zfs_cmd_size > sizeof(zfs_cmd_t) || zc_iocparm->zfs_cmd_size < sizeof(zfs_cmd_zcmd_t)) { error = SET_ERROR(EFAULT); goto out; } compat = B_TRUE; cflag = ZFS_CMD_COMPAT_ZCMD; break; default: error = SET_ERROR(EINVAL); goto out; /* NOTREACHED */ } if (compat) { ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); compat_zc = kmem_zalloc(sizeof(zfs_cmd_t), KM_SLEEP); bzero(compat_zc, sizeof(zfs_cmd_t)); error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, compat_zc, zc_iocparm->zfs_cmd_size, flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; } } else { error = ddi_copyin((void *)(uintptr_t)zc_iocparm->zfs_cmd, zc, zc_iocparm->zfs_cmd_size, flag); if (error != 0) { error = SET_ERROR(EFAULT); goto out; } } } if (compat) { if (newioc) { ASSERT(compat_zc != NULL); zfs_cmd_compat_get(zc, compat_zc, cflag); } else { ASSERT(compat_zc == NULL); zfs_cmd_compat_get(zc, arg, cflag); } oldvecnum = vecnum; error = zfs_ioctl_compat_pre(zc, &vecnum, cflag); if (error != 0) goto out; if (oldvecnum != vecnum) vec = &zfs_ioc_vec[vecnum]; } #endif /* !illumos */ zc->zc_iflags = flag & FKIOCTL; if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* rewrite innvl for backwards compatibility */ if (compat) innvl = zfs_ioctl_compat_innvl(zc, innvl, vecnum, cflag); /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case NO_NAME: break; } if (error == 0 && !(flag & FKIOCTL)) error = vec->zvec_secpolicy(zc, innvl, cr); if (error != 0) goto out; /* legacy ioctls can modify zc_name */ len = strcspn(zc->zc_name, "/@#") + 1; saved_poolname = kmem_alloc(len, KM_SLEEP); (void) strlcpy(saved_poolname, zc->zc_name, len); if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); if (error == 0 && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); /* rewrite outnvl for backwards compatibility */ if (compat) outnvl = zfs_ioctl_compat_outnvl(zc, outnvl, vecnum, cflag); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { error = vec->zvec_legacy_func(zc); } out: nvlist_free(innvl); #ifdef illumos rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); #else if (compat) { zfs_ioctl_compat_post(zc, cmd, cflag); if (newioc) { ASSERT(compat_zc != NULL); ASSERT(sizeof(zfs_cmd_t) >= zc_iocparm->zfs_cmd_size); zfs_cmd_compat_put(zc, compat_zc, vecnum, cflag); rc = ddi_copyout(compat_zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd, zc_iocparm->zfs_cmd_size, flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); kmem_free(compat_zc, sizeof (zfs_cmd_t)); } else { zfs_cmd_compat_put(zc, arg, vecnum, cflag); } } else { ASSERT(newioc); rc = ddi_copyout(zc, (void *)(uintptr_t)zc_iocparm->zfs_cmd, sizeof (zfs_cmd_t), flag); if (error == 0 && rc != 0) error = SET_ERROR(EFAULT); } #endif if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) strfree(s); (void) tsd_set(zfs_allow_log_key, saved_poolname); } else { if (saved_poolname != NULL) strfree(saved_poolname); } kmem_free(zc, sizeof (zfs_cmd_t)); return (error); } #ifdef illumos static int zfs_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) { if (cmd != DDI_ATTACH) return (DDI_FAILURE); if (ddi_create_minor_node(dip, "zfs", S_IFCHR, 0, DDI_PSEUDO, 0) == DDI_FAILURE) return (DDI_FAILURE); zfs_dip = dip; ddi_report_dev(dip); return (DDI_SUCCESS); } static int zfs_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) { if (spa_busy() || zfs_busy() || zvol_busy()) return (DDI_FAILURE); if (cmd != DDI_DETACH) return (DDI_FAILURE); zfs_dip = NULL; ddi_prop_remove_all(dip); ddi_remove_minor_node(dip, NULL); return (DDI_SUCCESS); } /*ARGSUSED*/ static int zfs_info(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) { switch (infocmd) { case DDI_INFO_DEVT2DEVINFO: *result = zfs_dip; return (DDI_SUCCESS); case DDI_INFO_DEVT2INSTANCE: *result = (void *)0; return (DDI_SUCCESS); } return (DDI_FAILURE); } #endif /* illumos */ /* * OK, so this is a little weird. * * /dev/zfs is the control node, i.e. minor 0. * /dev/zvol/[r]dsk/pool/dataset are the zvols, minor > 0. * * /dev/zfs has basically nothing to do except serve up ioctls, * so most of the standard driver entry points are in zvol.c. */ #ifdef illumos static struct cb_ops zfs_cb_ops = { zfsdev_open, /* open */ zfsdev_close, /* close */ zvol_strategy, /* strategy */ nodev, /* print */ zvol_dump, /* dump */ zvol_read, /* read */ zvol_write, /* write */ zfsdev_ioctl, /* ioctl */ nodev, /* devmap */ nodev, /* mmap */ nodev, /* segmap */ nochpoll, /* poll */ ddi_prop_op, /* prop_op */ NULL, /* streamtab */ D_NEW | D_MP | D_64BIT, /* Driver compatibility flag */ CB_REV, /* version */ nodev, /* async read */ nodev, /* async write */ }; static struct dev_ops zfs_dev_ops = { DEVO_REV, /* version */ 0, /* refcnt */ zfs_info, /* info */ nulldev, /* identify */ nulldev, /* probe */ zfs_attach, /* attach */ zfs_detach, /* detach */ nodev, /* reset */ &zfs_cb_ops, /* driver operations */ NULL, /* no bus operations */ NULL, /* power */ ddi_quiesce_not_needed, /* quiesce */ }; static struct modldrv zfs_modldrv = { &mod_driverops, "ZFS storage pool", &zfs_dev_ops }; static struct modlinkage modlinkage = { MODREV_1, (void *)&zfs_modlfs, (void *)&zfs_modldrv, NULL }; #endif /* illumos */ static struct cdevsw zfs_cdevsw = { .d_version = D_VERSION, .d_open = zfsdev_open, .d_ioctl = zfsdev_ioctl, .d_name = ZFS_DEV_NAME }; static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; strfree(poolname); } static void zfsdev_init(void) { zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666, ZFS_DEV_NAME); } static void zfsdev_fini(void) { if (zfsdev != NULL) destroy_dev(zfsdev); } static struct root_hold_token *zfs_root_token; struct proc *zfsproc; #ifdef illumos int _init(void) { int error; spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); zfs_ioctl_init(); if ((error = mod_install(&modlinkage)) != 0) { zvol_fini(); zfs_fini(); spa_fini(); return (error); } tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); error = ldi_ident_from_mod(&modlinkage, &zfs_li); ASSERT(error == 0); mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); return (0); } int _fini(void) { int error; if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) return (SET_ERROR(EBUSY)); if ((error = mod_remove(&modlinkage)) != 0) return (error); zvol_fini(); zfs_fini(); spa_fini(); if (zfs_nfsshare_inited) (void) ddi_modclose(nfs_mod); if (zfs_smbshare_inited) (void) ddi_modclose(smbsrv_mod); if (zfs_nfsshare_inited || zfs_smbshare_inited) (void) ddi_modclose(sharefs_mod); tsd_destroy(&zfs_fsyncer_key); ldi_ident_release(zfs_li); zfs_li = NULL; mutex_destroy(&zfs_share_lock); return (error); } int _info(struct modinfo *modinfop) { return (mod_info(&modlinkage, modinfop)); } #endif /* illumos */ static int zfs__init(void); static int zfs__fini(void); static void zfs_shutdown(void *, int); static eventhandler_tag zfs_shutdown_event_tag; #ifdef __FreeBSD__ #define ZFS_MIN_KSTACK_PAGES 4 #endif int zfs__init(void) { #ifdef __FreeBSD__ #if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack " "overflow panic!\nPlease consider adding " "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES, ZFS_MIN_KSTACK_PAGES); #endif #endif zfs_root_token = root_mount_hold("ZFS"); mutex_init(&zfs_share_lock, NULL, MUTEX_DEFAULT, NULL); spa_init(FREAD | FWRITE); zfs_init(); zvol_init(); zfs_ioctl_init(); tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); printf("ZFS storage pool version: features support (" SPA_VERSION_STRING ")\n"); root_mount_rel(zfs_root_token); zfsdev_init(); return (0); } int zfs__fini(void) { if (spa_busy() || zfs_busy() || zvol_busy() || zio_injection_enabled) { return (EBUSY); } zfsdev_fini(); zvol_fini(); zfs_fini(); spa_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); mutex_destroy(&zfs_share_lock); return (0); } static void zfs_shutdown(void *arg __unused, int howto __unused) { /* * ZFS fini routines can not properly work in a panic-ed system. */ if (panicstr == NULL) (void)zfs__fini(); } static int zfs_modevent(module_t mod, int type, void *unused __unused) { int err; switch (type) { case MOD_LOAD: err = zfs__init(); if (err == 0) zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( shutdown_post_sync, zfs_shutdown, NULL, SHUTDOWN_PRI_FIRST); return (err); case MOD_UNLOAD: err = zfs__fini(); if (err == 0 && zfs_shutdown_event_tag != NULL) EVENTHANDLER_DEREGISTER(shutdown_post_sync, zfs_shutdown_event_tag); return (err); case MOD_SHUTDOWN: return (0); default: break; } return (EOPNOTSUPP); } static moduledata_t zfs_mod = { "zfsctrl", zfs_modevent, 0 }; DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_VFS, SI_ORDER_ANY); MODULE_VERSION(zfsctrl, 1); MODULE_DEPEND(zfsctrl, opensolaris, 1, 1, 1); MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_sa.c (revision 286575) @@ -1,334 +1,333 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ -#include -#include +#include #include #include #include #include /* * ZPL attribute registration table. * Order of attributes doesn't matter * a unique value will be assigned for each * attribute that is file system specific * * This is just the set of ZPL attributes that this * version of ZFS deals with natively. The file system * could have other attributes stored in files, but they will be * ignored. The SA framework will preserve them, just that * this version of ZFS won't change or delete them. */ sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, {"ZPL_DACL_ACES", 0, SA_ACL, 0}, {NULL, 0, 0, 0} }; #ifdef _KERNEL int zfs_sa_readlink(znode_t *zp, uio_t *uio) { dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); size_t bufsz; int error; bufsz = zp->z_size; if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { error = uiomove((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); } else { dmu_buf_t *dbp; if ((error = dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { error = uiomove(dbp->db_data, MIN((size_t)bufsz, uio->uio_resid), UIO_READ, uio); dmu_buf_rele(dbp, FTAG); } } return (error); } void zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) { dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { VERIFY(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx) == 0); if (len) { bcopy(link, (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, len); } } else { dmu_buf_t *dbp; zfs_grow_blocksize(zp, len, tx); VERIFY(0 == dmu_buf_hold(zp->z_zfsvfs->z_os, zp->z_id, 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)); dmu_buf_will_dirty(dbp, tx); ASSERT3U(len, <=, dbp->db_size); bcopy(link, dbp->db_data, len); dmu_buf_rele(dbp, FTAG); } } void zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; xoptattr_t *xoap; ASSERT(MUTEX_HELD(&zp->z_lock)); VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); if (zp->z_is_sa) { if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), &xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)) != 0) return; } else { dmu_object_info_t doi; dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); int len; if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP)) return; sa_object_info(zp->z_sa_hdl, &doi); len = sizeof (xoap->xoa_av_scanstamp) + ZFS_OLD_ZNODE_PHYS_SIZE; if (len <= doi.doi_bonus_size) { (void) memcpy(xoap->xoa_av_scanstamp, (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, sizeof (xoap->xoa_av_scanstamp)); } } XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } void zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; xoptattr_t *xoap; ASSERT(MUTEX_HELD(&zp->z_lock)); VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); if (zp->z_is_sa) VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), &xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp), tx)); else { dmu_object_info_t doi; dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); int len; sa_object_info(zp->z_sa_hdl, &doi); len = sizeof (xoap->xoa_av_scanstamp) + ZFS_OLD_ZNODE_PHYS_SIZE; if (len > doi.doi_bonus_size) VERIFY(dmu_set_bonus(db, len, tx) == 0); (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); zp->z_pflags |= ZFS_BONUS_SCANSTAMP; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), &zp->z_pflags, sizeof (uint64_t), tx)); } } /* * I'm not convinced we should do any of this upgrade. * since the SA code can read both old/new znode formats * with probably little to no performance difference. * * All new files will be created with the new format. */ void zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) { dmu_buf_t *db = sa_get_db(hdl); znode_t *zp = sa_get_userdata(hdl); zfsvfs_t *zfsvfs = zp->z_zfsvfs; sa_bulk_attr_t bulk[20]; int count = 0; sa_bulk_attr_t sa_attrs[20] = { 0 }; zfs_acl_locator_cb_t locate = { 0 }; uint64_t uid, gid, mode, rdev, xattr, parent; uint64_t crtime[2], mtime[2], ctime[2]; zfs_acl_phys_t znode_acl; char scanstamp[AV_SCANSTAMP_SZ]; boolean_t drop_lock = B_FALSE; /* * No upgrade if ACL isn't cached * since we won't know which locks are held * and ready the ACL would require special "locked" * interfaces that would be messy */ if (zp->z_acl_cached == NULL || ZTOV(zp)->v_type == VLNK) return; /* * If the z_lock is held and we aren't the owner * the just return since we don't want to deadlock * trying to update the status of z_is_sa. This * file can then be upgraded at a later time. * * Otherwise, we know we are doing the * sa_update() that caused us to enter this function. */ if (mutex_owner(&zp->z_lock) != curthread) { if (mutex_tryenter(&zp->z_lock) == 0) return; else drop_lock = B_TRUE; } /* First do a bulk query of the attributes that aren't cached */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &znode_acl, 88); if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) goto done; /* * While the order here doesn't matter its best to try and organize * it is such a way to pick up an already existing layout number */ count = 0; SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, zp->z_atime, 16); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, 8); if (zp->z_vnode->v_type == VBLK || zp->z_vnode->v_type == VCHR) SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &zp->z_acl_cached->z_acl_count, 8); if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) zfs_acl_xform(zp, zp->z_acl_cached, CRED()); locate.cb_aclp = zp->z_acl_cached; SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); if (xattr) SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); /* if scanstamp then add scanstamp */ if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, scanstamp, AV_SCANSTAMP_SZ); SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL, scanstamp, AV_SCANSTAMP_SZ); zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; } VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs, count, tx) == 0); if (znode_acl.z_acl_extern_obj) VERIFY(0 == dmu_object_free(zfsvfs->z_os, znode_acl.z_acl_extern_obj, tx)); zp->z_is_sa = B_TRUE; done: if (drop_lock) mutex_exit(&zp->z_lock); } void zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) { if (!zp->z_zfsvfs->z_use_sa || zp->z_is_sa) return; dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); if (zfs_external_acl(zp)) { dmu_tx_hold_free(tx, zfs_external_acl(zp), 0, DMU_OBJECT_END); } } #endif Index: head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c =================================================================== --- head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 286574) +++ head/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zil.c (revision 286575) @@ -1,2142 +1,2142 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include /* * The zfs intent log (ZIL) saves transaction records of system calls * that change the file system in memory with enough information * to be able to replay them. These are stored in memory until * either the DMU transaction group (txg) commits them to the stable pool * and they can be discarded, or they are flushed to the stable log * (also in the pool) due to a fsync, O_DSYNC or other synchronous * requirement. In the event of a panic or power fail then those log * records (transactions) are replayed. * * There is one ZIL per file system. Its on-disk (pool) format consists * of 3 parts: * * - ZIL header * - ZIL blocks * - ZIL records * * A log record holds a system call transaction. Log blocks can * hold many log records and the blocks are chained together. * Each ZIL block contains a block pointer (blkptr_t) to the next * ZIL block in the chain. The ZIL header points to the first * block in the chain. Note there is not a fixed place in the pool * to hold blocks. They are dynamically allocated and freed as * needed from the blocks available. Figure X shows the ZIL structure: */ /* * Disable intent logging replay. This global ZIL switch affects all pools. */ int zil_replay_disable = 0; SYSCTL_DECL(_vfs_zfs); SYSCTL_INT(_vfs_zfs, OID_AUTO, zil_replay_disable, CTLFLAG_RWTUN, &zil_replay_disable, 0, "Disable intent logging replay"); /* * Tunable parameter for debugging or performance analysis. Setting * zfs_nocacheflush will cause corruption on power loss if a volatile * out-of-order write cache is enabled. */ boolean_t zfs_nocacheflush = B_FALSE; SYSCTL_INT(_vfs_zfs, OID_AUTO, cache_flush_disable, CTLFLAG_RDTUN, &zfs_nocacheflush, 0, "Disable cache flush"); boolean_t zfs_trim_enabled = B_TRUE; SYSCTL_DECL(_vfs_zfs_trim); SYSCTL_INT(_vfs_zfs_trim, OID_AUTO, enabled, CTLFLAG_RDTUN, &zfs_trim_enabled, 0, "Enable ZFS TRIM"); static kmem_cache_t *zil_lwb_cache; static void zil_async_to_sync(zilog_t *zilog, uint64_t foid); #define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) /* * ziltest is by and large an ugly hack, but very useful in * checking replay without tedious work. * When running ziltest we want to keep all itx's and so maintain * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG * We subtract TXG_CONCURRENT_STATES to allow for common code. */ #define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES) static int zil_bp_compare(const void *x1, const void *x2) { const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) return (-1); if (DVA_GET_VDEV(dva1) > DVA_GET_VDEV(dva2)) return (1); if (DVA_GET_OFFSET(dva1) < DVA_GET_OFFSET(dva2)) return (-1); if (DVA_GET_OFFSET(dva1) > DVA_GET_OFFSET(dva2)) return (1); return (0); } static void zil_bp_tree_init(zilog_t *zilog) { avl_create(&zilog->zl_bp_tree, zil_bp_compare, sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void zil_bp_tree_fini(zilog_t *zilog) { avl_tree_t *t = &zilog->zl_bp_tree; zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_bp_tree; const dva_t *dva; zil_bp_node_t *zn; avl_index_t where; if (BP_IS_EMBEDDED(bp)) return (0); dva = BP_IDENTITY(bp); if (avl_find(t, dva, &where) != NULL) return (SET_ERROR(EEXIST)); zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); return (0); } static zil_header_t * zil_header_in_syncing_context(zilog_t *zilog) { return ((zil_header_t *)zilog->zl_header); } static void zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) { zio_cksum_t *zc = &bp->blk_cksum; zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); zc->zc_word[ZIL_ZC_SEQ] = 1ULL; } /* * Read a log block and make sure it's valid. */ static int zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, char **end) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) zio_flags |= ZIO_FLAG_SPECULATIVE; SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { zio_cksum_t cksum = bp->blk_cksum; /* * Validate the checksummed log block. * * Sequence numbers should be... sequential. The checksum * verifier for the next block should be bp's checksum plus 1. * * Also check the log chain linkage and size used. */ cksum.zc_word[ZIL_ZC_SEQ]++; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { zil_chain_t *zilc = abuf->b_data; char *lr = (char *)(zilc + 1); uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { error = SET_ERROR(ECKSUM); } else { ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); bcopy(lr, dst, len); *end = (char *)dst + len; *nbp = zilc->zc_next_blk; } } else { char *lr = abuf->b_data; uint64_t size = BP_GET_LSIZE(bp); zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || (zilc->zc_nused > (size - sizeof (*zilc)))) { error = SET_ERROR(ECKSUM); } else { ASSERT3U(zilc->zc_nused, <=, SPA_OLD_MAXBLOCKSIZE); bcopy(lr, dst, zilc->zc_nused); *end = (char *)dst + zilc->zc_nused; *nbp = zilc->zc_next_blk; } } VERIFY(arc_buf_remove_ref(abuf, &abuf)); } return (error); } /* * Read a TX_WRITE log data block. */ static int zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) { enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; const blkptr_t *bp = &lr->lr_blkptr; arc_flags_t aflags = ARC_FLAG_WAIT; arc_buf_t *abuf = NULL; zbookmark_phys_t zb; int error; if (BP_IS_HOLE(bp)) { if (wbuf != NULL) bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); return (0); } if (zilog->zl_header->zh_claim_txg == 0) zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { if (wbuf != NULL) bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); (void) arc_buf_remove_ref(abuf, &abuf); } return (error); } /* * Parse the intent log, and call parse_func for each valid record within. */ int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) { const zil_header_t *zh = zilog->zl_header; boolean_t claimed = !!zh->zh_claim_txg; uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; uint64_t max_blk_seq = 0; uint64_t max_lr_seq = 0; uint64_t blk_count = 0; uint64_t lr_count = 0; blkptr_t blk, next_blk; char *lrbuf, *lrp; int error = 0; /* * Old logs didn't record the maximum zh_claim_lr_seq. */ if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. * For each block in the chain we strongly check that block to * ensure its validity. We stop when an invalid block is found. * For each block pointer in the chain we call parse_blk_func(). * For each record in each valid block we call parse_lr_func(). * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); zil_bp_tree_init(zilog); for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; int reclen; char *end; if (blk_seq > claim_blk_seq) break; if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) break; ASSERT3U(max_blk_seq, <, blk_seq); max_blk_seq = blk_seq; blk_count++; if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) break; error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); if (error != 0) break; for (lrp = lrbuf; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); if (lr->lrc_seq > claim_lr_seq) goto done; if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) goto done; ASSERT3U(max_lr_seq, <, lr->lrc_seq); max_lr_seq = lr->lrc_seq; lr_count++; } } done: zilog->zl_parse_error = error; zilog->zl_parse_blk_seq = max_blk_seq; zilog->zl_parse_lr_seq = max_lr_seq; zilog->zl_parse_blk_count = blk_count; zilog->zl_parse_lr_count = lr_count; ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); zil_bp_tree_fini(zilog); zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); return (error); } static int zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) { /* * Claim log block if not already committed and not already claimed. * If tx == NULL, just verify that the block is claimable. */ if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) return (0); return (zio_wait(zio_claim(NULL, zilog->zl_spa, tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } static int zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) { lr_write_t *lr = (lr_write_t *)lrc; int error; if (lrc->lrc_txtype != TX_WRITE) return (0); /* * If the block is not readable, don't claim it. This can happen * in normal operation when a log block is written to disk before * some of the dmu_sync() blocks it points to. In this case, the * transaction cannot have been committed to anyone (we would have * waited for all writes to be stable first), so it is semantically * correct to declare this the end of the log. */ if (lr->lr_blkptr.blk_birth >= first_txg && (error = zil_read_log_data(zilog, lr, NULL)) != 0) return (error); return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } /* ARGSUSED */ static int zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) { zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static int zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) { lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; /* * If we previously claimed it, we need to free it. */ if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && !BP_IS_HOLE(bp)) zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); return (0); } static lwb_t * zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg) { lwb_t *lwb; lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); lwb->lwb_zilog = zilog; lwb->lwb_blk = *bp; lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); lwb->lwb_max_txg = txg; lwb->lwb_zio = NULL; lwb->lwb_tx = NULL; if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { lwb->lwb_nused = sizeof (zil_chain_t); lwb->lwb_sz = BP_GET_LSIZE(bp); } else { lwb->lwb_nused = 0; lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); } mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_lwb_list, lwb); mutex_exit(&zilog->zl_lock); return (lwb); } /* * Called when we create in-memory log transactions so that we know * to cleanup the itxs at the end of spa_sync(). */ void zilog_dirty(zilog_t *zilog, uint64_t txg) { dsl_pool_t *dp = zilog->zl_dmu_pool; dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); - if (dsl_dataset_is_snapshot(ds)) + if (ds->ds_is_snapshot) panic("dirtying snapshot!"); if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { /* up the hold count until we can be written out */ dmu_buf_add_ref(ds->ds_dbuf, zilog); } } boolean_t zilog_is_dirty(zilog_t *zilog) { dsl_pool_t *dp = zilog->zl_dmu_pool; for (int t = 0; t < TXG_SIZE; t++) { if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) return (B_TRUE); } return (B_FALSE); } /* * Create an on-disk intent log. */ static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; int error = 0; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); ASSERT(zh->zh_claim_txg == 0); ASSERT(zh->zh_replay_seq == 0); blk = zh->zh_log; /* * Allocate an initial log block if: * - there isn't one already * - the existing block is the wrong endianess */ if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { tx = dmu_tx_create(zilog->zl_os); VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); if (!BP_IS_HOLE(&blk)) { zio_free_zil(zilog->zl_spa, txg, &blk); BP_ZERO(&blk); } error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL, ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); if (error == 0) zil_init_log_chain(zilog, &blk); } /* * Allocate a log write buffer (lwb) for the first log block. */ if (error == 0) lwb = zil_alloc_lwb(zilog, &blk, txg); /* * If we just allocated the first log block, commit our transaction * and wait for zil_sync() to stuff the block poiner into zh_log. * (zh is part of the MOS, so we cannot modify it in open context.) */ if (tx != NULL) { dmu_tx_commit(tx); txg_wait_synced(zilog->zl_dmu_pool, txg); } ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); return (lwb); } /* * In one tx, free all log blocks and clear the log header. * If keep_first is set, then we're replaying a log with no content. * We want to keep the first block, however, so that the first * synchronous transaction doesn't require a txg_wait_synced() * in zil_create(). We don't need to txg_wait_synced() here either * when keep_first is set, because both zil_create() and zil_destroy() * will wait for any in-progress destroys to complete. */ void zil_destroy(zilog_t *zilog, boolean_t keep_first) { const zil_header_t *zh = zilog->zl_header; lwb_t *lwb; dmu_tx_t *tx; uint64_t txg; /* * Wait for any previous destroy to complete. */ txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_old_header = *zh; /* debugging aid */ if (BP_IS_HOLE(&zh->zh_log)) return; tx = dmu_tx_create(zilog->zl_os); VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); mutex_enter(&zilog->zl_lock); ASSERT3U(zilog->zl_destroy_txg, <, txg); zilog->zl_destroy_txg = txg; zilog->zl_keep_first = keep_first; if (!list_is_empty(&zilog->zl_lwb_list)) { ASSERT(zh->zh_claim_txg == 0); VERIFY(!keep_first); while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { list_remove(&zilog->zl_lwb_list, lwb); if (lwb->lwb_buf != NULL) zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk); kmem_cache_free(zil_lwb_cache, lwb); } } else if (!keep_first) { zil_destroy_sync(zilog, tx); } mutex_exit(&zilog->zl_lock); dmu_tx_commit(tx); } void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) { ASSERT(list_is_empty(&zilog->zl_lwb_list)); (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, tx, zilog->zl_header->zh_claim_txg); } int zil_claim(const char *osname, void *txarg) { dmu_tx_t *tx = txarg; uint64_t first_txg = dmu_tx_get_txg(tx); zilog_t *zilog; zil_header_t *zh; objset_t *os; int error; error = dmu_objset_own(osname, DMU_OST_ANY, B_FALSE, FTAG, &os); if (error != 0) { /* * EBUSY indicates that the objset is inconsistent, in which * case it can not have a ZIL. */ if (error != EBUSY) { cmn_err(CE_WARN, "can't open objset for %s, error %u", osname, error); } return (0); } zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) { if (!BP_IS_HOLE(&zh->zh_log)) zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); BP_ZERO(&zh->zh_log); dsl_dataset_dirty(dmu_objset_ds(os), tx); dmu_objset_disown(os, FTAG); return (0); } /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can * read only part of the log now (e.g. due to a missing device), * but we can read the entire log later, we will not try to replay * or destroy beyond the last block we successfully claimed. */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg); zh->zh_claim_txg = first_txg; zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) zh->zh_flags |= ZIL_REPLAY_NEEDED; zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); dmu_objset_disown(os, FTAG); return (0); } /* * Check the log by walking the log chain. * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. */ int zil_check_log_chain(const char *osname, void *tx) { zilog_t *zilog; objset_t *os; blkptr_t *bp; int error; ASSERT(tx == NULL); error = dmu_objset_hold(osname, FTAG, &os); if (error != 0) { cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } zilog = dmu_objset_zil(os); bp = (blkptr_t *)&zilog->zl_header->zh_log; /* * Check the first block and determine if it's on a log device * which may have been removed or faulted prior to loading this * pool. If so, there's no point in checking the rest of the log * as its content should have already been synced to the pool. */ if (!BP_IS_HOLE(bp)) { vdev_t *vd; boolean_t valid = B_TRUE; spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); if (vd->vdev_islog && vdev_is_dead(vd)) valid = vdev_log_state_valid(vd); spa_config_exit(os->os_spa, SCL_STATE, FTAG); if (!valid) { dmu_objset_rele(os, FTAG); return (0); } } /* * Because tx == NULL, zil_claim_log_block() will not actually claim * any blocks, but just determine whether it is possible to do so. * In addition to checking the log chain, zil_claim_log_block() * will invoke zio_claim() with a done func of spa_claim_notify(), * which will update spa_max_claim_txg. See spa_load() for details. */ error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa)); dmu_objset_rele(os, FTAG); return ((error == ECKSUM || error == ENOENT) ? 0 : error); } static int zil_vdev_compare(const void *x1, const void *x2) { const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; if (v1 < v2) return (-1); if (v1 > v2) return (1); return (0); } void zil_add_block(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_vdev_tree; avl_index_t where; zil_vdev_node_t *zv, zvsearch; int ndvas = BP_GET_NDVAS(bp); int i; if (zfs_nocacheflush) return; ASSERT(zilog->zl_writer); /* * Even though we're zl_writer, we still need a lock because the * zl_get_data() callbacks may have dmu_sync() done callbacks * that will run concurrently. */ mutex_enter(&zilog->zl_vdev_lock); for (i = 0; i < ndvas; i++) { zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); if (avl_find(t, &zvsearch, &where) == NULL) { zv = kmem_alloc(sizeof (*zv), KM_SLEEP); zv->zv_vdev = zvsearch.zv_vdev; avl_insert(t, zv, where); } } mutex_exit(&zilog->zl_vdev_lock); } static void zil_flush_vdevs(zilog_t *zilog) { spa_t *spa = zilog->zl_spa; avl_tree_t *t = &zilog->zl_vdev_tree; void *cookie = NULL; zil_vdev_node_t *zv; zio_t *zio; ASSERT(zilog->zl_writer); /* * We don't need zl_vdev_lock here because we're the zl_writer, * and all zl_get_data() callbacks are done. */ if (avl_numnodes(t) == 0) return; spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); if (vd != NULL) zio_flush(zio, vd); kmem_free(zv, sizeof (*zv)); } /* * Wait for all the flushes to complete. Not all devices actually * support the DKIOCFLUSHWRITECACHE ioctl, so it's OK if it fails. */ (void) zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); } /* * Function called when a log block write completes */ static void zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; dmu_tx_t *tx = lwb->lwb_tx; ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); ASSERT(!BP_IS_GANG(zio->io_bp)); ASSERT(!BP_IS_HOLE(zio->io_bp)); ASSERT(BP_GET_FILL(zio->io_bp) == 0); /* * Ensure the lwb buffer pointer is cleared before releasing * the txg. If we have had an allocation failure and * the txg is waiting to sync then we want want zil_sync() * to remove the lwb so that it's not picked up as the next new * one in zil_commit_writer(). zil_sync() will only remove * the lwb if lwb_buf is null. */ zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_buf = NULL; lwb->lwb_tx = NULL; mutex_exit(&zilog->zl_lock); /* * Now that we've written this log block, we have a stable pointer * to the next block in the chain, so it's OK to let the txg in * which we allocated the next block sync. */ dmu_tx_commit(tx); } /* * Initialize the io for a log block. */ static void zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) { zbookmark_phys_t zb; SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); if (zilog->zl_root_zio == NULL) { zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, ZIO_FLAG_CANFAIL); } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); } } /* * Define a limited set of intent log block sizes. * * These must be a multiple of 4KB. Note only the amount used (again * aligned to 4KB) actually gets written. However, we can't always just * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. */ uint64_t zil_block_buckets[] = { 4096, /* non TX_WRITE */ 8192+4096, /* data base */ 32*1024 + 4096, /* NFS writes */ UINT64_MAX }; /* * Use the slog as long as the logbias is 'latency' and the current commit size * is less than the limit or the total list size is less than 2X the limit. * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX. */ uint64_t zil_slog_limit = 1024 * 1024; #define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \ (((zilog)->zl_cur_used < zil_slog_limit) || \ ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))) /* * Start a log block write and advance to the next log block. * Calls are serialized. */ static lwb_t * zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) { lwb_t *nlwb = NULL; zil_chain_t *zilc; spa_t *spa = zilog->zl_spa; blkptr_t *bp; dmu_tx_t *tx; uint64_t txg; uint64_t zil_blksz, wsz; int i, error; if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { zilc = (zil_chain_t *)lwb->lwb_buf; bp = &zilc->zc_next_blk; } else { zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); bp = &zilc->zc_next_blk; } ASSERT(lwb->lwb_nused <= lwb->lwb_sz); /* * Allocate the next block and save its address in this block * before writing it in order to establish the log chain. * Note that if the allocation of nlwb synced before we wrote * the block that points at it (lwb), we'd leak it if we crashed. * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). * We dirty the dataset to ensure that zil_sync() will be called * to clean up in the event of allocation failure or I/O failure. */ tx = dmu_tx_create(zilog->zl_os); VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); lwb->lwb_tx = tx; /* * Log blocks are pre-allocated. Here we select the size of the next * block, based on size used in the last block. * - first find the smallest bucket that will fit the block from a * limited set of block sizes. This is because it's faster to write * blocks allocated from the same metaslab as they are adjacent or * close. * - next find the maximum from the new suggested size and an array of * previous sizes. This lessens a picket fence effect of wrongly * guesssing the size if we have a stream of say 2k, 64k, 2k, 64k * requests. * * Note we only write what is used, but we can't just allocate * the maximum block size because we can exhaust the available * pool log space. */ zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); for (i = 0; zil_blksz > zil_block_buckets[i]; i++) continue; zil_blksz = zil_block_buckets[i]; if (zil_blksz == UINT64_MAX) zil_blksz = SPA_OLD_MAXBLOCKSIZE; zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; for (i = 0; i < ZIL_PREV_BLKS; i++) zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); BP_ZERO(bp); /* pass the old blkptr in order to spread log blocks across devs */ error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, USE_SLOG(zilog)); if (error == 0) { ASSERT3U(bp->blk_birth, ==, txg); bp->blk_cksum = lwb->lwb_blk.blk_cksum; bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; /* * Allocate a new log write buffer (lwb). */ nlwb = zil_alloc_lwb(zilog, bp, txg); /* Record the block for later vdev flushing */ zil_add_block(zilog, &lwb->lwb_blk); } if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { /* For Slim ZIL only write what is used. */ wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); ASSERT3U(wsz, <=, lwb->lwb_sz); zio_shrink(lwb->lwb_zio, wsz); } else { wsz = lwb->lwb_sz; } zilc->zc_pad = 0; zilc->zc_nused = lwb->lwb_nused; zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; /* * clear unused data for security */ bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */ /* * If there was an allocation failure then nlwb will be null which * forces a txg_wait_synced(). */ return (nlwb); } static lwb_t * zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) { lr_t *lrc = &itx->itx_lr; /* common log record */ lr_write_t *lrw = (lr_write_t *)lrc; char *lr_buf; uint64_t txg = lrc->lrc_txg; uint64_t reclen = lrc->lrc_reclen; uint64_t dlen = 0; if (lwb == NULL) return (NULL); ASSERT(lwb->lwb_buf != NULL); ASSERT(zilog_is_dirty(zilog) || spa_freeze_txg(zilog->zl_spa) != UINT64_MAX); if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); zilog->zl_cur_used += (reclen + dlen); zil_lwb_write_init(zilog, lwb); /* * If this record won't fit in the current log block, start a new one. */ if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { lwb = zil_lwb_write_start(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_init(zilog, lwb); ASSERT(LWB_EMPTY(lwb)); if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { txg_wait_synced(zilog->zl_dmu_pool, txg); return (lwb); } } lr_buf = lwb->lwb_buf + lwb->lwb_nused; bcopy(lrc, lr_buf, reclen); lrc = (lr_t *)lr_buf; lrw = (lr_write_t *)lrc; /* * If it's a write, fetch the data or get its blkptr as appropriate. */ if (lrc->lrc_txtype == TX_WRITE) { if (txg > spa_freeze_txg(zilog->zl_spa)) txg_wait_synced(zilog->zl_dmu_pool, txg); if (itx->itx_wr_state != WR_COPIED) { char *dbuf; int error; if (dlen) { ASSERT(itx->itx_wr_state == WR_NEED_COPY); dbuf = lr_buf + reclen; lrw->lr_common.lrc_reclen += dlen; } else { ASSERT(itx->itx_wr_state == WR_INDIRECT); dbuf = NULL; } error = zilog->zl_get_data( itx->itx_private, lrw, dbuf, lwb->lwb_zio); if (error == EIO) { txg_wait_synced(zilog->zl_dmu_pool, txg); return (lwb); } if (error != 0) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); return (lwb); } } } /* * We're actually making an entry, so update lrc_seq to be the * log record sequence number. Note that this is generally not * equal to the itx sequence number because not all transactions * are synchronous, and sometimes spa_sync() gets there first. */ lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ lwb->lwb_nused += reclen + dlen; lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); return (lwb); } itx_t * zil_itx_create(uint64_t txtype, size_t lrsize) { itx_t *itx; lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); itx = kmem_alloc(offsetof(itx_t, itx_lr) + lrsize, KM_SLEEP); itx->itx_lr.lrc_txtype = txtype; itx->itx_lr.lrc_reclen = lrsize; itx->itx_sod = lrsize; /* if write & WR_NEED_COPY will be increased */ itx->itx_lr.lrc_seq = 0; /* defensive */ itx->itx_sync = B_TRUE; /* default is synchronous */ return (itx); } void zil_itx_destroy(itx_t *itx) { kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } /* * Free up the sync and async itxs. The itxs_t has already been detached * so no locks are needed. */ static void zil_itxg_clean(itxs_t *itxs) { itx_t *itx; list_t *list; avl_tree_t *t; void *cookie; itx_async_node_t *ian; list = &itxs->i_sync_list; while ((itx = list_head(list)) != NULL) { list_remove(list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } cookie = NULL; t = &itxs->i_async_tree; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list = &ian->ia_list; while ((itx = list_head(list)) != NULL) { list_remove(list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } list_destroy(list); kmem_free(ian, sizeof (itx_async_node_t)); } avl_destroy(t); kmem_free(itxs, sizeof (itxs_t)); } static int zil_aitx_compare(const void *x1, const void *x2) { const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; if (o1 < o2) return (-1); if (o1 > o2) return (1); return (0); } /* * Remove all async itx with the given oid. */ static void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; list_t clean_list; itx_t *itx; ASSERT(oid != 0); list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; ian = avl_find(t, &oid, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); } while ((itx = list_head(&clean_list)) != NULL) { list_remove(&clean_list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } list_destroy(&clean_list); } void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t txg; itxg_t *itxg; itxs_t *itxs, *clean = NULL; /* * Object ids can be re-instantiated in the next txg so * remove any async transactions to avoid future leaks. * This can happen if a fsync occurs on the re-instantiated * object for a WR_INDIRECT or WR_NEED_COPY write, which gets * the new file data and flushes a write record for the old object. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_REMOVE) zil_remove_async(zilog, itx->itx_oid); /* * Ensure the data of a renamed file is committed before the rename. */ if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) zil_async_to_sync(zilog, itx->itx_oid); if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) txg = ZILTEST_TXG; else txg = dmu_tx_get_txg(tx); itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); itxs = itxg->itxg_itxs; if (itxg->itxg_txg != txg) { if (itxs != NULL) { /* * The zil_clean callback hasn't got around to cleaning * this itxg. Save the itxs for release below. * This should be rare. */ atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); itxg->itxg_sod = 0; clean = itxg->itxg_itxs; } ASSERT(itxg->itxg_sod == 0); itxg->itxg_txg = txg; itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), KM_SLEEP); list_create(&itxs->i_sync_list, sizeof (itx_t), offsetof(itx_t, itx_node)); avl_create(&itxs->i_async_tree, zil_aitx_compare, sizeof (itx_async_node_t), offsetof(itx_async_node_t, ia_node)); } if (itx->itx_sync) { list_insert_tail(&itxs->i_sync_list, itx); atomic_add_64(&zilog->zl_itx_list_sz, itx->itx_sod); itxg->itxg_sod += itx->itx_sod; } else { avl_tree_t *t = &itxs->i_async_tree; uint64_t foid = ((lr_ooo_t *)&itx->itx_lr)->lr_foid; itx_async_node_t *ian; avl_index_t where; ian = avl_find(t, &foid, &where); if (ian == NULL) { ian = kmem_alloc(sizeof (itx_async_node_t), KM_SLEEP); list_create(&ian->ia_list, sizeof (itx_t), offsetof(itx_t, itx_node)); ian->ia_foid = foid; avl_insert(t, ian, where); } list_insert_tail(&ian->ia_list, itx); } itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); zilog_dirty(zilog, txg); mutex_exit(&itxg->itxg_lock); /* Release the old itxs now we've dropped the lock */ if (clean != NULL) zil_itxg_clean(clean); } /* * If there are any in-memory intent log transactions which have now been * synced then start up a taskq to free them. We should only do this after we * have written out the uberblocks (i.e. txg has been comitted) so that * don't inadvertently clean out in-memory log records that would be required * by zil_commit(). */ void zil_clean(zilog_t *zilog, uint64_t synced_txg) { itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK]; itxs_t *clean_me; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) { mutex_exit(&itxg->itxg_lock); return; } ASSERT3U(itxg->itxg_txg, <=, synced_txg); ASSERT(itxg->itxg_txg != 0); ASSERT(zilog->zl_clean_taskq != NULL); atomic_add_64(&zilog->zl_itx_list_sz, -itxg->itxg_sod); itxg->itxg_sod = 0; clean_me = itxg->itxg_itxs; itxg->itxg_itxs = NULL; itxg->itxg_txg = 0; mutex_exit(&itxg->itxg_lock); /* * Preferably start a task queue to free up the old itxs but * if taskq_dispatch can't allocate resources to do that then * free it in-line. This should be rare. Note, using TQ_SLEEP * created a bad performance problem. */ if (taskq_dispatch(zilog->zl_clean_taskq, (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP) == 0) zil_itxg_clean(clean_me); } /* * Get the list of itxs to commit into zl_itx_commit_list. */ static void zil_get_commit_list(zilog_t *zilog) { uint64_t otxg, txg; list_t *commit_list = &zilog->zl_itx_commit_list; uint64_t push_sod = 0; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list); push_sod += itxg->itxg_sod; itxg->itxg_sod = 0; mutex_exit(&itxg->itxg_lock); } atomic_add_64(&zilog->zl_itx_list_sz, -push_sod); } /* * Move the async itxs for a specified object to commit into sync lists. */ static void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; itx_async_node_t *ian; avl_tree_t *t; avl_index_t where; if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ otxg = ZILTEST_TXG; else otxg = spa_last_synced_txg(zilog->zl_spa) + 1; for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; mutex_enter(&itxg->itxg_lock); if (itxg->itxg_txg != txg) { mutex_exit(&itxg->itxg_lock); continue; } /* * If a foid is specified then find that node and append its * list. Otherwise walk the tree appending all the lists * to the sync list. We add to the end rather than the * beginning to ensure the create has happened. */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { ian = avl_find(t, &foid, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); } } else { void *cookie = NULL; while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); list_destroy(&ian->ia_list); kmem_free(ian, sizeof (itx_async_node_t)); } } mutex_exit(&itxg->itxg_lock); } } static void zil_commit_writer(zilog_t *zilog) { uint64_t txg; itx_t *itx; lwb_t *lwb; spa_t *spa = zilog->zl_spa; int error = 0; ASSERT(zilog->zl_root_zio == NULL); mutex_exit(&zilog->zl_lock); zil_get_commit_list(zilog); /* * Return if there's nothing to commit before we dirty the fs by * calling zil_create(). */ if (list_head(&zilog->zl_itx_commit_list) == NULL) { mutex_enter(&zilog->zl_lock); return; } if (zilog->zl_suspend) { lwb = NULL; } else { lwb = list_tail(&zilog->zl_lwb_list); if (lwb == NULL) lwb = zil_create(zilog); } DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); while (itx = list_head(&zilog->zl_itx_commit_list)) { txg = itx->itx_lr.lrc_txg; ASSERT(txg); if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) lwb = zil_lwb_commit(zilog, itx, lwb); list_remove(&zilog->zl_itx_commit_list, itx); kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); } DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); /* write the last block out */ if (lwb != NULL && lwb->lwb_zio != NULL) lwb = zil_lwb_write_start(zilog, lwb); zilog->zl_cur_used = 0; /* * Wait if necessary for the log blocks to be on stable storage. */ if (zilog->zl_root_zio) { error = zio_wait(zilog->zl_root_zio); zilog->zl_root_zio = NULL; zil_flush_vdevs(zilog); } if (error || lwb == NULL) txg_wait_synced(zilog->zl_dmu_pool, 0); mutex_enter(&zilog->zl_lock); /* * Remember the highest committed log sequence number for ztest. * We only update this value when all the log writes succeeded, * because ztest wants to ASSERT that it got the whole log chain. */ if (error == 0 && lwb != NULL) zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } /* * Commit zfs transactions to stable storage. * If foid is 0 push out all transactions, otherwise push only those * for that object or might reference that object. * * itxs are committed in batches. In a heavily stressed zil there will be * a commit writer thread who is writing out a bunch of itxs to the log * for a set of committing threads (cthreads) in the same batch as the writer. * Those cthreads are all waiting on the same cv for that batch. * * There will also be a different and growing batch of threads that are * waiting to commit (qthreads). When the committing batch completes * a transition occurs such that the cthreads exit and the qthreads become * cthreads. One of the new cthreads becomes the writer thread for the * batch. Any new threads arriving become new qthreads. * * Only 2 condition variables are needed and there's no transition * between the two cvs needed. They just flip-flop between qthreads * and cthreads. * * Using this scheme we can efficiently wakeup up only those threads * that have been committed. */ void zil_commit(zilog_t *zilog, uint64_t foid) { uint64_t mybatch; if (zilog->zl_sync == ZFS_SYNC_DISABLED) return; /* move the async itxs for the foid to the sync queues */ zil_async_to_sync(zilog, foid); mutex_enter(&zilog->zl_lock); mybatch = zilog->zl_next_batch; while (zilog->zl_writer) { cv_wait(&zilog->zl_cv_batch[mybatch & 1], &zilog->zl_lock); if (mybatch <= zilog->zl_com_batch) { mutex_exit(&zilog->zl_lock); return; } } zilog->zl_next_batch++; zilog->zl_writer = B_TRUE; zil_commit_writer(zilog); zilog->zl_com_batch = mybatch; zilog->zl_writer = B_FALSE; mutex_exit(&zilog->zl_lock); /* wake up one thread to become the next writer */ cv_signal(&zilog->zl_cv_batch[(mybatch+1) & 1]); /* wake up all threads waiting for this batch to be committed */ cv_broadcast(&zilog->zl_cv_batch[mybatch & 1]); } /* * Called in syncing context to free committed log blocks and update log header. */ void zil_sync(zilog_t *zilog, dmu_tx_t *tx) { zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; /* * We don't zero out zl_destroy_txg, so make sure we don't try * to destroy it twice. */ if (spa_sync_pass(spa) != 1) return; mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); if (*replayed_seq != 0) { ASSERT(zh->zh_replay_seq < *replayed_seq); zh->zh_replay_seq = *replayed_seq; *replayed_seq = 0; } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; ASSERT(list_head(&zilog->zl_lwb_list) == NULL); bzero(zh, sizeof (zil_header_t)); bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* * If this block was part of log chain that couldn't * be claimed because a device was missing during * zil_claim(), but that device later returns, * then this block could erroneously appear valid. * To guard against this, assign a new GUID to the new * log chain so it doesn't matter what blk points to. */ zil_init_log_chain(zilog, &blk); zh->zh_log = blk; } } while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); zio_free_zil(spa, txg, &lwb->lwb_blk); kmem_cache_free(zil_lwb_cache, lwb); /* * If we don't have anything left in the lwb list then * we've had an allocation failure and we need to zero * out the zil_header blkptr so that we don't end * up freeing the same block twice. */ if (list_head(&zilog->zl_lwb_list) == NULL) BP_ZERO(&zh->zh_log); } mutex_exit(&zilog->zl_lock); } void zil_init(void) { zil_lwb_cache = kmem_cache_create("zil_lwb_cache", sizeof (struct lwb), 0, NULL, NULL, NULL, NULL, NULL, 0); } void zil_fini(void) { kmem_cache_destroy(zil_lwb_cache); } void zil_set_sync(zilog_t *zilog, uint64_t sync) { zilog->zl_sync = sync; } void zil_set_logbias(zilog_t *zilog, uint64_t logbias) { zilog->zl_logbias = logbias; } zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { zilog_t *zilog; zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); zilog->zl_header = zh_phys; zilog->zl_os = os; zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; zilog->zl_logbias = dmu_objset_logbias(os); zilog->zl_sync = dmu_objset_syncprop(os); zilog->zl_next_batch = 1; mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); for (int i = 0; i < TXG_SIZE; i++) { mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, MUTEX_DEFAULT, NULL); } list_create(&zilog->zl_lwb_list, sizeof (lwb_t), offsetof(lwb_t, lwb_node)); list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), offsetof(itx_t, itx_node)); mutex_init(&zilog->zl_vdev_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zilog->zl_vdev_tree, zil_vdev_compare, sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); cv_init(&zilog->zl_cv_writer, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_cv_batch[0], NULL, CV_DEFAULT, NULL); cv_init(&zilog->zl_cv_batch[1], NULL, CV_DEFAULT, NULL); return (zilog); } void zil_free(zilog_t *zilog) { zilog->zl_stop_sync = 1; ASSERT0(zilog->zl_suspend); ASSERT0(zilog->zl_suspending); ASSERT(list_is_empty(&zilog->zl_lwb_list)); list_destroy(&zilog->zl_lwb_list); avl_destroy(&zilog->zl_vdev_tree); mutex_destroy(&zilog->zl_vdev_lock); ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); list_destroy(&zilog->zl_itx_commit_list); for (int i = 0; i < TXG_SIZE; i++) { /* * It's possible for an itx to be generated that doesn't dirty * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() * callback to remove the entry. We remove those here. * * Also free up the ziltest itxs. */ if (zilog->zl_itxg[i].itxg_itxs) zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); mutex_destroy(&zilog->zl_itxg[i].itxg_lock); } mutex_destroy(&zilog->zl_lock); cv_destroy(&zilog->zl_cv_writer); cv_destroy(&zilog->zl_cv_suspend); cv_destroy(&zilog->zl_cv_batch[0]); cv_destroy(&zilog->zl_cv_batch[1]); kmem_free(zilog, sizeof (zilog_t)); } /* * Open an intent log. */ zilog_t * zil_open(objset_t *os, zil_get_data_t *get_data) { zilog_t *zilog = dmu_objset_zil(os); ASSERT(zilog->zl_clean_taskq == NULL); ASSERT(zilog->zl_get_data == NULL); ASSERT(list_is_empty(&zilog->zl_lwb_list)); zilog->zl_get_data = get_data; zilog->zl_clean_taskq = taskq_create("zil_clean", 1, minclsyspri, 2, 2, TASKQ_PREPOPULATE); return (zilog); } /* * Close an intent log. */ void zil_close(zilog_t *zilog) { lwb_t *lwb; uint64_t txg = 0; zil_commit(zilog, 0); /* commit all itx */ /* * The lwb_max_txg for the stubby lwb will reflect the last activity * for the zil. After a txg_wait_synced() on the txg we know all the * callbacks have occurred that may clean the zil. Only then can we * destroy the zl_clean_taskq. */ mutex_enter(&zilog->zl_lock); lwb = list_tail(&zilog->zl_lwb_list); if (lwb != NULL) txg = lwb->lwb_max_txg; mutex_exit(&zilog->zl_lock); if (txg) txg_wait_synced(zilog->zl_dmu_pool, txg); ASSERT(!zilog_is_dirty(zilog)); taskq_destroy(zilog->zl_clean_taskq); zilog->zl_clean_taskq = NULL; zilog->zl_get_data = NULL; /* * We should have only one LWB left on the list; remove it now. */ mutex_enter(&zilog->zl_lock); lwb = list_head(&zilog->zl_lwb_list); if (lwb != NULL) { ASSERT(lwb == list_tail(&zilog->zl_lwb_list)); list_remove(&zilog->zl_lwb_list, lwb); zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); kmem_cache_free(zil_lwb_cache, lwb); } mutex_exit(&zilog->zl_lock); } static char *suspend_tag = "zil suspending"; /* * Suspend an intent log. While in suspended mode, we still honor * synchronous semantics, but we rely on txg_wait_synced() to do it. * On old version pools, we suspend the log briefly when taking a * snapshot so that it will have an empty intent log. * * Long holds are not really intended to be used the way we do here -- * held for such a short time. A concurrent caller of dsl_dataset_long_held() * could fail. Therefore we take pains to only put a long hold if it is * actually necessary. Fortunately, it will only be necessary if the * objset is currently mounted (or the ZVOL equivalent). In that case it * will already have a long hold, so we are not really making things any worse. * * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or * zvol_state_t), and use their mechanism to prevent their hold from being * dropped (e.g. VFS_HOLD()). However, that would be even more pain for * very little gain. * * if cookiep == NULL, this does both the suspend & resume. * Otherwise, it returns with the dataset "long held", and the cookie * should be passed into zil_resume(). */ int zil_suspend(const char *osname, void **cookiep) { objset_t *os; zilog_t *zilog; const zil_header_t *zh; int error; error = dmu_objset_hold(osname, suspend_tag, &os); if (error != 0) return (error); zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); zh = zilog->zl_header; if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (SET_ERROR(EBUSY)); } /* * Don't put a long hold in the cases where we can avoid it. This * is when there is no cookie so we are doing a suspend & resume * (i.e. called from zil_vdev_offline()), and there's nothing to do * for the suspend because it's already suspended, or there's no ZIL. */ if (cookiep == NULL && !zilog->zl_suspending && (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { mutex_exit(&zilog->zl_lock); dmu_objset_rele(os, suspend_tag); return (0); } dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); dsl_pool_rele(dmu_objset_pool(os), suspend_tag); zilog->zl_suspend++; if (zilog->zl_suspend > 1) { /* * Someone else is already suspending it. * Just wait for them to finish. */ while (zilog->zl_suspending) cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } /* * If there is no pointer to an on-disk block, this ZIL must not * be active (e.g. filesystem not mounted), so there's nothing * to clean up. */ if (BP_IS_HOLE(&zh->zh_log)) { ASSERT(cookiep != NULL); /* fast path already handled */ *cookiep = os; mutex_exit(&zilog->zl_lock); return (0); } zilog->zl_suspending = B_TRUE; mutex_exit(&zilog->zl_lock); zil_commit(zilog, 0); zil_destroy(zilog, B_FALSE); mutex_enter(&zilog->zl_lock); zilog->zl_suspending = B_FALSE; cv_broadcast(&zilog->zl_cv_suspend); mutex_exit(&zilog->zl_lock); if (cookiep == NULL) zil_resume(os); else *cookiep = os; return (0); } void zil_resume(void *cookie) { objset_t *os = cookie; zilog_t *zilog = dmu_objset_zil(os); mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_suspend != 0); zilog->zl_suspend--; mutex_exit(&zilog->zl_lock); dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); } typedef struct zil_replay_arg { zil_replay_func_t **zr_replay; void *zr_arg; boolean_t zr_byteswap; char *zr_lr; } zil_replay_arg_t; static int zil_replay_error(zilog_t *zilog, lr_t *lr, int error) { char name[MAXNAMELEN]; zilog->zl_replaying_seq--; /* didn't actually replay this one */ dmu_objset_name(zilog->zl_os, name); cmn_err(CE_WARN, "ZFS replay transaction error %d, " "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)(lr->lrc_txtype & ~TX_CI), (lr->lrc_txtype & TX_CI) ? "CI" : ""); return (error); } static int zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; int error = 0; zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ return (0); if (lr->lrc_txg < claim_txg) /* already committed */ return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; if (txtype == 0 || txtype >= TX_MAX_TYPE) return (zil_replay_error(zilog, lr, EINVAL)); /* * If this record type can be logged out of order, the object * (lr_foid) may no longer exist. That's legitimate, not an error. */ if (TX_OOO(txtype)) { error = dmu_object_info(zilog->zl_os, ((lr_ooo_t *)lr)->lr_foid, NULL); if (error == ENOENT || error == EEXIST) return (0); } /* * Make a copy of the data so we can revise and extend it. */ bcopy(lr, zr->zr_lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. */ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { error = zil_read_log_data(zilog, (lr_write_t *)lr, zr->zr_lr + reclen); if (error != 0) return (zil_replay_error(zilog, lr, error)); } /* * The log block containing this lr may have been byteswapped * so that we can easily examine common fields like lrc_txtype. * However, the log is a mix of different record types, and only the * replay vectors know how to byteswap their records. Therefore, if * the lr was byteswapped, undo it before invoking the replay vector. */ if (zr->zr_byteswap) byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, * and update the log header sequence number to reflect the fact that * we did so. At the end of each replay function the sequence number * is updated if we are in replay mode. */ error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); if (error != 0) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with * EEXIST. So if we receive any error we try syncing out * any removes then retry the transaction. Note that we * specify B_FALSE for byteswap now, so we don't do it twice. */ txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); if (error != 0) return (zil_replay_error(zilog, lr, error)); } return (0); } /* ARGSUSED */ static int zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zilog->zl_replay_blks++; return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. */ void zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { zil_destroy(zilog, B_TRUE); return; } zr.zr_replay = replay_func; zr.zr_arg = arg; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. */ txg_wait_synced(zilog->zl_dmu_pool, 0); zilog->zl_replay = B_TRUE; zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg); kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); zilog->zl_replay = B_FALSE; } boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { if (zilog->zl_sync == ZFS_SYNC_DISABLED) return (B_TRUE); if (zilog->zl_replay) { dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = zilog->zl_replaying_seq; return (B_TRUE); } return (B_FALSE); } /* ARGSUSED */ int zil_vdev_offline(const char *osname, void *arg) { int error; error = zil_suspend(osname, NULL); if (error != 0) return (SET_ERROR(EEXIST)); return (0); }