diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c index 704f034e9ee0..771b265c25d0 100644 --- a/module/zfs/dsl_pool.c +++ b/module/zfs/dsl_pool.c @@ -1,1100 +1,1100 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012 by Delphix. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ int zfs_txg_synctime_ms = 1000; /* target millisecs to sync a txg */ int zfs_txg_history = 60; /* statistics for the last N txgs */ unsigned long zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ unsigned long zfs_write_limit_max = 0; /* max data payload per txg */ unsigned long zfs_write_limit_inflated = 0; unsigned long zfs_write_limit_override = 0; kmutex_t zfs_write_limit_lock; static pgcnt_t old_physmem = 0; static int dsl_pool_txg_history_update(kstat_t *ksp, int rw) { dsl_pool_t *dp = ksp->ks_private; txg_history_t *th; int i = 0; if (rw == KSTAT_WRITE) return (EACCES); if (ksp->ks_data) kmem_free(ksp->ks_data, ksp->ks_data_size); mutex_enter(&dp->dp_lock); ksp->ks_ndata = dp->dp_txg_history_size; ksp->ks_data_size = dp->dp_txg_history_size * sizeof(kstat_txg_t); if (ksp->ks_data_size > 0) ksp->ks_data = kmem_alloc(ksp->ks_data_size, KM_PUSHPAGE); /* Traversed oldest to youngest for the most readable kstat output */ for (th = list_tail(&dp->dp_txg_history); th != NULL; th = list_prev(&dp->dp_txg_history, th)) { mutex_enter(&th->th_lock); ASSERT3S(i + sizeof(kstat_txg_t), <=, ksp->ks_data_size); memcpy(ksp->ks_data + i, &th->th_kstat, sizeof(kstat_txg_t)); i += sizeof(kstat_txg_t); mutex_exit(&th->th_lock); } mutex_exit(&dp->dp_lock); return (0); } static void dsl_pool_txg_history_init(dsl_pool_t *dp, uint64_t txg) { char name[KSTAT_STRLEN]; list_create(&dp->dp_txg_history, sizeof (txg_history_t), offsetof(txg_history_t, th_link)); dsl_pool_txg_history_add(dp, txg); (void) snprintf(name, KSTAT_STRLEN, "txgs-%s", spa_name(dp->dp_spa)); dp->dp_txg_kstat = kstat_create("zfs", 0, name, "misc", KSTAT_TYPE_TXG, 0, KSTAT_FLAG_VIRTUAL); if (dp->dp_txg_kstat) { 
dp->dp_txg_kstat->ks_data = NULL; dp->dp_txg_kstat->ks_private = dp; dp->dp_txg_kstat->ks_update = dsl_pool_txg_history_update; kstat_install(dp->dp_txg_kstat); } } static void dsl_pool_txg_history_destroy(dsl_pool_t *dp) { txg_history_t *th; if (dp->dp_txg_kstat) { if (dp->dp_txg_kstat->ks_data) kmem_free(dp->dp_txg_kstat->ks_data, dp->dp_txg_kstat->ks_data_size); kstat_delete(dp->dp_txg_kstat); } mutex_enter(&dp->dp_lock); while ((th = list_remove_head(&dp->dp_txg_history))) { dp->dp_txg_history_size--; mutex_destroy(&th->th_lock); kmem_free(th, sizeof(txg_history_t)); } ASSERT3U(dp->dp_txg_history_size, ==, 0); list_destroy(&dp->dp_txg_history); mutex_exit(&dp->dp_lock); } txg_history_t * dsl_pool_txg_history_add(dsl_pool_t *dp, uint64_t txg) { txg_history_t *th, *rm; - th = kmem_zalloc(sizeof(txg_history_t), KM_SLEEP); + th = kmem_zalloc(sizeof(txg_history_t), KM_PUSHPAGE); mutex_init(&th->th_lock, NULL, MUTEX_DEFAULT, NULL); th->th_kstat.txg = txg; th->th_kstat.state = TXG_STATE_OPEN; th->th_kstat.birth = gethrtime(); mutex_enter(&dp->dp_lock); list_insert_head(&dp->dp_txg_history, th); dp->dp_txg_history_size++; while (dp->dp_txg_history_size > zfs_txg_history) { dp->dp_txg_history_size--; rm = list_remove_tail(&dp->dp_txg_history); mutex_destroy(&rm->th_lock); kmem_free(rm, sizeof(txg_history_t)); } mutex_exit(&dp->dp_lock); return (th); } /* * Traversed youngest to oldest because lookups are only done for open * or syncing txgs which are guaranteed to be at the head of the list. * The txg_history_t structure will be returned locked. 
 */
txg_history_t *
dsl_pool_txg_history_get(dsl_pool_t *dp, uint64_t txg)
{
	txg_history_t *th;

	mutex_enter(&dp->dp_lock);
	for (th = list_head(&dp->dp_txg_history); th != NULL;
	    th = list_next(&dp->dp_txg_history, th)) {
		if (th->th_kstat.txg == txg) {
			/* take the entry lock before dropping dp_lock */
			mutex_enter(&th->th_lock);
			break;
		}
	}
	mutex_exit(&dp->dp_lock);

	/* NULL when no entry matches txg; callers must handle that case */
	return (th);
}

/*
 * Drop the th_lock taken by dsl_pool_txg_history_get().
 */
void
dsl_pool_txg_history_put(txg_history_t *th)
{
	mutex_exit(&th->th_lock);
}

/*
 * Look up 'name' in the root dir's child-dir ZAP and open the resulting
 * dsl_dir into *ddp, tagged with dp.  Returns 0 or the error from
 * zap_lookup()/dsl_dir_open_obj().
 */
int
dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp)
{
	uint64_t obj;
	int err;

	err = zap_lookup(dp->dp_meta_objset,
	    dp->dp_root_dir->dd_phys->dd_child_dir_zapobj,
	    name, sizeof (obj), 1, &obj);
	if (err)
		return (err);

	return (dsl_dir_open_obj(dp, obj, name, dp, ddp));
}

/*
 * Allocate a dsl_pool_t and set up its in-core state (locks, txg state,
 * dirty lists, iput taskq, txg history).  Shared by dsl_pool_init() and
 * dsl_pool_create(); nothing is read from or written to disk here.
 */
static dsl_pool_t *
dsl_pool_open_impl(spa_t *spa, uint64_t txg)
{
	dsl_pool_t *dp;
	blkptr_t *bp = spa_get_rootblkptr(spa);

	dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP);
	dp->dp_spa = spa;
	dp->dp_meta_rootbp = *bp;
	rw_init(&dp->dp_config_rwlock, NULL, RW_DEFAULT, NULL);
	dp->dp_write_limit = zfs_write_limit_min;
	txg_init(dp, txg);

	txg_list_create(&dp->dp_dirty_datasets,
	    offsetof(dsl_dataset_t, ds_dirty_link));
	txg_list_create(&dp->dp_dirty_zilogs,
	    offsetof(zilog_t, zl_dirty_link));
	txg_list_create(&dp->dp_dirty_dirs,
	    offsetof(dsl_dir_t, dd_dirty_link));
	txg_list_create(&dp->dp_sync_tasks,
	    offsetof(dsl_sync_task_group_t, dstg_node));

	mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL);

	dp->dp_iput_taskq = taskq_create("zfs_iput_taskq", 1, minclsyspri,
	    1, 4, 0);

	/* must follow mutex_init(dp_lock): the history code takes dp_lock */
	dsl_pool_txg_history_init(dp, txg);

	return (dp);
}

/*
 * Create the in-core pool and open its meta-objset from dp_meta_rootbp.
 * On success *dpp receives the pool; on failure the pool is closed and
 * *dpp is left untouched.
 */
int
dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);

	err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp,
	    &dp->dp_meta_objset);
	if (err != 0)
		dsl_pool_close(dp);
	else
		*dpp = dp;

	return (err);
}

/*
 * Open the on-disk pool directories and objects (root dir, $MOS, and,
 * depending on pool version / active features, $ORIGIN, $FREE, the free
 * bpobj, the bptree and the empty bpobj), then initialize scan state.
 * Runs with dp_config_rwlock held as writer.  Returns 0 or the first
 * error encountered; a missing DMU_POOL_TMP_USERREFS entry is not an error.
 */
int
dsl_pool_open(dsl_pool_t *dp)
{
	int err;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	uint64_t obj;

	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1,
	    &dp->dp_root_dir_obj);
	if (err)
		goto out;

	err = dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir);
	if (err)
		goto out;

	err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir);
	if (err)
		goto out;

	if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) {
		err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd);
		if (err)
			goto out;
		err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj,
		    FTAG, &ds);
		if (err == 0) {
			/* the origin snapshot is the head's previous snap */
			err = dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, dp,
			    &dp->dp_origin_snap);
			dsl_dataset_rele(ds, FTAG);
		}
		/* dd is closed whether or not the holds succeeded */
		dsl_dir_close(dd, dp);
		if (err)
			goto out;
	}

	if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
		err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME,
		    &dp->dp_free_dir);
		if (err)
			goto out;

		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj);
		if (err)
			goto out;
		VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_feature_is_active(dp->dp_spa,
	    &spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
		    &dp->dp_bptree_obj);
		if (err != 0)
			goto out;
	}

	if (spa_feature_is_active(dp->dp_spa,
	    &spa_feature_table[SPA_FEATURE_EMPTY_BPOBJ])) {
		err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1,
		    &dp->dp_empty_bpobj);
		if (err != 0)
			goto out;
	}

	/* the temporary-holds ZAP is optional; ENOENT leaves the obj at 0 */
	err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1,
	    &dp->dp_tmp_userrefs_obj);
	if (err == ENOENT)
		err = 0;
	if (err)
		goto out;

	err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg);

out:
	rw_exit(&dp->dp_config_rwlock);
	return (err);
}

/*
 * Release everything dsl_pool_open_impl()/dsl_pool_open() set up and free
 * the dsl_pool_t.  Each optional reference is checked for NULL first, so
 * this can be called on a partially opened pool (as dsl_pool_init() does).
 */
void
dsl_pool_close(dsl_pool_t *dp)
{
	/* drop our references from dsl_pool_open() */

	/*
	 * Since we held the origin_snap from "syncing" context (which
	 * includes pool-opening context), it actually only got a "ref"
	 * and not a hold, so just drop that here.
	 */
	if (dp->dp_origin_snap)
		dsl_dataset_drop_ref(dp->dp_origin_snap, dp);
	if (dp->dp_mos_dir)
		dsl_dir_close(dp->dp_mos_dir, dp);
	if (dp->dp_free_dir)
		dsl_dir_close(dp->dp_free_dir, dp);
	if (dp->dp_root_dir)
		dsl_dir_close(dp->dp_root_dir, dp);

	bpobj_close(&dp->dp_free_bpobj);

	/* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */
	if (dp->dp_meta_objset)
		dmu_objset_evict(dp->dp_meta_objset);

	txg_list_destroy(&dp->dp_dirty_datasets);
	txg_list_destroy(&dp->dp_dirty_zilogs);
	txg_list_destroy(&dp->dp_sync_tasks);
	txg_list_destroy(&dp->dp_dirty_dirs);

	arc_flush(dp->dp_spa);
	txg_fini(dp);
	dsl_scan_fini(dp);
	/* history teardown takes dp_lock, so it precedes mutex_destroy() */
	dsl_pool_txg_history_destroy(dp);
	rw_destroy(&dp->dp_config_rwlock);
	mutex_destroy(&dp->dp_lock);
	taskq_destroy(dp->dp_iput_taskq);
	if (dp->dp_blkstats)
		kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t));
	kmem_free(dp, sizeof (dsl_pool_t));
}

/*
 * Build a brand-new pool in transaction group 'txg': create the MOS, the
 * pool directory, the root/$MOS (and, by version, $FREE and $ORIGIN) dirs,
 * the root dataset and its objset.  Failures of the individual steps are
 * treated as fatal via VERIFY/ASSERT.  Returns the new dsl_pool_t.
 */
dsl_pool_t *
dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg)
{
	int err;
	dsl_pool_t *dp = dsl_pool_open_impl(spa, txg);
	dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg);
	objset_t *os;
	dsl_dataset_t *ds;
	uint64_t obj;

	/* create and open the MOS (meta-objset) */
	dp->dp_meta_objset = dmu_objset_create_impl(spa,
	    NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx);

	/* create the pool directory */
	err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx);
	ASSERT3U(err, ==, 0);

	/* Initialize scan structures */
	VERIFY3U(0, ==, dsl_scan_init(dp, txg));

	/* create and open the root dir */
	dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx);
	VERIFY(0 == dsl_dir_open_obj(dp, dp->dp_root_dir_obj,
	    NULL, dp, &dp->dp_root_dir));

	/* create and open the meta-objset dir */
	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx);
	VERIFY(0 == dsl_pool_open_special_dir(dp,
	    MOS_DIR_NAME, &dp->dp_mos_dir));

	if (spa_version(spa) >= SPA_VERSION_DEADLISTS) {
		/* create and open the free dir */
		(void) dsl_dir_create_sync(dp,
		    dp->dp_root_dir, FREE_DIR_NAME, tx);
		VERIFY(0 == dsl_pool_open_special_dir(dp,
		    FREE_DIR_NAME, &dp->dp_free_dir));

		/* create and open the free_bplist */
		obj = bpobj_alloc(dp->dp_meta_objset, SPA_MAXBLOCKSIZE, tx);
		VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
		    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0);
		VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
		    dp->dp_meta_objset, obj));
	}

	if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB)
		dsl_pool_create_origin(dp, tx);

	/* create the root dataset */
	obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, 0, tx);

	/* create the root objset */
	VERIFY(0 == dsl_dataset_hold_obj(dp, obj, FTAG, &ds));
	VERIFY(NULL != (os = dmu_objset_create_impl(dp->dp_spa, ds,
	    dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx)));
#ifdef _KERNEL
	zfs_create_fs(os, kcred, zplprops, tx);
#endif
	dsl_dataset_rele(ds, FTAG);

	dmu_tx_commit(tx);

	return (dp);
}

/*
 * Account for the meta-objset space in its placeholder dsl_dir.
 * The deltas are only accumulated here (under dp_lock); dsl_pool_sync()
 * applies them to dp_mos_dir and resets them.
 */
void
dsl_pool_mos_diduse_space(dsl_pool_t *dp,
    int64_t used, int64_t comp, int64_t uncomp)
{
	ASSERT3U(comp, ==, uncomp);	/* it's all metadata */
	mutex_enter(&dp->dp_lock);
	dp->dp_mos_used_delta += used;
	dp->dp_mos_compressed_delta += comp;
	dp->dp_mos_uncompressed_delta += uncomp;
	mutex_exit(&dp->dp_lock);
}

/*
 * bplist_iterate() callback used by dsl_pool_sync(): inserts one block
 * pointer into the deadlist passed as 'arg', holding dp_config_rwlock as
 * reader around the insert.  Always returns 0.
 */
static int
deadlist_enqueue_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
{
	dsl_deadlist_t *dl = arg;
	dsl_pool_t *dp = dmu_objset_pool(dl->dl_os);
	rw_enter(&dp->dp_config_rwlock, RW_READER);
	dsl_deadlist_insert(dl, bp, tx);
	rw_exit(&dp->dp_config_rwlock);
	return (0);
}

/*
 * Sync one pass of transaction group 'txg' for this pool.  In order:
 *   1. sync every dirty dataset and wait for the I/O;
 *   2. update user/group accounting and sync the datasets it re-dirtied;
 *   3. move pending deadlists to disk and drop the dirty holds;
 *   4. sync dirty dsl_dirs and apply the accumulated MOS space deltas;
 *   5. sync the MOS itself if it has dirty or free dnodes;
 *   6. run queued sync tasks;
 *   7. recompute the write limit from the measured throughput.
 * write_time accumulates the time spent in steps 1, 4 and 5, less
 * dp_read_overhead.
 */
void
dsl_pool_sync(dsl_pool_t *dp, uint64_t txg)
{
	zio_t *zio;
	dmu_tx_t *tx;
	dsl_dir_t *dd;
	dsl_dataset_t *ds;
	objset_t *mos = dp->dp_meta_objset;
	hrtime_t start, write_time;
	uint64_t data_written;
	int err;
	list_t synced_datasets;

	list_create(&synced_datasets, sizeof (dsl_dataset_t),
	    offsetof(dsl_dataset_t, ds_synced_link));

	/*
	 * We need to copy dp_space_towrite() before doing
	 * dsl_sync_task_group_sync(), because
	 * dsl_dataset_snapshot_reserve_space() will increase
	 * dp_space_towrite but not actually write anything.
	 */
	data_written = dp->dp_space_towrite[txg & TXG_MASK];

	tx = dmu_tx_create_assigned(dp, txg);

	dp->dp_read_overhead = 0;
	start = gethrtime();

	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) {
		/*
		 * We must not sync any non-MOS datasets twice, because
		 * we may have taken a snapshot of them.  However, we
		 * may sync newly-created datasets on pass 2.
		 */
		ASSERT(!list_link_active(&ds->ds_synced_link));
		list_insert_tail(&synced_datasets, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	DTRACE_PROBE(pool_sync__1setup);
	err = zio_wait(zio);

	write_time = gethrtime() - start;
	ASSERT(err == 0);
	DTRACE_PROBE(pool_sync__2rootzio);

	/*
	 * After the data blocks have been written (ensured by the zio_wait()
	 * above), update the user/group space accounting.
	 */
	for (ds = list_head(&synced_datasets); ds;
	    ds = list_next(&synced_datasets, ds))
		dmu_objset_do_userquota_updates(ds->ds_objset, tx);

	/*
	 * Sync the datasets again to push out the changes due to
	 * userspace updates.  This must be done before we process the
	 * sync tasks, so that any snapshots will have the correct
	 * user accounting information (and we won't get confused
	 * about which blocks are part of the snapshot).
	 */
	zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
	while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg))) {
		ASSERT(list_link_active(&ds->ds_synced_link));
		dmu_buf_rele(ds->ds_dbuf, ds);
		dsl_dataset_sync(ds, zio, tx);
	}
	/*
	 * NOTE(review): unlike the other zio_wait() calls in this function,
	 * this result is not asserted before err is reused below -- confirm
	 * that is intentional.
	 */
	err = zio_wait(zio);

	/*
	 * Now that the datasets have been completely synced, we can
	 * clean up our in-memory structures accumulated while syncing:
	 *
	 *  - move dead blocks from the pending deadlist to the on-disk deadlist
	 *  - clean up zil records
	 *  - release hold from dsl_dataset_dirty()
	 */
	while ((ds = list_remove_head(&synced_datasets))) {
		ASSERTV(objset_t *os = ds->ds_objset);
		bplist_iterate(&ds->ds_pending_deadlist,
		    deadlist_enqueue_cb, &ds->ds_deadlist, tx);
		ASSERT(!dmu_objset_is_dirty(os, txg));
		dmu_buf_rele(ds->ds_dbuf, ds);
	}

	start = gethrtime();
	while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)))
		dsl_dir_sync(dd, tx);
	write_time += gethrtime() - start;

	/*
	 * The MOS's space is accounted for in the pool/$MOS
	 * (dp_mos_dir).  We can't modify the mos while we're syncing
	 * it, so we remember the deltas and apply them here.
	 */
	if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 ||
	    dp->dp_mos_uncompressed_delta != 0) {
		dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD,
		    dp->dp_mos_used_delta,
		    dp->dp_mos_compressed_delta,
		    dp->dp_mos_uncompressed_delta, tx);
		dp->dp_mos_used_delta = 0;
		dp->dp_mos_compressed_delta = 0;
		dp->dp_mos_uncompressed_delta = 0;
	}

	start = gethrtime();
	/* only sync the MOS (and publish its root bp) if it is dirty */
	if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL ||
	    list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) {
		zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
		dmu_objset_sync(mos, zio, tx);
		err = zio_wait(zio);
		ASSERT(err == 0);
		dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", "");
		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
	}
	write_time += gethrtime() - start;
	DTRACE_PROBE2(pool_sync__4io, hrtime_t, write_time,
	    hrtime_t, dp->dp_read_overhead);
	write_time -= dp->dp_read_overhead;

	/*
	 * If we modify a dataset in the same txg that we want to destroy it,
	 * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it.
	 * dsl_dir_destroy_check() will fail if there are unexpected holds.
	 * Therefore, we want to sync the MOS (thus syncing the dd_dbuf
	 * and clearing the hold on it) before we process the sync_tasks.
	 * The MOS data dirtied by the sync_tasks will be synced on the next
	 * pass.
	 */
	DTRACE_PROBE(pool_sync__3task);
	if (!txg_list_empty(&dp->dp_sync_tasks, txg)) {
		dsl_sync_task_group_t *dstg;
		/*
		 * No more sync tasks should have been added while we
		 * were syncing.
		 */
		ASSERT(spa_sync_pass(dp->dp_spa) == 1);
		while ((dstg = txg_list_remove(&dp->dp_sync_tasks, txg)))
			dsl_sync_task_group_sync(dstg, tx);
	}

	dmu_tx_commit(tx);

	dp->dp_space_towrite[txg & TXG_MASK] = 0;
	ASSERT(dp->dp_tempreserved[txg & TXG_MASK] == 0);

	/*
	 * If the write limit max has not been explicitly set, set it
	 * to a fraction of available physical memory (default 1/8th).
	 * Note that we must inflate the limit because the spa
	 * inflates write sizes to account for data replication.
	 * Check this each sync phase to catch changing memory size.
	 */
	if (physmem != old_physmem && zfs_write_limit_shift) {
		mutex_enter(&zfs_write_limit_lock);
		old_physmem = physmem;
		zfs_write_limit_max = ptob(physmem) >> zfs_write_limit_shift;
		zfs_write_limit_inflated = MAX(zfs_write_limit_min,
		    spa_get_asize(dp->dp_spa, zfs_write_limit_max));
		mutex_exit(&zfs_write_limit_lock);
	}

	/*
	 * Attempt to keep the sync time consistent by adjusting the
	 * amount of write traffic allowed into each transaction group.
	 * Weight the throughput calculation towards the current value:
	 * 	thru = 3/4 old_thru + 1/4 new_thru
	 *
	 * Note: write_time is in nanosecs, so write_time/MICROSEC
	 * yields millisecs
	 */
	ASSERT(zfs_write_limit_min > 0);
	/* write_time > MICROSEC also keeps the divisor below non-zero */
	if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) {
		uint64_t throughput = data_written / (write_time / MICROSEC);

		if (dp->dp_throughput)
			dp->dp_throughput = throughput / 4 +
			    3 * dp->dp_throughput / 4;
		else
			dp->dp_throughput = throughput;
		dp->dp_write_limit = MIN(zfs_write_limit_inflated,
		    MAX(zfs_write_limit_min,
		    dp->dp_throughput * zfs_txg_synctime_ms));
	}
}

/*
 * Post-sync cleanup for 'txg': clean every dirty zilog and release the
 * dataset dbuf hold that was tagged with the zilog.
 */
void
dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg)
{
	zilog_t *zilog;
	dsl_dataset_t *ds;

	while ((zilog = txg_list_remove(&dp->dp_dirty_zilogs, txg))) {
		ds = dmu_objset_ds(zilog->zl_os);
		zil_clean(zilog, txg);
		ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg));
		dmu_buf_rele(ds->ds_dbuf, zilog);
	}
	ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg));
}

/*
 * TRUE if the current thread is the tx_sync_thread or if we
 * are being called from SPA context during pool initialization.
 */
int
dsl_pool_sync_context(dsl_pool_t *dp)
{
	return (curthread == dp->dp_tx.tx_sync_thread ||
	    spa_is_initializing(dp->dp_spa));
}

/*
 * Return the pool's dspace minus a slop reservation of
 * MAX(space / 64, SPA_MINDEVSIZE / 2), halved when 'netfree' is set.
 *
 * NOTE(review): the subtraction is unsigned; if spa_get_dspace() can ever
 * return less than the reservation the result wraps -- confirm callers
 * or pool-size minimums rule that out.
 */
uint64_t
dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree)
{
	uint64_t space, resv;

	/*
	 * Reserve about 1.6% (1/64), or at least 32MB, for allocation
	 * efficiency.
	 * XXX The intent log is not accounted for, so it must fit
	 * within this slop.
	 *
	 * If we're trying to assess whether it's OK to do a free,
	 * cut the reservation in half to allow forward progress
	 * (e.g. make it possible to rm(1) files from a full pool).
	 */
	space = spa_get_dspace(dp->dp_spa);
	resv = MAX(space >> 6, SPA_MINDEVSIZE >> 1);
	if (netfree)
		resv >>= 1;

	return (space - resv);
}

/*
 * Reserve 'space' bytes against tx's txg for write throttling.
 * Returns ERESTART (without reserving) when the txg is already over the
 * write limit, otherwise adds the reservation and returns 0, delaying the
 * caller one tick when the txg is above 7/8 of the limit.  With
 * zfs_no_write_throttle set the reservation is recorded unconditionally.
 */
int
dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx)
{
	uint64_t reserved = 0;
	/* a non-zero override tunable replaces the computed per-pool limit */
	uint64_t write_limit = (zfs_write_limit_override ?
	    zfs_write_limit_override : dp->dp_write_limit);

	if (zfs_no_write_throttle) {
		atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK],
		    space);
		return (0);
	}

	/*
	 * Check to see if we have exceeded the maximum allowed IO for
	 * this transaction group.  We can do this without locks since
	 * a little slop here is ok.  Note that we do the reserved check
	 * with only half the requested reserve: this is because the
	 * reserve requests are worst-case, and we really don't want to
	 * throttle based off of worst-case estimates.
	 */
	if (write_limit > 0) {
		reserved = dp->dp_space_towrite[tx->tx_txg & TXG_MASK]
		    + dp->dp_tempreserved[tx->tx_txg & TXG_MASK] / 2;

		if (reserved && reserved > write_limit) {
			DMU_TX_STAT_BUMP(dmu_tx_write_limit);
			return (ERESTART);
		}
	}

	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], space);

	/*
	 * If this transaction group is over 7/8ths capacity, delay
	 * the caller 1 clock tick.  This will slow down the "fill"
	 * rate until the sync process can catch up with us.
	 */
	if (reserved && reserved > (write_limit - (write_limit >> 3)))
		txg_delay(dp, tx->tx_txg, 1);

	return (0);
}

/*
 * Undo a reservation made by dsl_pool_tempreserve_space().
 */
void
dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	ASSERT(dp->dp_tempreserved[tx->tx_txg & TXG_MASK] >= space);
	atomic_add_64(&dp->dp_tempreserved[tx->tx_txg & TXG_MASK], -space);
}

/*
 * Lower dp_write_limit to a quarter of the space currently dirty or
 * reserved across all txgs, never below zfs_write_limit_min and never
 * above the current limit.  No-op when already at the minimum.
 */
void
dsl_pool_memory_pressure(dsl_pool_t *dp)
{
	uint64_t space_inuse = 0;
	int i;

	if (dp->dp_write_limit == zfs_write_limit_min)
		return;

	for (i = 0; i < TXG_SIZE; i++) {
		space_inuse += dp->dp_space_towrite[i];
		space_inuse += dp->dp_tempreserved[i];
	}
	dp->dp_write_limit = MAX(zfs_write_limit_min,
	    MIN(dp->dp_write_limit, space_inuse / 4));
}

/*
 * Add 'space' to the bytes expected to be written in tx's txg.
 * Non-positive values are ignored.
 */
void
dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx)
{
	if (space > 0) {
		mutex_enter(&dp->dp_lock);
		dp->dp_space_towrite[tx->tx_txg & TXG_MASK] += space;
		mutex_exit(&dp->dp_lock);
	}
}

/*
 * dmu_objset_find_spa() callback for dsl_pool_upgrade_clones().
 * Walks back from dataset 'dsobj' along ds_prev_snap_obj.  If the chain
 * runs out, the oldest dataset is attached to the pool's $ORIGIN snapshot;
 * if it stops at a snapshot whose ds_next_snap_obj points elsewhere, that
 * snapshot is used as 'prev'.  The dataset is then recorded in prev's
 * ds_next_clones_obj ZAP, which is created on first use.
 */
/* ARGSUSED */
static int
upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds, *prev = NULL;
	int err;
	dsl_pool_t *dp = spa_get_dsl(spa);

	err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
	if (err)
		return (err);

	while (ds->ds_phys->ds_prev_snap_obj != 0) {
		err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
		    FTAG, &prev);
		if (err) {
			dsl_dataset_rele(ds, FTAG);
			return (err);
		}

		/* prev's next snapshot is not ds: stop here, both held */
		if (prev->ds_phys->ds_next_snap_obj != ds->ds_object)
			break;
		dsl_dataset_rele(ds, FTAG);
		ds = prev;
		prev = NULL;
	}

	if (prev == NULL) {
		prev = dp->dp_origin_snap;

		/*
		 * The $ORIGIN can't have any data, or the accounting
		 * will be wrong.
		 */
		ASSERT(prev->ds_phys->ds_bp.blk_birth == 0);

		/* The origin doesn't get attached to itself */
		if (ds->ds_object == prev->ds_object) {
			dsl_dataset_rele(ds, FTAG);
			return (0);
		}

		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		ds->ds_phys->ds_prev_snap_obj = prev->ds_object;
		ds->ds_phys->ds_prev_snap_txg = prev->ds_phys->ds_creation_txg;

		dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx);
		ds->ds_dir->dd_phys->dd_origin_obj = prev->ds_object;

		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_num_children++;

		if (ds->ds_phys->ds_next_snap_obj == 0) {
			ASSERT(ds->ds_prev == NULL);
			VERIFY(0 == dsl_dataset_hold_obj(dp,
			    ds->ds_phys->ds_prev_snap_obj, ds, &ds->ds_prev));
		}
	}

	ASSERT(ds->ds_dir->dd_phys->dd_origin_obj == prev->ds_object);
	ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object);

	if (prev->ds_phys->ds_next_clones_obj == 0) {
		dmu_buf_will_dirty(prev->ds_dbuf, tx);
		prev->ds_phys->ds_next_clones_obj =
		    zap_create(dp->dp_meta_objset,
		    DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx);
	}
	VERIFY(0 == zap_add_int(dp->dp_meta_objset,
	    prev->ds_phys->ds_next_clones_obj, ds->ds_object, tx));

	dsl_dataset_rele(ds, FTAG);
	/* dp_origin_snap is not held with FTAG here, so it is not released */
	if (prev != dp->dp_origin_snap)
		dsl_dataset_rele(prev, FTAG);
	return (0);
}

/*
 * Run upgrade_clones_cb() over every dataset in the pool.  Must be called
 * in syncing context with the $ORIGIN snapshot already open.
 */
void
dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap != NULL);

	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb,
	    tx, DS_FIND_CHILDREN));
}

/*
 * dmu_objset_find_spa() callback for dsl_pool_upgrade_dir_clones().
 * If the dataset's dir has an origin, records 'dsobj' in the origin dir's
 * dd_clones ZAP, creating that ZAP on first use.  Always returns 0; hold
 * and ZAP failures are fatal via VERIFY3U.
 */
/* ARGSUSED */
static int
upgrade_dir_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg)
{
	dmu_tx_t *tx = arg;
	dsl_dataset_t *ds;
	dsl_pool_t *dp = spa_get_dsl(spa);
	objset_t *mos = dp->dp_meta_objset;

	VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));

	if (ds->ds_dir->dd_phys->dd_origin_obj) {
		dsl_dataset_t *origin;

		VERIFY3U(0, ==, dsl_dataset_hold_obj(dp,
		    ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &origin));

		if (origin->ds_dir->dd_phys->dd_clones == 0) {
			dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx);
			origin->ds_dir->dd_phys->dd_clones = zap_create(mos,
			    DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx);
		}

		VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset,
		    origin->ds_dir->dd_phys->dd_clones, dsobj, tx));

		dsl_dataset_rele(origin, FTAG);
	}

	dsl_dataset_rele(ds, FTAG);
	return (0);
}

/*
 * Pool-upgrade step run in syncing context: create and open the $FREE
 * dir, allocate and register the pool-wide free bpobj, then run
 * upgrade_dir_clones_cb() over every dataset.
 */
void
dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t obj;

	ASSERT(dmu_tx_is_syncing(tx));

	(void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx);
	VERIFY(0 == dsl_pool_open_special_dir(dp,
	    FREE_DIR_NAME, &dp->dp_free_dir));

	/*
	 * We can't use bpobj_alloc(), because spa_version() still
	 * returns the old version, and we need a new-version bpobj with
	 * subobj support.  So call dmu_object_alloc() directly.
	 */
	obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ,
	    SPA_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx);
	VERIFY3U(0, ==, zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
	    DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx));
	VERIFY3U(0, ==, bpobj_open(&dp->dp_free_bpobj,
	    dp->dp_meta_objset, obj));

	VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL,
	    upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN));
}

/*
 * Create the $ORIGIN dir, its head dataset and a snapshot of it, and keep
 * the snapshot open in dp->dp_origin_snap (tagged with dp).  Syncing
 * context only; dp_config_rwlock is held as writer for the duration.
 */
void
dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx)
{
	uint64_t dsobj;
	dsl_dataset_t *ds;

	ASSERT(dmu_tx_is_syncing(tx));
	ASSERT(dp->dp_origin_snap == NULL);

	/* create the origin dir, ds, & snap-ds */
	rw_enter(&dp->dp_config_rwlock, RW_WRITER);
	dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME,
	    NULL, 0, kcred, tx);
	VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds));
	dsl_dataset_snapshot_sync(ds, ORIGIN_DIR_NAME, tx);
	VERIFY(0 == dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj,
	    dp, &dp->dp_origin_snap));
	dsl_dataset_rele(ds, FTAG);
	rw_exit(&dp->dp_config_rwlock);
}

/*
 * Accessor for the pool's iput taskq created in dsl_pool_open_impl().
 */
taskq_t *
dsl_pool_iput_taskq(dsl_pool_t *dp)
{
	return (dp->dp_iput_taskq);
}

/*
 * Walk through the pool-wide zap object of temporary snapshot user holds
 * and release them.
*/ void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) { zap_attribute_t za; zap_cursor_t zc; objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; if (zapobj == 0) return; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); for (zap_cursor_init(&zc, mos, zapobj); zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { char *htag; uint64_t dsobj; htag = strchr(za.za_name, '-'); *htag = '\0'; ++htag; dsobj = strtonum(za.za_name, NULL); (void) dsl_dataset_user_release_tmp(dp, dsobj, htag, B_FALSE); } zap_cursor_fini(&zc); } /* * Create the pool-wide zap object for storing temporary snapshot holds. */ void dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) { objset_t *mos = dp->dp_meta_objset; ASSERT(dp->dp_tmp_userrefs_obj == 0); ASSERT(dmu_tx_is_syncing(tx)); dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); } static int dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) { objset_t *mos = dp->dp_meta_objset; uint64_t zapobj = dp->dp_tmp_userrefs_obj; char *name; int error; ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); ASSERT(dmu_tx_is_syncing(tx)); /* * If the pool was created prior to SPA_VERSION_USERREFS, the * zap object for temporary holds might not exist yet. */ if (zapobj == 0) { if (holding) { dsl_pool_user_hold_create_obj(dp, tx); zapobj = dp->dp_tmp_userrefs_obj; } else { return (ENOENT); } } name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); if (holding) error = zap_add(mos, zapobj, name, 8, 1, now, tx); else error = zap_remove(mos, zapobj, name, tx); strfree(name); return (error); } /* * Add a temporary hold for the given dataset object and tag. 
 */
int
dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    uint64_t *now, dmu_tx_t *tx)
{
	/* B_TRUE selects the "add entry" path of the shared helper */
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE));
}

/*
 * Release a temporary hold for the given dataset object and tag.
 */
int
dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag,
    dmu_tx_t *tx)
{
	/* no timestamp is needed when removing, hence the NULL 'now' */
	return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL,
	    tx, B_FALSE));
}

/*
 * Export the tunables defined at the top of this file as module
 * parameters (kernel builds with SPL only).
 * NOTE(review): the third argument is presumably the sysfs permission
 * mode, making the 0444 entries read-only after load -- confirm.
 */
#if defined(_KERNEL) && defined(HAVE_SPL)
module_param(zfs_no_write_throttle, int, 0644);
MODULE_PARM_DESC(zfs_no_write_throttle, "Disable write throttling");

module_param(zfs_write_limit_shift, int, 0444);
MODULE_PARM_DESC(zfs_write_limit_shift, "log2(fraction of memory) per txg");

module_param(zfs_txg_synctime_ms, int, 0644);
MODULE_PARM_DESC(zfs_txg_synctime_ms, "Target milliseconds between txg sync");

module_param(zfs_txg_history, int, 0644);
MODULE_PARM_DESC(zfs_txg_history, "Historic statistics for the last N txgs");

module_param(zfs_write_limit_min, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_min, "Min txg write limit");

module_param(zfs_write_limit_max, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_max, "Max txg write limit");

module_param(zfs_write_limit_inflated, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_inflated, "Inflated txg write limit");

module_param(zfs_write_limit_override, ulong, 0444);
MODULE_PARM_DESC(zfs_write_limit_override, "Override txg write limit");
#endif
diff --git a/module/zfs/txg.c b/module/zfs/txg.c
index c7c3df3f8f90..7c820af4f8b3 100644
--- a/module/zfs/txg.c
+++ b/module/zfs/txg.c
@@ -1,898 +1,898 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright (c) 2013 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include /* * ZFS Transaction Groups * ---------------------- * * ZFS transaction groups are, as the name implies, groups of transactions * that act on persistent state. ZFS asserts consistency at the granularity of * these transaction groups. Each successive transaction group (txg) is * assigned a 64-bit consecutive identifier. There are three active * transaction group states: open, quiescing, or syncing. At any given time, * there may be an active txg associated with each state; each active txg may * either be processing, or blocked waiting to enter the next state. There may * be up to three active txgs, and there is always a txg in the open state * (though it may be blocked waiting to enter the quiescing state). In broad * strokes, transactions — operations that change in-memory structures — are * accepted into the txg in the open state, and are completed while the txg is * in the open or quiescing states. The accumulated changes are written to * disk in the syncing state. * * Open * * When a new txg becomes active, it first enters the open state. New * transactions — updates to in-memory structures — are assigned to the * currently open txg. 
There is always a txg in the open state so that ZFS can * accept new changes (though the txg may refuse new changes if it has hit * some limit). ZFS advances the open txg to the next state for a variety of * reasons such as it hitting a time or size threshold, or the execution of an * administrative action that must be completed in the syncing state. * * Quiescing * * After a txg exits the open state, it enters the quiescing state. The * quiescing state is intended to provide a buffer between accepting new * transactions in the open state and writing them out to stable storage in * the syncing state. While quiescing, transactions can continue their * operation without delaying either of the other states. Typically, a txg is * in the quiescing state very briefly since the operations are bounded by * software latencies rather than, say, slower I/O latencies. After all * transactions complete, the txg is ready to enter the next state. * * Syncing * * In the syncing state, the in-memory state built up during the open and (to * a lesser degree) the quiescing states is written to stable storage. The * process of writing out modified data can, in turn modify more data. For * example when we write new blocks, we need to allocate space for them; those * allocations modify metadata (space maps)... which themselves must be * written to stable storage. During the sync state, ZFS iterates, writing out * data until it converges and all in-memory changes have been written out. * The first such pass is the largest as it encompasses all the modified user * data (as opposed to filesystem metadata). Subsequent passes typically have * far less data to write as they consist exclusively of filesystem metadata. * * To ensure convergence, after a certain number of passes ZFS begins * overwriting locations on stable storage that had been allocated earlier in * the syncing state (and subsequently freed). ZFS usually allocates new * blocks to optimize for large, continuous, writes. 
For the syncing state to
 * converge, however, it must complete a pass where no new blocks are allocated,
 * since each allocation requires a modification of persistent metadata.
 * Further, to hasten convergence, after a prescribed number of passes, ZFS
 * also defers frees and stops compressing.
 *
 * In addition to writing out user data, we must also execute synctasks during
 * the syncing context.  A synctask is the mechanism by which some
 * administrative activities work, such as creating and destroying snapshots or
 * datasets.  Note that when a synctask is initiated it enters the open txg,
 * and ZFS then pushes that txg as quickly as possible to completion of the
 * syncing state in order to reduce the latency of the administrative
 * activity.  To complete the syncing state, ZFS writes out a new uberblock,
 * the root of the tree of blocks that comprise all state stored on the ZFS
 * pool.  Finally, if there is a quiesced txg waiting, we signal that it can
 * now transition to the syncing state.
 */

/* Entry points of the two worker threads started by txg_sync_start(). */
static void txg_sync_thread(dsl_pool_t *dp);
static void txg_quiesce_thread(dsl_pool_t *dp);

/*
 * Tunable, in seconds.  txg_sync_thread() multiplies it by hz to obtain the
 * longest time it waits before syncing; it is also registered as a module
 * parameter at the end of this file.
 */
int zfs_txg_timeout = 5;	/* max seconds worth of delta per txg */

/*
 * Prepare the txg subsystem.
*/ void txg_init(dsl_pool_t *dp, uint64_t txg) { tx_state_t *tx = &dp->dp_tx; int c; bzero(tx, sizeof (tx_state_t)); tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); for (c = 0; c < max_ncpus; c++) { int i; mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); for (i = 0; i < TXG_SIZE; i++) { cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); list_create(&tx->tx_cpu[c].tc_callbacks[i], sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); } } mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); tx->tx_open_txg = txg; } /* * Close down the txg subsystem. */ void txg_fini(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; int c; ASSERT(tx->tx_threads == 0); mutex_destroy(&tx->tx_sync_lock); cv_destroy(&tx->tx_sync_more_cv); cv_destroy(&tx->tx_sync_done_cv); cv_destroy(&tx->tx_quiesce_more_cv); cv_destroy(&tx->tx_quiesce_done_cv); cv_destroy(&tx->tx_exit_cv); for (c = 0; c < max_ncpus; c++) { int i; mutex_destroy(&tx->tx_cpu[c].tc_lock); for (i = 0; i < TXG_SIZE; i++) { cv_destroy(&tx->tx_cpu[c].tc_cv[i]); list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); } } if (tx->tx_commit_cb_taskq != NULL) taskq_destroy(tx->tx_commit_cb_taskq); vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); bzero(tx, sizeof (tx_state_t)); } /* * Start syncing transaction groups. */ void txg_sync_start(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; mutex_enter(&tx->tx_sync_lock); dprintf("pool %p\n", dp); ASSERT(tx->tx_threads == 0); tx->tx_threads = 2; tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, dp, 0, &p0, TS_RUN, minclsyspri); /* * The sync thread can need a larger-than-default stack size on * 32-bit x86. 
This is due in part to nested pools and * scrub_visitbp() recursion. */ tx->tx_sync_thread = thread_create(NULL, 32<<10, txg_sync_thread, dp, 0, &p0, TS_RUN, minclsyspri); mutex_exit(&tx->tx_sync_lock); } static void txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) { CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); mutex_enter(&tx->tx_sync_lock); } static void txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) { ASSERT(*tpp != NULL); *tpp = NULL; tx->tx_threads--; cv_broadcast(&tx->tx_exit_cv); CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ thread_exit(); } static void txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) { CALLB_CPR_SAFE_BEGIN(cpr); if (time) (void) cv_timedwait_interruptible(cv, &tx->tx_sync_lock, ddi_get_lbolt() + time); else cv_wait_interruptible(cv, &tx->tx_sync_lock); CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); } /* * Stop syncing transaction groups. */ void txg_sync_stop(dsl_pool_t *dp) { tx_state_t *tx = &dp->dp_tx; dprintf("pool %p\n", dp); /* * Finish off any work in progress. */ ASSERT(tx->tx_threads == 2); /* * We need to ensure that we've vacated the deferred space_maps. */ txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); /* * Wake all sync threads and wait for them to die. */ mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); tx->tx_exiting = 1; cv_broadcast(&tx->tx_quiesce_more_cv); cv_broadcast(&tx->tx_quiesce_done_cv); cv_broadcast(&tx->tx_sync_more_cv); while (tx->tx_threads != 0) cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); tx->tx_exiting = 0; mutex_exit(&tx->tx_sync_lock); } uint64_t txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) { tx_state_t *tx = &dp->dp_tx; tx_cpu_t *tc; uint64_t txg; /* * It appears the processor id is simply used as a "random" * number to index into the array, and there isn't any other * significance to the chosen tx_cpu. Because.. Why not use * the current cpu to index into the array? 
*/ kpreempt_disable(); tc = &tx->tx_cpu[CPU_SEQID]; kpreempt_enable(); mutex_enter(&tc->tc_lock); txg = tx->tx_open_txg; tc->tc_count[txg & TXG_MASK]++; th->th_cpu = tc; th->th_txg = txg; return (txg); } void txg_rele_to_quiesce(txg_handle_t *th) { tx_cpu_t *tc = th->th_cpu; mutex_exit(&tc->tc_lock); } void txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) { tx_cpu_t *tc = th->th_cpu; int g = th->th_txg & TXG_MASK; mutex_enter(&tc->tc_lock); list_move_tail(&tc->tc_callbacks[g], tx_callbacks); mutex_exit(&tc->tc_lock); } void txg_rele_to_sync(txg_handle_t *th) { tx_cpu_t *tc = th->th_cpu; int g = th->th_txg & TXG_MASK; mutex_enter(&tc->tc_lock); ASSERT(tc->tc_count[g] != 0); if (--tc->tc_count[g] == 0) cv_broadcast(&tc->tc_cv[g]); mutex_exit(&tc->tc_lock); th->th_cpu = NULL; /* defensive */ } static void txg_quiesce(dsl_pool_t *dp, uint64_t txg) { hrtime_t start; txg_history_t *th; tx_state_t *tx = &dp->dp_tx; int g = txg & TXG_MASK; int c; /* * Grab all tx_cpu locks so nobody else can get into this txg. */ for (c = 0; c < max_ncpus; c++) mutex_enter(&tx->tx_cpu[c].tc_lock); ASSERT(txg == tx->tx_open_txg); tx->tx_open_txg++; + /* + * Now that we've incremented tx_open_txg, we can let threads + * enter the next transaction group. + */ + for (c = 0; c < max_ncpus; c++) + mutex_exit(&tx->tx_cpu[c].tc_lock); + /* * Measure how long the txg was open and replace the kstat. */ th = dsl_pool_txg_history_get(dp, txg); th->th_kstat.open_time = gethrtime() - th->th_kstat.birth; th->th_kstat.state = TXG_STATE_QUIESCING; dsl_pool_txg_history_put(th); dsl_pool_txg_history_add(dp, tx->tx_open_txg); - /* - * Now that we've incremented tx_open_txg, we can let threads - * enter the next transaction group. - */ - for (c = 0; c < max_ncpus; c++) - mutex_exit(&tx->tx_cpu[c].tc_lock); - /* * Quiesce the transaction group by waiting for everyone to txg_exit(). 
*/ start = gethrtime(); for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; mutex_enter(&tc->tc_lock); while (tc->tc_count[g] != 0) cv_wait(&tc->tc_cv[g], &tc->tc_lock); mutex_exit(&tc->tc_lock); } /* * Measure how long the txg took to quiesce. */ th = dsl_pool_txg_history_get(dp, txg); th->th_kstat.quiesce_time = gethrtime() - start; dsl_pool_txg_history_put(th); } static void txg_do_callbacks(list_t *cb_list) { dmu_tx_do_callbacks(cb_list, 0); list_destroy(cb_list); kmem_free(cb_list, sizeof (list_t)); } /* * Dispatch the commit callbacks registered on this txg to worker threads. */ static void txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) { int c; tx_state_t *tx = &dp->dp_tx; list_t *cb_list; for (c = 0; c < max_ncpus; c++) { tx_cpu_t *tc = &tx->tx_cpu[c]; /* No need to lock tx_cpu_t at this point */ int g = txg & TXG_MASK; if (list_is_empty(&tc->tc_callbacks[g])) continue; if (tx->tx_commit_cb_taskq == NULL) { /* * Commit callback taskq hasn't been created yet. */ tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", 100, minclsyspri, max_ncpus, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_PREPOPULATE); } cb_list = kmem_alloc(sizeof (list_t), KM_PUSHPAGE); list_create(cb_list, sizeof (dmu_tx_callback_t), offsetof(dmu_tx_callback_t, dcb_node)); list_move_tail(cb_list, &tc->tc_callbacks[g]); (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) txg_do_callbacks, cb_list, TQ_SLEEP); } } /* * Wait for pending commit callbacks of already-synced transactions to finish * processing. * Calling this function from within a commit callback will deadlock. 
*/
void
txg_wait_callbacks(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	/* Nothing to wait for if no callback has ever been dispatched. */
	if (tx->tx_commit_cb_taskq != NULL)
		taskq_wait(tx->tx_commit_cb_taskq);
}

/*
 * Body of the sync thread.  Loops forever: waits until there is a reason to
 * sync, obtains a quiesced txg from the quiesce thread, calls spa_sync() on
 * it with tx_sync_lock dropped, then publishes the result, dispatches the
 * commit callbacks and records statistics.  Leaves through
 * txg_thread_exit() once tx_exiting is set.
 */
static void
txg_sync_thread(dsl_pool_t *dp)
{
	spa_t *spa = dp->dp_spa;
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;
	uint64_t start, delta;

#ifdef _KERNEL
	/*
	 * Annotate this process with a flag that indicates that it is
	 * unsafe to use KM_SLEEP during memory allocations due to the
	 * potential for a deadlock.  KM_PUSHPAGE should be used instead.
	 */
	current->flags |= PF_NOFS;
#endif /* _KERNEL */

	/* Returns with tx_sync_lock held. */
	txg_thread_enter(tx, &cpr);

	start = delta = 0;
	for (;;) {
		hrtime_t hrstart;
		txg_history_t *th;
		uint64_t timer, timeout;
		uint64_t txg;

		/* zfs_txg_timeout is in seconds; convert to ticks. */
		timeout = zfs_txg_timeout * hz;

		/*
		 * We sync when we're scanning, there's someone waiting
		 * on us, or the quiesce thread has handed off a txg to
		 * us, or we have reached our timeout.
		 */
		/* 'delta' is how long the previous spa_sync() took, in ticks. */
		timer = (delta >= timeout ? 0 : timeout - delta);
		while (!dsl_scan_active(dp->dp_scan) &&
		    !tx->tx_exiting && timer > 0 &&
		    tx->tx_synced_txg >= tx->tx_sync_txg_waiting &&
		    tx->tx_quiesced_txg == 0) {
			dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n",
			    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
			txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer);
			delta = ddi_get_lbolt() - start;
			timer = (delta > timeout ? 0 : timeout - delta);
		}

		/*
		 * Wait until the quiesce thread hands off a txg to us,
		 * prompting it to do so if necessary.
		 */
		while (!tx->tx_exiting && tx->tx_quiesced_txg == 0) {
			if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1)
				tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1;
			cv_broadcast(&tx->tx_quiesce_more_cv);
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0);
		}

		/* txg_thread_exit() terminates this thread. */
		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_sync_thread);

		/*
		 * Consume the quiesced txg which has been handed off to
		 * us.  This may cause the quiescing thread to now be
		 * able to quiesce another txg, so we must signal it.
		 */
		txg = tx->tx_quiesced_txg;
		tx->tx_quiesced_txg = 0;
		tx->tx_syncing_txg = txg;
		cv_broadcast(&tx->tx_quiesce_more_cv);

		/* Snapshot the vdev stats before the sync for the I/O deltas. */
		th = dsl_pool_txg_history_get(dp, txg);
		th->th_kstat.state = TXG_STATE_SYNCING;
		vdev_get_stats(spa->spa_root_vdev, &th->th_vs1);
		dsl_pool_txg_history_put(th);

		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
		/* spa_sync() runs without tx_sync_lock held. */
		mutex_exit(&tx->tx_sync_lock);

		start = ddi_get_lbolt();
		hrstart = gethrtime();
		spa_sync(spa, txg);
		delta = ddi_get_lbolt() - start;

		mutex_enter(&tx->tx_sync_lock);
		tx->tx_synced_txg = txg;
		tx->tx_syncing_txg = 0;
		cv_broadcast(&tx->tx_sync_done_cv);

		/*
		 * Dispatch commit callbacks to worker threads.
		 */
		txg_dispatch_callbacks(dp, txg);

		/*
		 * Measure the txg sync time and determine the amount of
		 * I/O done.
		 */
		th = dsl_pool_txg_history_get(dp, txg);
		vdev_get_stats(spa->spa_root_vdev, &th->th_vs2);
		th->th_kstat.sync_time = gethrtime() - hrstart;
		th->th_kstat.nread = th->th_vs2.vs_bytes[ZIO_TYPE_READ] -
		    th->th_vs1.vs_bytes[ZIO_TYPE_READ];
		th->th_kstat.nwritten = th->th_vs2.vs_bytes[ZIO_TYPE_WRITE] -
		    th->th_vs1.vs_bytes[ZIO_TYPE_WRITE];
		th->th_kstat.reads = th->th_vs2.vs_ops[ZIO_TYPE_READ] -
		    th->th_vs1.vs_ops[ZIO_TYPE_READ];
		th->th_kstat.writes = th->th_vs2.vs_ops[ZIO_TYPE_WRITE] -
		    th->th_vs1.vs_ops[ZIO_TYPE_WRITE];
		th->th_kstat.state = TXG_STATE_COMMITTED;
		dsl_pool_txg_history_put(th);
	}
}

/*
 * Body of the quiesce thread.  Loops forever: waits until someone wants a
 * newer open txg and the previously quiesced txg has been consumed, quiesces
 * the open txg with tx_sync_lock dropped, then hands it to the sync thread.
 * Leaves through txg_thread_exit() once tx_exiting is set.
 */
static void
txg_quiesce_thread(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	callb_cpr_t cpr;

	/* Returns with tx_sync_lock held. */
	txg_thread_enter(tx, &cpr);

	for (;;) {
		uint64_t txg;

		/*
		 * We quiesce when there's someone waiting on us.
		 * However, we can only have one txg in "quiescing" or
		 * "quiesced, waiting to sync" state.  So we wait until
		 * the "quiesced, waiting to sync" txg has been consumed
		 * by the sync thread.
		 */
		while (!tx->tx_exiting &&
		    (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting ||
		    tx->tx_quiesced_txg != 0))
			txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0);

		if (tx->tx_exiting)
			txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread);

		txg = tx->tx_open_txg;
		dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
		    txg, tx->tx_quiesce_txg_waiting,
		    tx->tx_sync_txg_waiting);
		/* txg_quiesce() runs without tx_sync_lock held. */
		mutex_exit(&tx->tx_sync_lock);
		txg_quiesce(dp, txg);
		mutex_enter(&tx->tx_sync_lock);

		/*
		 * Hand this txg off to the sync thread.
		 */
		dprintf("quiesce done, handing off txg %llu\n", txg);
		tx->tx_quiesced_txg = txg;
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_broadcast(&tx->tx_quiesce_done_cv);
	}
}

/*
 * Delay this thread by 'ticks' if we are still in the open transaction
 * group and there is already a waiting txg quiescing or quiesced.  Abort
 * the delay if this txg stalls or enters the quiescing state.
 */
void
txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks)
{
	tx_state_t *tx = &dp->dp_tx;
	clock_t timeout = ddi_get_lbolt() + ticks;

	/* don't delay if this txg could transition to quiescing immediately */
	/* This first check is made without tx_sync_lock held. */
	if (tx->tx_open_txg > txg ||
	    tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1)
		return;

	mutex_enter(&tx->tx_sync_lock);
	/* Re-check now that the lock is held. */
	if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) {
		mutex_exit(&tx->tx_sync_lock);
		return;
	}

	/* 'timeout' is an absolute lbolt value. */
	while (ddi_get_lbolt() < timeout &&
	    tx->tx_syncing_txg < txg-1 && !txg_stalled(dp))
		(void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock,
		    timeout);

	DMU_TX_STAT_BUMP(dmu_tx_delay);

	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Block until 'txg' has been synced.  A 'txg' of 0 means the currently open
 * txg plus TXG_DEFER_SIZE.  Raises tx_sync_txg_waiting if needed and keeps
 * prodding the sync thread until tx_synced_txg reaches the target.
 */
void
txg_wait_synced(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	mutex_enter(&tx->tx_sync_lock);
	ASSERT(tx->tx_threads == 2);
	if (txg == 0)
		txg = tx->tx_open_txg + TXG_DEFER_SIZE;
	if (tx->tx_sync_txg_waiting < txg)
		tx->tx_sync_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_synced_txg < txg) {
		dprintf("broadcasting sync more "
		    "tx_synced=%llu waiting=%llu dp=%p\n",
		    tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp);
		cv_broadcast(&tx->tx_sync_more_cv);
		cv_wait(&tx->tx_sync_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Block until 'txg' is the open txg (or an earlier one no longer is).  A
 * 'txg' of 0 means the txg after the currently open one.  Raises
 * tx_quiesce_txg_waiting if needed and keeps prodding the quiesce thread
 * until tx_open_txg reaches the target.
 */
void
txg_wait_open(dsl_pool_t *dp, uint64_t txg)
{
	tx_state_t *tx = &dp->dp_tx;

	mutex_enter(&tx->tx_sync_lock);
	ASSERT(tx->tx_threads == 2);
	if (txg == 0)
		txg = tx->tx_open_txg + 1;
	if (tx->tx_quiesce_txg_waiting < txg)
		tx->tx_quiesce_txg_waiting = txg;
	dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n",
	    txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting);
	while (tx->tx_open_txg < txg) {
		cv_broadcast(&tx->tx_quiesce_more_cv);
		cv_wait(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock);
	}
	mutex_exit(&tx->tx_sync_lock);
}

/*
 * Returns true when a txg newer than the currently open one has been
 * requested (tx_quiesce_txg_waiting > tx_open_txg).  Reads the fields
 * without taking tx_sync_lock itself.
 */
boolean_t
txg_stalled(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;
	return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg);
}

/*
 * Returns true when tx_syncing_txg is not past tx_sync_txg_waiting or a
 * quiesced txg is waiting to be picked up by the sync thread.  Reads the
 * fields without taking tx_sync_lock itself.
 */
boolean_t
txg_sync_waiting(dsl_pool_t *dp)
{
	tx_state_t *tx = &dp->dp_tx;

	return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting ||
	    tx->tx_quiesced_txg != 0);
}

/*
 * Per-txg object lists.
 */
/*
 * Initialise 'tl' as an empty list for every txg slot.  'offset' is the byte
 * offset of the embedded txg_node_t within the objects that will be linked.
 */
void
txg_list_create(txg_list_t *tl, size_t offset)
{
	int t;

	mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL);

	tl->tl_offset = offset;

	for (t = 0; t < TXG_SIZE; t++)
		tl->tl_head[t] = NULL;
}

/*
 * Tear down 'tl'.  Every txg slot is asserted to be empty.
 */
void
txg_list_destroy(txg_list_t *tl)
{
	int t;

	for (t = 0; t < TXG_SIZE; t++)
		ASSERT(txg_list_empty(tl, t));

	mutex_destroy(&tl->tl_lock);
}

/*
 * Returns true if the list has no entries for 'txg'.  Does not take tl_lock.
 */
boolean_t
txg_list_empty(txg_list_t *tl, uint64_t txg)
{
	return (tl->tl_head[txg & TXG_MASK] == NULL);
}

/*
 * Add an entry to the list.
 * Returns 0 if it's a new entry, 1 if it's already there.
*/ int txg_list_add(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); int already_on_list; mutex_enter(&tl->tl_lock); already_on_list = tn->tn_member[t]; if (!already_on_list) { tn->tn_member[t] = 1; tn->tn_next[t] = tl->tl_head[t]; tl->tl_head[t] = tn; } mutex_exit(&tl->tl_lock); return (already_on_list); } /* * Add an entry to the end of the list (walks list to find end). * Returns 0 if it's a new entry, 1 if it's already there. */ int txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); int already_on_list; mutex_enter(&tl->tl_lock); already_on_list = tn->tn_member[t]; if (!already_on_list) { txg_node_t **tp; for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) continue; tn->tn_member[t] = 1; tn->tn_next[t] = NULL; *tp = tn; } mutex_exit(&tl->tl_lock); return (already_on_list); } /* * Remove the head of the list and return it. */ void * txg_list_remove(txg_list_t *tl, uint64_t txg) { int t = txg & TXG_MASK; txg_node_t *tn; void *p = NULL; mutex_enter(&tl->tl_lock); if ((tn = tl->tl_head[t]) != NULL) { p = (char *)tn - tl->tl_offset; tl->tl_head[t] = tn->tn_next[t]; tn->tn_next[t] = NULL; tn->tn_member[t] = 0; } mutex_exit(&tl->tl_lock); return (p); } /* * Remove a specific item from the list and return it. 
*/
void *
txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
{
	const int slot = txg & TXG_MASK;
	txg_node_t **link;
	void *found = NULL;

	mutex_enter(&tl->tl_lock);

	link = &tl->tl_head[slot];
	while (*link != NULL) {
		txg_node_t *node = *link;

		if ((char *)node - tl->tl_offset == p) {
			/* Unlink the node and clear its membership flag. */
			*link = node->tn_next[slot];
			node->tn_next[slot] = NULL;
			node->tn_member[slot] = 0;
			found = p;
			break;
		}
		link = &node->tn_next[slot];
	}

	mutex_exit(&tl->tl_lock);

	return (found);
}

/*
 * Report the membership flag of 'p' for 'txg'.  The flag is read without
 * taking tl_lock.
 */
int
txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
{
	txg_node_t *node = (txg_node_t *)((char *)p + tl->tl_offset);

	return (node->tn_member[txg & TXG_MASK]);
}

/*
 * Walk a txg list -- only safe if you know it's not changing.
 *
 * txg_list_head() yields the first object on the list for 'txg' and
 * txg_list_next() the object following 'p'; both yield NULL at the end.
 */
void *
txg_list_head(txg_list_t *tl, uint64_t txg)
{
	txg_node_t *first = tl->tl_head[txg & TXG_MASK];

	if (first == NULL)
		return (NULL);

	return ((char *)first - tl->tl_offset);
}

void *
txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
{
	txg_node_t *node = (txg_node_t *)((char *)p + tl->tl_offset);
	txg_node_t *succ = node->tn_next[txg & TXG_MASK];

	if (succ == NULL)
		return (NULL);

	return ((char *)succ - tl->tl_offset);
}

#if defined(_KERNEL) && defined(HAVE_SPL)
/* Symbols made available to other kernel modules. */
EXPORT_SYMBOL(txg_init);
EXPORT_SYMBOL(txg_fini);
EXPORT_SYMBOL(txg_sync_start);
EXPORT_SYMBOL(txg_sync_stop);
EXPORT_SYMBOL(txg_hold_open);
EXPORT_SYMBOL(txg_rele_to_quiesce);
EXPORT_SYMBOL(txg_rele_to_sync);
EXPORT_SYMBOL(txg_register_callbacks);
EXPORT_SYMBOL(txg_delay);
EXPORT_SYMBOL(txg_wait_synced);
EXPORT_SYMBOL(txg_wait_open);
EXPORT_SYMBOL(txg_wait_callbacks);
EXPORT_SYMBOL(txg_stalled);
EXPORT_SYMBOL(txg_sync_waiting);

module_param(zfs_txg_timeout, int, 0644);
MODULE_PARM_DESC(zfs_txg_timeout, "Max seconds worth of delta per txg");
#endif