diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c
index c8bc4424f857..0419f3fab27a 100644
--- a/module/zfs/dsl_userhold.c
+++ b/module/zfs/dsl_userhold.c
@@ -1,537 +1,537 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

typedef struct dsl_dataset_user_hold_arg {
	nvlist_t *dduha_holds;
	nvlist_t *dduha_errlist;
	minor_t dduha_minor;
} dsl_dataset_user_hold_arg_t;

/*
 * If you add new checks here, you may need to add additional checks to the
 * "temporary" case in snapshot_check() in dmu_objset.c.
 */
int
dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag,
    boolean_t temphold, dmu_tx_t *tx)
{
	dsl_pool_t *dp = dmu_tx_pool(tx);
	objset_t *mos = dp->dp_meta_objset;
	int error = 0;

	if (strlen(htag) > MAXNAMELEN)
		return (E2BIG);
	/* Tempholds have a more restricted length */
	if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN)
		return (E2BIG);

	/* tags must be unique (if ds already exists) */
	if (ds != NULL) {
		mutex_enter(&ds->ds_lock);
		if (ds->ds_phys->ds_userrefs_obj != 0) {
			uint64_t value;

			error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj,
			    htag, 8, 1, &value);
			if (error == 0)
				error = EEXIST;
			else if (error == ENOENT)
				error = 0;
		}
		mutex_exit(&ds->ds_lock);
	}

	return (error);
}

static int
dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_user_hold_arg_t *dduha = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;
	int rv = 0;

	if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS)
		return (ENOTSUP);

	for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
		int error = 0;
		dsl_dataset_t *ds;
		char *htag;

		/* must be a snapshot */
		if (strchr(nvpair_name(pair), '@') == NULL)
			error = EINVAL;

		if (error == 0)
			error = nvpair_value_string(pair, &htag);
		if (error == 0) {
			error = dsl_dataset_hold(dp,
			    nvpair_name(pair), FTAG, &ds);
		}
		if (error == 0) {
			error = dsl_dataset_user_hold_check_one(ds, htag,
			    dduha->dduha_minor != 0, tx);
			dsl_dataset_rele(ds, FTAG);
		}

		if (error != 0) {
			rv = error;
			fnvlist_add_int32(dduha->dduha_errlist,
			    nvpair_name(pair), error);
		}
	}
	return (rv);
}

void
dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag,
    minor_t minor, uint64_t now, dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj;

	mutex_enter(&ds->ds_lock);
	if (ds->ds_phys->ds_userrefs_obj == 0) {
		/*
		 * This is the first user hold for this dataset.  Create
		 * the userrefs zap object.
		 */
		dmu_buf_will_dirty(ds->ds_dbuf, tx);
		zapobj = ds->ds_phys->ds_userrefs_obj =
		    zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx);
	} else {
		zapobj = ds->ds_phys->ds_userrefs_obj;
	}
	ds->ds_userrefs++;
	mutex_exit(&ds->ds_lock);

	VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx));

	if (minor != 0) {
		VERIFY0(dsl_pool_user_hold(dp, ds->ds_object,
		    htag, now, tx));
		dsl_register_onexit_hold_cleanup(ds, htag, minor);
	}

	spa_history_log_internal_ds(ds, "hold", tx,
	    "tag=%s temp=%d refs=%llu",
	    htag, minor != 0, ds->ds_userrefs);
}

static void
dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_user_hold_arg_t *dduha = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;
	uint64_t now = gethrestime_sec();

	for (pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) {
		dsl_dataset_t *ds;

		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
		dsl_dataset_user_hold_sync_one(ds, fnvpair_value_string(pair),
		    dduha->dduha_minor, now, tx);
		dsl_dataset_rele(ds, FTAG);
	}
}

/*
 * holds is nvl of snapname -> holdname
 * errlist will be filled in with snapname -> error
 * if cleanup_minor is not 0, the holds will be temporary, cleaned up
 * when the process exits.
 *
 * if any fails, all will fail.
 */
int
dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist)
{
	dsl_dataset_user_hold_arg_t dduha;
	nvpair_t *pair;

	pair = nvlist_next_nvpair(holds, NULL);
	if (pair == NULL)
		return (0);

	dduha.dduha_holds = holds;
	dduha.dduha_errlist = errlist;
	dduha.dduha_minor = cleanup_minor;

	return (dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check,
	    dsl_dataset_user_hold_sync, &dduha, fnvlist_num_pairs(holds)));
}
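
[Reference sketch, not part of the patch: per the comment above, dsl_dataset_user_hold() takes an nvlist mapping full snapshot names to hold tags. The snapshot name "pool/fs@snap" and tag "my-tag" below are made up for illustration, and error handling beyond the errlist is elided.]

/* Illustrative only; names are hypothetical. */
static int
example_take_hold(void)
{
	nvlist_t *holds = fnvlist_alloc();
	nvlist_t *errlist = fnvlist_alloc();
	int error;

	/* One pair per snapshot: snapshot name -> hold tag. */
	fnvlist_add_string(holds, "pool/fs@snap", "my-tag");

	/* minor == 0 requests a permanent hold rather than a temporary one. */
	error = dsl_dataset_user_hold(holds, 0, errlist);

	/* On failure, errlist maps each failing snapshot name to its errno. */
	fnvlist_free(errlist);
	fnvlist_free(holds);
	return (error);
}
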
typedef struct dsl_dataset_user_release_arg {
	nvlist_t *ddura_holds;
	nvlist_t *ddura_todelete;
	nvlist_t *ddura_errlist;
} dsl_dataset_user_release_arg_t;

static int
dsl_dataset_user_release_check_one(dsl_dataset_t *ds, nvlist_t *holds,
    boolean_t *todelete)
{
	uint64_t zapobj;
	nvpair_t *pair;
	objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
	int error;
	int numholds = 0;

	*todelete = B_FALSE;

	if (!dsl_dataset_is_snapshot(ds))
		return (EINVAL);

	zapobj = ds->ds_phys->ds_userrefs_obj;
	if (zapobj == 0)
		return (ESRCH);

	for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(holds, pair)) {
		/* Make sure the hold exists */
		uint64_t tmp;
		error = zap_lookup(mos, zapobj, nvpair_name(pair), 8, 1, &tmp);
		if (error == ENOENT)
			error = ESRCH;
		if (error != 0)
			return (error);
		numholds++;
	}

	if (DS_IS_DEFER_DESTROY(ds) && ds->ds_phys->ds_num_children == 1 &&
	    ds->ds_userrefs == numholds) {
		/* we need to destroy the snapshot as well */
		if (dsl_dataset_long_held(ds))
			return (EBUSY);
		*todelete = B_TRUE;
	}
	return (0);
}

static int
dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_user_release_arg_t *ddura = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;
	int rv = 0;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) {
		const char *name = nvpair_name(pair);
		int error;
		dsl_dataset_t *ds;
		nvlist_t *holds;

		error = nvpair_value_nvlist(pair, &holds);
		if (error != 0)
			return (EINVAL);
		error = dsl_dataset_hold(dp, name, FTAG, &ds);
		if (error == 0) {
			boolean_t deleteme;
			error = dsl_dataset_user_release_check_one(ds,
			    holds, &deleteme);
			if (error == 0 && deleteme) {
				fnvlist_add_boolean(ddura->ddura_todelete,
				    name);
			}
			dsl_dataset_rele(ds, FTAG);
		}
		if (error != 0) {
			if (ddura->ddura_errlist != NULL) {
				fnvlist_add_int32(ddura->ddura_errlist,
				    name, error);
			}
			rv = error;
		}
	}
	return (rv);
}

static void
dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds,
    dmu_tx_t *tx)
{
	dsl_pool_t *dp = ds->ds_dir->dd_pool;
	objset_t *mos = dp->dp_meta_objset;
	uint64_t zapobj;
	int error;
	nvpair_t *pair;

	for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(holds, pair)) {
		ds->ds_userrefs--;
		error = dsl_pool_user_release(dp, ds->ds_object,
		    nvpair_name(pair), tx);
		VERIFY(error == 0 || error == ENOENT);
		zapobj = ds->ds_phys->ds_userrefs_obj;
		VERIFY0(zap_remove(mos, zapobj, nvpair_name(pair), tx));

		spa_history_log_internal_ds(ds, "release", tx,
		    "tag=%s refs=%lld", nvpair_name(pair),
		    (longlong_t)ds->ds_userrefs);
	}
}

static void
dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_user_release_arg_t *ddura = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	nvpair_t *pair;

	for (pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); pair != NULL;
	    pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) {
		dsl_dataset_t *ds;

		VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds));
		dsl_dataset_user_release_sync_one(ds,
		    fnvpair_value_nvlist(pair), tx);
		if (nvlist_exists(ddura->ddura_todelete,
		    nvpair_name(pair))) {
			ASSERT(ds->ds_userrefs == 0 &&
			    ds->ds_phys->ds_num_children == 1 &&
			    DS_IS_DEFER_DESTROY(ds));
			dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
		}
		dsl_dataset_rele(ds, FTAG);
	}
}

/*
 * holds is nvl of snapname -> { holdname, ... }
 * errlist will be filled in with snapname -> error
 *
 * if any fails, all will fail.
 */
int
dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist)
{
	dsl_dataset_user_release_arg_t ddura;
	nvpair_t *pair;
	int error;

	pair = nvlist_next_nvpair(holds, NULL);
	if (pair == NULL)
		return (0);

	ddura.ddura_holds = holds;
	ddura.ddura_errlist = errlist;
	ddura.ddura_todelete = fnvlist_alloc();

	error = dsl_sync_task(nvpair_name(pair),
	    dsl_dataset_user_release_check, dsl_dataset_user_release_sync,
	    &ddura, fnvlist_num_pairs(holds));
	fnvlist_free(ddura.ddura_todelete);
	return (error);
}
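
[Reference sketch, not part of the patch: the release path takes a nested nvlist, snapshot name -> { hold tag, ... }, per the comment above. Boolean pairs name the tags, matching how dsl_dataset_user_release_tmp() builds its own inner list below; the names are again hypothetical.]

/* Illustrative only; names are hypothetical. */
static int
example_release_hold(void)
{
	nvlist_t *holds = fnvlist_alloc();
	nvlist_t *tags = fnvlist_alloc();
	nvlist_t *errlist = fnvlist_alloc();
	int error;

	/* The inner list names the tags to drop; only the names matter. */
	fnvlist_add_boolean(tags, "my-tag");
	fnvlist_add_nvlist(holds, "pool/fs@snap", tags);

	error = dsl_dataset_user_release(holds, errlist);

	fnvlist_free(errlist);
	fnvlist_free(tags);
	fnvlist_free(holds);
	return (error);
}
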
typedef struct dsl_dataset_user_release_tmp_arg {
	uint64_t ddurta_dsobj;
	nvlist_t *ddurta_holds;
	boolean_t ddurta_deleteme;
} dsl_dataset_user_release_tmp_arg_t;

static int
dsl_dataset_user_release_tmp_check(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_user_release_tmp_arg_t *ddurta = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;
	int error;

	if (!dmu_tx_is_syncing(tx))
		return (0);

	error = dsl_dataset_hold_obj(dp, ddurta->ddurta_dsobj, FTAG, &ds);
	if (error)
		return (error);

	error = dsl_dataset_user_release_check_one(ds,
	    ddurta->ddurta_holds, &ddurta->ddurta_deleteme);
	dsl_dataset_rele(ds, FTAG);
	return (error);
}

static void
dsl_dataset_user_release_tmp_sync(void *arg, dmu_tx_t *tx)
{
	dsl_dataset_user_release_tmp_arg_t *ddurta = arg;
	dsl_pool_t *dp = dmu_tx_pool(tx);
	dsl_dataset_t *ds;

	VERIFY0(dsl_dataset_hold_obj(dp, ddurta->ddurta_dsobj, FTAG, &ds));
	dsl_dataset_user_release_sync_one(ds, ddurta->ddurta_holds, tx);
	if (ddurta->ddurta_deleteme) {
		ASSERT(ds->ds_userrefs == 0 &&
		    ds->ds_phys->ds_num_children == 1 &&
		    DS_IS_DEFER_DESTROY(ds));
		dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx);
	}
	dsl_dataset_rele(ds, FTAG);
}

/*
 * Called at spa_load time to release a stale temporary user hold.
 * Also called by the onexit code.
 */
void
dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, const char *htag)
{
	dsl_dataset_user_release_tmp_arg_t ddurta;

#ifdef _KERNEL
	dsl_dataset_t *ds;
	int error;

	/* Make sure it is not mounted. */
	dsl_pool_config_enter(dp, FTAG);
	error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
	if (error == 0) {
		char name[MAXNAMELEN];
		dsl_dataset_name(ds, name);
		dsl_dataset_rele(ds, FTAG);
		dsl_pool_config_exit(dp, FTAG);
		zfs_unmount_snap(name);
	} else {
		dsl_pool_config_exit(dp, FTAG);
	}
#endif

	ddurta.ddurta_dsobj = dsobj;
	ddurta.ddurta_holds = fnvlist_alloc();
	fnvlist_add_boolean(ddurta.ddurta_holds, htag);

	(void) dsl_sync_task(spa_name(dp->dp_spa),
	    dsl_dataset_user_release_tmp_check,
	    dsl_dataset_user_release_tmp_sync, &ddurta, 1);
	fnvlist_free(ddurta.ddurta_holds);
}

typedef struct zfs_hold_cleanup_arg {
	char zhca_spaname[MAXNAMELEN];
	uint64_t zhca_spa_load_guid;
	uint64_t zhca_dsobj;
	char zhca_htag[MAXNAMELEN];
} zfs_hold_cleanup_arg_t;

static void
dsl_dataset_user_release_onexit(void *arg)
{
	zfs_hold_cleanup_arg_t *ca = arg;
	spa_t *spa;
	int error;

	error = spa_open(ca->zhca_spaname, &spa, FTAG);
	if (error != 0) {
		zfs_dbgmsg("couldn't release hold on pool=%s ds=%llu tag=%s "
		    "because pool is no longer loaded",
		    ca->zhca_spaname, ca->zhca_dsobj, ca->zhca_htag);
		return;
	}
	if (spa_load_guid(spa) != ca->zhca_spa_load_guid) {
		zfs_dbgmsg("couldn't release hold on pool=%s ds=%llu tag=%s "
		    "because pool is no longer loaded (guid doesn't match)",
		    ca->zhca_spaname, ca->zhca_dsobj, ca->zhca_htag);
		spa_close(spa, FTAG);
		return;
	}

	dsl_dataset_user_release_tmp(spa_get_dsl(spa),
	    ca->zhca_dsobj, ca->zhca_htag);
	kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t));
	spa_close(spa, FTAG);
}

void
dsl_register_onexit_hold_cleanup(dsl_dataset_t *ds, const char *htag,
    minor_t minor)
{
-	zfs_hold_cleanup_arg_t *ca = kmem_alloc(sizeof (*ca), KM_SLEEP);
+	zfs_hold_cleanup_arg_t *ca = kmem_alloc(sizeof (*ca), KM_PUSHPAGE);
	spa_t *spa = dsl_dataset_get_spa(ds);

	(void) strlcpy(ca->zhca_spaname, spa_name(spa),
	    sizeof (ca->zhca_spaname));
	ca->zhca_spa_load_guid = spa_load_guid(spa);
	ca->zhca_dsobj = ds->ds_object;
	(void) strlcpy(ca->zhca_htag, htag, sizeof (ca->zhca_htag));
	VERIFY0(zfs_onexit_add_cb(minor,
	    dsl_dataset_user_release_onexit, ca, NULL));
}

int
dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;
	int err;

	err = dsl_pool_hold(dsname, FTAG, &dp);
	if (err != 0)
		return (err);
	err = dsl_dataset_hold(dp, dsname, FTAG, &ds);
	if (err != 0) {
		dsl_pool_rele(dp, FTAG);
		return (err);
	}

	if (ds->ds_phys->ds_userrefs_obj != 0) {
		zap_attribute_t *za;
		zap_cursor_t zc;

-		za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+		za = kmem_alloc(sizeof (zap_attribute_t), KM_PUSHPAGE);
		for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
		    ds->ds_phys->ds_userrefs_obj);
		    zap_cursor_retrieve(&zc, za) == 0;
		    zap_cursor_advance(&zc)) {
			fnvlist_add_uint64(nvl, za->za_name,
			    za->za_first_integer);
		}
		zap_cursor_fini(&zc);
		kmem_free(za, sizeof (zap_attribute_t));
	}
	dsl_dataset_rele(ds, FTAG);
	dsl_pool_rele(dp, FTAG);
	return (0);
}
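
[Reference sketch, not part of the patch: dsl_dataset_get_holds() above fills an nvlist with one uint64 pair per hold, tag name -> hold creation time in seconds, which a caller can walk with the usual nvpair iteration. The snapshot name is hypothetical.]

/* Illustrative only; "pool/fs@snap" is hypothetical. */
static void
example_list_holds(void)
{
	nvlist_t *nvl = fnvlist_alloc();
	nvpair_t *pair;

	if (dsl_dataset_get_holds("pool/fs@snap", nvl) == 0) {
		for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL;
		    pair = nvlist_next_nvpair(nvl, pair)) {
			/* tag name -> hold creation time (seconds) */
			zfs_dbgmsg("hold %s at %llu", nvpair_name(pair),
			    (u_longlong_t)fnvpair_value_uint64(pair));
		}
	}
	fnvlist_free(nvl);
}
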
diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c
index 8e80166c7d14..357afbfa5481 100644
--- a/module/zfs/rrwlock.c
+++ b/module/zfs/rrwlock.c
@@ -1,288 +1,288 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */
/*
 * Copyright (c) 2012 by Delphix. All rights reserved.
 */

#include
#include

/*
 * This file contains the implementation of a re-entrant read
 * reader/writer lock (aka "rrwlock").
 *
 * This is a normal reader/writer lock with the additional feature
 * of allowing threads who have already obtained a read lock to
 * re-enter another read lock (re-entrant read) - even if there are
 * waiting writers.
 *
 * Callers who have not obtained a read lock give waiting writers priority.
 *
 * The rrwlock_t lock does not allow re-entrant writers, nor does it
 * allow a re-entrant mix of reads and writes (that is, it does not
 * allow a caller who has already obtained a read lock to be able to
 * then grab a write lock without first dropping all read locks, and
 * vice versa).
 *
 * The rrwlock_t uses tsd (thread specific data) to keep a list of
 * nodes (rrw_node_t), where each node keeps track of which specific
 * lock (rrw_node_t::rn_rrl) the thread has grabbed.  Since re-entering
 * should be rare, a thread that grabs multiple reads on the same rrwlock_t
 * will store multiple rrw_node_ts of the same 'rrn_rrl'.  Nodes on the
 * tsd list can represent a different rrwlock_t.  This allows a thread
 * to enter multiple and unique rrwlock_ts for read locks at the same time.
 *
 * Since using tsd exposes some overhead, the rrwlock_t only needs to
 * keep tsd data when writers are waiting.  If no writers are waiting, then
 * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd
 * is needed.  Once a writer attempts to grab the lock, readers then
 * keep tsd data and bump the linked readers count (rr_linked_rcount).
 *
 * If there are waiting writers and there are anonymous readers, then a
 * reader doesn't know if it is a re-entrant lock.  But since it may be one,
 * we allow the read to proceed (otherwise it could deadlock).  Since once
 * waiting writers are active, readers no longer bump the anonymous count,
 * the anonymous readers will eventually flush themselves out.  At this point,
 * readers will be able to tell if they are a re-entrant lock (have a
 * rrw_node_t entry for the lock) or not.  If they are a re-entrant lock, then
 * we must let them proceed.  If they are not, then the reader blocks for the
 * waiting writers.  Hence, we do not starve writers.
 */

/* global key for TSD */
uint_t rrw_tsd_key;

typedef struct rrw_node {
	struct rrw_node *rn_next;
	rrwlock_t *rn_rrl;
	void *rn_tag;
} rrw_node_t;

static rrw_node_t *
rrn_find(rrwlock_t *rrl)
{
	rrw_node_t *rn;

	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (NULL);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl)
			return (rn);
	}
	return (NULL);
}

/*
 * Add a node to the head of the singly linked list.
 */
static void
rrn_add(rrwlock_t *rrl, void *tag)
{
	rrw_node_t *rn;

-	rn = kmem_alloc(sizeof (*rn), KM_SLEEP);
+	rn = kmem_alloc(sizeof (*rn), KM_PUSHPAGE);
	rn->rn_rrl = rrl;
	rn->rn_next = tsd_get(rrw_tsd_key);
	rn->rn_tag = tag;
	VERIFY(tsd_set(rrw_tsd_key, rn) == 0);
}

/*
 * If a node is found for 'rrl', then remove the node from this
 * thread's list and return TRUE; otherwise return FALSE.
 */
static boolean_t
rrn_find_and_remove(rrwlock_t *rrl, void *tag)
{
	rrw_node_t *rn;
	rrw_node_t *prev = NULL;

	if (refcount_count(&rrl->rr_linked_rcount) == 0)
		return (B_FALSE);

	for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) {
		if (rn->rn_rrl == rrl && rn->rn_tag == tag) {
			if (prev)
				prev->rn_next = rn->rn_next;
			else
				VERIFY(tsd_set(rrw_tsd_key,
				    rn->rn_next) == 0);
			kmem_free(rn, sizeof (*rn));
			return (B_TRUE);
		}
		prev = rn;
	}
	return (B_FALSE);
}

void
rrw_init(rrwlock_t *rrl, boolean_t track_all)
{
	mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL);
	rrl->rr_writer = NULL;
	refcount_create(&rrl->rr_anon_rcount);
	refcount_create(&rrl->rr_linked_rcount);
	rrl->rr_writer_wanted = B_FALSE;
	rrl->rr_track_all = track_all;
}

void
rrw_destroy(rrwlock_t *rrl)
{
	mutex_destroy(&rrl->rr_lock);
	cv_destroy(&rrl->rr_cv);
	ASSERT(rrl->rr_writer == NULL);
	refcount_destroy(&rrl->rr_anon_rcount);
	refcount_destroy(&rrl->rr_linked_rcount);
}

void
rrw_enter_read(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted &&
	    !rrl->rr_track_all) {
		rrl->rr_anon_rcount.rc_count++;
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__rdmiss);
#endif
	ASSERT(rrl->rr_writer != curthread);
	ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0);

	while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted &&
	    refcount_is_zero(&rrl->rr_anon_rcount) &&
	    rrn_find(rrl) == NULL))
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);

	if (rrl->rr_writer_wanted || rrl->rr_track_all) {
		/* may or may not be a re-entrant enter */
		rrn_add(rrl, tag);
		(void) refcount_add(&rrl->rr_linked_rcount, tag);
	} else {
		(void) refcount_add(&rrl->rr_anon_rcount, tag);
	}
	ASSERT(rrl->rr_writer == NULL);
	mutex_exit(&rrl->rr_lock);
}

void
rrw_enter_write(rrwlock_t *rrl)
{
	mutex_enter(&rrl->rr_lock);
	ASSERT(rrl->rr_writer != curthread);

	while (refcount_count(&rrl->rr_anon_rcount) > 0 ||
	    refcount_count(&rrl->rr_linked_rcount) > 0 ||
	    rrl->rr_writer != NULL) {
		rrl->rr_writer_wanted = B_TRUE;
		cv_wait(&rrl->rr_cv, &rrl->rr_lock);
	}
	rrl->rr_writer_wanted = B_FALSE;
	rrl->rr_writer = curthread;
	mutex_exit(&rrl->rr_lock);
}

void
rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag)
{
	if (rw == RW_READER)
		rrw_enter_read(rrl, tag);
	else
		rrw_enter_write(rrl);
}

void
rrw_exit(rrwlock_t *rrl, void *tag)
{
	mutex_enter(&rrl->rr_lock);
#if !defined(DEBUG) && defined(_KERNEL)
	if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) {
		rrl->rr_anon_rcount.rc_count--;
		if (rrl->rr_anon_rcount.rc_count == 0)
			cv_broadcast(&rrl->rr_cv);
		mutex_exit(&rrl->rr_lock);
		return;
	}
	DTRACE_PROBE(zfs__rrwfastpath__exitmiss);
#endif
	ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) ||
	    !refcount_is_zero(&rrl->rr_linked_rcount) ||
	    rrl->rr_writer != NULL);

	if (rrl->rr_writer == NULL) {
		int64_t count;
		if (rrn_find_and_remove(rrl, tag)) {
			count = refcount_remove(&rrl->rr_linked_rcount, tag);
		} else {
			ASSERT(!rrl->rr_track_all);
			count = refcount_remove(&rrl->rr_anon_rcount, tag);
		}
		if (count == 0)
			cv_broadcast(&rrl->rr_cv);
	} else {
		ASSERT(rrl->rr_writer == curthread);
		ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) &&
		    refcount_is_zero(&rrl->rr_linked_rcount));
		rrl->rr_writer = NULL;
		cv_broadcast(&rrl->rr_cv);
	}
	mutex_exit(&rrl->rr_lock);
}

/*
 * If the lock was created with track_all, rrw_held(RW_READER) will return
 * B_TRUE iff the current thread has the lock for reader.  Otherwise it may
 * return B_TRUE if any thread has the lock for reader.
 */
boolean_t
rrw_held(rrwlock_t *rrl, krw_t rw)
{
	boolean_t held;

	mutex_enter(&rrl->rr_lock);
	if (rw == RW_WRITER) {
		held = (rrl->rr_writer == curthread);
	} else {
		held = (!refcount_is_zero(&rrl->rr_anon_rcount) ||
		    rrn_find(rrl) != NULL);
	}
	mutex_exit(&rrl->rr_lock);

	return (held);
}

void
rrw_tsd_destroy(void *arg)
{
	rrw_node_t *rn = arg;
	if (rn != NULL) {
		panic("thread %p terminating with rrw lock %p held",
		    (void *)curthread, (void *)rn->rn_rrl);
	}
}
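
[Reference sketch, not part of the patch: to make the comment at the top of rrwlock.c concrete, this shows the re-entrant read behavior the lock permits and the exclusive write mode. All names are from the file above; the function itself is hypothetical.]

/* Illustrative only. */
static void
example_rrwlock(void)
{
	rrwlock_t lock;

	rrw_init(&lock, B_FALSE);

	rrw_enter(&lock, RW_READER, FTAG);
	/* Re-entering for read is legal, even with writers waiting. */
	rrw_enter(&lock, RW_READER, FTAG);
	ASSERT(rrw_held(&lock, RW_READER));
	rrw_exit(&lock, FTAG);
	rrw_exit(&lock, FTAG);

	/* Writers are exclusive; mixing read and write holds is forbidden. */
	rrw_enter(&lock, RW_WRITER, FTAG);
	ASSERT(rrw_held(&lock, RW_WRITER));
	rrw_exit(&lock, FTAG);

	rrw_destroy(&lock);
}
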
diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c
index 2f60b5e4de15..41ae3fdb98e9 100644
--- a/module/zfs/zfs_onexit.c
+++ b/module/zfs/zfs_onexit.c
@@ -1,247 +1,247 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * ZFS kernel routines may add/delete callback routines to be invoked
 * upon process exit (triggered via the close operation from the /dev/zfs
 * driver).
 *
 * These cleanup callbacks are intended to allow for the accumulation
 * of kernel state across multiple ioctls.  User processes participate
 * simply by opening ZFS_DEV.  This causes the ZFS driver to create
 * some private data for the file descriptor and generate a unique
 * minor number.  The process then passes along that file descriptor to
 * each ioctl that might have a cleanup operation.
 *
 * Consumers of the onexit routines should call zfs_onexit_fd_hold() early
 * on to validate the given fd and add a reference to its file table entry.
 * This allows the consumer to do its work and then add a callback, knowing
 * that zfs_onexit_add_cb() won't fail with EBADF.  When finished, consumers
 * should call zfs_onexit_fd_rele().
 *
 * A simple example is zfs_ioc_recv(), where we might create an AVL tree
 * with dataset/GUID mappings and then reuse that tree on subsequent
 * zfs_ioc_recv() calls.
 *
 * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc()
 * the AVL tree and pass it along with a callback function to
 * zfs_onexit_add_cb().  The zfs_onexit_add_cb() routine will register the
 * callback and return an action handle.
 *
 * The action handle is then passed from user space to subsequent
 * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree
 * by calling zfs_onexit_cb_data() with the device minor number and
 * action handle.
 *
 * If the user process exits abnormally, the callback is invoked implicitly
 * as part of the driver close operation.  Once the user space process is
 * finished with the accumulated kernel state, it can also just call close(2)
 * on the cleanup fd to trigger the cleanup callback.
 */

void
zfs_onexit_init(zfs_onexit_t **zop)
{
	zfs_onexit_t *zo;

	zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP);
	mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL);
	list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t),
	    offsetof(zfs_onexit_action_node_t, za_link));
}

void
zfs_onexit_destroy(zfs_onexit_t *zo)
{
	zfs_onexit_action_node_t *ap;

	mutex_enter(&zo->zo_lock);
	while ((ap = list_head(&zo->zo_actions)) != NULL) {
		list_remove(&zo->zo_actions, ap);
		mutex_exit(&zo->zo_lock);
		ap->za_func(ap->za_data);
		kmem_free(ap, sizeof (zfs_onexit_action_node_t));
		mutex_enter(&zo->zo_lock);
	}
	mutex_exit(&zo->zo_lock);

	list_destroy(&zo->zo_actions);
	mutex_destroy(&zo->zo_lock);
	kmem_free(zo, sizeof (zfs_onexit_t));
}

static int
zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo)
{
	*zo = zfsdev_get_state(minor, ZST_ONEXIT);
	if (*zo == NULL)
		return (EBADF);

	return (0);
}

/*
 * Consumers might need to operate by minor number instead of fd, since
 * they might be running in another thread (e.g. txg_sync_thread).  Callers
 * of this function must call zfs_onexit_fd_rele() when they're finished
 * using the minor number.
 */
int
zfs_onexit_fd_hold(int fd, minor_t *minorp)
{
	file_t *fp;
	zfs_onexit_t *zo;

	fp = getf(fd);
	if (fp == NULL)
		return (EBADF);

	*minorp = zfsdev_getminor(fp->f_file);
	return (zfs_onexit_minor_to_state(*minorp, &zo));
}

void
zfs_onexit_fd_rele(int fd)
{
	releasef(fd);
}

/*
 * Add a callback to be invoked when the calling process exits.
 */
int
zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data,
    uint64_t *action_handle)
{
	zfs_onexit_t *zo;
	zfs_onexit_action_node_t *ap;
	int error;

	error = zfs_onexit_minor_to_state(minor, &zo);
	if (error)
		return (error);

-	ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP);
+	ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_PUSHPAGE);
	list_link_init(&ap->za_link);
	ap->za_func = func;
	ap->za_data = data;

	mutex_enter(&zo->zo_lock);
	list_insert_tail(&zo->zo_actions, ap);
	mutex_exit(&zo->zo_lock);
	if (action_handle)
		*action_handle = (uint64_t)(uintptr_t)ap;

	return (0);
}

static zfs_onexit_action_node_t *
zfs_onexit_find_cb(zfs_onexit_t *zo, uint64_t action_handle)
{
	zfs_onexit_action_node_t *match;
	zfs_onexit_action_node_t *ap;
	list_t *l;

	ASSERT(MUTEX_HELD(&zo->zo_lock));

	match = (zfs_onexit_action_node_t *)(uintptr_t)action_handle;
	l = &zo->zo_actions;
	for (ap = list_head(l); ap != NULL; ap = list_next(l, ap)) {
		if (match == ap)
			break;
	}
	return (ap);
}

/*
 * Delete the callback, triggering it first if 'fire' is set.
 */
int
zfs_onexit_del_cb(minor_t minor, uint64_t action_handle, boolean_t fire)
{
	zfs_onexit_t *zo;
	zfs_onexit_action_node_t *ap;
	int error;

	error = zfs_onexit_minor_to_state(minor, &zo);
	if (error)
		return (error);

	mutex_enter(&zo->zo_lock);
	ap = zfs_onexit_find_cb(zo, action_handle);
	if (ap != NULL) {
		list_remove(&zo->zo_actions, ap);
		mutex_exit(&zo->zo_lock);
		if (fire)
			ap->za_func(ap->za_data);
		kmem_free(ap, sizeof (zfs_onexit_action_node_t));
	} else {
		mutex_exit(&zo->zo_lock);
		error = ENOENT;
	}

	return (error);
}

/*
 * Return the data associated with this callback.  This allows consumers
 * of the cleanup-on-exit interfaces to stash kernel data across system
 * calls, knowing that it will be cleaned up if the calling process exits.
 */
int
zfs_onexit_cb_data(minor_t minor, uint64_t action_handle, void **data)
{
	zfs_onexit_t *zo;
	zfs_onexit_action_node_t *ap;
	int error;

	*data = NULL;

	error = zfs_onexit_minor_to_state(minor, &zo);
	if (error)
		return (error);

	mutex_enter(&zo->zo_lock);
	ap = zfs_onexit_find_cb(zo, action_handle);
	if (ap != NULL)
		*data = ap->za_data;
	else
		error = ENOENT;
	mutex_exit(&zo->zo_lock);

	return (error);
}
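
[Reference sketch, not part of the patch: the consumer pattern described in the zfs_onexit comment block above — hold the fd, register a callback with its state, and let process exit (or an explicit close) fire the cleanup. example_state_free(), example_register_cleanup(), and the uint64_t state blob are hypothetical stand-ins for a real consumer such as zfs_ioc_recv().]

/* Illustrative only; the example_* names are hypothetical. */
static void
example_state_free(void *data)
{
	kmem_free(data, sizeof (uint64_t));
}

static int
example_register_cleanup(int cleanup_fd)
{
	minor_t minor;
	uint64_t action_handle;
	uint64_t *state;
	int error;

	/* Validate the fd and pin its file table entry. */
	error = zfs_onexit_fd_hold(cleanup_fd, &minor);
	if (error != 0)
		return (error);

	/* Accumulated kernel state that must not leak on process exit. */
	state = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);

	/* Freed automatically when the process closes its /dev/zfs fd. */
	error = zfs_onexit_add_cb(minor, example_state_free, state,
	    &action_handle);
	if (error != 0)
		kmem_free(state, sizeof (uint64_t));

	zfs_onexit_fd_rele(cleanup_fd);
	return (error);
}
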