diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h index f3dd9f26f23c..64ab2a8118f5 100644 --- a/include/sys/zvol_impl.h +++ b/include/sys/zvol_impl.h @@ -1,142 +1,142 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ #ifndef _SYS_ZVOL_IMPL_H #define _SYS_ZVOL_IMPL_H #include #define ZVOL_RDONLY (1<<0) /* zvol is readonly (writes rejected) */ #define ZVOL_WRITTEN_TO (1<<1) /* zvol has been written to (needs flush) */ #define ZVOL_EXCL (1<<2) /* zvol has O_EXCL client right now */ #define ZVOL_REMOVING (1<<3) /* zvol waiting to remove minor */ /* * The in-core state of each volume. */ typedef struct zvol_state { char zv_name[MAXNAMELEN]; /* name */ uint64_t zv_volsize; /* advertised space */ uint64_t zv_volblocksize; /* volume block size */ objset_t *zv_objset; /* objset handle */ uint32_t zv_flags; /* ZVOL_* flags */ uint32_t zv_open_count; /* open counts */ uint32_t zv_changed; /* disk changed */ uint32_t zv_volmode; /* volmode */ zilog_t *zv_zilog; /* ZIL handle */ zfs_rangelock_t zv_rangelock; /* for range locking */ dnode_t *zv_dn; /* dnode hold */ dataset_kstats_t zv_kstat; /* zvol kstats */ list_node_t zv_next; /* next zvol_state_t linkage */ uint64_t zv_hash; /* name hash */ struct hlist_node zv_hlink; /* hash link */ kmutex_t zv_state_lock; /* protects zvol_state_t */ atomic_t zv_suspend_ref; /* refcount for suspend */ krwlock_t zv_suspend_lock; /* suspend lock */ kcondvar_t zv_removing_cv; /* ready to remove minor */ struct zvol_state_os *zv_zso; /* private platform state */ boolean_t zv_threading; /* volthreading property */ } zvol_state_t; /* * zvol taskqs */ typedef struct zv_taskq { uint_t tqs_cnt; taskq_t **tqs_taskq; } zv_taskq_t; typedef struct zv_request_stack { zvol_state_t *zv; struct bio *bio; #ifdef __linux__ struct request *rq; #endif } zv_request_t; typedef struct zv_request_task { zv_request_t zvr; taskq_ent_t ent; } zv_request_task_t; /* * Switch taskq at multiple of 512 MB offset. This can be set to a lower value * to utilize more threads for small files but may affect prefetch hits. */ #define ZVOL_TASKQ_OFFSET_SHIFT 29 extern krwlock_t zvol_state_lock; #define ZVOL_HT_SIZE 1024 extern struct hlist_head *zvol_htable; #define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)]) extern zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE]; extern unsigned int zvol_inhibit_dev; extern unsigned int zvol_prefetch_bytes; extern unsigned int zvol_volmode; extern unsigned int zvol_threads; extern unsigned int zvol_num_taskqs; extern unsigned int zvol_request_sync; extern zv_taskq_t zvol_taskqs; /* * platform independent functions exported to platform code */ zvol_state_t *zvol_find_by_name_hash(const char *name, uint64_t hash, int mode); int zvol_first_open(zvol_state_t *zv, boolean_t readonly); uint64_t zvol_name_hash(const char *name); void zvol_last_close(zvol_state_t *zv); void zvol_insert(zvol_state_t *zv); void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len); void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit); int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio); int zvol_init_impl(void); void zvol_fini_impl(void); void zvol_wait_close(zvol_state_t *zv); int zvol_clone_range(zvol_state_handle_t *, uint64_t, zvol_state_handle_t *, uint64_t, uint64_t); void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps); zv_request_task_t *zv_request_task_create(zv_request_t zvr); void zv_request_task_free(zv_request_task_t *task); /* * platform dependent functions exported to platform independent code */ void zvol_os_free(zvol_state_t *zv); int zvol_os_rename_minor(zvol_state_t *zv, const char *newname); int zvol_os_create_minor(const char *name); int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize); boolean_t zvol_os_is_zvol(const char *path); -void zvol_os_clear_private(zvol_state_t *zv); +void zvol_os_remove_minor(zvol_state_t *zv); void zvol_os_set_disk_ro(zvol_state_t *zv, int flags); void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity); #endif diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 265dfd55fc4d..493da08e3a2c 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -1,1637 +1,1623 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * * Copyright (c) 2006-2010 Pawel Jakub Dawidek * All rights reserved. * * Portions Copyright 2010 Robert Milkowski * * Copyright 2011 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2012, 2017 by Delphix. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. * Copyright (c) 2014 Integros [integros.com] - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ /* Portions Copyright 2011 Martin Matuska */ /* * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev/zvol// * * Volumes are persistent through reboot. No user command needs to be * run before opening and using a device. * * On FreeBSD ZVOLs are simply GEOM providers like any other storage device * in the system. Except when they're simply character devices (volmode=dev). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #define ZVOL_DUMPSIZE "dumpsize" #ifdef ZVOL_LOCK_DEBUG #define ZVOL_RW_READER RW_WRITER #define ZVOL_RW_READ_HELD RW_WRITE_HELD #else #define ZVOL_RW_READER RW_READER #define ZVOL_RW_READ_HELD RW_READ_HELD #endif struct zvol_state_os { #define zso_dev _zso_state._zso_dev #define zso_geom _zso_state._zso_geom union { /* volmode=dev */ struct zvol_state_dev { struct cdev *zsd_cdev; struct selinfo zsd_selinfo; } _zso_dev; /* volmode=geom */ struct zvol_state_geom { struct g_provider *zsg_provider; } _zso_geom; } _zso_state; int zso_dying; }; static uint32_t zvol_minors; SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); static boolean_t zpool_on_zvol = B_FALSE; SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, "Allow zpools to use zvols as vdevs (DANGEROUS)"); /* * Toggle unmap functionality. */ boolean_t zvol_unmap_enabled = B_TRUE; SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); /* * zvol maximum transfer in one DMU tx. */ int zvol_maxphys = DMU_MAX_ACCESS / 2; static void zvol_ensure_zilog(zvol_state_t *zv); static d_open_t zvol_cdev_open; static d_close_t zvol_cdev_close; static d_ioctl_t zvol_cdev_ioctl; static d_read_t zvol_cdev_read; static d_write_t zvol_cdev_write; static d_strategy_t zvol_cdev_bio_strategy; static d_kqfilter_t zvol_cdev_kqfilter; static struct cdevsw zvol_cdevsw = { .d_name = "zvol", .d_version = D_VERSION, .d_flags = D_DISK | D_TRACKCLOSE, .d_open = zvol_cdev_open, .d_close = zvol_cdev_close, .d_ioctl = zvol_cdev_ioctl, .d_read = zvol_cdev_read, .d_write = zvol_cdev_write, .d_strategy = zvol_cdev_bio_strategy, .d_kqfilter = zvol_cdev_kqfilter, }; static void zvol_filter_detach(struct knote *kn); static int zvol_filter_vnode(struct knote *kn, long hint); static struct filterops zvol_filterops_vnode = { .f_isfd = 1, .f_detach = zvol_filter_detach, .f_event = zvol_filter_vnode, }; extern uint_t zfs_geom_probe_vdev_key; struct g_class zfs_zvol_class = { .name = "ZFS::ZVOL", .version = G_VERSION, }; DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); static int zvol_geom_open(struct g_provider *pp, int flag, int count); static int zvol_geom_close(struct g_provider *pp, int flag, int count); -static void zvol_geom_destroy(zvol_state_t *zv); static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); static void zvol_geom_bio_start(struct bio *bp); static int zvol_geom_bio_getattr(struct bio *bp); static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync); /* * GEOM mode implementation */ static int zvol_geom_open(struct g_provider *pp, int flag, int count) { zvol_state_t *zv; int err = 0; boolean_t drop_suspend = B_FALSE; if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { /* * If zfs_geom_probe_vdev_key is set, that means that zfs is * attempting to probe geom providers while looking for a * replacement for a missing VDEV. In this case, the * spa_namespace_lock will not be held, but it is still illegal * to use a zvol as a vdev. Deadlocks can result if another * thread has spa_namespace_lock. */ return (SET_ERROR(EOPNOTSUPP)); } retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); /* * Obtain a copy of private under zvol_state_lock to make sure either * the result of zvol free code setting private to NULL is observed, * or the zv is protected from being freed because of the positive * zv_open_count. */ zv = pp->private; if (zv == NULL) { rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); goto out_locked; } mutex_enter(&zv->zv_state_lock); if (zv->zv_zso->zso_dying || zv->zv_flags & ZVOL_REMOVING) { rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); goto out_zv_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); /* * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock. */ if (zv->zv_open_count == 0) { drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { boolean_t drop_namespace = B_FALSE; ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); /* * Take spa_namespace_lock to prevent lock inversion when * zvols from one pool are opened as vdevs in another. */ if (!mutex_owned(&spa_namespace_lock)) { if (!mutex_tryenter(&spa_namespace_lock)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; kern_yield(PRI_USER); goto retry; } else { drop_namespace = B_TRUE; } } err = zvol_first_open(zv, !(flag & FWRITE)); if (drop_namespace) mutex_exit(&spa_namespace_lock); if (err) goto out_zv_locked; pp->mediasize = zv->zv_volsize; pp->stripeoffset = 0; pp->stripesize = zv->zv_volblocksize; } ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* * Check for a bad on-disk format version now since we * lied about owning the dataset readonly before. */ if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || dmu_objset_incompatible_encryption_version(zv->zv_objset))) { err = SET_ERROR(EROFS); goto out_opened; } if (zv->zv_flags & ZVOL_EXCL) { err = SET_ERROR(EBUSY); goto out_opened; } if (flag & O_EXCL) { if (zv->zv_open_count != 0) { err = SET_ERROR(EBUSY); goto out_opened; } zv->zv_flags |= ZVOL_EXCL; } zv->zv_open_count += count; out_opened: if (zv->zv_open_count == 0) { zvol_last_close(zv); wakeup(zv); } out_zv_locked: mutex_exit(&zv->zv_state_lock); out_locked: if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); } static int zvol_geom_close(struct g_provider *pp, int flag, int count) { (void) flag; zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; int new_open_count; rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = pp->private; if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { ASSERT3U(zv->zv_open_count, ==, 1); zv->zv_flags &= ~ZVOL_EXCL; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. */ ASSERT3U(zv->zv_open_count, >, 0); /* * Make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock. */ new_open_count = zv->zv_open_count - count; if (new_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* Check to see if zv_suspend_lock is needed. */ new_open_count = zv->zv_open_count - count; if (new_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* * You may get multiple opens, but only one close. */ zv->zv_open_count = new_open_count; if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); wakeup(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (0); } -static void -zvol_geom_destroy(zvol_state_t *zv) -{ - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_GEOM); - - g_topology_assert(); - - zsg->zsg_provider = NULL; - g_wither_geom(pp->geom, ENXIO); -} - void zvol_wait_close(zvol_state_t *zv) { if (zv->zv_volmode != ZFS_VOLMODE_GEOM) return; mutex_enter(&zv->zv_state_lock); zv->zv_zso->zso_dying = B_TRUE; if (zv->zv_open_count) msleep(zv, &zv->zv_state_lock, PRIBIO, "zvol:dying", 10*hz); mutex_exit(&zv->zv_state_lock); } static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) { int count, error, flags; g_topology_assert(); /* * To make it easier we expect either open or close, but not both * at the same time. */ KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || (acr <= 0 && acw <= 0 && ace <= 0), ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", pp->name, acr, acw, ace)); if (pp->private == NULL) { if (acr <= 0 && acw <= 0 && ace <= 0) return (0); return (pp->error); } /* * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if * ace != 0, because GEOM already handles that and handles it a bit * differently. GEOM allows for multiple read/exclusive consumers and * ZFS allows only one exclusive consumer, no matter if it is reader or * writer. I like better the way GEOM works so I'll leave it for GEOM * to decide what to do. */ count = acr + acw + ace; if (count == 0) return (0); flags = 0; if (acr != 0 || ace != 0) flags |= FREAD; if (acw != 0) flags |= FWRITE; g_topology_unlock(); if (count > 0) error = zvol_geom_open(pp, flags, count); else error = zvol_geom_close(pp, flags, -count); g_topology_lock(); return (error); } static void zvol_geom_bio_start(struct bio *bp) { zvol_state_t *zv = bp->bio_to->private; if (zv == NULL) { g_io_deliver(bp, ENXIO); return; } if (bp->bio_cmd == BIO_GETATTR) { if (zvol_geom_bio_getattr(bp)) g_io_deliver(bp, EOPNOTSUPP); return; } zvol_geom_bio_strategy(bp, !g_is_geom_thread(curthread) && THREAD_CAN_SLEEP()); } static int zvol_geom_bio_getattr(struct bio *bp) { zvol_state_t *zv; zv = bp->bio_to->private; ASSERT3P(zv, !=, NULL); spa_t *spa = dmu_objset_spa(zv->zv_objset); uint64_t refd, avail, usedobjs, availobjs; if (g_handleattr_int(bp, "GEOM::candelete", 1)) return (0); if (strcmp(bp->bio_attribute, "blocksavail") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE)) return (0); } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE)) return (0); } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { avail = metaslab_class_get_space(spa_normal_class(spa)); avail -= metaslab_class_get_alloc(spa_normal_class(spa)); if (g_handleattr_off_t(bp, "poolblocksavail", avail / DEV_BSIZE)) return (0); } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { refd = metaslab_class_get_alloc(spa_normal_class(spa)); if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE)) return (0); } return (1); } static void zvol_filter_detach(struct knote *kn) { zvol_state_t *zv; struct zvol_state_dev *zsd; zv = kn->kn_hook; zsd = &zv->zv_zso->zso_dev; knlist_remove(&zsd->zsd_selinfo.si_note, kn, 0); } static int zvol_filter_vnode(struct knote *kn, long hint) { kn->kn_fflags |= kn->kn_sfflags & hint; return (kn->kn_fflags != 0); } static int zvol_cdev_kqfilter(struct cdev *dev, struct knote *kn) { zvol_state_t *zv; struct zvol_state_dev *zsd; zv = dev->si_drv2; zsd = &zv->zv_zso->zso_dev; if (kn->kn_filter != EVFILT_VNODE) return (EINVAL); /* XXX: extend support for other NOTE_* events */ if (kn->kn_sfflags != NOTE_ATTRIB) return (EINVAL); kn->kn_fop = &zvol_filterops_vnode; kn->kn_hook = zv; knlist_add(&zsd->zsd_selinfo.si_note, kn, 0); return (0); } static void zvol_strategy_impl(zv_request_t *zvr) { zvol_state_t *zv; struct bio *bp; uint64_t off, volsize; size_t resid; char *addr; objset_t *os; zfs_locked_range_t *lr; int error = 0; boolean_t doread = B_FALSE; boolean_t is_dumpified; boolean_t commit; bp = zvr->bio; zv = zvr->zv; if (zv == NULL) { error = SET_ERROR(ENXIO); goto out; } rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); if (zv->zv_flags & ZVOL_REMOVING) { error = SET_ERROR(ENXIO); goto resume; } switch (bp->bio_cmd) { case BIO_READ: doread = B_TRUE; break; case BIO_WRITE: case BIO_FLUSH: case BIO_DELETE: if (zv->zv_flags & ZVOL_RDONLY) { error = SET_ERROR(EROFS); goto resume; } zvol_ensure_zilog(zv); if (bp->bio_cmd == BIO_FLUSH) goto commit; break; default: error = SET_ERROR(EOPNOTSUPP); goto resume; } off = bp->bio_offset; volsize = zv->zv_volsize; os = zv->zv_objset; ASSERT3P(os, !=, NULL); addr = bp->bio_data; resid = bp->bio_length; if (resid > 0 && off >= volsize) { error = SET_ERROR(EIO); goto resume; } is_dumpified = B_FALSE; commit = !doread && !is_dumpified && zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. */ lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid, doread ? RL_READER : RL_WRITER); if (bp->bio_cmd == BIO_DELETE) { dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, off, resid); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, off, resid); resid = 0; } goto unlock; } while (resid != 0 && off < volsize) { size_t size = MIN(resid, zvol_maxphys); if (doread) { error = dmu_read_by_dnode(zv->zv_dn, off, size, addr, DMU_READ_PREFETCH); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write_by_dnode(zv->zv_dn, off, size, addr, tx, DMU_READ_PREFETCH); zvol_log_write(zv, tx, off, size, commit); dmu_tx_commit(tx); } } if (error) { /* Convert checksum errors into IO errors. */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } off += size; addr += size; resid -= size; } unlock: zfs_rangelock_exit(lr); bp->bio_completed = bp->bio_length - resid; if (bp->bio_completed < bp->bio_length && off > volsize) error = SET_ERROR(EINVAL); switch (bp->bio_cmd) { case BIO_FLUSH: break; case BIO_READ: dataset_kstats_update_read_kstats(&zv->zv_kstat, bp->bio_completed); break; case BIO_WRITE: dataset_kstats_update_write_kstats(&zv->zv_kstat, bp->bio_completed); break; case BIO_DELETE: break; default: break; } if (error == 0 && commit) { commit: error = zil_commit(zv->zv_zilog, ZVOL_OBJ); } resume: rw_exit(&zv->zv_suspend_lock); out: if (bp->bio_to) g_io_deliver(bp, error); else biofinish(bp, NULL, error); } static void zvol_strategy_task(void *arg) { zv_request_task_t *task = arg; zvol_strategy_impl(&task->zvr); zv_request_task_free(task); } static void zvol_geom_bio_strategy(struct bio *bp, boolean_t sync) { zv_taskq_t *ztqs = &zvol_taskqs; zv_request_task_t *task; zvol_state_t *zv; uint_t tq_idx; uint_t taskq_hash; int error; if (bp->bio_to) zv = bp->bio_to->private; else zv = bp->bio_dev->si_drv2; if (zv == NULL) { error = SET_ERROR(ENXIO); if (bp->bio_to) g_io_deliver(bp, error); else biofinish(bp, NULL, error); return; } zv_request_t zvr = { .zv = zv, .bio = bp, }; if (sync || zvol_request_sync) { zvol_strategy_impl(&zvr); return; } taskq_hash = cityhash3((uintptr_t)zv, curcpu, bp->bio_offset >> ZVOL_TASKQ_OFFSET_SHIFT); tq_idx = taskq_hash % ztqs->tqs_cnt; task = zv_request_task_create(zvr); taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_strategy_task, task, 0, &task->ent); } static void zvol_cdev_bio_strategy(struct bio *bp) { zvol_geom_bio_strategy(bp, B_FALSE); } /* * Character device mode implementation */ static int zvol_cdev_read(struct cdev *dev, struct uio *uio_s, int ioflag) { zvol_state_t *zv; uint64_t volsize; zfs_locked_range_t *lr; int error = 0; zfs_uio_t uio; zfs_uio_init(&uio, uio_s); zv = dev->si_drv2; volsize = zv->zv_volsize; /* * uio_loffset == volsize isn't an error as * it's required for EOF processing. */ if (zfs_uio_resid(&uio) > 0 && (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) return (SET_ERROR(EIO)); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); ssize_t start_resid = zfs_uio_resid(&uio); lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), zfs_uio_resid(&uio), RL_READER); while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); /* Don't read past the end. */ if (bytes > volsize - zfs_uio_offset(&uio)) bytes = volsize - zfs_uio_offset(&uio); error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes, DMU_READ_PREFETCH); if (error) { /* Convert checksum errors into IO errors. */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_rangelock_exit(lr); int64_t nread = start_resid - zfs_uio_resid(&uio); dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); rw_exit(&zv->zv_suspend_lock); return (error); } static int zvol_cdev_write(struct cdev *dev, struct uio *uio_s, int ioflag) { zvol_state_t *zv; uint64_t volsize; zfs_locked_range_t *lr; int error = 0; boolean_t commit; zfs_uio_t uio; zv = dev->si_drv2; volsize = zv->zv_volsize; zfs_uio_init(&uio, uio_s); if (zfs_uio_resid(&uio) > 0 && (zfs_uio_offset(&uio) < 0 || zfs_uio_offset(&uio) > volsize)) return (SET_ERROR(EIO)); ssize_t start_resid = zfs_uio_resid(&uio); commit = (ioflag & IO_SYNC) || (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); zvol_ensure_zilog(zv); lr = zfs_rangelock_enter(&zv->zv_rangelock, zfs_uio_offset(&uio), zfs_uio_resid(&uio), RL_WRITER); while (zfs_uio_resid(&uio) > 0 && zfs_uio_offset(&uio) < volsize) { uint64_t bytes = MIN(zfs_uio_resid(&uio), DMU_MAX_ACCESS >> 1); uint64_t off = zfs_uio_offset(&uio); dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* Don't write past the end. */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx, DMU_READ_PREFETCH); if (error == 0) zvol_log_write(zv, tx, off, bytes, commit); dmu_tx_commit(tx); if (error) break; } zfs_rangelock_exit(lr); int64_t nwritten = start_resid - zfs_uio_resid(&uio); dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); if (error == 0 && commit) error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); return (error); } static int zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) { zvol_state_t *zv; int err = 0; boolean_t drop_suspend = B_FALSE; retry: rw_enter(&zvol_state_lock, ZVOL_RW_READER); /* * Obtain a copy of si_drv2 under zvol_state_lock to make sure either * the result of zvol free code setting si_drv2 to NULL is observed, * or the zv is protected from being freed because of the positive * zv_open_count. */ zv = dev->si_drv2; if (zv == NULL) { rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); goto out_locked; } mutex_enter(&zv->zv_state_lock); if (zv->zv_zso->zso_dying) { rw_exit(&zvol_state_lock); err = SET_ERROR(ENXIO); goto out_zv_locked; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock. */ if (zv->zv_open_count == 0) { drop_suspend = B_TRUE; if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { boolean_t drop_namespace = B_FALSE; ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); /* * Take spa_namespace_lock to prevent lock inversion when * zvols from one pool are opened as vdevs in another. */ if (!mutex_owned(&spa_namespace_lock)) { if (!mutex_tryenter(&spa_namespace_lock)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; kern_yield(PRI_USER); goto retry; } else { drop_namespace = B_TRUE; } } err = zvol_first_open(zv, !(flags & FWRITE)); if (drop_namespace) mutex_exit(&spa_namespace_lock); if (err) goto out_zv_locked; } ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { err = SET_ERROR(EROFS); goto out_opened; } if (zv->zv_flags & ZVOL_EXCL) { err = SET_ERROR(EBUSY); goto out_opened; } if (flags & O_EXCL) { if (zv->zv_open_count != 0) { err = SET_ERROR(EBUSY); goto out_opened; } zv->zv_flags |= ZVOL_EXCL; } zv->zv_open_count++; out_opened: if (zv->zv_open_count == 0) { zvol_last_close(zv); wakeup(zv); } out_zv_locked: mutex_exit(&zv->zv_state_lock); out_locked: if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (err); } static int zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) { zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, ZVOL_RW_READER); zv = dev->si_drv2; if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); if (zv->zv_flags & ZVOL_EXCL) { ASSERT3U(zv->zv_open_count, ==, 1); zv->zv_flags &= ~ZVOL_EXCL; } ASSERT3S(zv->zv_volmode, ==, ZFS_VOLMODE_DEV); /* * If the open count is zero, this is a spurious close. * That indicates a bug in the kernel / DDI framework. */ ASSERT3U(zv->zv_open_count, >, 0); /* * Make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock. */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); mutex_enter(&zv->zv_state_lock); /* Check to see if zv_suspend_lock is needed. */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* * You may get multiple opens, but only one close. */ zv->zv_open_count--; if (zv->zv_open_count == 0) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); wakeup(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); return (0); } static int zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, int fflag, struct thread *td) { zvol_state_t *zv; zfs_locked_range_t *lr; off_t offset, length; int error; boolean_t sync; zv = dev->si_drv2; error = 0; KASSERT(zv->zv_open_count > 0, ("Device with zero access count in %s", __func__)); switch (cmd) { case DIOCGSECTORSIZE: *(uint32_t *)data = DEV_BSIZE; break; case DIOCGMEDIASIZE: *(off_t *)data = zv->zv_volsize; break; case DIOCGFLUSH: rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); if (zv->zv_zilog != NULL) error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); break; case DIOCGDELETE: if (!zvol_unmap_enabled) break; offset = ((off_t *)data)[0]; length = ((off_t *)data)[1]; if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || offset < 0 || offset >= zv->zv_volsize || length <= 0) { printf("%s: offset=%jd length=%jd\n", __func__, offset, length); error = SET_ERROR(EINVAL); break; } rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); zvol_ensure_zilog(zv); lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, RL_WRITER); dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { sync = FALSE; dmu_tx_abort(tx); } else { sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); zvol_log_truncate(zv, tx, offset, length); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); } zfs_rangelock_exit(lr); if (sync) error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); break; case DIOCGSTRIPESIZE: *(off_t *)data = zv->zv_volblocksize; break; case DIOCGSTRIPEOFFSET: *(off_t *)data = 0; break; case DIOCGATTR: { spa_t *spa = dmu_objset_spa(zv->zv_objset); struct diocgattr_arg *arg = (struct diocgattr_arg *)data; uint64_t refd, avail, usedobjs, availobjs; if (strcmp(arg->name, "GEOM::candelete") == 0) arg->value.i = 1; else if (strcmp(arg->name, "blocksavail") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); arg->value.off = avail / DEV_BSIZE; } else if (strcmp(arg->name, "blocksused") == 0) { dmu_objset_space(zv->zv_objset, &refd, &avail, &usedobjs, &availobjs); arg->value.off = refd / DEV_BSIZE; } else if (strcmp(arg->name, "poolblocksavail") == 0) { avail = metaslab_class_get_space(spa_normal_class(spa)); avail -= metaslab_class_get_alloc( spa_normal_class(spa)); arg->value.off = avail / DEV_BSIZE; } else if (strcmp(arg->name, "poolblocksused") == 0) { refd = metaslab_class_get_alloc(spa_normal_class(spa)); arg->value.off = refd / DEV_BSIZE; } else error = SET_ERROR(ENOIOCTL); break; } case FIOSEEKHOLE: case FIOSEEKDATA: { off_t *off = (off_t *)data; uint64_t noff; boolean_t hole; hole = (cmd == FIOSEEKHOLE); noff = *off; lr = zfs_rangelock_enter(&zv->zv_rangelock, 0, UINT64_MAX, RL_READER); error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); zfs_rangelock_exit(lr); *off = noff; break; } default: error = SET_ERROR(ENOIOCTL); } return (error); } /* * Misc. helpers */ static void zvol_ensure_zilog(zvol_state_t *zv) { ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { if (!rw_tryupgrade(&zv->zv_suspend_lock)) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); } if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data, &zv->zv_kstat.dk_zil_sums); zv->zv_flags |= ZVOL_WRITTEN_TO; /* replay / destroy done in zvol_os_create_minor() */ VERIFY0(zv->zv_zilog->zl_header->zh_flags & ZIL_REPLAY_NEEDED); } rw_downgrade(&zv->zv_suspend_lock); } } boolean_t zvol_os_is_zvol(const char *device) { return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); } int zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { int error = 0; ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); /* Move to a new hashtable entry. */ zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; struct g_geom *gp; g_topology_lock(); gp = pp->geom; ASSERT3P(gp, !=, NULL); zsg->zsg_provider = NULL; g_wither_provider(pp, ENXIO); pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; pp->sectorsize = DEV_BSIZE; pp->mediasize = zv->zv_volsize; pp->private = zv; zsg->zsg_provider = pp; g_error_provider(pp, 0); g_topology_unlock(); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; struct make_dev_args args; dev = zsd->zsd_cdev; if (dev != NULL) { destroy_dev(dev); dev = zsd->zsd_cdev = NULL; if (zv->zv_open_count > 0) { zv->zv_flags &= ~ZVOL_EXCL; zv->zv_open_count = 0; /* XXX need suspend lock but lock order */ zvol_last_close(zv); } } make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &zvol_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname); if (error == 0) { dev->si_iosize_max = maxphys; zsd->zsd_cdev = dev; } } strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); dataset_kstats_rename(&zv->zv_kstat, newname); return (error); } /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ static int zvol_alloc(const char *name, uint64_t volsize, uint64_t volblocksize, zvol_state_t **zvp) { zvol_state_t *zv; uint64_t volmode; int error; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); if (error) return (error); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) return (0); zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_volmode = volmode; zv->zv_volsize = volsize; zv->zv_volblocksize = volblocksize; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp; struct g_geom *gp; g_topology_lock(); gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); gp->start = zvol_geom_bio_start; gp->access = zvol_geom_access; pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; pp->sectorsize = DEV_BSIZE; pp->mediasize = 0; pp->private = zv; zsg->zsg_provider = pp; } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; struct cdev *dev; struct make_dev_args args; make_dev_args_init(&args); args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; args.mda_devsw = &zvol_cdevsw; args.mda_cr = NULL; args.mda_uid = UID_ROOT; args.mda_gid = GID_OPERATOR; args.mda_mode = 0640; args.mda_si_drv2 = zv; error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); if (error) { kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); return (error); } dev->si_iosize_max = maxphys; zsd->zsd_cdev = dev; knlist_init_sx(&zsd->zsd_selinfo.si_note, &zv->zv_state_lock); } (void) strlcpy(zv->zv_name, name, MAXPATHLEN); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); *zvp = zv; return (error); } /* * Remove minor node for the specified volume. */ void -zvol_os_free(zvol_state_t *zv) +zvol_os_remove_minor(zvol_state_t *zv) { - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); - ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); - - rw_destroy(&zv->zv_suspend_lock); - zfs_rangelock_fini(&zv->zv_rangelock); + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp __maybe_unused = zsg->zsg_provider; - - ASSERT0P(pp->private); + struct zvol_state_geom *zsg = &zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + pp->private = NULL; + mutex_exit(&zv->zv_state_lock); g_topology_lock(); - zvol_geom_destroy(zv); + g_wither_geom(pp->geom, ENXIO); g_topology_unlock(); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct zvol_state_dev *zsd = &zso->zso_dev; struct cdev *dev = zsd->zsd_cdev; + if (dev != NULL) + dev->si_drv2 = NULL; + mutex_exit(&zv->zv_state_lock); + if (dev != NULL) { - ASSERT0P(dev->si_drv2); destroy_dev(dev); knlist_clear(&zsd->zsd_selinfo.si_note, 0); knlist_destroy(&zsd->zsd_selinfo.si_note); } } + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + +void +zvol_os_free(zvol_state_t *zv) +{ + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); + + ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + mutex_destroy(&zv->zv_state_lock); cv_destroy(&zv->zv_removing_cv); dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); zvol_minors--; } /* * Create a minor node (plus a whole lot more) for the specified volume. */ int zvol_os_create_minor(const char *name) { zvol_state_t *zv = NULL; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t hash, len; int error; bool replayed_zil = B_FALSE; if (zvol_inhibit_dev) return (0); ZFS_LOG(1, "Creating ZVOL %s...", name); hash = zvol_name_hash(name); if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); return (SET_ERROR(EEXIST)); } DROP_GIANT(); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); /* Lie and say we're read-only. */ error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; error = zvol_alloc(name, volsize, doi->doi_data_block_size, &zv); if (error || zv == NULL) goto out_dmu_objset_disown; zv->zv_hash = hash; if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) zv->zv_flags |= ZVOL_RDONLY; zv->zv_objset = os; ASSERT0P(zv->zv_kstat.dk_kstats); error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); if (error) goto out_dmu_objset_disown; ASSERT0P(zv->zv_zilog); zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); else replayed_zil = zil_replay(os, zv, zvol_replay_vector); } if (replayed_zil) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_ASYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_ASYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); if (error == 0 && zv && zv->zv_volmode == ZFS_VOLMODE_GEOM) { g_error_provider(zv->zv_zso->zso_geom.zsg_provider, 0); /* geom was locked inside zvol_alloc() function */ g_topology_unlock(); } out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); if (error == 0 && zv) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); zvol_minors++; rw_exit(&zvol_state_lock); ZFS_LOG(1, "ZVOL %s created.", name); } PICKUP_GIANT(); return (error); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - ASSERT(RW_LOCK_HELD(&zvol_state_lock)); - if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { - struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; - struct g_provider *pp = zsg->zsg_provider; - - if (pp->private == NULL) /* already cleared */ - return; - - pp->private = NULL; - ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); - } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { - struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; - struct cdev *dev = zsd->zsd_cdev; - - if (dev != NULL) - dev->si_drv2 = NULL; - } -} - int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) { zv->zv_volsize = volsize; if (zv->zv_volmode == ZFS_VOLMODE_GEOM) { struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; struct g_provider *pp = zsg->zsg_provider; g_topology_lock(); if (pp->private == NULL) { g_topology_unlock(); return (SET_ERROR(ENXIO)); } /* * Do not invoke resize event when initial size was zero. * ZVOL initializes the size on first open, this is not * real resizing. */ if (pp->mediasize == 0) pp->mediasize = zv->zv_volsize; else g_resize_provider(pp, zv->zv_volsize); g_topology_unlock(); } else if (zv->zv_volmode == ZFS_VOLMODE_DEV) { struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; KNOTE_UNLOCKED(&zsd->zsd_selinfo.si_note, NOTE_ATTRIB); } return (0); } void zvol_os_set_disk_ro(zvol_state_t *zv, int flags) { /* * The ro/rw ZVOL mode is switched using zvol_set_ro() function by * enabling/disabling ZVOL_RDONLY flag. No additional FreeBSD-specific * actions are required for readonly zfs property switching. */ } void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) { /* * The ZVOL size/capacity is changed by zvol_set_volsize() function. * Leave this method empty, because all required job is doing by * zvol_os_update_volsize() platform-specific function. */ } /* * Public interfaces */ int zvol_busy(void) { return (zvol_minors != 0); } int zvol_init(void) { return (zvol_init_impl()); } void zvol_fini(void) { zvol_fini_impl(); } diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index a73acdad34ae..57f5bd67d7a8 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1,1826 +1,1848 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. * Copyright (c) 2024, Rob Norris - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync); static unsigned int zvol_major = ZVOL_MAJOR; static unsigned long zvol_max_discard_blocks = 16384; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif static unsigned int zvol_blk_mq_threads = 0; static unsigned int zvol_blk_mq_actual_threads; static boolean_t zvol_use_blk_mq = B_FALSE; /* * The maximum number of volblocksize blocks to process per thread. Typically, * write heavy workloads preform better with higher values here, and read * heavy workloads preform better with lower values, but that's not a hard * and fast rule. It's basically a knob to tune between "less overhead with * less parallelism" and "more overhead, but more parallelism". * * '8' was chosen as a reasonable, balanced, default based off of sequential * read and write tests to a zvol in an NVMe pool (with 16 CPUs). */ static unsigned int zvol_blk_mq_blocks_per_thread = 8; #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ #endif /* * Finalize our BIO or request. */ static inline void zvol_end_io(struct bio *bio, struct request *rq, int error) { ASSERT3U(error, >=, 0); if (bio) { bio->bi_status = errno_to_bi_status(error); bio_endio(bio); } else { blk_mq_end_request(rq, errno_to_bi_status(error)); } } static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; static unsigned int zvol_actual_blk_mq_queue_depth; struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ struct request_queue *zvo_queue; /* request queue */ dev_t zvo_dev; /* device id */ struct blk_mq_tag_set tag_set; /* Set from the global 'zvol_use_blk_mq' at zvol load */ boolean_t use_blk_mq; }; static struct ida zvol_ida; /* * This is called when a new block multiqueue request comes in. A request * contains one or more BIOs. */ static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; zvol_state_t *zv = rq->q->queuedata; /* Tell the kernel that we are starting to process this request */ blk_mq_start_request(rq); if (blk_rq_is_passthrough(rq)) { /* Skip non filesystem request */ blk_mq_end_request(rq, BLK_STS_IOERR); return (BLK_STS_IOERR); } zvol_request_impl(zv, NULL, rq, 0); /* Acknowledge to the kernel that we got this request */ return (BLK_STS_OK); } static struct blk_mq_ops zvol_blk_mq_queue_ops = { .queue_rq = zvol_mq_queue_rq, }; /* Initialize our blk-mq struct */ static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) { struct zvol_state_os *zso = zv->zv_zso; memset(&zso->tag_set, 0, sizeof (zso->tag_set)); /* Initialize tag set. */ zso->tag_set.ops = &zvol_blk_mq_queue_ops; zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; zso->tag_set.numa_node = NUMA_NO_NODE; zso->tag_set.cmd_size = 0; /* * We need BLK_MQ_F_BLOCKING here since we do blocking calls in * zvol_request_impl() */ zso->tag_set.flags = BLK_MQ_F_BLOCKING; #ifdef BLK_MQ_F_SHOULD_MERGE /* * Linux 6.14 removed BLK_MQ_F_SHOULD_MERGE and made it implicit. * For older kernels, we set it. */ zso->tag_set.flags |= BLK_MQ_F_SHOULD_MERGE; #endif zso->tag_set.driver_data = zv; return (blk_mq_alloc_tag_set(&zso->tag_set)); } /* * Given a path, return TRUE if path is a ZVOL. */ boolean_t zvol_os_is_zvol(const char *path) { dev_t dev = 0; if (vdev_lookup_bdev(path, &dev) != 0) return (B_FALSE); if (MAJOR(dev) == zvol_major) return (B_TRUE); return (B_FALSE); } static void zvol_write(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; zvol_state_t *zv = zvr->zv; struct request_queue *q; struct gendisk *disk; unsigned long start_time = 0; boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); q = zv->zv_zso->zvo_queue; disk = zv->zv_zso->zvo_disk; /* bio marked as FLUSH need to flush before write */ if (io_is_flush(bio, rq)) { error = zil_commit(zv->zv_zilog, ZVOL_OBJ); if (error != 0) { rw_exit(&zv->zv_suspend_lock); zvol_end_io(bio, rq, -error); return; } } /* Some requests are just for flush and nothing else. */ if (io_size(bio, rq) == 0) { rw_exit(&zv->zv_suspend_lock); zvol_end_io(bio, rq, 0); return; } zfs_uio_bvec_init(&uio, bio, rq); ssize_t start_resid = uio.uio_resid; /* * With use_blk_mq, accounting is done by blk_mq_start_request() * and blk_mq_end_request(), so we can skip it here. */ if (bio) { acct = blk_queue_io_stat(q); if (acct) { start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); } } boolean_t sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_WRITER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio.uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx, DMU_READ_PREFETCH); if (error == 0) { zvol_log_write(zv, tx, off, bytes, sync); } dmu_tx_commit(tx); if (error) break; } zfs_rangelock_exit(lr); int64_t nwritten = start_resid - uio.uio_resid; dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); task_io_account_write(nwritten); if (error == 0 && sync) error = zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } zvol_end_io(bio, rq, error); } static void zvol_write_task(void *arg) { zv_request_task_t *task = arg; zvol_write(&task->zvr); zv_request_task_free(task); } static void zvol_discard(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; zvol_state_t *zv = zvr->zv; uint64_t start = io_offset(bio, rq); uint64_t size = io_size(bio, rq); uint64_t end = start + size; boolean_t sync; int error = 0; dmu_tx_t *tx; struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; unsigned long start_time = 0; boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); if (bio) { acct = blk_queue_io_stat(q); if (acct) { start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); } } sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (end > zv->zv_volsize) { error = SET_ERROR(EIO); goto unlock; } /* * Align the request to volume block boundaries when a secure erase is * not required. This will prevent dnode_free_range() from zeroing out * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. */ if (!io_is_secure_erase(bio, rq)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN_TYPED(end, zv->zv_volblocksize, uint64_t); size = end - start; } if (start >= end) goto unlock; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, start, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, start, size); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); } zfs_rangelock_exit(lr); if (error == 0 && sync) error = zil_commit(zv->zv_zilog, ZVOL_OBJ); unlock: rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } zvol_end_io(bio, rq, error); } static void zvol_discard_task(void *arg) { zv_request_task_t *task = arg; zvol_discard(&task->zvr); zv_request_task_free(task); } static void zvol_read(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; boolean_t acct = B_FALSE; zvol_state_t *zv = zvr->zv; struct request_queue *q; struct gendisk *disk; unsigned long start_time = 0; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); zfs_uio_bvec_init(&uio, bio, rq); q = zv->zv_zso->zvo_queue; disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; /* * When blk-mq is being used, accounting is done by * blk_mq_start_request() and blk_mq_end_request(). */ if (bio) { acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, READ, bio); } zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio.uio_loffset) bytes = volsize - uio.uio_loffset; error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes, DMU_READ_PREFETCH); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_rangelock_exit(lr); int64_t nread = start_resid - uio.uio_resid; dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); task_io_account_read(nread); rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, READ, bio, start_time); } zvol_end_io(bio, rq, error); } static void zvol_read_task(void *arg) { zv_request_task_t *task = arg; zvol_read(&task->zvr); zv_request_task_free(task); } /* * Process a BIO or request * * Either 'bio' or 'rq' should be set depending on if we are processing a * bio or a request (both should not be set). * * force_sync: Set to 0 to defer processing to a background taskq * Set to 1 to process data synchronously */ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync) { fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); int rw = io_data_dir(bio, rq); if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { zvol_end_io(bio, rq, SET_ERROR(ENXIO)); goto out; } if (zvol_request_sync || zv->zv_threading == B_FALSE) force_sync = 1; zv_request_t zvr = { .zv = zv, .bio = bio, .rq = rq, }; if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_zso->zvo_disk->disk_name, (long long unsigned)offset, (long unsigned)size); zvol_end_io(bio, rq, SET_ERROR(EIO)); goto out; } zv_request_task_t *task; zv_taskq_t *ztqs = &zvol_taskqs; uint_t blk_mq_hw_queue = 0; uint_t tq_idx; uint_t taskq_hash; if (rq) #ifdef HAVE_BLK_MQ_RQ_HCTX blk_mq_hw_queue = rq->mq_hctx->queue_num; #else blk_mq_hw_queue = rq->q->queue_hw_ctx[ rq->q->mq_map[raw_smp_processor_id()]]->queue_num; #endif taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, blk_mq_hw_queue); tq_idx = taskq_hash % ztqs->tqs_cnt; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { zvol_end_io(bio, rq, SET_ERROR(EROFS)); goto out; } /* * Prevents the zvol from being suspended, or the ZIL being * concurrently opened. Will be released after the i/o * completes. */ rw_enter(&zv->zv_suspend_lock, RW_READER); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data, &zv->zv_kstat.dk_zil_sums); zv->zv_flags |= ZVOL_WRITTEN_TO; /* replay / destroy done in zvol_create_minor */ VERIFY0((zv->zv_zilog->zl_header->zh_flags & ZIL_REPLAY_NEEDED)); } rw_downgrade(&zv->zv_suspend_lock); } /* * We don't want this thread to be blocked waiting for i/o to * complete, so we instead wait from a taskq callback. The * i/o may be a ZIL write (via zil_commit()), or a read of an * indirect block, or a read of a data block (if this is a * partial-block write). We will indicate that the i/o is * complete by calling END_IO() from the taskq callback. * * This design allows the calling thread to continue and * initiate more concurrent operations by calling * zvol_request() again. There are typically only a small * number of threads available to call zvol_request() (e.g. * one per iSCSI target), so keeping the latency of * zvol_request() low is important for performance. * * The zvol_request_sync module parameter allows this * behavior to be altered, for performance evaluation * purposes. If the callback blocks, setting * zvol_request_sync=1 will result in much worse performance. * * We can have up to zvol_threads concurrent i/o's being * processed for all zvols on the system. This is typically * a vast improvement over the zvol_request_sync=1 behavior * of one i/o at a time per zvol. However, an even better * design would be for zvol_request() to initiate the zio * directly, and then be notified by the zio_done callback, * which would call END_IO(). Unfortunately, the DMU/ZIL * interfaces lack this functionality (they block waiting for * the i/o to complete). */ if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_discard_task, task, 0, &task->ent); } } else { if (force_sync) { zvol_write(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_write_task, task, 0, &task->ent); } } } else { /* * The SCST driver, and possibly others, may issue READ I/Os * with a length of zero bytes. These empty I/Os contain no * data and require no additional handling. */ if (size == 0) { zvol_end_io(bio, rq, 0); goto out; } rw_enter(&zv->zv_suspend_lock, RW_READER); /* See comment in WRITE case above. */ if (force_sync) { zvol_read(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(ztqs->tqs_taskq[tq_idx], zvol_read_task, task, 0, &task->ent); } } out: spl_fstrans_unmark(cookie); } #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID static void zvol_submit_bio(struct bio *bio) #else static blk_qc_t zvol_submit_bio(struct bio *bio) #endif #else static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q, struct bio *bio) #endif { #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif #endif zvol_state_t *zv = q->queuedata; zvol_request_impl(zv, bio, NULL, 0); #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) return (BLK_QC_T_NONE); #endif } static int #ifdef HAVE_BLK_MODE_T zvol_open(struct gendisk *disk, blk_mode_t flag) #else zvol_open(struct block_device *bdev, fmode_t flag) #endif { zvol_state_t *zv; int error = 0; boolean_t drop_suspend = B_FALSE; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); hrtime_t start = gethrtime(); retry: #endif rw_enter(&zvol_state_lock, RW_READER); /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting * disk->private_data to NULL is observed, or zvol_os_free() * is not called on this zv because of the positive zv_open_count. */ #ifdef HAVE_BLK_MODE_T zv = disk->private_data; #else zv = bdev->bd_disk->private_data; #endif if (zv == NULL) { rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } mutex_enter(&zv->zv_state_lock); if (unlikely(zv->zv_flags & ZVOL_REMOVING)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zvol_state_lock); return (-SET_ERROR(ENXIO)); } /* * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); } else { drop_suspend = B_TRUE; } } else { drop_suspend = B_TRUE; } } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { boolean_t drop_namespace = B_FALSE; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); /* * In all other call paths the spa_namespace_lock is taken * before the bdev->bd_mutex lock. However, on open(2) * the __blkdev_get() function calls fops->open() with the * bdev->bd_mutex lock held. This can result in a deadlock * when zvols from one pool are used as vdevs in another. * * To prevent a lock inversion deadlock we preemptively * take the spa_namespace_lock. Normally the lock will not * be contended and this is safe because spa_open_common() * handles the case where the caller already holds the * spa_namespace_lock. * * When the lock cannot be aquired after multiple retries * this must be the vdev on zvol deadlock case and we have * no choice but to return an error. For 5.12 and older * kernels returning -ERESTARTSYS will result in the * bdev->bd_mutex being dropped, then reacquired, and * fops->open() being called again. This process can be * repeated safely until both locks are acquired. For 5.13 * and newer the -ERESTARTSYS retry logic was removed from * the kernel so the only option is to return the error for * the caller to handle it. */ if (!mutex_owned(&spa_namespace_lock)) { if (!mutex_tryenter(&spa_namespace_lock)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; #ifdef HAVE_BLKDEV_GET_ERESTARTSYS schedule(); return (-SET_ERROR(ERESTARTSYS)); #else if ((gethrtime() - start) > timeout) return (-SET_ERROR(ERESTARTSYS)); schedule_timeout_interruptible( MSEC_TO_TICK(10)); goto retry; #endif } else { drop_namespace = B_TRUE; } } error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) mutex_exit(&spa_namespace_lock); } if (error == 0) { if ((blk_mode_is_open_write(flag)) && (zv->zv_flags & ZVOL_RDONLY)) { if (zv->zv_open_count == 0) zvol_last_close(zv); error = -SET_ERROR(EROFS); } else { zv->zv_open_count++; } } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); if (error == 0) #ifdef HAVE_BLK_MODE_T disk_check_media_change(disk); #else zfs_check_media_change(bdev); #endif return (error); } static void #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG zvol_release(struct gendisk *disk) #else zvol_release(struct gendisk *disk, fmode_t unused) #endif { #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) (void) unused; #endif zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); zv = disk->private_data; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zv->zv_open_count--; if (zv->zv_open_count == 0) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); } static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { case BLKFLSBUF: #ifdef HAVE_FSYNC_BDEV fsync_bdev(bdev); #elif defined(HAVE_SYNC_BLOCKDEV) sync_blockdev(bdev); #else #error "Neither fsync_bdev() nor sync_blockdev() found" #endif invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); if (!(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); rw_exit(&zv->zv_suspend_lock); break; case BLKZNAME: mutex_enter(&zv->zv_state_lock); error = -copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); mutex_exit(&zv->zv_state_lock); if (error) error = SET_ERROR(error); break; default: error = SET_ERROR(ENOTTY); break; } return (-error); } #ifdef CONFIG_COMPAT static int zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { return (zvol_ioctl(bdev, mode, cmd, arg)); } #else #define zvol_compat_ioctl NULL #endif static unsigned int zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; zv->zv_changed = 0; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> SECTOR_BITS); mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (0); } int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) { struct gendisk *disk = zv->zv_zso->zvo_disk; #if defined(HAVE_REVALIDATE_DISK_SIZE) revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); #elif defined(HAVE_REVALIDATE_DISK) revalidate_disk(disk); #else zvol_revalidate_disk(disk); #endif return (0); } -void -zvol_os_clear_private(zvol_state_t *zv) -{ - /* - * Cleared while holding zvol_state_lock as a writer - * which will prevent zvol_open() from opening it. - */ - zv->zv_zso->zvo_disk->private_data = NULL; -} - /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. */ static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); if (sectors > 2048) { geo->heads = 16; geo->sectors = 63; } else { geo->heads = 2; geo->sectors = 4; } geo->start = 0; geo->cylinders = sectors / (geo->heads * geo->sectors); return (0); } /* * Why have two separate block_device_operations structs? * * Normally we'd just have one, and assign 'submit_bio' as needed. However, * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we * can't just change submit_bio dynamically at runtime. So just create two * separate structs to get around this. */ static const struct block_device_operations zvol_ops_blk_mq = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, #endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, }; static const struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, #endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS .submit_bio = zvol_submit_bio, #endif }; /* * Since 6.9, Linux has been removing queue limit setters in favour of an * initial queue_limits struct applied when the device is open. Since 6.11, * queue_limits is being extended to allow more things to be applied when the * device is open. Setters are also being removed for this. * * For OpenZFS, this means that depending on kernel version, some options may * be set up before the device is open, and some applied to an open device * (queue) after the fact. * * We manage this complexity by having our own limits struct, * zvol_queue_limits_t, in which we carry any queue config that we're * interested in setting. This structure is the same on all kernels. * * These limits are then applied to the queue at device open time by the most * appropriate method for the kernel. * * zvol_queue_limits_convert() is used on 6.9+ (where the two-arg form of * blk_alloc_disk() exists). This converts our limits struct to a proper Linux * struct queue_limits, and passes it in. Any fields added in later kernels are * (obviously) not set up here. * * zvol_queue_limits_apply() is called on all kernel versions after the queue * is created, and applies any remaining config. Before 6.9 that will be * everything, via setter methods. After 6.9 that will be whatever couldn't be * put into struct queue_limits. (This implies that zvol_queue_limits_apply() * will always be a no-op on the latest kernel we support). */ typedef struct zvol_queue_limits { unsigned int zql_max_hw_sectors; unsigned short zql_max_segments; unsigned int zql_max_segment_size; unsigned int zql_io_opt; unsigned int zql_physical_block_size; unsigned int zql_max_discard_sectors; unsigned int zql_discard_granularity; } zvol_queue_limits_t; static void zvol_queue_limits_init(zvol_queue_limits_t *limits, zvol_state_t *zv, boolean_t use_blk_mq) { limits->zql_max_hw_sectors = (DMU_MAX_ACCESS / 4) >> 9; if (use_blk_mq) { /* * IO requests can be really big (1MB). When an IO request * comes in, it is passed off to zvol_read() or zvol_write() * in a new thread, where it is chunked up into 'volblocksize' * sized pieces and processed. So for example, if the request * is a 1MB write and your volblocksize is 128k, one zvol_write * thread will take that request and sequentially do ten 128k * IOs. This is due to the fact that the thread needs to lock * each volblocksize sized block. So you might be wondering: * "instead of passing the whole 1MB request to one thread, * why not pass ten individual 128k chunks to ten threads and * process the whole write in parallel?" The short answer is * that there's a sweet spot number of chunks that balances * the greater parallelism with the added overhead of more * threads. The sweet spot can be different depending on if you * have a read or write heavy workload. Writes typically want * high chunk counts while reads typically want lower ones. On * a test pool with 6 NVMe drives in a 3x 2-disk mirror * configuration, with volblocksize=8k, the sweet spot for good * sequential reads and writes was at 8 chunks. */ /* * Below we tell the kernel how big we want our requests * to be. You would think that blk_queue_io_opt() would be * used to do this since it is used to "set optimal request * size for the queue", but that doesn't seem to do * anything - the kernel still gives you huge requests * with tons of little PAGE_SIZE segments contained within it. * * Knowing that the kernel will just give you PAGE_SIZE segments * no matter what, you can say "ok, I want PAGE_SIZE byte * segments, and I want 'N' of them per request", where N is * the correct number of segments for the volblocksize and * number of chunks you want. */ if (zvol_blk_mq_blocks_per_thread != 0) { unsigned int chunks; chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); limits->zql_max_segment_size = PAGE_SIZE; limits->zql_max_segments = (zv->zv_volblocksize * chunks) / PAGE_SIZE; } else { /* * Special case: zvol_blk_mq_blocks_per_thread = 0 * Max everything out. */ limits->zql_max_segments = UINT16_MAX; limits->zql_max_segment_size = UINT_MAX; } } else { limits->zql_max_segments = UINT16_MAX; limits->zql_max_segment_size = UINT_MAX; } limits->zql_io_opt = DMU_MAX_ACCESS / 2; limits->zql_physical_block_size = zv->zv_volblocksize; limits->zql_max_discard_sectors = (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9; limits->zql_discard_granularity = zv->zv_volblocksize; } #ifdef HAVE_BLK_ALLOC_DISK_2ARG static void zvol_queue_limits_convert(zvol_queue_limits_t *limits, struct queue_limits *qlimits) { memset(qlimits, 0, sizeof (struct queue_limits)); qlimits->max_hw_sectors = limits->zql_max_hw_sectors; qlimits->max_segments = limits->zql_max_segments; qlimits->max_segment_size = limits->zql_max_segment_size; qlimits->io_opt = limits->zql_io_opt; qlimits->physical_block_size = limits->zql_physical_block_size; qlimits->max_discard_sectors = limits->zql_max_discard_sectors; qlimits->max_hw_discard_sectors = limits->zql_max_discard_sectors; qlimits->discard_granularity = limits->zql_discard_granularity; #ifdef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES qlimits->features = BLK_FEAT_WRITE_CACHE | BLK_FEAT_FUA | BLK_FEAT_IO_STAT; #endif } #endif static void zvol_queue_limits_apply(zvol_queue_limits_t *limits, struct request_queue *queue) { #ifndef HAVE_BLK_ALLOC_DISK_2ARG blk_queue_max_hw_sectors(queue, limits->zql_max_hw_sectors); blk_queue_max_segments(queue, limits->zql_max_segments); blk_queue_max_segment_size(queue, limits->zql_max_segment_size); blk_queue_io_opt(queue, limits->zql_io_opt); blk_queue_physical_block_size(queue, limits->zql_physical_block_size); blk_queue_max_discard_sectors(queue, limits->zql_max_discard_sectors); blk_queue_discard_granularity(queue, limits->zql_discard_granularity); #endif #ifndef HAVE_BLKDEV_QUEUE_LIMITS_FEATURES blk_queue_set_write_cache(queue, B_TRUE); blk_queue_flag_set(QUEUE_FLAG_IO_STAT, queue); #endif } static int zvol_alloc_non_blk_mq(struct zvol_state_os *zso, zvol_queue_limits_t *limits) { #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) #if defined(HAVE_BLK_ALLOC_DISK) zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); if (zso->zvo_disk == NULL) return (1); zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) struct queue_limits qlimits; zvol_queue_limits_convert(limits, &qlimits); struct gendisk *disk = blk_alloc_disk(&qlimits, NUMA_NO_NODE); if (IS_ERR(disk)) { zso->zvo_disk = NULL; return (1); } zso->zvo_disk = disk; zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #else zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); if (zso->zvo_queue == NULL) return (1); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); return (1); } zso->zvo_disk->queue = zso->zvo_queue; #endif /* HAVE_BLK_ALLOC_DISK */ #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); if (zso->zvo_queue == NULL) return (1); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); return (1); } zso->zvo_disk->queue = zso->zvo_queue; #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ zvol_queue_limits_apply(limits, zso->zvo_queue); return (0); } static int zvol_alloc_blk_mq(zvol_state_t *zv, zvol_queue_limits_t *limits) { struct zvol_state_os *zso = zv->zv_zso; /* Allocate our blk-mq tag_set */ if (zvol_blk_mq_alloc_tag_set(zv) != 0) return (1); #if defined(HAVE_BLK_ALLOC_DISK) zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); if (zso->zvo_disk == NULL) { blk_mq_free_tag_set(&zso->tag_set); return (1); } zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; #elif defined(HAVE_BLK_ALLOC_DISK_2ARG) struct queue_limits qlimits; zvol_queue_limits_convert(limits, &qlimits); struct gendisk *disk = blk_mq_alloc_disk(&zso->tag_set, &qlimits, zv); if (IS_ERR(disk)) { zso->zvo_disk = NULL; blk_mq_free_tag_set(&zso->tag_set); return (1); } zso->zvo_disk = disk; zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; #else zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); blk_mq_free_tag_set(&zso->tag_set); return (1); } /* Allocate queue */ zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); if (IS_ERR(zso->zvo_queue)) { blk_mq_free_tag_set(&zso->tag_set); return (1); } /* Our queue is now created, assign it to our disk */ zso->zvo_disk->queue = zso->zvo_queue; #endif zvol_queue_limits_apply(limits, zso->zvo_queue); return (0); } /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ static int zvol_alloc(dev_t dev, const char *name, uint64_t volsize, uint64_t volblocksize, zvol_state_t **zvp) { zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; int ret; ret = dsl_prop_get_integer(name, "volmode", &volmode, NULL); if (ret) return (ret); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) return (0); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; zv->zv_volsize = volsize; zv->zv_volblocksize = volblocksize; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zv->zv_removing_cv, NULL, CV_DEFAULT, NULL); zv->zv_zso->use_blk_mq = zvol_use_blk_mq; zvol_queue_limits_t limits; zvol_queue_limits_init(&limits, zv, zv->zv_zso->use_blk_mq); /* * The block layer has 3 interfaces for getting BIOs: * * 1. blk-mq request queues (new) * 2. submit_bio() (oldest) * 3. regular request queues (old). * * Each of those interfaces has two permutations: * * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates * both the disk and its queue (5.14 kernel or newer) * * b) We don't have blk_*alloc_disk(), and have to allocate the * disk and the queue separately. (5.13 kernel or older) */ if (zv->zv_zso->use_blk_mq) { ret = zvol_alloc_blk_mq(zv, &limits); if (ret != 0) goto out_kmem; zso->zvo_disk->fops = &zvol_ops_blk_mq; } else { ret = zvol_alloc_non_blk_mq(zso, &limits); if (ret != 0) goto out_kmem; zso->zvo_disk->fops = &zvol_ops; } /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); if (!zv->zv_zso->use_blk_mq) { /* Disable write merging in favor of the ZIO pipeline. */ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); } zso->zvo_queue->queuedata = zv; zso->zvo_dev = dev; zv->zv_open_count = 0; strlcpy(zv->zv_name, name, sizeof (zv->zv_name)); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zso->zvo_disk->major = zvol_major; zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; /* * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. * This is accomplished by limiting the number of minors for the * device to one and explicitly disabling partition scanning. */ if (volmode == ZFS_VOLMODE_DEV) { zso->zvo_disk->minors = 1; zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; zso->zvo_disk->flags |= GENHD_FL_NO_PART; } zso->zvo_disk->first_minor = (dev & MINORMASK); zso->zvo_disk->private_data = zv; snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); *zvp = zv; return (ret); out_kmem: kmem_free(zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); return (ret); } +void +zvol_os_remove_minor(zvol_state_t *zv) +{ + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT0(zv->zv_open_count); + ASSERT0(atomic_read(&zv->zv_suspend_ref)); + ASSERT(zv->zv_flags & ZVOL_REMOVING); + + /* + * Cleared while holding zvol_state_lock as a writer + * which will prevent zvol_open() from opening it. + */ + struct zvol_state_os *zso = zv->zv_zso; + zv->zv_zso = NULL; + + /* Clearing private_data will make new callers return immediately. */ + zso->zvo_disk->private_data = NULL; + + /* + * Drop the state lock before calling del_gendisk(). There may be + * callers waiting to acquire it, but del_gendisk() will block until + * they exit, which would deadlock. + */ + mutex_exit(&zv->zv_state_lock); + + del_gendisk(zso->zvo_disk); +#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ + (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) +#if defined(HAVE_BLK_CLEANUP_DISK) + blk_cleanup_disk(zso->zvo_disk); +#else + put_disk(zso->zvo_disk); +#endif +#else + blk_cleanup_queue(zso->zvo_queue); + put_disk(zso->zvo_disk); +#endif + + if (zso->use_blk_mq) + blk_mq_free_tag_set(&zso->tag_set); + + ida_simple_remove(&zvol_ida, MINOR(zso->zvo_dev) >> ZVOL_MINOR_BITS); + + kmem_free(zso, sizeof (struct zvol_state_os)); + + mutex_enter(&zv->zv_state_lock); +} + /* * Cleanup then free a zvol_state_t which was created by zvol_alloc(). * At this time, the structure is not opened by anyone, is taken off * the zvol_state_list, and has its private data set to NULL. * The zvol_state_lock is dropped. * * This function may take many milliseconds to complete (e.g. we've seen * it take over 256ms), due to the calls to "blk_cleanup_queue" and * "del_gendisk". Thus, consumers need to be careful to account for this * latency when calling this function. */ void zvol_os_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); - ASSERT0P(zv->zv_zso->zvo_disk->private_data); + ASSERT0P(zv->zv_zso); + + ASSERT0P(zv->zv_objset); + ASSERT0P(zv->zv_zilog); + ASSERT0P(zv->zv_dn); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); - del_gendisk(zv->zv_zso->zvo_disk); -#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ - (defined(HAVE_BLK_ALLOC_DISK) || defined(HAVE_BLK_ALLOC_DISK_2ARG)) -#if defined(HAVE_BLK_CLEANUP_DISK) - blk_cleanup_disk(zv->zv_zso->zvo_disk); -#else - put_disk(zv->zv_zso->zvo_disk); -#endif -#else - blk_cleanup_queue(zv->zv_zso->zvo_queue); - put_disk(zv->zv_zso->zvo_disk); -#endif - - if (zv->zv_zso->use_blk_mq) - blk_mq_free_tag_set(&zv->zv_zso->tag_set); - - ida_simple_remove(&zvol_ida, - MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); - cv_destroy(&zv->zv_removing_cv); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); - kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } void zvol_wait_close(zvol_state_t *zv) { } struct add_disk_work { struct delayed_work work; struct gendisk *disk; int error; }; static int __zvol_os_add_disk(struct gendisk *disk) { int error = 0; #ifdef HAVE_ADD_DISK_RET error = -add_disk(disk); if (error) error = SET_ERROR(error); #else add_disk(disk); #endif return (error); } #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) static void zvol_os_add_disk_work(struct work_struct *work) { struct add_disk_work *add_disk_work; add_disk_work = container_of(work, struct add_disk_work, work.work); add_disk_work->error = __zvol_os_add_disk(add_disk_work->disk); } #endif /* * SPECIAL CASE: * * This function basically calls add_disk() from a workqueue. You may be * thinking: why not just call add_disk() directly? * * When you call add_disk(), the zvol appears to the world. When this happens, * the kernel calls disk_scan_partitions() on the zvol, which behaves * differently on the 6.9+ kernels: * * - 6.8 and older kernels - * disk_scan_partitions() * handle = bdev_open_by_dev( * zvol_open() * bdev_release(handle); * zvol_release() * * * - 6.9+ kernels - * disk_scan_partitions() * file = bdev_file_open_by_dev() * zvol_open() * fput(file) * < wait for return to userspace > * zvol_release() * * The difference is that the bdev_release() from the 6.8 kernel is synchronous * while the fput() from the 6.9 kernel is async. Or more specifically it's * async that has to wait until we return to userspace (since it adds the fput * into the caller's work queue with the TWA_RESUME flag set). This is not the * behavior we want, since we want do things like create+destroy a zvol within * a single ZFS_IOC_CREATE ioctl, and the "create" part needs to release the * reference to the zvol while we're in the IOCTL, which can't wait until we * return to userspace. * * We can get around this since fput() has a special codepath for when it's * running in a kernel thread or interrupt. In those cases, it just puts the * fput into the system workqueue, which we can force to run with * __flush_workqueue(). That is why we call add_disk() from a workqueue - so it * run from a kernel thread and "tricks" the fput() codepaths. * * Note that __flush_workqueue() is slowly getting deprecated. This may be ok * though, since our IOCTL will spin on EBUSY waiting for the zvol release (via * fput) to happen, which it eventually, naturally, will from the system_wq * without us explicitly calling __flush_workqueue(). */ static int zvol_os_add_disk(struct gendisk *disk) { #if defined(HAVE_BDEV_FILE_OPEN_BY_PATH) /* 6.9+ kernel */ struct add_disk_work add_disk_work; INIT_DELAYED_WORK(&add_disk_work.work, zvol_os_add_disk_work); add_disk_work.disk = disk; add_disk_work.error = 0; /* Use *_delayed_work functions since they're not GPL'd */ schedule_delayed_work(&add_disk_work.work, 0); flush_delayed_work(&add_disk_work.work); __flush_workqueue(system_wq); return (add_disk_work.error); #else /* <= 6.8 kernel */ return (__zvol_os_add_disk(disk)); #endif } /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. */ int zvol_os_create_minor(const char *name) { zvol_state_t *zv = NULL; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t len; unsigned minor = 0; int error = 0; int idx; uint64_t hash = zvol_name_hash(name); uint64_t volthreading; bool replayed_zil = B_FALSE; if (zvol_inhibit_dev) return (0); idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; if (MINOR(minor) != minor) { /* too many partitions can cause an overflow */ zfs_dbgmsg("zvol: create minor overflow: %s, minor %u/%u", name, minor, MINOR(minor)); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EINVAL)); } zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; error = zvol_alloc(MKDEV(zvol_major, minor), name, volsize, doi->doi_data_block_size, &zv); if (error || zv == NULL) goto out_dmu_objset_disown; zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_objset = os; /* Default */ zv->zv_threading = B_TRUE; if (dsl_prop_get_integer(name, "volthreading", &volthreading, NULL) == 0) zv->zv_threading = volthreading; set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); #ifdef QUEUE_FLAG_DISCARD blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_NONROT blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_ADD_RANDOM blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); #endif /* This flag was introduced in kernel version 4.12. */ #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); #endif ASSERT0P(zv->zv_kstat.dk_kstats); error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); if (error) goto out_dmu_objset_disown; ASSERT0P(zv->zv_zilog); zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); else replayed_zil = zil_replay(os, zv, zvol_replay_vector); } if (replayed_zil) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; /* * When udev detects the addition of the device it will immediately * invoke blkid(8) to determine the type of content on the device. * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); /* * Keep in mind that once add_disk() is called, the zvol is * announced to the world, and zvol_open()/zvol_release() can * be called at any time. Incidentally, add_disk() itself calls * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() * directly as well. */ if (error == 0 && zv) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); error = zvol_os_add_disk(zv->zv_zso->zvo_disk); } else { ida_simple_remove(&zvol_ida, idx); } return (error); } int zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_zso->zvo_disk); ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(newname); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing * a KOBJ_CHANGE uevent to be issued. This ensures udev detects * the name change and fixes the symlinks. This does not change * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never * changes. This would normally be done using kobject_uevent() but * that is a GPL-only symbol which is why we need this workaround. */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); dataset_kstats_rename(&zv->zv_kstat, newname); return (0); } void zvol_os_set_disk_ro(zvol_state_t *zv, int flags) { set_disk_ro(zv->zv_zso->zvo_disk, flags); } void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) { set_capacity(zv->zv_zso->zvo_disk, capacity); } int zvol_init(void) { int error; error = zvol_init_impl(); if (error) { printk(KERN_INFO "ZFS: zvol_init_impl() failed %d\n", error); return (error); } error = -register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (SET_ERROR(error)); } if (zvol_blk_mq_queue_depth == 0) { zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; } else { zvol_actual_blk_mq_queue_depth = MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); } if (zvol_blk_mq_threads == 0) { zvol_blk_mq_actual_threads = num_online_cpus(); } else { zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1024); } ida_init(&zvol_ida); return (0); } void zvol_fini(void) { unregister_blkdev(zvol_major, ZVOL_DRIVER); zvol_fini_impl(); ida_destroy(&zvol_ida); } module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); module_param(zvol_blk_mq_queue_depth, uint, 0644); MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); module_param(zvol_use_blk_mq, uint, 0644); MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, "Process volblocksize blocks per thread"); #ifndef HAVE_BLKDEV_GET_ERESTARTSYS module_param(zvol_open_timeout_ms, uint, 0644); MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); #endif diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 3556608dec3d..cd3e1894c0ef 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -1,2216 +1,2211 @@ // SPDX-License-Identifier: CDDL-1.0 /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * * ZFS volume emulation driver. * * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. * Volumes are accessed through the symbolic links named: * * /dev// * * Volumes are persistent through reboot and module load. No user command * needs to be run before opening and using a device. * * Copyright 2014 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. - * Copyright (c) 2024, Klara, Inc. + * Copyright (c) 2024, 2025, Klara, Inc. */ /* * Note on locking of zvol state structures. * * These structures are used to maintain internal state used to emulate block * devices on top of zvols. In particular, management of device minor number * operations - create, remove, rename, and set_snapdev - involves access to * these structures. The zvol_state_lock is primarily used to protect the * zvol_state_list. The zv->zv_state_lock is used to protect the contents * of the zvol_state_t structures, as well as to make sure that when the * time comes to remove the structure from the list, it is not in use, and * therefore, it can be taken off zvol_state_list and freed. * * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, * e.g. for the duration of receive and rollback operations. This lock can be * held for significant periods of time. Given that it is undesirable to hold * mutexes for long periods of time, the following lock ordering applies: * - take zvol_state_lock if necessary, to protect zvol_state_list * - take zv_suspend_lock if necessary, by the code path in question * - take zv_state_lock to protect zvol_state_t * * The minor operations are issued to spa->spa_zvol_taskq queues, that are * single-threaded (to preserve order of minor operations), and are executed * through the zvol_task_cb that dispatches the specific operations. Therefore, * these operations are serialized per pool. Consequently, we can be certain * that for a given zvol, there is only one operation at a time in progress. * That is why one can be sure that first, zvol_state_t for a given zvol is * allocated and placed on zvol_state_list, and then other minor operations * for this zvol are going to proceed in the order of issue. * */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_inhibit_dev = 0; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; unsigned int zvol_threads = 0; unsigned int zvol_num_taskqs = 0; unsigned int zvol_request_sync = 0; struct hlist_head *zvol_htable; static list_t zvol_state_list; krwlock_t zvol_state_lock; extern int zfs_bclone_wait_dirty; zv_taskq_t zvol_taskqs; typedef enum { ZVOL_ASYNC_CREATE_MINORS, ZVOL_ASYNC_REMOVE_MINORS, ZVOL_ASYNC_RENAME_MINORS, ZVOL_ASYNC_SET_SNAPDEV, ZVOL_ASYNC_SET_VOLMODE, ZVOL_ASYNC_MAX } zvol_async_op_t; typedef struct { zvol_async_op_t zt_op; char zt_name1[MAXNAMELEN]; char zt_name2[MAXNAMELEN]; uint64_t zt_value; uint32_t zt_total; uint32_t zt_done; int32_t zt_status; int zt_error; } zvol_task_t; zv_request_task_t * zv_request_task_create(zv_request_t zvr) { zv_request_task_t *task; task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); taskq_init_ent(&task->ent); task->zvr = zvr; return (task); } void zv_request_task_free(zv_request_task_t *task) { kmem_free(task, sizeof (*task)); } uint64_t zvol_name_hash(const char *name) { uint64_t crc = -1ULL; ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); for (const uint8_t *p = (const uint8_t *)name; *p != 0; p++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; return (crc); } /* * Find a zvol_state_t given the name and hash generated by zvol_name_hash. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ zvol_state_t * zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) { zvol_state_t *zv; struct hlist_node *p = NULL; rw_enter(&zvol_state_lock, RW_READER); hlist_for_each(p, ZVOL_HT_HEAD(hash)) { zv = hlist_entry(p, zvol_state_t, zv_hlink); mutex_enter(&zv->zv_state_lock); if (zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0) { /* * this is the right zvol, take the locks in the * right order */ if (mode != RW_NONE && !rw_tryenter(&zv->zv_suspend_lock, mode)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, mode); mutex_enter(&zv->zv_state_lock); /* * zvol cannot be renamed as we continue * to hold zvol_state_lock */ ASSERT(zv->zv_hash == hash && strcmp(zv->zv_name, name) == 0); } rw_exit(&zvol_state_lock); return (zv); } mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (NULL); } /* * Find a zvol_state_t given the name. * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, * return (NULL) without the taking locks. The zv_suspend_lock is always taken * before zv_state_lock. The mode argument indicates the mode (including none) * for zv_suspend_lock to be taken. */ static zvol_state_t * zvol_find_by_name(const char *name, int mode) { return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); } /* * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. */ void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; nvlist_t *nvprops = zct->zct_props; int error; uint64_t volblocksize, volsize; VERIFY0(nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); /* * These properties must be removed from the list so the generic * property setting step won't apply to them. */ VERIFY0(nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE))); (void) nvlist_remove_all(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, DMU_OT_NONE, 0, tx); ASSERT0(error); error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, DMU_OT_NONE, 0, tx); ASSERT0(error); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); ASSERT0(error); } /* * ZFS_IOC_OBJSET_STATS entry point. */ int zvol_get_stats(objset_t *os, nvlist_t *nv) { int error; dmu_object_info_t *doi; uint64_t val; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (error); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_object_info(os, ZVOL_OBJ, doi); if (error == 0) { dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, doi->doi_data_block_size); } kmem_free(doi, sizeof (dmu_object_info_t)); return (error); } /* * Sanity check volume size. */ int zvol_check_volsize(uint64_t volsize, uint64_t blocksize) { if (volsize == 0) return (SET_ERROR(EINVAL)); if (volsize % blocksize != 0) return (SET_ERROR(EINVAL)); #ifdef _ILP32 if (volsize - 1 > SPEC_MAXOFFSET_T) return (SET_ERROR(EOVERFLOW)); #endif return (0); } /* * Ensure the zap is flushed then inform the VFS of the capacity change. */ static int zvol_update_volsize(uint64_t volsize, objset_t *os) { dmu_tx_t *tx; int error; uint64_t txg; tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); return (error); } txg = dmu_tx_get_txg(tx); error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); txg_wait_synced(dmu_objset_pool(os), txg); if (error == 0) error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); return (error); } /* * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume * size will result in a udev "change" event being generated. */ int zvol_set_volsize(const char *name, uint64_t volsize) { objset_t *os = NULL; uint64_t readonly; int error; boolean_t owned = B_FALSE; error = dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); if (error != 0) return (error); if (readonly) return (SET_ERROR(EROFS)); zvol_state_t *zv = zvol_find_by_name(name, RW_READER); ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) && RW_READ_HELD(&zv->zv_suspend_lock))); if (zv == NULL || zv->zv_objset == NULL) { if (zv != NULL) rw_exit(&zv->zv_suspend_lock); if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE, FTAG, &os)) != 0) { if (zv != NULL) mutex_exit(&zv->zv_state_lock); return (error); } owned = B_TRUE; if (zv != NULL) zv->zv_objset = os; } else { os = zv->zv_objset; } dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP); if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) goto out; error = zvol_update_volsize(volsize, os); if (error == 0 && zv != NULL) { zv->zv_volsize = volsize; zv->zv_changed = 1; } out: kmem_free(doi, sizeof (dmu_object_info_t)); if (owned) { dmu_objset_disown(os, B_TRUE, FTAG); if (zv != NULL) zv->zv_objset = NULL; } else { rw_exit(&zv->zv_suspend_lock); } if (zv != NULL) mutex_exit(&zv->zv_state_lock); if (error == 0 && zv != NULL) zvol_os_update_volsize(zv, volsize); return (error); } /* * Update volthreading. */ int zvol_set_volthreading(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) return (SET_ERROR(ENOENT)); zv->zv_threading = value; mutex_exit(&zv->zv_state_lock); return (0); } /* * Update zvol ro property. */ int zvol_set_ro(const char *name, boolean_t value) { zvol_state_t *zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL) return (-1); if (value) { zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } mutex_exit(&zv->zv_state_lock); return (0); } /* * Sanity check volume block size. */ int zvol_check_volblocksize(const char *name, uint64_t volblocksize) { /* Record sizes above 128k need the feature to be enabled */ if (volblocksize > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (volblocksize > zfs_max_recordsize) { spa_close(spa, FTAG); return (SET_ERROR(EDOM)); } spa_close(spa, FTAG); } if (volblocksize < SPA_MINBLOCKSIZE || volblocksize > SPA_MAXBLOCKSIZE || !ISP2(volblocksize)) return (SET_ERROR(EDOM)); return (0); } /* * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we * implement DKIOCFREE/free-long-range. */ static int zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_truncate_t *lr = arg2; uint64_t offset, length; ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); int error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length); } return (error); } /* * Replay a TX_WRITE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_write_t *lr = arg2; objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ uint64_t offset, length; dmu_tx_t *tx; int error; ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); offset = lr->lr_offset; length = lr->lr_length; /* If it's a dmu_sync() block, write the whole block */ if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); if (length < blocksize) { offset -= offset % blocksize; length = blocksize; } } tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error) { dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, offset, length, data, tx); (void) zil_replaying(zv->zv_zilog, tx); dmu_tx_commit(tx); } return (error); } /* * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed * after a system failure */ static int zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) { zvol_state_t *zv = arg1; lr_clone_range_t *lr = arg2; objset_t *os = zv->zv_objset; dmu_tx_t *tx; int error; uint64_t blksz; uint64_t off; uint64_t len; ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, lr_bps[lr->lr_nbps])); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); ASSERT(spa_feature_is_enabled(dmu_objset_spa(os), SPA_FEATURE_BLOCK_CLONING)); off = lr->lr_offset; len = lr->lr_length; blksz = lr->lr_blksz; if ((off % blksz) != 0) { return (SET_ERROR(EINVAL)); } error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error != 0 || !zv->zv_dn) return (error); tx = dmu_tx_create(os); dmu_tx_hold_clone_by_dnode(tx, zv->zv_dn, off, len, blksz); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { dmu_tx_abort(tx); goto out; } error = dmu_brt_clone(zv->zv_objset, ZVOL_OBJ, off, len, tx, lr->lr_bps, lr->lr_nbps); if (error != 0) { dmu_tx_commit(tx); goto out; } /* * zil_replaying() not only check if we are replaying ZIL, but also * updates the ZIL header to record replay progress. */ VERIFY(zil_replaying(zv->zv_zilog, tx)); dmu_tx_commit(tx); out: dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; return (error); } int zvol_clone_range(zvol_state_t *zv_src, uint64_t inoff, zvol_state_t *zv_dst, uint64_t outoff, uint64_t len) { zilog_t *zilog_dst; zfs_locked_range_t *inlr, *outlr; objset_t *inos, *outos; dmu_tx_t *tx; blkptr_t *bps; size_t maxblocks; int error = 0; rw_enter(&zv_dst->zv_suspend_lock, RW_READER); if (zv_dst->zv_zilog == NULL) { rw_exit(&zv_dst->zv_suspend_lock); rw_enter(&zv_dst->zv_suspend_lock, RW_WRITER); if (zv_dst->zv_zilog == NULL) { zv_dst->zv_zilog = zil_open(zv_dst->zv_objset, zvol_get_data, &zv_dst->zv_kstat.dk_zil_sums); zv_dst->zv_flags |= ZVOL_WRITTEN_TO; VERIFY0((zv_dst->zv_zilog->zl_header->zh_flags & ZIL_REPLAY_NEEDED)); } rw_downgrade(&zv_dst->zv_suspend_lock); } if (zv_src != zv_dst) rw_enter(&zv_src->zv_suspend_lock, RW_READER); inos = zv_src->zv_objset; outos = zv_dst->zv_objset; /* * Sanity checks */ if (!spa_feature_is_enabled(dmu_objset_spa(outos), SPA_FEATURE_BLOCK_CLONING)) { error = SET_ERROR(EOPNOTSUPP); goto out; } if (dmu_objset_spa(inos) != dmu_objset_spa(outos)) { error = SET_ERROR(EXDEV); goto out; } if (inos->os_encrypted != outos->os_encrypted) { error = SET_ERROR(EXDEV); goto out; } if (zv_src->zv_volblocksize != zv_dst->zv_volblocksize) { error = SET_ERROR(EINVAL); goto out; } if (inoff >= zv_src->zv_volsize || outoff >= zv_dst->zv_volsize) { goto out; } /* * Do not read beyond boundary */ if (len > zv_src->zv_volsize - inoff) len = zv_src->zv_volsize - inoff; if (len > zv_dst->zv_volsize - outoff) len = zv_dst->zv_volsize - outoff; if (len == 0) goto out; /* * No overlapping if we are cloning within the same file */ if (zv_src == zv_dst) { if (inoff < outoff + len && outoff < inoff + len) { error = SET_ERROR(EINVAL); goto out; } } /* * Offsets and length must be at block boundaries */ if ((inoff % zv_src->zv_volblocksize) != 0 || (outoff % zv_dst->zv_volblocksize) != 0) { error = SET_ERROR(EINVAL); goto out; } /* * Length must be multiple of block size */ if ((len % zv_src->zv_volblocksize) != 0) { error = SET_ERROR(EINVAL); goto out; } zilog_dst = zv_dst->zv_zilog; maxblocks = zil_max_log_data(zilog_dst, sizeof (lr_clone_range_t)) / sizeof (bps[0]); bps = vmem_alloc(sizeof (bps[0]) * maxblocks, KM_SLEEP); /* * Maintain predictable lock order. */ if (zv_src < zv_dst || (zv_src == zv_dst && inoff < outoff)) { inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, RL_READER); outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, RL_WRITER); } else { outlr = zfs_rangelock_enter(&zv_dst->zv_rangelock, outoff, len, RL_WRITER); inlr = zfs_rangelock_enter(&zv_src->zv_rangelock, inoff, len, RL_READER); } while (len > 0) { uint64_t size, last_synced_txg; size_t nbps = maxblocks; size = MIN(zv_src->zv_volblocksize * maxblocks, len); last_synced_txg = spa_last_synced_txg( dmu_objset_spa(zv_src->zv_objset)); error = dmu_read_l0_bps(zv_src->zv_objset, ZVOL_OBJ, inoff, size, bps, &nbps); if (error != 0) { /* * If we are trying to clone a block that was created * in the current transaction group, the error will be * EAGAIN here. Based on zfs_bclone_wait_dirty either * return a shortened range to the caller so it can * fallback, or wait for the next TXG and check again. */ if (error == EAGAIN && zfs_bclone_wait_dirty) { txg_wait_synced(dmu_objset_pool (zv_src->zv_objset), last_synced_txg + 1); continue; } break; } tx = dmu_tx_create(zv_dst->zv_objset); dmu_tx_hold_clone_by_dnode(tx, zv_dst->zv_dn, outoff, size, zv_src->zv_volblocksize); error = dmu_tx_assign(tx, DMU_TX_WAIT); if (error != 0) { dmu_tx_abort(tx); break; } error = dmu_brt_clone(zv_dst->zv_objset, ZVOL_OBJ, outoff, size, tx, bps, nbps); if (error != 0) { dmu_tx_commit(tx); break; } zvol_log_clone_range(zilog_dst, tx, TX_CLONE_RANGE, outoff, size, zv_src->zv_volblocksize, bps, nbps); dmu_tx_commit(tx); inoff += size; outoff += size; len -= size; } vmem_free(bps, sizeof (bps[0]) * maxblocks); zfs_rangelock_exit(outlr); zfs_rangelock_exit(inlr); if (error == 0 && zv_dst->zv_objset->os_sync == ZFS_SYNC_ALWAYS) { error = zil_commit(zilog_dst, ZVOL_OBJ); } out: if (zv_src != zv_dst) rw_exit(&zv_src->zv_suspend_lock); rw_exit(&zv_dst->zv_suspend_lock); return (error); } /* * Handles TX_CLONE_RANGE transactions. */ void zvol_log_clone_range(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t off, uint64_t len, uint64_t blksz, const blkptr_t *bps, size_t nbps) { itx_t *itx; lr_clone_range_t *lr; uint64_t partlen, max_log_data; size_t partnbps; if (zil_replaying(zilog, tx)) return; max_log_data = zil_max_log_data(zilog, sizeof (lr_clone_range_t)); while (nbps > 0) { partnbps = MIN(nbps, max_log_data / sizeof (bps[0])); partlen = partnbps * blksz; ASSERT3U(partlen, <, len + blksz); partlen = MIN(partlen, len); itx = zil_itx_create(txtype, sizeof (*lr) + sizeof (bps[0]) * partnbps); lr = (lr_clone_range_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = partlen; lr->lr_blksz = blksz; lr->lr_nbps = partnbps; memcpy(lr->lr_bps, bps, sizeof (bps[0]) * partnbps); zil_itx_assign(zilog, itx, tx); bps += partnbps; ASSERT3U(nbps, >=, partnbps); nbps -= partnbps; off += partlen; ASSERT3U(len, >=, partlen); len -= partlen; } } static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { (void) arg1, (void) arg2, (void) byteswap; return (SET_ERROR(ENOTSUP)); } /* * Callback vectors for replaying records. * Only TX_WRITE and TX_TRUNCATE are needed for zvol. */ zil_replay_func_t *const zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* no such transaction type */ zvol_replay_err, /* TX_CREATE */ zvol_replay_err, /* TX_MKDIR */ zvol_replay_err, /* TX_MKXATTR */ zvol_replay_err, /* TX_SYMLINK */ zvol_replay_err, /* TX_REMOVE */ zvol_replay_err, /* TX_RMDIR */ zvol_replay_err, /* TX_LINK */ zvol_replay_err, /* TX_RENAME */ zvol_replay_write, /* TX_WRITE */ zvol_replay_truncate, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL_V0 */ zvol_replay_err, /* TX_ACL */ zvol_replay_err, /* TX_CREATE_ACL */ zvol_replay_err, /* TX_CREATE_ATTR */ zvol_replay_err, /* TX_CREATE_ACL_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL */ zvol_replay_err, /* TX_MKDIR_ATTR */ zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ zvol_replay_err, /* TX_WRITE2 */ zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ zvol_replay_clone_range, /* TX_CLONE_RANGE */ }; /* * zvol_log_write() handles TX_WRITE transactions. */ void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, uint64_t size, boolean_t commit) { uint32_t blocksize = zv->zv_volblocksize; zilog_t *zilog = zv->zv_zilog; itx_wr_state_t write_state; uint64_t log_size = 0; if (zil_replaying(zilog, tx)) return; write_state = zil_write_state(zilog, size, blocksize, B_FALSE, commit); while (size) { itx_t *itx; lr_write_t *lr; itx_wr_state_t wr_state = write_state; ssize_t len = size; if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog)) wr_state = WR_NEED_COPY; else if (wr_state == WR_INDIRECT) len = MIN(blocksize - P2PHASE(offset, blocksize), size); itx = zil_itx_create(TX_WRITE, sizeof (*lr) + (wr_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, offset, len, lr + 1, DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING) != 0) { zil_itx_destroy(itx, 0); itx = zil_itx_create(TX_WRITE, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; wr_state = WR_NEED_COPY; } log_size += itx->itx_size; if (wr_state == WR_NEED_COPY) log_size += len; itx->itx_wr_state = wr_state; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = offset; lr->lr_length = len; lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); itx->itx_private = zv; zil_itx_assign(zilog, itx, tx); offset += len; size -= len; } dsl_pool_wrlog_count(zilog->zl_dmu_pool, log_size, tx->tx_txg); } /* * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. */ void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len) { itx_t *itx; lr_truncate_t *lr; zilog_t *zilog = zv->zv_zilog; if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); lr = (lr_truncate_t *)&itx->itx_lr; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; lr->lr_length = len; zil_itx_assign(zilog, itx, tx); } static void zvol_get_done(zgd_t *zgd, int error) { (void) error; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); kmem_free(zgd, sizeof (zgd_t)); } /* * Get data to generate a TX_WRITE intent log record. */ int zvol_get_data(void *arg, uint64_t arg2, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zvol_state_t *zv = arg; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error; ASSERT3P(lwb, !=, NULL); ASSERT3U(size, !=, 0); zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf, DMU_READ_NO_PREFETCH | DMU_KEEP_CACHING); } else { /* indirect write */ ASSERT3P(zio, !=, NULL); /* * Have to lock the whole block to ensure when it's written out * and its checksum is being calculated that no one can change * the data. Contrarily to zfs_get_data we need not re-check * blocksize after we get the lock because it cannot be changed. */ size = zv->zv_volblocksize; offset = P2ALIGN_TYPED(offset, size, uint64_t); zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, size, RL_READER); error = dmu_buf_hold_noread_by_dnode(zv->zv_dn, offset, zgd, &db); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db != NULL); ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zvol_get_done, zgd); if (error == 0) return (0); } } zvol_get_done(zgd, error); return (error); } /* * The zvol_state_t's are inserted into zvol_state_list and zvol_htable. */ void zvol_insert(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_insert_head(&zvol_state_list, zv); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); } /* * Simply remove the zvol from to list of zvols. */ static void zvol_remove(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zvol_state_lock)); list_remove(&zvol_state_list, zv); hlist_del(&zv->zv_hlink); } /* * Setup zv after we just own the zv->objset */ static int zvol_setup_zv(zvol_state_t *zv) { uint64_t volsize; int error; uint64_t ro; objset_t *os = zv->zv_objset; ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock)); zv->zv_zilog = NULL; zv->zv_flags &= ~ZVOL_WRITTEN_TO; error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL); if (error) return (error); error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) return (error); error = dnode_hold(os, ZVOL_OBJ, zv, &zv->zv_dn); if (error) return (error); zvol_os_set_capacity(zv, volsize >> 9); zv->zv_volsize = volsize; if (ro || dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) { zvol_os_set_disk_ro(zv, 1); zv->zv_flags |= ZVOL_RDONLY; } else { zvol_os_set_disk_ro(zv, 0); zv->zv_flags &= ~ZVOL_RDONLY; } return (0); } /* * Shutdown every zv_objset related stuff except zv_objset itself. * The is the reverse of zvol_setup_zv. */ static void zvol_shutdown_zv(zvol_state_t *zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock) && RW_LOCK_HELD(&zv->zv_suspend_lock)); if (zv->zv_flags & ZVOL_WRITTEN_TO) { ASSERT(zv->zv_zilog != NULL); zil_close(zv->zv_zilog); } zv->zv_zilog = NULL; dnode_rele(zv->zv_dn, zv); zv->zv_dn = NULL; /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ if (zv->zv_flags & ZVOL_WRITTEN_TO) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); dmu_objset_evict_dbufs(zv->zv_objset); } /* * return the proper tag for rollback and recv */ void * zvol_tag(zvol_state_t *zv) { ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); return (zv->zv_open_count > 0 ? zv : NULL); } /* * Suspend the zvol for recv and rollback. */ zvol_state_t * zvol_suspend(const char *name) { zvol_state_t *zv; zv = zvol_find_by_name(name, RW_WRITER); if (zv == NULL) return (NULL); /* block all I/O, release in zvol_resume. */ ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); atomic_inc(&zv->zv_suspend_ref); if (zv->zv_open_count > 0) zvol_shutdown_zv(zv); /* * do not hold zv_state_lock across suspend/resume to * avoid locking up zvol lookups */ mutex_exit(&zv->zv_state_lock); /* zv_suspend_lock is released in zvol_resume() */ return (zv); } int zvol_resume(zvol_state_t *zv) { int error = 0; ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock)); mutex_enter(&zv->zv_state_lock); if (zv->zv_open_count > 0) { VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset)); VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv); VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset)); dmu_objset_rele(zv->zv_objset, zv); error = zvol_setup_zv(zv); } mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); /* * We need this because we don't hold zvol_state_lock while releasing * zv_suspend_lock. zvol_remove_minors_impl thus cannot check * zv_suspend_lock to determine it is safe to free because rwlock is * not inherent atomic. */ atomic_dec(&zv->zv_suspend_ref); if (zv->zv_flags & ZVOL_REMOVING) cv_broadcast(&zv->zv_removing_cv); return (error); } int zvol_first_open(zvol_state_t *zv, boolean_t readonly) { objset_t *os; int error; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); ASSERT(mutex_owned(&spa_namespace_lock)); boolean_t ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); if (error) return (error); zv->zv_objset = os; error = zvol_setup_zv(zv); if (error) { dmu_objset_disown(os, 1, zv); zv->zv_objset = NULL; } return (error); } void zvol_last_close(zvol_state_t *zv) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_flags & ZVOL_REMOVING) cv_broadcast(&zv->zv_removing_cv); zvol_shutdown_zv(zv); dmu_objset_disown(zv->zv_objset, 1, zv); zv->zv_objset = NULL; } typedef struct minors_job { list_t *list; list_node_t link; /* input */ char *name; /* output */ int error; } minors_job_t; /* * Prefetch zvol dnodes for the minors_job */ static void zvol_prefetch_minors_impl(void *arg) { minors_job_t *job = arg; char *dsname = job->name; objset_t *os = NULL; job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (job->error == 0) { dmu_prefetch_dnode(os, ZVOL_OBJ, ZIO_PRIORITY_SYNC_READ); dmu_objset_disown(os, B_TRUE, FTAG); } } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_snap_minor_cb(const char *dsname, void *arg) { minors_job_t *j = arg; list_t *minors_list = j->list; const char *name = j->name; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); /* skip the designated dataset */ if (name && strcmp(dsname, name) == 0) return (0); /* at this point, the dsname should name a snapshot */ if (strchr(dsname, '@') == 0) { dprintf("zvol_create_snap_minor_cb(): " "%s is not a snapshot name\n", dsname); } else { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); } return (0); } /* * If spa_keystore_load_wkey() is called for an encrypted zvol, * we need to look for any clones also using the key. This function * is "best effort" - so we just skip over it if there are failures. */ static void zvol_add_clones(const char *dsname, list_t *minors_list) { /* Also check if it has clones */ dsl_dir_t *dd = NULL; dsl_pool_t *dp = NULL; if (dsl_pool_hold(dsname, FTAG, &dp) != 0) return; if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) goto out; if (dsl_dir_hold(dp, dsname, FTAG, &dd, NULL) != 0) goto out; if (dsl_dir_phys(dd)->dd_clones == 0) goto out; zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); zap_attribute_t *za = zap_attribute_alloc(); objset_t *mos = dd->dd_pool->dp_meta_objset; for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones); zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { dsl_dataset_t *clone; minors_job_t *job; if (dsl_dataset_hold_obj(dd->dd_pool, za->za_first_integer, FTAG, &clone) == 0) { char name[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(clone, name); char *n = kmem_strdup(name); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); dsl_dataset_rele(clone, FTAG); } } zap_cursor_fini(zc); zap_attribute_free(za); kmem_free(zc, sizeof (zap_cursor_t)); out: if (dd != NULL) dsl_dir_rele(dd, FTAG); dsl_pool_rele(dp, FTAG); } /* * Mask errors to continue dmu_objset_find() traversal */ static int zvol_create_minors_cb(const char *dsname, void *arg) { uint64_t snapdev; int error; list_t *minors_list = arg; ASSERT0(MUTEX_HELD(&spa_namespace_lock)); error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); if (error) return (0); /* * Given the name and the 'snapdev' property, create device minor nodes * with the linkages to zvols/snapshots as needed. * If the name represents a zvol, create a minor node for the zvol, then * check if its snapshots are 'visible', and if so, iterate over the * snapshots and create device minor nodes for those. */ if (strchr(dsname, '@') == 0) { minors_job_t *job; char *n = kmem_strdup(dsname); if (n == NULL) return (0); job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); job->name = n; job->list = minors_list; job->error = 0; list_insert_tail(minors_list, job); /* don't care if dispatch fails, because job->error is 0 */ taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, TQ_SLEEP); zvol_add_clones(dsname, minors_list); if (snapdev == ZFS_SNAPDEV_VISIBLE) { /* * traverse snapshots only, do not traverse children, * and skip the 'dsname' */ (void) dmu_objset_find(dsname, zvol_create_snap_minor_cb, (void *)job, DS_FIND_SNAPSHOTS); } } else { dprintf("zvol_create_minors_cb(): %s is not a zvol name\n", dsname); } return (0); } static void zvol_task_update_status(zvol_task_t *task, uint64_t total, uint64_t done, int error) { task->zt_total += total; task->zt_done += done; if (task->zt_total != task->zt_done) { task->zt_status = -1; if (error) task->zt_error = error; } } static void zvol_task_report_status(zvol_task_t *task) { #ifdef ZFS_DEBUG static const char *const msg[] = { "create", "remove", "rename", "set snapdev", "set volmode", "unknown", }; if (task->zt_status == 0) return; zvol_async_op_t op = MIN(task->zt_op, ZVOL_ASYNC_MAX); if (task->zt_error) { dprintf("The %s minors zvol task was not ok, last error %d\n", msg[op], task->zt_error); } else { dprintf("The %s minors zvol task was not ok\n", msg[op]); } #else (void) task; #endif } /* * Create minors for the specified dataset, including children and snapshots. * Pay attention to the 'snapdev' property and iterate over the snapshots * only if they are 'visible'. This approach allows one to assure that the * snapshot metadata is read from disk only if it is needed. * * The name can represent a dataset to be recursively scanned for zvols and * their snapshots, or a single zvol snapshot. If the name represents a * dataset, the scan is performed in two nested stages: * - scan the dataset for zvols, and * - for each zvol, create a minor node, then check if the zvol's snapshots * are 'visible', and only then iterate over the snapshots if needed * * If the name represents a snapshot, a check is performed if the snapshot is * 'visible' (which also verifies that the parent is a zvol), and if so, * a minor node for that snapshot is created. */ static void zvol_create_minors_impl(zvol_task_t *task) { const char *name = task->zt_name1; list_t minors_list; minors_job_t *job; uint64_t snapdev; int total = 0, done = 0, last_error, error; /* * Note: the dsl_pool_config_lock must not be held. * Minor node creation needs to obtain the zvol_state_lock. * zvol_open() obtains the zvol_state_lock and then the dsl pool * config lock. Therefore, we can't have the config lock now if * we are going to wait for the zvol_state_lock, because it * would be a lock order inversion which could lead to deadlock. */ if (zvol_inhibit_dev) { return; } /* * This is the list for prefetch jobs. Whenever we found a match * during dmu_objset_find, we insert a minors_job to the list and do * taskq_dispatch to parallel prefetch zvol dnodes. Note we don't need * any lock because all list operation is done on the current thread. * * We will use this list to do zvol_os_create_minor after prefetch * so we don't have to traverse using dmu_objset_find again. */ list_create(&minors_list, sizeof (minors_job_t), offsetof(minors_job_t, link)); if (strchr(name, '@') != NULL) { error = dsl_prop_get_integer(name, "snapdev", &snapdev, NULL); if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) { error = zvol_os_create_minor(name); if (error == 0) { done++; } else { last_error = error; } total++; } } else { fstrans_cookie_t cookie = spl_fstrans_mark(); (void) dmu_objset_find(name, zvol_create_minors_cb, &minors_list, DS_FIND_CHILDREN); spl_fstrans_unmark(cookie); } taskq_wait_outstanding(system_taskq, 0); /* * Prefetch is completed, we can do zvol_os_create_minor * sequentially. */ while ((job = list_remove_head(&minors_list)) != NULL) { if (!job->error) { error = zvol_os_create_minor(job->name); if (error == 0) { done++; } else { last_error = error; } } else if (job->error == EINVAL) { /* * The objset, with the name requested by current job * exist, but have the type different from zvol. * Just ignore this sort of errors. */ done++; } else { last_error = job->error; } total++; kmem_strfree(job->name); kmem_free(job, sizeof (minors_job_t)); } list_destroy(&minors_list); zvol_task_update_status(task, total, done, last_error); } /* * Remove minors for specified dataset including children and snapshots. */ /* * Remove the minor for a given zvol. This will do it all: * - flag the zvol for removal, so new requests are rejected * - wait until outstanding requests are completed * - remove it from lists * - free it * It's also usable as a taskq task, and smells nice too. */ static void zvol_remove_minor_task(void *arg) { zvol_state_t *zv = (zvol_state_t *)arg; ASSERT(!RW_LOCK_HELD(&zvol_state_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); mutex_enter(&zv->zv_state_lock); while (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { zv->zv_flags |= ZVOL_REMOVING; cv_wait(&zv->zv_removing_cv, &zv->zv_state_lock); } mutex_exit(&zv->zv_state_lock); rw_enter(&zvol_state_lock, RW_WRITER); mutex_enter(&zv->zv_state_lock); + zvol_os_remove_minor(zv); zvol_remove(zv); - zvol_os_clear_private(zv); mutex_exit(&zv->zv_state_lock); rw_exit(&zvol_state_lock); zvol_os_free(zv); } static void zvol_free_task(void *arg) { zvol_os_free(arg); } /* * Remove minors for specified dataset and, optionally, its children and * snapshots. */ static void zvol_remove_minors_impl(zvol_task_t *task) { zvol_state_t *zv, *zv_next; const char *name = task ? task->zt_name1 : NULL; int namelen = ((name) ? strlen(name) : 0); boolean_t children = task ? !!task->zt_value : B_TRUE; taskqid_t t; list_t delay_list, free_list; if (zvol_inhibit_dev) return; list_create(&delay_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); list_create(&free_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); int error = ENOENT; rw_enter(&zvol_state_lock, RW_WRITER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); /* * This zvol should be removed if: * - no name was offered (ie removing all at shutdown); or * - name matches exactly; or * - we were asked to remove children, and * - the start of the name matches, and * - there is a '/' immediately after the matched name; or * - there is a '@' immediately after the matched name */ if (name == NULL || strcmp(zv->zv_name, name) == 0 || (children && strncmp(zv->zv_name, name, namelen) == 0 && (zv->zv_name[namelen] == '/' || zv->zv_name[namelen] == '@'))) { /* * By holding zv_state_lock here, we guarantee that no * one is currently using this zv */ error = 0; /* * If in use, try to throw everyone off and try again * later. */ + zv->zv_flags |= ZVOL_REMOVING; if (zv->zv_open_count > 0 || atomic_read(&zv->zv_suspend_ref)) { - zv->zv_flags |= ZVOL_REMOVING; t = taskq_dispatch( zv->zv_objset->os_spa->spa_zvol_taskq, zvol_remove_minor_task, zv, TQ_SLEEP); if (t == TASKQID_INVALID) { /* * Couldn't create the task, so we'll * do it in place once the loop is * finished. */ list_insert_head(&delay_list, zv); } mutex_exit(&zv->zv_state_lock); continue; } + zvol_os_remove_minor(zv); zvol_remove(zv); - /* - * Cleared while holding zvol_state_lock as a writer - * which will prevent zvol_open() from opening it. - */ - zvol_os_clear_private(zv); - /* Drop zv_state_lock before zvol_free() */ mutex_exit(&zv->zv_state_lock); /* Try parallel zv_free, if failed do it in place */ t = taskq_dispatch(system_taskq, zvol_free_task, zv, TQ_SLEEP); if (t == TASKQID_INVALID) list_insert_head(&free_list, zv); } else { mutex_exit(&zv->zv_state_lock); } } rw_exit(&zvol_state_lock); /* Wait for zvols that we couldn't create a remove task for */ while ((zv = list_remove_head(&delay_list)) != NULL) zvol_remove_minor_task(zv); /* Free any that we couldn't free in parallel earlier */ while ((zv = list_remove_head(&free_list)) != NULL) zvol_os_free(zv); if (error && task) task->zt_error = SET_ERROR(error); } /* Remove minor for this specific volume only */ static int zvol_remove_minor_impl(const char *name) { if (zvol_inhibit_dev) return (0); zvol_task_t task; memset(&task, 0, sizeof (zvol_task_t)); strlcpy(task.zt_name1, name, sizeof (task.zt_name1)); task.zt_value = B_FALSE; zvol_remove_minors_impl(&task); return (task.zt_error); } /* * Rename minors for specified dataset including children and snapshots. */ static void zvol_rename_minors_impl(zvol_task_t *task) { zvol_state_t *zv, *zv_next; const char *oldname = task->zt_name1; const char *newname = task->zt_name2; int total = 0, done = 0, last_error, error, oldnamelen; if (zvol_inhibit_dev) return; oldnamelen = strlen(oldname); rw_enter(&zvol_state_lock, RW_READER); for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { zv_next = list_next(&zvol_state_list, zv); mutex_enter(&zv->zv_state_lock); if (strcmp(zv->zv_name, oldname) == 0) { error = zvol_os_rename_minor(zv, newname); } else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 && (zv->zv_name[oldnamelen] == '/' || zv->zv_name[oldnamelen] == '@')) { char *name = kmem_asprintf("%s%c%s", newname, zv->zv_name[oldnamelen], zv->zv_name + oldnamelen + 1); error = zvol_os_rename_minor(zv, name); kmem_strfree(name); } if (error) { last_error = error; } else { done++; } total++; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); zvol_task_update_status(task, total, done, last_error); } typedef struct zvol_snapdev_cb_arg { zvol_task_t *task; uint64_t snapdev; } zvol_snapdev_cb_arg_t; static int zvol_set_snapdev_cb(const char *dsname, void *param) { zvol_snapdev_cb_arg_t *arg = param; int error = 0; if (strchr(dsname, '@') == NULL) return (0); switch (arg->snapdev) { case ZFS_SNAPDEV_VISIBLE: error = zvol_os_create_minor(dsname); break; case ZFS_SNAPDEV_HIDDEN: error = zvol_remove_minor_impl(dsname); break; } zvol_task_update_status(arg->task, 1, error == 0, error); return (0); } static void zvol_set_snapdev_impl(zvol_task_t *task) { const char *name = task->zt_name1; uint64_t snapdev = task->zt_value; zvol_snapdev_cb_arg_t arg = {task, snapdev}; fstrans_cookie_t cookie = spl_fstrans_mark(); /* * The zvol_set_snapdev_sync() sets snapdev appropriately * in the dataset hierarchy. Here, we only scan snapshots. */ dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS); spl_fstrans_unmark(cookie); } static void zvol_set_volmode_impl(zvol_task_t *task) { const char *name = task->zt_name1; uint64_t volmode = task->zt_value; fstrans_cookie_t cookie; uint64_t old_volmode; zvol_state_t *zv; int error; if (strchr(name, '@') != NULL) return; /* * It's unfortunate we need to remove minors before we create new ones: * this is necessary because our backing gendisk (zvol_state->zv_disk) * could be different when we set, for instance, volmode from "geom" * to "dev" (or vice versa). */ zv = zvol_find_by_name(name, RW_NONE); if (zv == NULL && volmode == ZFS_VOLMODE_NONE) return; if (zv != NULL) { old_volmode = zv->zv_volmode; mutex_exit(&zv->zv_state_lock); if (old_volmode == volmode) return; zvol_wait_close(zv); } cookie = spl_fstrans_mark(); switch (volmode) { case ZFS_VOLMODE_NONE: error = zvol_remove_minor_impl(name); break; case ZFS_VOLMODE_GEOM: case ZFS_VOLMODE_DEV: error = zvol_remove_minor_impl(name); /* * The remove minor function call above, might be not * needed, if volmode was switched from 'none' value. * Ignore error in this case. */ if (error == ENOENT) error = 0; else if (error) break; error = zvol_os_create_minor(name); break; case ZFS_VOLMODE_DEFAULT: error = zvol_remove_minor_impl(name); if (zvol_volmode == ZFS_VOLMODE_NONE) break; else /* if zvol_volmode is invalid defaults to "geom" */ error = zvol_os_create_minor(name); break; } zvol_task_update_status(task, 1, error == 0, error); spl_fstrans_unmark(cookie); } /* * The worker thread function performed asynchronously. */ static void zvol_task_cb(void *arg) { zvol_task_t *task = arg; switch (task->zt_op) { case ZVOL_ASYNC_CREATE_MINORS: zvol_create_minors_impl(task); break; case ZVOL_ASYNC_REMOVE_MINORS: zvol_remove_minors_impl(task); break; case ZVOL_ASYNC_RENAME_MINORS: zvol_rename_minors_impl(task); break; case ZVOL_ASYNC_SET_SNAPDEV: zvol_set_snapdev_impl(task); break; case ZVOL_ASYNC_SET_VOLMODE: zvol_set_volmode_impl(task); break; default: VERIFY(0); break; } zvol_task_report_status(task); kmem_free(task, sizeof (zvol_task_t)); } typedef struct zvol_set_prop_int_arg { const char *zsda_name; uint64_t zsda_value; zprop_source_t zsda_source; zfs_prop_t zsda_prop; } zvol_set_prop_int_arg_t; /* * Sanity check the dataset for safe use by the sync task. No additional * conditions are imposed. */ static int zvol_set_common_check(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; int error; error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); if (error != 0) return (error); dsl_dir_rele(dd, FTAG); return (error); } static int zvol_set_common_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) { zvol_set_prop_int_arg_t *zsda = arg; char dsname[ZFS_MAX_DATASET_NAME_LEN]; zvol_task_t *task; uint64_t prop; const char *prop_name = zfs_prop_to_name(zsda->zsda_prop); dsl_dataset_name(ds, dsname); if (dsl_prop_get_int_ds(ds, prop_name, &prop) != 0) return (0); task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); if (zsda->zsda_prop == ZFS_PROP_VOLMODE) { task->zt_op = ZVOL_ASYNC_SET_VOLMODE; } else if (zsda->zsda_prop == ZFS_PROP_SNAPDEV) { task->zt_op = ZVOL_ASYNC_SET_SNAPDEV; } else { kmem_free(task, sizeof (zvol_task_t)); return (0); } task->zt_value = prop; strlcpy(task->zt_name1, dsname, sizeof (task->zt_name1)); (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); return (0); } /* * Traverse all child datasets and apply the property appropriately. * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel * dataset and read the effective "property" on every child in the callback * function: this is because the value is not guaranteed to be the same in the * whole dataset hierarchy. */ static void zvol_set_common_sync(void *arg, dmu_tx_t *tx) { zvol_set_prop_int_arg_t *zsda = arg; dsl_pool_t *dp = dmu_tx_pool(tx); dsl_dir_t *dd; dsl_dataset_t *ds; int error; VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); if (error == 0) { dsl_prop_set_sync_impl(ds, zfs_prop_to_name(zsda->zsda_prop), zsda->zsda_source, sizeof (zsda->zsda_value), 1, &zsda->zsda_value, tx); dsl_dataset_rele(ds, FTAG); } dmu_objset_find_dp(dp, dd->dd_object, zvol_set_common_sync_cb, zsda, DS_FIND_CHILDREN); dsl_dir_rele(dd, FTAG); } int zvol_set_common(const char *ddname, zfs_prop_t prop, zprop_source_t source, uint64_t val) { zvol_set_prop_int_arg_t zsda; zsda.zsda_name = ddname; zsda.zsda_source = source; zsda.zsda_value = val; zsda.zsda_prop = prop; return (dsl_sync_task(ddname, zvol_set_common_check, zvol_set_common_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); } void zvol_create_minors(const char *name) { spa_t *spa; zvol_task_t *task; taskqid_t id; if (spa_open(name, &spa, FTAG) != 0) return; task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->zt_op = ZVOL_ASYNC_CREATE_MINORS; strlcpy(task->zt_name1, name, sizeof (task->zt_name1)); id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if (id != TASKQID_INVALID) taskq_wait_id(spa->spa_zvol_taskq, id); spa_close(spa, FTAG); } void zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) { zvol_task_t *task; taskqid_t id; task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->zt_op = ZVOL_ASYNC_REMOVE_MINORS; strlcpy(task->zt_name1, name, sizeof (task->zt_name1)); task->zt_value = B_TRUE; id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } void zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, boolean_t async) { zvol_task_t *task; taskqid_t id; task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP); task->zt_op = ZVOL_ASYNC_RENAME_MINORS; strlcpy(task->zt_name1, name1, sizeof (task->zt_name1)); strlcpy(task->zt_name2, name2, sizeof (task->zt_name2)); id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); if ((async == B_FALSE) && (id != TASKQID_INVALID)) taskq_wait_id(spa->spa_zvol_taskq, id); } boolean_t zvol_is_zvol(const char *name) { return (zvol_os_is_zvol(name)); } int zvol_init_impl(void) { int i; /* * zvol_threads is the module param the user passes in. * * zvol_actual_threads is what we use internally, since the user can * pass zvol_thread = 0 to mean "use all the CPUs" (the default). */ static unsigned int zvol_actual_threads; if (zvol_threads == 0) { /* * See dde9380a1 for why 32 was chosen here. This should * probably be refined to be some multiple of the number * of CPUs. */ zvol_actual_threads = MAX(max_ncpus, 32); } else { zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } /* * Use at least 32 zvol_threads but for many core system, * prefer 6 threads per taskq, but no more taskqs * than threads in them on large systems. * * taskq total * cpus taskqs threads threads * ------- ------- ------- ------- * 1 1 32 32 * 2 1 32 32 * 4 1 32 32 * 8 2 16 32 * 16 3 11 33 * 32 5 7 35 * 64 8 8 64 * 128 11 12 132 * 256 16 16 256 */ zv_taskq_t *ztqs = &zvol_taskqs; int num_tqs = MIN(max_ncpus, zvol_num_taskqs); if (num_tqs == 0) { num_tqs = 1 + max_ncpus / 6; while (num_tqs * num_tqs > zvol_actual_threads) num_tqs--; } int per_tq_thread = zvol_actual_threads / num_tqs; if (per_tq_thread * num_tqs < zvol_actual_threads) per_tq_thread++; ztqs->tqs_cnt = num_tqs; ztqs->tqs_taskq = kmem_alloc(num_tqs * sizeof (taskq_t *), KM_SLEEP); for (uint_t i = 0; i < num_tqs; i++) { char name[32]; (void) snprintf(name, sizeof (name), "%s_tq-%u", ZVOL_DRIVER, i); ztqs->tqs_taskq[i] = taskq_create(name, per_tq_thread, maxclsyspri, per_tq_thread, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (ztqs->tqs_taskq[i] == NULL) { for (int j = i - 1; j >= 0; j--) taskq_destroy(ztqs->tqs_taskq[j]); kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); ztqs->tqs_taskq = NULL; return (SET_ERROR(ENOMEM)); } } list_create(&zvol_state_list, sizeof (zvol_state_t), offsetof(zvol_state_t, zv_next)); rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), KM_SLEEP); for (i = 0; i < ZVOL_HT_SIZE; i++) INIT_HLIST_HEAD(&zvol_htable[i]); return (0); } void zvol_fini_impl(void) { zv_taskq_t *ztqs = &zvol_taskqs; zvol_remove_minors_impl(NULL); /* * The call to "zvol_remove_minors_impl" may dispatch entries to * the system_taskq, but it doesn't wait for those entries to * complete before it returns. Thus, we must wait for all of the * removals to finish, before we can continue. */ taskq_wait_outstanding(system_taskq, 0); kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head)); list_destroy(&zvol_state_list); rw_destroy(&zvol_state_lock); if (ztqs->tqs_taskq == NULL) { ASSERT0(ztqs->tqs_cnt); } else { for (uint_t i = 0; i < ztqs->tqs_cnt; i++) { ASSERT3P(ztqs->tqs_taskq[i], !=, NULL); taskq_destroy(ztqs->tqs_taskq[i]); } kmem_free(ztqs->tqs_taskq, ztqs->tqs_cnt * sizeof (taskq_t *)); ztqs->tqs_taskq = NULL; } } ZFS_MODULE_PARAM(zfs_vol, zvol_, inhibit_dev, UINT, ZMOD_RW, "Do not create zvol device nodes"); ZFS_MODULE_PARAM(zfs_vol, zvol_, prefetch_bytes, UINT, ZMOD_RW, "Prefetch N bytes at zvol start+end"); ZFS_MODULE_PARAM(zfs_vol, zvol_vol, mode, UINT, ZMOD_RW, "Default volmode property value"); ZFS_MODULE_PARAM(zfs_vol, zvol_, threads, UINT, ZMOD_RW, "Number of threads for I/O requests. Set to 0 to use all active CPUs"); ZFS_MODULE_PARAM(zfs_vol, zvol_, num_taskqs, UINT, ZMOD_RW, "Number of zvol taskqs"); ZFS_MODULE_PARAM(zfs_vol, zvol_, request_sync, UINT, ZMOD_RW, "Synchronously handle bio requests");