diff --git a/config/kernel-block-device-operations.m4 b/config/kernel-block-device-operations.m4 index 8e64ecca909b..a48618185bfb 100644 --- a/config/kernel-block-device-operations.m4 +++ b/config/kernel-block-device-operations.m4 @@ -1,63 +1,95 @@ dnl # dnl # 2.6.38 API change dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ ZFS_LINUX_TEST_SRC([block_device_operations_check_events], [ #include unsigned int blk_check_events(struct gendisk *disk, unsigned int clearing) { return (0); } static const struct block_device_operations bops __attribute__ ((unused)) = { .check_events = blk_check_events, }; ], [], [$NO_UNUSED_BUT_SET_VARIABLE]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ AC_MSG_CHECKING([whether bops->check_events() exists]) ZFS_LINUX_TEST_RESULT([block_device_operations_check_events], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bops->check_events()]) ]) ]) dnl # dnl # 3.10.x API change dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ ZFS_LINUX_TEST_SRC([block_device_operations_release_void], [ #include void blk_release(struct gendisk *g, fmode_t mode) { return; } static const struct block_device_operations bops __attribute__ ((unused)) = { .open = NULL, .release = blk_release, .ioctl = NULL, .compat_ioctl = NULL, }; ], [], [$NO_UNUSED_BUT_SET_VARIABLE]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ AC_MSG_CHECKING([whether bops->release() is void]) ZFS_LINUX_TEST_RESULT([block_device_operations_release_void], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bops->release()]) ]) ]) +dnl # +dnl # 5.13 API change +dnl # block_device_operations->revalidate_disk() was removed +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [ + ZFS_LINUX_TEST_SRC([block_device_operations_revalidate_disk], [ + #include + + int blk_revalidate_disk(struct gendisk *disk) { + return(0); + } + + static const struct block_device_operations + bops __attribute__ ((unused)) = { + .revalidate_disk = blk_revalidate_disk, + }; + ], [], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [ + AC_MSG_CHECKING([whether bops->revalidate_disk() exists]) + ZFS_LINUX_TEST_RESULT([block_device_operations_revalidate_disk], [ + AC_DEFINE([HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [1], + [Define if revalidate_disk() in block_device_operations]) + AC_MSG_RESULT(yes) + ],[ + AC_MSG_RESULT(no) + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS], [ ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK ]) AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS], [ ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID + ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK ]) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index fe809176fbbb..14cae3563848 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -1,575 +1,579 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Written by Brian Behlendorf . * LLNL-CODE-403049. */ #ifndef _ZFS_BLKDEV_H #define _ZFS_BLKDEV_H #include #include #include #include #include /* for SECTOR_* */ #ifndef HAVE_BLK_QUEUE_FLAG_SET static inline void blk_queue_flag_set(unsigned int flag, struct request_queue *q) { queue_flag_set(flag, q); } #endif #ifndef HAVE_BLK_QUEUE_FLAG_CLEAR static inline void blk_queue_flag_clear(unsigned int flag, struct request_queue *q) { queue_flag_clear(flag, q); } #endif /* * 4.7 - 4.x API, * The blk_queue_write_cache() interface has replaced blk_queue_flush() * interface. However, the new interface is GPL-only thus we implement * our own trivial wrapper when the GPL-only version is detected. * * 2.6.36 - 4.6 API, * The blk_queue_flush() interface has replaced blk_queue_ordered() * interface. However, while the old interface was available to all the * new one is GPL-only. Thus if the GPL-only version is detected we * implement our own trivial helper. */ static inline void blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua) { #if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY) if (wc) blk_queue_flag_set(QUEUE_FLAG_WC, q); else blk_queue_flag_clear(QUEUE_FLAG_WC, q); if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else blk_queue_flag_clear(QUEUE_FLAG_FUA, q); #elif defined(HAVE_BLK_QUEUE_WRITE_CACHE) blk_queue_write_cache(q, wc, fua); #elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY) if (wc) q->flush_flags |= REQ_FLUSH; if (fua) q->flush_flags |= REQ_FUA; #elif defined(HAVE_BLK_QUEUE_FLUSH) blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? REQ_FUA : 0)); #else #error "Unsupported kernel" #endif } static inline void blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) { #ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC q->backing_dev_info->ra_pages = ra_pages; #else q->backing_dev_info.ra_pages = ra_pages; #endif } #ifdef HAVE_BIO_BVEC_ITER #define BIO_BI_SECTOR(bio) (bio)->bi_iter.bi_sector #define BIO_BI_SIZE(bio) (bio)->bi_iter.bi_size #define BIO_BI_IDX(bio) (bio)->bi_iter.bi_idx #define BIO_BI_SKIP(bio) (bio)->bi_iter.bi_bvec_done #define bio_for_each_segment4(bv, bvp, b, i) \ bio_for_each_segment((bv), (b), (i)) typedef struct bvec_iter bvec_iterator_t; #else #define BIO_BI_SECTOR(bio) (bio)->bi_sector #define BIO_BI_SIZE(bio) (bio)->bi_size #define BIO_BI_IDX(bio) (bio)->bi_idx #define BIO_BI_SKIP(bio) (0) #define bio_for_each_segment4(bv, bvp, b, i) \ bio_for_each_segment((bvp), (b), (i)) typedef int bvec_iterator_t; #endif static inline void bio_set_flags_failfast(struct block_device *bdev, int *flags) { #ifdef CONFIG_BUG /* * Disable FAILFAST for loopback devices because of the * following incorrect BUG_ON() in loop_make_request(). * This support is also disabled for md devices because the * test suite layers md devices on top of loopback devices. * This may be removed when the loopback driver is fixed. * * BUG_ON(!lo || (rw != READ && rw != WRITE)); */ if ((MAJOR(bdev->bd_dev) == LOOP_MAJOR) || (MAJOR(bdev->bd_dev) == MD_MAJOR)) return; #ifdef BLOCK_EXT_MAJOR if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) return; #endif /* BLOCK_EXT_MAJOR */ #endif /* CONFIG_BUG */ *flags |= REQ_FAILFAST_MASK; } /* * Maximum disk label length, it may be undefined for some kernels. */ #if !defined(DISK_NAME_LEN) #define DISK_NAME_LEN 32 #endif /* DISK_NAME_LEN */ #ifdef HAVE_BIO_BI_STATUS static inline int bi_status_to_errno(blk_status_t status) { switch (status) { case BLK_STS_OK: return (0); case BLK_STS_NOTSUPP: return (EOPNOTSUPP); case BLK_STS_TIMEOUT: return (ETIMEDOUT); case BLK_STS_NOSPC: return (ENOSPC); case BLK_STS_TRANSPORT: return (ENOLINK); case BLK_STS_TARGET: return (EREMOTEIO); case BLK_STS_NEXUS: return (EBADE); case BLK_STS_MEDIUM: return (ENODATA); case BLK_STS_PROTECTION: return (EILSEQ); case BLK_STS_RESOURCE: return (ENOMEM); case BLK_STS_AGAIN: return (EAGAIN); case BLK_STS_IOERR: return (EIO); default: return (EIO); } } static inline blk_status_t errno_to_bi_status(int error) { switch (error) { case 0: return (BLK_STS_OK); case EOPNOTSUPP: return (BLK_STS_NOTSUPP); case ETIMEDOUT: return (BLK_STS_TIMEOUT); case ENOSPC: return (BLK_STS_NOSPC); case ENOLINK: return (BLK_STS_TRANSPORT); case EREMOTEIO: return (BLK_STS_TARGET); case EBADE: return (BLK_STS_NEXUS); case ENODATA: return (BLK_STS_MEDIUM); case EILSEQ: return (BLK_STS_PROTECTION); case ENOMEM: return (BLK_STS_RESOURCE); case EAGAIN: return (BLK_STS_AGAIN); case EIO: return (BLK_STS_IOERR); default: return (BLK_STS_IOERR); } } #endif /* HAVE_BIO_BI_STATUS */ /* * 4.3 API change * The bio_endio() prototype changed slightly. These are helper * macro's to ensure the prototype and invocation are handled. */ #ifdef HAVE_1ARG_BIO_END_IO_T #ifdef HAVE_BIO_BI_STATUS #define BIO_END_IO_ERROR(bio) bi_status_to_errno(bio->bi_status) #define BIO_END_IO_PROTO(fn, x, z) static void fn(struct bio *x) #define BIO_END_IO(bio, error) bio_set_bi_status(bio, error) static inline void bio_set_bi_status(struct bio *bio, int error) { ASSERT3S(error, <=, 0); bio->bi_status = errno_to_bi_status(-error); bio_endio(bio); } #else #define BIO_END_IO_ERROR(bio) (-(bio->bi_error)) #define BIO_END_IO_PROTO(fn, x, z) static void fn(struct bio *x) #define BIO_END_IO(bio, error) bio_set_bi_error(bio, error) static inline void bio_set_bi_error(struct bio *bio, int error) { ASSERT3S(error, <=, 0); bio->bi_error = error; bio_endio(bio); } #endif /* HAVE_BIO_BI_STATUS */ #else #define BIO_END_IO_PROTO(fn, x, z) static void fn(struct bio *x, int z) #define BIO_END_IO(bio, error) bio_endio(bio, error); #endif /* HAVE_1ARG_BIO_END_IO_T */ /* * 4.1 - x.y.z API, * 3.10.0 CentOS 7.x API, * blkdev_reread_part() * * For older kernels trigger a re-reading of the partition table by calling * check_disk_change() which calls flush_disk() to invalidate the device. * * For newer kernels (as of 5.10), bdev_check_media_change is used, in favor of * check_disk_change(), with the modification that invalidation is no longer * forced. */ #ifdef HAVE_CHECK_DISK_CHANGE #define zfs_check_media_change(bdev) check_disk_change(bdev) #ifdef HAVE_BLKDEV_REREAD_PART #define vdev_bdev_reread_part(bdev) blkdev_reread_part(bdev) #else #define vdev_bdev_reread_part(bdev) check_disk_change(bdev) #endif /* HAVE_BLKDEV_REREAD_PART */ #else #ifdef HAVE_BDEV_CHECK_MEDIA_CHANGE static inline int zfs_check_media_change(struct block_device *bdev) { +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK struct gendisk *gd = bdev->bd_disk; const struct block_device_operations *bdo = gd->fops; +#endif if (!bdev_check_media_change(bdev)) return (0); +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK /* * Force revalidation, to mimic the old behavior of * check_disk_change() */ if (bdo->revalidate_disk) bdo->revalidate_disk(gd); +#endif return (0); } #define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev) #else /* * This is encountered if check_disk_change() and bdev_check_media_change() * are not available in the kernel - likely due to an API change that needs * to be chased down. */ #error "Unsupported kernel: no usable disk change check" #endif /* HAVE_BDEV_CHECK_MEDIA_CHANGE */ #endif /* HAVE_CHECK_DISK_CHANGE */ /* * 2.6.27 API change * The function was exported for use, prior to this it existed but the * symbol was not exported. * * 4.4.0-6.21 API change for Ubuntu * lookup_bdev() gained a second argument, FMODE_*, to check inode permissions. * * 5.11 API change * Changed to take a dev_t argument which is set on success and return a * non-zero error code on failure. */ static inline int vdev_lookup_bdev(const char *path, dev_t *dev) { #if defined(HAVE_DEVT_LOOKUP_BDEV) return (lookup_bdev(path, dev)); #elif defined(HAVE_1ARG_LOOKUP_BDEV) struct block_device *bdev = lookup_bdev(path); if (IS_ERR(bdev)) return (PTR_ERR(bdev)); *dev = bdev->bd_dev; bdput(bdev); return (0); #elif defined(HAVE_MODE_LOOKUP_BDEV) struct block_device *bdev = lookup_bdev(path, FMODE_READ); if (IS_ERR(bdev)) return (PTR_ERR(bdev)); *dev = bdev->bd_dev; bdput(bdev); return (0); #else #error "Unsupported kernel" #endif } /* * Kernels without bio_set_op_attrs use bi_rw for the bio flags. */ #if !defined(HAVE_BIO_SET_OP_ATTRS) static inline void bio_set_op_attrs(struct bio *bio, unsigned rw, unsigned flags) { bio->bi_rw |= rw | flags; } #endif /* * bio_set_flush - Set the appropriate flags in a bio to guarantee * data are on non-volatile media on completion. * * 2.6.37 - 4.8 API, * Introduce WRITE_FLUSH, WRITE_FUA, and WRITE_FLUSH_FUA flags as a * replacement for WRITE_BARRIER to allow expressing richer semantics * to the block layer. It's up to the block layer to implement the * semantics correctly. Use the WRITE_FLUSH_FUA flag combination. * * 4.8 - 4.9 API, * REQ_FLUSH was renamed to REQ_PREFLUSH. For consistency with previous * OpenZFS releases, prefer the WRITE_FLUSH_FUA flag set if it's available. * * 4.10 API, * The read/write flags and their modifiers, including WRITE_FLUSH, * WRITE_FUA and WRITE_FLUSH_FUA were removed from fs.h in * torvalds/linux@70fd7614 and replaced by direct flag modification * of the REQ_ flags in bio->bi_opf. Use REQ_PREFLUSH. */ static inline void bio_set_flush(struct bio *bio) { #if defined(HAVE_REQ_PREFLUSH) /* >= 4.10 */ bio_set_op_attrs(bio, 0, REQ_PREFLUSH); #elif defined(WRITE_FLUSH_FUA) /* >= 2.6.37 and <= 4.9 */ bio_set_op_attrs(bio, 0, WRITE_FLUSH_FUA); #else #error "Allowing the build will cause bio_set_flush requests to be ignored." #endif } /* * 4.8 - 4.x API, * REQ_OP_FLUSH * * 4.8-rc0 - 4.8-rc1, * REQ_PREFLUSH * * 2.6.36 - 4.7 API, * REQ_FLUSH * * in all cases but may have a performance impact for some kernels. It * has the advantage of minimizing kernel specific changes in the zvol code. * */ static inline boolean_t bio_is_flush(struct bio *bio) { #if defined(HAVE_REQ_OP_FLUSH) && defined(HAVE_BIO_BI_OPF) return ((bio_op(bio) == REQ_OP_FLUSH) || (bio->bi_opf & REQ_PREFLUSH)); #elif defined(HAVE_REQ_PREFLUSH) && defined(HAVE_BIO_BI_OPF) return (bio->bi_opf & REQ_PREFLUSH); #elif defined(HAVE_REQ_PREFLUSH) && !defined(HAVE_BIO_BI_OPF) return (bio->bi_rw & REQ_PREFLUSH); #elif defined(HAVE_REQ_FLUSH) return (bio->bi_rw & REQ_FLUSH); #else #error "Unsupported kernel" #endif } /* * 4.8 - 4.x API, * REQ_FUA flag moved to bio->bi_opf * * 2.6.x - 4.7 API, * REQ_FUA */ static inline boolean_t bio_is_fua(struct bio *bio) { #if defined(HAVE_BIO_BI_OPF) return (bio->bi_opf & REQ_FUA); #elif defined(REQ_FUA) return (bio->bi_rw & REQ_FUA); #else #error "Allowing the build will cause fua requests to be ignored." #endif } /* * 4.8 - 4.x API, * REQ_OP_DISCARD * * 2.6.36 - 4.7 API, * REQ_DISCARD * * In all cases the normal I/O path is used for discards. The only * difference is how the kernel tags individual I/Os as discards. */ static inline boolean_t bio_is_discard(struct bio *bio) { #if defined(HAVE_REQ_OP_DISCARD) return (bio_op(bio) == REQ_OP_DISCARD); #elif defined(HAVE_REQ_DISCARD) return (bio->bi_rw & REQ_DISCARD); #else #error "Unsupported kernel" #endif } /* * 4.8 - 4.x API, * REQ_OP_SECURE_ERASE * * 2.6.36 - 4.7 API, * REQ_SECURE */ static inline boolean_t bio_is_secure_erase(struct bio *bio) { #if defined(HAVE_REQ_OP_SECURE_ERASE) return (bio_op(bio) == REQ_OP_SECURE_ERASE); #elif defined(REQ_SECURE) return (bio->bi_rw & REQ_SECURE); #else return (0); #endif } /* * 2.6.33 API change * Discard granularity and alignment restrictions may now be set. For * older kernels which do not support this it is safe to skip it. */ static inline void blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) { q->limits.discard_granularity = dg; } /* * 4.8 - 4.x API, * blk_queue_secure_erase() * * 2.6.36 - 4.7 API, * blk_queue_secdiscard() */ static inline int blk_queue_discard_secure(struct request_queue *q) { #if defined(HAVE_BLK_QUEUE_SECURE_ERASE) return (blk_queue_secure_erase(q)); #elif defined(HAVE_BLK_QUEUE_SECDISCARD) return (blk_queue_secdiscard(q)); #else return (0); #endif } /* * A common holder for vdev_bdev_open() is used to relax the exclusive open * semantics slightly. Internal vdev disk callers may pass VDEV_HOLDER to * allow them to open the device multiple times. Other kernel callers and * user space processes which don't pass this value will get EBUSY. This is * currently required for the correct operation of hot spares. */ #define VDEV_HOLDER ((void *)0x2401de7) static inline unsigned long blk_generic_start_io_acct(struct request_queue *q __attribute__((unused)), struct gendisk *disk __attribute__((unused)), int rw __attribute__((unused)), struct bio *bio) { #if defined(HAVE_DISK_IO_ACCT) return (disk_start_io_acct(disk, bio_sectors(bio), bio_op(bio))); #elif defined(HAVE_BIO_IO_ACCT) return (bio_start_io_acct(bio)); #elif defined(HAVE_GENERIC_IO_ACCT_3ARG) unsigned long start_time = jiffies; generic_start_io_acct(rw, bio_sectors(bio), &disk->part0); return (start_time); #elif defined(HAVE_GENERIC_IO_ACCT_4ARG) unsigned long start_time = jiffies; generic_start_io_acct(q, rw, bio_sectors(bio), &disk->part0); return (start_time); #else /* Unsupported */ return (0); #endif } static inline void blk_generic_end_io_acct(struct request_queue *q __attribute__((unused)), struct gendisk *disk __attribute__((unused)), int rw __attribute__((unused)), struct bio *bio, unsigned long start_time) { #if defined(HAVE_DISK_IO_ACCT) disk_end_io_acct(disk, bio_op(bio), start_time); #elif defined(HAVE_BIO_IO_ACCT) bio_end_io_acct(bio, start_time); #elif defined(HAVE_GENERIC_IO_ACCT_3ARG) generic_end_io_acct(rw, &disk->part0, start_time); #elif defined(HAVE_GENERIC_IO_ACCT_4ARG) generic_end_io_acct(q, rw, &disk->part0, start_time); #endif } #ifndef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS static inline struct request_queue * blk_generic_alloc_queue(make_request_fn make_request, int node_id) { #if defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN) return (blk_alloc_queue(make_request, node_id)); #elif defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN_RH) return (blk_alloc_queue_rh(make_request, node_id)); #else struct request_queue *q = blk_alloc_queue(GFP_KERNEL); if (q != NULL) blk_queue_make_request(q, make_request); return (q); #endif } #endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ #endif /* _ZFS_BLKDEV_H */ diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 7756d819fdeb..741979f11af8 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1,1144 +1,1146 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include unsigned int zvol_major = ZVOL_MAJOR; unsigned int zvol_request_sync = 0; unsigned int zvol_prefetch_bytes = (128 * 1024); unsigned long zvol_max_discard_blocks = 16384; unsigned int zvol_threads = 32; struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ struct request_queue *zvo_queue; /* request queue */ dev_t zvo_dev; /* device id */ }; taskq_t *zvol_taskq; static struct ida zvol_ida; typedef struct zv_request_stack { zvol_state_t *zv; struct bio *bio; } zv_request_t; typedef struct zv_request_task { zv_request_t zvr; taskq_ent_t ent; } zv_request_task_t; static zv_request_task_t * zv_request_task_create(zv_request_t zvr) { zv_request_task_t *task; task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); taskq_init_ent(&task->ent); task->zvr = zvr; return (task); } static void zv_request_task_free(zv_request_task_t *task) { kmem_free(task, sizeof (*task)); } /* * Given a path, return TRUE if path is a ZVOL. */ static boolean_t zvol_is_zvol_impl(const char *path) { dev_t dev = 0; if (vdev_lookup_bdev(path, &dev) != 0) return (B_FALSE); if (MAJOR(dev) == zvol_major) return (B_TRUE); return (B_FALSE); } static void zvol_write(zv_request_t *zvr) { struct bio *bio = zvr->bio; int error = 0; zfs_uio_t uio; zfs_uio_bvec_init(&uio, bio); zvol_state_t *zv = zvr->zv; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); /* bio marked as FLUSH need to flush before write */ if (bio_is_flush(bio)) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* Some requests are just for flush and nothing else. */ if (uio.uio_resid == 0) { rw_exit(&zv->zv_suspend_lock); BIO_END_IO(bio, 0); return; } struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; unsigned long start_time; boolean_t acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); boolean_t sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_WRITER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio.uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); if (error == 0) { zvol_log_write(zv, tx, off, bytes, sync); } dmu_tx_commit(tx); if (error) break; } zfs_rangelock_exit(lr); int64_t nwritten = start_resid - uio.uio_resid; dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); task_io_account_write(nwritten); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); if (acct) blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); BIO_END_IO(bio, -error); } static void zvol_write_task(void *arg) { zv_request_task_t *task = arg; zvol_write(&task->zvr); zv_request_task_free(task); } static void zvol_discard(zv_request_t *zvr) { struct bio *bio = zvr->bio; zvol_state_t *zv = zvr->zv; uint64_t start = BIO_BI_SECTOR(bio) << 9; uint64_t size = BIO_BI_SIZE(bio); uint64_t end = start + size; boolean_t sync; int error = 0; dmu_tx_t *tx; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; unsigned long start_time; boolean_t acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (end > zv->zv_volsize) { error = SET_ERROR(EIO); goto unlock; } /* * Align the request to volume block boundaries when a secure erase is * not required. This will prevent dnode_free_range() from zeroing out * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. */ if (!bio_is_secure_erase(bio)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN(end, zv->zv_volblocksize); size = end - start; } if (start >= end) goto unlock; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, start, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, start, size, B_TRUE); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); } zfs_rangelock_exit(lr); if (error == 0 && sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); unlock: rw_exit(&zv->zv_suspend_lock); if (acct) blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); BIO_END_IO(bio, -error); } static void zvol_discard_task(void *arg) { zv_request_task_t *task = arg; zvol_discard(&task->zvr); zv_request_task_free(task); } static void zvol_read(zv_request_t *zvr) { struct bio *bio = zvr->bio; int error = 0; zfs_uio_t uio; zfs_uio_bvec_init(&uio, bio); zvol_state_t *zv = zvr->zv; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; unsigned long start_time; boolean_t acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, READ, bio); zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio.uio_loffset) bytes = volsize - uio.uio_loffset; error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_rangelock_exit(lr); int64_t nread = start_resid - uio.uio_resid; dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); task_io_account_read(nread); rw_exit(&zv->zv_suspend_lock); if (acct) blk_generic_end_io_acct(q, disk, READ, bio, start_time); BIO_END_IO(bio, -error); } static void zvol_read_task(void *arg) { zv_request_task_t *task = arg; zvol_read(&task->zvr); zv_request_task_free(task); } #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS static blk_qc_t zvol_submit_bio(struct bio *bio) #else static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q, struct bio *bio) #endif { #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif #endif zvol_state_t *zv = q->queuedata; fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = BIO_BI_SECTOR(bio) << 9; uint64_t size = BIO_BI_SIZE(bio); int rw = bio_data_dir(bio); if (bio_has_data(bio) && offset + size > zv->zv_volsize) { printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_zso->zvo_disk->disk_name, (long long unsigned)offset, (long unsigned)size); BIO_END_IO(bio, -SET_ERROR(EIO)); goto out; } zv_request_t zvr = { .zv = zv, .bio = bio, }; zv_request_task_t *task; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { BIO_END_IO(bio, -SET_ERROR(EROFS)); goto out; } /* * Prevents the zvol from being suspended, or the ZIL being * concurrently opened. Will be released after the i/o * completes. */ rw_enter(&zv->zv_suspend_lock, RW_READER); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data); zv->zv_flags |= ZVOL_WRITTEN_TO; /* replay / destroy done in zvol_create_minor */ VERIFY0((zv->zv_zilog->zl_header->zh_flags & ZIL_REPLAY_NEEDED)); } rw_downgrade(&zv->zv_suspend_lock); } /* * We don't want this thread to be blocked waiting for i/o to * complete, so we instead wait from a taskq callback. The * i/o may be a ZIL write (via zil_commit()), or a read of an * indirect block, or a read of a data block (if this is a * partial-block write). We will indicate that the i/o is * complete by calling BIO_END_IO() from the taskq callback. * * This design allows the calling thread to continue and * initiate more concurrent operations by calling * zvol_request() again. There are typically only a small * number of threads available to call zvol_request() (e.g. * one per iSCSI target), so keeping the latency of * zvol_request() low is important for performance. * * The zvol_request_sync module parameter allows this * behavior to be altered, for performance evaluation * purposes. If the callback blocks, setting * zvol_request_sync=1 will result in much worse performance. * * We can have up to zvol_threads concurrent i/o's being * processed for all zvols on the system. This is typically * a vast improvement over the zvol_request_sync=1 behavior * of one i/o at a time per zvol. However, an even better * design would be for zvol_request() to initiate the zio * directly, and then be notified by the zio_done callback, * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL * interfaces lack this functionality (they block waiting for * the i/o to complete). */ if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { if (zvol_request_sync) { zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_discard_task, task, 0, &task->ent); } } else { if (zvol_request_sync) { zvol_write(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_write_task, task, 0, &task->ent); } } } else { /* * The SCST driver, and possibly others, may issue READ I/Os * with a length of zero bytes. These empty I/Os contain no * data and require no additional handling. */ if (size == 0) { BIO_END_IO(bio, 0); goto out; } rw_enter(&zv->zv_suspend_lock, RW_READER); /* See comment in WRITE case above. */ if (zvol_request_sync) { zvol_read(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_read_task, task, 0, &task->ent); } } out: spl_fstrans_unmark(cookie); #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) return (BLK_QC_T_NONE); #endif } static int zvol_open(struct block_device *bdev, fmode_t flag) { zvol_state_t *zv; int error = 0; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting * bdev->bd_disk->private_data to NULL is observed, or zvol_free() * is not called on this zv because of the positive zv_open_count. */ zv = bdev->bd_disk->private_data; if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); } mutex_enter(&zv->zv_state_lock); /* * make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); if (error) goto out_mutex; } if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { error = -EROFS; goto out_open_count; } zv->zv_open_count++; mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); zfs_check_media_change(bdev); return (0); out_open_count: if (zv->zv_open_count == 0) zvol_last_close(zv); out_mutex: mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); if (error == -EINTR) { error = -ERESTARTSYS; schedule(); } return (SET_ERROR(error)); } static void zvol_release(struct gendisk *disk, fmode_t mode) { zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); zv = disk->private_data; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zv->zv_open_count--; if (zv->zv_open_count == 0) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); } static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { case BLKFLSBUF: fsync_bdev(bdev); invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); if (!(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); rw_exit(&zv->zv_suspend_lock); break; case BLKZNAME: mutex_enter(&zv->zv_state_lock); error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); mutex_exit(&zv->zv_state_lock); break; default: error = -ENOTTY; break; } return (SET_ERROR(error)); } #ifdef CONFIG_COMPAT static int zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { return (zvol_ioctl(bdev, mode, cmd, arg)); } #else #define zvol_compat_ioctl NULL #endif static unsigned int zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; zv->zv_changed = 0; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> SECTOR_BITS); mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (0); } static int zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) { struct gendisk *disk = zv->zv_zso->zvo_disk; #if defined(HAVE_REVALIDATE_DISK_SIZE) revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); #elif defined(HAVE_REVALIDATE_DISK) revalidate_disk(disk); #else zvol_revalidate_disk(disk); #endif return (0); } static void zvol_clear_private(zvol_state_t *zv) { /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ zv->zv_zso->zvo_disk->private_data = NULL; } /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. */ static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); if (sectors > 2048) { geo->heads = 16; geo->sectors = 63; } else { geo->heads = 2; geo->sectors = 4; } geo->start = 0; geo->cylinders = sectors / (geo->heads * geo->sectors); return (0); } static struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, +#endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS .submit_bio = zvol_submit_bio, #endif }; /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ static zvol_state_t * zvol_alloc(dev_t dev, const char *name) { zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) return (NULL); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) return (NULL); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); #endif if (zso->zvo_queue == NULL) goto out_kmem; blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); /* Disable write merging in favor of the ZIO pipeline. */ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) goto out_queue; zso->zvo_queue->queuedata = zv; zso->zvo_dev = dev; zv->zv_open_count = 0; strlcpy(zv->zv_name, name, MAXNAMELEN); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zso->zvo_disk->major = zvol_major; zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; if (volmode == ZFS_VOLMODE_DEV) { /* * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set * gendisk->minors = 1 as noted in include/linux/genhd.h. * Also disable extended partition numbers (GENHD_FL_EXT_DEVT) * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN) * setting gendisk->flags accordingly. */ zso->zvo_disk->minors = 1; #if defined(GENHD_FL_EXT_DEVT) zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; #endif #if defined(GENHD_FL_NO_PART_SCAN) zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN; #endif } zso->zvo_disk->first_minor = (dev & MINORMASK); zso->zvo_disk->fops = &zvol_ops; zso->zvo_disk->private_data = zv; zso->zvo_disk->queue = zso->zvo_queue; snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); return (zv); out_queue: blk_cleanup_queue(zso->zvo_queue); out_kmem: kmem_free(zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); return (NULL); } /* * Cleanup then free a zvol_state_t which was created by zvol_alloc(). * At this time, the structure is not opened by anyone, is taken off * the zvol_state_list, and has its private data set to NULL. * The zvol_state_lock is dropped. * * This function may take many milliseconds to complete (e.g. we've seen * it take over 256ms), due to the calls to "blk_cleanup_queue" and * "del_gendisk". Thus, consumers need to be careful to account for this * latency when calling this function. */ static void zvol_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); del_gendisk(zv->zv_zso->zvo_disk); blk_cleanup_queue(zv->zv_zso->zvo_queue); put_disk(zv->zv_zso->zvo_disk); ida_simple_remove(&zvol_ida, MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } void zvol_wait_close(zvol_state_t *zv) { } /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. */ static int zvol_os_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t len; unsigned minor = 0; int error = 0; int idx; uint64_t hash = zvol_name_hash(name); if (zvol_inhibit_dev) return (0); idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, (DMU_MAX_ACCESS / 4) >> 9); blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); blk_queue_physical_block_size(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); #ifdef QUEUE_FLAG_NONROT blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_ADD_RANDOM blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); #endif /* This flag was introduced in kernel version 4.12. */ #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); #endif ASSERT3P(zv->zv_zilog, ==, NULL); zv->zv_zilog = zil_open(os, zvol_get_data); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) zil_destroy(zv->zv_zilog, B_FALSE); else zil_replay(os, zv, zvol_replay_vector); } zil_close(zv->zv_zilog); zv->zv_zilog = NULL; ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); /* * When udev detects the addition of the device it will immediately * invoke blkid(8) to determine the type of content on the device. * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); /* * Keep in mind that once add_disk() is called, the zvol is * announced to the world, and zvol_open()/zvol_release() can * be called at any time. Incidentally, add_disk() itself calls * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() * directly as well. */ if (error == 0) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); add_disk(zv->zv_zso->zvo_disk); } else { ida_simple_remove(&zvol_ida, idx); } return (error); } static void zvol_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_zso->zvo_disk); ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(zv->zv_name); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing * a KOBJ_CHANGE uevent to be issued. This ensures udev detects * the name change and fixes the symlinks. This does not change * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never * changes. This would normally be done using kobject_uevent() but * that is a GPL-only symbol which is why we need this workaround. */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); } static void zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) { set_disk_ro(zv->zv_zso->zvo_disk, flags); } static void zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) { set_capacity(zv->zv_zso->zvo_disk, capacity); } const static zvol_platform_ops_t zvol_linux_ops = { .zv_free = zvol_free, .zv_rename_minor = zvol_rename_minor, .zv_create_minor = zvol_os_create_minor, .zv_update_volsize = zvol_update_volsize, .zv_clear_private = zvol_clear_private, .zv_is_zvol = zvol_is_zvol_impl, .zv_set_disk_ro = zvol_set_disk_ro_impl, .zv_set_capacity = zvol_set_capacity_impl, }; int zvol_init(void) { int error; int threads = MIN(MAX(zvol_threads, 1), 1024); error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (zvol_taskq == NULL) { unregister_blkdev(zvol_major, ZVOL_DRIVER); return (-ENOMEM); } zvol_init_impl(); ida_init(&zvol_ida); zvol_register_ops(&zvol_linux_ops); return (0); } void zvol_fini(void) { zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); taskq_destroy(zvol_taskq); ida_destroy(&zvol_ida); } /* BEGIN CSTYLED */ module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests"); module_param(zvol_request_sync, uint, 0644); MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); /* END CSTYLED */