diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 8e9e638b125a..c5a353ca9203 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -1,663 +1,687 @@ dnl # dnl # 2.6.38 API change, dnl # Added blkdev_get_by_path() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH], [ ZFS_LINUX_TEST_SRC([blkdev_get_by_path], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)) = NULL; const char *path = "path"; fmode_t mode = 0; void *holder = NULL; bdev = blkdev_get_by_path(path, mode, holder); ]) ]) dnl # dnl # 6.5.x API change, dnl # blkdev_get_by_path() takes 4 args dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG], [ ZFS_LINUX_TEST_SRC([blkdev_get_by_path_4arg], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)) = NULL; const char *path = "path"; fmode_t mode = 0; void *holder = NULL; struct blk_holder_ops h; bdev = blkdev_get_by_path(path, mode, holder, &h); ]) ]) dnl # dnl # 6.8.x API change dnl # bdev_open_by_path() replaces blkdev_get_by_path() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [ ZFS_LINUX_TEST_SRC([bdev_open_by_path], [ #include #include ], [ struct bdev_handle *bdh __attribute__ ((unused)) = NULL; const char *path = "path"; fmode_t mode = 0; void *holder = NULL; struct blk_holder_ops h; bdh = bdev_open_by_path(path, mode, holder, &h); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 4 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path_4arg], [ AC_DEFINE(HAVE_BLKDEV_GET_BY_PATH_4ARG, 1, [blkdev_get_by_path() exists and takes 4 args]) AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether bdev_open_by_path() exists]) ZFS_LINUX_TEST_RESULT([bdev_open_by_path], [ AC_DEFINE(HAVE_BDEV_OPEN_BY_PATH, 1, [bdev_open_by_path() 
exists]) AC_MSG_RESULT(yes) ], [ ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) ]) ]) ]) ]) dnl # dnl # 6.5.x API change dnl # blk_mode_t was added as a type to supercede some places where fmode_t dnl # is used dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T], [ ZFS_LINUX_TEST_SRC([blk_mode_t], [ #include #include ], [ blk_mode_t m __attribute((unused)) = (blk_mode_t)0; ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T], [ AC_MSG_CHECKING([whether blk_mode_t is defined]) ZFS_LINUX_TEST_RESULT([blk_mode_t], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLK_MODE_T, 1, [blk_mode_t is defined]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 2.6.38 API change, dnl # Added blkdev_put() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT], [ ZFS_LINUX_TEST_SRC([blkdev_put], [ #include #include ], [ struct block_device *bdev = NULL; fmode_t mode = 0; blkdev_put(bdev, mode); ]) ]) dnl # dnl # 6.5.x API change. dnl # blkdev_put() takes (void* holder) as arg 2 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER], [ ZFS_LINUX_TEST_SRC([blkdev_put_holder], [ #include #include ], [ struct block_device *bdev = NULL; void *holder = NULL; blkdev_put(bdev, holder); ]) ]) dnl # dnl # 6.8.x API change dnl # bdev_release() replaces blkdev_put() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE], [ ZFS_LINUX_TEST_SRC([bdev_release], [ #include #include ], [ struct bdev_handle *bdh = NULL; bdev_release(bdh); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) ZFS_LINUX_TEST_RESULT([blkdev_put_holder], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_PUT_HOLDER, 1, [blkdev_put() accepts void* as arg 2]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether bdev_release() exists]) ZFS_LINUX_TEST_RESULT([bdev_release], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_RELEASE, 1, [bdev_release() exists]) ], [ ZFS_LINUX_TEST_ERROR([blkdev_put()]) ]) 
]) ]) ]) dnl # dnl # 4.1 API, exported blkdev_reread_part() symbol, back ported to the dnl # 3.10.0 CentOS 7.x enterprise kernels. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART], [ ZFS_LINUX_TEST_SRC([blkdev_reread_part], [ #include #include ], [ struct block_device *bdev = NULL; int error; error = blkdev_reread_part(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [ AC_MSG_CHECKING([whether blkdev_reread_part() exists]) ZFS_LINUX_TEST_RESULT([blkdev_reread_part], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_REREAD_PART, 1, [blkdev_reread_part() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # check_disk_change() was removed in 5.10 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE], [ ZFS_LINUX_TEST_SRC([check_disk_change], [ #include #include ], [ struct block_device *bdev = NULL; bool error; error = check_disk_change(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [ AC_MSG_CHECKING([whether check_disk_change() exists]) ZFS_LINUX_TEST_RESULT([check_disk_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_CHECK_DISK_CHANGE, 1, [check_disk_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 6.5.x API change dnl # disk_check_media_change() was added dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [ ZFS_LINUX_TEST_SRC([disk_check_media_change], [ #include #include ], [ struct block_device *bdev = NULL; bool error; error = disk_check_media_change(bdev->bd_disk); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE], [ AC_MSG_CHECKING([whether disk_check_media_change() exists]) ZFS_LINUX_TEST_RESULT([disk_check_media_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_DISK_CHECK_MEDIA_CHANGE, 1, [disk_check_media_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # bdev_kobj() is introduced from 5.12 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ], [ ZFS_LINUX_TEST_SRC([bdev_kobj], [ #include #include #include ], [ struct block_device *bdev = NULL; struct kobject *disk_kobj; disk_kobj = 
bdev_kobj(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ], [ AC_MSG_CHECKING([whether bdev_kobj() exists]) ZFS_LINUX_TEST_RESULT([bdev_kobj], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_KOBJ, 1, [bdev_kobj() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # part_to_dev() was removed in 5.12 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV], [ ZFS_LINUX_TEST_SRC([part_to_dev], [ #include #include ], [ struct hd_struct *p = NULL; struct device *pdev; pdev = part_to_dev(p); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV], [ AC_MSG_CHECKING([whether part_to_dev() exists]) ZFS_LINUX_TEST_RESULT([part_to_dev], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_PART_TO_DEV, 1, [part_to_dev() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.10 API, check_disk_change() is removed, in favor of dnl # bdev_check_media_change(), which doesn't force revalidation dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ ZFS_LINUX_TEST_SRC([bdev_check_media_change], [ #include #include ], [ struct block_device *bdev = NULL; int error; error = bdev_check_media_change(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ AC_MSG_CHECKING([whether bdev_check_media_change() exists]) ZFS_LINUX_TEST_RESULT([bdev_check_media_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_CHECK_MEDIA_CHANGE, 1, [bdev_check_media_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 2.6.22 API change dnl # Single argument invalidate_bdev() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV], [ ZFS_LINUX_TEST_SRC([invalidate_bdev], [ #include #include ],[ struct block_device *bdev = NULL; invalidate_bdev(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV], [ AC_MSG_CHECKING([whether invalidate_bdev() exists]) ZFS_LINUX_TEST_RESULT([invalidate_bdev], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([invalidate_bdev()]) ]) ]) dnl # dnl # 5.11 API, lookup_bdev() takes dev_t argument. dnl # 2.6.27 API, lookup_bdev() was first exported. 
dnl # 4.4.0-6.21 API, lookup_bdev() on Ubuntu takes mode argument. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV], [ ZFS_LINUX_TEST_SRC([lookup_bdev_devt], [ #include ], [ int error __attribute__ ((unused)); const char path[] = "/example/path"; dev_t dev; error = lookup_bdev(path, &dev); ]) ZFS_LINUX_TEST_SRC([lookup_bdev_1arg], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)); const char path[] = "/example/path"; bdev = lookup_bdev(path); ]) ZFS_LINUX_TEST_SRC([lookup_bdev_mode], [ #include ], [ struct block_device *bdev __attribute__ ((unused)); const char path[] = "/example/path"; bdev = lookup_bdev(path, FMODE_READ); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV], [ AC_MSG_CHECKING([whether lookup_bdev() wants dev_t arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_devt], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_DEVT_LOOKUP_BDEV, 1, [lookup_bdev() wants dev_t arg]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether lookup_bdev() wants 1 arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_1arg], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_1ARG_LOOKUP_BDEV, 1, [lookup_bdev() wants 1 arg]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether lookup_bdev() wants mode arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_mode], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_MODE_LOOKUP_BDEV, 1, [lookup_bdev() wants mode arg]) ], [ ZFS_LINUX_TEST_ERROR([lookup_bdev()]) ]) ]) ]) ]) dnl # dnl # 2.6.30 API change dnl # dnl # The bdev_physical_block_size() interface was added to provide a way dnl # to determine the smallest write which can be performed without a dnl # read-modify-write operation. dnl # dnl # Unfortunately, this interface isn't entirely reliable because dnl # drives are sometimes known to misreport this value. 
dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ ZFS_LINUX_TEST_SRC([bdev_physical_block_size], [ #include ],[ struct block_device *bdev __attribute__ ((unused)) = NULL; bdev_physical_block_size(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ AC_MSG_CHECKING([whether bdev_physical_block_size() is available]) ZFS_LINUX_TEST_RESULT([bdev_physical_block_size], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bdev_physical_block_size()]) ]) ]) dnl # dnl # 2.6.30 API change dnl # Added bdev_logical_block_size(). dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ ZFS_LINUX_TEST_SRC([bdev_logical_block_size], [ #include ],[ struct block_device *bdev __attribute__ ((unused)) = NULL; bdev_logical_block_size(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ AC_MSG_CHECKING([whether bdev_logical_block_size() is available]) ZFS_LINUX_TEST_RESULT([bdev_logical_block_size], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bdev_logical_block_size()]) ]) ]) dnl # dnl # 5.11 API change dnl # Added bdev_whole() helper. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE], [ ZFS_LINUX_TEST_SRC([bdev_whole], [ #include ],[ struct block_device *bdev = NULL; bdev = bdev_whole(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [ AC_MSG_CHECKING([whether bdev_whole() is available]) ZFS_LINUX_TEST_RESULT([bdev_whole], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_WHOLE, 1, [bdev_whole() is available]) ],[ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.20 API change, dnl # Removed bdevname(), snprintf(.., %pg) should be used. 
dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME], [ ZFS_LINUX_TEST_SRC([bdevname], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)) = NULL; char path[BDEVNAME_SIZE]; (void) bdevname(bdev, path); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ AC_MSG_CHECKING([whether bdevname() exists]) ZFS_LINUX_TEST_RESULT([bdevname], [ AC_DEFINE(HAVE_BDEVNAME, 1, [bdevname() is available]) AC_MSG_RESULT(yes) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.19 API: blkdev_issue_secure_erase() +dnl # 4.7 API: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; int error __attribute__ ((unused)); error = blkdev_issue_secure_erase(bdev, sector, nr_sects, GFP_KERNEL); ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [ + #include + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + unsigned long flags = 0; + struct bio *biop = NULL; + int error __attribute__ ((unused)); + + error = __blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, flags, &biop); + ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; unsigned long flags = 0; int error __attribute__ ((unused)); error = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ AC_MSG_CHECKING([whether blkdev_issue_secure_erase() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_secure_erase], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_ISSUE_SECURE_ERASE, 1, [blkdev_issue_secure_erase() is available]) ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) - 
ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, - [blkdev_issue_discard() is available]) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC, 1, + [__blkdev_issue_discard() is available]) ],[ - ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, + [blkdev_issue_discard() is available]) + ],[ + ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) + ]) ]) ]) ]) dnl # dnl # 5.13 API change dnl # blkdev_get_by_path() no longer handles ERESTARTSYS dnl # dnl # Unfortunately we're forced to rely solely on the kernel version dnl # number in order to determine the expected behavior. This was an dnl # internal change to blkdev_get_by_dev(), see commit a8ed1a0607. 
dnl #
dnl # AS_VERSION_COMPARE runs its first action when LINUX_VERSION sorts
dnl # before 5.13.0 and the second/third when it is equal or newer.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [
	AC_MSG_CHECKING([whether blkdev_get_by_path() handles ERESTARTSYS])
	AS_VERSION_COMPARE([$LINUX_VERSION], [5.13.0], [
		AC_MSG_RESULT(yes)
		AC_DEFINE(HAVE_BLKDEV_GET_ERESTARTSYS, 1,
			[blkdev_get_by_path() handles ERESTARTSYS])
	],[
		AC_MSG_RESULT(no)
	],[
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # 6.5.x API change
dnl # BLK_STS_NEXUS replaced with BLK_STS_RESV_CONFLICT
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT], [
	ZFS_LINUX_TEST_SRC([blk_sts_resv_conflict], [
		#include <linux/blkdev.h>
	],[
		blk_status_t s __attribute__ ((unused)) = BLK_STS_RESV_CONFLICT;
	])
])

dnl #
dnl # Defines HAVE_BLK_STS_RESV_CONFLICT when the blk_sts_resv_conflict
dnl # test program compiled.  The macro body is closed exactly once; a
dnl # previously duplicated closing bracket pair has been dropped.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [
	AC_MSG_CHECKING([whether BLK_STS_RESV_CONFLICT is defined])
	ZFS_LINUX_TEST_RESULT([blk_sts_resv_conflict], [
		AC_DEFINE(HAVE_BLK_STS_RESV_CONFLICT, 1,
			[BLK_STS_RESV_CONFLICT is defined])
		AC_MSG_RESULT(yes)
	], [
		AC_MSG_RESULT(no)
	])
])

dnl #
dnl # Emit every block device test program so they can be built in one pass.
dnl #
AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [
	ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH
	ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG
	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH
	ZFS_AC_KERNEL_SRC_BLKDEV_PUT
	ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER
	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE
	ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART
	ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV
	ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV
	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE
	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE
	ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE
	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE
	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE
	ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME
	ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE
	ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_KOBJ
	ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV
	ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE
	ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT
	ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T
])

AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [
	ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH
	ZFS_AC_KERNEL_BLKDEV_PUT
	ZFS_AC_KERNEL_BLKDEV_REREAD_PART
	ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV
	ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV
ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE ZFS_AC_KERNEL_BLKDEV_BDEVNAME ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE ZFS_AC_KERNEL_BLKDEV_BDEV_KOBJ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T ]) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index e7f0aa573848..b0bda5fa2012 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1,1132 +1,1176 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 
*/ #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LINUX_BLK_CGROUP_HEADER #include #endif /* * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying * block_device. Since it carries the block_device inside, its convenient to * just use the handle as a proxy. For pre-6.8, we just emulate this with * a cast, since we don't need any of the other fields inside the handle. */ #ifdef HAVE_BDEV_OPEN_BY_PATH typedef struct bdev_handle zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((bdh)->bdev) #define BDH_IS_ERR(bdh) (IS_ERR(bdh)) #define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) #define BDH_ERR_PTR(err) (ERR_PTR(err)) #else typedef void zfs_bdev_handle_t; #define BDH_BDEV(bdh) ((struct block_device *)bdh) #define BDH_IS_ERR(bdh) (IS_ERR(BDH_BDEV(bdh))) #define BDH_PTR_ERR(bdh) (PTR_ERR(BDH_BDEV(bdh))) #define BDH_ERR_PTR(err) (ERR_PTR(err)) #endif typedef struct vdev_disk { zfs_bdev_handle_t *vd_bdh; krwlock_t vd_lock; } vdev_disk_t; /* * Unique identifier for the exclusive vdev holder. */ static void *zfs_vdev_holder = VDEV_HOLDER; /* * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the * device is missing. The missing path may be transient since the links * can be briefly removed and recreated in response to udev events. */ static uint_t zfs_vdev_open_timeout_ms = 1000; /* * Size of the "reserved" partition, in blocks. */ #define EFI_MIN_RESV_SIZE (16 * 1024) /* * Virtual device vector for disks. */ typedef struct dio_request { zio_t *dr_zio; /* Parent ZIO */ atomic_t dr_ref; /* References */ int dr_error; /* Bio error */ int dr_bio_count; /* Count of bio's */ struct bio *dr_bio[]; /* Attached bio's */ } dio_request_t; /* * BIO request failfast mask. 
*/
static unsigned int zfs_vdev_failfast_mask = 1;

/*
 * Translate an SPA open mode into the kernel's block device open flags.
 *
 * The result is a blk_mode_t built from BLK_OPEN_* when HAVE_BLK_MODE_T is
 * defined, otherwise an fmode_t built from FMODE_*.  The read and write bits
 * follow SPA_MODE_READ / SPA_MODE_WRITE in 'spa_mode'; the exclusive bit is
 * added only when 'exclusive' is set.
 */
#ifdef HAVE_BLK_MODE_T
static blk_mode_t
#else
static fmode_t
#endif
vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive)
{
#ifdef HAVE_BLK_MODE_T
	blk_mode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= BLK_OPEN_READ;
	if (spa_mode & SPA_MODE_WRITE)
		mode |= BLK_OPEN_WRITE;
	if (exclusive)
		mode |= BLK_OPEN_EXCL;
#else
	fmode_t mode = 0;

	if (spa_mode & SPA_MODE_READ)
		mode |= FMODE_READ;
	if (spa_mode & SPA_MODE_WRITE)
		mode |= FMODE_WRITE;
	if (exclusive)
		mode |= FMODE_EXCL;
#endif

	return (mode);
}

/*
 * Returns the usable capacity (in bytes) for the partition or disk.
 */
static uint64_t
bdev_capacity(struct block_device *bdev)
{
	return (i_size_read(bdev->bd_inode));
}

/*
 * Fallback for kernels without bdev_whole(): return the block device
 * recorded in bd_contains.
 */
#if !defined(HAVE_BDEV_WHOLE)
static inline struct block_device *
bdev_whole(struct block_device *bdev)
{
	return (bdev->bd_contains);
}
#endif

/*
 * Write the device name into 'name'.  Uses bdevname() when the kernel
 * provides it, otherwise formats the device with "%pg" into a buffer of
 * BDEVNAME_SIZE bytes.
 */
#if defined(HAVE_BDEVNAME)
#define	vdev_bdevname(bdev, name)	bdevname(bdev, name)
#else
static inline void
vdev_bdevname(struct block_device *bdev, char *name)
{
	snprintf(name, BDEVNAME_SIZE, "%pg", bdev);
}
#endif

/*
 * Returns the maximum expansion capacity of the block device (in bytes).
 *
 * It is possible to expand a vdev when it has been created as a wholedisk
 * and the containing block device has increased in capacity.  Or when the
 * partition containing the pool has been manually increased in size.
 *
 * This function is only responsible for calculating the potential expansion
 * size so it can be reported by 'zpool list'.  The efi_use_whole_disk() is
 * responsible for verifying the expected partition layout in the wholedisk
 * case, and updating the partition table if appropriate.  Once the partition
 * size has been increased the additional capacity will be visible using
 * bdev_capacity().
 *
 * The returned maximum expansion capacity is always expected to be larger, or
 * at the very least equal, to its usable capacity to prevent overestimating
 * the pool expandsize.
*/
static uint64_t
bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk)
{
	/* The usable capacity is the floor for the value reported. */
	uint64_t psize = bdev_capacity(bdev);

	if (wholedisk && bdev != bdev_whole(bdev)) {
		/*
		 * When reporting maximum expansion capacity for a wholedisk
		 * deduct any capacity which is expected to be lost due to
		 * alignment restrictions.  Over reporting this value isn't
		 * harmful and would only result in slightly less capacity
		 * than expected post expansion.
		 * The estimated available space may be slightly smaller than
		 * bdev_capacity() for devices where the number of sectors is
		 * not a multiple of the alignment size and the partition layout
		 * is keeping less than PARTITION_END_ALIGNMENT bytes after the
		 * "reserved" EFI partition: in such cases return the device
		 * usable capacity.
		 */
		int64_t available = i_size_read(bdev_whole(bdev)->bd_inode) -
		    ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK +
		    PARTITION_END_ALIGNMENT) << SECTOR_BITS);

		/*
		 * Compare explicitly instead of MAX(available, capacity):
		 * mixing int64_t with uint64_t would convert a negative
		 * estimate into a huge unsigned value and report it as the
		 * expansion size.
		 */
		if (available > 0 && (uint64_t)available > psize)
			psize = (uint64_t)available;
	}

	return (psize);
}

static void
vdev_disk_error(zio_t *zio)
{
	/*
	 * This function can be called in interrupt context, for instance while
	 * handling IRQs coming from a misbehaving disk device; use printk()
	 * which is safe from any context.
*/ printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " "offset=%llu size=%llu flags=%llu\n", spa_name(zio->io_spa), zio->io_vd->vdev_path, zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, zio->io_flags); } static void vdev_disk_kobj_evt_post(vdev_t *v) { vdev_disk_t *vd = v->vdev_tsd; if (vd && vd->vd_bdh) { spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh)); } else { vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", v->vdev_path); } } static zfs_bdev_handle_t * vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder) { #if defined(HAVE_BDEV_OPEN_BY_PATH) return (bdev_open_by_path(path, vdev_bdev_mode(mode, B_TRUE), holder, NULL)); #elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) return (blkdev_get_by_path(path, vdev_bdev_mode(mode, B_TRUE), holder, NULL)); #else return (blkdev_get_by_path(path, vdev_bdev_mode(mode, B_TRUE), holder)); #endif } static void vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder) { #if defined(HAVE_BDEV_RELEASE) return (bdev_release(bdh)); #elif defined(HAVE_BLKDEV_PUT_HOLDER) return (blkdev_put(BDH_BDEV(bdh), holder)); #else return (blkdev_put(BDH_BDEV(bdh), vdev_bdev_mode(mode, B_TRUE))); #endif } static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { zfs_bdev_handle_t *bdh; #ifdef HAVE_BLK_MODE_T blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); #else fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); #endif hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; /* Must have a pathname and it must be absolute. */ if (v->vdev_path == NULL || v->vdev_path[0] != '/') { v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; vdev_dbgmsg(v, "invalid vdev_path"); return (SET_ERROR(EINVAL)); } /* * Reopen the device if it is currently open. When expanding a * partition force re-scanning the partition table if userland * did not take care of this already. 
We need to do this while closed * in order to get an accurate updated block device size. Then * since udev may need to recreate the device links increase the * open retry timeout before reporting the device as unavailable. */ vd = v->vdev_tsd; if (vd) { char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; boolean_t reread_part = B_FALSE; rw_enter(&vd->vd_lock, RW_WRITER); bdh = vd->vd_bdh; vd->vd_bdh = NULL; if (bdh) { struct block_device *bdev = BDH_BDEV(bdh); if (v->vdev_expanding && bdev != bdev_whole(bdev)) { vdev_bdevname(bdev_whole(bdev), disk_name + 5); /* * If userland has BLKPG_RESIZE_PARTITION, * then it should have updated the partition * table already. We can detect this by * comparing our current physical size * with that of the device. If they are * the same, then we must not have * BLKPG_RESIZE_PARTITION or it failed to * update the partition table online. We * fallback to rescanning the partition * table from the kernel below. However, * if the capacity already reflects the * updated partition, then we skip * rescanning the partition table here. */ if (v->vdev_psize == bdev_capacity(bdev)) reread_part = B_TRUE; } vdev_blkdev_put(bdh, mode, zfs_vdev_holder); } if (reread_part) { bdh = vdev_blkdev_get_by_path(disk_name, mode, zfs_vdev_holder); if (!BDH_IS_ERR(bdh)) { int error = vdev_bdev_reread_part(BDH_BDEV(bdh)); vdev_blkdev_put(bdh, mode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); } } } } else { vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); rw_enter(&vd->vd_lock, RW_WRITER); } /* * Devices are always opened by the path provided at configuration * time. This means that if the provided path is a udev by-id path * then drives may be re-cabled without an issue. If the provided * path is a udev by-path path, then the physical location information * will be preserved. 
This can be critical for more complicated * configurations where drives are located in specific physical * locations to maximize the systems tolerance to component failure. * * Alternatively, you can provide your own udev rule to flexibly map * the drives as you see fit. It is not advised that you use the * /dev/[hd]d devices which may be reordered due to probing order. * Devices in the wrong locations will be detected by the higher * level vdev validation. * * The specified paths may be briefly removed and recreated in * response to udev events. This should be exceptionally unlikely * because the zpool command makes every effort to verify these paths * have already settled prior to reaching this point. Therefore, * a ENOENT failure at this point is highly likely to be transient * and it is reasonable to sleep and retry before giving up. In * practice delays have been observed to be on the order of 100ms. * * When ERESTARTSYS is returned it indicates the block device is * a zvol which could not be opened due to the deadlock detection * logic in zvol_open(). Extend the timeout and retry the open * subsequent attempts are expected to eventually succeed. 
*/ hrtime_t start = gethrtime(); bdh = BDH_ERR_PTR(-ENXIO); while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { bdh = vdev_blkdev_get_by_path(v->vdev_path, mode, zfs_vdev_holder); if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { /* * There is no point of waiting since device is removed * explicitly */ if (v->vdev_removed) break; schedule_timeout(MSEC_TO_TICK(10)); } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); continue; } else if (BDH_IS_ERR(bdh)) { break; } } if (BDH_IS_ERR(bdh)) { int error = -BDH_PTR_ERR(bdh); vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, (u_longlong_t)(gethrtime() - start), (u_longlong_t)timeout); vd->vd_bdh = NULL; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); return (SET_ERROR(error)); } else { vd->vd_bdh = bdh; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); } struct block_device *bdev = BDH_BDEV(vd->vd_bdh); /* Determine the physical block size */ int physical_block_size = bdev_physical_block_size(bdev); /* Determine the logical block size */ int logical_block_size = bdev_logical_block_size(bdev); /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ v->vdev_nowritecache = B_FALSE; /* Set when device reports it supports TRIM. */ v->vdev_has_trim = bdev_discard_supported(bdev); /* Set when device reports it supports secure TRIM. 
*/ v->vdev_has_securetrim = bdev_secure_discard_supported(bdev); /* Inform the ZIO pipeline that we are non-rotational */ v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); /* Physical volume size in bytes for the partition */ *psize = bdev_capacity(bdev); /* Physical volume size in bytes including possible expansion space */ *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk); /* Based on the minimum sector size set the block size */ *physical_ashift = highbit64(MAX(physical_block_size, SPA_MINBLOCKSIZE)) - 1; *logical_ashift = highbit64(MAX(logical_block_size, SPA_MINBLOCKSIZE)) - 1; return (0); } static void vdev_disk_close(vdev_t *v) { vdev_disk_t *vd = v->vdev_tsd; if (v->vdev_reopening || vd == NULL) return; if (vd->vd_bdh != NULL) { vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); v->vdev_tsd = NULL; } static dio_request_t * vdev_disk_dio_alloc(int bio_count) { dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + sizeof (struct bio *) * bio_count, KM_SLEEP); atomic_set(&dr->dr_ref, 0); dr->dr_bio_count = bio_count; dr->dr_error = 0; for (int i = 0; i < dr->dr_bio_count; i++) dr->dr_bio[i] = NULL; return (dr); } static void vdev_disk_dio_free(dio_request_t *dr) { int i; for (i = 0; i < dr->dr_bio_count; i++) if (dr->dr_bio[i]) bio_put(dr->dr_bio[i]); kmem_free(dr, sizeof (dio_request_t) + sizeof (struct bio *) * dr->dr_bio_count); } static void vdev_disk_dio_get(dio_request_t *dr) { atomic_inc(&dr->dr_ref); } static void vdev_disk_dio_put(dio_request_t *dr) { int rc = atomic_dec_return(&dr->dr_ref); /* * Free the dio_request when the last reference is dropped and * ensure zio_interpret is called only once with the correct zio */ if (rc == 0) { zio_t *zio = dr->dr_zio; int error = dr->dr_error; vdev_disk_dio_free(dr); if (zio) { zio->io_error = error; ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); zio_delay_interrupt(zio); } } } 
/*
 * Bio completion callback.  Records the first error seen for the request
 * (later errors do not overwrite it) and drops the reference held for this
 * bio.
 */
BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error)
{
	dio_request_t *dr = bio->bi_private;

	if (dr->dr_error == 0) {
#ifdef HAVE_1ARG_BIO_END_IO_T
		dr->dr_error = BIO_END_IO_ERROR(bio);
#else
		/* 'error' is negated before being stored in dr_error. */
		if (error)
			dr->dr_error = -(error);
		else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
			dr->dr_error = EIO;
#endif
	}

	/* Drop reference acquired by __vdev_disk_physio */
	vdev_disk_dio_put(dr);
}

/*
 * Submit a bio using whichever submit_bio() signature the kernel provides;
 * the return value, if any, is ignored.
 */
static inline void
vdev_submit_bio_impl(struct bio *bio)
{
#ifdef HAVE_1ARG_SUBMIT_BIO
	(void) submit_bio(bio);
#else
	(void) submit_bio(bio_data_dir(bio), bio);
#endif
}

/*
 * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so
 * replace it with preempt_schedule under the following condition:
 */
#if defined(CONFIG_ARM64) && \
    defined(CONFIG_PREEMPTION) && \
    defined(CONFIG_BLK_CGROUP)
#define	preempt_schedule_notrace(x) preempt_schedule(x)
#endif

/*
 * As for the Linux 5.18 kernel bio_alloc() expects a block_device struct
 * as an argument removing the need to set it with bio_set_dev().  This
 * removes the need for all of the following compatibility code.
 */
#if !defined(HAVE_BIO_ALLOC_4ARG)

#ifdef HAVE_BIO_SET_DEV
#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY)
/*
 * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by
 * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched().
 * As a side effect the function was converted to GPL-only.  Define our
 * own version when needed which uses rcu_read_lock_sched().
 *
 * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public
 * part, moving blkg_tryget into the private one. Define our own version.
*/
#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET)
/*
 * Try to take a reference on blkg->refcnt.  Returns true when a reference
 * was taken.  The whole attempt runs between rcu_read_lock_sched() and
 * rcu_read_unlock_sched().
 */
static inline bool
vdev_blkg_tryget(struct blkcg_gq *blkg)
{
	struct percpu_ref *ref = &blkg->refcnt;
	unsigned long __percpu *count;
	bool rc;

	rcu_read_lock_sched();

	if (__ref_is_percpu(ref, &count)) {
		/* Per-cpu mode: bump this CPU's counter, always succeeds. */
		this_cpu_inc(*count);
		rc = true;
	} else {
		/* Otherwise only succeed if the shared count is non-zero. */
#ifdef ZFS_PERCPU_REF_COUNT_IN_DATA
		rc = atomic_long_inc_not_zero(&ref->data->count);
#else
		rc = atomic_long_inc_not_zero(&ref->count);
#endif
	}

	rcu_read_unlock_sched();

	return (rc);
}
#else
/* The kernel's own blkg_tryget() is usable; call it directly. */
#define	vdev_blkg_tryget(bg)	blkg_tryget(bg)
#endif
#ifdef HAVE_BIO_SET_DEV_MACRO
/*
 * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the
 * GPL-only bio_associate_blkg() symbol thus inadvertently converting
 * the entire macro.  Provide a minimal version which always assigns the
 * request queue's root_blkg to the bio.
 */
static inline void
vdev_bio_associate_blkg(struct bio *bio)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bio->bi_bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	/* bi_blkg is left NULL if no reference could be taken. */
	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}

/* Redirect the kernel macro's call to the local replacement above. */
#define	bio_associate_blkg vdev_bio_associate_blkg
#else
/*
 * Replacement for bio_set_dev() used when it is not provided as a macro.
 * Clears BIO_REMAPPED, clears BIO_THROTTLED when the device changes, points
 * the bio at bdev, and attaches the request queue's root_blkg when a
 * reference on it can be taken.
 */
static inline void
vdev_bio_set_dev(struct bio *bio, struct block_device *bdev)
{
#if defined(HAVE_BIO_BDEV_DISK)
	struct request_queue *q = bdev->bd_disk->queue;
#else
	struct request_queue *q = bio->bi_disk->queue;
#endif
	bio_clear_flag(bio, BIO_REMAPPED);
	if (bio->bi_bdev != bdev)
		bio_clear_flag(bio, BIO_THROTTLED);
	bio->bi_bdev = bdev;

	ASSERT3P(q, !=, NULL);
	ASSERT3P(bio->bi_blkg, ==, NULL);

	if (q->root_blkg && vdev_blkg_tryget(q->root_blkg))
		bio->bi_blkg = q->root_blkg;
}
#define	bio_set_dev		vdev_bio_set_dev
#endif
#endif
#else
/*
 * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels.
*/ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) { bio->bi_bdev = bdev; } #endif /* HAVE_BIO_SET_DEV */ #endif /* !HAVE_BIO_ALLOC_4ARG */ static inline void vdev_submit_bio(struct bio *bio) { struct bio_list *bio_list = current->bio_list; current->bio_list = NULL; vdev_submit_bio_impl(bio); current->bio_list = bio_list; } static inline struct bio * vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, unsigned short nr_vecs) { struct bio *bio; #ifdef HAVE_BIO_ALLOC_4ARG bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask); #else bio = bio_alloc(gfp_mask, nr_vecs); if (likely(bio != NULL)) bio_set_dev(bio, bdev); #endif return (bio); } static inline unsigned int vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) { unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset); #ifdef HAVE_BIO_MAX_SEGS return (bio_max_segs(nr_segs)); #else return (MIN(nr_segs, BIO_MAX_PAGES)); #endif } static int __vdev_disk_physio(struct block_device *bdev, zio_t *zio, size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; int bio_size; int bio_count = 16; int error = 0; struct blk_plug plug; unsigned short nr_vecs; /* * Accessing outside the block device is never allowed. 
*/ if (io_offset + io_size > bdev->bd_inode->i_size) { vdev_dbgmsg(zio->io_vd, "Illegal access %llu size %llu, device size %llu", (u_longlong_t)io_offset, (u_longlong_t)io_size, (u_longlong_t)i_size_read(bdev->bd_inode)); return (SET_ERROR(EIO)); } retry: dr = vdev_disk_dio_alloc(bio_count); if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)) && zio->io_vd->vdev_failfast == B_TRUE) { bio_set_flags_failfast(bdev, &flags, zfs_vdev_failfast_mask & 1, zfs_vdev_failfast_mask & 2, zfs_vdev_failfast_mask & 4); } dr->dr_zio = zio; /* * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio * can cover at least 128KB and at most 1MB. When the required number * of iovec's exceeds this, we are forced to break the IO in multiple * bio's and wait for them all to complete. This is likely if the * recordsize property is increased beyond 1MB. The default * bio_count=16 should typically accommodate the maximum-size zio of * 16MB. */ abd_offset = 0; bio_offset = io_offset; bio_size = io_size; for (int i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ if (bio_size <= 0) break; /* * If additional bio's are required, we have to retry, but * this should be rare - see the comment above. 
*/ if (dr->dr_bio_count == i) { vdev_disk_dio_free(dr); bio_count *= 2; goto retry; } nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { vdev_disk_dio_free(dr); return (SET_ERROR(ENOMEM)); } /* Matching put called by vdev_disk_physio_completion */ vdev_disk_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } /* Extra reference to protect dio_request during vdev_submit_bio */ vdev_disk_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); /* Submit all bio's associated with this dio */ for (int i = 0; i < dr->dr_bio_count; i++) { if (dr->dr_bio[i]) vdev_submit_bio(dr->dr_bio[i]); } if (dr->dr_bio_count > 1) blk_finish_plug(&plug); vdev_disk_dio_put(dr); return (error); } BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; #ifdef HAVE_1ARG_BIO_END_IO_T zio->io_error = BIO_END_IO_ERROR(bio); #else zio->io_error = -error; #endif if (zio->io_error && (zio->io_error == EOPNOTSUPP)) zio->io_vd->vdev_nowritecache = B_TRUE; bio_put(bio); ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); zio_interrupt(zio); } static int vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) { struct request_queue *q; struct bio *bio; q = bdev_get_queue(bdev); if (!q) return (SET_ERROR(ENXIO)); bio = vdev_bio_alloc(bdev, GFP_NOIO, 0); if (unlikely(bio == NULL)) return (SET_ERROR(ENOMEM)); bio->bi_end_io = vdev_disk_io_flush_completion; bio->bi_private = zio; bio_set_flush(bio); vdev_submit_bio(bio); 
invalidate_bdev(bdev); return (0); } +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ + defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) +BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) +{ + zio_t *zio = bio->bi_private; +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = -error; +#endif + bio_put(bio); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} + static int -vdev_disk_io_trim(zio_t *zio) +vdev_issue_discard_trim(zio_t *zio, unsigned long flags) { - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; + int ret; + struct bio *bio = NULL; -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) { - return (-blkdev_issue_secure_erase(BDH_BDEV(vd->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); - } else { - return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); +#if defined(BLKDEV_DISCARD_SECURE) + ret = - __blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio); +#else + (void) flags; + ret = - __blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio); +#endif + if (!ret && bio) { + bio->bi_private = zio; + bio->bi_end_io = vdev_disk_discard_end_io; + vdev_submit_bio(bio); } -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) + return (ret); +} +#endif + +static int +vdev_disk_io_trim(zio_t *zio) +{ unsigned long trim_flags = 0; -#if defined(BLKDEV_DISCARD_SECURE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) { +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + return (-blkdev_issue_secure_erase( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); +#elif defined(BLKDEV_DISCARD_SECURE) trim_flags |= BLKDEV_DISCARD_SECURE; #endif - return 
(-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), + } +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ + defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) + return (vdev_issue_discard_trim(zio, trim_flags)); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) + return (-blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); #else #error "Unsupported kernel" #endif } static void vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; int rw, error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. * Nothing to be done here but return failure. */ if (vd == NULL) { zio->io_error = ENXIO; zio_interrupt(zio); return; } rw_enter(&vd->vd_lock, RW_READER); /* * If the vdev is closed, it's likely due to a failed reopen and is * in the UNAVAIL state. Nothing to be done here but return failure. */ if (vd->vd_bdh == NULL) { rw_exit(&vd->vd_lock); zio->io_error = ENXIO; zio_interrupt(zio); return; } switch (zio->io_type) { case ZIO_TYPE_IOCTL: if (!vdev_readable(v)) { rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush) break; if (v->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); if (error == 0) { rw_exit(&vd->vd_lock); return; } zio->io_error = error; break; default: zio->io_error = SET_ERROR(ENOTSUP); } rw_exit(&vd->vd_lock); zio_execute(zio); return; case ZIO_TYPE_WRITE: rw = WRITE; break; case ZIO_TYPE_READ: rw = READ; break; case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + zio_interrupt(zio); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) zio_interrupt(zio); +#endif return; default: rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENOTSUP); 
zio_interrupt(zio); return; } zio->io_target_timestamp = zio_handle_io_delay(zio); error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio, zio->io_size, zio->io_offset, rw, 0); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; zio_interrupt(zio); return; } } static void vdev_disk_io_done(zio_t *zio) { /* * If the device returned EIO, we revalidate the media. If it is * determined the media has changed this triggers the asynchronous * removal of the device from the configuration. */ if (zio->io_error == EIO) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { invalidate_bdev(BDH_BDEV(vd->vd_bdh)); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } } } static void vdev_disk_hold(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') return; /* * Only prefetch path and devid info if the device has * never been opened. 
*/
	if (vd->vdev_tsd != NULL)
		return;

}

/*
 * Counterpart of vdev_disk_hold().  Currently only asserts that the caller
 * holds SCL_STATE as writer; no release is performed.
 */
static void
vdev_disk_rele(vdev_t *vd)
{
	ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER));

	/* XXX: Implement me as a vnode rele for the device */
}

/*
 * Operations vector for disk vdevs.  Entries left NULL are operations this
 * vdev type does not supply; the asize, min_asize and xlate entries use the
 * vdev_default_* implementations.
 */
vdev_ops_t vdev_disk_ops = {
	.vdev_op_init = NULL,
	.vdev_op_fini = NULL,
	.vdev_op_open = vdev_disk_open,
	.vdev_op_close = vdev_disk_close,
	.vdev_op_asize = vdev_default_asize,
	.vdev_op_min_asize = vdev_default_min_asize,
	.vdev_op_min_alloc = NULL,
	.vdev_op_io_start = vdev_disk_io_start,
	.vdev_op_io_done = vdev_disk_io_done,
	.vdev_op_state_change = NULL,
	.vdev_op_need_resilver = NULL,
	.vdev_op_hold = vdev_disk_hold,
	.vdev_op_rele = vdev_disk_rele,
	.vdev_op_remap = NULL,
	.vdev_op_xlate = vdev_default_xlate,
	.vdev_op_rebuild_asize = NULL,
	.vdev_op_metaslab_init = NULL,
	.vdev_op_config_generate = NULL,
	.vdev_op_nparity = NULL,
	.vdev_op_ndisks = NULL,
	.vdev_op_type = VDEV_TYPE_DISK,		/* name of this vdev type */
	.vdev_op_leaf = B_TRUE,			/* leaf vdev */
	.vdev_op_kobj_evt_post = vdev_disk_kobj_evt_post
};

/*
 * The zfs_vdev_scheduler module option has been deprecated. Setting this
 * value no longer has any effect.  It has not yet been entirely removed
 * to allow the module to be loaded if this option is specified in the
 * /etc/modprobe.d/zfs.conf file.  The following warning will be logged.
*/
static int
param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp)
{
	/* The value is still stored so that setting the option succeeds. */
	int error = param_set_charp(val, kp);
	if (error == 0) {
		printk(KERN_INFO "The 'zfs_vdev_scheduler' module option "
		    "is not supported.\n");
	}

	return (error);
}

/* Backing storage for the deprecated option; registered below. */
static const char *zfs_vdev_scheduler = "unused";
module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler,
    param_get_charp, &zfs_vdev_scheduler, 0644);
MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler");

/*
 * Setter for the minimum auto-ashift parameter.  The string is parsed as an
 * unsigned integer and rejected with -EINVAL unless it lies within
 * [ASHIFT_MIN, zfs_vdev_max_auto_ashift]; only then is it stored with
 * param_set_uint().  Returns 0 on success or the negative error from
 * parsing, validation or storing.
 */
int
param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

/*
 * Setter for the maximum auto-ashift parameter.  Mirrors
 * param_set_min_auto_ashift(): the value must lie within
 * [zfs_vdev_min_auto_ashift, ASHIFT_MAX] or -EINVAL is returned.
 */
int
param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp)
{
	uint_t val;
	int error;

	error = kstrtouint(buf, 0, &val);
	if (error < 0)
		return (SET_ERROR(error));

	if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift)
		return (SET_ERROR(-EINVAL));

	error = param_set_uint(buf, kp);
	if (error < 0)
		return (SET_ERROR(error));

	return (0);
}

/* Tunables exported as read-write module parameters. */
ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, open_timeout_ms, UINT, ZMOD_RW,
	"Timeout before determining that a device is missing");

ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, failfast_mask, UINT, ZMOD_RW,
	"Defines failfast mask: 1 - device, 2 - transport, 4 - driver");