diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index fb7b1a458638..462d6c6efa8e 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -1,396 +1,424 @@ dnl # dnl # 2.6.38 API change, dnl # Added blkdev_get_by_path() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH], [ ZFS_LINUX_TEST_SRC([blkdev_get_by_path], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)) = NULL; const char *path = "path"; fmode_t mode = 0; void *holder = NULL; bdev = blkdev_get_by_path(path, mode, holder); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ AC_MSG_CHECKING([whether blkdev_get_by_path() exists]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ AC_MSG_RESULT(yes) ], [ ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) ]) ]) dnl # dnl # 2.6.38 API change, dnl # Added blkdev_put() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT], [ ZFS_LINUX_TEST_SRC([blkdev_put], [ #include #include ], [ struct block_device *bdev = NULL; fmode_t mode = 0; blkdev_put(bdev, mode); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) ], [ ZFS_LINUX_TEST_ERROR([blkdev_put()]) ]) ]) dnl # dnl # 4.1 API, exported blkdev_reread_part() symbol, back ported to the dnl # 3.10.0 CentOS 7.x enterprise kernels. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART], [ ZFS_LINUX_TEST_SRC([blkdev_reread_part], [ #include #include ], [ struct block_device *bdev = NULL; int error; error = blkdev_reread_part(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [ AC_MSG_CHECKING([whether blkdev_reread_part() exists]) ZFS_LINUX_TEST_RESULT([blkdev_reread_part], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_REREAD_PART, 1, [blkdev_reread_part() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # check_disk_change() was removed in 5.10 dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE], [ ZFS_LINUX_TEST_SRC([check_disk_change], [ #include #include ], [ struct block_device *bdev = NULL; bool error; error = check_disk_change(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE], [ AC_MSG_CHECKING([whether check_disk_change() exists]) ZFS_LINUX_TEST_RESULT([check_disk_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_CHECK_DISK_CHANGE, 1, [check_disk_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 5.10 API, check_disk_change() is removed, in favor of dnl # bdev_check_media_change(), which doesn't force revalidation dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ ZFS_LINUX_TEST_SRC([bdev_check_media_change], [ #include #include ], [ struct block_device *bdev = NULL; int error; error = bdev_check_media_change(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE], [ AC_MSG_CHECKING([whether bdev_check_media_change() exists]) ZFS_LINUX_TEST_RESULT([bdev_check_media_change], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_CHECK_MEDIA_CHANGE, 1, [bdev_check_media_change() exists]) ], [ AC_MSG_RESULT(no) ]) ]) dnl # dnl # 2.6.22 API change dnl # Single argument invalidate_bdev() dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV], [ ZFS_LINUX_TEST_SRC([invalidate_bdev], [ #include #include ],[ struct block_device *bdev = NULL; invalidate_bdev(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV], [ AC_MSG_CHECKING([whether invalidate_bdev() exists]) ZFS_LINUX_TEST_RESULT([invalidate_bdev], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([invalidate_bdev()]) ]) ]) dnl # dnl # 5.11 API, lookup_bdev() takes dev_t argument. dnl # 2.6.27 API, lookup_bdev() was first exported. dnl # 4.4.0-6.21 API, lookup_bdev() on Ubuntu takes mode argument. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV], [ ZFS_LINUX_TEST_SRC([lookup_bdev_devt], [ #include ], [ int error __attribute__ ((unused)); const char path[] = "/example/path"; dev_t dev; error = lookup_bdev(path, &dev); ]) ZFS_LINUX_TEST_SRC([lookup_bdev_1arg], [ #include #include ], [ struct block_device *bdev __attribute__ ((unused)); const char path[] = "/example/path"; bdev = lookup_bdev(path); ]) ZFS_LINUX_TEST_SRC([lookup_bdev_mode], [ #include ], [ struct block_device *bdev __attribute__ ((unused)); const char path[] = "/example/path"; bdev = lookup_bdev(path, FMODE_READ); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV], [ AC_MSG_CHECKING([whether lookup_bdev() wants dev_t arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_devt], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_DEVT_LOOKUP_BDEV, 1, [lookup_bdev() wants dev_t arg]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether lookup_bdev() wants 1 arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_1arg], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_1ARG_LOOKUP_BDEV, 1, [lookup_bdev() wants 1 arg]) ], [ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether lookup_bdev() wants mode arg]) ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_mode], [lookup_bdev], [fs/block_dev.c], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_MODE_LOOKUP_BDEV, 1, [lookup_bdev() wants mode arg]) ], [ ZFS_LINUX_TEST_ERROR([lookup_bdev()]) ]) ]) ]) ]) dnl # dnl # 2.6.30 API change dnl # dnl # The bdev_physical_block_size() interface was added to provide a way dnl # to determine the smallest write which can be performed without a dnl # read-modify-write operation. dnl # dnl # Unfortunately, this interface isn't entirely reliable because dnl # drives are sometimes known to misreport this value. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ ZFS_LINUX_TEST_SRC([bdev_physical_block_size], [ #include ],[ struct block_device *bdev __attribute__ ((unused)) = NULL; bdev_physical_block_size(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ AC_MSG_CHECKING([whether bdev_physical_block_size() is available]) ZFS_LINUX_TEST_RESULT([bdev_physical_block_size], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bdev_physical_block_size()]) ]) ]) dnl # dnl # 2.6.30 API change dnl # Added bdev_logical_block_size(). dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ ZFS_LINUX_TEST_SRC([bdev_logical_block_size], [ #include ],[ struct block_device *bdev __attribute__ ((unused)) = NULL; bdev_logical_block_size(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ AC_MSG_CHECKING([whether bdev_logical_block_size() is available]) ZFS_LINUX_TEST_RESULT([bdev_logical_block_size], [ AC_MSG_RESULT(yes) ],[ ZFS_LINUX_TEST_ERROR([bdev_logical_block_size()]) ]) ]) dnl # dnl # 5.11 API change dnl # Added bdev_whole() helper. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE], [ ZFS_LINUX_TEST_SRC([bdev_whole], [ #include ],[ struct block_device *bdev = NULL; bdev = bdev_whole(bdev); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE], [ AC_MSG_CHECKING([whether bdev_whole() is available]) ZFS_LINUX_TEST_RESULT([bdev_whole], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BDEV_WHOLE, 1, [bdev_whole() is available]) ],[ AC_MSG_RESULT(no) ]) ]) +dnl # +dnl # 5.20 API change, +dnl # Removed bdevname(), snprintf(.., %pg) should be used. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME], [ + ZFS_LINUX_TEST_SRC([bdevname], [ + #include + #include + ], [ + struct block_device *bdev __attribute__ ((unused)) = NULL; + char path[BDEVNAME_SIZE]; + + (void) bdevname(bdev, path); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ + AC_MSG_CHECKING([whether bdevname() exists]) + ZFS_LINUX_TEST_RESULT([bdevname], [ + AC_DEFINE(HAVE_BDEVNAME, 1, [bdevname() is available]) + AC_MSG_RESULT(yes) + ], [ + AC_MSG_RESULT(no) + ]) +]) + dnl # dnl # 5.19 API: blkdev_issue_secure_erase() dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ ZFS_LINUX_TEST_SRC([blkdev_issue_secure_erase], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; int error __attribute__ ((unused)); error = blkdev_issue_secure_erase(bdev, sector, nr_sects, GFP_KERNEL); ]) ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ #include ],[ struct block_device *bdev = NULL; sector_t sector = 0; sector_t nr_sects = 0; unsigned long flags = 0; int error __attribute__ ((unused)); error = blkdev_issue_discard(bdev, sector, nr_sects, GFP_KERNEL, flags); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ AC_MSG_CHECKING([whether blkdev_issue_secure_erase() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_secure_erase], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_ISSUE_SECURE_ERASE, 1, [blkdev_issue_secure_erase() is available]) ],[ AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, [blkdev_issue_discard() is available]) ],[ ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) ]) ]) ]) dnl # dnl # 5.13 API change dnl # blkdev_get_by_path() no longer handles ERESTARTSYS dnl # dnl # Unfortunately we're forced to rely solely on the kernel version dnl # number in order to determine the expected behavior. This was an dnl # internal change to blkdev_get_by_dev(), see commit a8ed1a0607. dnl # AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS], [ AC_MSG_CHECKING([whether blkdev_get_by_path() handles ERESTARTSYS]) AS_VERSION_COMPARE([$LINUX_VERSION], [5.13.0], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_GET_ERESTARTSYS, 1, [blkdev_get_by_path() handles ERESTARTSYS]) ],[ AC_MSG_RESULT(no) ],[ AC_MSG_RESULT(no) ]) ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE ZFS_AC_KERNEL_SRC_BLKDEV_CHECK_DISK_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_WHOLE + ZFS_AC_KERNEL_SRC_BLKDEV_BDEVNAME ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_BLKDEV_PUT ZFS_AC_KERNEL_BLKDEV_REREAD_PART ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE ZFS_AC_KERNEL_BLKDEV_CHECK_DISK_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BDEV_WHOLE + ZFS_AC_KERNEL_BLKDEV_BDEVNAME ZFS_AC_KERNEL_BLKDEV_GET_ERESTARTSYS ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE ]) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 97cd90bf0a75..d19595706ca0 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -1,1012 +1,1022 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * Rewritten for Linux by Brian Behlendorf . * LLNL-CODE-403049. * Copyright (c) 2012, 2019 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_LINUX_BLK_CGROUP_HEADER #include #endif typedef struct vdev_disk { struct block_device *vd_bdev; krwlock_t vd_lock; } vdev_disk_t; /* * Unique identifier for the exclusive vdev holder. */ static void *zfs_vdev_holder = VDEV_HOLDER; /* * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the * device is missing. The missing path may be transient since the links * can be briefly removed and recreated in response to udev events. */ static unsigned zfs_vdev_open_timeout_ms = 1000; /* * Size of the "reserved" partition, in blocks. */ #define EFI_MIN_RESV_SIZE (16 * 1024) /* * Virtual device vector for disks. */ typedef struct dio_request { zio_t *dr_zio; /* Parent ZIO */ atomic_t dr_ref; /* References */ int dr_error; /* Bio error */ int dr_bio_count; /* Count of bio's */ struct bio *dr_bio[0]; /* Attached bio's */ } dio_request_t; static fmode_t vdev_bdev_mode(spa_mode_t spa_mode) { fmode_t mode = 0; if (spa_mode & SPA_MODE_READ) mode |= FMODE_READ; if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; return (mode); } /* * Returns the usable capacity (in bytes) for the partition or disk. */ static uint64_t bdev_capacity(struct block_device *bdev) { return (i_size_read(bdev->bd_inode)); } #if !defined(HAVE_BDEV_WHOLE) static inline struct block_device * bdev_whole(struct block_device *bdev) { return (bdev->bd_contains); } #endif +#if defined(HAVE_BDEVNAME) +#define vdev_bdevname(bdev, name) bdevname(bdev, name) +#else +static inline void +vdev_bdevname(struct block_device *bdev, char *name) +{ + snprintf(name, BDEVNAME_SIZE, "%pg", bdev); +} +#endif + /* * Returns the maximum expansion capacity of the block device (in bytes). * * It is possible to expand a vdev when it has been created as a wholedisk * and the containing block device has increased in capacity. Or when the * partition containing the pool has been manually increased in size. * * This function is only responsible for calculating the potential expansion * size so it can be reported by 'zpool list'. The efi_use_whole_disk() is * responsible for verifying the expected partition layout in the wholedisk * case, and updating the partition table if appropriate. Once the partition * size has been increased the additional capacity will be visible using * bdev_capacity(). * * The returned maximum expansion capacity is always expected to be larger, or * at the very least equal, to its usable capacity to prevent overestimating * the pool expandsize. */ static uint64_t bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) { uint64_t psize; int64_t available; if (wholedisk && bdev != bdev_whole(bdev)) { /* * When reporting maximum expansion capacity for a wholedisk * deduct any capacity which is expected to be lost due to * alignment restrictions. Over reporting this value isn't * harmful and would only result in slightly less capacity * than expected post expansion. * The estimated available space may be slightly smaller than * bdev_capacity() for devices where the number of sectors is * not a multiple of the alignment size and the partition layout * is keeping less than PARTITION_END_ALIGNMENT bytes after the * "reserved" EFI partition: in such cases return the device * usable capacity. */ available = i_size_read(bdev_whole(bdev)->bd_inode) - ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + PARTITION_END_ALIGNMENT) << SECTOR_BITS); psize = MAX(available, bdev_capacity(bdev)); } else { psize = bdev_capacity(bdev); } return (psize); } static void vdev_disk_error(zio_t *zio) { /* * This function can be called in interrupt context, for instance while * handling IRQs coming from a misbehaving disk device; use printk() * which is safe from any context. */ printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa), zio->io_vd->vdev_path, zio->io_error, zio->io_type, (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, zio->io_flags); } static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct block_device *bdev; fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; /* Must have a pathname and it must be absolute. */ if (v->vdev_path == NULL || v->vdev_path[0] != '/') { v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; vdev_dbgmsg(v, "invalid vdev_path"); return (SET_ERROR(EINVAL)); } /* * Reopen the device if it is currently open. When expanding a * partition force re-scanning the partition table if userland * did not take care of this already. We need to do this while closed * in order to get an accurate updated block device size. Then * since udev may need to recreate the device links increase the * open retry timeout before reporting the device as unavailable. */ vd = v->vdev_tsd; if (vd) { char disk_name[BDEVNAME_SIZE + 6] = "/dev/"; boolean_t reread_part = B_FALSE; rw_enter(&vd->vd_lock, RW_WRITER); bdev = vd->vd_bdev; vd->vd_bdev = NULL; if (bdev) { if (v->vdev_expanding && bdev != bdev_whole(bdev)) { - bdevname(bdev_whole(bdev), disk_name + 5); + vdev_bdevname(bdev_whole(bdev), disk_name + 5); /* * If userland has BLKPG_RESIZE_PARTITION, * then it should have updated the partition * table already. We can detect this by * comparing our current physical size * with that of the device. If they are * the same, then we must not have * BLKPG_RESIZE_PARTITION or it failed to * update the partition table online. We * fallback to rescanning the partition * table from the kernel below. However, * if the capacity already reflects the * updated partition, then we skip * rescanning the partition table here. */ if (v->vdev_psize == bdev_capacity(bdev)) reread_part = B_TRUE; } blkdev_put(bdev, mode | FMODE_EXCL); } if (reread_part) { bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL, zfs_vdev_holder); if (!IS_ERR(bdev)) { int error = vdev_bdev_reread_part(bdev); blkdev_put(bdev, mode | FMODE_EXCL); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); } } } } else { vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL); rw_enter(&vd->vd_lock, RW_WRITER); } /* * Devices are always opened by the path provided at configuration * time. This means that if the provided path is a udev by-id path * then drives may be re-cabled without an issue. If the provided * path is a udev by-path path, then the physical location information * will be preserved. This can be critical for more complicated * configurations where drives are located in specific physical * locations to maximize the systems tolerance to component failure. * * Alternatively, you can provide your own udev rule to flexibly map * the drives as you see fit. It is not advised that you use the * /dev/[hd]d devices which may be reordered due to probing order. * Devices in the wrong locations will be detected by the higher * level vdev validation. * * The specified paths may be briefly removed and recreated in * response to udev events. This should be exceptionally unlikely * because the zpool command makes every effort to verify these paths * have already settled prior to reaching this point. Therefore, * a ENOENT failure at this point is highly likely to be transient * and it is reasonable to sleep and retry before giving up. In * practice delays have been observed to be on the order of 100ms. * * When ERESTARTSYS is returned it indicates the block device is * a zvol which could not be opened due to the deadlock detection * logic in zvol_open(). Extend the timeout and retry the open * subsequent attempts are expected to eventually succeed. */ hrtime_t start = gethrtime(); bdev = ERR_PTR(-ENXIO); while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, zfs_vdev_holder); if (unlikely(PTR_ERR(bdev) == -ENOENT)) { schedule_timeout(MSEC_TO_TICK(10)); } else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); continue; } else if (IS_ERR(bdev)) { break; } } if (IS_ERR(bdev)) { int error = -PTR_ERR(bdev); vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, (u_longlong_t)(gethrtime() - start), (u_longlong_t)timeout); vd->vd_bdev = NULL; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); return (SET_ERROR(error)); } else { vd->vd_bdev = bdev; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); } /* Determine the physical block size */ int physical_block_size = bdev_physical_block_size(vd->vd_bdev); /* Determine the logical block size */ int logical_block_size = bdev_logical_block_size(vd->vd_bdev); /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ v->vdev_nowritecache = B_FALSE; /* Set when device reports it supports TRIM. */ v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev); /* Set when device reports it supports secure TRIM. */ v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev); /* Inform the ZIO pipeline that we are non-rotational */ v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev)); /* Physical volume size in bytes for the partition */ *psize = bdev_capacity(vd->vd_bdev); /* Physical volume size in bytes including possible expansion space */ *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); /* Based on the minimum sector size set the block size */ *physical_ashift = highbit64(MAX(physical_block_size, SPA_MINBLOCKSIZE)) - 1; *logical_ashift = highbit64(MAX(logical_block_size, SPA_MINBLOCKSIZE)) - 1; return (0); } static void vdev_disk_close(vdev_t *v) { vdev_disk_t *vd = v->vdev_tsd; if (v->vdev_reopening || vd == NULL) return; if (vd->vd_bdev != NULL) { blkdev_put(vd->vd_bdev, vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); } rw_destroy(&vd->vd_lock); kmem_free(vd, sizeof (vdev_disk_t)); v->vdev_tsd = NULL; } static dio_request_t * vdev_disk_dio_alloc(int bio_count) { dio_request_t *dr = kmem_zalloc(sizeof (dio_request_t) + sizeof (struct bio *) * bio_count, KM_SLEEP); atomic_set(&dr->dr_ref, 0); dr->dr_bio_count = bio_count; dr->dr_error = 0; for (int i = 0; i < dr->dr_bio_count; i++) dr->dr_bio[i] = NULL; return (dr); } static void vdev_disk_dio_free(dio_request_t *dr) { int i; for (i = 0; i < dr->dr_bio_count; i++) if (dr->dr_bio[i]) bio_put(dr->dr_bio[i]); kmem_free(dr, sizeof (dio_request_t) + sizeof (struct bio *) * dr->dr_bio_count); } static void vdev_disk_dio_get(dio_request_t *dr) { atomic_inc(&dr->dr_ref); } static int vdev_disk_dio_put(dio_request_t *dr) { int rc = atomic_dec_return(&dr->dr_ref); /* * Free the dio_request when the last reference is dropped and * ensure zio_interpret is called only once with the correct zio */ if (rc == 0) { zio_t *zio = dr->dr_zio; int error = dr->dr_error; vdev_disk_dio_free(dr); if (zio) { zio->io_error = error; ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); zio_delay_interrupt(zio); } } return (rc); } BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) { dio_request_t *dr = bio->bi_private; int rc; if (dr->dr_error == 0) { #ifdef HAVE_1ARG_BIO_END_IO_T dr->dr_error = BIO_END_IO_ERROR(bio); #else if (error) dr->dr_error = -(error); else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) dr->dr_error = EIO; #endif } /* Drop reference acquired by __vdev_disk_physio */ rc = vdev_disk_dio_put(dr); } static inline void vdev_submit_bio_impl(struct bio *bio) { #ifdef HAVE_1ARG_SUBMIT_BIO (void) submit_bio(bio); #else (void) submit_bio(bio_data_dir(bio), bio); #endif } /* * preempt_schedule_notrace is GPL-only which breaks the ZFS build, so * replace it with preempt_schedule under the following condition: */ #if defined(CONFIG_ARM64) && \ defined(CONFIG_PREEMPTION) && \ defined(CONFIG_BLK_CGROUP) #define preempt_schedule_notrace(x) preempt_schedule(x) #endif /* * As for the Linux 5.18 kernel bio_alloc() expects a block_device struct * as an argument removing the need to set it with bio_set_dev(). This * removes the need for all of the following compatibility code. */ #if !defined(HAVE_BIO_ALLOC_4ARG) #ifdef HAVE_BIO_SET_DEV #if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) /* * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). * As a side effect the function was converted to GPL-only. Define our * own version when needed which uses rcu_read_lock_sched(). * * The Linux 5.17 kernel split linux/blk-cgroup.h into a private and a public * part, moving blkg_tryget into the private one. Define our own version. */ #if defined(HAVE_BLKG_TRYGET_GPL_ONLY) || !defined(HAVE_BLKG_TRYGET) static inline bool vdev_blkg_tryget(struct blkcg_gq *blkg) { struct percpu_ref *ref = &blkg->refcnt; unsigned long __percpu *count; bool rc; rcu_read_lock_sched(); if (__ref_is_percpu(ref, &count)) { this_cpu_inc(*count); rc = true; } else { #ifdef ZFS_PERCPU_REF_COUNT_IN_DATA rc = atomic_long_inc_not_zero(&ref->data->count); #else rc = atomic_long_inc_not_zero(&ref->count); #endif } rcu_read_unlock_sched(); return (rc); } #else #define vdev_blkg_tryget(bg) blkg_tryget(bg) #endif #ifdef HAVE_BIO_SET_DEV_MACRO /* * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the * GPL-only bio_associate_blkg() symbol thus inadvertently converting * the entire macro. Provide a minimal version which always assigns the * request queue's root_blkg to the bio. */ static inline void vdev_bio_associate_blkg(struct bio *bio) { #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif ASSERT3P(q, !=, NULL); ASSERT3P(bio->bi_blkg, ==, NULL); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; } #define bio_associate_blkg vdev_bio_associate_blkg #else static inline void vdev_bio_set_dev(struct bio *bio, struct block_device *bdev) { #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif bio_clear_flag(bio, BIO_REMAPPED); if (bio->bi_bdev != bdev) bio_clear_flag(bio, BIO_THROTTLED); bio->bi_bdev = bdev; ASSERT3P(q, !=, NULL); ASSERT3P(bio->bi_blkg, ==, NULL); if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) bio->bi_blkg = q->root_blkg; } #define bio_set_dev vdev_bio_set_dev #endif #endif #else /* * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. */ static inline void bio_set_dev(struct bio *bio, struct block_device *bdev) { bio->bi_bdev = bdev; } #endif /* HAVE_BIO_SET_DEV */ #endif /* !HAVE_BIO_ALLOC_4ARG */ static inline void vdev_submit_bio(struct bio *bio) { struct bio_list *bio_list = current->bio_list; current->bio_list = NULL; vdev_submit_bio_impl(bio); current->bio_list = bio_list; } static inline struct bio * vdev_bio_alloc(struct block_device *bdev, gfp_t gfp_mask, unsigned short nr_vecs) { struct bio *bio; #ifdef HAVE_BIO_ALLOC_4ARG bio = bio_alloc(bdev, nr_vecs, 0, gfp_mask); #else bio = bio_alloc(gfp_mask, nr_vecs); if (likely(bio != NULL)) bio_set_dev(bio, bdev); #endif return (bio); } static inline unsigned int vdev_bio_max_segs(zio_t *zio, int bio_size, uint64_t abd_offset) { unsigned long nr_segs = abd_nr_pages_off(zio->io_abd, bio_size, abd_offset); #ifdef HAVE_BIO_MAX_SEGS return (bio_max_segs(nr_segs)); #else return (MIN(nr_segs, BIO_MAX_PAGES)); #endif } static int __vdev_disk_physio(struct block_device *bdev, zio_t *zio, size_t io_size, uint64_t io_offset, int rw, int flags) { dio_request_t *dr; uint64_t abd_offset; uint64_t bio_offset; int bio_size; int bio_count = 16; int error = 0; struct blk_plug plug; unsigned short nr_vecs; /* * Accessing outside the block device is never allowed. */ if (io_offset + io_size > bdev->bd_inode->i_size) { vdev_dbgmsg(zio->io_vd, "Illegal access %llu size %llu, device size %llu", io_offset, io_size, i_size_read(bdev->bd_inode)); return (SET_ERROR(EIO)); } retry: dr = vdev_disk_dio_alloc(bio_count); if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) bio_set_flags_failfast(bdev, &flags); dr->dr_zio = zio; /* * Since bio's can have up to BIO_MAX_PAGES=256 iovec's, each of which * is at least 512 bytes and at most PAGESIZE (typically 4K), one bio * can cover at least 128KB and at most 1MB. When the required number * of iovec's exceeds this, we are forced to break the IO in multiple * bio's and wait for them all to complete. This is likely if the * recordsize property is increased beyond 1MB. The default * bio_count=16 should typically accommodate the maximum-size zio of * 16MB. */ abd_offset = 0; bio_offset = io_offset; bio_size = io_size; for (int i = 0; i <= dr->dr_bio_count; i++) { /* Finished constructing bio's for given buffer */ if (bio_size <= 0) break; /* * If additional bio's are required, we have to retry, but * this should be rare - see the comment above. */ if (dr->dr_bio_count == i) { vdev_disk_dio_free(dr); bio_count *= 2; goto retry; } nr_vecs = vdev_bio_max_segs(zio, bio_size, abd_offset); dr->dr_bio[i] = vdev_bio_alloc(bdev, GFP_NOIO, nr_vecs); if (unlikely(dr->dr_bio[i] == NULL)) { vdev_disk_dio_free(dr); return (SET_ERROR(ENOMEM)); } /* Matching put called by vdev_disk_physio_completion */ vdev_disk_dio_get(dr); BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; dr->dr_bio[i]->bi_private = dr; bio_set_op_attrs(dr->dr_bio[i], rw, flags); /* Remaining size is returned to become the new size */ bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, bio_size, abd_offset); /* Advance in buffer and construct another bio if needed */ abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); } /* Extra reference to protect dio_request during vdev_submit_bio */ vdev_disk_dio_get(dr); if (dr->dr_bio_count > 1) blk_start_plug(&plug); /* Submit all bio's associated with this dio */ for (int i = 0; i < dr->dr_bio_count; i++) { if (dr->dr_bio[i]) vdev_submit_bio(dr->dr_bio[i]); } if (dr->dr_bio_count > 1) blk_finish_plug(&plug); (void) vdev_disk_dio_put(dr); return (error); } BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) { zio_t *zio = bio->bi_private; #ifdef HAVE_1ARG_BIO_END_IO_T zio->io_error = BIO_END_IO_ERROR(bio); #else zio->io_error = -error; #endif if (zio->io_error && (zio->io_error == EOPNOTSUPP)) zio->io_vd->vdev_nowritecache = B_TRUE; bio_put(bio); ASSERT3S(zio->io_error, >=, 0); if (zio->io_error) vdev_disk_error(zio); zio_interrupt(zio); } static int vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) { struct request_queue *q; struct bio *bio; q = bdev_get_queue(bdev); if (!q) return (SET_ERROR(ENXIO)); bio = vdev_bio_alloc(bdev, GFP_NOIO, 0); if (unlikely(bio == NULL)) return (SET_ERROR(ENOMEM)); bio->bi_end_io = vdev_disk_io_flush_completion; bio->bi_private = zio; bio_set_flush(bio); vdev_submit_bio(bio); invalidate_bdev(bdev); return (0); } static int vdev_disk_io_trim(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) if (zio->io_trim_flags & ZIO_TRIM_SECURE) { return (-blkdev_issue_secure_erase(vd->vd_bdev, zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); } else { return (-blkdev_issue_discard(vd->vd_bdev, zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); } #elif defined(HAVE_BLKDEV_ISSUE_DISCARD) unsigned long trim_flags = 0; #if defined(BLKDEV_DISCARD_SECURE) if (zio->io_trim_flags & ZIO_TRIM_SECURE) trim_flags |= BLKDEV_DISCARD_SECURE; #endif return (-blkdev_issue_discard(vd->vd_bdev, zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); #else #error "Unsupported kernel" #endif } static void vdev_disk_io_start(zio_t *zio) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; int rw, error; /* * If the vdev is closed, it's likely in the REMOVED or FAULTED state. * Nothing to be done here but return failure. */ if (vd == NULL) { zio->io_error = ENXIO; zio_interrupt(zio); return; } rw_enter(&vd->vd_lock, RW_READER); /* * If the vdev is closed, it's likely due to a failed reopen and is * in the UNAVAIL state. Nothing to be done here but return failure. */ if (vd->vd_bdev == NULL) { rw_exit(&vd->vd_lock); zio->io_error = ENXIO; zio_interrupt(zio); return; } switch (zio->io_type) { case ZIO_TYPE_IOCTL: if (!vdev_readable(v)) { rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENXIO); zio_interrupt(zio); return; } switch (zio->io_cmd) { case DKIOCFLUSHWRITECACHE: if (zfs_nocacheflush) break; if (v->vdev_nowritecache) { zio->io_error = SET_ERROR(ENOTSUP); break; } error = vdev_disk_io_flush(vd->vd_bdev, zio); if (error == 0) { rw_exit(&vd->vd_lock); return; } zio->io_error = error; break; default: zio->io_error = SET_ERROR(ENOTSUP); } rw_exit(&vd->vd_lock); zio_execute(zio); return; case ZIO_TYPE_WRITE: rw = WRITE; break; case ZIO_TYPE_READ: rw = READ; break; case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); zio_interrupt(zio); return; default: rw_exit(&vd->vd_lock); zio->io_error = SET_ERROR(ENOTSUP); zio_interrupt(zio); return; } zio->io_target_timestamp = zio_handle_io_delay(zio); error = __vdev_disk_physio(vd->vd_bdev, zio, zio->io_size, zio->io_offset, rw, 0); rw_exit(&vd->vd_lock); if (error) { zio->io_error = error; zio_interrupt(zio); return; } } static void vdev_disk_io_done(zio_t *zio) { /* * If the device returned EIO, we revalidate the media. If it is * determined the media has changed this triggers the asynchronous * removal of the device from the configuration. */ if (zio->io_error == EIO) { vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; if (zfs_check_media_change(vd->vd_bdev)) { invalidate_bdev(vd->vd_bdev); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } } } static void vdev_disk_hold(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* We must have a pathname, and it must be absolute. */ if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') return; /* * Only prefetch path and devid info if the device has * never been opened. */ if (vd->vdev_tsd != NULL) return; } static void vdev_disk_rele(vdev_t *vd) { ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); /* XXX: Implement me as a vnode rele for the device */ } vdev_ops_t vdev_disk_ops = { .vdev_op_init = NULL, .vdev_op_fini = NULL, .vdev_op_open = vdev_disk_open, .vdev_op_close = vdev_disk_close, .vdev_op_asize = vdev_default_asize, .vdev_op_min_asize = vdev_default_min_asize, .vdev_op_min_alloc = NULL, .vdev_op_io_start = vdev_disk_io_start, .vdev_op_io_done = vdev_disk_io_done, .vdev_op_state_change = NULL, .vdev_op_need_resilver = NULL, .vdev_op_hold = vdev_disk_hold, .vdev_op_rele = vdev_disk_rele, .vdev_op_remap = NULL, .vdev_op_xlate = vdev_default_xlate, .vdev_op_rebuild_asize = NULL, .vdev_op_metaslab_init = NULL, .vdev_op_config_generate = NULL, .vdev_op_nparity = NULL, .vdev_op_ndisks = NULL, .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ .vdev_op_leaf = B_TRUE /* leaf vdev */ }; /* * The zfs_vdev_scheduler module option has been deprecated. Setting this * value no longer has any effect. It has not yet been entirely removed * to allow the module to be loaded if this option is specified in the * /etc/modprobe.d/zfs.conf file. The following warning will be logged. */ static int param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) { int error = param_set_charp(val, kp); if (error == 0) { printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " "is not supported.\n"); } return (error); } char *zfs_vdev_scheduler = "unused"; module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, param_get_charp, &zfs_vdev_scheduler, 0644); MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); int param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { uint64_t val; int error; error = kstrtoull(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) return (SET_ERROR(-EINVAL)); error = param_set_ulong(buf, kp); if (error < 0) return (SET_ERROR(error)); return (0); } int param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) { uint64_t val; int error; error = kstrtoull(buf, 0, &val); if (error < 0) return (SET_ERROR(error)); if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) return (SET_ERROR(-EINVAL)); error = param_set_ulong(buf, kp); if (error < 0) return (SET_ERROR(error)); return (0); }