diff --git a/config/kernel-fsync-bdev.m4 b/config/kernel-fsync-bdev.m4 new file mode 100644 index 000000000000..c47e236f705f --- /dev/null +++ b/config/kernel-fsync-bdev.m4 @@ -0,0 +1,36 @@ +dnl # +dnl # 6.6 API change, +dnl # fsync_bdev was removed in favor of sync_blockdev +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SYNC_BDEV], [ + ZFS_LINUX_TEST_SRC([fsync_bdev], [ + #include + ],[ + fsync_bdev(NULL); + ]) + + ZFS_LINUX_TEST_SRC([sync_blockdev], [ + #include + ],[ + sync_blockdev(NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SYNC_BDEV], [ + AC_MSG_CHECKING([whether fsync_bdev() exists]) + ZFS_LINUX_TEST_RESULT([fsync_bdev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FSYNC_BDEV, 1, + [fsync_bdev() is declared in include/blkdev.h]) + ],[ + AC_MSG_CHECKING([whether sync_blockdev() exists]) + ZFS_LINUX_TEST_RESULT([sync_blockdev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SYNC_BLOCKDEV, 1, + [sync_blockdev() is declared in include/blkdev.h]) + ],[ + ZFS_LINUX_TEST_ERROR( + [neither fsync_bdev() nor sync_blockdev() exist]) + ]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index df194ec72207..056517a841f2 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -1,1028 +1,1030 @@ dnl # dnl # Default ZFS kernel configuration dnl # AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ AM_COND_IF([BUILD_LINUX], [ dnl # Setup the kernel build environment. ZFS_AC_KERNEL ZFS_AC_QAT dnl # Sanity checks for module building and CONFIG_* defines ZFS_AC_KERNEL_CONFIG_DEFINED ZFS_AC_MODULE_SYMVERS dnl # Sequential ZFS_LINUX_TRY_COMPILE tests ZFS_AC_KERNEL_FPU_HEADER ZFS_AC_KERNEL_OBJTOOL_HEADER ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T ZFS_AC_KERNEL_MISC_MINOR ZFS_AC_KERNEL_DECLARE_EVENT_CLASS dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests ZFS_AC_KERNEL_TEST_SRC ZFS_AC_KERNEL_TEST_RESULT AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ" ]) AC_SUBST(KERNEL_MAKE) ]) ]) dnl # dnl # Generate and compile all of the kernel API test cases to determine dnl # which interfaces are available. By invoking the kernel build system dnl # only once the compilation can be done in parallel significantly dnl # speeding up the process. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_OBJTOOL ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE ZFS_AC_KERNEL_SRC_PDE_DATA ZFS_AC_KERNEL_SRC_FALLOCATE ZFS_AC_KERNEL_SRC_FADVISE ZFS_AC_KERNEL_SRC_GENERIC_FADVISE ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_SRC_RWSEM ZFS_AC_KERNEL_SRC_SCHED ZFS_AC_KERNEL_SRC_USLEEP_RANGE ZFS_AC_KERNEL_SRC_KMEM_CACHE ZFS_AC_KERNEL_SRC_KVMALLOC ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_SRC_WAIT ZFS_AC_KERNEL_SRC_INODE_TIMES ZFS_AC_KERNEL_SRC_INODE_LOCK ZFS_AC_KERNEL_SRC_GROUP_INFO_GID ZFS_AC_KERNEL_SRC_RW ZFS_AC_KERNEL_SRC_TIMER_SETUP ZFS_AC_KERNEL_SRC_SUPER_USER_NS ZFS_AC_KERNEL_SRC_PROC_OPERATIONS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_SRC_BIO ZFS_AC_KERNEL_SRC_BLKDEV ZFS_AC_KERNEL_SRC_BLK_QUEUE ZFS_AC_KERNEL_SRC_GENHD_FLAGS ZFS_AC_KERNEL_SRC_REVALIDATE_DISK ZFS_AC_KERNEL_SRC_GET_DISK_RO ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_SRC_XATTR ZFS_AC_KERNEL_SRC_ACL ZFS_AC_KERNEL_SRC_INODE_SETATTR ZFS_AC_KERNEL_SRC_INODE_GETATTR ZFS_AC_KERNEL_SRC_INODE_SET_FLAGS ZFS_AC_KERNEL_SRC_INODE_SET_IVERSION ZFS_AC_KERNEL_SRC_SHOW_OPTIONS ZFS_AC_KERNEL_SRC_FILE_INODE ZFS_AC_KERNEL_SRC_FILE_DENTRY ZFS_AC_KERNEL_SRC_FSYNC ZFS_AC_KERNEL_SRC_AIO_FSYNC ZFS_AC_KERNEL_SRC_EVICT_INODE ZFS_AC_KERNEL_SRC_DIRTY_INODE ZFS_AC_KERNEL_SRC_SHRINKER ZFS_AC_KERNEL_SRC_MKDIR ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS ZFS_AC_KERNEL_SRC_CREATE ZFS_AC_KERNEL_SRC_PERMISSION ZFS_AC_KERNEL_SRC_GET_LINK ZFS_AC_KERNEL_SRC_PUT_LINK ZFS_AC_KERNEL_SRC_TMPFILE ZFS_AC_KERNEL_SRC_AUTOMOUNT ZFS_AC_KERNEL_SRC_ENCODE_FH_WITH_INODE ZFS_AC_KERNEL_SRC_COMMIT_METADATA ZFS_AC_KERNEL_SRC_CLEAR_INODE ZFS_AC_KERNEL_SRC_SETATTR_PREPARE ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED ZFS_AC_KERNEL_SRC_DENTRY ZFS_AC_KERNEL_SRC_DENTRY_ALIAS_D_U ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SRC_SECURITY_INODE ZFS_AC_KERNEL_SRC_FST_MOUNT ZFS_AC_KERNEL_SRC_BDI ZFS_AC_KERNEL_SRC_SET_NLINK ZFS_AC_KERNEL_SRC_SGET ZFS_AC_KERNEL_SRC_LSEEK_EXECUTE ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO ZFS_AC_KERNEL_SRC_VFS_GETATTR ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_ITERATE ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_FILE_OPERATIONS_EXTEND ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT ZFS_AC_KERNEL_SRC_FPU ZFS_AC_KERNEL_SRC_FMODE_T ZFS_AC_KERNEL_SRC_KUIDGID_T ZFS_AC_KERNEL_SRC_KUID_HELPERS ZFS_AC_KERNEL_SRC_RENAME ZFS_AC_KERNEL_SRC_CURRENT_TIME ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES ZFS_AC_KERNEL_SRC_IN_COMPAT_SYSCALL ZFS_AC_KERNEL_SRC_KTIME ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES ZFS_AC_KERNEL_SRC_KSTRTOUL ZFS_AC_KERNEL_SRC_PERCPU ZFS_AC_KERNEL_SRC_CPU_HOTPLUG ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR ZFS_AC_KERNEL_SRC_MKNOD ZFS_AC_KERNEL_SRC_SYMLINK ZFS_AC_KERNEL_SRC_BIO_MAX_SEGS ZFS_AC_KERNEL_SRC_SIGNAL_STOP ZFS_AC_KERNEL_SRC_SIGINFO ZFS_AC_KERNEL_SRC_SYSFS ZFS_AC_KERNEL_SRC_SET_SPECIAL_STATE ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_SRC_ADD_DISK ZFS_AC_KERNEL_SRC_KTHREAD ZFS_AC_KERNEL_SRC_ZERO_PAGE ZFS_AC_KERNEL_SRC___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_SRC_USER_NS_COMMON_INUM ZFS_AC_KERNEL_SRC_IDMAP_MNT_API ZFS_AC_KERNEL_SRC_IATTR_VFSID ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ + ZFS_AC_KERNEL_SRC_SYNC_BDEV case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE ;; esac AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) AC_MSG_RESULT([done]) ]) dnl # dnl # Check results of kernel interface tests. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_ACCESS_OK_TYPE ZFS_AC_KERNEL_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_OBJTOOL ZFS_AC_KERNEL_PDE_DATA ZFS_AC_KERNEL_FALLOCATE ZFS_AC_KERNEL_FADVISE ZFS_AC_KERNEL_GENERIC_FADVISE ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_RWSEM ZFS_AC_KERNEL_SCHED ZFS_AC_KERNEL_USLEEP_RANGE ZFS_AC_KERNEL_KMEM_CACHE ZFS_AC_KERNEL_KVMALLOC ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_WAIT ZFS_AC_KERNEL_INODE_TIMES ZFS_AC_KERNEL_INODE_LOCK ZFS_AC_KERNEL_GROUP_INFO_GID ZFS_AC_KERNEL_RW ZFS_AC_KERNEL_TIMER_SETUP ZFS_AC_KERNEL_SUPER_USER_NS ZFS_AC_KERNEL_PROC_OPERATIONS ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_BIO ZFS_AC_KERNEL_BLKDEV ZFS_AC_KERNEL_BLK_QUEUE ZFS_AC_KERNEL_GENHD_FLAGS ZFS_AC_KERNEL_REVALIDATE_DISK ZFS_AC_KERNEL_GET_DISK_RO ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_DISCARD_GRANULARITY ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_XATTR ZFS_AC_KERNEL_ACL ZFS_AC_KERNEL_INODE_SETATTR ZFS_AC_KERNEL_INODE_GETATTR ZFS_AC_KERNEL_INODE_SET_FLAGS ZFS_AC_KERNEL_INODE_SET_IVERSION ZFS_AC_KERNEL_SHOW_OPTIONS ZFS_AC_KERNEL_FILE_INODE ZFS_AC_KERNEL_FILE_DENTRY ZFS_AC_KERNEL_FSYNC ZFS_AC_KERNEL_AIO_FSYNC ZFS_AC_KERNEL_EVICT_INODE ZFS_AC_KERNEL_DIRTY_INODE ZFS_AC_KERNEL_SHRINKER ZFS_AC_KERNEL_MKDIR ZFS_AC_KERNEL_LOOKUP_FLAGS ZFS_AC_KERNEL_CREATE ZFS_AC_KERNEL_PERMISSION ZFS_AC_KERNEL_GET_LINK ZFS_AC_KERNEL_PUT_LINK ZFS_AC_KERNEL_TMPFILE ZFS_AC_KERNEL_AUTOMOUNT ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE ZFS_AC_KERNEL_COMMIT_METADATA ZFS_AC_KERNEL_CLEAR_INODE ZFS_AC_KERNEL_SETATTR_PREPARE ZFS_AC_KERNEL_INSERT_INODE_LOCKED ZFS_AC_KERNEL_DENTRY ZFS_AC_KERNEL_DENTRY_ALIAS_D_U ZFS_AC_KERNEL_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SECURITY_INODE ZFS_AC_KERNEL_FST_MOUNT ZFS_AC_KERNEL_BDI ZFS_AC_KERNEL_SET_NLINK ZFS_AC_KERNEL_SGET ZFS_AC_KERNEL_LSEEK_EXECUTE ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_VFS_READ_FOLIO ZFS_AC_KERNEL_VFS_GETATTR ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_ITERATE ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_VFS_RW_ITERATE ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_VFS_FILE_OPERATIONS_EXTEND ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN ZFS_AC_KERNEL_GENERIC_IO_ACCT ZFS_AC_KERNEL_FPU ZFS_AC_KERNEL_FMODE_T ZFS_AC_KERNEL_KUIDGID_T ZFS_AC_KERNEL_KUID_HELPERS ZFS_AC_KERNEL_RENAME ZFS_AC_KERNEL_CURRENT_TIME ZFS_AC_KERNEL_USERNS_CAPABILITIES ZFS_AC_KERNEL_IN_COMPAT_SYSCALL ZFS_AC_KERNEL_KTIME ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_TOTALHIGH_PAGES ZFS_AC_KERNEL_KSTRTOUL ZFS_AC_KERNEL_PERCPU ZFS_AC_KERNEL_CPU_HOTPLUG ZFS_AC_KERNEL_GENERIC_FILLATTR ZFS_AC_KERNEL_MKNOD ZFS_AC_KERNEL_SYMLINK ZFS_AC_KERNEL_BIO_MAX_SEGS ZFS_AC_KERNEL_SIGNAL_STOP ZFS_AC_KERNEL_SIGINFO ZFS_AC_KERNEL_SYSFS ZFS_AC_KERNEL_SET_SPECIAL_STATE ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_ADD_DISK ZFS_AC_KERNEL_KTHREAD ZFS_AC_KERNEL_ZERO_PAGE ZFS_AC_KERNEL___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_USER_NS_COMMON_INUM ZFS_AC_KERNEL_IDMAP_MNT_API ZFS_AC_KERNEL_IATTR_VFSID ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_COPY_SPLICE_READ + ZFS_AC_KERNEL_SYNC_BDEV case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE ;; esac ]) dnl # dnl # Detect name used for Module.symvers file in kernel dnl # AC_DEFUN([ZFS_AC_MODULE_SYMVERS], [ modpost=$LINUX/scripts/Makefile.modpost AC_MSG_CHECKING([kernel file name for module symbols]) AS_IF([test "x$enable_linux_builtin" != xyes -a -f "$modpost"], [ AS_IF([grep -q Modules.symvers $modpost], [ LINUX_SYMBOLS=Modules.symvers ], [ LINUX_SYMBOLS=Module.symvers ]) AS_IF([test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed. If you are building with a custom kernel, make sure *** the kernel is configured, built, and the '--with-linux=PATH' *** configure option refers to the location of the kernel source. ]) ]) ], [ LINUX_SYMBOLS=NONE ]) AC_MSG_RESULT($LINUX_SYMBOLS) AC_SUBST(LINUX_SYMBOLS) ]) dnl # dnl # Detect the kernel to be built against dnl # dnl # Most modern Linux distributions have separate locations for bare dnl # source (source) and prebuilt (build) files. Additionally, there are dnl # `source` and `build` symlinks in `/lib/modules/$(KERNEL_VERSION)` dnl # pointing to them. The directory search order is now: dnl # dnl # - `configure` command line values if both `--with-linux` and dnl # `--with-linux-obj` were defined dnl # dnl # - If only `--with-linux` was defined, `--with-linux-obj` is assumed dnl # to have the same value as `--with-linux` dnl # dnl # - If neither `--with-linux` nor `--with-linux-obj` were defined dnl # autodetection is used: dnl # dnl # - `/lib/modules/$(uname -r)/{source,build}` respectively, if exist. dnl # dnl # - If only `/lib/modules/$(uname -r)/build` exists, it is assumed dnl # to be both source and build directory. dnl # dnl # - The first directory in `/lib/modules` with the highest version dnl # number according to `sort -V` which contains both `source` and dnl # `build` symlinks/directories. If module directory contains only dnl # `build` component, it is assumed to be both source and build dnl # directory. dnl # dnl # - Last resort: the first directory matching `/usr/src/kernels/*` dnl # and `/usr/src/linux-*` with the highest version number according dnl # to `sort -V` is assumed to be both source and build directory. dnl # AC_DEFUN([ZFS_AC_KERNEL], [ AC_ARG_WITH([linux], AS_HELP_STRING([--with-linux=PATH], [Path to kernel source]), [kernelsrc="$withval"]) AC_ARG_WITH(linux-obj, AS_HELP_STRING([--with-linux-obj=PATH], [Path to kernel build objects]), [kernelbuild="$withval"]) AC_MSG_CHECKING([kernel source and build directories]) AS_IF([test -n "$kernelsrc" && test -z "$kernelbuild"], [ kernelbuild="$kernelsrc" ], [test -z "$kernelsrc"], [ AS_IF([test -e "/lib/modules/$(uname -r)/source" && \ test -e "/lib/modules/$(uname -r)/build"], [ src="/lib/modules/$(uname -r)/source" build="/lib/modules/$(uname -r)/build" ], [test -e "/lib/modules/$(uname -r)/build"], [ build="/lib/modules/$(uname -r)/build" src="$build" ], [ src= for d in $(ls -1d /lib/modules/* 2>/dev/null | sort -Vr); do if test -e "$d/source" && test -e "$d/build"; then src="$d/source" build="$d/build" break fi if test -e "$d/build"; then src="$d/build" build="$d/build" break fi done # the least reliable method if test -z "$src"; then src=$(ls -1d /usr/src/kernels/* /usr/src/linux-* \ 2>/dev/null | grep -v obj | sort -Vr | head -1) build="$src" fi ]) AS_IF([test -n "$src" && test -e "$src"], [ kernelsrc=$(readlink -e "$src") ], [ kernelsrc="[Not found]" ]) AS_IF([test -n "$build" && test -e "$build"], [ kernelbuild=$(readlink -e "$build") ], [ kernelbuild="[Not found]" ]) ], [ AS_IF([test "$kernelsrc" = "NONE"], [ kernsrcver=NONE ]) withlinux=yes ]) AC_MSG_RESULT([done]) AC_MSG_CHECKING([kernel source directory]) AC_MSG_RESULT([$kernelsrc]) AC_MSG_CHECKING([kernel build directory]) AC_MSG_RESULT([$kernelbuild]) AS_IF([test ! -d "$kernelsrc" || test ! -d "$kernelbuild"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed and then try again. If that fails, you can specify the *** location of the kernel source and build with the '--with-linux=PATH' and *** '--with-linux-obj=PATH' options respectively.]) ]) AC_MSG_CHECKING([kernel source version]) utsrelease1=$kernelbuild/include/linux/version.h utsrelease2=$kernelbuild/include/linux/utsrelease.h utsrelease3=$kernelbuild/include/generated/utsrelease.h AS_IF([test -r $utsrelease1 && grep -qF UTS_RELEASE $utsrelease1], [ utsrelease=$utsrelease1 ], [test -r $utsrelease2 && grep -qF UTS_RELEASE $utsrelease2], [ utsrelease=$utsrelease2 ], [test -r $utsrelease3 && grep -qF UTS_RELEASE $utsrelease3], [ utsrelease=$utsrelease3 ]) AS_IF([test -n "$utsrelease"], [ kernsrcver=$($AWK '/UTS_RELEASE/ { gsub(/"/, "", $[3]); print $[3] }' $utsrelease) AS_IF([test -z "$kernsrcver"], [ AC_MSG_RESULT([Not found]) AC_MSG_ERROR([ *** Cannot determine kernel version. ]) ]) ], [ AC_MSG_RESULT([Not found]) if test "x$enable_linux_builtin" != xyes; then AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. ]) else AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. *** Please run 'make prepare' inside the kernel source tree.]) fi ]) AC_MSG_RESULT([$kernsrcver]) AS_VERSION_COMPARE([$kernsrcver], [$ZFS_META_KVER_MIN], [ AC_MSG_ERROR([ *** Cannot build against kernel version $kernsrcver. *** The minimum supported kernel version is $ZFS_META_KVER_MIN. ]) ]) LINUX=${kernelsrc} LINUX_OBJ=${kernelbuild} LINUX_VERSION=${kernsrcver} AC_SUBST(LINUX) AC_SUBST(LINUX_OBJ) AC_SUBST(LINUX_VERSION) ]) dnl # dnl # Detect the QAT module to be built against, QAT provides hardware dnl # acceleration for data compression: dnl # dnl # https://01.org/intel-quickassist-technology dnl # dnl # 1) Download and install QAT driver from the above link dnl # 2) Start QAT driver in your system: dnl # service qat_service start dnl # 3) Enable QAT in ZFS, e.g.: dnl # ./configure --with-qat=/QAT1.6 dnl # make dnl # 4) Set GZIP compression in ZFS dataset: dnl # zfs set compression = gzip dnl # dnl # Then the data written to this ZFS pool is compressed by QAT accelerator dnl # automatically, and de-compressed by QAT when read from the pool. dnl # dnl # 1) Get QAT hardware statistics with: dnl # cat /proc/icp_dh895xcc_dev/qat dnl # 2) To disable QAT: dnl # insmod zfs.ko zfs_qat_disable=1 dnl # AC_DEFUN([ZFS_AC_QAT], [ AC_ARG_WITH([qat], AS_HELP_STRING([--with-qat=PATH], [Path to qat source]), AS_IF([test "$withval" = "yes"], AC_MSG_ERROR([--with-qat=PATH requires a PATH]), [qatsrc="$withval"])) AC_ARG_WITH([qat-obj], AS_HELP_STRING([--with-qat-obj=PATH], [Path to qat build objects]), [qatbuild="$withval"]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat source directory]) AC_MSG_RESULT([$qatsrc]) QAT_SRC="${qatsrc}/quickassist" AS_IF([ test ! -e "$QAT_SRC/include/cpa.h"], [ AC_MSG_ERROR([ *** Please make sure the qat driver package is installed *** and specify the location of the qat source with the *** '--with-qat=PATH' option then try again. Failed to *** find cpa.h in: ${QAT_SRC}/include]) ]) ]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat build directory]) AS_IF([test -z "$qatbuild"], [ qatbuild="${qatsrc}/build" ]) AC_MSG_RESULT([$qatbuild]) QAT_OBJ=${qatbuild} AS_IF([ ! test -e "$QAT_OBJ/icp_qa_al.ko" && ! test -e "$QAT_OBJ/qat_api.ko"], [ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find icp_qa_al.ko or qat_api.ko in: $QAT_OBJ]) ]) AC_SUBST(QAT_SRC) AC_SUBST(QAT_OBJ) AC_DEFINE(HAVE_QAT, 1, [qat is enabled and existed]) ]) dnl # dnl # Detect the name used for the QAT Module.symvers file. dnl # AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat file for module symbols]) QAT_SYMBOLS=$QAT_SRC/lookaside/access_layer/src/Module.symvers AS_IF([test -r $QAT_SYMBOLS], [ AC_MSG_RESULT([$QAT_SYMBOLS]) AC_SUBST(QAT_SYMBOLS) ],[ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find Module.symvers in: $QAT_SYMBOLS ]) ]) ]) ]) dnl # dnl # ZFS_LINUX_CONFTEST_H dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_H], [ test -d build/$2 || mkdir -p build/$2 cat - <<_ACEOF >build/$2/$2.h $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_C dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_C], [ test -d build/$2 || mkdir -p build/$2 cat confdefs.h - <<_ACEOF >build/$2/$2.c $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_MAKEFILE dnl # dnl # $1 - test case name dnl # $2 - add to top-level Makefile dnl # $3 - additional build flags dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_MAKEFILE], [ test -d build || mkdir -p build test -d build/$1 || mkdir -p build/$1 file=build/$1/Makefile dnl # Example command line to manually build source. cat - <<_ACEOF >$file # Example command line to manually build source # make modules -C $LINUX_OBJ $ARCH_UM M=$PWD/build/$1 ccflags-y := -Werror $FRAME_LARGER_THAN _ACEOF dnl # Additional custom CFLAGS as requested. m4_ifval($3, [echo "ccflags-y += $3" >>$file], []) dnl # Test case source echo "obj-m := $1.o" >>$file AS_IF([test "x$2" = "xyes"], [echo "obj-m += $1/" >>build/Makefile], []) ]) dnl # dnl # ZFS_LINUX_TEST_PROGRAM(C)([PROLOGUE], [BODY]) dnl # m4_define([ZFS_LINUX_TEST_PROGRAM], [ #include $1 int main (void) { $2 ; return 0; } MODULE_DESCRIPTION("conftest"); MODULE_AUTHOR(ZFS_META_AUTHOR); MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); MODULE_LICENSE($3); ]) dnl # dnl # ZFS_LINUX_TEST_REMOVE dnl # dnl # Removes the specified test source and results. dnl # AC_DEFUN([ZFS_LINUX_TEST_REMOVE], [ test -d build/$1 && rm -Rf build/$1 test -f build/Makefile && sed '/$1/d' build/Makefile ]) dnl # dnl # ZFS_LINUX_COMPILE dnl # dnl # $1 - build dir dnl # $2 - test command dnl # $3 - pass command dnl # $4 - fail command dnl # $5 - set KBUILD_MODPOST_NOFINAL='yes' dnl # $6 - set KBUILD_MODPOST_WARN='yes' dnl # dnl # Used internally by ZFS_LINUX_TEST_{COMPILE,MODPOST} dnl # AC_DEFUN([ZFS_LINUX_COMPILE], [ AC_ARG_VAR([KERNEL_CC], [C compiler for building kernel modules]) AC_ARG_VAR([KERNEL_LD], [Linker for building kernel modules]) AC_ARG_VAR([KERNEL_LLVM], [Binary option to build kernel modules with LLVM/CLANG toolchain]) AC_TRY_COMMAND([ KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" make modules -k -j$TEST_JOBS ${KERNEL_CC:+CC=$KERNEL_CC} ${KERNEL_LD:+LD=$KERNEL_LD} ${KERNEL_LLVM:+LLVM=$KERNEL_LLVM} CONFIG_MODULES=y CFLAGS_MODULE=-DCONFIG_MODULES -C $LINUX_OBJ $ARCH_UM M=$PWD/$1 >$1/build.log 2>&1]) AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_TEST_COMPILE dnl # dnl # Perform a full compile excluding the final modpost phase. dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.compile.$1 mv $2/build.log $2/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to compile test source to determine kernel interfaces.]) ], [yes], []) ]) dnl # dnl # ZFS_LINUX_TEST_MODPOST dnl # dnl # Perform a full compile including the modpost phase. This may dnl # be an incremental build if the objects have already been built. dnl # AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.modpost.$1 cat $2/build.log >>build/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to modpost test source to determine kernel interfaces.]) ], [], [yes]) ]) dnl # dnl # Perform the compilation of the test cases in two phases. dnl # dnl # Phase 1) attempt to build the object files for all of the tests dnl # defined by the ZFS_LINUX_TEST_SRC macro. But do not dnl # perform the final modpost stage. dnl # dnl # Phase 2) disable all tests which failed the initial compilation, dnl # then invoke the final modpost step for the remaining tests. dnl # dnl # This allows us efficiently build the test cases in parallel while dnl # remaining resilient to build failures which are expected when dnl # detecting the available kernel interfaces. dnl # dnl # The maximum allowed parallelism can be controlled by setting the dnl # TEST_JOBS environment variable. Otherwise, it default to $(nproc). dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [ dnl # Phase 1 - Compilation only, final linking is skipped. ZFS_LINUX_TEST_COMPILE([$1], [build]) dnl # dnl # Phase 2 - When building external modules disable test cases dnl # which failed to compile and invoke modpost to verify the dnl # final linking. dnl # dnl # Test names suffixed with '_license' call modpost independently dnl # to ensure that a single incompatibility does not result in the dnl # modpost phase exiting early. This check is not performed on dnl # every symbol since the majority are compatible and doing so dnl # would significantly slow down this phase. dnl # dnl # When configuring for builtin (--enable-linux-builtin) dnl # fake the linking step artificially create the expected .ko dnl # files for tests which did compile. This is required for dnl # kernels which do not have loadable module support or have dnl # not yet been built. dnl # AS_IF([test "x$enable_linux_builtin" = "xno"], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ AS_IF([test "${name##*_}" = "license"], [ ZFS_LINUX_TEST_MODPOST([$1], [build/$name]) echo "obj-n += $dir" >>build/Makefile ], [ echo "obj-m += $dir" >>build/Makefile ]) ], [ echo "obj-n += $dir" >>build/Makefile ]) done ZFS_LINUX_TEST_MODPOST([$1], [build]) ], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ touch build/$name/$name.ko ]) done ]) ]) dnl # dnl # ZFS_LINUX_TEST_SRC dnl # dnl # $1 - name dnl # $2 - global dnl # $3 - source dnl # $4 - extra cflags dnl # $5 - check license-compatibility dnl # dnl # Check if the test source is buildable at all and then if it is dnl # license compatible. dnl # dnl # N.B because all of the test cases are compiled in parallel they dnl # must never depend on the results of previous tests. Each test dnl # needs to be entirely independent. dnl # AC_DEFUN([ZFS_LINUX_TEST_SRC], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]], [["Dual BSD/GPL"]])], [$1]) ZFS_LINUX_CONFTEST_MAKEFILE([$1], [yes], [$4]) AS_IF([ test -n "$5" ], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM( [[$2]], [[$3]], [[$5]])], [$1_license]) ZFS_LINUX_CONFTEST_MAKEFILE([$1_license], [yes], [$4]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT dnl # dnl # $1 - name of a test source (ZFS_LINUX_TEST_SRC) dnl # $2 - run on success (valid .ko generated) dnl # $3 - run on failure (unable to compile) dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT], [ AS_IF([test -d build/$1], [ AS_IF([test -f build/$1/$1.ko], [$2], [$3]) ], [ AC_MSG_ERROR([ *** No matching source for the "$1" test, check that *** both the test source and result macros refer to the same name. ]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_ERROR dnl # dnl # Generic error message which can be used when none of the expected dnl # kernel interfaces were detected. dnl # AC_DEFUN([ZFS_LINUX_TEST_ERROR], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. *** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT_SYMBOL dnl # dnl # Like ZFS_LINUX_TEST_RESULT except ZFS_CHECK_SYMBOL_EXPORT is called to dnl # verify symbol exports, unless --enable-linux-builtin was provided to dnl # configure. dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT_SYMBOL], [ AS_IF([ ! test -f build/$1/$1.ko], [ $5 ], [ AS_IF([test "x$enable_linux_builtin" != "xyes"], [ ZFS_CHECK_SYMBOL_EXPORT([$2], [$3], [$4], [$5]) ], [ $4 ]) ]) ]) dnl # dnl # ZFS_LINUX_COMPILE_IFELSE dnl # AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [ ZFS_LINUX_TEST_REMOVE([conftest]) m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1], [conftest])]) m4_ifvaln([$5], [ZFS_LINUX_CONFTEST_H([$5], [conftest])], [ZFS_LINUX_CONFTEST_H([], [conftest])]) ZFS_LINUX_CONFTEST_MAKEFILE([conftest], [no], [m4_ifvaln([$5], [-I$PWD/build/conftest], [])]) ZFS_LINUX_COMPILE([build/conftest], [$2], [$3], [$4], [], []) ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE dnl # dnl # $1 - global dnl # $2 - source dnl # $3 - run on success (valid .ko generated) dnl # $4 - run on failure (unable to compile) dnl # dnl # When configuring as builtin (--enable-linux-builtin) for kernels dnl # without loadable module support (CONFIG_MODULES=n) only the object dnl # file is created. See ZFS_LINUX_TEST_COMPILE_ALL for details. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4]) ]) ]) dnl # dnl # ZFS_CHECK_SYMBOL_EXPORT dnl # dnl # Check if a symbol is exported on not by consulting the symbols dnl # file, or optionally the source code. dnl # AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT], [ grep -q -E '[[[:space:]]]$1[[[:space:]]]' \ $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null rc=$? if test $rc -ne 0; then export=0 for file in $2; do grep -q -E "EXPORT_SYMBOL.*($1)" \ "$LINUX/$file" 2>/dev/null rc=$? if test $rc -eq 0; then export=1 break; fi done if test $export -eq 0; then : $4 else : $3 fi else : $3 fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_SYMBOL dnl # dnl # Like ZFS_LINUX_TRY_COMPILER except ZFS_CHECK_SYMBOL_EXPORT is called dnl # to verify symbol exports, unless --enable-linux-builtin was provided dnl # to configure. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_SYMBOL], [ ZFS_LINUX_TRY_COMPILE([$1], [$2], [rc=0], [rc=1]) if test $rc -ne 0; then : $6 else if test "x$enable_linux_builtin" != xyes; then ZFS_CHECK_SYMBOL_EXPORT([$3], [$4], [rc=0], [rc=1]) fi if test $rc -ne 0; then : $6 else : $5 fi fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_HEADER dnl # like ZFS_LINUX_TRY_COMPILE, except the contents conftest.h are dnl # provided via the fifth parameter dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4], [$5]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4], [$5]) ]) ]) dnl # dnl # AS_VERSION_COMPARE_LE dnl # like AS_VERSION_COMPARE_LE, but runs $3 if (and only if) $1 <= $2 dnl # AS_VERSION_COMPARE_LE (version-1, version-2, [action-if-less-or-equal], [action-if-greater]) dnl # AC_DEFUN([AS_VERSION_COMPARE_LE], [ AS_VERSION_COMPARE([$1], [$2], [$3], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_REQUIRE_API dnl # like ZFS_LINUX_TEST_ERROR, except only fails if the kernel is dnl # at least some specified version. dnl # AC_DEFUN([ZFS_LINUX_REQUIRE_API], [ AS_VERSION_COMPARE_LE([$2], [$kernsrcver], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. This *** interface is expected for kernels version "$2" and above. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. Newer kernels may have incompatible *** APIs. *** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ], [ AC_MSG_RESULT(no) ]) ]) diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 7a95b54bdf0d..f94ce69fb9e2 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1,1640 +1,1646 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2012, 2020 by Delphix. All rights reserved. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef HAVE_BLK_MQ #include #endif static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync); static unsigned int zvol_major = ZVOL_MAJOR; static unsigned int zvol_request_sync = 0; static unsigned int zvol_prefetch_bytes = (128 * 1024); static unsigned long zvol_max_discard_blocks = 16384; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS static unsigned int zvol_open_timeout_ms = 1000; #endif static unsigned int zvol_threads = 0; #ifdef HAVE_BLK_MQ static unsigned int zvol_blk_mq_threads = 0; static unsigned int zvol_blk_mq_actual_threads; static boolean_t zvol_use_blk_mq = B_FALSE; /* * The maximum number of volblocksize blocks to process per thread. Typically, * write heavy workloads preform better with higher values here, and read * heavy workloads preform better with lower values, but that's not a hard * and fast rule. It's basically a knob to tune between "less overhead with * less parallelism" and "more overhead, but more parallelism". * * '8' was chosen as a reasonable, balanced, default based off of sequential * read and write tests to a zvol in an NVMe pool (with 16 CPUs). */ static unsigned int zvol_blk_mq_blocks_per_thread = 8; #endif #ifndef BLKDEV_DEFAULT_RQ /* BLKDEV_MAX_RQ was renamed to BLKDEV_DEFAULT_RQ in the 5.16 kernel */ #define BLKDEV_DEFAULT_RQ BLKDEV_MAX_RQ #endif /* * Finalize our BIO or request. */ #ifdef HAVE_BLK_MQ #define END_IO(zv, bio, rq, error) do { \ if (bio) { \ BIO_END_IO(bio, error); \ } else { \ blk_mq_end_request(rq, errno_to_bi_status(error)); \ } \ } while (0) #else #define END_IO(zv, bio, rq, error) BIO_END_IO(bio, error) #endif #ifdef HAVE_BLK_MQ static unsigned int zvol_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; static unsigned int zvol_actual_blk_mq_queue_depth; #endif struct zvol_state_os { struct gendisk *zvo_disk; /* generic disk */ struct request_queue *zvo_queue; /* request queue */ dev_t zvo_dev; /* device id */ #ifdef HAVE_BLK_MQ struct blk_mq_tag_set tag_set; #endif /* Set from the global 'zvol_use_blk_mq' at zvol load */ boolean_t use_blk_mq; }; static taskq_t *zvol_taskq; static struct ida zvol_ida; typedef struct zv_request_stack { zvol_state_t *zv; struct bio *bio; struct request *rq; } zv_request_t; typedef struct zv_work { struct request *rq; struct work_struct work; } zv_work_t; typedef struct zv_request_task { zv_request_t zvr; taskq_ent_t ent; } zv_request_task_t; static zv_request_task_t * zv_request_task_create(zv_request_t zvr) { zv_request_task_t *task; task = kmem_alloc(sizeof (zv_request_task_t), KM_SLEEP); taskq_init_ent(&task->ent); task->zvr = zvr; return (task); } static void zv_request_task_free(zv_request_task_t *task) { kmem_free(task, sizeof (*task)); } #ifdef HAVE_BLK_MQ /* * This is called when a new block multiqueue request comes in. A request * contains one or more BIOs. */ static blk_status_t zvol_mq_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct request *rq = bd->rq; zvol_state_t *zv = rq->q->queuedata; /* Tell the kernel that we are starting to process this request */ blk_mq_start_request(rq); if (blk_rq_is_passthrough(rq)) { /* Skip non filesystem request */ blk_mq_end_request(rq, BLK_STS_IOERR); return (BLK_STS_IOERR); } zvol_request_impl(zv, NULL, rq, 0); /* Acknowledge to the kernel that we got this request */ return (BLK_STS_OK); } static struct blk_mq_ops zvol_blk_mq_queue_ops = { .queue_rq = zvol_mq_queue_rq, }; /* Initialize our blk-mq struct */ static int zvol_blk_mq_alloc_tag_set(zvol_state_t *zv) { struct zvol_state_os *zso = zv->zv_zso; memset(&zso->tag_set, 0, sizeof (zso->tag_set)); /* Initialize tag set. */ zso->tag_set.ops = &zvol_blk_mq_queue_ops; zso->tag_set.nr_hw_queues = zvol_blk_mq_actual_threads; zso->tag_set.queue_depth = zvol_actual_blk_mq_queue_depth; zso->tag_set.numa_node = NUMA_NO_NODE; zso->tag_set.cmd_size = 0; /* * We need BLK_MQ_F_BLOCKING here since we do blocking calls in * zvol_request_impl() */ zso->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_BLOCKING; zso->tag_set.driver_data = zv; return (blk_mq_alloc_tag_set(&zso->tag_set)); } #endif /* HAVE_BLK_MQ */ /* * Given a path, return TRUE if path is a ZVOL. */ boolean_t zvol_os_is_zvol(const char *path) { dev_t dev = 0; if (vdev_lookup_bdev(path, &dev) != 0) return (B_FALSE); if (MAJOR(dev) == zvol_major) return (B_TRUE); return (B_FALSE); } static void zvol_write(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; zvol_state_t *zv = zvr->zv; struct request_queue *q; struct gendisk *disk; unsigned long start_time = 0; boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); q = zv->zv_zso->zvo_queue; disk = zv->zv_zso->zvo_disk; /* bio marked as FLUSH need to flush before write */ if (io_is_flush(bio, rq)) zil_commit(zv->zv_zilog, ZVOL_OBJ); /* Some requests are just for flush and nothing else. */ if (io_size(bio, rq) == 0) { rw_exit(&zv->zv_suspend_lock); END_IO(zv, bio, rq, 0); return; } zfs_uio_bvec_init(&uio, bio, rq); ssize_t start_resid = uio.uio_resid; /* * With use_blk_mq, accounting is done by blk_mq_start_request() * and blk_mq_end_request(), so we can skip it here. */ if (bio) { acct = blk_queue_io_stat(q); if (acct) { start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); } } boolean_t sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_WRITER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); uint64_t off = uio.uio_loffset; dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); if (bytes > volsize - off) /* don't write past the end */ bytes = volsize - off; dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); /* This will only fail for ENOSPC */ error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); break; } error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); if (error == 0) { zvol_log_write(zv, tx, off, bytes, sync); } dmu_tx_commit(tx); if (error) break; } zfs_rangelock_exit(lr); int64_t nwritten = start_resid - uio.uio_resid; dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); task_io_account_write(nwritten); if (sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } END_IO(zv, bio, rq, -error); } static void zvol_write_task(void *arg) { zv_request_task_t *task = arg; zvol_write(&task->zvr); zv_request_task_free(task); } static void zvol_discard(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; zvol_state_t *zv = zvr->zv; uint64_t start = io_offset(bio, rq); uint64_t size = io_size(bio, rq); uint64_t end = start + size; boolean_t sync; int error = 0; dmu_tx_t *tx; struct request_queue *q = zv->zv_zso->zvo_queue; struct gendisk *disk = zv->zv_zso->zvo_disk; unsigned long start_time = 0; boolean_t acct = B_FALSE; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); ASSERT3P(zv->zv_zilog, !=, NULL); if (bio) { acct = blk_queue_io_stat(q); if (acct) { start_time = blk_generic_start_io_acct(q, disk, WRITE, bio); } } sync = io_is_fua(bio, rq) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; if (end > zv->zv_volsize) { error = SET_ERROR(EIO); goto unlock; } /* * Align the request to volume block boundaries when a secure erase is * not required. This will prevent dnode_free_range() from zeroing out * the unaligned parts which is slow (read-modify-write) and useless * since we are not freeing any space by doing so. */ if (!io_is_secure_erase(bio, rq)) { start = P2ROUNDUP(start, zv->zv_volblocksize); end = P2ALIGN(end, zv->zv_volblocksize); size = end - start; } if (start >= end) goto unlock; zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, start, size, RL_WRITER); tx = dmu_tx_create(zv->zv_objset); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); } else { zvol_log_truncate(zv, tx, start, size, B_TRUE); dmu_tx_commit(tx); error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, start, size); } zfs_rangelock_exit(lr); if (error == 0 && sync) zil_commit(zv->zv_zilog, ZVOL_OBJ); unlock: rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, WRITE, bio, start_time); } END_IO(zv, bio, rq, -error); } static void zvol_discard_task(void *arg) { zv_request_task_t *task = arg; zvol_discard(&task->zvr); zv_request_task_free(task); } static void zvol_read(zv_request_t *zvr) { struct bio *bio = zvr->bio; struct request *rq = zvr->rq; int error = 0; zfs_uio_t uio; boolean_t acct = B_FALSE; zvol_state_t *zv = zvr->zv; struct request_queue *q; struct gendisk *disk; unsigned long start_time = 0; ASSERT3P(zv, !=, NULL); ASSERT3U(zv->zv_open_count, >, 0); zfs_uio_bvec_init(&uio, bio, rq); q = zv->zv_zso->zvo_queue; disk = zv->zv_zso->zvo_disk; ssize_t start_resid = uio.uio_resid; /* * When blk-mq is being used, accounting is done by * blk_mq_start_request() and blk_mq_end_request(). */ if (bio) { acct = blk_queue_io_stat(q); if (acct) start_time = blk_generic_start_io_acct(q, disk, READ, bio); } zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, uio.uio_loffset, uio.uio_resid, RL_READER); uint64_t volsize = zv->zv_volsize; while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); /* don't read past the end */ if (bytes > volsize - uio.uio_loffset) bytes = volsize - uio.uio_loffset; error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } } zfs_rangelock_exit(lr); int64_t nread = start_resid - uio.uio_resid; dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); task_io_account_read(nread); rw_exit(&zv->zv_suspend_lock); if (bio && acct) { blk_generic_end_io_acct(q, disk, READ, bio, start_time); } END_IO(zv, bio, rq, -error); } static void zvol_read_task(void *arg) { zv_request_task_t *task = arg; zvol_read(&task->zvr); zv_request_task_free(task); } /* * Process a BIO or request * * Either 'bio' or 'rq' should be set depending on if we are processing a * bio or a request (both should not be set). * * force_sync: Set to 0 to defer processing to a background taskq * Set to 1 to process data synchronously */ static void zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, boolean_t force_sync) { fstrans_cookie_t cookie = spl_fstrans_mark(); uint64_t offset = io_offset(bio, rq); uint64_t size = io_size(bio, rq); int rw = io_data_dir(bio, rq); if (zvol_request_sync) force_sync = 1; zv_request_t zvr = { .zv = zv, .bio = bio, .rq = rq, }; if (io_has_data(bio, rq) && offset + size > zv->zv_volsize) { printk(KERN_INFO "%s: bad access: offset=%llu, size=%lu\n", zv->zv_zso->zvo_disk->disk_name, (long long unsigned)offset, (long unsigned)size); END_IO(zv, bio, rq, -SET_ERROR(EIO)); goto out; } zv_request_task_t *task; if (rw == WRITE) { if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { END_IO(zv, bio, rq, -SET_ERROR(EROFS)); goto out; } /* * Prevents the zvol from being suspended, or the ZIL being * concurrently opened. Will be released after the i/o * completes. */ rw_enter(&zv->zv_suspend_lock, RW_READER); /* * Open a ZIL if this is the first time we have written to this * zvol. We protect zv->zv_zilog with zv_suspend_lock rather * than zv_state_lock so that we don't need to acquire an * additional lock in this path. */ if (zv->zv_zilog == NULL) { rw_exit(&zv->zv_suspend_lock); rw_enter(&zv->zv_suspend_lock, RW_WRITER); if (zv->zv_zilog == NULL) { zv->zv_zilog = zil_open(zv->zv_objset, zvol_get_data, &zv->zv_kstat.dk_zil_sums); zv->zv_flags |= ZVOL_WRITTEN_TO; /* replay / destroy done in zvol_create_minor */ VERIFY0((zv->zv_zilog->zl_header->zh_flags & ZIL_REPLAY_NEEDED)); } rw_downgrade(&zv->zv_suspend_lock); } /* * We don't want this thread to be blocked waiting for i/o to * complete, so we instead wait from a taskq callback. The * i/o may be a ZIL write (via zil_commit()), or a read of an * indirect block, or a read of a data block (if this is a * partial-block write). We will indicate that the i/o is * complete by calling END_IO() from the taskq callback. * * This design allows the calling thread to continue and * initiate more concurrent operations by calling * zvol_request() again. There are typically only a small * number of threads available to call zvol_request() (e.g. * one per iSCSI target), so keeping the latency of * zvol_request() low is important for performance. * * The zvol_request_sync module parameter allows this * behavior to be altered, for performance evaluation * purposes. If the callback blocks, setting * zvol_request_sync=1 will result in much worse performance. * * We can have up to zvol_threads concurrent i/o's being * processed for all zvols on the system. This is typically * a vast improvement over the zvol_request_sync=1 behavior * of one i/o at a time per zvol. However, an even better * design would be for zvol_request() to initiate the zio * directly, and then be notified by the zio_done callback, * which would call END_IO(). Unfortunately, the DMU/ZIL * interfaces lack this functionality (they block waiting for * the i/o to complete). */ if (io_is_discard(bio, rq) || io_is_secure_erase(bio, rq)) { if (force_sync) { zvol_discard(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_discard_task, task, 0, &task->ent); } } else { if (force_sync) { zvol_write(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_write_task, task, 0, &task->ent); } } } else { /* * The SCST driver, and possibly others, may issue READ I/Os * with a length of zero bytes. These empty I/Os contain no * data and require no additional handling. */ if (size == 0) { END_IO(zv, bio, rq, 0); goto out; } rw_enter(&zv->zv_suspend_lock, RW_READER); /* See comment in WRITE case above. */ if (force_sync) { zvol_read(&zvr); } else { task = zv_request_task_create(zvr); taskq_dispatch_ent(zvol_taskq, zvol_read_task, task, 0, &task->ent); } } out: spl_fstrans_unmark(cookie); } #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #ifdef HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID static void zvol_submit_bio(struct bio *bio) #else static blk_qc_t zvol_submit_bio(struct bio *bio) #endif #else static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q, struct bio *bio) #endif { #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS #if defined(HAVE_BIO_BDEV_DISK) struct request_queue *q = bio->bi_bdev->bd_disk->queue; #else struct request_queue *q = bio->bi_disk->queue; #endif #endif zvol_state_t *zv = q->queuedata; zvol_request_impl(zv, bio, NULL, 0); #if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ !defined(HAVE_BDEV_SUBMIT_BIO_RETURNS_VOID) return (BLK_QC_T_NONE); #endif } static int #ifdef HAVE_BLK_MODE_T zvol_open(struct gendisk *disk, blk_mode_t flag) #else zvol_open(struct block_device *bdev, fmode_t flag) #endif { zvol_state_t *zv; int error = 0; boolean_t drop_suspend = B_FALSE; #ifndef HAVE_BLKDEV_GET_ERESTARTSYS hrtime_t timeout = MSEC2NSEC(zvol_open_timeout_ms); hrtime_t start = gethrtime(); retry: #endif rw_enter(&zvol_state_lock, RW_READER); /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting * disk->private_data to NULL is observed, or zvol_os_free() * is not called on this zv because of the positive zv_open_count. */ #ifdef HAVE_BLK_MODE_T zv = disk->private_data; #else zv = bdev->bd_disk->private_data; #endif if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); } mutex_enter(&zv->zv_state_lock); /* * Make sure zvol is not suspended during first open * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 0) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 0) { rw_exit(&zv->zv_suspend_lock); } else { drop_suspend = B_TRUE; } } else { drop_suspend = B_TRUE; } } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); if (zv->zv_open_count == 0) { boolean_t drop_namespace = B_FALSE; ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); /* * In all other call paths the spa_namespace_lock is taken * before the bdev->bd_mutex lock. However, on open(2) * the __blkdev_get() function calls fops->open() with the * bdev->bd_mutex lock held. This can result in a deadlock * when zvols from one pool are used as vdevs in another. * * To prevent a lock inversion deadlock we preemptively * take the spa_namespace_lock. Normally the lock will not * be contended and this is safe because spa_open_common() * handles the case where the caller already holds the * spa_namespace_lock. * * When the lock cannot be aquired after multiple retries * this must be the vdev on zvol deadlock case and we have * no choice but to return an error. For 5.12 and older * kernels returning -ERESTARTSYS will result in the * bdev->bd_mutex being dropped, then reacquired, and * fops->open() being called again. This process can be * repeated safely until both locks are acquired. For 5.13 * and newer the -ERESTARTSYS retry logic was removed from * the kernel so the only option is to return the error for * the caller to handle it. */ if (!mutex_owned(&spa_namespace_lock)) { if (!mutex_tryenter(&spa_namespace_lock)) { mutex_exit(&zv->zv_state_lock); rw_exit(&zv->zv_suspend_lock); #ifdef HAVE_BLKDEV_GET_ERESTARTSYS schedule(); return (SET_ERROR(-ERESTARTSYS)); #else if ((gethrtime() - start) > timeout) return (SET_ERROR(-ERESTARTSYS)); schedule_timeout(MSEC_TO_TICK(10)); goto retry; #endif } else { drop_namespace = B_TRUE; } } error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) mutex_exit(&spa_namespace_lock); } if (error == 0) { if ((blk_mode_is_open_write(flag)) && (zv->zv_flags & ZVOL_RDONLY)) { if (zv->zv_open_count == 0) zvol_last_close(zv); error = SET_ERROR(-EROFS); } else { zv->zv_open_count++; } } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); if (error == 0) #ifdef HAVE_BLK_MODE_T disk_check_media_change(disk); #else zfs_check_media_change(bdev); #endif return (error); } static void #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG zvol_release(struct gendisk *disk) #else zvol_release(struct gendisk *disk, fmode_t unused) #endif { #if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) (void) unused; #endif zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; rw_enter(&zvol_state_lock, RW_READER); zv = disk->private_data; mutex_enter(&zv->zv_state_lock); ASSERT3U(zv->zv_open_count, >, 0); /* * make sure zvol is not suspended during last close * (hold zv_suspend_lock) and respect proper lock acquisition * ordering - zv_suspend_lock before zv_state_lock */ if (zv->zv_open_count == 1) { if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { mutex_exit(&zv->zv_state_lock); rw_enter(&zv->zv_suspend_lock, RW_READER); mutex_enter(&zv->zv_state_lock); /* check to see if zv_suspend_lock is needed */ if (zv->zv_open_count != 1) { rw_exit(&zv->zv_suspend_lock); drop_suspend = B_FALSE; } } } else { drop_suspend = B_FALSE; } rw_exit(&zvol_state_lock); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); zv->zv_open_count--; if (zv->zv_open_count == 0) { ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); zvol_last_close(zv); } mutex_exit(&zv->zv_state_lock); if (drop_suspend) rw_exit(&zv->zv_suspend_lock); } static int zvol_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { zvol_state_t *zv = bdev->bd_disk->private_data; int error = 0; ASSERT3U(zv->zv_open_count, >, 0); switch (cmd) { case BLKFLSBUF: +#ifdef HAVE_FSYNC_BDEV fsync_bdev(bdev); +#elif defined(HAVE_SYNC_BLOCKDEV) + sync_blockdev(bdev); +#else +#error "Neither fsync_bdev() nor sync_blockdev() found" +#endif invalidate_bdev(bdev); rw_enter(&zv->zv_suspend_lock, RW_READER); if (!(zv->zv_flags & ZVOL_RDONLY)) txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); rw_exit(&zv->zv_suspend_lock); break; case BLKZNAME: mutex_enter(&zv->zv_state_lock); error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); mutex_exit(&zv->zv_state_lock); break; default: error = -ENOTTY; break; } return (SET_ERROR(error)); } #ifdef CONFIG_COMPAT static int zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { return (zvol_ioctl(bdev, mode, cmd, arg)); } #else #define zvol_compat_ioctl NULL #endif static unsigned int zvol_check_events(struct gendisk *disk, unsigned int clearing) { unsigned int mask = 0; rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; zv->zv_changed = 0; mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (mask); } static int zvol_revalidate_disk(struct gendisk *disk) { rw_enter(&zvol_state_lock, RW_READER); zvol_state_t *zv = disk->private_data; if (zv != NULL) { mutex_enter(&zv->zv_state_lock); set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> SECTOR_BITS); mutex_exit(&zv->zv_state_lock); } rw_exit(&zvol_state_lock); return (0); } int zvol_os_update_volsize(zvol_state_t *zv, uint64_t volsize) { struct gendisk *disk = zv->zv_zso->zvo_disk; #if defined(HAVE_REVALIDATE_DISK_SIZE) revalidate_disk_size(disk, zvol_revalidate_disk(disk) == 0); #elif defined(HAVE_REVALIDATE_DISK) revalidate_disk(disk); #else zvol_revalidate_disk(disk); #endif return (0); } void zvol_os_clear_private(zvol_state_t *zv) { /* * Cleared while holding zvol_state_lock as a writer * which will prevent zvol_open() from opening it. */ zv->zv_zso->zvo_disk->private_data = NULL; } /* * Provide a simple virtual geometry for legacy compatibility. For devices * smaller than 1 MiB a small head and sector count is used to allow very * tiny devices. For devices over 1 Mib a standard head and sector count * is used to keep the cylinders count reasonable. */ static int zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) { zvol_state_t *zv = bdev->bd_disk->private_data; sector_t sectors; ASSERT3U(zv->zv_open_count, >, 0); sectors = get_capacity(zv->zv_zso->zvo_disk); if (sectors > 2048) { geo->heads = 16; geo->sectors = 63; } else { geo->heads = 2; geo->sectors = 4; } geo->start = 0; geo->cylinders = sectors / (geo->heads * geo->sectors); return (0); } /* * Why have two separate block_device_operations structs? * * Normally we'd just have one, and assign 'submit_bio' as needed. However, * it's possible the user's kernel is built with CONSTIFY_PLUGIN, meaning we * can't just change submit_bio dynamically at runtime. So just create two * separate structs to get around this. */ static const struct block_device_operations zvol_ops_blk_mq = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, #endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, }; static const struct block_device_operations zvol_ops = { .open = zvol_open, .release = zvol_release, .ioctl = zvol_ioctl, .compat_ioctl = zvol_compat_ioctl, .check_events = zvol_check_events, #ifdef HAVE_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK .revalidate_disk = zvol_revalidate_disk, #endif .getgeo = zvol_getgeo, .owner = THIS_MODULE, #ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS .submit_bio = zvol_submit_bio, #endif }; static int zvol_alloc_non_blk_mq(struct zvol_state_os *zso) { #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) #if defined(HAVE_BLK_ALLOC_DISK) zso->zvo_disk = blk_alloc_disk(NUMA_NO_NODE); if (zso->zvo_disk == NULL) return (1); zso->zvo_disk->minors = ZVOL_MINORS; zso->zvo_queue = zso->zvo_disk->queue; #else zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); if (zso->zvo_queue == NULL) return (1); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); return (1); } zso->zvo_disk->queue = zso->zvo_queue; #endif /* HAVE_BLK_ALLOC_DISK */ #else zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); if (zso->zvo_queue == NULL) return (1); zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); return (1); } zso->zvo_disk->queue = zso->zvo_queue; #endif /* HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ return (0); } static int zvol_alloc_blk_mq(zvol_state_t *zv) { #ifdef HAVE_BLK_MQ struct zvol_state_os *zso = zv->zv_zso; /* Allocate our blk-mq tag_set */ if (zvol_blk_mq_alloc_tag_set(zv) != 0) return (1); #if defined(HAVE_BLK_ALLOC_DISK) zso->zvo_disk = blk_mq_alloc_disk(&zso->tag_set, zv); if (zso->zvo_disk == NULL) { blk_mq_free_tag_set(&zso->tag_set); return (1); } zso->zvo_queue = zso->zvo_disk->queue; zso->zvo_disk->minors = ZVOL_MINORS; #else zso->zvo_disk = alloc_disk(ZVOL_MINORS); if (zso->zvo_disk == NULL) { blk_cleanup_queue(zso->zvo_queue); blk_mq_free_tag_set(&zso->tag_set); return (1); } /* Allocate queue */ zso->zvo_queue = blk_mq_init_queue(&zso->tag_set); if (IS_ERR(zso->zvo_queue)) { blk_mq_free_tag_set(&zso->tag_set); return (1); } /* Our queue is now created, assign it to our disk */ zso->zvo_disk->queue = zso->zvo_queue; #endif #endif return (0); } /* * Allocate memory for a new zvol_state_t and setup the required * request queue and generic disk structures for the block device. */ static zvol_state_t * zvol_alloc(dev_t dev, const char *name) { zvol_state_t *zv; struct zvol_state_os *zso; uint64_t volmode; int ret; if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) return (NULL); if (volmode == ZFS_VOLMODE_DEFAULT) volmode = zvol_volmode; if (volmode == ZFS_VOLMODE_NONE) return (NULL); zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); zv->zv_zso = zso; zv->zv_volmode = volmode; list_link_init(&zv->zv_next); mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); #ifdef HAVE_BLK_MQ zv->zv_zso->use_blk_mq = zvol_use_blk_mq; #endif /* * The block layer has 3 interfaces for getting BIOs: * * 1. blk-mq request queues (new) * 2. submit_bio() (oldest) * 3. regular request queues (old). * * Each of those interfaces has two permutations: * * a) We have blk_alloc_disk()/blk_mq_alloc_disk(), which allocates * both the disk and its queue (5.14 kernel or newer) * * b) We don't have blk_*alloc_disk(), and have to allocate the * disk and the queue separately. (5.13 kernel or older) */ if (zv->zv_zso->use_blk_mq) { ret = zvol_alloc_blk_mq(zv); zso->zvo_disk->fops = &zvol_ops_blk_mq; } else { ret = zvol_alloc_non_blk_mq(zso); zso->zvo_disk->fops = &zvol_ops; } if (ret != 0) goto out_kmem; blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); /* Limit read-ahead to a single page to prevent over-prefetching. */ blk_queue_set_read_ahead(zso->zvo_queue, 1); if (!zv->zv_zso->use_blk_mq) { /* Disable write merging in favor of the ZIO pipeline. */ blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); } /* Enable /proc/diskstats */ blk_queue_flag_set(QUEUE_FLAG_IO_STAT, zso->zvo_queue); zso->zvo_queue->queuedata = zv; zso->zvo_dev = dev; zv->zv_open_count = 0; strlcpy(zv->zv_name, name, MAXNAMELEN); zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); zso->zvo_disk->major = zvol_major; zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; /* * Setting ZFS_VOLMODE_DEV disables partitioning on ZVOL devices. * This is accomplished by limiting the number of minors for the * device to one and explicitly disabling partition scanning. */ if (volmode == ZFS_VOLMODE_DEV) { zso->zvo_disk->minors = 1; zso->zvo_disk->flags &= ~ZFS_GENHD_FL_EXT_DEVT; zso->zvo_disk->flags |= ZFS_GENHD_FL_NO_PART; } zso->zvo_disk->first_minor = (dev & MINORMASK); zso->zvo_disk->private_data = zv; snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", ZVOL_DEV_NAME, (dev & MINORMASK)); return (zv); out_kmem: kmem_free(zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); return (NULL); } /* * Cleanup then free a zvol_state_t which was created by zvol_alloc(). * At this time, the structure is not opened by anyone, is taken off * the zvol_state_list, and has its private data set to NULL. * The zvol_state_lock is dropped. * * This function may take many milliseconds to complete (e.g. we've seen * it take over 256ms), due to the calls to "blk_cleanup_queue" and * "del_gendisk". Thus, consumers need to be careful to account for this * latency when calling this function. */ void zvol_os_free(zvol_state_t *zv) { ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); ASSERT0(zv->zv_open_count); ASSERT3P(zv->zv_zso->zvo_disk->private_data, ==, NULL); rw_destroy(&zv->zv_suspend_lock); zfs_rangelock_fini(&zv->zv_rangelock); del_gendisk(zv->zv_zso->zvo_disk); #if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) && \ defined(HAVE_BLK_ALLOC_DISK) #if defined(HAVE_BLK_CLEANUP_DISK) blk_cleanup_disk(zv->zv_zso->zvo_disk); #else put_disk(zv->zv_zso->zvo_disk); #endif #else blk_cleanup_queue(zv->zv_zso->zvo_queue); put_disk(zv->zv_zso->zvo_disk); #endif #ifdef HAVE_BLK_MQ if (zv->zv_zso->use_blk_mq) blk_mq_free_tag_set(&zv->zv_zso->tag_set); #endif ida_simple_remove(&zvol_ida, MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); mutex_destroy(&zv->zv_state_lock); dataset_kstats_destroy(&zv->zv_kstat); kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); kmem_free(zv, sizeof (zvol_state_t)); } void zvol_wait_close(zvol_state_t *zv) { } /* * Create a block device minor node and setup the linkage between it * and the specified volume. Once this function returns the block * device is live and ready for use. */ int zvol_os_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t *doi; uint64_t volsize; uint64_t len; unsigned minor = 0; int error = 0; int idx; uint64_t hash = zvol_name_hash(name); bool replayed_zil = B_FALSE; if (zvol_inhibit_dev) return (0); idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); if (idx < 0) return (SET_ERROR(-idx)); minor = idx << ZVOL_MINOR_BITS; zv = zvol_find_by_name_hash(name, hash, RW_NONE); if (zv) { ASSERT(MUTEX_HELD(&zv->zv_state_lock)); mutex_exit(&zv->zv_state_lock); ida_simple_remove(&zvol_ida, idx); return (SET_ERROR(EEXIST)); } doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); if (error) goto out_doi; error = dmu_object_info(os, ZVOL_OBJ, doi); if (error) goto out_dmu_objset_disown; error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); if (error) goto out_dmu_objset_disown; zv = zvol_alloc(MKDEV(zvol_major, minor), name); if (zv == NULL) { error = SET_ERROR(EAGAIN); goto out_dmu_objset_disown; } zv->zv_hash = hash; if (dmu_objset_is_snapshot(os)) zv->zv_flags |= ZVOL_RDONLY; zv->zv_volblocksize = doi->doi_data_block_size; zv->zv_volsize = volsize; zv->zv_objset = os; set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, (DMU_MAX_ACCESS / 4) >> 9); if (zv->zv_zso->use_blk_mq) { /* * IO requests can be really big (1MB). When an IO request * comes in, it is passed off to zvol_read() or zvol_write() * in a new thread, where it is chunked up into 'volblocksize' * sized pieces and processed. So for example, if the request * is a 1MB write and your volblocksize is 128k, one zvol_write * thread will take that request and sequentially do ten 128k * IOs. This is due to the fact that the thread needs to lock * each volblocksize sized block. So you might be wondering: * "instead of passing the whole 1MB request to one thread, * why not pass ten individual 128k chunks to ten threads and * process the whole write in parallel?" The short answer is * that there's a sweet spot number of chunks that balances * the greater parallelism with the added overhead of more * threads. The sweet spot can be different depending on if you * have a read or write heavy workload. Writes typically want * high chunk counts while reads typically want lower ones. On * a test pool with 6 NVMe drives in a 3x 2-disk mirror * configuration, with volblocksize=8k, the sweet spot for good * sequential reads and writes was at 8 chunks. */ /* * Below we tell the kernel how big we want our requests * to be. You would think that blk_queue_io_opt() would be * used to do this since it is used to "set optimal request * size for the queue", but that doesn't seem to do * anything - the kernel still gives you huge requests * with tons of little PAGE_SIZE segments contained within it. * * Knowing that the kernel will just give you PAGE_SIZE segments * no matter what, you can say "ok, I want PAGE_SIZE byte * segments, and I want 'N' of them per request", where N is * the correct number of segments for the volblocksize and * number of chunks you want. */ #ifdef HAVE_BLK_MQ if (zvol_blk_mq_blocks_per_thread != 0) { unsigned int chunks; chunks = MIN(zvol_blk_mq_blocks_per_thread, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, PAGE_SIZE); blk_queue_max_segments(zv->zv_zso->zvo_queue, (zv->zv_volblocksize * chunks) / PAGE_SIZE); } else { /* * Special case: zvol_blk_mq_blocks_per_thread = 0 * Max everything out. */ blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); } #endif } else { blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); } blk_queue_physical_block_size(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); blk_queue_discard_granularity(zv->zv_zso->zvo_queue, zv->zv_volblocksize); #ifdef QUEUE_FLAG_DISCARD blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_NONROT blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); #endif #ifdef QUEUE_FLAG_ADD_RANDOM blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); #endif /* This flag was introduced in kernel version 4.12. */ #ifdef QUEUE_FLAG_SCSI_PASSTHROUGH blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); #endif ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); error = dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); if (error) goto out_dmu_objset_disown; ASSERT3P(zv->zv_zilog, ==, NULL); zv->zv_zilog = zil_open(os, zvol_get_data, &zv->zv_kstat.dk_zil_sums); if (spa_writeable(dmu_objset_spa(os))) { if (zil_replay_disable) replayed_zil = zil_destroy(zv->zv_zilog, B_FALSE); else replayed_zil = zil_replay(os, zv, zvol_replay_vector); } if (replayed_zil) zil_close(zv->zv_zilog); zv->zv_zilog = NULL; /* * When udev detects the addition of the device it will immediately * invoke blkid(8) to determine the type of content on the device. * Prefetching the blocks commonly scanned by blkid(8) will speed * up this process. */ len = MIN(zvol_prefetch_bytes, SPA_MAXBLOCKSIZE); if (len > 0) { dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, ZIO_PRIORITY_SYNC_READ); } zv->zv_objset = NULL; out_dmu_objset_disown: dmu_objset_disown(os, B_TRUE, FTAG); out_doi: kmem_free(doi, sizeof (dmu_object_info_t)); /* * Keep in mind that once add_disk() is called, the zvol is * announced to the world, and zvol_open()/zvol_release() can * be called at any time. Incidentally, add_disk() itself calls * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() * directly as well. */ if (error == 0) { rw_enter(&zvol_state_lock, RW_WRITER); zvol_insert(zv); rw_exit(&zvol_state_lock); #ifdef HAVE_ADD_DISK_RET error = add_disk(zv->zv_zso->zvo_disk); #else add_disk(zv->zv_zso->zvo_disk); #endif } else { ida_simple_remove(&zvol_ida, idx); } return (error); } void zvol_os_rename_minor(zvol_state_t *zv, const char *newname) { int readonly = get_disk_ro(zv->zv_zso->zvo_disk); ASSERT(RW_LOCK_HELD(&zvol_state_lock)); ASSERT(MUTEX_HELD(&zv->zv_state_lock)); strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); /* move to new hashtable entry */ zv->zv_hash = zvol_name_hash(zv->zv_name); hlist_del(&zv->zv_hlink); hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); /* * The block device's read-only state is briefly changed causing * a KOBJ_CHANGE uevent to be issued. This ensures udev detects * the name change and fixes the symlinks. This does not change * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never * changes. This would normally be done using kobject_uevent() but * that is a GPL-only symbol which is why we need this workaround. */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); } void zvol_os_set_disk_ro(zvol_state_t *zv, int flags) { set_disk_ro(zv->zv_zso->zvo_disk, flags); } void zvol_os_set_capacity(zvol_state_t *zv, uint64_t capacity) { set_capacity(zv->zv_zso->zvo_disk, capacity); } int zvol_init(void) { int error; /* * zvol_threads is the module param the user passes in. * * zvol_actual_threads is what we use internally, since the user can * pass zvol_thread = 0 to mean "use all the CPUs" (the default). */ static unsigned int zvol_actual_threads; if (zvol_threads == 0) { /* * See dde9380a1 for why 32 was chosen here. This should * probably be refined to be some multiple of the number * of CPUs. */ zvol_actual_threads = MAX(num_online_cpus(), 32); } else { zvol_actual_threads = MIN(MAX(zvol_threads, 1), 1024); } error = register_blkdev(zvol_major, ZVOL_DRIVER); if (error) { printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); return (error); } #ifdef HAVE_BLK_MQ if (zvol_blk_mq_queue_depth == 0) { zvol_actual_blk_mq_queue_depth = BLKDEV_DEFAULT_RQ; } else { zvol_actual_blk_mq_queue_depth = MAX(zvol_blk_mq_queue_depth, BLKDEV_MIN_RQ); } if (zvol_blk_mq_threads == 0) { zvol_blk_mq_actual_threads = num_online_cpus(); } else { zvol_blk_mq_actual_threads = MIN(MAX(zvol_blk_mq_threads, 1), 1024); } #endif zvol_taskq = taskq_create(ZVOL_DRIVER, zvol_actual_threads, maxclsyspri, zvol_actual_threads, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); if (zvol_taskq == NULL) { unregister_blkdev(zvol_major, ZVOL_DRIVER); return (-ENOMEM); } zvol_init_impl(); ida_init(&zvol_ida); return (0); } void zvol_fini(void) { zvol_fini_impl(); unregister_blkdev(zvol_major, ZVOL_DRIVER); taskq_destroy(zvol_taskq); ida_destroy(&zvol_ida); } /* BEGIN CSTYLED */ module_param(zvol_inhibit_dev, uint, 0644); MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); module_param(zvol_major, uint, 0444); MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); module_param(zvol_threads, uint, 0444); MODULE_PARM_DESC(zvol_threads, "Number of threads to handle I/O requests. Set" "to 0 to use all active CPUs"); module_param(zvol_request_sync, uint, 0644); MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); module_param(zvol_max_discard_blocks, ulong, 0444); MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); module_param(zvol_prefetch_bytes, uint, 0644); MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); module_param(zvol_volmode, uint, 0644); MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); #ifdef HAVE_BLK_MQ module_param(zvol_blk_mq_queue_depth, uint, 0644); MODULE_PARM_DESC(zvol_blk_mq_queue_depth, "Default blk-mq queue depth"); module_param(zvol_use_blk_mq, uint, 0644); MODULE_PARM_DESC(zvol_use_blk_mq, "Use the blk-mq API for zvols"); module_param(zvol_blk_mq_blocks_per_thread, uint, 0644); MODULE_PARM_DESC(zvol_blk_mq_blocks_per_thread, "Process volblocksize blocks per thread"); #endif #ifndef HAVE_BLKDEV_GET_ERESTARTSYS module_param(zvol_open_timeout_ms, uint, 0644); MODULE_PARM_DESC(zvol_open_timeout_ms, "Timeout for ZVOL open retries"); #endif /* END CSTYLED */