diff --git a/config/kernel-vfs-invalidate_folio.m4 b/config/kernel-vfs-invalidate_folio.m4 deleted file mode 100644 index 61a5c8478af1..000000000000 --- a/config/kernel-vfs-invalidate_folio.m4 +++ /dev/null @@ -1,33 +0,0 @@ -dnl # -dnl # Linux 5.18 uses invalidate_folio in lieu of invalidate_page -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_INVALIDATE_FOLIO], [ - ZFS_LINUX_TEST_SRC([vfs_has_invalidate_folio], [ - #include - - static void - test_invalidate_folio(struct folio *folio, size_t offset, - size_t len) { - (void) folio; (void) offset; (void) len; - return; - } - - static const struct address_space_operations - aops __attribute__ ((unused)) = { - .invalidate_folio = test_invalidate_folio, - }; - ],[]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_VFS_INVALIDATE_FOLIO], [ - dnl # - dnl # Linux 5.18 uses invalidate_folio in lieu of invalidate_page - dnl # - AC_MSG_CHECKING([whether invalidate_folio exists]) - ZFS_LINUX_TEST_RESULT([vfs_has_invalidate_folio], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_INVALIDATE_FOLIO, 1, [invalidate_folio exists]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) diff --git a/config/kernel-vfs-release_folio.m4 b/config/kernel-vfs-release_folio.m4 deleted file mode 100644 index f31db5677fd3..000000000000 --- a/config/kernel-vfs-release_folio.m4 +++ /dev/null @@ -1,32 +0,0 @@ -dnl # -dnl # Linux 5.19 uses release_folio in lieu of releasepage -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_RELEASE_FOLIO], [ - ZFS_LINUX_TEST_SRC([vfs_has_release_folio], [ - #include - - static bool - test_release_folio(struct folio *folio, gfp_t gfp) { - (void) folio; (void) gfp; - return (0); - } - - static const struct address_space_operations - aops __attribute__ ((unused)) = { - .release_folio = test_release_folio, - }; - ],[]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_VFS_RELEASE_FOLIO], [ - dnl # - dnl # Linux 5.19 uses release_folio in lieu of releasepage - dnl # - AC_MSG_CHECKING([whether release_folio exists]) - ZFS_LINUX_TEST_RESULT([vfs_has_release_folio], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_VFS_RELEASE_FOLIO, 1, [release_folio exists]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 556df58082f9..761f9310753a 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -1,1020 +1,1016 @@ dnl # dnl # Default ZFS kernel configuration dnl # AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ AM_COND_IF([BUILD_LINUX], [ dnl # Setup the kernel build environment. ZFS_AC_KERNEL ZFS_AC_QAT dnl # Sanity checks for module building and CONFIG_* defines ZFS_AC_KERNEL_CONFIG_DEFINED ZFS_AC_MODULE_SYMVERS dnl # Sequential ZFS_LINUX_TRY_COMPILE tests ZFS_AC_KERNEL_FPU_HEADER ZFS_AC_KERNEL_OBJTOOL_HEADER ZFS_AC_KERNEL_MISC_MINOR ZFS_AC_KERNEL_DECLARE_EVENT_CLASS dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests ZFS_AC_KERNEL_TEST_SRC ZFS_AC_KERNEL_TEST_RESULT AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ" ]) AC_SUBST(KERNEL_MAKE) ]) ]) dnl # dnl # Generate and compile all of the kernel API test cases to determine dnl # which interfaces are available. By invoking the kernel build system dnl # only once the compilation can be done in parallel significantly dnl # speeding up the process. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_TYPES ZFS_AC_KERNEL_SRC_OBJTOOL ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE ZFS_AC_KERNEL_SRC_PDE_DATA ZFS_AC_KERNEL_SRC_GENERIC_FADVISE ZFS_AC_KERNEL_SRC_SCHED ZFS_AC_KERNEL_SRC_USLEEP_RANGE ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_SRC_INODE_TIMES ZFS_AC_KERNEL_SRC_PROC_OPERATIONS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_SRC_BIO ZFS_AC_KERNEL_SRC_BLKDEV ZFS_AC_KERNEL_SRC_BLK_QUEUE ZFS_AC_KERNEL_SRC_GENHD_FLAGS ZFS_AC_KERNEL_SRC_REVALIDATE_DISK ZFS_AC_KERNEL_SRC_GET_DISK_RO ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_SRC_XATTR ZFS_AC_KERNEL_SRC_ACL ZFS_AC_KERNEL_SRC_INODE_SETATTR ZFS_AC_KERNEL_SRC_INODE_GETATTR ZFS_AC_KERNEL_SRC_SHOW_OPTIONS ZFS_AC_KERNEL_SRC_SHRINKER ZFS_AC_KERNEL_SRC_MKDIR ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS ZFS_AC_KERNEL_SRC_CREATE ZFS_AC_KERNEL_SRC_PERMISSION ZFS_AC_KERNEL_SRC_TMPFILE ZFS_AC_KERNEL_SRC_AUTOMOUNT ZFS_AC_KERNEL_SRC_COMMIT_METADATA ZFS_AC_KERNEL_SRC_SETATTR_PREPARE ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SRC_SECURITY_INODE ZFS_AC_KERNEL_SRC_FST_MOUNT ZFS_AC_KERNEL_SRC_SET_NLINK ZFS_AC_KERNEL_SRC_SGET ZFS_AC_KERNEL_SRC_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_SRC_VFS_READ_FOLIO - ZFS_AC_KERNEL_SRC_VFS_RELEASE_FOLIO - ZFS_AC_KERNEL_SRC_VFS_INVALIDATE_FOLIO ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_READPAGES ZFS_AC_KERNEL_SRC_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_VFS_GENERIC_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_SRC_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_SRC_KMAP_LOCAL_PAGE ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT ZFS_AC_KERNEL_SRC_FPU ZFS_AC_KERNEL_SRC_FMODE_T ZFS_AC_KERNEL_SRC_KUIDGID_T ZFS_AC_KERNEL_SRC_KUID_HELPERS ZFS_AC_KERNEL_SRC_RENAME ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES ZFS_AC_KERNEL_SRC_PERCPU ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR ZFS_AC_KERNEL_SRC_MKNOD ZFS_AC_KERNEL_SRC_SYMLINK ZFS_AC_KERNEL_SRC_BIO_MAX_SEGS ZFS_AC_KERNEL_SRC_SIGINFO ZFS_AC_KERNEL_SRC_SYSFS ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_SRC_STRLCPY ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_SRC_ADD_DISK ZFS_AC_KERNEL_SRC_KTHREAD ZFS_AC_KERNEL_SRC_ZERO_PAGE ZFS_AC_KERNEL_SRC___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_SRC_IDMAP_MNT_API ZFS_AC_KERNEL_SRC_IDMAP_NO_USERNS ZFS_AC_KERNEL_SRC_IATTR_VFSID ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_SZ ZFS_AC_KERNEL_SRC_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ ZFS_AC_KERNEL_SRC_SYNC_BDEV ZFS_AC_KERNEL_SRC_MM_PAGE_FLAGS ZFS_AC_KERNEL_SRC_MM_PAGE_SIZE ZFS_AC_KERNEL_SRC_MM_PAGE_MAPPING ZFS_AC_KERNEL_SRC_FILE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE ;; riscv*) ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE ;; esac AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) AC_MSG_RESULT([done]) ]) dnl # dnl # Check results of kernel interface tests. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_TYPES ZFS_AC_KERNEL_ACCESS_OK_TYPE ZFS_AC_KERNEL_OBJTOOL ZFS_AC_KERNEL_PDE_DATA ZFS_AC_KERNEL_GENERIC_FADVISE ZFS_AC_KERNEL_SCHED ZFS_AC_KERNEL_USLEEP_RANGE ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_INODE_TIMES ZFS_AC_KERNEL_PROC_OPERATIONS ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_BIO ZFS_AC_KERNEL_BLKDEV ZFS_AC_KERNEL_BLK_QUEUE ZFS_AC_KERNEL_GENHD_FLAGS ZFS_AC_KERNEL_REVALIDATE_DISK ZFS_AC_KERNEL_GET_DISK_RO ZFS_AC_KERNEL_DISCARD_GRANULARITY ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_XATTR ZFS_AC_KERNEL_ACL ZFS_AC_KERNEL_INODE_SETATTR ZFS_AC_KERNEL_INODE_GETATTR ZFS_AC_KERNEL_SHOW_OPTIONS ZFS_AC_KERNEL_SHRINKER ZFS_AC_KERNEL_MKDIR ZFS_AC_KERNEL_LOOKUP_FLAGS ZFS_AC_KERNEL_CREATE ZFS_AC_KERNEL_PERMISSION ZFS_AC_KERNEL_TMPFILE ZFS_AC_KERNEL_AUTOMOUNT ZFS_AC_KERNEL_COMMIT_METADATA ZFS_AC_KERNEL_SETATTR_PREPARE ZFS_AC_KERNEL_INSERT_INODE_LOCKED ZFS_AC_KERNEL_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SECURITY_INODE ZFS_AC_KERNEL_FST_MOUNT ZFS_AC_KERNEL_SET_NLINK ZFS_AC_KERNEL_SGET ZFS_AC_KERNEL_VFS_FILEMAP_DIRTY_FOLIO ZFS_AC_KERNEL_VFS_READ_FOLIO - ZFS_AC_KERNEL_VFS_RELEASE_FOLIO - ZFS_AC_KERNEL_VFS_INVALIDATE_FOLIO ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_READPAGES ZFS_AC_KERNEL_VFS_SET_PAGE_DIRTY_NOBUFFERS ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_VFS_GENERIC_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_SPLICE_COPY_FILE_RANGE ZFS_AC_KERNEL_VFS_REMAP_FILE_RANGE ZFS_AC_KERNEL_VFS_CLONE_FILE_RANGE ZFS_AC_KERNEL_VFS_DEDUPE_FILE_RANGE ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_KMAP_LOCAL_PAGE ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN ZFS_AC_KERNEL_GENERIC_IO_ACCT ZFS_AC_KERNEL_FPU ZFS_AC_KERNEL_FMODE_T ZFS_AC_KERNEL_KUIDGID_T ZFS_AC_KERNEL_KUID_HELPERS ZFS_AC_KERNEL_RENAME ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_TOTALHIGH_PAGES ZFS_AC_KERNEL_PERCPU ZFS_AC_KERNEL_GENERIC_FILLATTR ZFS_AC_KERNEL_MKNOD ZFS_AC_KERNEL_SYMLINK ZFS_AC_KERNEL_BIO_MAX_SEGS ZFS_AC_KERNEL_SIGINFO ZFS_AC_KERNEL_SYSFS ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG ZFS_AC_KERNEL_STRLCPY ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_ADD_DISK ZFS_AC_KERNEL_KTHREAD ZFS_AC_KERNEL_ZERO_PAGE ZFS_AC_KERNEL___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_IDMAP_MNT_API ZFS_AC_KERNEL_IDMAP_NO_USERNS ZFS_AC_KERNEL_IATTR_VFSID ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE ZFS_AC_KERNEL_REGISTER_SYSCTL_SZ ZFS_AC_KERNEL_PROC_HANDLER_CTL_TABLE_CONST ZFS_AC_KERNEL_COPY_SPLICE_READ ZFS_AC_KERNEL_SYNC_BDEV ZFS_AC_KERNEL_MM_PAGE_FLAGS ZFS_AC_KERNEL_MM_PAGE_SIZE ZFS_AC_KERNEL_MM_PAGE_MAPPING ZFS_AC_KERNEL_1ARG_ASSIGN_STR ZFS_AC_KERNEL_FILE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE ;; riscv*) ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE ;; esac ]) dnl # dnl # Detect name used for Module.symvers file in kernel dnl # AC_DEFUN([ZFS_AC_MODULE_SYMVERS], [ modpost=$LINUX/scripts/Makefile.modpost AC_MSG_CHECKING([kernel file name for module symbols]) AS_IF([test "x$enable_linux_builtin" != xyes -a -f "$modpost"], [ AS_IF([grep -q Modules.symvers $modpost], [ LINUX_SYMBOLS=Modules.symvers ], [ LINUX_SYMBOLS=Module.symvers ]) AS_IF([test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed. If you are building with a custom kernel, make sure *** the kernel is configured, built, and the '--with-linux=PATH' *** configure option refers to the location of the kernel source. ]) ]) ], [ LINUX_SYMBOLS=NONE ]) AC_MSG_RESULT($LINUX_SYMBOLS) AC_SUBST(LINUX_SYMBOLS) ]) dnl # dnl # Detect the kernel to be built against dnl # dnl # Most modern Linux distributions have separate locations for bare dnl # source (source) and prebuilt (build) files. Additionally, there are dnl # `source` and `build` symlinks in `/lib/modules/$(KERNEL_VERSION)` dnl # pointing to them. The directory search order is now: dnl # dnl # - `configure` command line values if both `--with-linux` and dnl # `--with-linux-obj` were defined dnl # dnl # - If only `--with-linux` was defined, `--with-linux-obj` is assumed dnl # to have the same value as `--with-linux` dnl # dnl # - If neither `--with-linux` nor `--with-linux-obj` were defined dnl # autodetection is used: dnl # dnl # - `/lib/modules/$(uname -r)/{source,build}` respectively, if exist. dnl # dnl # - If only `/lib/modules/$(uname -r)/build` exists, it is assumed dnl # to be both source and build directory. dnl # dnl # - The first directory in `/lib/modules` with the highest version dnl # number according to `sort -V` which contains both `source` and dnl # `build` symlinks/directories. If module directory contains only dnl # `build` component, it is assumed to be both source and build dnl # directory. dnl # dnl # - Last resort: the first directory matching `/usr/src/kernels/*` dnl # and `/usr/src/linux-*` with the highest version number according dnl # to `sort -V` is assumed to be both source and build directory. dnl # AC_DEFUN([ZFS_AC_KERNEL], [ AC_ARG_WITH([linux], AS_HELP_STRING([--with-linux=PATH], [Path to kernel source]), [kernelsrc="$withval"]) AC_ARG_WITH(linux-obj, AS_HELP_STRING([--with-linux-obj=PATH], [Path to kernel build objects]), [kernelbuild="$withval"]) AC_MSG_CHECKING([kernel source and build directories]) AS_IF([test -n "$kernelsrc" && test -z "$kernelbuild"], [ kernelbuild="$kernelsrc" ], [test -z "$kernelsrc"], [ AS_IF([test -e "/lib/modules/$(uname -r)/source" && \ test -e "/lib/modules/$(uname -r)/build"], [ src="/lib/modules/$(uname -r)/source" build="/lib/modules/$(uname -r)/build" ], [test -e "/lib/modules/$(uname -r)/build"], [ build="/lib/modules/$(uname -r)/build" src="$build" ], [ src= for d in $(ls -1d /lib/modules/* 2>/dev/null | sort -Vr); do if test -e "$d/source" && test -e "$d/build"; then src="$d/source" build="$d/build" break fi if test -e "$d/build"; then src="$d/build" build="$d/build" break fi done # the least reliable method if test -z "$src"; then src=$(ls -1d /usr/src/kernels/* /usr/src/linux-* \ 2>/dev/null | grep -v obj | sort -Vr | head -1) build="$src" fi ]) AS_IF([test -n "$src" && test -e "$src"], [ kernelsrc=$(readlink -e "$src") ], [ kernelsrc="[Not found]" ]) AS_IF([test -n "$build" && test -e "$build"], [ kernelbuild=$(readlink -e "$build") ], [ kernelbuild="[Not found]" ]) ], [ AS_IF([test "$kernelsrc" = "NONE"], [ kernsrcver=NONE ]) withlinux=yes ]) AC_MSG_RESULT([done]) AC_MSG_CHECKING([kernel source directory]) AC_MSG_RESULT([$kernelsrc]) AC_MSG_CHECKING([kernel build directory]) AC_MSG_RESULT([$kernelbuild]) AS_IF([test ! -d "$kernelsrc" || test ! -d "$kernelbuild"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed and then try again. If that fails, you can specify the *** location of the kernel source and build with the '--with-linux=PATH' and *** '--with-linux-obj=PATH' options respectively.]) ]) AC_MSG_CHECKING([kernel source version]) utsrelease1=$kernelbuild/include/linux/version.h utsrelease2=$kernelbuild/include/linux/utsrelease.h utsrelease3=$kernelbuild/include/generated/utsrelease.h AS_IF([test -r $utsrelease1 && grep -qF UTS_RELEASE $utsrelease1], [ utsrelease=$utsrelease1 ], [test -r $utsrelease2 && grep -qF UTS_RELEASE $utsrelease2], [ utsrelease=$utsrelease2 ], [test -r $utsrelease3 && grep -qF UTS_RELEASE $utsrelease3], [ utsrelease=$utsrelease3 ]) AS_IF([test -n "$utsrelease"], [ kernsrcver=$($AWK '/UTS_RELEASE/ { gsub(/"/, "", $[3]); print $[3] }' $utsrelease) AS_IF([test -z "$kernsrcver"], [ AC_MSG_RESULT([Not found]) AC_MSG_ERROR([ *** Cannot determine kernel version. ]) ]) ], [ AC_MSG_RESULT([Not found]) if test "x$enable_linux_builtin" != xyes; then AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. ]) else AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. *** Please run 'make prepare' inside the kernel source tree.]) fi ]) AC_MSG_RESULT([$kernsrcver]) AX_COMPARE_VERSION([$kernsrcver], [ge], [$ZFS_META_KVER_MIN], [], [ AC_MSG_ERROR([ *** Cannot build against kernel version $kernsrcver. *** The minimum supported kernel version is $ZFS_META_KVER_MIN. ]) ]) AC_ARG_ENABLE([linux-experimental], AS_HELP_STRING([--enable-linux-experimental], [Allow building against some unsupported kernel versions])) AX_COMPARE_VERSION([$kernsrcver], [ge], [$ZFS_META_KVER_MAX], [ AX_COMPARE_VERSION([$kernsrcver], [eq2], [$ZFS_META_KVER_MAX], [ kern_max_version_ok=yes ], [ kern_max_version_ok=no ]) ], [ kern_max_version_ok=yes ]) AS_IF([test "x$kern_max_version_ok" != "xyes"], [ AS_IF([test "x$enable_linux_experimental" == "xyes"], [ AC_DEFINE(HAVE_LINUX_EXPERIMENTAL, 1, [building against unsupported kernel version]) ], [ AC_MSG_ERROR([ *** Cannot build against kernel version $kernsrcver. *** The maximum supported kernel version is $ZFS_META_KVER_MAX. ]) ]) ]) LINUX=${kernelsrc} LINUX_OBJ=${kernelbuild} LINUX_VERSION=${kernsrcver} AC_SUBST(LINUX) AC_SUBST(LINUX_OBJ) AC_SUBST(LINUX_VERSION) ]) AC_DEFUN([ZFS_AC_KERNEL_VERSION_WARNING], [ AS_IF([test "x$enable_linux_experimental" = "xyes" && \ test "x$kern_max_version_ok" != "xyes"], [ AC_MSG_WARN([ You are building OpenZFS against Linux version $kernsrcver. This combination is considered EXPERIMENTAL by the OpenZFS project. Even if it appears to build and run correctly, there may be bugs that can cause SERIOUS DATA LOSS. YOU HAVE BEEN WARNED! If you choose to continue, we'd appreciate if you could report your results on the OpenZFS issue tracker at: https://github.com/openzfs/zfs/issues/new Your feedback will help us prepare a new OpenZFS release that supports this version of Linux. ]) ]) ]) dnl # dnl # Detect the QAT module to be built against, QAT provides hardware dnl # acceleration for data compression: dnl # dnl # https://01.org/intel-quickassist-technology dnl # dnl # 1) Download and install QAT driver from the above link dnl # 2) Start QAT driver in your system: dnl # service qat_service start dnl # 3) Enable QAT in ZFS, e.g.: dnl # ./configure --with-qat=/QAT1.6 dnl # make dnl # 4) Set GZIP compression in ZFS dataset: dnl # zfs set compression = gzip dnl # dnl # Then the data written to this ZFS pool is compressed by QAT accelerator dnl # automatically, and de-compressed by QAT when read from the pool. dnl # dnl # 1) Get QAT hardware statistics with: dnl # cat /proc/icp_dh895xcc_dev/qat dnl # 2) To disable QAT: dnl # insmod zfs.ko zfs_qat_disable=1 dnl # AC_DEFUN([ZFS_AC_QAT], [ AC_ARG_WITH([qat], AS_HELP_STRING([--with-qat=PATH], [Path to qat source]), AS_IF([test "$withval" = "yes"], AC_MSG_ERROR([--with-qat=PATH requires a PATH]), [qatsrc="$withval"])) AC_ARG_WITH([qat-obj], AS_HELP_STRING([--with-qat-obj=PATH], [Path to qat build objects]), [qatbuild="$withval"]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat source directory]) AC_MSG_RESULT([$qatsrc]) QAT_SRC="${qatsrc}/quickassist" AS_IF([ test ! -e "$QAT_SRC/include/cpa.h"], [ AC_MSG_ERROR([ *** Please make sure the qat driver package is installed *** and specify the location of the qat source with the *** '--with-qat=PATH' option then try again. Failed to *** find cpa.h in: ${QAT_SRC}/include]) ]) ]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat build directory]) AS_IF([test -z "$qatbuild"], [ qatbuild="${qatsrc}/build" ]) AC_MSG_RESULT([$qatbuild]) QAT_OBJ=${qatbuild} AS_IF([ ! test -e "$QAT_OBJ/icp_qa_al.ko" && ! test -e "$QAT_OBJ/qat_api.ko"], [ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find icp_qa_al.ko or qat_api.ko in: $QAT_OBJ]) ]) AC_SUBST(QAT_SRC) AC_SUBST(QAT_OBJ) AC_DEFINE(HAVE_QAT, 1, [qat is enabled and existed]) ]) dnl # dnl # Detect the name used for the QAT Module.symvers file. dnl # AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat file for module symbols]) QAT_SYMBOLS=$QAT_SRC/lookaside/access_layer/src/Module.symvers AS_IF([test -r $QAT_SYMBOLS], [ AC_MSG_RESULT([$QAT_SYMBOLS]) AC_SUBST(QAT_SYMBOLS) ],[ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find Module.symvers in: $QAT_SYMBOLS ]) ]) ]) ]) dnl # dnl # ZFS_LINUX_CONFTEST_H dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_H], [ test -d build/$2 || mkdir -p build/$2 cat - <<_ACEOF >build/$2/$2.h $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_C dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_C], [ test -d build/$2 || mkdir -p build/$2 cat confdefs.h - <<_ACEOF >build/$2/$2.c $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_MAKEFILE dnl # dnl # $1 - test case name dnl # $2 - add to top-level Makefile dnl # $3 - additional build flags dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_MAKEFILE], [ test -d build || mkdir -p build test -d build/$1 || mkdir -p build/$1 file=build/$1/Makefile dnl # Example command line to manually build source. cat - <<_ACEOF >$file # Example command line to manually build source # make modules -C $LINUX_OBJ $ARCH_UM M=$PWD/build/$1 ccflags-y := -Werror $FRAME_LARGER_THAN _ACEOF dnl # Additional custom CFLAGS as requested. m4_ifval($3, [echo "ccflags-y += $3" >>$file], []) dnl # Test case source echo "obj-m := $1.o" >>$file AS_IF([test "x$2" = "xyes"], [echo "obj-m += $1/" >>build/Makefile], []) ]) dnl # dnl # ZFS_LINUX_TEST_PROGRAM(C)([PROLOGUE], [BODY]) dnl # m4_define([ZFS_LINUX_TEST_PROGRAM], [ #include $1 int main (void) { $2 ; return 0; } MODULE_DESCRIPTION("conftest"); MODULE_AUTHOR(ZFS_META_AUTHOR); MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); MODULE_LICENSE($3); ]) dnl # dnl # ZFS_LINUX_TEST_REMOVE dnl # dnl # Removes the specified test source and results. dnl # AC_DEFUN([ZFS_LINUX_TEST_REMOVE], [ test -d build/$1 && rm -Rf build/$1 test -f build/Makefile && sed '/$1/d' build/Makefile ]) dnl # dnl # ZFS_LINUX_COMPILE dnl # dnl # $1 - build dir dnl # $2 - test command dnl # $3 - pass command dnl # $4 - fail command dnl # $5 - set KBUILD_MODPOST_NOFINAL='yes' dnl # $6 - set KBUILD_MODPOST_WARN='yes' dnl # dnl # Used internally by ZFS_LINUX_TEST_{COMPILE,MODPOST} dnl # AC_DEFUN([ZFS_LINUX_COMPILE], [ AC_ARG_VAR([KERNEL_CC], [C compiler for building kernel modules]) AC_ARG_VAR([KERNEL_LD], [Linker for building kernel modules]) AC_ARG_VAR([KERNEL_LLVM], [Binary option to build kernel modules with LLVM/CLANG toolchain]) AC_TRY_COMMAND([ KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" make modules -k -j$TEST_JOBS ${KERNEL_CC:+CC=$KERNEL_CC} ${KERNEL_LD:+LD=$KERNEL_LD} ${KERNEL_LLVM:+LLVM=$KERNEL_LLVM} CONFIG_MODULES=y CFLAGS_MODULE=-DCONFIG_MODULES -C $LINUX_OBJ $ARCH_UM M=$PWD/$1 >$1/build.log 2>&1]) AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_TEST_COMPILE dnl # dnl # Perform a full compile excluding the final modpost phase. dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.compile.$1 mv $2/build.log $2/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to compile test source to determine kernel interfaces.]) ], [yes], []) ]) dnl # dnl # ZFS_LINUX_TEST_MODPOST dnl # dnl # Perform a full compile including the modpost phase. This may dnl # be an incremental build if the objects have already been built. dnl # AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.modpost.$1 cat $2/build.log >>build/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to modpost test source to determine kernel interfaces.]) ], [], [yes]) ]) dnl # dnl # Perform the compilation of the test cases in two phases. dnl # dnl # Phase 1) attempt to build the object files for all of the tests dnl # defined by the ZFS_LINUX_TEST_SRC macro. But do not dnl # perform the final modpost stage. dnl # dnl # Phase 2) disable all tests which failed the initial compilation, dnl # then invoke the final modpost step for the remaining tests. dnl # dnl # This allows us efficiently build the test cases in parallel while dnl # remaining resilient to build failures which are expected when dnl # detecting the available kernel interfaces. dnl # dnl # The maximum allowed parallelism can be controlled by setting the dnl # TEST_JOBS environment variable. Otherwise, it default to $(nproc). dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [ dnl # Phase 1 - Compilation only, final linking is skipped. ZFS_LINUX_TEST_COMPILE([$1], [build]) dnl # dnl # Phase 2 - When building external modules disable test cases dnl # which failed to compile and invoke modpost to verify the dnl # final linking. dnl # dnl # Test names suffixed with '_license' call modpost independently dnl # to ensure that a single incompatibility does not result in the dnl # modpost phase exiting early. This check is not performed on dnl # every symbol since the majority are compatible and doing so dnl # would significantly slow down this phase. dnl # dnl # When configuring for builtin (--enable-linux-builtin) dnl # fake the linking step artificially create the expected .ko dnl # files for tests which did compile. This is required for dnl # kernels which do not have loadable module support or have dnl # not yet been built. dnl # AS_IF([test "x$enable_linux_builtin" = "xno"], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ AS_IF([test "${name##*_}" = "license"], [ ZFS_LINUX_TEST_MODPOST([$1], [build/$name]) echo "obj-n += $dir" >>build/Makefile ], [ echo "obj-m += $dir" >>build/Makefile ]) ], [ echo "obj-n += $dir" >>build/Makefile ]) done ZFS_LINUX_TEST_MODPOST([$1], [build]) ], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ touch build/$name/$name.ko ]) done ]) ]) dnl # dnl # ZFS_LINUX_TEST_SRC dnl # dnl # $1 - name dnl # $2 - global dnl # $3 - source dnl # $4 - extra cflags dnl # $5 - check license-compatibility dnl # dnl # Check if the test source is buildable at all and then if it is dnl # license compatible. dnl # dnl # N.B because all of the test cases are compiled in parallel they dnl # must never depend on the results of previous tests. Each test dnl # needs to be entirely independent. dnl # AC_DEFUN([ZFS_LINUX_TEST_SRC], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]], [["Dual BSD/GPL"]])], [$1]) ZFS_LINUX_CONFTEST_MAKEFILE([$1], [yes], [$4]) AS_IF([ test -n "$5" ], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM( [[$2]], [[$3]], [[$5]])], [$1_license]) ZFS_LINUX_CONFTEST_MAKEFILE([$1_license], [yes], [$4]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT dnl # dnl # $1 - name of a test source (ZFS_LINUX_TEST_SRC) dnl # $2 - run on success (valid .ko generated) dnl # $3 - run on failure (unable to compile) dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT], [ AS_IF([test -d build/$1], [ AS_IF([test -f build/$1/$1.ko], [$2], [$3]) ], [ AC_MSG_ERROR([ *** No matching source for the "$1" test, check that *** both the test source and result macros refer to the same name. ]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_ERROR dnl # dnl # Generic error message which can be used when none of the expected dnl # kernel interfaces were detected. dnl # AC_DEFUN([ZFS_LINUX_TEST_ERROR], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. *** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT_SYMBOL dnl # dnl # Like ZFS_LINUX_TEST_RESULT except ZFS_CHECK_SYMBOL_EXPORT is called to dnl # verify symbol exports, unless --enable-linux-builtin was provided to dnl # configure. dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT_SYMBOL], [ AS_IF([ ! test -f build/$1/$1.ko], [ $5 ], [ AS_IF([test "x$enable_linux_builtin" != "xyes"], [ ZFS_CHECK_SYMBOL_EXPORT([$2], [$3], [$4], [$5]) ], [ $4 ]) ]) ]) dnl # dnl # ZFS_LINUX_COMPILE_IFELSE dnl # AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [ ZFS_LINUX_TEST_REMOVE([conftest]) m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1], [conftest])]) m4_ifvaln([$5], [ZFS_LINUX_CONFTEST_H([$5], [conftest])], [ZFS_LINUX_CONFTEST_H([], [conftest])]) ZFS_LINUX_CONFTEST_MAKEFILE([conftest], [no], [m4_ifvaln([$5], [-I$PWD/build/conftest], [])]) ZFS_LINUX_COMPILE([build/conftest], [$2], [$3], [$4], [], []) ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE dnl # dnl # $1 - global dnl # $2 - source dnl # $3 - run on success (valid .ko generated) dnl # $4 - run on failure (unable to compile) dnl # dnl # When configuring as builtin (--enable-linux-builtin) for kernels dnl # without loadable module support (CONFIG_MODULES=n) only the object dnl # file is created. See ZFS_LINUX_TEST_COMPILE_ALL for details. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4]) ]) ]) dnl # dnl # ZFS_CHECK_SYMBOL_EXPORT dnl # dnl # Check if a symbol is exported on not by consulting the symbols dnl # file, or optionally the source code. dnl # AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT], [ grep -q -E '[[[:space:]]]$1[[[:space:]]]' \ $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null rc=$? if test $rc -ne 0; then export=0 for file in $2; do grep -q -E "EXPORT_SYMBOL.*($1)" \ "$LINUX/$file" 2>/dev/null rc=$? if test $rc -eq 0; then export=1 break; fi done if test $export -eq 0; then : $4 else : $3 fi else : $3 fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_SYMBOL dnl # dnl # Like ZFS_LINUX_TRY_COMPILER except ZFS_CHECK_SYMBOL_EXPORT is called dnl # to verify symbol exports, unless --enable-linux-builtin was provided dnl # to configure. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_SYMBOL], [ ZFS_LINUX_TRY_COMPILE([$1], [$2], [rc=0], [rc=1]) if test $rc -ne 0; then : $6 else if test "x$enable_linux_builtin" != xyes; then ZFS_CHECK_SYMBOL_EXPORT([$3], [$4], [rc=0], [rc=1]) fi if test $rc -ne 0; then : $6 else : $5 fi fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_HEADER dnl # like ZFS_LINUX_TRY_COMPILE, except the contents conftest.h are dnl # provided via the fifth parameter dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4], [$5]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4], [$5]) ]) ]) dnl # dnl # AS_VERSION_COMPARE_LE dnl # like AS_VERSION_COMPARE_LE, but runs $3 if (and only if) $1 <= $2 dnl # AS_VERSION_COMPARE_LE (version-1, version-2, [action-if-less-or-equal], [action-if-greater]) dnl # AC_DEFUN([AS_VERSION_COMPARE_LE], [ AS_VERSION_COMPARE([$1], [$2], [$3], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_REQUIRE_API dnl # like ZFS_LINUX_TEST_ERROR, except only fails if the kernel is dnl # at least some specified version. dnl # AC_DEFUN([ZFS_LINUX_REQUIRE_API], [ AS_VERSION_COMPARE_LE([$2], [$kernsrcver], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. This *** interface is expected for kernels version "$2" and above. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. Newer kernels may have incompatible *** APIs. *** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ], [ AC_MSG_RESULT(no) ]) ]) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 469197220859..dd9fd760b9c2 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1,4368 +1,4351 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using zfs_enter(zfsvfs). * A zfs_exit(zfsvfs) is needed before all returns. Any znodes * must be checked with zfs_verify_zp(zp). Both of these macros * can return EIO from the calling function. * * (2) zrele() should always be the last thing except for zil_commit() (if * necessary) and zfs_exit(). This is for 3 reasons: First, if it's the * last reference, the vnode/znode can be freed, so the zp may point to * freed memory. Second, the last reference will call zfs_zinactive(), * which may induce a lot of work -- pushing cached pages (which acquires * range locks) and syncing out cached atime changes. Third, * zfs_zinactive() may require a new tx, which could deadlock the system * if you were already holding one. This deadlock occurs because the tx * currently being operated on prevents a txg from syncing, which * prevents the new tx from progressing, resulting in a deadlock. If you * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() * is a synonym for zrele(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from zfs_enter()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * zfs_enter(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * zfs_exit(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * zil_commit(zilog, foid); // synchronous when necessary * zfs_exit(zfsvfs); // finished in zfs * return (error); // done, report error */ int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Honor ZFS_APPENDONLY file attribute */ if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } /* * Keep a count of the synchronous opens in the znode. On first * synchronous open we must convert all previous async transactions * into sync to keep correct ordering. */ if (flag & O_SYNC) { if (atomic_inc_32_nv(&zp->z_sync_cnt) == 1) zil_async_to_sync(zfsvfs->z_log, zp->z_id); } zfs_exit(zfsvfs, FTAG); return (0); } int zfs_close(struct inode *ip, int flag, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) static int zfs_fillpage(struct inode *ip, struct page *pp); /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. Update all mapped * pages with the contents of the coresponding dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { struct address_space *mp = ZTOI(zp)->i_mapping; int64_t off = start & (PAGE_SIZE - 1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { uint64_t nbytes = MIN(PAGE_SIZE - off, len); struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); void *pb = kmap(pp); int error = dmu_read(os, zp->z_id, start + off, nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); if (error) { SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); - if (!PagePrivate(pp)) { - /* - * Set private bit so page migration - * will wait for us to finish writeback - * before calling migrate_folio(). - */ - SetPagePrivate(pp); - get_page(pp); - } if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); } unlock_page(pp); put_page(pp); } len -= nbytes; off = 0; } } /* * When a file is memory mapped, we must keep the I/O data synchronized * between the DMU cache and the memory mapped pages. Preferentially read * from memory mapped pages, otherwise fallback to reading through the dmu. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; int64_t start = uio->uio_loffset; int64_t off = start & (PAGE_SIZE - 1); int len = nbytes; int error = 0; for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { uint64_t bytes = MIN(PAGE_SIZE - off, len); struct page *pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { /* * If filemap_fault() retries there exists a window * where the page will be unlocked and not up to date. * In this case we must try and fill the page. */ if (unlikely(!PageUptodate(pp))) { error = zfs_fillpage(ip, pp); if (error) { unlock_page(pp); put_page(pp); return (error); } } ASSERT(PageUptodate(pp) || PageDirty(pp)); unlock_page(pp); void *pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); put_page(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); } len -= bytes; off = 0; if (error) break; } return (error); } #endif /* _KERNEL */ static unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* * Write the bytes to a file. * * IN: zp - znode of file to be written to * data - bytes to write * len - number of bytes to write * pos - offset to start writing at * * OUT: resid - remaining bytes to write * * RETURN: 0 if success * positive error code if failure. EIO is returned * for a short write when residp isn't provided. * * Timestamps: * zp - ctime|mtime updated if byte count > 0 */ int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *residp) { fstrans_cookie_t cookie; int error; struct iovec iov; iov.iov_base = (void *)data; iov.iov_len = len; zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); cookie = spl_fstrans_mark(); error = zfs_write(zp, &uio, 0, kcred); spl_fstrans_unmark(cookie); if (error == 0) { if (residp != NULL) *residp = zfs_uio_resid(&uio); else if (zfs_uio_resid(&uio) != 0) error = SET_ERROR(EIO); } return (error); } static void zfs_rele_async_task(void *arg) { iput(arg); } void zfs_zrele_async(znode_t *zp) { struct inode *ip = ZTOI(zp); objset_t *os = ITOZSB(ip)->z_os; ASSERT(atomic_read(&ip->i_count) > 0); ASSERT(os != NULL); /* * If decrementing the count would put us at 0, we can't do it inline * here, because that would be synchronous. Instead, dispatch an iput * to run later. * * For more information on the dangers of a synchronous iput, see the * header comment of this file. */ if (!atomic_add_unless(&ip->i_count, -1, 1)) { VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), zfs_rele_async_task, ip, TQ_SLEEP) != TASKQID_INVALID); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held inode reference for it. * * IN: zdp - znode of directory to search. * nm - name of entry to lookup. * flags - LOOKUP_XATTR set if looking for an attribute. * cr - credentials of caller. * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: zpp - znode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ int zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ZTOZSB(zdp); int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (!S_ISDIR(ZTOI(zdp)->i_mode)) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *zpp = zdp; zhold(*zpp); return (0); } return (error); } } if ((error = zfs_enter_verify_zp(zfsvfs, zdp, FTAG)) != 0) return (error); *zpp = NULL; if (flags & LOOKUP_XATTR) { /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { zfs_exit(zfsvfs, FTAG); return (error); } /* * Do we have permission to get into attribute directory? */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, B_TRUE, cr, zfs_init_idmap))) { zrele(*zpp); *zpp = NULL; } zfs_exit(zfsvfs, FTAG); return (error); } if (!S_ISDIR(ZTOI(zdp)->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTDIR)); } /* * Check accessibility of directory. */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); if ((error == 0) && (*zpp)) zfs_znode_update_vfs(*zpp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Perform a linear search in directory for the name of specific inode. * Note we don't pass in the buffer size of name because it's hardcoded to * NAME_MAX+1(256) in Linux. * * IN: dzp - znode of directory to search. * zp - znode of the target * * OUT: name - dentry name of the target * * RETURN: 0 on success, error code on failure. */ int zfs_get_name(znode_t *dzp, char *name, znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(dzp); int error = 0; if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); if ((error = zfs_verify_zp(zp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* ctldir should have got their name in zfs_vget */ if (dzp->z_is_ctldir || zp->z_is_ctldir) { zfs_exit(zfsvfs, FTAG); return (ENOENT); } /* buffer len is hardcoded to 256 in Linux kernel */ error = zap_value_search(zfsvfs->z_os, dzp->z_id, zp->z_id, ZFS_DIRENT_OBJ(-1ULL), name, ZAP_MAXNAMELEN); zfs_exit(zfsvfs, FTAG); return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the ip of the created or trunc'd file. * * IN: dzp - znode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated if new entry created * zp - ctime|mtime always, atime if new */ int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; boolean_t skip_acl = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } top: *zpp = NULL; if (*name == '\0') { /* * Null component name refers to the directory itself. */ zhold(dzp); zp = dzp; dl = NULL; error = 0; } else { /* possible igrab(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); zfs_exit(zfsvfs, FTAG); return (error); } } if (zp == NULL) { uint64_t txtype; uint64_t projid = ZFS_DEFAULT_PROJID; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, skip_acl, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EINVAL); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { /* * Since, we failed to add the directory entry for it, * delete the newly created dnode. */ zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & O_APPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl) { error = SET_ERROR(EEXIST); goto out; } /* * Can't open a directory for writing. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr, mnt_ns))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if (S_ISREG(ZTOI(zp)->i_mode) && (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ if (dl) { zfs_dirent_unlock(dl); dl = NULL; } error = zfs_freesp(zp, 0, 0, mode, TRUE); } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp, zidmap_t *mnt_ns) { (void) excl, (void) mode, (void) flag; znode_t *zp = NULL, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); objset_t *os; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; uint64_t projid = ZFS_DEFAULT_PROJID; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } top: *ipp = NULL; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* Add to unlinked set */ zp->z_unlinked = B_TRUE; zfs_unlinked_add(zp, tx); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); out: if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *ipp = ZTOI(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove an entry from a directory. * * IN: dzp - znode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime * ip - ctime (if nlink > 0) */ static uint64_t null_xattr = 0; int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) { znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; uint64_t links; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp))) { if (realnmp) pn_free(realnmp); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } /* * Need to use rmdir for removing directories. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EPERM); goto out; } mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && !zn_has_cached_data(zp, 0, LLONG_MAX); mutex_exit(&zp->z_lock); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the inode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && !zn_has_cached_data(zp, 0, LLONG_MAX) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; VERIFY_IMPLY(xattr_obj_unlinked, xzp); } if (delete_now) { if (xattr_obj_unlinked) { ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; clear_nlink(ZTOI(xzp)); links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } /* * Add to the unlinked set because a new reference could be * taken concurrently resulting in a deferred destruction. */ zfs_unlinked_add(zp, tx); mutex_exit(&zp->z_lock); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); if (delete_now) zrele(zp); else zfs_zrele_async(zp); if (xzp) { zfs_znode_update_vfs(xzp); zfs_zrele_async(xzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new directory and insert it into dzp using the name * provided. Return a pointer to the inserted directory. * * IN: dzp - znode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * flags - case flags. * vsecp - ACL to be set * mnt_ns - user namespace of the mount * * OUT: zpp - znode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime updated * zpp - ctime|mtime|atime updated */ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp, zidmap_t *mnt_ns) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t waited = B_FALSE; ASSERT(S_ISDIR(vap->va_mode)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (dirname == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *zpp = NULL; if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); /* * Now put new name in parent dir. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (error != 0) { zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dzp - znode of directory to remove from. * name - name of directory to be removed. * cwd - inode of current working directory. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated */ int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr, zfs_init_idmap))) { goto out; } if (!S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } if (zp == cwd) { error = SET_ERROR(EINVAL); goto out; } /* * Grab a lock on the directory to make sure that no one is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); goto top; } dmu_tx_abort(tx); zrele(zp); zfs_exit(zfsvfs, FTAG); return (error); } error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); out: zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); zrele(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); } /* * Read directory entries from the given directory cursor position and emit * name and position for each entry. * * IN: ip - inode of directory to read. * ctx - directory entry context. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ int zfs_readdir(struct inode *ip, struct dir_context *ctx, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; zap_cursor_t zc; zap_attribute_t *zap; int error; uint8_t prefetch; uint8_t type; int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) goto out; /* * Quit if directory has been removed (posix) */ if (zp->z_unlinked) goto out; error = 0; os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; zap = zap_attribute_long_alloc(); /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Transform to file-system independent format */ while (!done) { uint64_t objnum; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap->za_name, "."); zap->za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap->za_name, ".."); zap->za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap->za_name, ZFS_CTLDIR_NAME); zap->za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, zap))) { if (error == ENOENT) break; else goto update; } /* * Allow multiple entries provided the first entry is * the object id. Non-zpl consumers may safely make * use of the additional space. * * XXX: This should be a feature flag for compatibility */ if (zap->za_integer_length != 8 || zap->za_num_integers == 0) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld, " "length = %d, num = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset, zap->za_integer_length, (u_longlong_t)zap->za_num_integers); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap->za_first_integer); type = ZFS_DIRENT_TYPE(zap->za_first_integer); } done = !dir_emit(ctx, zap->za_name, strlen(zap->za_name), objnum, type); if (done) break; if (prefetch) dmu_prefetch_dnode(os, objnum, ZIO_PRIORITY_SYNC_READ); /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } ctx->pos = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ update: zap_cursor_fini(&zc); zap_attribute_free(zap); if (error == ENOENT) error = 0; out: zfs_exit(zfsvfs, FTAG); return (error); } /* * Get the basic file attributes and place them in the provided kstat * structure. The inode is assumed to be the authoritative source * for most of the attributes. However, the znode currently has the * authoritative atime, blksize, and block count. * * IN: ip - inode of file. * * OUT: sp - kstat values. * * RETURN: 0 (always succeeds) */ int #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK zfs_getattr_fast(zidmap_t *user_ns, u32 request_mask, struct inode *ip, struct kstat *sp) #else zfs_getattr_fast(zidmap_t *user_ns, struct inode *ip, struct kstat *sp) #endif { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); #ifdef HAVE_GENERIC_FILLATTR_IDMAP_REQMASK zpl_generic_fillattr(user_ns, request_mask, ip, sp); #else zpl_generic_fillattr(user_ns, ip, sp); #endif /* * +1 link count for root inode with visible '.zfs' directory. */ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) if (sp->nlink < ZFS_LINK_MAX) sp->nlink++; sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); sp->blksize = blksize; sp->blocks = nblocks; if (unlikely(zp->z_blksz == 0)) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ sp->blksize = zfsvfs->z_max_blksz; } mutex_exit(&zp->z_lock); /* * Required to prevent NFS client from detecting different inode * numbers of snapshot root dentry before and after snapshot mount. */ if (zfsvfs->z_issnap) { if (ip->i_sb->s_root->d_inode == ip) sp->ino = ZFSCTL_INO_SNAPDIRS - dmu_objset_id(zfsvfs->z_os); } zfs_exit(zfsvfs, FTAG); return (0); } /* * For the operation of changing file's user/group/project, we need to * handle not only the main object that is assigned to the file directly, * but also the ones that are used by the file via hidden xattr directory. * * Because the xattr directory may contains many EA entries, as to it may * be impossible to change all of them via the transaction of changing the * main object's user/group/project attributes. Then we have to change them * via other multiple independent transactions one by one. It may be not good * solution, but we have no better idea yet. */ static int zfs_setattr_dir(znode_t *dzp) { struct inode *dxip = ZTOI(dzp); struct inode *xip = NULL; zfsvfs_t *zfsvfs = ZTOZSB(dzp); objset_t *os = zfsvfs->z_os; zap_cursor_t zc; zap_attribute_t *zap; zfs_dirlock_t *dl; znode_t *zp = NULL; dmu_tx_t *tx = NULL; uint64_t uid, gid; sa_bulk_attr_t bulk[4]; int count; int err; zap = zap_attribute_alloc(); zap_cursor_init(&zc, os, dzp->z_id); while ((err = zap_cursor_retrieve(&zc, zap)) == 0) { count = 0; if (zap->za_integer_length != 8 || zap->za_num_integers != 1) { err = ENXIO; break; } err = zfs_dirent_lock(&dl, dzp, (char *)zap->za_name, &zp, ZEXISTS, NULL, NULL); if (err == ENOENT) goto next; if (err) break; xip = ZTOI(zp); if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && zp->z_projid == dzp->z_projid) goto next; tx = dmu_tx_create(os); if (!(zp->z_pflags & ZFS_PROJID)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); err = dmu_tx_assign(tx, TXG_WAIT); if (err) break; mutex_enter(&dzp->z_lock); if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { xip->i_uid = dxip->i_uid; uid = zfs_uid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, sizeof (uid)); } if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { xip->i_gid = dxip->i_gid; gid = zfs_gid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, sizeof (gid)); } uint64_t projid = dzp->z_projid; if (zp->z_projid != projid) { if (!(zp->z_pflags & ZFS_PROJID)) { err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) { err = 0; } else if (err != 0) { goto sa_add_projid_err; } else { projid = ZFS_INVALID_PROJID; } } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } } sa_add_projid_err: mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } else if (projid == ZFS_INVALID_PROJID) { dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } tx = NULL; if (err != 0 && err != ENOENT) break; next: if (zp) { zrele(zp); zp = NULL; zfs_dirent_unlock(dl); } zap_cursor_advance(&zc); } if (tx) dmu_tx_abort(tx); if (zp) { zrele(zp); zfs_dirent_unlock(dl); } zap_cursor_fini(&zc); zap_attribute_free(zap); return (err == ENOENT ? 0 : err); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If ATTR_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * mnt_ns - user namespace of the mount * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime updated, mtime updated if size changed. */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t *tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2], atime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2 = 0; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; boolean_t handle_eadir = B_FALSE; sa_bulk_attr_t *bulk, *xattr_bulk; int count = 0, xattr_count = 0, bulks = 8; if (mask == 0) return (0); if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ip = ZTOI(zp); os = zfsvfs->z_os; /* * If this is a xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); if (xoap != NULL && (mask & ATTR_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOTSUP)); } } zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); xva_init(tmpxvattr); bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { err = SET_ERROR(EPERM); goto out3; } if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { err = SET_ERROR(EPERM); goto out3; } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (ATTR_ATIME | ATTR_MTIME)) { if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { err = SET_ERROR(EOVERFLOW); goto out3; } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfs_is_readonly(zfsvfs)) { err = SET_ERROR(EROFS); goto out3; } /* * First validate permissions */ if (mask & ATTR_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr, mnt_ns); if (err) goto out3; /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) goto out3; } if (mask & (ATTR_ATIME|ATTR_MTIME) || ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr, mnt_ns); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; uid_t uid; gid_t gid; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & ATTR_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ uid = zfs_uid_to_vfsuid(mnt_ns, zfs_i_user_ns(ip), vap->va_uid); gid = zfs_gid_to_vfsgid(mnt_ns, zfs_i_user_ns(ip), vap->va_gid); take_owner = (mask & ATTR_UID) && (uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && zfs_groupmember(zfsvfs, gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (ATTR_UID|ATTR_GID)) && take_owner && take_group) || ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr, mnt_ns) == 0) { /* * Remove setuid/setgid for non-privileged users */ (void) secpolicy_setid_clear(vap, cr); trim_mask = (mask & (ATTR_UID|ATTR_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & ATTR_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((!S_ISREG(ip->i_mode) && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); err = SET_ERROR(EPERM); goto out3; } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr, mnt_ns) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, &oldva, cr, mnt_ns, zfs_i_user_ns(ip)); if (err) goto out3; trim_mask |= ATTR_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, zfs_zaccess_unix, zp); if (err) goto out3; if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { handle_eadir = B_TRUE; err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); if (err) goto out2; } if (mask & ATTR_UID) { new_kuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_kuid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & ATTR_GID) { new_kgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_kgid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) zrele(attrzp); err = EDQUOT; goto out2; } } tx = dmu_tx_create(os); if (mask & ATTR_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = EPERM; goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. */ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (ATTR_UID|ATTR_GID)) { if (mask & ATTR_UID) { ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); new_uid = zfs_uid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); } } if (mask & ATTR_GID) { ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); new_gid = zfs_gid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); } } if (!(mask & ATTR_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & ATTR_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = ZTOI(zp)->i_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); ZFS_TIME_ENCODE(&tmp_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); zpl_inode_set_mtime_to_ts(ZTOI(zp), zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); zpl_inode_set_ctime_to_ts(ZTOI(zp), zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } if (attrzp && mask) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & ATTR_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(S_ISREG(ip->i_mode)); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (attrzp) zrele(attrzp); if (err == ERESTART) goto top; } else { if (count > 0) err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) err = zfs_setattr_dir(attrzp); zrele(attrzp); } zfs_znode_update_vfs(zp); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); zfs_exit(zfsvfs, FTAG); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) zfs_zrele_async(zl->zl_znode); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = ZTOZSB(zp)->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (SET_ERROR(EINVAL)); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(ZTOZSB(zp), oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdzp - Source directory containing the "old entry". * snm - Old entry name. * tdzp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * flags - case flags * rflags - RENAME_* flags * wa_vap - attributes for RENAME_WHITEOUT (must be a char 0:0). * mnt_ns - user namespace of the mount * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdzp,tdzp - ctime|mtime updated */ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags, uint64_t rflags, vattr_t *wo_vap, zidmap_t *mnt_ns) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); zilog_t *zilog; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; boolean_t waited = B_FALSE; /* Needed for whiteout inode creation. */ boolean_t fuid_dirtied; zfs_acl_ids_t acl_ids; boolean_t have_acl = B_FALSE; znode_t *wzp = NULL; if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); if (rflags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return (SET_ERROR(EINVAL)); /* Already checked by Linux VFS, but just to make sure. */ if (rflags & RENAME_EXCHANGE && (rflags & (RENAME_NOREPLACE | RENAME_WHITEOUT))) return (SET_ERROR(EINVAL)); /* * Make sure we only get wo_vap iff. RENAME_WHITEOUT and that it's the * right kind of vattr_t for the whiteout file. These are set * internally by ZFS so should never be incorrect. */ VERIFY_EQUIV(rflags & RENAME_WHITEOUT, wo_vap != NULL); VERIFY_IMPLY(wo_vap, wo_vap->va_mode == S_IFCHR); VERIFY_IMPLY(wo_vap, wo_vap->va_rdev == makedevice(0, 0)); if ((error = zfs_enter_verify_zp(zfsvfs, sdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if ((error = zfs_verify_zp(tdzp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || zfsctl_is_node(ZTOI(tdzp))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ zfs_exit(zfsvfs, FTAG); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) zrele(tzp); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(snm, "..") == 0) serr = EINVAL; zfs_exit(zfsvfs, FTAG); return (serr); } if (terr) { zfs_dirent_unlock(sdl); zrele(szp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = EINVAL; zfs_exit(zfsvfs, FTAG); return (terr); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr, mnt_ns))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) goto out; } /* * Does target exist? */ if (tzp) { if (rflags & RENAME_NOREPLACE) { error = SET_ERROR(EEXIST); goto out; } /* * Source and target must be the same type (unless exchanging). */ if (!(rflags & RENAME_EXCHANGE)) { boolean_t s_is_dir = S_ISDIR(ZTOI(szp)->i_mode) != 0; boolean_t t_is_dir = S_ISDIR(ZTOI(tzp)->i_mode) != 0; if (s_is_dir != t_is_dir) { error = SET_ERROR(s_is_dir ? ENOTDIR : EISDIR); goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } else if (rflags & RENAME_EXCHANGE) { /* Target must exist for RENAME_EXCHANGE. */ error = SET_ERROR(ENOENT); goto out; } /* Set up inode creation for RENAME_WHITEOUT. */ if (rflags & RENAME_WHITEOUT) { /* * Whiteout files are not regular files or directories, so to * match zfs_create() we do not inherit the project id. */ uint64_t wo_projid = ZFS_DEFAULT_PROJID; error = zfs_zaccess(sdzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns); if (error) goto out; if (!have_acl) { error = zfs_acl_ids_create(sdzp, 0, wo_vap, cr, NULL, &acl_ids, mnt_ns); if (error) goto out; have_acl = B_TRUE; } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, wo_projid)) { error = SET_ERROR(EDQUOT); goto out; } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, (rflags & RENAME_EXCHANGE) ? TRUE : FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } if (rflags & RENAME_WHITEOUT) { dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, sdzp->z_id, TRUE, snm); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); goto top; } dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); zfs_exit(zfsvfs, FTAG); return (error); } /* * Unlink the source. */ szp->z_pflags |= ZFS_AV_MODIFIED; if (tdzp->z_pflags & ZFS_PROJINHERIT) szp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); VERIFY0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error) goto commit; /* * Unlink the target. */ if (tzp) { int tzflg = zflg; if (rflags & RENAME_EXCHANGE) { /* This inode will be re-linked soon. */ tzflg |= ZRENAMING; tzp->z_pflags |= ZFS_AV_MODIFIED; if (sdzp->z_pflags & ZFS_PROJINHERIT) tzp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(tzp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&tzp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); } error = zfs_link_destroy(tdl, tzp, tx, tzflg, NULL); if (error) goto commit_link_szp; } /* * Create the new target links: * * We always link the target. * * RENAME_EXCHANGE: Link the old target to the source. * * RENAME_WHITEOUT: Create a whiteout inode in-place of the source. */ error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error) { /* * If we have removed the existing target, a subsequent call to * zfs_link_create() to add back the same entry, but with a new * dnode (szp), should not fail. */ ASSERT3P(tzp, ==, NULL); goto commit_link_tzp; } switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { case RENAME_EXCHANGE: error = zfs_link_create(sdl, tzp, tx, ZRENAMING); /* * The same argument as zfs_link_create() failing for * szp applies here, since the source directory must * have had an entry we are replacing. */ ASSERT0(error); if (error) goto commit_unlink_td_szp; break; case RENAME_WHITEOUT: zfs_mknode(sdzp, wo_vap, tx, cr, 0, &wzp, &acl_ids); error = zfs_link_create(sdl, wzp, tx, ZNEW); if (error) { zfs_znode_delete(wzp, tx); remove_inode_hash(ZTOI(wzp)); goto commit_unlink_td_szp; } break; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); switch (rflags & (RENAME_EXCHANGE | RENAME_WHITEOUT)) { case RENAME_EXCHANGE: zfs_log_rename_exchange(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); break; case RENAME_WHITEOUT: zfs_log_rename_whiteout(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp, wzp); break; default: ASSERT0(rflags & ~RENAME_NOREPLACE); zfs_log_rename(zilog, tx, (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); break; } commit: dmu_tx_commit(tx); out: if (have_acl) zfs_acl_ids_free(&acl_ids); zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (sdzp != tdzp) zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zrele(szp); if (wzp) { zfs_znode_update_vfs(wzp); zrele(wzp); } if (tzp) { zfs_znode_update_vfs(tzp); zrele(tzp); } if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); zfs_exit(zfsvfs, FTAG); return (error); /* * Clean-up path for broken link state. * * At this point we are in a (very) bad state, so we need to do our * best to correct the state. In particular, all of the nlinks are * wrong because we were destroying and creating links with ZRENAMING. * * In some form, all of these operations have to resolve the state: * * * link_destroy() *must* succeed. Fortunately, this is very likely * since we only just created it. * * * link_create()s are allowed to fail (though they shouldn't because * we only just unlinked them and are putting the entries back * during clean-up). But if they fail, we can just forcefully drop * the nlink value to (at the very least) avoid broken nlink values * -- though in the case of non-empty directories we will have to * panic (otherwise we'd have a leaked directory with a broken ..). */ commit_unlink_td_szp: VERIFY0(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL)); commit_link_tzp: if (tzp) { if (zfs_link_create(tdl, tzp, tx, ZRENAMING)) VERIFY0(zfs_drop_nlink(tzp, tx, NULL)); } commit_link_szp: if (zfs_link_create(sdl, szp, tx, ZRENAMING)) VERIFY0(zfs_drop_nlink(szp, tx, NULL)); goto commit; } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dzp - Directory to contain new symbolic link. * name - Name of directory entry in dip. * vap - Attributes of new entry. * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags * mnt_ns - user namespace of the mount * * OUT: zpp - Znode for new symbolic link. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated */ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags, zidmap_t *mnt_ns) { znode_t *zp; zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; boolean_t waited = B_FALSE; ASSERT(S_ISLNK(vap->va_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, dzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids, mnt_ns)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } top: *zpp = NULL; /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); zfs_exit(zfsvfs, FTAG); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr, mnt_ns))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datasets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); } else { zrele(zp); } zfs_exit(zfsvfs, FTAG); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by ip. * * IN: ip - inode of symbolic link * uio - structure to contain the link path. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated */ int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) { (void) cr; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); zfs_exit(zfsvfs, FTAG); return (error); } /* * Insert a new entry into directory tdzp referencing szp. * * IN: tdzp - Directory to contain new entry. * szp - znode of new entry. * name - name of new entry. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdzp - ctime|mtime updated * szp - ctime updated */ int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags) { struct inode *sip = ZTOI(szp); znode_t *tzp; zfsvfs_t *zfsvfs = ZTOZSB(tdzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zf = ZNEW; uint64_t parent; uid_t owner; boolean_t waited = B_FALSE; boolean_t is_tmpfile = 0; uint64_t txg; is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); if ((error = zfs_enter_verify_zp(zfsvfs, tdzp, FTAG)) != 0) return (error); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (S_ISDIR(sip->i_mode)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_verify_zp(szp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EXDEV)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if (parent == zfsvfs->z_shares_dir) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); if (error) { zfs_exit(zfsvfs, FTAG); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); if (is_tmpfile) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); zfs_exit(zfsvfs, FTAG); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ if (is_tmpfile) szp->z_unlinked = B_FALSE; error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; /* * tmpfile is created to be in z_unlinkedobj, so remove it. * Also, we don't log in ZIL, because all previous file * operation on the tmpfile are ignored by ZIL. Instead we * always wait for txg to sync to make sure all previous * operation are sync safe. */ if (is_tmpfile) { VERIFY(zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } } else if (is_tmpfile) { /* restore z_unlinked since when linking failed */ szp->z_unlinked = B_TRUE; } txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zfs_exit(zfsvfs, FTAG); return (error); } static void zfs_putpage_sync_commit_cb(void *arg) { struct page *pp = arg; ClearPageError(pp); end_page_writeback(pp); } static void zfs_putpage_async_commit_cb(void *arg) { struct page *pp = arg; znode_t *zp = ITOZ(pp->mapping->host); ClearPageError(pp); end_page_writeback(pp); atomic_dec_32(&zp->z_async_writes_cnt); } /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. * * IN: ip - page mapped for inode. * pp - page to push (page is locked) * wbc - writeback control data * for_sync - does the caller intend to wait synchronously for the * page writeback to complete? * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated */ int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, boolean_t for_sync) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t offset; loff_t pgoff; unsigned int pglen; dmu_tx_t *tx; caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ASSERT(PageLocked(pp)); pgoff = page_offset(pp); /* Page byte-offset in file */ offset = i_size_read(ip); /* File length in bytes */ pglen = MIN(PAGE_SIZE, /* Page length in bytes */ P2ROUNDUP(offset, PAGE_SIZE)-pgoff); /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); zfs_exit(zfsvfs, FTAG); return (0); } /* Truncate page length to end of file */ if (pgoff + pglen > offset) pglen = offset - pgoff; #if 0 /* * FIXME: Allow mmap writes past its quota. The correct fix * is to register a page_mkwrite() handler to count the page * against its quota when it is about to be dirtied. */ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, KUID_TO_SUID(ip->i_uid)) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, KGID_TO_SGID(ip->i_gid)) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { err = EDQUOT; } #endif /* * The ordering here is critical and must adhere to the following * rules in order to avoid deadlocking in either zfs_read() or * zfs_free_range() due to a lock inversion. * * 1) The page must be unlocked prior to acquiring the range lock. * This is critical because zfs_read() calls find_lock_page() * which may block on the page lock while holding the range lock. * * 2) Before setting or clearing write back on a page the range lock * must be held in order to prevent a lock inversion with the * zfs_free_range() function. * * This presents a problem because upon entering this function the * page lock is already held. To safely acquire the range lock the * page lock must be dropped. This creates a window where another * process could truncate, invalidate, dirty, or write out the page. * * Therefore, after successfully reacquiring the range and page locks * the current page state is checked. In the common case everything * will be as is expected and it can be written out. However, if * the page state has changed it must be handled accordingly. */ mapping = pp->mapping; redirty_page_for_writepage(wbc, pp); unlock_page(pp); zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, pgoff, pglen, RL_WRITER); lock_page(pp); /* Page mapping changed or it was no longer dirty, we're done */ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (0); } /* Another process started write block if required */ if (PageWriteback(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Speed up any non-sync page writebacks since * they may take several seconds to complete. * Refer to the comment in zpl_fsync() for details. */ if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { zil_commit(zfsvfs->z_log, zp->z_id); } if (PageWriteback(pp)) #ifdef HAVE_PAGEMAP_FOLIO_WAIT_BIT folio_wait_bit(page_folio(pp), PG_writeback); #else wait_on_page_bit(pp, PG_writeback); #endif } zfs_exit(zfsvfs, FTAG); return (0); } /* Clear the dirty flag the required locks are held */ if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (0); } /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; if (!for_sync) atomic_inc_32(&zp->z_async_writes_cnt); set_page_writeback(pp); unlock_page(pp); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO filemap_dirty_folio(page_mapping(pp), page_folio(pp)); #else __set_page_dirty_nobuffers(pp); #endif ClearPageError(pp); end_page_writeback(pp); if (!for_sync) atomic_dec_32(&zp->z_async_writes_cnt); zfs_rangelock_exit(lr); zfs_exit(zfsvfs, FTAG); return (err); } va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ tmp_ts = zpl_inode_get_mtime(ip); ZFS_TIME_ENCODE(&tmp_ts, mtime); tmp_ts = zpl_inode_get_ctime(ip); ZFS_TIME_ENCODE(&tmp_ts, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); boolean_t commit = B_FALSE; if (wbc->sync_mode != WB_SYNC_NONE) { /* * Note that this is rarely called under writepages(), because * writepages() normally handles the entire commit for * performance reasons. */ commit = B_TRUE; } else if (!for_sync && atomic_load_32(&zp->z_sync_writes_cnt) > 0) { /* * If the caller does not intend to wait synchronously * for this page writeback to complete and there are active * synchronous calls on this file, do a commit so that * the latter don't accidentally end up waiting for * our writeback to complete. Refer to the comment in * zpl_fsync() (when HAVE_FSYNC_RANGE is defined) for details. */ commit = B_TRUE; } zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, commit, B_FALSE, for_sync ? zfs_putpage_sync_commit_cb : zfs_putpage_async_commit_cb, pp); dmu_tx_commit(tx); zfs_rangelock_exit(lr); if (commit) zil_commit(zfsvfs->z_log, zp->z_id); dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, pglen); zfs_exit(zfsvfs, FTAG); return (err); } /* * Update the system attributes when the inode has been dirtied. For the * moment we only update the mode, atime, mtime, and ctime. */ int zfs_dirty_inode(struct inode *ip, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); #ifdef I_DIRTY_TIME /* * This is the lazytime semantic introduced in Linux 4.0 * This flag will only be called from update_time when lazytime is set. * (Note, I_DIRTY_SYNC will also set if not lazytime) * Fortunately mtime and ctime are managed within ZFS itself, so we * only need to dirty atime. */ if (flags == I_DIRTY_TIME) { zp->z_atime_dirty = B_TRUE; goto out; } #endif tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } mutex_enter(&zp->z_lock); zp->z_atime_dirty = B_FALSE; SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ tmp_ts = zpl_inode_get_atime(ip); ZFS_TIME_ENCODE(&tmp_ts, atime); tmp_ts = zpl_inode_get_mtime(ip); ZFS_TIME_ENCODE(&tmp_ts, mtime); tmp_ts = zpl_inode_get_ctime(ip); ZFS_TIME_ENCODE(&tmp_ts, ctime); mode = ip->i_mode; zp->z_mode = mode; error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); mutex_exit(&zp->z_lock); dmu_tx_commit(tx); out: zfs_exit(zfsvfs, FTAG); return (error); } void zfs_inactive(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. rollback */ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { need_unlock = 1; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); } if (zp->z_sa_hdl == NULL) { if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); return; } if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { inode_timespec_t tmp_atime; tmp_atime = zpl_inode_get_atime(ip); ZFS_TIME_ENCODE(&tmp_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); zp->z_atime_dirty = B_FALSE; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * Fill pages with data from the disk. */ static int zfs_fillpage(struct inode *ip, struct page *pp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; ASSERT3U(io_off, <, i_size); if (io_off + io_len > i_size) io_len = i_size - io_off; void *va = kmap(pp); int error = dmu_read(zfsvfs->z_os, zp->z_id, io_off, io_len, va, DMU_READ_PREFETCH); if (io_len != PAGE_SIZE) memset((char *)va + io_len, 0, PAGE_SIZE - io_len); kunmap(pp); if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); - if (!PagePrivate(pp)) { - /* - * Set private bit so page migration will wait for us to - * finish writeback before calling migrate_folio(). - */ - SetPagePrivate(pp); - get_page(pp); - } } return (error); } /* * Uses zfs_fillpage to read data from the file and fill the page. * * IN: ip - inode of file to get data from. * pp - page to read * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ int zfs_getpage(struct inode *ip, struct page *pp) { zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *zp = ITOZ(ip); int error; loff_t i_size = i_size_read(ip); u_offset_t io_off = page_offset(pp); size_t io_len = PAGE_SIZE; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); ASSERT3U(io_off, <, i_size); if (io_off + io_len > i_size) io_len = i_size - io_off; /* * It is important to hold the rangelock here because it is possible * a Direct I/O write or block clone might be taking place at the same * time that a page is being faulted in through filemap_fault(). With * Direct I/O writes and block cloning db->db_data will be set to NULL * with dbuf_clear_data() in dmu_buif_will_clone_or_dio(). If the * rangelock is not held, then there is a race between faulting in a * page and writing out a Direct I/O write or block cloning. Without * the rangelock a NULL pointer dereference can occur in * dmu_read_impl() for db->db_data during the mempcy operation when * zfs_fillpage() calls dmu_read(). */ zfs_locked_range_t *lr = zfs_rangelock_tryenter(&zp->z_rangelock, io_off, io_len, RL_READER); if (lr == NULL) { /* * It is important to drop the page lock before grabbing the * rangelock to avoid another deadlock between here and * zfs_write() -> update_pages(). update_pages() holds both the * rangelock and the page lock. */ get_page(pp); unlock_page(pp); lr = zfs_rangelock_enter(&zp->z_rangelock, io_off, io_len, RL_READER); lock_page(pp); put_page(pp); } error = zfs_fillpage(ip, pp); zfs_rangelock_exit(lr); if (error == 0) dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, PAGE_SIZE); zfs_exit(zfsvfs, FTAG); return (error); } /* * Check ZFS specific permissions to memory map a section of a file. * * IN: ip - inode of the file to mmap * off - file offset * addrp - start address in memory region * len - length of memory region * vm_flags- address flags * * RETURN: 0 if success * error code if failure */ int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags) { (void) addrp; znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if ((vm_flags & VM_WRITE) && (vm_flags & VM_SHARED) && (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENXIO)); } zfs_exit(zfsvfs, FTAG); return (0); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: zp - znode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * zp - ctime|mtime updated */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { (void) offset; zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); if (cmd != F_FREESP) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr, zfs_init_idmap))) { zfs_exit(zfsvfs, FTAG); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); zfs_exit(zfsvfs, FTAG); return (error); } int zfs_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; if ((error = zfs_enter(zfsvfs, FTAG)) != 0) return (error); if (fidp->fid_len < SHORT_FID_LEN) { fidp->fid_len = SHORT_FID_LEN; zfs_exit(zfsvfs, FTAG); return (SET_ERROR(ENOSPC)); } if ((error = zfs_verify_zp(zp)) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { zfs_exit(zfsvfs, FTAG); return (error); } gen = (uint32_t)gen64; size = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); zfs_exit(zfsvfs, FTAG); return (0); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); EXPORT_SYMBOL(zfs_symlink); EXPORT_SYMBOL(zfs_readlink); EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); /* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); #endif diff --git a/module/os/linux/zfs/zfs_znode_os.c b/module/os/linux/zfs/zfs_znode_os.c index bc1e17f086d9..bbaca2f58394 100644 --- a/module/os/linux/zfs/zfs_znode_os.c +++ b/module/os/linux/zfs/zfs_znode_os.c @@ -1,1983 +1,1975 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2007 Jeremy Teo */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_prop.h" #include "zfs_comutil.h" static kmem_cache_t *znode_cache = NULL; static kmem_cache_t *znode_hold_cache = NULL; unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; /* * This is used by the test suite so that it can delay znodes from being * freed in order to inspect the unlinked set. */ static int zfs_unlink_suspend_progress = 0; /* * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on * z_rangelock. It will modify the offset and length of the lock to reflect * znode-specific information, and convert RL_APPEND to RL_WRITER. This is * called with the rangelock_t's rl_lock held, which avoids races. */ static void zfs_rangelock_cb(zfs_locked_range_t *new, void *arg) { znode_t *zp = arg; /* * If in append mode, convert to writer and lock starting at the * current end of file. */ if (new->lr_type == RL_APPEND) { new->lr_offset = zp->z_size; new->lr_type = RL_WRITER; } /* * If we need to grow the block size then lock the whole file range. */ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length); if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) { new->lr_offset = 0; new->lr_length = UINT64_MAX; } } static int zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_t *zp = buf; inode_init_once(ZTOI(zp)); list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL); zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp); zp->z_dirlocks = NULL; zp->z_acl_cached = NULL; zp->z_xattr_cached = NULL; zp->z_xattr_parent = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; return (0); } static void zfs_znode_cache_destructor(void *buf, void *arg) { (void) arg; znode_t *zp = buf; ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); rw_destroy(&zp->z_xattr_lock); zfs_rangelock_fini(&zp->z_rangelock); ASSERT3P(zp->z_dirlocks, ==, NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); ASSERT0(atomic_load_32(&zp->z_sync_writes_cnt)); ASSERT0(atomic_load_32(&zp->z_async_writes_cnt)); } static int zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags) { (void) arg, (void) kmflags; znode_hold_t *zh = buf; mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL); zh->zh_refcount = 0; return (0); } static void zfs_znode_hold_cache_destructor(void *buf, void *arg) { (void) arg; znode_hold_t *zh = buf; mutex_destroy(&zh->zh_lock); } void zfs_znode_init(void) { /* * Initialize zcache. The KMC_SLAB hint is used in order that it be * backed by kmalloc() when on the Linux slab in order that any * wait_on_bit() operations on the related inode operate properly. */ ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB | KMC_RECLAIMABLE); ASSERT(znode_hold_cache == NULL); znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache", sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor, zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0); } void zfs_znode_fini(void) { /* * Cleanup zcache */ if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; if (znode_hold_cache) kmem_cache_destroy(znode_hold_cache); znode_hold_cache = NULL; } /* * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to * serialize access to a znode and its SA buffer while the object is being * created or destroyed. This kind of locking would normally reside in the * znode itself but in this case that's impossible because the znode and SA * buffer may not yet exist. Therefore the locking is handled externally * with an array of mutexes and AVLs trees which contain per-object locks. * * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted * in to the correct AVL tree and finally the per-object lock is held. In * zfs_znode_hold_exit() the process is reversed. The per-object lock is * released, removed from the AVL tree and destroyed if there are no waiters. * * This scheme has two important properties: * * 1) No memory allocations are performed while holding one of the z_hold_locks. * This ensures evict(), which can be called from direct memory reclaim, will * never block waiting on a z_hold_locks which just happens to have hashed * to the same index. * * 2) All locks used to serialize access to an object are per-object and never * shared. This minimizes lock contention without creating a large number * of dedicated locks. * * On the downside it does require znode_lock_t structures to be frequently * allocated and freed. However, because these are backed by a kmem cache * and very short lived this cost is minimal. */ int zfs_znode_hold_compare(const void *a, const void *b) { const znode_hold_t *zh_a = (const znode_hold_t *)a; const znode_hold_t *zh_b = (const znode_hold_t *)b; return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj)); } static boolean_t __maybe_unused zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t held; search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); held = (zh && MUTEX_HELD(&zh->zh_lock)) ? B_TRUE : B_FALSE; mutex_exit(&zfsvfs->z_hold_locks[i]); return (held); } znode_hold_t * zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) { znode_hold_t *zh, *zh_new, search; int i = ZFS_OBJ_HASH(zfsvfs, obj); boolean_t found = B_FALSE; zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); search.zh_obj = obj; mutex_enter(&zfsvfs->z_hold_locks[i]); zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); if (likely(zh == NULL)) { zh = zh_new; zh->zh_obj = obj; avl_add(&zfsvfs->z_hold_trees[i], zh); } else { ASSERT3U(zh->zh_obj, ==, obj); found = B_TRUE; } zh->zh_refcount++; ASSERT3S(zh->zh_refcount, >, 0); mutex_exit(&zfsvfs->z_hold_locks[i]); if (found == B_TRUE) kmem_cache_free(znode_hold_cache, zh_new); ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); mutex_enter(&zh->zh_lock); return (zh); } void zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) { int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); boolean_t remove = B_FALSE; ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); mutex_exit(&zh->zh_lock); mutex_enter(&zfsvfs->z_hold_locks[i]); ASSERT3S(zh->zh_refcount, >, 0); if (--zh->zh_refcount == 0) { avl_remove(&zfsvfs->z_hold_trees[i], zh); remove = B_TRUE; } mutex_exit(&zfsvfs->z_hold_locks[i]); if (remove == B_TRUE) kmem_cache_free(znode_hold_cache, zh); } dev_t zfs_cmpldev(uint64_t dev) { return (dev); } static void zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) { ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); mutex_enter(&zp->z_lock); ASSERT(zp->z_sa_hdl == NULL); ASSERT(zp->z_acl_cached == NULL); if (sa_hdl == NULL) { VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, SA_HDL_SHARED, &zp->z_sa_hdl)); } else { zp->z_sa_hdl = sa_hdl; sa_set_userp(sa_hdl, zp); } zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; mutex_exit(&zp->z_lock); } void zfs_znode_dmu_fini(znode_t *zp) { ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; } /* * Called by new_inode() to allocate a new inode. */ int zfs_inode_alloc(struct super_block *sb, struct inode **ip) { znode_t *zp; zp = kmem_cache_alloc(znode_cache, KM_SLEEP); *ip = ZTOI(zp); return (0); } /* * Called in multiple places when an inode should be destroyed. */ void zfs_inode_destroy(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); mutex_enter(&zfsvfs->z_znodes_lock); if (list_link_active(&zp->z_link_node)) { list_remove(&zfsvfs->z_all_znodes, zp); } mutex_exit(&zfsvfs->z_znodes_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } kmem_cache_free(znode_cache, zp); } static void zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) { uint64_t rdev = 0; switch (ip->i_mode & S_IFMT) { case S_IFREG: ip->i_op = &zpl_inode_operations; ip->i_fop = &zpl_file_operations; ip->i_mapping->a_ops = &zpl_address_space_operations; break; case S_IFDIR: ip->i_op = &zpl_dir_inode_operations; ip->i_fop = &zpl_dir_file_operations; ITOZ(ip)->z_zn_prefetch = B_TRUE; break; case S_IFLNK: ip->i_op = &zpl_symlink_inode_operations; break; /* * rdev is only stored in a SA only for device files. */ case S_IFCHR: case S_IFBLK: (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, sizeof (rdev)); zfs_fallthrough; case S_IFIFO: case S_IFSOCK: init_special_inode(ip, ip->i_mode, rdev); ip->i_op = &zpl_special_inode_operations; break; default: zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", (u_longlong_t)ip->i_ino, ip->i_mode); /* Assume the inode is a file and attempt to continue */ ip->i_mode = S_IFREG | 0644; ip->i_op = &zpl_inode_operations; ip->i_fop = &zpl_file_operations; ip->i_mapping->a_ops = &zpl_address_space_operations; break; } } static void zfs_set_inode_flags(znode_t *zp, struct inode *ip) { /* * Linux and Solaris have different sets of file attributes, so we * restrict this conversion to the intersection of the two. */ unsigned int flags = 0; if (zp->z_pflags & ZFS_IMMUTABLE) flags |= S_IMMUTABLE; if (zp->z_pflags & ZFS_APPENDONLY) flags |= S_APPEND; inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); } /* * Update the embedded inode given the znode. */ void zfs_znode_update_vfs(znode_t *zp) { struct inode *ip; uint32_t blksize; u_longlong_t i_blocks; ASSERT(zp != NULL); ip = ZTOI(zp); /* Skip .zfs control nodes which do not exist on disk. */ if (zfsctl_is_node(ip)) return; dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); spin_lock(&ip->i_lock); ip->i_mode = zp->z_mode; ip->i_blocks = i_blocks; i_size_write(ip, zp->z_size); spin_unlock(&ip->i_lock); } /* * Construct a znode+inode and initialize. * * This does not do a call to dmu_set_user() that is * up to the caller to do, in case you don't want to * return the znode */ static znode_t * zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, dmu_object_type_t obj_type, sa_handle_t *hdl) { znode_t *zp; struct inode *ip; uint64_t mode; uint64_t parent; uint64_t tmp_gen; uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; ASSERT(zfsvfs != NULL); ip = new_inode(zfsvfs->z_sb); if (ip == NULL) return (NULL); zp = ITOZ(ip); ASSERT(zp->z_dirlocks == NULL); ASSERT3P(zp->z_acl_cached, ==, NULL); ASSERT3P(zp->z_xattr_cached, ==, NULL); zp->z_unlinked = B_FALSE; zp->z_atime_dirty = B_FALSE; zp->z_is_ctldir = B_FALSE; zp->z_suspended = B_FALSE; zp->z_sa_hdl = NULL; zp->z_mapcnt = 0; zp->z_id = db->db_object; zp->z_blksz = blksz; zp->z_seq = 0x7A4653; zp->z_sync_cnt = 0; zp->z_sync_writes_cnt = 0; zp->z_async_writes_cnt = 0; zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || (dmu_objset_projectquota_enabled(zfsvfs->z_os) && (zp->z_pflags & ZFS_PROJID) && sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { if (hdl == NULL) sa_handle_destroy(zp->z_sa_hdl); zp->z_sa_hdl = NULL; goto error; } zp->z_projid = projid; zp->z_mode = ip->i_mode = mode; ip->i_generation = (uint32_t)tmp_gen; ip->i_blkbits = SPA_MINBLOCKSHIFT; set_nlink(ip, (uint32_t)links); zfs_uid_write(ip, z_uid); zfs_gid_write(ip, z_gid); zfs_set_inode_flags(zp, ip); /* Cache the xattr parent id */ if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; ZFS_TIME_DECODE(&tmp_ts, atime); zpl_inode_set_atime_to_ts(ip, tmp_ts); ZFS_TIME_DECODE(&tmp_ts, mtime); zpl_inode_set_mtime_to_ts(ip, tmp_ts); ZFS_TIME_DECODE(&tmp_ts, ctime); zpl_inode_set_ctime_to_ts(ip, tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; zfs_znode_update_vfs(zp); zfs_inode_set_ops(zfsvfs, ip); /* * The only way insert_inode_locked() can fail is if the ip->i_ino * number is already hashed for this super block. This can never * happen because the inode numbers map 1:1 with the object numbers. * * Exceptions include rolling back a mounted file system, either * from the zfs rollback or zfs recv command. * * Active inodes are unhashed during the rollback, but since zrele * can happen asynchronously, we can't guarantee they've been * unhashed. This can cause hash collisions in unlinked drain * processing so do not hash unlinked znodes. */ if (links > 0) VERIFY3S(insert_inode_locked(ip), ==, 0); mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); if (links > 0) unlock_new_inode(ip); return (zp); error: iput(ip); return (NULL); } /* * Safely mark an inode dirty. Inodes which are part of a read-only * file system or snapshot may not be dirtied. */ void zfs_mark_inode_dirty(struct inode *ip) { zfsvfs_t *zfsvfs = ITOZSB(ip); if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return; mark_inode_dirty(ip); } static uint64_t empty_xattr; static uint64_t pad[4]; static zfs_acl_phys_t acl_phys; /* * Create a new DMU object to hold a zfs znode. * * IN: dzp - parent directory for new znode * vap - file attributes for new znode * tx - dmu transaction id for zap operations * cr - credentials of caller * flag - flags: * IS_ROOT_NODE - new object will be root * IS_TMPFILE - new object is of O_TMPFILE * IS_XATTR - new object is an attribute * acl_ids - ACL related attributes * * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) * */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) { uint64_t crtime[2], atime[2], mtime[2], ctime[2]; uint64_t mode, size, links, parent, pflags; uint64_t projid = ZFS_DEFAULT_PROJID; uint64_t rdev = 0; zfsvfs_t *zfsvfs = ZTOZSB(dzp); dmu_buf_t *db; inode_timespec_t now; uint64_t gen, obj; int bonuslen; int dnodesize; sa_handle_t *sa_hdl; dmu_object_type_t obj_type; sa_bulk_attr_t *sa_attrs; int cnt = 0; zfs_acl_locator_cb_t locate = { 0 }; znode_hold_t *zh; if (zfsvfs->z_replay) { obj = vap->va_nodeid; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ dnodesize = vap->va_fsid; /* ditto */ } else { obj = 0; gethrestime(&now); gen = dmu_tx_get_txg(tx); dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); } if (dnodesize == 0) dnodesize = DNODE_MIN_SIZE; obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; bonuslen = (obj_type == DMU_OT_SA) ? DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; /* * Create a new DMU object. */ /* * There's currently no mechanism for pre-reading the blocks that will * be needed to allocate a new object, so we accept the small chance * that there will be an i/o error and we will fail one of the * assertions below. */ if (S_ISDIR(vap->va_mode)) { if (zfsvfs->z_replay) { VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx)); } else { obj = zap_create_norm_dnsize(zfsvfs->z_os, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, obj_type, bonuslen, dnodesize, tx); } } else { if (zfsvfs->z_replay) { VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx)); } else { obj = dmu_object_alloc_dnsize(zfsvfs->z_os, DMU_OT_PLAIN_FILE_CONTENTS, 0, obj_type, bonuslen, dnodesize, tx); } } zh = zfs_znode_hold_enter(zfsvfs, obj); VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); /* * If this is the root, fix up the half-initialized parent pointer * to reference the just-allocated physical data area. */ if (flag & IS_ROOT_NODE) { dzp->z_id = obj; } /* * If parent is an xattr, so am I. */ if (dzp->z_pflags & ZFS_XATTR) { flag |= IS_XATTR; } if (zfsvfs->z_use_fuids) pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; else pflags = 0; if (S_ISDIR(vap->va_mode)) { size = 2; /* contents ("." and "..") */ links = 2; } else { size = 0; links = (flag & IS_TMPFILE) ? 0 : 1; } if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) rdev = vap->va_rdev; parent = dzp->z_id; mode = acl_ids->z_mode; if (flag & IS_XATTR) pflags |= ZFS_XATTR; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { /* * With ZFS_PROJID flag, we can easily know whether there is * project ID stored on disk or not. See zfs_space_delta_cb(). */ if (obj_type != DMU_OT_ZNODE && dmu_objset_projectquota_enabled(zfsvfs->z_os)) pflags |= ZFS_PROJID; /* * Inherit project ID from parent if required. */ projid = zfs_inherit_projid(dzp); if (dzp->z_pflags & ZFS_PROJINHERIT) pflags |= ZFS_PROJINHERIT; } /* * No execs denied will be determined when zfs_mode_compute() is called. */ pflags |= acl_ids->z_aclp->z_hints & (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); ZFS_TIME_ENCODE(&now, crtime); ZFS_TIME_ENCODE(&now, ctime); if (vap->va_mask & ATTR_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, atime); } else { ZFS_TIME_ENCODE(&now, atime); } if (vap->va_mask & ATTR_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); } else { ZFS_TIME_ENCODE(&now, mtime); } /* Now add in all of the "SA" attributes */ VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, &sa_hdl)); /* * Setup the array of attributes to be replaced/set on the new file * * order for DMU_OT_ZNODE is critical since it needs to be constructed * in the old znode_phys_t format. Don't change this ordering */ sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); } else { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); } SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, &empty_xattr, 8); } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && pflags & ZFS_PROJID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); } if (obj_type == DMU_OT_ZNODE || (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); } if (obj_type == DMU_OT_ZNODE) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &pflags, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, &acl_ids->z_fuid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, &acl_ids->z_fgid, 8); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, sizeof (uint64_t) * 4); SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, &acl_phys, sizeof (zfs_acl_phys_t)); } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, &acl_ids->z_aclp->z_acl_count, 8); locate.cb_aclp = acl_ids->z_aclp; SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), zfs_acl_data_locator, &locate, acl_ids->z_aclp->z_acl_bytes); mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, acl_ids->z_fuid, acl_ids->z_fgid); } VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); if (!(flag & IS_ROOT_NODE)) { /* * The call to zfs_znode_alloc() may fail if memory is low * via the call path: alloc_inode() -> inode_init_always() -> * security_inode_alloc() -> inode_alloc_security(). Since * the existing code is written such that zfs_mknode() can * not fail retry until sufficient memory has been reclaimed. */ do { *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); } while (*zpp == NULL); VERIFY(*zpp != NULL); VERIFY(dzp != NULL); } else { /* * If we are creating the root node, the "parent" we * passed in is the znode for the root. */ *zpp = dzp; (*zpp)->z_sa_hdl = sa_hdl; } (*zpp)->z_pflags = pflags; (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; (*zpp)->z_dnodesize = dnodesize; (*zpp)->z_projid = projid; if (obj_type == DMU_OT_ZNODE || acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); } kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); zfs_znode_hold_exit(zfsvfs, zh); } /* * Update in-core attributes. It is assumed the caller will be doing an * sa_bulk_update to push the changes out. */ void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) { xoptattr_t *xoap; boolean_t update_inode = B_FALSE; xoap = xva_getxoptattr(xvap); ASSERT(xoap); if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) { uint64_t times[2]; ZFS_TIME_ENCODE(&xoap->xoa_createtime, times); (void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), ×, sizeof (times), tx); XVA_SET_RTN(xvap, XAT_CREATETIME); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_IMMUTABLE); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_APPENDONLY); update_inode = B_TRUE; } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED, xoap->xoa_av_quarantined, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { zfs_sa_set_scanstamp(zp, xvap, tx); XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit, zp->z_pflags, tx); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (update_inode) zfs_set_inode_flags(zp, ZTOI(zp)); } int zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) { dmu_object_info_t doi; dmu_buf_t *db; znode_t *zp; znode_hold_t *zh; int err; sa_handle_t *hdl; *zpp = NULL; again: zh = zfs_znode_hold_enter(zfsvfs, obj_num); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } hdl = dmu_buf_get_user(db); if (hdl != NULL) { zp = sa_get_userdata(hdl); /* * Since "SA" does immediate eviction we * should never find a sa handle that doesn't * know about the znode. */ ASSERT3P(zp, !=, NULL); mutex_enter(&zp->z_lock); ASSERT3U(zp->z_id, ==, obj_num); /* * If zp->z_unlinked is set, the znode is already marked * for deletion and should not be discovered. Check this * after checking igrab() due to fsetxattr() & O_TMPFILE. * * If igrab() returns NULL the VFS has independently * determined the inode should be evicted and has * called iput_final() to start the eviction process. * The SA handle is still valid but because the VFS * requires that the eviction succeed we must drop * our locks and references to allow the eviction to * complete. The zfs_zget() may then be retried. * * This unlikely case could be optimized by registering * a sops->drop_inode() callback. The callback would * need to detect the active SA hold thereby informing * the VFS that this inode should not be evicted. */ if (igrab(ZTOI(zp)) == NULL) { if (zp->z_unlinked) err = SET_ERROR(ENOENT); else err = SET_ERROR(EAGAIN); } else { *zpp = zp; err = 0; } mutex_exit(&zp->z_lock); sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); if (err == EAGAIN) { /* inode might need this to finish evict */ cond_resched(); goto again; } return (err); } /* * Not found create new znode/vnode but only if file exists. * * There is a small window where zfs_vget() could * find this object while a file create is still in * progress. This is checked for in zfs_znode_alloc() * * if zfs_znode_alloc() fails it will drop the hold on the * bonus buffer. */ zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, doi.doi_bonus_type, NULL); if (zp == NULL) { err = SET_ERROR(ENOENT); } else { *zpp = zp; } zfs_znode_hold_exit(zfsvfs, zh); return (err); } int zfs_rezget(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_object_info_t doi; dmu_buf_t *db; uint64_t obj_num = zp->z_id; uint64_t mode; uint64_t links; sa_bulk_attr_t bulk[11]; int err; int count = 0; uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; /* * skip ctldir, otherwise they will always get invalidated. This will * cause funny behaviour for the mounted snapdirs. Especially for * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent * anyone automount it again as long as someone is still using the * detached mount. */ if (zp->z_is_ctldir) return (0); zh = zfs_znode_hold_enter(zfsvfs, obj_num); mutex_enter(&zp->z_acl_lock); if (zp->z_acl_cached) { zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = NULL; } mutex_exit(&zp->z_acl_lock); rw_enter(&zp->z_xattr_lock, RW_WRITER); if (zp->z_xattr_cached) { nvlist_free(zp->z_xattr_cached); zp->z_xattr_cached = NULL; } rw_exit(&zp->z_xattr_lock); ASSERT(zp->z_sa_hdl == NULL); err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); if (err) { zfs_znode_hold_exit(zfsvfs, zh); return (err); } dmu_object_info_from_db(db, &doi); if (doi.doi_bonus_type != DMU_OT_SA && (doi.doi_bonus_type != DMU_OT_ZNODE || (doi.doi_bonus_type == DMU_OT_ZNODE && doi.doi_bonus_size < sizeof (znode_phys_t)))) { sa_buf_rele(db, NULL); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EINVAL)); } zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); /* reload cached values */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, sizeof (gen)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, sizeof (links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, sizeof (z_uid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, sizeof (z_gid)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, sizeof (mode)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &btime, 16); if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8); if (err != 0 && err != ENOENT) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(err)); } } zp->z_projid = projid; zp->z_mode = ZTOI(zp)->i_mode = mode; zfs_uid_write(ZTOI(zp), z_uid); zfs_gid_write(ZTOI(zp), z_gid); ZFS_TIME_DECODE(&tmp_ts, atime); zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts); ZFS_TIME_DECODE(&tmp_ts, mtime); zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); ZFS_TIME_DECODE(&tmp_ts, ctime); zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (SET_ERROR(EIO)); } set_nlink(ZTOI(zp), (uint32_t)links); zfs_set_inode_flags(zp, ZTOI(zp)); zp->z_blksz = doi.doi_data_block_size; zp->z_atime_dirty = B_FALSE; zfs_znode_update_vfs(zp); /* * If the file has zero links, then it has been unlinked on the send * side and it must be in the received unlinked set. * We call zfs_znode_dmu_fini() now to prevent any accesses to the * stale data and to prevent automatic removal of the file in * zfs_zinactive(). The file will be removed either when it is removed * on the send side and the next incremental stream is received or * when the unlinked set gets processed. */ zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); if (zp->z_unlinked) zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); return (0); } void zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; uint64_t obj = zp->z_id; uint64_t acl_obj = zfs_external_acl(zp); znode_hold_t *zh; zh = zfs_znode_hold_enter(zfsvfs, obj); if (acl_obj) { VERIFY(!zp->z_is_sa); VERIFY(0 == dmu_object_free(os, acl_obj, tx)); } VERIFY(0 == dmu_object_free(os, obj, tx)); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } void zfs_zinactive(znode_t *zp) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t z_id = zp->z_id; znode_hold_t *zh; ASSERT(zp->z_sa_hdl); /* * Don't allow a zfs_zget() while were trying to release this znode. */ zh = zfs_znode_hold_enter(zfsvfs, z_id); mutex_enter(&zp->z_lock); /* * If this was the last reference to a file with no links, remove * the file from the file system unless the file system is mounted * read-only. That can happen, for example, if the file system was * originally read-write, the file was opened, then unlinked and * the file system was made read-only before the file was finally * closed. The file will remain in the unlinked set. */ if (zp->z_unlinked) { ASSERT(!zfsvfs->z_issnap); if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { mutex_exit(&zp->z_lock); zfs_znode_hold_exit(zfsvfs, zh); zfs_rmnode(zp); return; } } mutex_exit(&zp->z_lock); zfs_znode_dmu_fini(zp); zfs_znode_hold_exit(zfsvfs, zh); } /* * Determine whether the znode's atime must be updated. The logic mostly * duplicates the Linux kernel's relatime_need_update() functionality. * This function is only called if the underlying filesystem actually has * atime updates enabled. */ boolean_t zfs_relatime_need_update(const struct inode *ip) { inode_timespec_t now, tmp_atime, tmp_ts; gethrestime(&now); tmp_atime = zpl_inode_get_atime(ip); /* * In relatime mode, only update the atime if the previous atime * is earlier than either the ctime or mtime or if at least a day * has passed since the last update of atime. */ tmp_ts = zpl_inode_get_mtime(ip); if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); tmp_ts = zpl_inode_get_ctime(ip); if (timespec64_compare(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60) return (B_TRUE); return (B_FALSE); } /* * Prepare to update znode time stamps. * * IN: zp - znode requiring timestamp update * flag - ATTR_MTIME, ATTR_CTIME flags * * OUT: zp - z_seq * mtime - new mtime * ctime - new ctime * * Note: We don't update atime here, because we rely on Linux VFS to do * atime updating. */ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { inode_timespec_t now, tmp_ts; gethrestime(&now); zp->z_seq++; if (flag & ATTR_MTIME) { ZFS_TIME_ENCODE(&now, mtime); ZFS_TIME_DECODE(&tmp_ts, mtime); zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); } } if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); ZFS_TIME_DECODE(&tmp_ts, ctime); zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } } /* * Grow the block size for a file. * * IN: zp - znode of file to free data in. * size - requested block size * tx - open transaction. * * NOTE: this function assumes that the znode is write locked. */ void zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) { int error; u_longlong_t dummy; if (size <= zp->z_blksz) return; /* * If the file size is already greater than the current blocksize, * we will not grow. If there is more than one block in a file, * the blocksize cannot change. */ if (zp->z_blksz && zp->z_size > zp->z_blksz) return; error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, size, 0, tx); if (error == ENOTSUP) return; ASSERT0(error); /* What blocksize did we actually get? */ dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); } /* * Increase the file length * * IN: zp - znode of file to free data in. * end - new end-of-file * * RETURN: 0 on success, error code on failure */ static int zfs_extend(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; uint64_t newblksz; int error; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end <= zp->z_size) { zfs_rangelock_exit(lr); return (0); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); if (end > zp->z_blksz && (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { /* * We are growing the file past the current block size. */ if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); } else { newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); } dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); } else { newblksz = 0; } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } if (newblksz) zfs_grow_blocksize(zp, newblksz, tx); zp->z_size = end; VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), &zp->z_size, sizeof (zp->z_size), tx)); zfs_rangelock_exit(lr); dmu_tx_commit(tx); return (0); } /* * zfs_zero_partial_page - Modeled after update_pages() but * with different arguments and semantics for use by zfs_freesp(). * * Zeroes a piece of a single page cache entry for zp at offset * start and length len. * * Caller must acquire a range lock on the file for the region * being zeroed in order that the ARC and page cache stay in sync. */ static void zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) { struct address_space *mp = ZTOI(zp)->i_mapping; struct page *pp; int64_t off; void *pb; ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); off = start & (PAGE_SIZE - 1); start &= PAGE_MASK; pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); memset(pb + off, 0, len); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); - if (!PagePrivate(pp)) { - /* - * Set private bit so page migration will wait for us to - * finish writeback before calling migrate_folio(). - */ - SetPagePrivate(pp); - get_page(pp); - } unlock_page(pp); put_page(pp); } } /* * Free space in a file. * * IN: zp - znode of file to free data in. * off - start of section to free. * len - length of section to free. * * RETURN: 0 on success, error code on failure */ static int zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) { zfsvfs_t *zfsvfs = ZTOZSB(zp); zfs_locked_range_t *lr; int error; /* * Lock the range being freed. */ lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (off >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } if (off + len > zp->z_size) len = zp->z_size - off; error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); /* * Zero partial page cache entries. This must be done under a * range lock in order to keep the ARC and page cache in sync. */ if (zn_has_cached_data(zp, off, off + len - 1)) { loff_t first_page, last_page, page_len; loff_t first_page_offset, last_page_offset; /* first possible full page in hole */ first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; /* last page of hole */ last_page = (off + len) >> PAGE_SHIFT; /* offset of first_page */ first_page_offset = first_page << PAGE_SHIFT; /* offset of last_page */ last_page_offset = last_page << PAGE_SHIFT; /* truncate whole pages */ if (last_page_offset > first_page_offset) { truncate_inode_pages_range(ZTOI(zp)->i_mapping, first_page_offset, last_page_offset - 1); } /* truncate sub-page ranges */ if (first_page > last_page) { /* entire punched area within a single page */ zfs_zero_partial_page(zp, off, len); } else { /* beginning of punched area at the end of a page */ page_len = first_page_offset - off; if (page_len > 0) zfs_zero_partial_page(zp, off, page_len); /* end of punched area at the beginning of a page */ page_len = off + len - last_page_offset; if (page_len > 0) zfs_zero_partial_page(zp, last_page_offset, page_len); } } zfs_rangelock_exit(lr); return (error); } /* * Truncate a file * * IN: zp - znode of file to free data in. * end - new end-of-file. * * RETURN: 0 on success, error code on failure */ static int zfs_trunc(znode_t *zp, uint64_t end) { zfsvfs_t *zfsvfs = ZTOZSB(zp); dmu_tx_t *tx; zfs_locked_range_t *lr; int error; sa_bulk_attr_t bulk[2]; int count = 0; /* * We will change zp_size, lock the whole file. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); /* * Nothing to do if file already at desired length. */ if (end >= zp->z_size) { zfs_rangelock_exit(lr); return (0); } error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, DMU_OBJECT_END); if (error) { zfs_rangelock_exit(lr); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); zfs_rangelock_exit(lr); return (error); } zp->z_size = end; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, sizeof (zp->z_size)); if (end == 0) { zp->z_pflags &= ~ZFS_SPARSE; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); } VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); dmu_tx_commit(tx); zfs_rangelock_exit(lr); return (0); } /* * Free space in a file * * IN: zp - znode of file to free data in. * off - start of range * len - end of range (0 => EOF) * flag - current file open mode flags. * log - TRUE if this action should be logged * * RETURN: 0 on success, error code on failure */ int zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) { dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(zp); zilog_t *zilog = zfsvfs->z_log; uint64_t mode; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; int error; if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, sizeof (mode))) != 0) return (error); if (off > zp->z_size) { error = zfs_extend(zp, off+len); if (error == 0 && log) goto log; goto out; } if (len == 0) { error = zfs_trunc(zp, off); } else { if ((error = zfs_free_range(zp, off, len)) == 0 && off + len > zp->z_size) error = zfs_extend(zp, off+len); } if (error || !log) goto out; log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT(error == 0); zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); dmu_tx_commit(tx); zfs_znode_update_vfs(zp); error = 0; out: /* * Truncate the page cache - for file truncate operations, use * the purpose-built API for truncations. For punching operations, * the truncation is handled under a range lock in zfs_free_range. */ if (len == 0) truncate_setsize(ZTOI(zp), off); return (error); } void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { struct super_block *sb; zfsvfs_t *zfsvfs; uint64_t moid, obj, sa_obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int size; int error; int i; znode_t *rootzp = NULL; vattr_t vattr; znode_t *zp; zfs_acl_ids_t acl_ids; /* * First attempt to create master node. */ /* * In an empty objset, there are no blocks to read and thus * there can be no i/o errors (which we assert below). */ moid = MASTER_NODE_OBJ; error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, DMU_OT_NONE, 0, tx); ASSERT(error == 0); /* * Set starting attributes. */ version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ uint64_t val; const char *name; ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { if (val < version) version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } ASSERT(error == 0); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) norm = val; else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) sense = val; } ASSERT(version != 0); error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); ASSERT(error == 0); /* * Create zap object used for SA attribute registration */ if (version >= ZPL_VERSION_SA) { sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT(error == 0); } else { sa_obj = 0; } /* * Create a delete queue. */ obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* * Create root znode. Create minimal znode/inode/zfsvfs/sb * to allow zfs_mknode to work. */ vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; vattr.va_mode = S_IFDIR|0755; vattr.va_uid = crgetuid(cr); vattr.va_gid = crgetgid(cr); rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); rootzp->z_unlinked = B_FALSE; rootzp->z_atime_dirty = B_FALSE; rootzp->z_is_sa = USE_SA(version, os); rootzp->z_pflags = 0; zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); zfsvfs->z_os = os; zfsvfs->z_parent = zfsvfs; zfsvfs->z_version = version; zfsvfs->z_use_fuids = USE_FUIDS(version, os); zfsvfs->z_use_sa = USE_SA(version, os); zfsvfs->z_norm = norm; sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); sb->s_fs_info = zfsvfs; ZTOI(rootzp)->i_sb = sb; error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); ASSERT(error == 0); /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); zfsvfs->z_hold_size = size; zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP); zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); for (i = 0; i != size; i++) { avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, cr, NULL, &acl_ids, zfs_init_idmap)); zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); ASSERT3P(zp, ==, rootzp); error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); zfs_acl_ids_free(&acl_ids); atomic_set(&ZTOI(rootzp)->i_count, 0); sa_handle_destroy(rootzp->z_sa_hdl); kmem_cache_free(znode_cache, rootzp); for (i = 0; i != size; i++) { avl_destroy(&zfsvfs->z_hold_trees[i]); mutex_destroy(&zfsvfs->z_hold_locks[i]); } mutex_destroy(&zfsvfs->z_znodes_lock); vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); kmem_free(sb, sizeof (struct super_block)); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } EXPORT_SYMBOL(zfs_create_fs); EXPORT_SYMBOL(zfs_obj_to_path); /* CSTYLED */ module_param(zfs_object_mutex_size, uint, 0644); MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); module_param(zfs_unlink_suspend_progress, int, 0644); MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " "(debug - leaks space into the unlinked set)"); diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 50c63695dcc8..4d1bf1d5477f 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1,1189 +1,1143 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ #ifdef CONFIG_COMPAT #include #endif #include #include #include #include #include #include #include #if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \ defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO) #include #endif #include #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO #include #endif /* * When using fallocate(2) to preallocate space, inflate the requested * capacity check by 10% to account for the required metadata blocks. */ static unsigned int zfs_fallocate_reserve_percent = 110; static int zpl_open(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; error = generic_file_open(ip, filp); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_release(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); if (ITOZ(ip)->z_atime_dirty) zfs_mark_inode_dirty(ip); crhold(cr); error = -zfs_close(ip, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_iterate(struct file *filp, struct dir_context *ctx) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_readdir(file_inode(filp), ctx, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; znode_t *zp = ITOZ(inode); zfsvfs_t *zfsvfs = ITOZSB(inode); cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; /* * The variables z_sync_writes_cnt and z_async_writes_cnt work in * tandem so that sync writes can detect if there are any non-sync * writes going on and vice-versa. The "vice-versa" part to this logic * is located in zfs_putpage() where non-sync writes check if there are * any ongoing sync writes. If any sync and non-sync writes overlap, * we do a commit to complete the non-sync writes since the latter can * potentially take several seconds to complete and thus block sync * writes in the upcoming call to filemap_write_and_wait_range(). */ atomic_inc_32(&zp->z_sync_writes_cnt); /* * If the following check does not detect an overlapping non-sync write * (say because it's just about to start), then it is guaranteed that * the non-sync write will detect this sync write. This is because we * always increment z_sync_writes_cnt / z_async_writes_cnt before doing * the check on z_async_writes_cnt / z_sync_writes_cnt here and in * zfs_putpage() respectively. */ if (atomic_load_32(&zp->z_async_writes_cnt) > 0) { if ((error = zpl_enter(zfsvfs, FTAG)) != 0) { atomic_dec_32(&zp->z_sync_writes_cnt); return (error); } zil_commit(zfsvfs->z_log, zp->z_id); zpl_exit(zfsvfs, FTAG); } error = filemap_write_and_wait_range(inode->i_mapping, start, end); /* * The sync write is not complete yet but we decrement * z_sync_writes_cnt since zfs_fsync() increments and decrements * it internally. If a non-sync write starts just after the decrement * operation but before we call zfs_fsync(), it may not detect this * overlapping sync write but it does not matter since we have already * gone past filemap_write_and_wait_range() and we won't block due to * the non-sync write. */ atomic_dec_32(&zp->z_sync_writes_cnt); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(zp, datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static inline int zfs_io_flags(struct kiocb *kiocb) { int flags = 0; #if defined(IOCB_DSYNC) if (kiocb->ki_flags & IOCB_DSYNC) flags |= O_DSYNC; #endif #if defined(IOCB_SYNC) if (kiocb->ki_flags & IOCB_SYNC) flags |= O_SYNC; #endif #if defined(IOCB_APPEND) if (kiocb->ki_flags & IOCB_APPEND) flags |= O_APPEND; #endif #if defined(IOCB_DIRECT) if (kiocb->ki_flags & IOCB_DIRECT) flags |= O_DIRECT; #endif return (flags); } /* * If relatime is enabled, call file_accessed() if zfs_relatime_need_update() * is true. This is needed since datasets with inherited "relatime" property * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after * `zfs set relatime=...`), which is what relatime test in VFS by * relatime_need_update() is based on. */ static inline void zpl_file_accessed(struct file *filp) { struct inode *ip = filp->f_mapping->host; if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) { if (zfs_relatime_need_update(ip)) file_accessed(filp); } else { file_accessed(filp); } } /* * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports * iovecs, kvevs, bvecs and pipes, plus all the required interfaces to * manipulate the iov_iter are available. In which case the full iov_iter * can be attached to the uio and correctly handled in the lower layers. * Otherwise, for older kernels extract the iovec and pass it instead. */ static void zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, loff_t pos, ssize_t count, size_t skip) { #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); #else zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, zfs_uio_iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif } static ssize_t zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; ssize_t count = iov_iter_count(to); zfs_uio_t uio; zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); crhold(cr); cookie = spl_fstrans_mark(); ssize_t ret = -zfs_read(ITOZ(filp->f_mapping->host), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (ret < 0) return (ret); ssize_t read = count - uio.uio_resid; kiocb->ki_pos += read; zpl_file_accessed(filp); return (read); } static inline ssize_t zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, size_t *countp) { ssize_t ret = generic_write_checks(kiocb, from); if (ret <= 0) return (ret); *countp = ret; return (0); } static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; zfs_uio_t uio; size_t count = 0; ssize_t ret; ret = zpl_generic_write_checks(kiocb, from, &count); if (ret) return (ret); zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); crhold(cr); cookie = spl_fstrans_mark(); ret = -zfs_write(ITOZ(ip), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (ret < 0) return (ret); ssize_t wrote = count - uio.uio_resid; kiocb->ki_pos += wrote; return (wrote); } static ssize_t zpl_direct_IO_impl(void) { /* * All O_DIRECT requests should be handled by * zpl_{iter/aio}_{write/read}(). There is no way kernel generic code * should call the direct_IO address_space_operations function. We set * this code path to be fatal if it is executed. */ PANIC(0); return (0); } #if defined(HAVE_VFS_DIRECT_IO_ITER) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { return (zpl_direct_IO_impl()); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { return (zpl_direct_IO_impl()); } #else #error "Unknown Direct I/O interface" #endif static loff_t zpl_llseek(struct file *filp, loff_t offset, int whence) { #if defined(SEEK_HOLE) && defined(SEEK_DATA) fstrans_cookie_t cookie; if (whence == SEEK_DATA || whence == SEEK_HOLE) { struct inode *ip = filp->f_mapping->host; loff_t maxbytes = ip->i_sb->s_maxbytes; loff_t error; spl_inode_lock_shared(ip); cookie = spl_fstrans_mark(); error = -zfs_holey(ITOZ(ip), whence, &offset); spl_fstrans_unmark(cookie); if (error == 0) error = lseek_execute(filp, ip, offset, maxbytes); spl_inode_unlock_shared(ip); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ return (generic_file_llseek(filp, offset, whence)); } /* * It's worth taking a moment to describe how mmap is implemented * for zfs because it differs considerably from other Linux filesystems. * However, this issue is handled the same way under OpenSolaris. * * The issue is that by design zfs bypasses the Linux page cache and * leaves all caching up to the ARC. This has been shown to work * well for the common read(2)/write(2) case. However, mmap(2) * is problem because it relies on being tightly integrated with the * page cache. To handle this we cache mmap'ed files twice, once in * the ARC and a second time in the page cache. The code is careful * to keep both copies synchronized. * * When a file with an mmap'ed region is written to using write(2) * both the data in the ARC and existing pages in the page cache * are updated. For a read(2) data will be read first from the page * cache then the ARC if needed. Neither a write(2) or read(2) will * will ever result in new pages being added to the page cache. * * New pages are added to the page cache only via .readpage() which * is called when the vfs needs to read a page off disk to back the * virtual memory region. These pages may be modified without * notifying the ARC and will be written out periodically via * .writepage(). This will occur due to either a sync or the usual * page aging behavior. Note because a read(2) of a mmap'ed file * will always check the page cache first even when the ARC is out * of date correct data will still be returned. * * While this implementation ensures correct behavior it does have * have some drawbacks. The most obvious of which is that it * increases the required memory footprint when access mmap'ed * files. It also adds additional complexity to the code keeping * both caches synchronized. * * Longer term it may be possible to cleanly resolve this wart by * mapping page cache pages directly on to the ARC buffers. The * Linux address space operations are flexible enough to allow * selection of which pages back a particular index. The trick * would be working out the details of which subsystem is in * charge, the ARC, the page cache, or both. It may also prove * helpful to move the ARC buffers to a scatter-gather lists * rather than a vmalloc'ed region. */ static int zpl_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *ip = filp->f_mapping->host; int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); spl_fstrans_unmark(cookie); if (error) return (error); error = generic_file_mmap(filp, vma); if (error) return (error); return (error); } /* * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). */ static inline int zpl_readpage_common(struct page *pp) { fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); cookie = spl_fstrans_mark(); int error = -zfs_getpage(pp->mapping->host, pp); spl_fstrans_unmark(cookie); unlock_page(pp); return (error); } #ifdef HAVE_VFS_READ_FOLIO static int zpl_read_folio(struct file *filp, struct folio *folio) { return (zpl_readpage_common(&folio->page)); } #else static int zpl_readpage(struct file *filp, struct page *pp) { return (zpl_readpage_common(pp)); } #endif static int zpl_readpage_filler(void *data, struct page *pp) { return (zpl_readpage_common(pp)); } /* * Populate a set of pages with data for the Linux page cache. This * function will only be called for read ahead and never for demand * paging. For simplicity, the code relies on read_cache_pages() to * correctly lock each page for IO and call zpl_readpage(). */ #ifdef HAVE_VFS_READPAGES static int zpl_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL)); } #else static void zpl_readahead(struct readahead_control *ractl) { struct page *page; while ((page = readahead_page(ractl)) != NULL) { int ret; ret = zpl_readpage_filler(NULL, page); put_page(page); if (ret) break; } } #endif static int zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { boolean_t *for_sync = data; fstrans_cookie_t cookie; int ret; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); ret = zfs_putpage(pp->mapping->host, pp, wbc, *for_sync); spl_fstrans_unmark(cookie); return (ret); } #ifdef HAVE_WRITEPAGE_T_FOLIO static int zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data) { return (zpl_putpage(&pp->page, wbc, data)); } #endif static inline int zpl_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc, void *data) { int result; #ifdef HAVE_WRITEPAGE_T_FOLIO result = write_cache_pages(mapping, wbc, zpl_putfolio, data); #else result = write_cache_pages(mapping, wbc, zpl_putpage, data); #endif return (result); } static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) { znode_t *zp = ITOZ(mapping->host); zfsvfs_t *zfsvfs = ITOZSB(mapping->host); enum writeback_sync_modes sync_mode; int result; if ((result = zpl_enter(zfsvfs, FTAG)) != 0) return (result); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; zpl_exit(zfsvfs, FTAG); sync_mode = wbc->sync_mode; /* * We don't want to run write_cache_pages() in SYNC mode here, because * that would make putpage() wait for a single page to be committed to * disk every single time, resulting in atrocious performance. Instead * we run it once in non-SYNC mode so that the ZIL gets all the data, * and then we commit it all in one go. */ boolean_t for_sync = (sync_mode == WB_SYNC_ALL); wbc->sync_mode = WB_SYNC_NONE; result = zpl_write_cache_pages(mapping, wbc, &for_sync); if (sync_mode != wbc->sync_mode) { if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (result); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, zp->z_id); zpl_exit(zfsvfs, FTAG); /* * We need to call write_cache_pages() again (we can't just * return after the commit) because the previous call in * non-SYNC mode does not guarantee that we got all the dirty * pages (see the implementation of write_cache_pages() for * details). That being said, this is a no-op in most cases. */ wbc->sync_mode = sync_mode; result = zpl_write_cache_pages(mapping, wbc, &for_sync); } return (result); } /* * Write out dirty pages to the ARC, this function is only required to * support mmap(2). Mapped pages may be dirtied by memory operations * which never call .write(). These dirty pages are kept in sync with * the ARC buffers via this hook. */ static int zpl_writepage(struct page *pp, struct writeback_control *wbc) { if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL); return (zpl_putpage(pp, wbc, &for_sync)); } -static int -zpl_releasepage(struct page *pp, gfp_t gfp) -{ - if (PagePrivate(pp)) { - ClearPagePrivate(pp); - put_page(pp); - } - return (1); -} - -#ifdef HAVE_VFS_RELEASE_FOLIO -static bool -zpl_release_folio(struct folio *folio, gfp_t gfp) -{ - return (zpl_releasepage(&folio->page, gfp)); -} -#endif - -#ifdef HAVE_VFS_INVALIDATE_FOLIO -static void -zpl_invalidate_folio(struct folio *folio, size_t offset, size_t len) -{ - if ((offset == 0) && (len == PAGE_SIZE)) { - zpl_releasepage(&folio->page, 0); - } -} -#else -static void -zpl_invalidatepage(struct page *pp, unsigned int offset, unsigned int len) -{ - if ((offset == 0) && (len == PAGE_SIZE)) { - zpl_releasepage(pp, 0); - } -} -#endif - /* * The flag combination which matches the behavior of zfs_space() is * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE * flag was introduced in the 2.6.38 kernel. * * The original mode=0 (allocate space) behavior can be reasonably emulated * by checking if enough space exists and creating a sparse file, as real * persistent space reservation is not possible due to COW, snapshots, etc. */ static long zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) { cred_t *cr = CRED(); loff_t olen; fstrans_cookie_t cookie; int error = 0; int test_mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE; if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0) return (-EOPNOTSUPP); if (offset < 0 || len <= 0) return (-EINVAL); spl_inode_lock(ip); olen = i_size_read(ip); crhold(cr); cookie = spl_fstrans_mark(); if (mode & (test_mode)) { flock64_t bf; if (mode & FALLOC_FL_KEEP_SIZE) { if (offset > olen) goto out_unmark; if (offset + len > olen) len = olen - offset; } bf.l_type = F_WRLCK; bf.l_whence = SEEK_SET; bf.l_start = offset; bf.l_len = len; bf.l_pid = 0; error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr); } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) { unsigned int percent = zfs_fallocate_reserve_percent; struct kstatfs statfs; /* Legacy mode, disable fallocate compatibility. */ if (percent == 0) { error = -EOPNOTSUPP; goto out_unmark; } /* * Use zfs_statvfs() instead of dmu_objset_space() since it * also checks project quota limits, which are relevant here. */ error = zfs_statvfs(ip, &statfs); if (error) goto out_unmark; /* * Shrink available space a bit to account for overhead/races. * We know the product previously fit into availbytes from * dmu_objset_space(), so the smaller product will also fit. */ if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) { error = -ENOSPC; goto out_unmark; } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen) error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE); } out_unmark: spl_fstrans_unmark(cookie); spl_inode_unlock(ip); crfree(cr); return (error); } static long zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) { return zpl_fallocate_common(file_inode(filp), mode, offset, len); } static int zpl_ioctl_getversion(struct file *filp, void __user *arg) { uint32_t generation = file_inode(filp)->i_generation; return (copy_to_user(arg, &generation, sizeof (generation))); } static int zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice) { struct inode *ip = file_inode(filp); znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os = zfsvfs->z_os; int error = 0; if (S_ISFIFO(ip->i_mode)) return (-ESPIPE); if (offset < 0 || len < 0) return (-EINVAL); if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); switch (advice) { case POSIX_FADV_SEQUENTIAL: case POSIX_FADV_WILLNEED: #ifdef HAVE_GENERIC_FADVISE if (zn_has_cached_data(zp, offset, offset + len - 1)) error = generic_fadvise(filp, offset, len, advice); #endif /* * Pass on the caller's size directly, but note that * dmu_prefetch_max will effectively cap it. If there * really is a larger sequential access pattern, perhaps * dmu_zfetch will detect it. */ if (len == 0) len = i_size_read(ip) - offset; dmu_prefetch(os, zp->z_id, 0, offset, len, ZIO_PRIORITY_ASYNC_READ); break; case POSIX_FADV_NORMAL: case POSIX_FADV_RANDOM: case POSIX_FADV_DONTNEED: case POSIX_FADV_NOREUSE: /* ignored for now */ break; default: error = -EINVAL; break; } zfs_exit(zfsvfs, FTAG); return (error); } #define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) static uint32_t __zpl_ioctl_getflags(struct inode *ip) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; uint32_t ioctl_flags = 0; if (zfs_flags & ZFS_IMMUTABLE) ioctl_flags |= FS_IMMUTABLE_FL; if (zfs_flags & ZFS_APPENDONLY) ioctl_flags |= FS_APPEND_FL; if (zfs_flags & ZFS_NODUMP) ioctl_flags |= FS_NODUMP_FL; if (zfs_flags & ZFS_PROJINHERIT) ioctl_flags |= ZFS_PROJINHERIT_FL; return (ioctl_flags & ZFS_FL_USER_VISIBLE); } /* * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file * attributes common to both Linux and Solaris are mapped. */ static int zpl_ioctl_getflags(struct file *filp, void __user *arg) { uint32_t flags; int err; flags = __zpl_ioctl_getflags(file_inode(filp)); err = copy_to_user(arg, &flags, sizeof (flags)); return (err); } /* * fchange() is a helper macro to detect if we have been asked to change a * flag. This is ugly, but the requirement that we do this is a consequence of * how the Linux file attribute interface was designed. Another consequence is * that concurrent modification of files suffers from a TOCTOU race. Neither * are things we can fix without modifying the kernel-userland interface, which * is outside of our jurisdiction. */ #define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1))) static int __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; xoptattr_t *xoap; if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | ZFS_PROJINHERIT_FL)) return (-EOPNOTSUPP); if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE) return (-EACCES); if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) || fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) && !capable(CAP_LINUX_IMMUTABLE)) return (-EPERM); if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EACCES); xva_init(xva); xoap = xva_getxoptattr(xva); #define FLAG_CHANGE(iflag, zflag, xflag, xfield) do { \ if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) || \ ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) { \ XVA_SET_REQ(xva, (xflag)); \ (xfield) = ((ioctl_flags & (iflag)) != 0); \ } \ } while (0) FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable); FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly); FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump); FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT, xoap->xoa_projinherit); #undef FLAG_CHANGE return (0); } static int zpl_ioctl_setflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint32_t flags; cred_t *cr = CRED(); xvattr_t xva; int err; fstrans_cookie_t cookie; if (copy_from_user(&flags, arg, sizeof (flags))) return (-EFAULT); err = __zpl_ioctl_setflags(ip, flags, &xva); if (err) return (err); crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); return (err); } static int zpl_ioctl_getxattr(struct file *filp, void __user *arg) { zfsxattr_t fsx = { 0 }; struct inode *ip = file_inode(filp); int err; fsx.fsx_xflags = __zpl_ioctl_getflags(ip); fsx.fsx_projid = ITOZ(ip)->z_projid; err = copy_to_user(arg, &fsx, sizeof (fsx)); return (err); } static int zpl_ioctl_setxattr(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); zfsxattr_t fsx; cred_t *cr = CRED(); xvattr_t xva; xoptattr_t *xoap; int err; fstrans_cookie_t cookie; if (copy_from_user(&fsx, arg, sizeof (fsx))) return (-EFAULT); if (!zpl_is_valid_projid(fsx.fsx_projid)) return (-EINVAL); err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva); if (err) return (err); xoap = xva_getxoptattr(&xva); XVA_SET_REQ(&xva, XAT_PROJID); xoap->xoa_projid = fsx.fsx_projid; crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); return (err); } /* * Expose Additional File Level Attributes of ZFS. */ static int zpl_ioctl_getdosflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint64_t dosflags = ITOZ(ip)->z_pflags; dosflags &= ZFS_DOS_FL_USER_VISIBLE; int err = copy_to_user(arg, &dosflags, sizeof (dosflags)); return (err); } static int __zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; xoptattr_t *xoap; if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE)) return (-EOPNOTSUPP); if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) || fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) && !capable(CAP_LINUX_IMMUTABLE)) return (-EPERM); if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip)) return (-EACCES); xva_init(xva); xoap = xva_getxoptattr(xva); #define FLAG_CHANGE(iflag, xflag, xfield) do { \ if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) || \ ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) { \ XVA_SET_REQ(xva, (xflag)); \ (xfield) = ((ioctl_flags & (iflag)) != 0); \ } \ } while (0) FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable); FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly); FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump); FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly); FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden); FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system); FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive); FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink); FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse); FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline); FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse); #undef FLAG_CHANGE return (0); } /* * Set Additional File Level Attributes of ZFS. */ static int zpl_ioctl_setdosflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint64_t dosflags; cred_t *cr = CRED(); xvattr_t xva; int err; fstrans_cookie_t cookie; if (copy_from_user(&dosflags, arg, sizeof (dosflags))) return (-EFAULT); err = __zpl_ioctl_setdosflags(ip, dosflags, &xva); if (err) return (err); crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap); spl_fstrans_unmark(cookie); crfree(cr); return (err); } static long zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC_GETVERSION: return (zpl_ioctl_getversion(filp, (void *)arg)); case FS_IOC_GETFLAGS: return (zpl_ioctl_getflags(filp, (void *)arg)); case FS_IOC_SETFLAGS: return (zpl_ioctl_setflags(filp, (void *)arg)); case ZFS_IOC_FSGETXATTR: return (zpl_ioctl_getxattr(filp, (void *)arg)); case ZFS_IOC_FSSETXATTR: return (zpl_ioctl_setxattr(filp, (void *)arg)); case ZFS_IOC_GETDOSFLAGS: return (zpl_ioctl_getdosflags(filp, (void *)arg)); case ZFS_IOC_SETDOSFLAGS: return (zpl_ioctl_setdosflags(filp, (void *)arg)); case ZFS_IOC_COMPAT_FICLONE: return (zpl_ioctl_ficlone(filp, (void *)arg)); case ZFS_IOC_COMPAT_FICLONERANGE: return (zpl_ioctl_ficlonerange(filp, (void *)arg)); case ZFS_IOC_COMPAT_FIDEDUPERANGE: return (zpl_ioctl_fideduperange(filp, (void *)arg)); default: return (-ENOTTY); } } #ifdef CONFIG_COMPAT static long zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; default: return (-ENOTTY); } return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg))); } #endif /* CONFIG_COMPAT */ const struct address_space_operations zpl_address_space_operations = { #ifdef HAVE_VFS_READPAGES .readpages = zpl_readpages, #else .readahead = zpl_readahead, #endif #ifdef HAVE_VFS_READ_FOLIO .read_folio = zpl_read_folio, #else .readpage = zpl_readpage, #endif .writepage = zpl_writepage, .writepages = zpl_writepages, .direct_IO = zpl_direct_IO, #ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS .set_page_dirty = __set_page_dirty_nobuffers, #endif #ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO .dirty_folio = filemap_dirty_folio, #endif -#ifdef HAVE_VFS_RELEASE_FOLIO - .release_folio = zpl_release_folio, -#else - .releasepage = zpl_releasepage, -#endif -#ifdef HAVE_VFS_INVALIDATE_FOLIO - .invalidate_folio = zpl_invalidate_folio, -#else - .invalidatepage = zpl_invalidatepage, -#endif }; const struct file_operations zpl_file_operations = { .open = zpl_open, .release = zpl_release, .llseek = zpl_llseek, .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER #ifdef HAVE_COPY_SPLICE_READ .splice_read = copy_splice_read, #else .splice_read = generic_file_splice_read, #endif .splice_write = iter_file_splice_write, #endif .mmap = zpl_mmap, .fsync = zpl_fsync, .fallocate = zpl_fallocate, .copy_file_range = zpl_copy_file_range, #ifdef HAVE_VFS_CLONE_FILE_RANGE .clone_file_range = zpl_clone_file_range, #endif #ifdef HAVE_VFS_REMAP_FILE_RANGE .remap_file_range = zpl_remap_file_range, #endif #ifdef HAVE_VFS_DEDUPE_FILE_RANGE .dedupe_file_range = zpl_dedupe_file_range, #endif .fadvise = zpl_fadvise, .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; const struct file_operations zpl_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate_shared = zpl_iterate, .fsync = zpl_fsync, .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; /* CSTYLED */ module_param(zfs_fallocate_reserve_percent, uint, 0644); MODULE_PARM_DESC(zfs_fallocate_reserve_percent, "Percentage of length to use for the available capacity check");