diff --git a/config/kernel-generic_fillattr.m4 b/config/kernel-generic_fillattr.m4 new file mode 100644 index 000000000000..50c8031305b3 --- /dev/null +++ b/config/kernel-generic_fillattr.m4 @@ -0,0 +1,28 @@ +dnl # +dnl # 5.12 API +dnl # +dnl # generic_fillattr in linux/fs.h now requires a struct user_namespace* +dnl # as the first arg, to support idmapped mounts. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR_USERNS], [ + ZFS_LINUX_TEST_SRC([generic_fillattr_userns], [ + #include + ],[ + struct user_namespace *userns = NULL; + struct inode *in = NULL; + struct kstat *k = NULL; + generic_fillattr(userns, in, k); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_FILLATTR_USERNS], [ + AC_MSG_CHECKING([whether generic_fillattr requres struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([generic_fillattr_userns], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_GENERIC_FILLATTR_USERNS, 1, + [generic_fillattr requires struct user_namespace*]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + diff --git a/config/kernel-inode-create.m4 b/config/kernel-inode-create.m4 index 9f28bcbd4f7f..a6ea11fb61b2 100644 --- a/config/kernel-inode-create.m4 +++ b/config/kernel-inode-create.m4 @@ -1,26 +1,53 @@ -dnl # -dnl # 3.6 API change -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_CREATE_FLAGS], [ +AC_DEFUN([ZFS_AC_KERNEL_SRC_CREATE], [ + dnl # + dnl # 5.12 API change that added the struct user_namespace* arg + dnl # to the front of this function type's arg list. + dnl # + ZFS_LINUX_TEST_SRC([create_userns], [ + #include + #include + + int inode_create(struct user_namespace *userns, + struct inode *inode ,struct dentry *dentry, + umode_t umode, bool flag) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .create = inode_create, + }; + ],[]) + + dnl # + dnl # 3.6 API change + dnl # ZFS_LINUX_TEST_SRC([create_flags], [ #include #include int inode_create(struct inode *inode ,struct dentry *dentry, umode_t umode, bool flag) { return 0; } static const struct inode_operations iops __attribute__ ((unused)) = { .create = inode_create, }; ],[]) ]) -AC_DEFUN([ZFS_AC_KERNEL_CREATE_FLAGS], [ - AC_MSG_CHECKING([whether iops->create() passes flags]) - ZFS_LINUX_TEST_RESULT([create_flags], [ +AC_DEFUN([ZFS_AC_KERNEL_CREATE], [ + AC_MSG_CHECKING([whether iops->create() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([create_userns], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_CREATE_USERNS, 1, + [iops->create() takes struct user_namespace*]) ],[ - ZFS_LINUX_TEST_ERROR([iops->create()]) + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether iops->create() passes flags]) + ZFS_LINUX_TEST_RESULT([create_flags], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([iops->create()]) + ]) ]) ]) diff --git a/config/kernel-inode-getattr.m4 b/config/kernel-inode-getattr.m4 index 48391d66f8bd..f62e82f5230a 100644 --- a/config/kernel-inode-getattr.m4 +++ b/config/kernel-inode-getattr.m4 @@ -1,53 +1,92 @@ -dnl # -dnl # Linux 4.11 API -dnl # See torvalds/linux@a528d35 -dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_GETATTR], [ + dnl # + dnl # Linux 5.12 API + dnl # The getattr I/O operations handler type was extended to require + dnl # a struct user_namespace* as its first arg, to support idmapped + dnl # mounts. + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_getattr_userns], [ + #include + + int test_getattr( + struct user_namespace *userns, + const struct path *p, struct kstat *k, + u32 request_mask, unsigned int query_flags) + { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .getattr = test_getattr, + }; + ],[]) + + dnl # + dnl # Linux 4.11 API + dnl # See torvalds/linux@a528d35 + dnl # ZFS_LINUX_TEST_SRC([inode_operations_getattr_path], [ #include int test_getattr( const struct path *p, struct kstat *k, u32 request_mask, unsigned int query_flags) { return 0; } static const struct inode_operations iops __attribute__ ((unused)) = { .getattr = test_getattr, }; ],[]) ZFS_LINUX_TEST_SRC([inode_operations_getattr_vfsmount], [ #include int test_getattr( struct vfsmount *mnt, struct dentry *d, struct kstat *k) { return 0; } static const struct inode_operations iops __attribute__ ((unused)) = { .getattr = test_getattr, }; ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_INODE_GETATTR], [ - AC_MSG_CHECKING([whether iops->getattr() takes a path]) - ZFS_LINUX_TEST_RESULT([inode_operations_getattr_path], [ + dnl # + dnl # Kernel 5.12 test + dnl # + AC_MSG_CHECKING([whether iops->getattr() takes user_namespace]) + ZFS_LINUX_TEST_RESULT([inode_operations_getattr_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_PATH_IOPS_GETATTR, 1, - [iops->getattr() takes a path]) + AC_DEFINE(HAVE_USERNS_IOPS_GETATTR, 1, + [iops->getattr() takes struct user_namespace*]) ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether iops->getattr() takes a vfsmount]) - ZFS_LINUX_TEST_RESULT([inode_operations_getattr_vfsmount], [ + dnl # + dnl # Kernel 4.11 test + dnl # + AC_MSG_CHECKING([whether iops->getattr() takes a path]) + ZFS_LINUX_TEST_RESULT([inode_operations_getattr_path], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_VFSMOUNT_IOPS_GETATTR, 1, - [iops->getattr() takes a vfsmount]) + AC_DEFINE(HAVE_PATH_IOPS_GETATTR, 1, + [iops->getattr() takes a path]) ],[ AC_MSG_RESULT(no) + + dnl # + dnl # Kernel < 4.11 test + dnl # + AC_MSG_CHECKING([whether iops->getattr() takes a vfsmount]) + ZFS_LINUX_TEST_RESULT([inode_operations_getattr_vfsmount], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_VFSMOUNT_IOPS_GETATTR, 1, + [iops->getattr() takes a vfsmount]) + ],[ + AC_MSG_RESULT(no) + ]) ]) ]) ]) diff --git a/config/kernel-is_owner_or_cap.m4 b/config/kernel-is_owner_or_cap.m4 index 3df6163da270..3c3c6ad2240f 100644 --- a/config/kernel-is_owner_or_cap.m4 +++ b/config/kernel-is_owner_or_cap.m4 @@ -1,23 +1,42 @@ dnl # dnl # 2.6.39 API change, dnl # The is_owner_or_cap() macro was renamed to inode_owner_or_capable(), dnl # This is used for permission checks in the xattr and file attribute call dnl # paths. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE], [ ZFS_LINUX_TEST_SRC([inode_owner_or_capable], [ #include ],[ struct inode *ip = NULL; (void) inode_owner_or_capable(ip); ]) + + ZFS_LINUX_TEST_SRC([inode_owner_or_capable_idmapped], [ + #include + ],[ + struct inode *ip = NULL; + (void) inode_owner_or_capable(&init_user_ns, ip); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE], [ AC_MSG_CHECKING([whether inode_owner_or_capable() exists]) ZFS_LINUX_TEST_RESULT([inode_owner_or_capable], [ AC_MSG_RESULT(yes) - ],[ - ZFS_LINUX_TEST_ERROR([capability]) + AC_DEFINE(HAVE_INODE_OWNER_OR_CAPABLE, 1, + [inode_owner_or_capable() exists]) + ], [ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING( + [whether inode_owner_or_capable() takes user_ns]) + ZFS_LINUX_TEST_RESULT([inode_owner_or_capable_idmapped], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_OWNER_OR_CAPABLE_IDMAPPED, 1, + [inode_owner_or_capable() takes user_ns]) + ],[ + ZFS_LINUX_TEST_ERROR([capability]) + ]) ]) ]) diff --git a/config/kernel-mkdir-umode-t.m4 b/config/kernel-mkdir-umode-t.m4 deleted file mode 100644 index 19599670df3b..000000000000 --- a/config/kernel-mkdir-umode-t.m4 +++ /dev/null @@ -1,32 +0,0 @@ -dnl # -dnl # 3.3 API change -dnl # The VFS .create, .mkdir and .mknod callbacks were updated to take a -dnl # umode_t type rather than an int. The expectation is that any backport -dnl # would also change all three prototypes. However, if it turns out that -dnl # some distribution doesn't backport the whole thing this could be -dnl # broken apart into three separate checks. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_MKDIR_UMODE_T], [ - ZFS_LINUX_TEST_SRC([inode_operations_mkdir], [ - #include - - int mkdir(struct inode *inode, struct dentry *dentry, - umode_t umode) { return 0; } - - static const struct inode_operations - iops __attribute__ ((unused)) = { - .mkdir = mkdir, - }; - ],[]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_MKDIR_UMODE_T], [ - AC_MSG_CHECKING([whether iops->create()/mkdir()/mknod() take umode_t]) - ZFS_LINUX_TEST_RESULT([inode_operations_mkdir], [ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_MKDIR_UMODE_T, 1, - [iops->create()/mkdir()/mknod() take umode_t]) - ],[ - ZFS_LINUX_TEST_ERROR([mkdir()]) - ]) -]) diff --git a/config/kernel-mkdir.m4 b/config/kernel-mkdir.m4 new file mode 100644 index 000000000000..a162bcd880ff --- /dev/null +++ b/config/kernel-mkdir.m4 @@ -0,0 +1,65 @@ +dnl # +dnl # Supported mkdir() interfaces checked newest to oldest. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_MKDIR], [ + dnl # + dnl # 5.12 API change + dnl # The struct user_namespace arg was added as the first argument to + dnl # mkdir() + dnl # + ZFS_LINUX_TEST_SRC([mkdir_user_namespace], [ + #include + + int mkdir(struct user_namespace *userns, + struct inode *inode, struct dentry *dentry, + umode_t umode) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .mkdir = mkdir, + }; + ],[]) + + dnl # + dnl # 3.3 API change + dnl # The VFS .create, .mkdir and .mknod callbacks were updated to take a + dnl # umode_t type rather than an int. The expectation is that any backport + dnl # would also change all three prototypes. However, if it turns out that + dnl # some distribution doesn't backport the whole thing this could be + dnl # broken apart into three separate checks. + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_mkdir], [ + #include + + int mkdir(struct inode *inode, struct dentry *dentry, + umode_t umode) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .mkdir = mkdir, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_MKDIR], [ + dnl # + dnl # 5.12 API change + dnl # The struct user_namespace arg was added as the first argument to + dnl # mkdir() of the iops structure. + dnl # + AC_MSG_CHECKING([whether iops->mkdir() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([mkdir_user_namespace], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_MKDIR_USERNS, 1, + [iops->mkdir() takes struct user_namespace*]) + ],[ + AC_MSG_CHECKING([whether iops->mkdir() takes umode_t]) + ZFS_LINUX_TEST_RESULT([inode_operations_mkdir], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MKDIR_UMODE_T, 1, + [iops->mkdir() takes umode_t]) + ],[ + ZFS_LINUX_TEST_ERROR([mkdir()]) + ]) + ]) +]) diff --git a/config/kernel-mknod.m4 b/config/kernel-mknod.m4 new file mode 100644 index 000000000000..ffe45106003a --- /dev/null +++ b/config/kernel-mknod.m4 @@ -0,0 +1,30 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_MKNOD], [ + dnl # + dnl # 5.12 API change that added the struct user_namespace* arg + dnl # to the front of this function type's arg list. + dnl # + ZFS_LINUX_TEST_SRC([mknod_userns], [ + #include + #include + + int tmp_mknod(struct user_namespace *userns, + struct inode *inode ,struct dentry *dentry, + umode_t u, dev_t d) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .mknod = tmp_mknod, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_MKNOD], [ + AC_MSG_CHECKING([whether iops->mknod() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([mknod_userns], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_MKNOD_USERNS, 1, + [iops->mknod() takes struct user_namespace*]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-rename.m4 b/config/kernel-rename.m4 index f707391539d8..31d199f33bba 100644 --- a/config/kernel-rename.m4 +++ b/config/kernel-rename.m4 @@ -1,29 +1,55 @@ -dnl # -dnl # 4.9 API change, -dnl # iops->rename2() merged into iops->rename(), and iops->rename() now wants -dnl # flags. -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME_WANTS_FLAGS], [ - ZFS_LINUX_TEST_SRC([inode_operations_rename], [ +AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [ + dnl # + dnl # 4.9 API change, + dnl # iops->rename2() merged into iops->rename(), and iops->rename() now wants + dnl # flags. + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_rename_flags], [ #include int rename_fn(struct inode *sip, struct dentry *sdp, struct inode *tip, struct dentry *tdp, unsigned int flags) { return 0; } static const struct inode_operations iops __attribute__ ((unused)) = { .rename = rename_fn, }; ],[]) + + dnl # + dnl # 5.12 API change, + dnl # + dnl # Linux 5.12 introduced passing struct user_namespace* as the first argument + dnl # of the rename() and other inode_operations members. + dnl # + ZFS_LINUX_TEST_SRC([inode_operations_rename_userns], [ + #include + int rename_fn(struct user_namespace *user_ns, struct inode *sip, + struct dentry *sdp, struct inode *tip, struct dentry *tdp, + unsigned int flags) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .rename = rename_fn, + }; + ],[]) ]) -AC_DEFUN([ZFS_AC_KERNEL_RENAME_WANTS_FLAGS], [ - AC_MSG_CHECKING([whether iops->rename() wants flags]) - ZFS_LINUX_TEST_RESULT([inode_operations_rename], [ +AC_DEFUN([ZFS_AC_KERNEL_RENAME], [ + AC_MSG_CHECKING([whether iops->rename() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([inode_operations_rename_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1, - [iops->rename() wants flags]) + AC_DEFINE(HAVE_IOPS_RENAME_USERNS, 1, + [iops->rename() takes struct user_namespace*]) ],[ AC_MSG_RESULT(no) + + ZFS_LINUX_TEST_RESULT([inode_operations_rename_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1, + [iops->rename() wants flags]) + ],[ + AC_MSG_RESULT(no) + ]) ]) ]) diff --git a/config/kernel-setattr-prepare.m4 b/config/kernel-setattr-prepare.m4 index 45408c45c69b..24245aa53448 100644 --- a/config/kernel-setattr-prepare.m4 +++ b/config/kernel-setattr-prepare.m4 @@ -1,27 +1,52 @@ -dnl # -dnl # 4.9 API change -dnl # The inode_change_ok() function has been renamed setattr_prepare() -dnl # and updated to take a dentry rather than an inode. -dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_SETATTR_PREPARE], [ + dnl # + dnl # 4.9 API change + dnl # The inode_change_ok() function has been renamed setattr_prepare() + dnl # and updated to take a dentry rather than an inode. + dnl # ZFS_LINUX_TEST_SRC([setattr_prepare], [ #include ], [ struct dentry *dentry = NULL; struct iattr *attr = NULL; int error __attribute__ ((unused)) = - setattr_prepare(dentry, attr); + setattr_prepare(dentry, attr); + ]) + + dnl # + dnl # 5.12 API change + dnl # The setattr_prepare() function has been changed to accept a new argument + dnl # for struct user_namespace* + dnl # + ZFS_LINUX_TEST_SRC([setattr_prepare_userns], [ + #include + ], [ + struct dentry *dentry = NULL; + struct iattr *attr = NULL; + struct user_namespace *userns = NULL; + int error __attribute__ ((unused)) = + setattr_prepare(userns, dentry, attr); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_SETATTR_PREPARE], [ - AC_MSG_CHECKING([whether setattr_prepare() is available]) - ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare], + AC_MSG_CHECKING([whether setattr_prepare() is available and accepts struct user_namespace*]) + ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare_userns], [setattr_prepare], [fs/attr.c], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SETATTR_PREPARE, 1, - [setattr_prepare() is available]) + AC_DEFINE(HAVE_SETATTR_PREPARE_USERNS, 1, + [setattr_prepare() accepts user_namespace]) ], [ AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether setattr_prepare() is available, doesn't accept user_namespace]) + ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare], + [setattr_prepare], [fs/attr.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SETATTR_PREPARE_NO_USERNS, 1, + [setattr_prepare() is available, doesn't accept user_namespace]) + ], [ + AC_MSG_RESULT(no) + ]) ]) ]) diff --git a/config/kernel-symlink.m4 b/config/kernel-symlink.m4 new file mode 100644 index 000000000000..d90366d04b72 --- /dev/null +++ b/config/kernel-symlink.m4 @@ -0,0 +1,30 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_SYMLINK], [ + dnl # + dnl # 5.12 API change that added the struct user_namespace* arg + dnl # to the front of this function type's arg list. + dnl # + ZFS_LINUX_TEST_SRC([symlink_userns], [ + #include + #include + + int tmp_symlink(struct user_namespace *userns, + struct inode *inode ,struct dentry *dentry, + const char *path) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .symlink = tmp_symlink, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SYMLINK], [ + AC_MSG_CHECKING([whether iops->symlink() takes struct user_namespace*]) + ZFS_LINUX_TEST_RESULT([symlink_userns], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_IOPS_SYMLINK_USERNS, 1, + [iops->symlink() takes struct user_namespace*]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 index 137bf4a8aff0..00b1e74a9ccb 100644 --- a/config/kernel-xattr-handler.m4 +++ b/config/kernel-xattr-handler.m4 @@ -1,397 +1,425 @@ dnl # dnl # 2.6.35 API change, dnl # The 'struct xattr_handler' was constified in the generic dnl # super_block structure. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_CONST_XATTR_HANDLER], [ ZFS_LINUX_TEST_SRC([const_xattr_handler], [ #include #include const struct xattr_handler xattr_test_handler = { .prefix = "test", .get = NULL, .set = NULL, }; const struct xattr_handler *xattr_handlers[] = { &xattr_test_handler, }; const struct super_block sb __attribute__ ((unused)) = { .s_xattr = xattr_handlers, }; ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_CONST_XATTR_HANDLER], [ AC_MSG_CHECKING([whether super_block uses const struct xattr_handler]) ZFS_LINUX_TEST_RESULT([const_xattr_handler], [ AC_MSG_RESULT([yes]) ],[ ZFS_LINUX_TEST_ERROR([const xattr_handler]) ]) ]) dnl # dnl # 4.5 API change, dnl # struct xattr_handler added new member "name". dnl # xattr_handler which matches to whole name rather than prefix should use dnl # "name" instead of "prefix", e.g. "system.posix_acl_access" dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_NAME], [ ZFS_LINUX_TEST_SRC([xattr_handler_name], [ #include static const struct xattr_handler xops __attribute__ ((unused)) = { .name = XATTR_NAME_POSIX_ACL_ACCESS, }; ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_NAME], [ AC_MSG_CHECKING([whether xattr_handler has name]) ZFS_LINUX_TEST_RESULT([xattr_handler_name], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_HANDLER_NAME, 1, [xattr_handler has name]) ],[ AC_MSG_RESULT(no) ]) ]) dnl # dnl # Supported xattr handler get() interfaces checked newest to oldest. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [ ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry_inode], [ #include int get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { return 0; } static const struct xattr_handler xops __attribute__ ((unused)) = { .get = get, }; ],[]) ZFS_LINUX_TEST_SRC([xattr_handler_get_xattr_handler], [ #include int get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { return 0; } static const struct xattr_handler xops __attribute__ ((unused)) = { .get = get, }; ],[]) ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry], [ #include int get(struct dentry *dentry, const char *name, void *buffer, size_t size, int handler_flags) { return 0; } static const struct xattr_handler xops __attribute__ ((unused)) = { .get = get, }; ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [ dnl # dnl # 4.7 API change, dnl # The xattr_handler->get() callback was changed to take both dnl # dentry and inode. dnl # AC_MSG_CHECKING([whether xattr_handler->get() wants dentry and inode]) ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry_inode], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_GET_DENTRY_INODE, 1, [xattr_handler->get() wants both dentry and inode]) ],[ dnl # dnl # 4.4 API change, dnl # The xattr_handler->get() callback was changed to take a dnl # attr_handler, and handler_flags argument was removed and dnl # should be accessed by handler->flags. dnl # AC_MSG_RESULT(no) AC_MSG_CHECKING( [whether xattr_handler->get() wants xattr_handler]) ZFS_LINUX_TEST_RESULT([xattr_handler_get_xattr_handler], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_GET_HANDLER, 1, [xattr_handler->get() wants xattr_handler]) ],[ dnl # dnl # 2.6.33 API change, dnl # The xattr_handler->get() callback was changed dnl # to take a dentry instead of an inode, and a dnl # handler_flags argument was added. dnl # AC_MSG_RESULT(no) AC_MSG_CHECKING( [whether xattr_handler->get() wants dentry]) ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_GET_DENTRY, 1, [xattr_handler->get() wants dentry]) ],[ ZFS_LINUX_TEST_ERROR([xattr get()]) ]) ]) ]) ]) dnl # dnl # Supported xattr handler set() interfaces checked newest to oldest. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET], [ + ZFS_LINUX_TEST_SRC([xattr_handler_set_userns], [ + #include + + int set(const struct xattr_handler *handler, + struct user_namespace *mnt_userns, + struct dentry *dentry, struct inode *inode, + const char *name, const void *buffer, + size_t size, int flags) + { return 0; } + static const struct xattr_handler + xops __attribute__ ((unused)) = { + .set = set, + }; + ],[]) + ZFS_LINUX_TEST_SRC([xattr_handler_set_dentry_inode], [ #include int set(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) { return 0; } static const struct xattr_handler xops __attribute__ ((unused)) = { .set = set, }; ],[]) ZFS_LINUX_TEST_SRC([xattr_handler_set_xattr_handler], [ #include int set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { return 0; } static const struct xattr_handler xops __attribute__ ((unused)) = { .set = set, }; ],[]) ZFS_LINUX_TEST_SRC([xattr_handler_set_dentry], [ #include int set(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int handler_flags) { return 0; } static const struct xattr_handler xops __attribute__ ((unused)) = { .set = set, }; ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [ dnl # - dnl # 4.7 API change, - dnl # The xattr_handler->set() callback was changed to take both - dnl # dentry and inode. + dnl # 5.12 API change, + dnl # The xattr_handler->set() callback was changed to 8 arguments, and + dnl # struct user_namespace* was inserted as arg #2 dnl # - AC_MSG_CHECKING([whether xattr_handler->set() wants dentry and inode]) - ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry_inode], [ + AC_MSG_CHECKING([whether xattr_handler->set() wants dentry, inode, and user_namespace]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_userns], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_DENTRY_INODE, 1, - [xattr_handler->set() wants both dentry and inode]) + AC_DEFINE(HAVE_XATTR_SET_USERNS, 1, + [xattr_handler->set() takes user_namespace]) ],[ dnl # - dnl # 4.4 API change, - dnl # The xattr_handler->set() callback was changed to take a - dnl # xattr_handler, and handler_flags argument was removed and - dnl # should be accessed by handler->flags. + dnl # 4.7 API change, + dnl # The xattr_handler->set() callback was changed to take both + dnl # dentry and inode. dnl # AC_MSG_RESULT(no) - AC_MSG_CHECKING( - [whether xattr_handler->set() wants xattr_handler]) - ZFS_LINUX_TEST_RESULT([xattr_handler_set_xattr_handler], [ + AC_MSG_CHECKING([whether xattr_handler->set() wants dentry and inode]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry_inode], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_HANDLER, 1, - [xattr_handler->set() wants xattr_handler]) + AC_DEFINE(HAVE_XATTR_SET_DENTRY_INODE, 1, + [xattr_handler->set() wants both dentry and inode]) ],[ dnl # - dnl # 2.6.33 API change, - dnl # The xattr_handler->set() callback was changed - dnl # to take a dentry instead of an inode, and a - dnl # handler_flags argument was added. + dnl # 4.4 API change, + dnl # The xattr_handler->set() callback was changed to take a + dnl # xattr_handler, and handler_flags argument was removed and + dnl # should be accessed by handler->flags. dnl # AC_MSG_RESULT(no) AC_MSG_CHECKING( - [whether xattr_handler->set() wants dentry]) - ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry], [ + [whether xattr_handler->set() wants xattr_handler]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_xattr_handler], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_XATTR_SET_DENTRY, 1, - [xattr_handler->set() wants dentry]) + AC_DEFINE(HAVE_XATTR_SET_HANDLER, 1, + [xattr_handler->set() wants xattr_handler]) ],[ - ZFS_LINUX_TEST_ERROR([xattr set()]) + dnl # + dnl # 2.6.33 API change, + dnl # The xattr_handler->set() callback was changed + dnl # to take a dentry instead of an inode, and a + dnl # handler_flags argument was added. + dnl # + AC_MSG_RESULT(no) + AC_MSG_CHECKING( + [whether xattr_handler->set() wants dentry]) + ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_XATTR_SET_DENTRY, 1, + [xattr_handler->set() wants dentry]) + ],[ + ZFS_LINUX_TEST_ERROR([xattr set()]) + ]) ]) ]) ]) ]) dnl # dnl # Supported xattr handler list() interfaces checked newest to oldest. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_LIST], [ ZFS_LINUX_TEST_SRC([xattr_handler_list_simple], [ #include bool list(struct dentry *dentry) { return 0; } static const struct xattr_handler xops __attribute__ ((unused)) = { .list = list, }; ],[]) ZFS_LINUX_TEST_SRC([xattr_handler_list_xattr_handler], [ #include size_t list(const struct xattr_handler *handler, struct dentry *dentry, char *list, size_t list_size, const char *name, size_t name_len) { return 0; } static const struct xattr_handler xops __attribute__ ((unused)) = { .list = list, }; ],[]) ZFS_LINUX_TEST_SRC([xattr_handler_list_dentry], [ #include size_t list(struct dentry *dentry, char *list, size_t list_size, const char *name, size_t name_len, int handler_flags) { return 0; } static const struct xattr_handler xops __attribute__ ((unused)) = { .list = list, }; ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_LIST], [ dnl # 4.5 API change, dnl # The xattr_handler->list() callback was changed to take only a dnl # dentry and it only needs to return if it's accessible. AC_MSG_CHECKING([whether xattr_handler->list() wants simple]) ZFS_LINUX_TEST_RESULT([xattr_handler_list_simple], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_LIST_SIMPLE, 1, [xattr_handler->list() wants simple]) ],[ dnl # dnl # 4.4 API change, dnl # The xattr_handler->list() callback was changed to take a dnl # xattr_handler, and handler_flags argument was removed dnl # and should be accessed by handler->flags. dnl # AC_MSG_RESULT(no) AC_MSG_CHECKING( [whether xattr_handler->list() wants xattr_handler]) ZFS_LINUX_TEST_RESULT([xattr_handler_list_xattr_handler], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_LIST_HANDLER, 1, [xattr_handler->list() wants xattr_handler]) ],[ dnl # dnl # 2.6.33 API change, dnl # The xattr_handler->list() callback was changed dnl # to take a dentry instead of an inode, and a dnl # handler_flags argument was added. dnl # AC_MSG_RESULT(no) AC_MSG_CHECKING( [whether xattr_handler->list() wants dentry]) ZFS_LINUX_TEST_RESULT([xattr_handler_list_dentry], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_XATTR_LIST_DENTRY, 1, [xattr_handler->list() wants dentry]) ],[ ZFS_LINUX_TEST_ERROR([xattr list()]) ]) ]) ]) ]) dnl # dnl # 3.7 API change, dnl # The posix_acl_{from,to}_xattr functions gained a new dnl # parameter: user_ns dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_FROM_XATTR_USERNS], [ ZFS_LINUX_TEST_SRC([posix_acl_from_xattr_userns], [ #include #include #include ],[ posix_acl_from_xattr(&init_user_ns, NULL, 0); ]) ]) AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS], [ AC_MSG_CHECKING([whether posix_acl_from_xattr() needs user_ns]) ZFS_LINUX_TEST_RESULT([posix_acl_from_xattr_userns], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_POSIX_ACL_FROM_XATTR_USERNS, 1, [posix_acl_from_xattr() needs user_ns]) ],[ ZFS_LINUX_TEST_ERROR([posix_acl_from_xattr()]) ]) ]) dnl # dnl # 4.9 API change, dnl # iops->{set,get,remove}xattr and generic_{set,get,remove}xattr are dnl # removed. xattr operations will directly go through sb->s_xattr. dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_SETXATTR], [ ZFS_LINUX_TEST_SRC([have_generic_setxattr], [ #include #include static const struct inode_operations iops __attribute__ ((unused)) = { .setxattr = generic_setxattr }; ],[]) ]) AC_DEFUN([ZFS_AC_KERNEL_GENERIC_SETXATTR], [ AC_MSG_CHECKING([whether generic_setxattr() exists]) ZFS_LINUX_TEST_RESULT([have_generic_setxattr], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_GENERIC_SETXATTR, 1, [generic_setxattr() exists]) ],[ AC_MSG_RESULT(no) ]) ]) AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR], [ ZFS_AC_KERNEL_SRC_CONST_XATTR_HANDLER ZFS_AC_KERNEL_SRC_XATTR_HANDLER_NAME ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET ZFS_AC_KERNEL_SRC_XATTR_HANDLER_LIST ZFS_AC_KERNEL_SRC_POSIX_ACL_FROM_XATTR_USERNS ZFS_AC_KERNEL_SRC_GENERIC_SETXATTR ]) AC_DEFUN([ZFS_AC_KERNEL_XATTR], [ ZFS_AC_KERNEL_CONST_XATTR_HANDLER ZFS_AC_KERNEL_XATTR_HANDLER_NAME ZFS_AC_KERNEL_XATTR_HANDLER_GET ZFS_AC_KERNEL_XATTR_HANDLER_SET ZFS_AC_KERNEL_XATTR_HANDLER_LIST ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS ZFS_AC_KERNEL_GENERIC_SETXATTR ]) diff --git a/config/kernel.m4 b/config/kernel.m4 index f31be845f5d9..24db38f09f43 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -1,873 +1,879 @@ dnl # dnl # Default ZFS kernel configuration dnl # AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [ AM_COND_IF([BUILD_LINUX], [ dnl # Setup the kernel build environment. ZFS_AC_KERNEL ZFS_AC_QAT dnl # Sanity checks for module building and CONFIG_* defines ZFS_AC_KERNEL_TEST_MODULE ZFS_AC_KERNEL_CONFIG_DEFINED dnl # Sequential ZFS_LINUX_TRY_COMPILE tests ZFS_AC_KERNEL_FPU_HEADER ZFS_AC_KERNEL_OBJTOOL_HEADER ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T ZFS_AC_KERNEL_MISC_MINOR ZFS_AC_KERNEL_DECLARE_EVENT_CLASS dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests ZFS_AC_KERNEL_TEST_SRC ZFS_AC_KERNEL_TEST_RESULT AS_IF([test "$LINUX_OBJ" != "$LINUX"], [ KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ" ]) AC_SUBST(KERNEL_MAKE) ]) ]) dnl # dnl # Generate and compile all of the kernel API test cases to determine dnl # which interfaces are available. By invoking the kernel build system dnl # only once the compilation can be done in parallel significantly dnl # speeding up the process. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_OBJTOOL ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE ZFS_AC_KERNEL_SRC_PDE_DATA ZFS_AC_KERNEL_SRC_FALLOCATE ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_SRC_RWSEM ZFS_AC_KERNEL_SRC_SCHED ZFS_AC_KERNEL_SRC_USLEEP_RANGE ZFS_AC_KERNEL_SRC_KMEM_CACHE ZFS_AC_KERNEL_SRC_KVMALLOC ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_SRC_WAIT ZFS_AC_KERNEL_SRC_INODE_TIMES ZFS_AC_KERNEL_SRC_INODE_LOCK ZFS_AC_KERNEL_SRC_GROUP_INFO_GID ZFS_AC_KERNEL_SRC_RW ZFS_AC_KERNEL_SRC_TIMER_SETUP ZFS_AC_KERNEL_SRC_SUPER_USER_NS ZFS_AC_KERNEL_SRC_PROC_OPERATIONS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_SRC_BIO ZFS_AC_KERNEL_SRC_BLKDEV ZFS_AC_KERNEL_SRC_BLK_QUEUE ZFS_AC_KERNEL_SRC_REVALIDATE_DISK ZFS_AC_KERNEL_SRC_GET_DISK_RO ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_SRC_XATTR ZFS_AC_KERNEL_SRC_ACL ZFS_AC_KERNEL_SRC_INODE_GETATTR ZFS_AC_KERNEL_SRC_INODE_SET_FLAGS ZFS_AC_KERNEL_SRC_INODE_SET_IVERSION ZFS_AC_KERNEL_SRC_SHOW_OPTIONS ZFS_AC_KERNEL_SRC_FILE_INODE ZFS_AC_KERNEL_SRC_FILE_DENTRY ZFS_AC_KERNEL_SRC_FSYNC ZFS_AC_KERNEL_SRC_AIO_FSYNC ZFS_AC_KERNEL_SRC_EVICT_INODE ZFS_AC_KERNEL_SRC_DIRTY_INODE ZFS_AC_KERNEL_SRC_SHRINKER - ZFS_AC_KERNEL_SRC_MKDIR_UMODE_T + ZFS_AC_KERNEL_SRC_MKDIR ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS - ZFS_AC_KERNEL_SRC_CREATE_FLAGS + ZFS_AC_KERNEL_SRC_CREATE ZFS_AC_KERNEL_SRC_GET_LINK ZFS_AC_KERNEL_SRC_PUT_LINK ZFS_AC_KERNEL_SRC_TMPFILE ZFS_AC_KERNEL_SRC_AUTOMOUNT ZFS_AC_KERNEL_SRC_ENCODE_FH_WITH_INODE ZFS_AC_KERNEL_SRC_COMMIT_METADATA ZFS_AC_KERNEL_SRC_CLEAR_INODE ZFS_AC_KERNEL_SRC_SETATTR_PREPARE ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED ZFS_AC_KERNEL_SRC_DENTRY ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SRC_SECURITY_INODE ZFS_AC_KERNEL_SRC_FST_MOUNT ZFS_AC_KERNEL_SRC_BDI ZFS_AC_KERNEL_SRC_SET_NLINK ZFS_AC_KERNEL_SRC_SGET ZFS_AC_KERNEL_SRC_LSEEK_EXECUTE ZFS_AC_KERNEL_SRC_VFS_GETATTR ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_SRC_VFS_ITERATE ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_SRC_VFS_IOV_ITER ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT ZFS_AC_KERNEL_SRC_FPU ZFS_AC_KERNEL_SRC_FMODE_T ZFS_AC_KERNEL_SRC_KUIDGID_T ZFS_AC_KERNEL_SRC_KUID_HELPERS ZFS_AC_KERNEL_SRC_MODULE_PARAM_CALL_CONST - ZFS_AC_KERNEL_SRC_RENAME_WANTS_FLAGS + ZFS_AC_KERNEL_SRC_RENAME ZFS_AC_KERNEL_SRC_CURRENT_TIME ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES ZFS_AC_KERNEL_SRC_IN_COMPAT_SYSCALL ZFS_AC_KERNEL_SRC_KTIME ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES ZFS_AC_KERNEL_SRC_KSTRTOUL ZFS_AC_KERNEL_SRC_PERCPU ZFS_AC_KERNEL_SRC_CPU_HOTPLUG + ZFS_AC_KERNEL_SRC_GENERIC_FILLATTR_USERNS + ZFS_AC_KERNEL_SRC_MKNOD + ZFS_AC_KERNEL_SRC_SYMLINK AC_MSG_CHECKING([for available kernel interfaces]) ZFS_LINUX_TEST_COMPILE_ALL([kabi]) AC_MSG_RESULT([done]) ]) dnl # dnl # Check results of kernel interface tests. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_ACCESS_OK_TYPE ZFS_AC_KERNEL_GLOBAL_PAGE_STATE ZFS_AC_KERNEL_OBJTOOL ZFS_AC_KERNEL_PDE_DATA ZFS_AC_KERNEL_FALLOCATE ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE ZFS_AC_KERNEL_RWSEM ZFS_AC_KERNEL_SCHED ZFS_AC_KERNEL_USLEEP_RANGE ZFS_AC_KERNEL_KMEM_CACHE ZFS_AC_KERNEL_KVMALLOC ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL ZFS_AC_KERNEL_WAIT ZFS_AC_KERNEL_INODE_TIMES ZFS_AC_KERNEL_INODE_LOCK ZFS_AC_KERNEL_GROUP_INFO_GID ZFS_AC_KERNEL_RW ZFS_AC_KERNEL_TIMER_SETUP ZFS_AC_KERNEL_SUPER_USER_NS ZFS_AC_KERNEL_PROC_OPERATIONS ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS ZFS_AC_KERNEL_BIO ZFS_AC_KERNEL_BLKDEV ZFS_AC_KERNEL_BLK_QUEUE ZFS_AC_KERNEL_REVALIDATE_DISK ZFS_AC_KERNEL_GET_DISK_RO ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL ZFS_AC_KERNEL_DISCARD_GRANULARITY ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE ZFS_AC_KERNEL_XATTR ZFS_AC_KERNEL_ACL ZFS_AC_KERNEL_INODE_GETATTR ZFS_AC_KERNEL_INODE_SET_FLAGS ZFS_AC_KERNEL_INODE_SET_IVERSION ZFS_AC_KERNEL_SHOW_OPTIONS ZFS_AC_KERNEL_FILE_INODE ZFS_AC_KERNEL_FILE_DENTRY ZFS_AC_KERNEL_FSYNC ZFS_AC_KERNEL_AIO_FSYNC ZFS_AC_KERNEL_EVICT_INODE ZFS_AC_KERNEL_DIRTY_INODE ZFS_AC_KERNEL_SHRINKER - ZFS_AC_KERNEL_MKDIR_UMODE_T + ZFS_AC_KERNEL_MKDIR ZFS_AC_KERNEL_LOOKUP_FLAGS - ZFS_AC_KERNEL_CREATE_FLAGS + ZFS_AC_KERNEL_CREATE ZFS_AC_KERNEL_GET_LINK ZFS_AC_KERNEL_PUT_LINK ZFS_AC_KERNEL_TMPFILE ZFS_AC_KERNEL_AUTOMOUNT ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE ZFS_AC_KERNEL_COMMIT_METADATA ZFS_AC_KERNEL_CLEAR_INODE ZFS_AC_KERNEL_SETATTR_PREPARE ZFS_AC_KERNEL_INSERT_INODE_LOCKED ZFS_AC_KERNEL_DENTRY ZFS_AC_KERNEL_TRUNCATE_SETSIZE ZFS_AC_KERNEL_SECURITY_INODE ZFS_AC_KERNEL_FST_MOUNT ZFS_AC_KERNEL_BDI ZFS_AC_KERNEL_SET_NLINK ZFS_AC_KERNEL_SGET ZFS_AC_KERNEL_LSEEK_EXECUTE ZFS_AC_KERNEL_VFS_GETATTR ZFS_AC_KERNEL_VFS_FSYNC_2ARGS ZFS_AC_KERNEL_VFS_ITERATE ZFS_AC_KERNEL_VFS_DIRECT_IO ZFS_AC_KERNEL_VFS_RW_ITERATE ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS ZFS_AC_KERNEL_VFS_IOV_ITER ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS ZFS_AC_KERNEL_FOLLOW_DOWN_ONE ZFS_AC_KERNEL_MAKE_REQUEST_FN ZFS_AC_KERNEL_GENERIC_IO_ACCT ZFS_AC_KERNEL_FPU ZFS_AC_KERNEL_FMODE_T ZFS_AC_KERNEL_KUIDGID_T ZFS_AC_KERNEL_KUID_HELPERS ZFS_AC_KERNEL_MODULE_PARAM_CALL_CONST - ZFS_AC_KERNEL_RENAME_WANTS_FLAGS + ZFS_AC_KERNEL_RENAME ZFS_AC_KERNEL_CURRENT_TIME ZFS_AC_KERNEL_USERNS_CAPABILITIES ZFS_AC_KERNEL_IN_COMPAT_SYSCALL ZFS_AC_KERNEL_KTIME ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC ZFS_AC_KERNEL_TOTALHIGH_PAGES ZFS_AC_KERNEL_KSTRTOUL ZFS_AC_KERNEL_PERCPU ZFS_AC_KERNEL_CPU_HOTPLUG + ZFS_AC_KERNEL_GENERIC_FILLATTR_USERNS + ZFS_AC_KERNEL_MKNOD + ZFS_AC_KERNEL_SYMLINK ]) dnl # dnl # Detect name used for Module.symvers file in kernel dnl # AC_DEFUN([ZFS_AC_MODULE_SYMVERS], [ modpost=$LINUX/scripts/Makefile.modpost AC_MSG_CHECKING([kernel file name for module symbols]) AS_IF([test "x$enable_linux_builtin" != xyes -a -f "$modpost"], [ AS_IF([grep -q Modules.symvers $modpost], [ LINUX_SYMBOLS=Modules.symvers ], [ LINUX_SYMBOLS=Module.symvers ]) AS_IF([test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed. If you are building with a custom kernel, make sure *** the kernel is configured, built, and the '--with-linux=PATH' *** configure option refers to the location of the kernel source. ]) ]) ], [ LINUX_SYMBOLS=NONE ]) AC_MSG_RESULT($LINUX_SYMBOLS) AC_SUBST(LINUX_SYMBOLS) ]) dnl # dnl # Detect the kernel to be built against dnl # AC_DEFUN([ZFS_AC_KERNEL], [ AC_ARG_WITH([linux], AS_HELP_STRING([--with-linux=PATH], [Path to kernel source]), [kernelsrc="$withval"]) AC_ARG_WITH(linux-obj, AS_HELP_STRING([--with-linux-obj=PATH], [Path to kernel build objects]), [kernelbuild="$withval"]) AC_MSG_CHECKING([kernel source directory]) AS_IF([test -z "$kernelsrc"], [ AS_IF([test -e "/lib/modules/$(uname -r)/source"], [ headersdir="/lib/modules/$(uname -r)/source" sourcelink=$(readlink -f "$headersdir") ], [test -e "/lib/modules/$(uname -r)/build"], [ headersdir="/lib/modules/$(uname -r)/build" sourcelink=$(readlink -f "$headersdir") ], [ sourcelink=$(ls -1d /usr/src/kernels/* \ /usr/src/linux-* \ 2>/dev/null | grep -v obj | tail -1) ]) AS_IF([test -n "$sourcelink" && test -e ${sourcelink}], [ kernelsrc=`readlink -f ${sourcelink}` ], [ kernelsrc="[Not found]" ]) ], [ AS_IF([test "$kernelsrc" = "NONE"], [ kernsrcver=NONE ]) withlinux=yes ]) AC_MSG_RESULT([$kernelsrc]) AS_IF([test ! -d "$kernelsrc"], [ AC_MSG_ERROR([ *** Please make sure the kernel devel package for your distribution *** is installed and then try again. If that fails, you can specify the *** location of the kernel source with the '--with-linux=PATH' option.]) ]) AC_MSG_CHECKING([kernel build directory]) AS_IF([test -z "$kernelbuild"], [ AS_IF([test x$withlinux != xyes -a -e "/lib/modules/$(uname -r)/build"], [ kernelbuild=`readlink -f /lib/modules/$(uname -r)/build` ], [test -d ${kernelsrc}-obj/${target_cpu}/${target_cpu}], [ kernelbuild=${kernelsrc}-obj/${target_cpu}/${target_cpu} ], [test -d ${kernelsrc}-obj/${target_cpu}/default], [ kernelbuild=${kernelsrc}-obj/${target_cpu}/default ], [test -d `dirname ${kernelsrc}`/build-${target_cpu}], [ kernelbuild=`dirname ${kernelsrc}`/build-${target_cpu} ], [ kernelbuild=${kernelsrc} ]) ]) AC_MSG_RESULT([$kernelbuild]) AC_MSG_CHECKING([kernel source version]) utsrelease1=$kernelbuild/include/linux/version.h utsrelease2=$kernelbuild/include/linux/utsrelease.h utsrelease3=$kernelbuild/include/generated/utsrelease.h AS_IF([test -r $utsrelease1 && fgrep -q UTS_RELEASE $utsrelease1], [ utsrelease=$utsrelease1 ], [test -r $utsrelease2 && fgrep -q UTS_RELEASE $utsrelease2], [ utsrelease=$utsrelease2 ], [test -r $utsrelease3 && fgrep -q UTS_RELEASE $utsrelease3], [ utsrelease=$utsrelease3 ]) AS_IF([test -n "$utsrelease"], [ kernsrcver=$($AWK '/UTS_RELEASE/ { gsub(/"/, "", $[3]); print $[3] }' $utsrelease) AS_IF([test -z "$kernsrcver"], [ AC_MSG_RESULT([Not found]) AC_MSG_ERROR([ *** Cannot determine kernel version. ]) ]) ], [ AC_MSG_RESULT([Not found]) if test "x$enable_linux_builtin" != xyes; then AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. ]) else AC_MSG_ERROR([ *** Cannot find UTS_RELEASE definition. *** Please run 'make prepare' inside the kernel source tree.]) fi ]) AC_MSG_RESULT([$kernsrcver]) AS_VERSION_COMPARE([$kernsrcver], [$ZFS_META_KVER_MIN], [ AC_MSG_ERROR([ *** Cannot build against kernel version $kernsrcver. *** The minimum supported kernel version is $ZFS_META_KVER_MIN. ]) ]) LINUX=${kernelsrc} LINUX_OBJ=${kernelbuild} LINUX_VERSION=${kernsrcver} AC_SUBST(LINUX) AC_SUBST(LINUX_OBJ) AC_SUBST(LINUX_VERSION) ZFS_AC_MODULE_SYMVERS ]) dnl # dnl # Detect the QAT module to be built against, QAT provides hardware dnl # acceleration for data compression: dnl # dnl # https://01.org/intel-quickassist-technology dnl # dnl # 1) Download and install QAT driver from the above link dnl # 2) Start QAT driver in your system: dnl # service qat_service start dnl # 3) Enable QAT in ZFS, e.g.: dnl # ./configure --with-qat=/QAT1.6 dnl # make dnl # 4) Set GZIP compression in ZFS dataset: dnl # zfs set compression = gzip dnl # dnl # Then the data written to this ZFS pool is compressed by QAT accelerator dnl # automatically, and de-compressed by QAT when read from the pool. dnl # dnl # 1) Get QAT hardware statistics with: dnl # cat /proc/icp_dh895xcc_dev/qat dnl # 2) To disable QAT: dnl # insmod zfs.ko zfs_qat_disable=1 dnl # AC_DEFUN([ZFS_AC_QAT], [ AC_ARG_WITH([qat], AS_HELP_STRING([--with-qat=PATH], [Path to qat source]), AS_IF([test "$withval" = "yes"], AC_MSG_ERROR([--with-qat=PATH requires a PATH]), [qatsrc="$withval"])) AC_ARG_WITH([qat-obj], AS_HELP_STRING([--with-qat-obj=PATH], [Path to qat build objects]), [qatbuild="$withval"]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat source directory]) AC_MSG_RESULT([$qatsrc]) QAT_SRC="${qatsrc}/quickassist" AS_IF([ test ! -e "$QAT_SRC/include/cpa.h"], [ AC_MSG_ERROR([ *** Please make sure the qat driver package is installed *** and specify the location of the qat source with the *** '--with-qat=PATH' option then try again. Failed to *** find cpa.h in: ${QAT_SRC}/include]) ]) ]) AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat build directory]) AS_IF([test -z "$qatbuild"], [ qatbuild="${qatsrc}/build" ]) AC_MSG_RESULT([$qatbuild]) QAT_OBJ=${qatbuild} AS_IF([ ! test -e "$QAT_OBJ/icp_qa_al.ko" && ! test -e "$QAT_OBJ/qat_api.ko"], [ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find icp_qa_al.ko or qat_api.ko in: $QAT_OBJ]) ]) AC_SUBST(QAT_SRC) AC_SUBST(QAT_OBJ) AC_DEFINE(HAVE_QAT, 1, [qat is enabled and existed]) ]) dnl # dnl # Detect the name used for the QAT Module.symvers file. dnl # AS_IF([test ! -z "${qatsrc}"], [ AC_MSG_CHECKING([qat file for module symbols]) QAT_SYMBOLS=$QAT_SRC/lookaside/access_layer/src/Module.symvers AS_IF([test -r $QAT_SYMBOLS], [ AC_MSG_RESULT([$QAT_SYMBOLS]) AC_SUBST(QAT_SYMBOLS) ],[ AC_MSG_ERROR([ *** Please make sure the qat driver is installed then try again. *** Failed to find Module.symvers in: $QAT_SYMBOLS ]) ]) ]) ]) dnl # dnl # Basic toolchain sanity check. dnl # AC_DEFUN([ZFS_AC_KERNEL_TEST_MODULE], [ AC_MSG_CHECKING([whether modules can be built]) ZFS_LINUX_TRY_COMPILE([], [], [ AC_MSG_RESULT([yes]) ],[ AC_MSG_RESULT([no]) if test "x$enable_linux_builtin" != xyes; then AC_MSG_ERROR([ *** Unable to build an empty module. ]) else AC_MSG_ERROR([ *** Unable to build an empty module. *** Please run 'make scripts' inside the kernel source tree.]) fi ]) ]) dnl # dnl # ZFS_LINUX_CONFTEST_H dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_H], [ test -d build/$2 || mkdir -p build/$2 cat - <<_ACEOF >build/$2/$2.h $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_C dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_C], [ test -d build/$2 || mkdir -p build/$2 cat confdefs.h - <<_ACEOF >build/$2/$2.c $1 _ACEOF ]) dnl # dnl # ZFS_LINUX_CONFTEST_MAKEFILE dnl # dnl # $1 - test case name dnl # $2 - add to top-level Makefile dnl # $3 - additional build flags dnl # AC_DEFUN([ZFS_LINUX_CONFTEST_MAKEFILE], [ test -d build || mkdir -p build test -d build/$1 || mkdir -p build/$1 file=build/$1/Makefile dnl # Example command line to manually build source. cat - <<_ACEOF >$file # Example command line to manually build source # make modules -C $LINUX_OBJ $ARCH_UM M=$PWD/build/$1 ccflags-y := -Werror $FRAME_LARGER_THAN _ACEOF dnl # Additional custom CFLAGS as requested. m4_ifval($3, [echo "ccflags-y += $3" >>$file], []) dnl # Test case source echo "obj-m := $1.o" >>$file AS_IF([test "x$2" = "xyes"], [echo "obj-m += $1/" >>build/Makefile], []) ]) dnl # dnl # ZFS_LINUX_TEST_PROGRAM(C)([PROLOGUE], [BODY]) dnl # m4_define([ZFS_LINUX_TEST_PROGRAM], [ #include $1 int main (void) { $2 ; return 0; } MODULE_DESCRIPTION("conftest"); MODULE_AUTHOR(ZFS_META_AUTHOR); MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); MODULE_LICENSE($3); ]) dnl # dnl # ZFS_LINUX_TEST_REMOVE dnl # dnl # Removes the specified test source and results. dnl # AC_DEFUN([ZFS_LINUX_TEST_REMOVE], [ test -d build/$1 && rm -Rf build/$1 test -f build/Makefile && sed '/$1/d' build/Makefile ]) dnl # dnl # ZFS_LINUX_COMPILE dnl # dnl # $1 - build dir dnl # $2 - test command dnl # $3 - pass command dnl # $4 - fail command dnl # $5 - set KBUILD_MODPOST_NOFINAL='yes' dnl # $6 - set KBUILD_MODPOST_WARN='yes' dnl # dnl # Used internally by ZFS_LINUX_TEST_{COMPILE,MODPOST} dnl # AC_DEFUN([ZFS_LINUX_COMPILE], [ AC_TRY_COMMAND([ KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" make modules -k -j$TEST_JOBS -C $LINUX_OBJ $ARCH_UM M=$PWD/$1 >$1/build.log 2>&1]) AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) ]) dnl # dnl # ZFS_LINUX_TEST_COMPILE dnl # dnl # Perform a full compile excluding the final modpost phase. dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.compile.$1 mv $2/build.log $2/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to compile test source to determine kernel interfaces.]) ], [yes], []) ]) dnl # dnl # ZFS_LINUX_TEST_MODPOST dnl # dnl # Perform a full compile including the modpost phase. This may dnl # be an incremental build if the objects have already been built. dnl # AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [ ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ mv $2/Makefile $2/Makefile.modpost.$1 cat $2/build.log >>build/build.log.$1 ],[ AC_MSG_ERROR([ *** Unable to modpost test source to determine kernel interfaces.]) ], [], [yes]) ]) dnl # dnl # Perform the compilation of the test cases in two phases. dnl # dnl # Phase 1) attempt to build the object files for all of the tests dnl # defined by the ZFS_LINUX_TEST_SRC macro. But do not dnl # perform the final modpost stage. dnl # dnl # Phase 2) disable all tests which failed the initial compilation, dnl # then invoke the final modpost step for the remaining tests. dnl # dnl # This allows us efficiently build the test cases in parallel while dnl # remaining resilient to build failures which are expected when dnl # detecting the available kernel interfaces. dnl # dnl # The maximum allowed parallelism can be controlled by setting the dnl # TEST_JOBS environment variable. Otherwise, it default to $(nproc). dnl # AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [ dnl # Phase 1 - Compilation only, final linking is skipped. ZFS_LINUX_TEST_COMPILE([$1], [build]) dnl # dnl # Phase 2 - When building external modules disable test cases dnl # which failed to compile and invoke modpost to verify the dnl # final linking. dnl # dnl # Test names suffixed with '_license' call modpost independently dnl # to ensure that a single incompatibility does not result in the dnl # modpost phase exiting early. This check is not performed on dnl # every symbol since the majority are compatible and doing so dnl # would significantly slow down this phase. dnl # dnl # When configuring for builtin (--enable-linux-builtin) dnl # fake the linking step artificially create the expected .ko dnl # files for tests which did compile. This is required for dnl # kernels which do not have loadable module support or have dnl # not yet been built. dnl # AS_IF([test "x$enable_linux_builtin" = "xno"], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ AS_IF([test "${name##*_}" = "license"], [ ZFS_LINUX_TEST_MODPOST([$1], [build/$name]) echo "obj-n += $dir" >>build/Makefile ], [ echo "obj-m += $dir" >>build/Makefile ]) ], [ echo "obj-n += $dir" >>build/Makefile ]) done ZFS_LINUX_TEST_MODPOST([$1], [build]) ], [ for dir in $(awk '/^obj-m/ { print [$]3 }' \ build/Makefile.compile.$1); do name=${dir%/} AS_IF([test -f build/$name/$name.o], [ touch build/$name/$name.ko ]) done ]) ]) dnl # dnl # ZFS_LINUX_TEST_SRC dnl # dnl # $1 - name dnl # $2 - global dnl # $3 - source dnl # $4 - extra cflags dnl # $5 - check license-compatibility dnl # dnl # Check if the test source is buildable at all and then if it is dnl # license compatible. dnl # dnl # N.B because all of the test cases are compiled in parallel they dnl # must never depend on the results of previous tests. Each test dnl # needs to be entirely independent. dnl # AC_DEFUN([ZFS_LINUX_TEST_SRC], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]], [["Dual BSD/GPL"]])], [$1]) ZFS_LINUX_CONFTEST_MAKEFILE([$1], [yes], [$4]) AS_IF([ test -n "$5" ], [ ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM( [[$2]], [[$3]], [[$5]])], [$1_license]) ZFS_LINUX_CONFTEST_MAKEFILE([$1_license], [yes], [$4]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT dnl # dnl # $1 - name of a test source (ZFS_LINUX_TEST_SRC) dnl # $2 - run on success (valid .ko generated) dnl # $3 - run on failure (unable to compile) dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT], [ AS_IF([test -d build/$1], [ AS_IF([test -f build/$1/$1.ko], [$2], [$3]) ], [ AC_MSG_ERROR([ *** No matching source for the "$1" test, check that *** both the test source and result macros refer to the same name. ]) ]) ]) dnl # dnl # ZFS_LINUX_TEST_ERROR dnl # dnl # Generic error message which can be used when none of the expected dnl # kernel interfaces were detected. dnl # AC_DEFUN([ZFS_LINUX_TEST_ERROR], [ AC_MSG_ERROR([ *** None of the expected "$1" interfaces were detected. *** This may be because your kernel version is newer than what is *** supported, or you are using a patched custom kernel with *** incompatible modifications. *** *** ZFS Version: $ZFS_META_ALIAS *** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX ]) ]) dnl # dnl # ZFS_LINUX_TEST_RESULT_SYMBOL dnl # dnl # Like ZFS_LINUX_TEST_RESULT except ZFS_CHECK_SYMBOL_EXPORT is called to dnl # verify symbol exports, unless --enable-linux-builtin was provided to dnl # configure. dnl # AC_DEFUN([ZFS_LINUX_TEST_RESULT_SYMBOL], [ AS_IF([ ! test -f build/$1/$1.ko], [ $5 ], [ AS_IF([test "x$enable_linux_builtin" != "xyes"], [ ZFS_CHECK_SYMBOL_EXPORT([$2], [$3], [$4], [$5]) ], [ $4 ]) ]) ]) dnl # dnl # ZFS_LINUX_COMPILE_IFELSE dnl # AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [ ZFS_LINUX_TEST_REMOVE([conftest]) m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1], [conftest])]) m4_ifvaln([$5], [ZFS_LINUX_CONFTEST_H([$5], [conftest])], [ZFS_LINUX_CONFTEST_H([], [conftest])]) ZFS_LINUX_CONFTEST_MAKEFILE([conftest], [no], [m4_ifvaln([$5], [-I$PWD/build/conftest], [])]) ZFS_LINUX_COMPILE([build/conftest], [$2], [$3], [$4], [], []) ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE dnl # dnl # $1 - global dnl # $2 - source dnl # $3 - run on success (valid .ko generated) dnl # $4 - run on failure (unable to compile) dnl # dnl # When configuring as builtin (--enable-linux-builtin) for kernels dnl # without loadable module support (CONFIG_MODULES=n) only the object dnl # file is created. See ZFS_LINUX_TEST_COMPILE_ALL for details. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE], [ AS_IF([test "x$enable_linux_builtin" = "xyes"], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.o], [$3], [$4]) ], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4]) ]) ]) dnl # dnl # ZFS_CHECK_SYMBOL_EXPORT dnl # dnl # Check if a symbol is exported on not by consulting the symbols dnl # file, or optionally the source code. dnl # AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT], [ grep -q -E '[[[:space:]]]$1[[[:space:]]]' \ $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null rc=$? if test $rc -ne 0; then export=0 for file in $2; do grep -q -E "EXPORT_SYMBOL.*($1)" \ "$LINUX/$file" 2>/dev/null rc=$? if test $rc -eq 0; then export=1 break; fi done if test $export -eq 0; then : $4 else : $3 fi else : $3 fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_SYMBOL dnl # dnl # Like ZFS_LINUX_TRY_COMPILER except ZFS_CHECK_SYMBOL_EXPORT is called dnl # to verify symbol exports, unless --enable-linux-builtin was provided dnl # to configure. dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_SYMBOL], [ ZFS_LINUX_TRY_COMPILE([$1], [$2], [rc=0], [rc=1]) if test $rc -ne 0; then : $6 else if test "x$enable_linux_builtin" != xyes; then ZFS_CHECK_SYMBOL_EXPORT([$3], [$4], [rc=0], [rc=1]) fi if test $rc -ne 0; then : $6 else : $5 fi fi ]) dnl # dnl # ZFS_LINUX_TRY_COMPILE_HEADER dnl # like ZFS_LINUX_TRY_COMPILE, except the contents conftest.h are dnl # provided via the fifth parameter dnl # AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], [ ZFS_LINUX_COMPILE_IFELSE( [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]], [[ZFS_META_LICENSE]])], [test -f build/conftest/conftest.ko], [$3], [$4], [$5]) ]) diff --git a/include/os/linux/kernel/linux/vfs_compat.h b/include/os/linux/kernel/linux/vfs_compat.h index c35e80d31cd7..91e908598fbb 100644 --- a/include/os/linux/kernel/linux/vfs_compat.h +++ b/include/os/linux/kernel/linux/vfs_compat.h @@ -1,439 +1,461 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Copyright (C) 2015 Jörg Thalheim. */ #ifndef _ZFS_VFS_H #define _ZFS_VFS_H #include #include #include #include /* * 2.6.34 - 3.19, bdi_setup_and_register() takes 3 arguments. * 4.0 - 4.11, bdi_setup_and_register() takes 2 arguments. * 4.12 - x.y, super_setup_bdi_name() new interface. */ #if defined(HAVE_SUPER_SETUP_BDI_NAME) extern atomic_long_t zfs_bdi_seq; static inline int zpl_bdi_setup(struct super_block *sb, char *name) { return super_setup_bdi_name(sb, "%.28s-%ld", name, atomic_long_inc_return(&zfs_bdi_seq)); } static inline void zpl_bdi_destroy(struct super_block *sb) { } #elif defined(HAVE_2ARGS_BDI_SETUP_AND_REGISTER) static inline int zpl_bdi_setup(struct super_block *sb, char *name) { struct backing_dev_info *bdi; int error; bdi = kmem_zalloc(sizeof (struct backing_dev_info), KM_SLEEP); error = bdi_setup_and_register(bdi, name); if (error) { kmem_free(bdi, sizeof (struct backing_dev_info)); return (error); } sb->s_bdi = bdi; return (0); } static inline void zpl_bdi_destroy(struct super_block *sb) { struct backing_dev_info *bdi = sb->s_bdi; bdi_destroy(bdi); kmem_free(bdi, sizeof (struct backing_dev_info)); sb->s_bdi = NULL; } #elif defined(HAVE_3ARGS_BDI_SETUP_AND_REGISTER) static inline int zpl_bdi_setup(struct super_block *sb, char *name) { struct backing_dev_info *bdi; int error; bdi = kmem_zalloc(sizeof (struct backing_dev_info), KM_SLEEP); error = bdi_setup_and_register(bdi, name, BDI_CAP_MAP_COPY); if (error) { kmem_free(sb->s_bdi, sizeof (struct backing_dev_info)); return (error); } sb->s_bdi = bdi; return (0); } static inline void zpl_bdi_destroy(struct super_block *sb) { struct backing_dev_info *bdi = sb->s_bdi; bdi_destroy(bdi); kmem_free(bdi, sizeof (struct backing_dev_info)); sb->s_bdi = NULL; } #else #error "Unsupported kernel" #endif /* * 4.14 adds SB_* flag definitions, define them to MS_* equivalents * if not set. */ #ifndef SB_RDONLY #define SB_RDONLY MS_RDONLY #endif #ifndef SB_SILENT #define SB_SILENT MS_SILENT #endif #ifndef SB_ACTIVE #define SB_ACTIVE MS_ACTIVE #endif #ifndef SB_POSIXACL #define SB_POSIXACL MS_POSIXACL #endif #ifndef SB_MANDLOCK #define SB_MANDLOCK MS_MANDLOCK #endif #ifndef SB_NOATIME #define SB_NOATIME MS_NOATIME #endif /* * 3.5 API change, * The clear_inode() function replaces end_writeback() and introduces an * ordering change regarding when the inode_sync_wait() occurs. See the * configure check in config/kernel-clear-inode.m4 for full details. */ #if defined(HAVE_EVICT_INODE) && !defined(HAVE_CLEAR_INODE) #define clear_inode(ip) end_writeback(ip) #endif /* HAVE_EVICT_INODE && !HAVE_CLEAR_INODE */ #if defined(SEEK_HOLE) && defined(SEEK_DATA) && !defined(HAVE_LSEEK_EXECUTE) static inline loff_t lseek_execute( struct file *filp, struct inode *inode, loff_t offset, loff_t maxsize) { if (offset < 0 && !(filp->f_mode & FMODE_UNSIGNED_OFFSET)) return (-EINVAL); if (offset > maxsize) return (-EINVAL); if (offset != filp->f_pos) { spin_lock(&filp->f_lock); filp->f_pos = offset; filp->f_version = 0; spin_unlock(&filp->f_lock); } return (offset); } #endif /* SEEK_HOLE && SEEK_DATA && !HAVE_LSEEK_EXECUTE */ #if defined(CONFIG_FS_POSIX_ACL) /* * These functions safely approximates the behavior of posix_acl_release() * which cannot be used because it calls the GPL-only symbol kfree_rcu(). * The in-kernel version, which can access the RCU, frees the ACLs after * the grace period expires. Because we're unsure how long that grace * period may be this implementation conservatively delays for 60 seconds. * This is several orders of magnitude larger than expected grace period. * At 60 seconds the kernel will also begin issuing RCU stall warnings. */ #include #if defined(HAVE_POSIX_ACL_RELEASE) && !defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY) #define zpl_posix_acl_release(arg) posix_acl_release(arg) #else void zpl_posix_acl_release_impl(struct posix_acl *); static inline void zpl_posix_acl_release(struct posix_acl *acl) { if ((acl == NULL) || (acl == ACL_NOT_CACHED)) return; #ifdef HAVE_ACL_REFCOUNT if (refcount_dec_and_test(&acl->a_refcount)) zpl_posix_acl_release_impl(acl); #else if (atomic_dec_and_test(&acl->a_refcount)) zpl_posix_acl_release_impl(acl); #endif } #endif /* HAVE_POSIX_ACL_RELEASE */ #ifdef HAVE_SET_CACHED_ACL_USABLE #define zpl_set_cached_acl(ip, ty, n) set_cached_acl(ip, ty, n) #define zpl_forget_cached_acl(ip, ty) forget_cached_acl(ip, ty) #else static inline void zpl_set_cached_acl(struct inode *ip, int type, struct posix_acl *newer) { struct posix_acl *older = NULL; spin_lock(&ip->i_lock); if ((newer != ACL_NOT_CACHED) && (newer != NULL)) posix_acl_dup(newer); switch (type) { case ACL_TYPE_ACCESS: older = ip->i_acl; rcu_assign_pointer(ip->i_acl, newer); break; case ACL_TYPE_DEFAULT: older = ip->i_default_acl; rcu_assign_pointer(ip->i_default_acl, newer); break; } spin_unlock(&ip->i_lock); zpl_posix_acl_release(older); } static inline void zpl_forget_cached_acl(struct inode *ip, int type) { zpl_set_cached_acl(ip, type, (struct posix_acl *)ACL_NOT_CACHED); } #endif /* HAVE_SET_CACHED_ACL_USABLE */ /* * 3.1 API change, * posix_acl_chmod() was added as the preferred interface. * * 3.14 API change, * posix_acl_chmod() was changed to __posix_acl_chmod() */ #ifndef HAVE___POSIX_ACL_CHMOD #ifdef HAVE_POSIX_ACL_CHMOD #define __posix_acl_chmod(acl, gfp, mode) posix_acl_chmod(acl, gfp, mode) #define __posix_acl_create(acl, gfp, mode) posix_acl_create(acl, gfp, mode) #else #error "Unsupported kernel" #endif /* HAVE_POSIX_ACL_CHMOD */ #endif /* HAVE___POSIX_ACL_CHMOD */ /* * 4.8 API change, * posix_acl_valid() now must be passed a namespace, the namespace from * from super block associated with the given inode is used for this purpose. */ #ifdef HAVE_POSIX_ACL_VALID_WITH_NS #define zpl_posix_acl_valid(ip, acl) posix_acl_valid(ip->i_sb->s_user_ns, acl) #else #define zpl_posix_acl_valid(ip, acl) posix_acl_valid(acl) #endif #endif /* CONFIG_FS_POSIX_ACL */ /* * 3.19 API change * struct access f->f_dentry->d_inode was replaced by accessor function * file_inode(f) */ #ifndef HAVE_FILE_INODE static inline struct inode *file_inode(const struct file *f) { return (f->f_dentry->d_inode); } #endif /* HAVE_FILE_INODE */ /* * 4.1 API change * struct access file->f_path.dentry was replaced by accessor function * file_dentry(f) */ #ifndef HAVE_FILE_DENTRY static inline struct dentry *file_dentry(const struct file *f) { return (f->f_path.dentry); } #endif /* HAVE_FILE_DENTRY */ static inline uid_t zfs_uid_read_impl(struct inode *ip) { #ifdef HAVE_SUPER_USER_NS return (from_kuid(ip->i_sb->s_user_ns, ip->i_uid)); #else return (from_kuid(kcred->user_ns, ip->i_uid)); #endif } static inline uid_t zfs_uid_read(struct inode *ip) { return (zfs_uid_read_impl(ip)); } static inline gid_t zfs_gid_read_impl(struct inode *ip) { #ifdef HAVE_SUPER_USER_NS return (from_kgid(ip->i_sb->s_user_ns, ip->i_gid)); #else return (from_kgid(kcred->user_ns, ip->i_gid)); #endif } static inline gid_t zfs_gid_read(struct inode *ip) { return (zfs_gid_read_impl(ip)); } static inline void zfs_uid_write(struct inode *ip, uid_t uid) { #ifdef HAVE_SUPER_USER_NS ip->i_uid = make_kuid(ip->i_sb->s_user_ns, uid); #else ip->i_uid = make_kuid(kcred->user_ns, uid); #endif } static inline void zfs_gid_write(struct inode *ip, gid_t gid) { #ifdef HAVE_SUPER_USER_NS ip->i_gid = make_kgid(ip->i_sb->s_user_ns, gid); #else ip->i_gid = make_kgid(kcred->user_ns, gid); #endif } /* * 4.9 API change */ -#ifndef HAVE_SETATTR_PREPARE +#if !(defined(HAVE_SETATTR_PREPARE_NO_USERNS) || \ + defined(HAVE_SETATTR_PREPARE_USERNS)) static inline int setattr_prepare(struct dentry *dentry, struct iattr *ia) { return (inode_change_ok(dentry->d_inode, ia)); } #endif /* * 4.11 API change * These macros are defined by kernel 4.11. We define them so that the same * code builds under kernels < 4.11 and >= 4.11. The macros are set to 0 so * that it will create obvious failures if they are accidentally used when built * against a kernel >= 4.11. */ #ifndef STATX_BASIC_STATS #define STATX_BASIC_STATS 0 #endif #ifndef AT_STATX_SYNC_AS_STAT #define AT_STATX_SYNC_AS_STAT 0 #endif /* * 4.11 API change * 4.11 takes struct path *, < 4.11 takes vfsmount * */ #ifdef HAVE_VFSMOUNT_IOPS_GETATTR #define ZPL_GETATTR_WRAPPER(func) \ static int \ func(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) \ { \ struct path path = { .mnt = mnt, .dentry = dentry }; \ return func##_impl(&path, stat, STATX_BASIC_STATS, \ AT_STATX_SYNC_AS_STAT); \ } #elif defined(HAVE_PATH_IOPS_GETATTR) #define ZPL_GETATTR_WRAPPER(func) \ static int \ func(const struct path *path, struct kstat *stat, u32 request_mask, \ unsigned int query_flags) \ { \ return (func##_impl(path, stat, request_mask, query_flags)); \ } +#elif defined(HAVE_USERNS_IOPS_GETATTR) +#define ZPL_GETATTR_WRAPPER(func) \ +static int \ +func(struct user_namespace *user_ns, const struct path *path, \ + struct kstat *stat, u32 request_mask, unsigned int query_flags) \ +{ \ + return (func##_impl(user_ns, path, stat, request_mask, \ + query_flags)); \ +} #else #error #endif /* * 4.9 API change * Preferred interface to get the current FS time. */ #if !defined(HAVE_CURRENT_TIME) static inline struct timespec current_time(struct inode *ip) { return (timespec_trunc(current_kernel_time(), ip->i_sb->s_time_gran)); } #endif /* * 4.16 API change * Added iversion interface for managing inode version field. */ #ifdef HAVE_INODE_SET_IVERSION #include #else static inline void inode_set_iversion(struct inode *ip, u64 val) { ip->i_version = val; } #endif /* * Returns true when called in the context of a 32-bit system call. */ static inline int zpl_is_32bit_api(void) { #ifdef CONFIG_COMPAT #ifdef HAVE_IN_COMPAT_SYSCALL return (in_compat_syscall()); #else return (is_compat_task()); #endif #else return (BITS_PER_LONG == 32); #endif } +/* + * 5.12 API change + * To support id-mapped mounts, generic_fillattr() was modified to + * accept a new struct user_namespace* as its first arg. + */ +#ifdef HAVE_GENERIC_FILLATTR_USERNS +#define zpl_generic_fillattr(user_ns, ip, sp) \ + generic_fillattr(user_ns, ip, sp) +#else +#define zpl_generic_fillattr(user_ns, ip, sp) generic_fillattr(ip, sp) +#endif + #endif /* _ZFS_VFS_H */ diff --git a/include/os/linux/kernel/linux/xattr_compat.h b/include/os/linux/kernel/linux/xattr_compat.h index 8348e99198af..54690727eab9 100644 --- a/include/os/linux/kernel/linux/xattr_compat.h +++ b/include/os/linux/kernel/linux/xattr_compat.h @@ -1,184 +1,199 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. */ #ifndef _ZFS_XATTR_H #define _ZFS_XATTR_H #include /* * 2.6.35 API change, * The const keyword was added to the 'struct xattr_handler' in the * generic Linux super_block structure. To handle this we define an * appropriate xattr_handler_t typedef which can be used. This was * the preferred solution because it keeps the code clean and readable. */ typedef const struct xattr_handler xattr_handler_t; /* * 4.5 API change, */ #if defined(HAVE_XATTR_LIST_SIMPLE) #define ZPL_XATTR_LIST_WRAPPER(fn) \ static bool \ fn(struct dentry *dentry) \ { \ return (!!__ ## fn(dentry->d_inode, NULL, 0, NULL, 0)); \ } /* * 4.4 API change, */ #elif defined(HAVE_XATTR_LIST_DENTRY) #define ZPL_XATTR_LIST_WRAPPER(fn) \ static size_t \ fn(struct dentry *dentry, char *list, size_t list_size, \ const char *name, size_t name_len, int type) \ { \ return (__ ## fn(dentry->d_inode, \ list, list_size, name, name_len)); \ } /* * 2.6.33 API change, */ #elif defined(HAVE_XATTR_LIST_HANDLER) #define ZPL_XATTR_LIST_WRAPPER(fn) \ static size_t \ fn(const struct xattr_handler *handler, struct dentry *dentry, \ char *list, size_t list_size, const char *name, size_t name_len) \ { \ return (__ ## fn(dentry->d_inode, \ list, list_size, name, name_len)); \ } #else #error "Unsupported kernel" #endif /* * 4.7 API change, * The xattr_handler->get() callback was changed to take a both dentry and * inode, because the dentry might not be attached to an inode yet. */ #if defined(HAVE_XATTR_GET_DENTRY_INODE) #define ZPL_XATTR_GET_WRAPPER(fn) \ static int \ fn(const struct xattr_handler *handler, struct dentry *dentry, \ struct inode *inode, const char *name, void *buffer, size_t size) \ { \ return (__ ## fn(inode, name, buffer, size)); \ } /* * 4.4 API change, * The xattr_handler->get() callback was changed to take a xattr_handler, * and handler_flags argument was removed and should be accessed by * handler->flags. */ #elif defined(HAVE_XATTR_GET_HANDLER) #define ZPL_XATTR_GET_WRAPPER(fn) \ static int \ fn(const struct xattr_handler *handler, struct dentry *dentry, \ const char *name, void *buffer, size_t size) \ { \ return (__ ## fn(dentry->d_inode, name, buffer, size)); \ } /* * 2.6.33 API change, * The xattr_handler->get() callback was changed to take a dentry * instead of an inode, and a handler_flags argument was added. */ #elif defined(HAVE_XATTR_GET_DENTRY) #define ZPL_XATTR_GET_WRAPPER(fn) \ static int \ fn(struct dentry *dentry, const char *name, void *buffer, size_t size, \ int unused_handler_flags) \ { \ return (__ ## fn(dentry->d_inode, name, buffer, size)); \ } #else #error "Unsupported kernel" #endif +/* + * 5.12 API change, + * The xattr_handler->set() callback was changed to take the + * struct user_namespace* as the first arg, to support idmapped + * mounts. + */ +#if defined(HAVE_XATTR_SET_USERNS) +#define ZPL_XATTR_SET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct user_namespace *user_ns, \ + struct dentry *dentry, struct inode *inode, const char *name, \ + const void *buffer, size_t size, int flags) \ +{ \ + return (__ ## fn(inode, name, buffer, size, flags)); \ +} /* * 4.7 API change, * The xattr_handler->set() callback was changed to take a both dentry and * inode, because the dentry might not be attached to an inode yet. */ -#if defined(HAVE_XATTR_SET_DENTRY_INODE) +#elif defined(HAVE_XATTR_SET_DENTRY_INODE) #define ZPL_XATTR_SET_WRAPPER(fn) \ static int \ fn(const struct xattr_handler *handler, struct dentry *dentry, \ struct inode *inode, const char *name, const void *buffer, \ size_t size, int flags) \ { \ return (__ ## fn(inode, name, buffer, size, flags)); \ } /* * 4.4 API change, * The xattr_handler->set() callback was changed to take a xattr_handler, * and handler_flags argument was removed and should be accessed by * handler->flags. */ #elif defined(HAVE_XATTR_SET_HANDLER) #define ZPL_XATTR_SET_WRAPPER(fn) \ static int \ fn(const struct xattr_handler *handler, struct dentry *dentry, \ const char *name, const void *buffer, size_t size, int flags) \ { \ return (__ ## fn(dentry->d_inode, name, buffer, size, flags)); \ } /* * 2.6.33 API change, * The xattr_handler->set() callback was changed to take a dentry * instead of an inode, and a handler_flags argument was added. */ #elif defined(HAVE_XATTR_SET_DENTRY) #define ZPL_XATTR_SET_WRAPPER(fn) \ static int \ fn(struct dentry *dentry, const char *name, const void *buffer, \ size_t size, int flags, int unused_handler_flags) \ { \ return (__ ## fn(dentry->d_inode, name, buffer, size, flags)); \ } #else #error "Unsupported kernel" #endif /* * Linux 3.7 API change. posix_acl_{from,to}_xattr gained the user_ns * parameter. All callers are expected to pass the &init_user_ns which * is available through the init credential (kcred). */ static inline struct posix_acl * zpl_acl_from_xattr(const void *value, int size) { return (posix_acl_from_xattr(kcred->user_ns, value, size)); } static inline int zpl_acl_to_xattr(struct posix_acl *acl, void *value, int size) { return (posix_acl_to_xattr(kcred->user_ns, acl, value, size)); } #endif /* _ZFS_XATTR_H */ diff --git a/include/os/linux/zfs/sys/zfs_vnops_os.h b/include/os/linux/zfs/sys/zfs_vnops_os.h index ef76de3e2981..47f91e4a6cf4 100644 --- a/include/os/linux/zfs/sys/zfs_vnops_os.h +++ b/include/os/linux/zfs/sys/zfs_vnops_os.h @@ -1,82 +1,83 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. */ #ifndef _SYS_FS_ZFS_VNOPS_OS_H #define _SYS_FS_ZFS_VNOPS_OS_H #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif extern int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr); extern int zfs_close(struct inode *ip, int flag, cred_t *cr); extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *resid); extern int zfs_lookup(znode_t *dzp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp); extern int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp); extern int zfs_tmpfile(struct inode *dip, vattr_t *vapzfs, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp); extern int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags); extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp); extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags); extern int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr); -extern int zfs_getattr_fast(struct inode *ip, struct kstat *sp); +extern int zfs_getattr_fast(struct user_namespace *, struct inode *ip, + struct kstat *sp); extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr); extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags); extern int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags); extern int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr); extern int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags); extern void zfs_inactive(struct inode *ip); extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr); extern int zfs_fid(struct inode *ip, fid_t *fidp); extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); extern int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc); extern int zfs_dirty_inode(struct inode *ip, int flags); extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags); extern void zfs_zrele_async(znode_t *zp); #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_VNOPS_H */ diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index b0bb9c29c0b4..21825d1f378e 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -1,174 +1,192 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. */ #ifndef _SYS_ZPL_H #define _SYS_ZPL_H #include #include #include #include #include #include #include #include #include #include #include /* zpl_inode.c */ extern void zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr); extern const struct inode_operations zpl_inode_operations; extern const struct inode_operations zpl_dir_inode_operations; extern const struct inode_operations zpl_symlink_inode_operations; extern const struct inode_operations zpl_special_inode_operations; extern dentry_operations_t zpl_dentry_operations; extern const struct address_space_operations zpl_address_space_operations; extern const struct file_operations zpl_file_operations; extern const struct file_operations zpl_dir_file_operations; /* zpl_super.c */ extern void zpl_prune_sb(int64_t nr_to_scan, void *arg); extern const struct super_operations zpl_super_operations; extern const struct export_operations zpl_export_operations; extern struct file_system_type zpl_fs_type; /* zpl_xattr.c */ extern ssize_t zpl_xattr_list(struct dentry *dentry, char *buf, size_t size); extern int zpl_xattr_security_init(struct inode *ip, struct inode *dip, const struct qstr *qstr); #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) extern int zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type); #endif /* HAVE_SET_ACL */ extern struct posix_acl *zpl_get_acl(struct inode *ip, int type); extern int zpl_init_acl(struct inode *ip, struct inode *dir); extern int zpl_chmod_acl(struct inode *ip); #else static inline int zpl_init_acl(struct inode *ip, struct inode *dir) { return (0); } static inline int zpl_chmod_acl(struct inode *ip) { return (0); } #endif /* CONFIG_FS_POSIX_ACL */ extern xattr_handler_t *zpl_xattr_handlers[]; /* zpl_ctldir.c */ extern const struct file_operations zpl_fops_root; extern const struct inode_operations zpl_ops_root; extern const struct file_operations zpl_fops_snapdir; extern const struct inode_operations zpl_ops_snapdir; extern const struct dentry_operations zpl_dops_snapdirs; extern const struct file_operations zpl_fops_shares; extern const struct inode_operations zpl_ops_shares; #if defined(HAVE_VFS_ITERATE) || defined(HAVE_VFS_ITERATE_SHARED) #define ZPL_DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ .actor = _actor, \ .pos = _pos, \ } typedef struct dir_context zpl_dir_context_t; #define zpl_dir_emit dir_emit #define zpl_dir_emit_dot dir_emit_dot #define zpl_dir_emit_dotdot dir_emit_dotdot #define zpl_dir_emit_dots dir_emit_dots #else typedef struct zpl_dir_context { void *dirent; const filldir_t actor; loff_t pos; } zpl_dir_context_t; #define ZPL_DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ .dirent = _dirent, \ .actor = _actor, \ .pos = _pos, \ } static inline bool zpl_dir_emit(zpl_dir_context_t *ctx, const char *name, int namelen, uint64_t ino, unsigned type) { return (!ctx->actor(ctx->dirent, name, namelen, ctx->pos, ino, type)); } static inline bool zpl_dir_emit_dot(struct file *file, zpl_dir_context_t *ctx) { return (ctx->actor(ctx->dirent, ".", 1, ctx->pos, file_inode(file)->i_ino, DT_DIR) == 0); } static inline bool zpl_dir_emit_dotdot(struct file *file, zpl_dir_context_t *ctx) { return (ctx->actor(ctx->dirent, "..", 2, ctx->pos, parent_ino(file_dentry(file)), DT_DIR) == 0); } static inline bool zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx) { if (ctx->pos == 0) { if (!zpl_dir_emit_dot(file, ctx)) return (false); ctx->pos = 1; } if (ctx->pos == 1) { if (!zpl_dir_emit_dotdot(file, ctx)) return (false); ctx->pos = 2; } return (true); } #endif /* HAVE_VFS_ITERATE */ #if defined(HAVE_INODE_TIMESTAMP_TRUNCATE) #define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip) #elif defined(HAVE_INODE_TIMESPEC64_TIMES) #define zpl_inode_timestamp_truncate(ts, ip) \ timespec64_trunc(ts, (ip)->i_sb->s_time_gran) #else #define zpl_inode_timestamp_truncate(ts, ip) \ timespec_trunc(ts, (ip)->i_sb->s_time_gran) #endif +#if defined(HAVE_INODE_OWNER_OR_CAPABLE) +#define zpl_inode_owner_or_capable(ns, ip) inode_owner_or_capable(ip) +#elif defined(HAVE_INODE_OWNER_OR_CAPABLE_IDMAPPED) +#define zpl_inode_owner_or_capable(ns, ip) inode_owner_or_capable(ns, ip) +#else +#error "Unsupported kernel" +#endif + +#ifdef HAVE_SETATTR_PREPARE_USERNS +#define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(ns, dentry, ia) +#else +/* + * Use kernel-provided version, or our own from + * linux/vfs_compat.h + */ +#define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(dentry, ia) +#endif + #endif /* _SYS_ZPL_H */ diff --git a/module/os/linux/zfs/policy.c b/module/os/linux/zfs/policy.c index 8780d7f6c70a..bbccb2e572d9 100644 --- a/module/os/linux/zfs/policy.c +++ b/module/os/linux/zfs/policy.c @@ -1,375 +1,375 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2013, Joyent, Inc. All rights reserved. * Copyright (C) 2016 Lawrence Livermore National Security, LLC. * * For Linux the vast majority of this enforcement is already handled via * the standard Linux VFS permission checks. However certain administrative * commands which bypass the standard mechanisms may need to make use of * this functionality. */ #include #include #include /* * The passed credentials cannot be directly verified because Linux only * provides and interface to check the *current* process credentials. In * order to handle this the capable() test is only run when the passed * credentials match the current process credentials or the kcred. In * all other cases this function must fail and return the passed err. */ static int priv_policy_ns(const cred_t *cr, int capability, int err, struct user_namespace *ns) { if (cr != CRED() && (cr != kcred)) return (err); #if defined(CONFIG_USER_NS) if (!(ns ? ns_capable(ns, capability) : capable(capability))) #else if (!capable(capability)) #endif return (err); return (0); } static int priv_policy(const cred_t *cr, int capability, int err) { return (priv_policy_ns(cr, capability, err, NULL)); } static int priv_policy_user(const cred_t *cr, int capability, int err) { /* * All priv_policy_user checks are preceded by kuid/kgid_has_mapping() * checks. If we cannot do them, we shouldn't be using ns_capable() * since we don't know whether the affected files are valid in our * namespace. */ #if defined(CONFIG_USER_NS) return (priv_policy_ns(cr, capability, err, cr->user_ns)); #else return (priv_policy_ns(cr, capability, err, NULL)); #endif } /* * Checks for operations that are either client-only or are used by * both clients and servers. */ int secpolicy_nfs(const cred_t *cr) { return (priv_policy(cr, CAP_SYS_ADMIN, EPERM)); } /* * Catch all system configuration. */ int secpolicy_sys_config(const cred_t *cr, boolean_t checkonly) { return (priv_policy(cr, CAP_SYS_ADMIN, EPERM)); } /* * Like secpolicy_vnode_access() but we get the actual wanted mode and the * current mode of the file, not the missing bits. * * Enforced in the Linux VFS. */ int secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner, mode_t curmode, mode_t wantmode) { return (0); } /* * This is a special routine for ZFS; it is used to determine whether * any of the privileges in effect allow any form of access to the * file. There's no reason to audit this or any reason to record * this. More work is needed to do the "KPLD" stuff. */ int secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner) { if (crgetfsuid(cr) == owner) return (0); - if (inode_owner_or_capable(ip)) + if (zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (0); #if defined(CONFIG_USER_NS) if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) return (EPERM); #endif if (priv_policy_user(cr, CAP_DAC_OVERRIDE, EPERM) == 0) return (0); if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, EPERM) == 0) return (0); return (EPERM); } /* * Determine if subject can chown owner of a file. */ int secpolicy_vnode_chown(const cred_t *cr, uid_t owner) { if (crgetfsuid(cr) == owner) return (0); #if defined(CONFIG_USER_NS) if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) return (EPERM); #endif return (priv_policy_user(cr, CAP_FOWNER, EPERM)); } /* * Determine if subject can change group ownership of a file. */ int secpolicy_vnode_create_gid(const cred_t *cr) { return (priv_policy(cr, CAP_SETGID, EPERM)); } /* * Policy determines whether we can remove an entry from a directory, * regardless of permission bits. */ int secpolicy_vnode_remove(const cred_t *cr) { return (priv_policy(cr, CAP_FOWNER, EPERM)); } /* * Determine that subject can modify the mode of a file. allzone privilege * needed when modifying root owned object. */ int secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) { if (crgetfsuid(cr) == owner) return (0); #if defined(CONFIG_USER_NS) if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) return (EPERM); #endif return (priv_policy_user(cr, CAP_FOWNER, EPERM)); } /* * Are we allowed to retain the set-uid/set-gid bits when * changing ownership or when writing to a file? * "issuid" should be true when set-uid; only in that case * root ownership is checked (setgid is assumed). * * Enforced in the Linux VFS. */ int secpolicy_vnode_setid_retain(struct znode *zp __maybe_unused, const cred_t *cr, boolean_t issuidroot) { return (priv_policy_user(cr, CAP_FSETID, EPERM)); } /* * Determine that subject can set the file setgid flag. */ int secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) { #if defined(CONFIG_USER_NS) if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid))) return (EPERM); #endif if (crgetfsgid(cr) != gid && !groupmember(gid, cr)) return (priv_policy_user(cr, CAP_FSETID, EPERM)); return (0); } /* * Determine if the subject can inject faults in the ZFS fault injection * framework. Requires all privileges. */ int secpolicy_zinject(const cred_t *cr) { return (priv_policy(cr, CAP_SYS_ADMIN, EACCES)); } /* * Determine if the subject has permission to manipulate ZFS datasets * (not pools). Equivalent to the SYS_MOUNT privilege. */ int secpolicy_zfs(const cred_t *cr) { return (priv_policy(cr, CAP_SYS_ADMIN, EACCES)); } /* * Equivalent to secpolicy_zfs(), but works even if the cred_t is not that of * the current process. Takes both cred_t and proc_t so that this can work * easily on all platforms. * * The has_capability() function was first exported in the 4.10 Linux kernel * then backported to some LTS kernels. Prior to this change there was no * mechanism to perform this check therefore EACCES is returned when the * functionality is not present in the kernel. */ int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc) { #if defined(HAVE_HAS_CAPABILITY) if (!has_capability(proc, CAP_SYS_ADMIN)) return (EACCES); return (0); #else return (EACCES); #endif } void secpolicy_setid_clear(vattr_t *vap, cred_t *cr) { if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(NULL, cr, (vap->va_mode & S_ISUID) != 0 && (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) { vap->va_mask |= AT_MODE; vap->va_mode &= ~(S_ISUID|S_ISGID); } } /* * Determine that subject can set the file setid flags. */ static int secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner) { if (crgetfsuid(cr) == owner) return (0); #if defined(CONFIG_USER_NS) if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) return (EPERM); #endif return (priv_policy_user(cr, CAP_FSETID, EPERM)); } /* * Determine that subject can make a file a "sticky". * * Enforced in the Linux VFS. */ static int secpolicy_vnode_stky_modify(const cred_t *cr) { return (0); } int secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, const vattr_t *ovap, cred_t *cr) { int error; if ((vap->va_mode & S_ISUID) != 0 && (error = secpolicy_vnode_setid_modify(cr, ovap->va_uid)) != 0) { return (error); } /* * Check privilege if attempting to set the * sticky bit on a non-directory. */ if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 && secpolicy_vnode_stky_modify(cr) != 0) { vap->va_mode &= ~S_ISVTX; } /* * Check for privilege if attempting to set the * group-id bit. */ if ((vap->va_mode & S_ISGID) != 0 && secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) { vap->va_mode &= ~S_ISGID; } return (0); } /* * Check privileges for setting xvattr attributes */ int secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, mode_t type) { return (secpolicy_vnode_chown(cr, owner)); } /* * Check privileges for setattr attributes. * * Enforced in the Linux VFS. */ int secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap, const struct vattr *ovap, int flags, int unlocked_access(void *, int, cred_t *), void *node) { return (0); } /* * Check privileges for links. * * Enforced in the Linux VFS. */ int secpolicy_basic_link(const cred_t *cr) { return (0); } diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 84c33b541ea3..8aeed6f568cf 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1,4010 +1,4011 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) zrele() should always be the last thing except for zil_commit() (if * necessary) and ZFS_EXIT(). This is for 3 reasons: First, if it's the * last reference, the vnode/znode can be freed, so the zp may point to * freed memory. Second, the last reference will call zfs_zinactive(), * which may induce a lot of work -- pushing cached pages (which acquires * range locks) and syncing out cached atime changes. Third, * zfs_zinactive() may require a new tx, which could deadlock the system * if you were already holding one. This deadlock occurs because the tx * currently being operated on prevents a txg from syncing, which * prevents the new tx from progressing, resulting in a deadlock. If you * must call zrele() within a tx, use zfs_zrele_async(). Note that iput() * is a synonym for zrele(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lock(&dl, ...) // lock directory entry (may igrab()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * zrele(...); // release held znodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* * Virus scanning is unsupported. It would be possible to add a hook * here to performance the required virus scan. This could be done * entirely in the kernel or potentially as an update to invoke a * scanning utility. */ static int zfs_vscan(struct inode *ip, cred_t *cr, int async) { return (0); } /* ARGSUSED */ int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Honor ZFS_APPENDONLY file attribute */ if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Virus scan eligible files on open */ if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (zfs_vscan(ip, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & O_SYNC) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ int zfs_close(struct inode *ip, int flag, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if (flag & O_SYNC) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(zfs_vscan(ip, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } #if defined(_KERNEL) /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ void update_pages(znode_t *zp, int64_t start, int len, objset_t *os) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; uint64_t nbytes; int64_t off; void *pb; off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { nbytes = MIN(PAGE_SIZE - off, len); pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { if (mapping_writably_mapped(mp)) flush_dcache_page(pp); pb = kmap(pp); (void) dmu_read(os, zp->z_id, start + off, nbytes, pb + off, DMU_READ_PREFETCH); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); SetPageUptodate(pp); ClearPageError(pp); unlock_page(pp); put_page(pp); } len -= nbytes; off = 0; } } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ int mappedread(znode_t *zp, int nbytes, zfs_uio_t *uio) { struct inode *ip = ZTOI(zp); struct address_space *mp = ip->i_mapping; struct page *pp; int64_t start, off; uint64_t bytes; int len = nbytes; int error = 0; void *pb; start = uio->uio_loffset; off = start & (PAGE_SIZE-1); for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { bytes = MIN(PAGE_SIZE - off, len); pp = find_lock_page(mp, start >> PAGE_SHIFT); if (pp) { ASSERT(PageUptodate(pp)); unlock_page(pp); pb = kmap(pp); error = zfs_uiomove(pb + off, bytes, UIO_READ, uio); kunmap(pp); if (mapping_writably_mapped(mp)) flush_dcache_page(pp); mark_page_accessed(pp); put_page(pp); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); } len -= bytes; off = 0; if (error) break; } return (error); } #endif /* _KERNEL */ unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; /* * Write the bytes to a file. * * IN: zp - znode of file to be written to * data - bytes to write * len - number of bytes to write * pos - offset to start writing at * * OUT: resid - remaining bytes to write * * RETURN: 0 if success * positive error code if failure. EIO is returned * for a short write when residp isn't provided. * * Timestamps: * zp - ctime|mtime updated if byte count > 0 */ int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *residp) { fstrans_cookie_t cookie; int error; struct iovec iov; iov.iov_base = (void *)data; iov.iov_len = len; zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, pos, UIO_SYSSPACE, len, 0); cookie = spl_fstrans_mark(); error = zfs_write(zp, &uio, 0, kcred); spl_fstrans_unmark(cookie); if (error == 0) { if (residp != NULL) *residp = zfs_uio_resid(&uio); else if (zfs_uio_resid(&uio) != 0) error = SET_ERROR(EIO); } return (error); } void zfs_zrele_async(znode_t *zp) { struct inode *ip = ZTOI(zp); objset_t *os = ITOZSB(ip)->z_os; ASSERT(atomic_read(&ip->i_count) > 0); ASSERT(os != NULL); /* * If decrementing the count would put us at 0, we can't do it inline * here, because that would be synchronous. Instead, dispatch an iput * to run later. * * For more information on the dangers of a synchronous iput, see the * header comment of this file. */ if (!atomic_add_unless(&ip->i_count, -1, 1)) { VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held inode reference for it. * * IN: zdp - znode of directory to search. * nm - name of entry to lookup. * flags - LOOKUP_XATTR set if looking for an attribute. * cr - credentials of caller. * direntflags - directory lookup flags * realpnp - returned pathname. * * OUT: zpp - znode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ int zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) { zfsvfs_t *zfsvfs = ZTOZSB(zdp); int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { if (!S_ISDIR(ZTOI(zdp)->i_mode)) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } if (nm[0] == 0 || (nm[0] == '.' && nm[1] == '\0')) { error = zfs_fastaccesschk_execute(zdp, cr); if (!error) { *zpp = zdp; zhold(*zpp); return (0); } return (error); } } ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *zpp = NULL; if (flags & LOOKUP_XATTR) { /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { ZFS_EXIT(zfsvfs); return (error); } /* * Do we have permission to get into attribute directory? */ if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, B_FALSE, cr))) { zrele(*zpp); *zpp = NULL; } ZFS_EXIT(zfsvfs); return (error); } if (!S_ISDIR(ZTOI(zdp)->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTDIR)); } /* * Check accessibility of directory. */ if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); if ((error == 0) && (*zpp)) zfs_znode_update_vfs(*zpp); ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the ip of the created or trunc'd file. * * IN: dzp - znode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - file flag. * vsecp - ACL to be set * * OUT: zpp - znode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated if new entry created * zp - ctime|mtime always, atime if new */ /* ARGSUSED */ int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; objset_t *os; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } top: *zpp = NULL; if (*name == '\0') { /* * Null component name refers to the directory itself. */ zhold(dzp); zp = dzp; dl = NULL; error = 0; } else { /* possible igrab(zp) */ int zflg = 0; if (flag & FIGNORECASE) zflg |= ZCILOOK; error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { if (have_acl) zfs_acl_ids_free(&acl_ids); if (strcmp(name, "..") == 0) error = SET_ERROR(EISDIR); ZFS_EXIT(zfsvfs); return (error); } } if (zp == NULL) { uint64_t txtype; uint64_t projid = ZFS_DEFAULT_PROJID; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { if (have_acl) zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EINVAL); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { /* * Since, we failed to add the directory entry for it, * delete the newly created dnode. */ zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & O_APPEND) ? V_APPEND : 0; if (have_acl) zfs_acl_ids_free(&acl_ids); have_acl = B_FALSE; /* * A directory entry already exists for this name. */ /* * Can't truncate an existing file if in exclusive mode. */ if (excl) { error = SET_ERROR(EEXIST); goto out; } /* * Can't open a directory for writing. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } /* * Verify requested access to file. */ if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { goto out; } mutex_enter(&dzp->z_lock); dzp->z_seq++; mutex_exit(&dzp->z_lock); /* * Truncate regular files if requested. */ if (S_ISREG(ZTOI(zp)->i_mode) && (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { /* we can't hold any locks when calling zfs_freesp() */ if (dl) { zfs_dirent_unlock(dl); dl = NULL; } error = zfs_freesp(zp, 0, 0, mode, TRUE); } } out: if (dl) zfs_dirent_unlock(dl); if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* ARGSUSED */ int zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) { znode_t *zp = NULL, *dzp = ITOZ(dip); zfsvfs_t *zfsvfs = ITOZSB(dip); objset_t *os; dmu_tx_t *tx; int error; uid_t uid; gid_t gid; zfs_acl_ids_t acl_ids; uint64_t projid = ZFS_DEFAULT_PROJID; boolean_t fuid_dirtied; boolean_t have_acl = B_FALSE; boolean_t waited = B_FALSE; /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ gid = crgetgid(cr); uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } top: *ipp = NULL; /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { if (have_acl) zfs_acl_ids_free(&acl_ids); goto out; } if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; have_acl = B_TRUE; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* Add to unlinked set */ zp->z_unlinked = B_TRUE; zfs_unlinked_add(zp, tx); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); out: if (error) { if (zp) zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); *ipp = ZTOI(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dzp - znode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime * ip - ctime (if nlink > 0) */ uint64_t null_xattr = 0; /*ARGSUSED*/ int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) { znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t acl_obj, xattr_obj; uint64_t xattr_obj_unlinked = 0; uint64_t obj = 0; uint64_t links; zfs_dirlock_t *dl; dmu_tx_t *tx; boolean_t may_delete_now, delete_now = FALSE; boolean_t unlinked, toobig = FALSE; uint64_t txtype; pathname_t *realnmp = NULL; pathname_t realnm; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) { zflg |= ZCILOOK; pn_alloc(&realnm); realnmp = &realnm; } top: xattr_obj = 0; xzp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, realnmp))) { if (realnmp) pn_free(realnmp); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } /* * Need to use rmdir for removing directories. */ if (S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(EPERM); goto out; } mutex_enter(&zp->z_lock); may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && !(zp->z_is_mapped); mutex_exit(&zp->z_lock); /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the inode. So we dmu_tx_hold() the right things to * allow for either case. */ obj = zp->z_id; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (may_delete_now) { toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; /* if the file is too big, only hold_free a token amount */ dmu_tx_hold_free(tx, zp->z_id, 0, (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); } /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } mutex_enter(&zp->z_lock); if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); mutex_exit(&zp->z_lock); /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); goto top; } if (realnmp) pn_free(realnmp); dmu_tx_abort(tx); zrele(zp); if (xzp) zrele(xzp); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { /* * Hold z_lock so that we can make sure that the ACL obj * hasn't changed. Could have been deleted due to * zfs_sa_upgrade(). */ mutex_enter(&zp->z_lock); (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); delete_now = may_delete_now && !toobig && atomic_read(&ZTOI(zp)->i_count) == 1 && !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && zfs_external_acl(zp) == acl_obj; } if (delete_now) { if (xattr_obj_unlinked) { ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); mutex_enter(&xzp->z_lock); xzp->z_unlinked = B_TRUE; clear_nlink(ZTOI(xzp)); links = 0; error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &links, sizeof (links), tx); ASSERT3U(error, ==, 0); mutex_exit(&xzp->z_lock); zfs_unlinked_add(xzp, tx); if (zp->z_is_sa) error = sa_remove(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), tx); else error = sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &null_xattr, sizeof (uint64_t), tx); ASSERT0(error); } /* * Add to the unlinked set because a new reference could be * taken concurrently resulting in a deferred destruction. */ zfs_unlinked_add(zp, tx); mutex_exit(&zp->z_lock); } else if (unlinked) { mutex_exit(&zp->z_lock); zfs_unlinked_add(zp, tx); } txtype = TX_REMOVE; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (realnmp) pn_free(realnmp); zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); if (delete_now) zrele(zp); else zfs_zrele_async(zp); if (xzp) { zfs_znode_update_vfs(xzp); zfs_zrele_async(xzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new directory and insert it into dzp using the name * provided. Return a pointer to the inserted directory. * * IN: dzp - znode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * flags - case flags. * vsecp - ACL to be set * * OUT: zpp - znode of created directory. * * RETURN: 0 if success * error code if failure * * Timestamps: * dzp - ctime|mtime updated * zpp - ctime|mtime|atime updated */ /*ARGSUSED*/ int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; uint64_t txtype; dmu_tx_t *tx; int error; int zf = ZNEW; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; boolean_t waited = B_FALSE; ASSERT(S_ISDIR(vap->va_mode)); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); if (dirname == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; if (vap->va_mask & ATTR_XVATTR) { if ((error = secpolicy_xvattr((xvattr_t *)vap, crgetuid(cr), cr, vap->va_mode)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ top: *zpp = NULL; if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, NULL, NULL))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); /* * Now put new name in parent dir. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); goto out; } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, acl_ids.z_fuidp, vap); out: zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (error != 0) { zrele(zp); } else { zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dzp - znode of directory to remove from. * name - name of directory to be removed. * cwd - inode of current working directory. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dzp - ctime|mtime updated */ /*ARGSUSED*/ int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags) { znode_t *zp; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zflg = ZEXISTS; boolean_t waited = B_FALSE; if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (flags & FIGNORECASE) zflg |= ZCILOOK; top: zp = NULL; /* * Attempt to lock directory; fail if entry doesn't exist. */ if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL))) { ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } if (!S_ISDIR(ZTOI(zp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } if (zp == cwd) { error = SET_ERROR(EINVAL); goto out; } /* * Grab a lock on the directory to make sure that no one is * trying to add (or lookup) entries while we are removing it. */ rw_enter(&zp->z_name_lock, RW_WRITER); /* * Grab a lock on the parent pointer to make sure we play well * with the treewalk and directory rename code. */ rw_enter(&zp->z_parent_lock, RW_WRITER); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(zp); goto top; } dmu_tx_abort(tx); zrele(zp); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_destroy(dl, zp, tx, zflg, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); out: zfs_dirent_unlock(dl); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); zrele(zp); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Read directory entries from the given directory cursor position and emit * name and position for each entry. * * IN: ip - inode of directory to read. * ctx - directory entry context. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; zap_cursor_t zc; zap_attribute_t zap; int error; uint8_t prefetch; uint8_t type; int done = 0; uint64_t parent; uint64_t offset; /* must be unsigned; checks for < 1 */ ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) goto out; /* * Quit if directory has been removed (posix) */ if (zp->z_unlinked) goto out; error = 0; os = zfsvfs->z_os; offset = ctx->pos; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Transform to file-system independent format */ while (!done) { uint64_t objnum; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, &zap))) { if (error == ENOENT) break; else goto update; } /* * Allow multiple entries provided the first entry is * the object id. Non-zpl consumers may safely make * use of the additional space. * * XXX: This should be a feature flag for compatibility */ if (zap.za_integer_length != 8 || zap.za_num_integers == 0) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld, " "length = %d, num = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset, zap.za_integer_length, (u_longlong_t)zap.za_num_integers); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); type = ZFS_DIRENT_TYPE(zap.za_first_integer); } done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), objnum, type); if (done) break; /* Prefetch znode */ if (prefetch) { dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); } /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } ctx->pos = offset; } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ update: zap_cursor_fini(&zc); if (error == ENOENT) error = 0; out: ZFS_EXIT(zfsvfs); return (error); } /* * Get the basic file attributes and place them in the provided kstat * structure. The inode is assumed to be the authoritative source * for most of the attributes. However, the znode currently has the * authoritative atime, blksize, and block count. * * IN: ip - inode of file. * * OUT: sp - kstat values. * * RETURN: 0 (always succeeds) */ /* ARGSUSED */ int -zfs_getattr_fast(struct inode *ip, struct kstat *sp) +zfs_getattr_fast(struct user_namespace *user_ns, struct inode *ip, + struct kstat *sp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t blksize; u_longlong_t nblocks; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); - generic_fillattr(ip, sp); + zpl_generic_fillattr(user_ns, ip, sp); /* * +1 link count for root inode with visible '.zfs' directory. */ if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) if (sp->nlink < ZFS_LINK_MAX) sp->nlink++; sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); sp->blksize = blksize; sp->blocks = nblocks; if (unlikely(zp->z_blksz == 0)) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ sp->blksize = zfsvfs->z_max_blksz; } mutex_exit(&zp->z_lock); /* * Required to prevent NFS client from detecting different inode * numbers of snapshot root dentry before and after snapshot mount. */ if (zfsvfs->z_issnap) { if (ip->i_sb->s_root->d_inode == ip) sp->ino = ZFSCTL_INO_SNAPDIRS - dmu_objset_id(zfsvfs->z_os); } ZFS_EXIT(zfsvfs); return (0); } /* * For the operation of changing file's user/group/project, we need to * handle not only the main object that is assigned to the file directly, * but also the ones that are used by the file via hidden xattr directory. * * Because the xattr directory may contains many EA entries, as to it may * be impossible to change all of them via the transaction of changing the * main object's user/group/project attributes. Then we have to change them * via other multiple independent transactions one by one. It may be not good * solution, but we have no better idea yet. */ static int zfs_setattr_dir(znode_t *dzp) { struct inode *dxip = ZTOI(dzp); struct inode *xip = NULL; zfsvfs_t *zfsvfs = ZTOZSB(dzp); objset_t *os = zfsvfs->z_os; zap_cursor_t zc; zap_attribute_t zap; zfs_dirlock_t *dl; znode_t *zp = NULL; dmu_tx_t *tx = NULL; uint64_t uid, gid; sa_bulk_attr_t bulk[4]; int count; int err; zap_cursor_init(&zc, os, dzp->z_id); while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { count = 0; if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { err = ENXIO; break; } err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, ZEXISTS, NULL, NULL); if (err == ENOENT) goto next; if (err) break; xip = ZTOI(zp); if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && zp->z_projid == dzp->z_projid) goto next; tx = dmu_tx_create(os); if (!(zp->z_pflags & ZFS_PROJID)) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); err = dmu_tx_assign(tx, TXG_WAIT); if (err) break; mutex_enter(&dzp->z_lock); if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { xip->i_uid = dxip->i_uid; uid = zfs_uid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, sizeof (uid)); } if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { xip->i_gid = dxip->i_gid; gid = zfs_gid_read(dxip); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, sizeof (gid)); } if (zp->z_projid != dzp->z_projid) { if (!(zp->z_pflags & ZFS_PROJID)) { zp->z_pflags |= ZFS_PROJID; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); } zp->z_projid = dzp->z_projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } mutex_exit(&dzp->z_lock); if (likely(count > 0)) { err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } else { dmu_tx_abort(tx); } tx = NULL; if (err != 0 && err != ENOENT) break; next: if (zp) { zrele(zp); zp = NULL; zfs_dirent_unlock(dl); } zap_cursor_advance(&zc); } if (tx) dmu_tx_abort(tx); if (zp) { zrele(zp); zfs_dirent_unlock(dl); } zap_cursor_fini(&zc); return (err == ENOENT ? 0 : err); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If ATTR_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); objset_t *os = zfsvfs->z_os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t *tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; int trim_mask = 0; uint64_t new_mode; uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2], atime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2 = 0; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; boolean_t handle_eadir = B_FALSE; sa_bulk_attr_t *bulk, *xattr_bulk; int count = 0, xattr_count = 0, bulks = 8; if (mask == 0) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); ip = ZTOI(zp); /* * If this is a xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); if (xoap != NULL && (mask & ATTR_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOTSUP)); } } zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & ATTR_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); xva_init(tmpxvattr); bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { err = SET_ERROR(EPERM); goto out3; } if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { err = SET_ERROR(EPERM); goto out3; } /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (ATTR_ATIME | ATTR_MTIME)) { if (((mask & ATTR_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & ATTR_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { err = SET_ERROR(EOVERFLOW); goto out3; } } top: attrzp = NULL; aclp = NULL; /* Can this be moved to before the top label? */ if (zfs_is_readonly(zfsvfs)) { err = SET_ERROR(EROFS); goto out3; } /* * First validate permissions */ if (mask & ATTR_SIZE) { err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); if (err) goto out3; /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) goto out3; } if (mask & (ATTR_ATIME|ATTR_MTIME) || ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (ATTR_UID|ATTR_GID)) { int idmask = (mask & (ATTR_UID|ATTR_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & ATTR_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & ATTR_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both ATTR_UID and ATTR_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (ATTR_UID|ATTR_GID)) && take_owner && take_group) || ((idmask == ATTR_UID) && take_owner) || ((idmask == ATTR_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ (void) secpolicy_setid_clear(vap, cr); trim_mask = (mask & (ATTR_UID|ATTR_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } mutex_enter(&zp->z_lock); oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & ATTR_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((!S_ISREG(ip->i_mode) && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { mutex_exit(&zp->z_lock); err = SET_ERROR(EPERM); goto out3; } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } mutex_exit(&zp->z_lock); if (mask & ATTR_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(ip, vap, &oldva, cr); if (err) goto out3; trim_mask |= ATTR_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; } err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) goto out3; if (trim_mask) vap->va_mask |= saved_mask; } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) { handle_eadir = B_TRUE; err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp); if (err) goto out2; } if (mask & ATTR_UID) { new_kuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_kuid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & ATTR_GID) { new_kgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_kgid)) { if (attrzp) zrele(attrzp); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) zrele(attrzp); err = EDQUOT; goto out2; } } tx = dmu_tx_create(os); if (mask & ATTR_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = EPERM; goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; mutex_enter(&zp->z_lock); if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } mutex_exit(&zp->z_lock); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. */ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&zp->z_acl_lock); mutex_enter(&zp->z_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_enter(&attrzp->z_acl_lock); mutex_enter(&attrzp->z_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (ATTR_UID|ATTR_GID)) { if (mask & ATTR_UID) { ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid); new_uid = zfs_uid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid); } } if (mask & ATTR_GID) { ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid); new_gid = zfs_gid_read(ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid); } } if (!(mask & ATTR_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & ATTR_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = ZTOI(zp)->i_mode = new_mode; ASSERT3P(aclp, !=, NULL); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; ZFS_TIME_ENCODE(&ip->i_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( vap->va_mtime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (mask & (ATTR_CTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_ctime, ctime); ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime, ZTOI(zp)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } if (attrzp && mask) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & ATTR_XVATTR)) { /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(S_ISREG(ip->i_mode)); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); mutex_exit(&zp->z_lock); if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) mutex_exit(&attrzp->z_acl_lock); mutex_exit(&attrzp->z_lock); } out: if (err == 0 && xattr_count > 0) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); if (attrzp) zrele(attrzp); if (err == ERESTART) goto top; } else { if (count > 0) err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); if (attrzp) { if (err2 == 0 && handle_eadir) err2 = zfs_setattr_dir(attrzp); zrele(attrzp); } zfs_znode_update_vfs(zp); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); out3: kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); kmem_free(tmpxvattr, sizeof (xvattr_t)); ZFS_EXIT(zfsvfs); return (err); } typedef struct zfs_zlock { krwlock_t *zl_rwlock; /* lock we acquired */ znode_t *zl_znode; /* znode we held */ struct zfs_zlock *zl_next; /* next in list */ } zfs_zlock_t; /* * Drop locks and release vnodes that were held by zfs_rename_lock(). */ static void zfs_rename_unlock(zfs_zlock_t **zlpp) { zfs_zlock_t *zl; while ((zl = *zlpp) != NULL) { if (zl->zl_znode != NULL) zfs_zrele_async(zl->zl_znode); rw_exit(zl->zl_rwlock); *zlpp = zl->zl_next; kmem_free(zl, sizeof (*zl)); } } /* * Search back through the directory tree, using the ".." entries. * Lock each directory in the chain to prevent concurrent renames. * Fail any attempt to move a directory into one of its own descendants. * XXX - z_parent_lock can overlap with map or grow locks */ static int zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) { zfs_zlock_t *zl; znode_t *zp = tdzp; uint64_t rootid = ZTOZSB(zp)->z_root; uint64_t oidp = zp->z_id; krwlock_t *rwlp = &szp->z_parent_lock; krw_t rw = RW_WRITER; /* * First pass write-locks szp and compares to zp->z_id. * Later passes read-lock zp and compare to zp->z_parent. */ do { if (!rw_tryenter(rwlp, rw)) { /* * Another thread is renaming in this path. * Note that if we are a WRITER, we don't have any * parent_locks held yet. */ if (rw == RW_READER && zp->z_id > szp->z_id) { /* * Drop our locks and restart */ zfs_rename_unlock(&zl); *zlpp = NULL; zp = tdzp; oidp = zp->z_id; rwlp = &szp->z_parent_lock; rw = RW_WRITER; continue; } else { /* * Wait for other thread to drop its locks */ rw_enter(rwlp, rw); } } zl = kmem_alloc(sizeof (*zl), KM_SLEEP); zl->zl_rwlock = rwlp; zl->zl_znode = NULL; zl->zl_next = *zlpp; *zlpp = zl; if (oidp == szp->z_id) /* We're a descendant of szp */ return (SET_ERROR(EINVAL)); if (oidp == rootid) /* We've hit the top */ return (0); if (rw == RW_READER) { /* i.e. not the first pass */ int error = zfs_zget(ZTOZSB(zp), oidp, &zp); if (error) return (error); zl->zl_znode = zp; } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)), &oidp, sizeof (oidp)); rwlp = &zp->z_parent_lock; rw = RW_READER; } while (zp->z_id != sdzp->z_id); return (0); } /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdzp - Source directory containing the "old entry". * snm - Old entry name. * tdzp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdzp,tdzp - ctime|mtime updated */ /*ARGSUSED*/ int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm, cred_t *cr, int flags) { znode_t *szp, *tzp; zfsvfs_t *zfsvfs = ZTOZSB(sdzp); zilog_t *zilog; zfs_dirlock_t *sdl, *tdl; dmu_tx_t *tx; zfs_zlock_t *zl; int cmp, serr, terr; int error = 0; int zflg = 0; boolean_t waited = B_FALSE; if (snm == NULL || tnm == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(sdzp); zilog = zfsvfs->z_log; ZFS_VERIFY_ZP(tdzp); /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb || zfsctl_is_node(ZTOI(tdzp))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; top: szp = NULL; tzp = NULL; zl = NULL; /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Lock source and target directory entries. To prevent deadlock, * a lock ordering must be defined. We lock the directory with * the smallest object id first, or if it's a tie, the one with * the lexically first name. */ if (sdzp->z_id < tdzp->z_id) { cmp = -1; } else if (sdzp->z_id > tdzp->z_id) { cmp = 1; } else { /* * First compare the two name arguments without * considering any case folding. */ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER); cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error); ASSERT(error == 0 || !zfsvfs->z_utf8); if (cmp == 0) { /* * POSIX: "If the old argument and the new argument * both refer to links to the same existing file, * the rename() function shall return successfully * and perform no other action." */ ZFS_EXIT(zfsvfs); return (0); } /* * If the file system is case-folding, then we may * have some more checking to do. A case-folding file * system is either supporting mixed case sensitivity * access or is completely case-insensitive. Note * that the file system is always case preserving. * * In mixed sensitivity mode case sensitive behavior * is the default. FIGNORECASE must be used to * explicitly request case insensitive behavior. * * If the source and target names provided differ only * by case (e.g., a request to rename 'tim' to 'Tim'), * we will treat this as a special case in the * case-insensitive mode: as long as the source name * is an exact match, we will allow this to proceed as * a name-change request. */ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE || (zfsvfs->z_case == ZFS_CASE_MIXED && flags & FIGNORECASE)) && u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST, &error) == 0) { /* * case preserving rename request, require exact * name matches */ zflg |= ZCIEXACT; zflg &= ~ZCILOOK; } } /* * If the source and destination directories are the same, we should * grab the z_name_lock of that directory only once. */ if (sdzp == tdzp) { zflg |= ZHAVELOCK; rw_enter(&sdzp->z_name_lock, RW_READER); } if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL); } else { terr = zfs_dirent_lock(&tdl, tdzp, tnm, &tzp, zflg, NULL, NULL); serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg, NULL, NULL); } if (serr) { /* * Source entry invalid or not there. */ if (!terr) { zfs_dirent_unlock(tdl); if (tzp) zrele(tzp); } if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(snm, "..") == 0) serr = EINVAL; ZFS_EXIT(zfsvfs); return (serr); } if (terr) { zfs_dirent_unlock(sdl); zrele(szp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (strcmp(tnm, "..") == 0) terr = EINVAL; ZFS_EXIT(zfsvfs); return (terr); } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto out; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) goto out; if (S_ISDIR(ZTOI(szp)->i_mode)) { /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl))) goto out; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if (S_ISDIR(ZTOI(szp)->i_mode)) { if (!S_ISDIR(ZTOI(tzp)->i_mode)) { error = SET_ERROR(ENOTDIR); goto out; } } else { if (S_ISDIR(ZTOI(tzp)->i_mode)) { error = SET_ERROR(EISDIR); goto out; } } /* * POSIX dictates that when the source and target * entries refer to the same file object, rename * must do nothing and exit without error. */ if (szp->z_id == tzp->z_id) { error = 0; goto out; } } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); goto top; } dmu_tx_abort(tx); zrele(szp); if (tzp) zrele(tzp); ZFS_EXIT(zfsvfs); return (error); } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL); if (error == 0) { error = zfs_link_create(tdl, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; if (tdzp->z_pflags & ZFS_PROJINHERIT) szp->z_pflags |= ZFS_PROJINHERIT; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME | (flags & FIGNORECASE ? TX_CI : 0), sdzp, sdl->dl_name, tdzp, tdl->dl_name, szp); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdl, szp, tx, ZRENAMING, NULL), ==, 0); } } else { /* * If we had removed the existing target, subsequent * call to zfs_link_create() to add back the same entry * but, the new dnode (szp) should not fail. */ ASSERT(tzp == NULL); } } dmu_tx_commit(tx); out: if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); zfs_znode_update_vfs(sdzp); if (sdzp == tdzp) rw_exit(&sdzp->z_name_lock); if (sdzp != tdzp) zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); zrele(szp); if (tzp) { zfs_znode_update_vfs(tzp); zrele(tzp); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dzp - Directory to contain new symbolic link. * name - Name of directory entry in dip. * vap - Attributes of new entry. * link - Name for new symlink entry. * cr - credentials of caller. * flags - case flags * * OUT: zpp - Znode for new symbolic link. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dip - ctime|mtime updated */ /*ARGSUSED*/ int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link, znode_t **zpp, cred_t *cr, int flags) { znode_t *zp; zfs_dirlock_t *dl; dmu_tx_t *tx; zfsvfs_t *zfsvfs = ZTOZSB(dzp); zilog_t *zilog; uint64_t len = strlen(link); int error; int zflg = ZNEW; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; boolean_t waited = B_FALSE; ASSERT(S_ISLNK(vap->va_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zflg |= ZCILOOK; if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } top: *zpp = NULL; /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) { zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), link, len, tx); else zfs_sa_symlink(zp, link, len, tx); mutex_exit(&zp->z_lock); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ error = zfs_link_create(dl, zp, tx, ZNEW); if (error != 0) { zfs_znode_delete(zp, tx); remove_inode_hash(ZTOI(zp)); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); zfs_znode_update_vfs(dzp); zfs_znode_update_vfs(zp); } zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (error == 0) { *zpp = zp; if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); } else { zrele(zp); } ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by ip. * * IN: ip - inode of symbolic link * uio - structure to contain the link path. * cr - credentials of caller. * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - atime updated */ /* ARGSUSED */ int zfs_readlink(struct inode *ip, zfs_uio_t *uio, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); mutex_enter(&zp->z_lock); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdzp referencing szp. * * IN: tdzp - Directory to contain new entry. * szp - znode of new entry. * name - name of new entry. * cr - credentials of caller. * flags - case flags. * * RETURN: 0 if success * error code if failure * * Timestamps: * tdzp - ctime|mtime updated * szp - ctime updated */ /* ARGSUSED */ int zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr, int flags) { struct inode *sip = ZTOI(szp); znode_t *tzp; zfsvfs_t *zfsvfs = ZTOZSB(tdzp); zilog_t *zilog; zfs_dirlock_t *dl; dmu_tx_t *tx; int error; int zf = ZNEW; uint64_t parent; uid_t owner; boolean_t waited = B_FALSE; boolean_t is_tmpfile = 0; uint64_t txg; #ifdef HAVE_TMPFILE is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE)); #endif ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode)); if (name == NULL) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(tdzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (S_ISDIR(sip->i_mode)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } ZFS_VERIFY_ZP(szp); /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } /* * We check i_sb because snapshots and the ctldir must have different * super blocks. */ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (flags & FIGNORECASE) zf |= ZCILOOK; /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid), cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } top: /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); if (is_tmpfile) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); if (error == ERESTART) { waited = B_TRUE; dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* unmark z_unlinked so zfs_link_create will not reject */ if (is_tmpfile) szp->z_unlinked = B_FALSE; error = zfs_link_create(dl, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; /* * tmpfile is created to be in z_unlinkedobj, so remove it. * Also, we don't log in ZIL, because all previous file * operation on the tmpfile are ignored by ZIL. Instead we * always wait for txg to sync to make sure all previous * operation are sync safe. */ if (is_tmpfile) { VERIFY(zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0); } else { if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } } else if (is_tmpfile) { /* restore z_unlinked since when linking failed */ szp->z_unlinked = B_TRUE; } txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); zfs_dirent_unlock(dl); if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg); zfs_znode_update_vfs(tdzp); zfs_znode_update_vfs(szp); ZFS_EXIT(zfsvfs); return (error); } static void zfs_putpage_commit_cb(void *arg) { struct page *pp = arg; ClearPageError(pp); end_page_writeback(pp); } /* * Push a page out to disk, once the page is on stable storage the * registered commit callback will be run as notification of completion. * * IN: ip - page mapped for inode. * pp - page to push (page is locked) * wbc - writeback control data * * RETURN: 0 if success * error code if failure * * Timestamps: * ip - ctime|mtime updated */ /* ARGSUSED */ int zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); loff_t offset; loff_t pgoff; unsigned int pglen; dmu_tx_t *tx; caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); ASSERT(PageLocked(pp)); pgoff = page_offset(pp); /* Page byte-offset in file */ offset = i_size_read(ip); /* File length in bytes */ pglen = MIN(PAGE_SIZE, /* Page length in bytes */ P2ROUNDUP(offset, PAGE_SIZE)-pgoff); /* Page is beyond end of file */ if (pgoff >= offset) { unlock_page(pp); ZFS_EXIT(zfsvfs); return (0); } /* Truncate page length to end of file */ if (pgoff + pglen > offset) pglen = offset - pgoff; #if 0 /* * FIXME: Allow mmap writes past its quota. The correct fix * is to register a page_mkwrite() handler to count the page * against its quota when it is about to be dirtied. */ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, KUID_TO_SUID(ip->i_uid)) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, KGID_TO_SGID(ip->i_gid)) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { err = EDQUOT; } #endif /* * The ordering here is critical and must adhere to the following * rules in order to avoid deadlocking in either zfs_read() or * zfs_free_range() due to a lock inversion. * * 1) The page must be unlocked prior to acquiring the range lock. * This is critical because zfs_read() calls find_lock_page() * which may block on the page lock while holding the range lock. * * 2) Before setting or clearing write back on a page the range lock * must be held in order to prevent a lock inversion with the * zfs_free_range() function. * * This presents a problem because upon entering this function the * page lock is already held. To safely acquire the range lock the * page lock must be dropped. This creates a window where another * process could truncate, invalidate, dirty, or write out the page. * * Therefore, after successfully reacquiring the range and page locks * the current page state is checked. In the common case everything * will be as is expected and it can be written out. However, if * the page state has changed it must be handled accordingly. */ mapping = pp->mapping; redirty_page_for_writepage(wbc, pp); unlock_page(pp); zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, pgoff, pglen, RL_WRITER); lock_page(pp); /* Page mapping changed or it was no longer dirty, we're done */ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) { unlock_page(pp); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (0); } /* Another process started write block if required */ if (PageWriteback(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { if (PageWriteback(pp)) wait_on_page_bit(pp, PG_writeback); } ZFS_EXIT(zfsvfs); return (0); } /* Clear the dirty flag the required locks are held */ if (!clear_page_dirty_for_io(pp)) { unlock_page(pp); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (0); } /* * Counterpart for redirty_page_for_writepage() above. This page * was in fact not skipped and should not be counted as if it were. */ wbc->pages_skipped--; set_page_writeback(pp); unlock_page(pp); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_NOWAIT); if (err != 0) { if (err == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); __set_page_dirty_nobuffers(pp); ClearPageError(pp); end_page_writeback(pp); zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (err); } va = kmap(pp); ASSERT3U(pglen, <=, PAGE_SIZE); dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx); kunmap(pp); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0, zfs_putpage_commit_cb, pp); dmu_tx_commit(tx); zfs_rangelock_exit(lr); if (wbc->sync_mode != WB_SYNC_NONE) { /* * Note that this is rarely called under writepages(), because * writepages() normally handles the entire commit for * performance reasons. */ zil_commit(zfsvfs->z_log, zp->z_id); } ZFS_EXIT(zfsvfs); return (err); } /* * Update the system attributes when the inode has been dirtied. For the * moment we only update the mode, atime, mtime, and ctime. */ int zfs_dirty_inode(struct inode *ip, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); #ifdef I_DIRTY_TIME /* * This is the lazytime semantic introduced in Linux 4.0 * This flag will only be called from update_time when lazytime is set. * (Note, I_DIRTY_SYNC will also set if not lazytime) * Fortunately mtime and ctime are managed within ZFS itself, so we * only need to dirty atime. */ if (flags == I_DIRTY_TIME) { zp->z_atime_dirty = B_TRUE; goto out; } #endif tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto out; } mutex_enter(&zp->z_lock); zp->z_atime_dirty = B_FALSE; SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ ZFS_TIME_ENCODE(&ip->i_atime, atime); ZFS_TIME_ENCODE(&ip->i_mtime, mtime); ZFS_TIME_ENCODE(&ip->i_ctime, ctime); mode = ip->i_mode; zp->z_mode = mode; error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); mutex_exit(&zp->z_lock); dmu_tx_commit(tx); out: ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ void zfs_inactive(struct inode *ip) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t atime[2]; int error; int need_unlock = 0; /* Only read lock if we haven't already write locked, e.g. rollback */ if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { need_unlock = 1; rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); } if (zp->z_sa_hdl == NULL) { if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); return; } if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { ZFS_TIME_ENCODE(&ip->i_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); zp->z_atime_dirty = B_FALSE; mutex_exit(&zp->z_lock); dmu_tx_commit(tx); } } zfs_zinactive(zp); if (need_unlock) rw_exit(&zfsvfs->z_teardown_inactive_lock); } /* * Fill pages with data from the disk. */ static int zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); objset_t *os; struct page *cur_pp; u_offset_t io_off, total; size_t io_len; loff_t i_size; unsigned page_idx; int err; os = zfsvfs->z_os; io_len = nr_pages << PAGE_SHIFT; i_size = i_size_read(ip); io_off = page_offset(pl[0]); if (io_off + io_len > i_size) io_len = i_size - io_off; /* * Iterate over list of pages and read each page individually. */ page_idx = 0; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { caddr_t va; cur_pp = pl[page_idx++]; va = kmap(cur_pp); err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, DMU_READ_PREFETCH); kunmap(cur_pp); if (err) { /* convert checksum errors into IO errors */ if (err == ECKSUM) err = SET_ERROR(EIO); return (err); } } return (0); } /* * Uses zfs_fillpage to read data from the file and fill the pages. * * IN: ip - inode of file to get data from. * pl - list of pages to read * nr_pages - number of pages to read * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); int err; if (pl == NULL) return (0); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); err = zfs_fillpage(ip, pl, nr_pages); ZFS_EXIT(zfsvfs); return (err); } /* * Check ZFS specific permissions to memory map a section of a file. * * IN: ip - inode of the file to mmap * off - file offset * addrp - start address in memory region * len - length of memory region * vm_flags- address flags * * RETURN: 0 if success * error code if failure */ /*ARGSUSED*/ int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, unsigned long vm_flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((vm_flags & VM_WRITE) && (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((vm_flags & (VM_READ | VM_EXEC)) && (zp->z_pflags & ZFS_AV_QUARANTINED)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } if (off < 0 || len > MAXOFFSET_T - off) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENXIO)); } ZFS_EXIT(zfsvfs); return (0); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: zp - znode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * zp - ctime|mtime updated */ /* ARGSUSED */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (cmd != F_FREESP) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_fid(struct inode *ip, fid_t *fidp) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ITOZSB(ip); uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = SHORT_FID_LEN; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); ZFS_EXIT(zfsvfs); return (0); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_open); EXPORT_SYMBOL(zfs_close); EXPORT_SYMBOL(zfs_lookup); EXPORT_SYMBOL(zfs_create); EXPORT_SYMBOL(zfs_tmpfile); EXPORT_SYMBOL(zfs_remove); EXPORT_SYMBOL(zfs_mkdir); EXPORT_SYMBOL(zfs_rmdir); EXPORT_SYMBOL(zfs_readdir); EXPORT_SYMBOL(zfs_getattr_fast); EXPORT_SYMBOL(zfs_setattr); EXPORT_SYMBOL(zfs_rename); EXPORT_SYMBOL(zfs_symlink); EXPORT_SYMBOL(zfs_readlink); EXPORT_SYMBOL(zfs_link); EXPORT_SYMBOL(zfs_inactive); EXPORT_SYMBOL(zfs_space); EXPORT_SYMBOL(zfs_fid); EXPORT_SYMBOL(zfs_getpage); EXPORT_SYMBOL(zfs_putpage); EXPORT_SYMBOL(zfs_dirty_inode); EXPORT_SYMBOL(zfs_map); /* BEGIN CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); /* END CSTYLED */ #endif diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c index e6420f19ed87..9b526afd0002 100644 --- a/module/os/linux/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -1,552 +1,597 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (C) 2011 Lawrence Livermore National Security, LLC. * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). * LLNL-CODE-403049. * Rewritten for Linux by: * Rohan Puri * Brian Behlendorf */ #include #include #include #include #include /* * Common open routine. Disallow any write access. */ /* ARGSUSED */ static int zpl_common_open(struct inode *ip, struct file *filp) { if (filp->f_mode & FMODE_WRITE) return (-EACCES); return (generic_file_open(ip, filp)); } /* * Get root directory contents. */ static int zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) { zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); int error = 0; ZPL_ENTER(zfsvfs); if (!zpl_dir_emit_dots(filp, ctx)) goto out; if (ctx->pos == 2) { if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME, strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR)) goto out; ctx->pos++; } if (ctx->pos == 3) { if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME, strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR)) goto out; ctx->pos++; } out: ZPL_EXIT(zfsvfs); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_root_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ /* * Get root directory attributes. */ /* ARGSUSED */ static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_root_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else zpl_root_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { struct inode *ip = path->dentry->d_inode; +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + generic_fillattr(user_ns, ip, stat); +#else generic_fillattr(ip, stat); +#endif stat->atime = current_time(ip); return (0); } ZPL_GETATTR_WRAPPER(zpl_root_getattr); static struct dentry * zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { cred_t *cr = CRED(); struct inode *ip; int error; crhold(cr); error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); crfree(cr); if (error) { if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } return (d_splice_alias(ip, dentry)); } /* * The '.zfs' control directory file and inode operations. */ const struct file_operations zpl_fops_root = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, #ifdef HAVE_VFS_ITERATE_SHARED .iterate_shared = zpl_root_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_root_iterate, #else .readdir = zpl_root_readdir, #endif }; const struct inode_operations zpl_ops_root = { .lookup = zpl_root_lookup, .getattr = zpl_root_getattr, }; static struct vfsmount * zpl_snapdir_automount(struct path *path) { int error; error = -zfsctl_snapshot_mount(path, 0); if (error) return (ERR_PTR(error)); /* * Rather than returning the new vfsmount for the snapshot we must * return NULL to indicate a mount collision. This is done because * the user space mount calls do_add_mount() which adds the vfsmount * to the name space. If we returned the new mount here it would be * added again to the vfsmount list resulting in list corruption. */ return (NULL); } /* * Negative dentries must always be revalidated so newly created snapshots * can be detected and automounted. Normal dentries should be kept because * as of the 3.18 kernel revaliding the mountpoint dentry will result in * the snapshot being immediately unmounted. */ static int #ifdef HAVE_D_REVALIDATE_NAMEIDATA zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i) #else zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) #endif { return (!!dentry->d_inode); } dentry_operations_t zpl_dops_snapdirs = { /* * Auto mounting of snapshots is only supported for 2.6.37 and * newer kernels. Prior to this kernel the ops->follow_link() * callback was used as a hack to trigger the mount. The * resulting vfsmount was then explicitly grafted in to the * name space. While it might be possible to add compatibility * code to accomplish this it would require considerable care. */ .d_automount = zpl_snapdir_automount, .d_revalidate = zpl_snapdir_revalidate, }; static struct dentry * zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { fstrans_cookie_t cookie; cred_t *cr = CRED(); struct inode *ip = NULL; int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); spl_fstrans_unmark(cookie); crfree(cr); if (error && error != -ENOENT) return (ERR_PTR(error)); ASSERT(error == 0 || ip == NULL); d_clear_d_op(dentry); d_set_d_op(dentry, &zpl_dops_snapdirs); dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; return (d_splice_alias(ip, dentry)); } static int zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) { zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); fstrans_cookie_t cookie; char snapname[MAXNAMELEN]; boolean_t case_conflict; uint64_t id, pos; int error = 0; ZPL_ENTER(zfsvfs); cookie = spl_fstrans_mark(); if (!zpl_dir_emit_dots(filp, ctx)) goto out; /* Start the position at 0 if it already emitted . and .. */ pos = (ctx->pos == 2 ? 0 : ctx->pos); while (error == 0) { dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, snapname, &id, &pos, &case_conflict); dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); if (error) goto out; if (!zpl_dir_emit(ctx, snapname, strlen(snapname), ZFSCTL_INO_SHARES - id, DT_DIR)) goto out; ctx->pos = pos; } out: spl_fstrans_unmark(cookie); ZPL_EXIT(zfsvfs); if (error == -ENOENT) return (0); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_snapdir_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ static int +#ifdef HAVE_IOPS_RENAME_USERNS +zpl_snapdir_rename2(struct user_namespace *user_ns, struct inode *sdip, + struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, + unsigned int flags) +#else zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) +#endif { cred_t *cr = CRED(); int error; /* We probably don't want to support renameat2(2) in ctldir */ if (flags) return (-EINVAL); crhold(cr); error = -zfsctl_snapdir_rename(sdip, dname(sdentry), tdip, dname(tdentry), cr, 0); ASSERT3S(error, <=, 0); crfree(cr); return (error); } -#ifndef HAVE_RENAME_WANTS_FLAGS +#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) static int zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) { return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0)); } #endif static int zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) { cred_t *cr = CRED(); int error; crhold(cr); error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0); ASSERT3S(error, <=, 0); crfree(cr); return (error); } static int +#ifdef HAVE_IOPS_MKDIR_USERNS +zpl_snapdir_mkdir(struct user_namespace *user_ns, struct inode *dip, + struct dentry *dentry, umode_t mode) +#else zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) +#endif { cred_t *cr = CRED(); vattr_t *vap; struct inode *ip; int error; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dip, mode | S_IFDIR, cr); error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); if (error == 0) { d_clear_d_op(dentry); d_set_d_op(dentry, &zpl_dops_snapdirs); d_instantiate(dentry, ip); } kmem_free(vap, sizeof (vattr_t)); ASSERT3S(error, <=, 0); crfree(cr); return (error); } /* * Get snapshot directory attributes. */ /* ARGSUSED */ static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_snapdir_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); ZPL_ENTER(zfsvfs); +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + generic_fillattr(user_ns, ip, stat); +#else generic_fillattr(ip, stat); +#endif stat->nlink = stat->size = 2; stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); stat->atime = current_time(ip); ZPL_EXIT(zfsvfs); return (0); } ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr); /* * The '.zfs/snapshot' directory file operations. These mainly control * generating the list of available snapshots when doing an 'ls' in the * directory. See zpl_snapdir_readdir(). */ const struct file_operations zpl_fops_snapdir = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, #ifdef HAVE_VFS_ITERATE_SHARED .iterate_shared = zpl_snapdir_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_snapdir_iterate, #else .readdir = zpl_snapdir_readdir, #endif }; /* * The '.zfs/snapshot' directory inode operations. These mainly control * creating an inode for a snapshot directory and initializing the needed * infrastructure to automount the snapshot. See zpl_snapdir_lookup(). */ const struct inode_operations zpl_ops_snapdir = { .lookup = zpl_snapdir_lookup, .getattr = zpl_snapdir_getattr, -#ifdef HAVE_RENAME_WANTS_FLAGS +#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) .rename = zpl_snapdir_rename2, #else .rename = zpl_snapdir_rename, #endif .rmdir = zpl_snapdir_rmdir, .mkdir = zpl_snapdir_mkdir, }; static struct dentry * zpl_shares_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) { fstrans_cookie_t cookie; cred_t *cr = CRED(); struct inode *ip = NULL; int error; crhold(cr); cookie = spl_fstrans_mark(); error = -zfsctl_shares_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); ASSERT3S(error, <=, 0); spl_fstrans_unmark(cookie); crfree(cr); if (error) { if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } return (d_splice_alias(ip, dentry)); } static int zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) { fstrans_cookie_t cookie; cred_t *cr = CRED(); zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); znode_t *dzp; int error = 0; ZPL_ENTER(zfsvfs); cookie = spl_fstrans_mark(); if (zfsvfs->z_shares_dir == 0) { zpl_dir_emit_dots(filp, ctx); goto out; } error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error) goto out; crhold(cr); error = -zfs_readdir(ZTOI(dzp), ctx, cr); crfree(cr); iput(ZTOI(dzp)); out: spl_fstrans_unmark(cookie); ZPL_EXIT(zfsvfs); ASSERT3S(error, <=, 0); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_shares_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ /* ARGSUSED */ static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_shares_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { struct inode *ip = path->dentry->d_inode; zfsvfs_t *zfsvfs = ITOZSB(ip); znode_t *dzp; int error; ZPL_ENTER(zfsvfs); if (zfsvfs->z_shares_dir == 0) { +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + generic_fillattr(user_ns, path->dentry->d_inode, stat); +#else generic_fillattr(path->dentry->d_inode, stat); +#endif stat->nlink = stat->size = 2; stat->atime = current_time(ip); ZPL_EXIT(zfsvfs); return (0); } error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); if (error == 0) { - error = -zfs_getattr_fast(ZTOI(dzp), stat); +#if defined(HAVE_GENERIC_FILLATTR_USERNS) && defined(HAVE_USERNS_IOPS_GETATTR) + error = -zfs_getattr_fast(user_ns, ZTOI(dzp), stat); +#else + error = -zfs_getattr_fast(kcred->user_ns, ZTOI(dzp), stat); +#endif iput(ZTOI(dzp)); } ZPL_EXIT(zfsvfs); ASSERT3S(error, <=, 0); return (error); } ZPL_GETATTR_WRAPPER(zpl_shares_getattr); /* * The '.zfs/shares' directory file operations. */ const struct file_operations zpl_fops_shares = { .open = zpl_common_open, .llseek = generic_file_llseek, .read = generic_read_dir, #ifdef HAVE_VFS_ITERATE_SHARED .iterate_shared = zpl_shares_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_shares_iterate, #else .readdir = zpl_shares_readdir, #endif }; /* * The '.zfs/shares' directory inode operations. */ const struct inode_operations zpl_ops_shares = { .lookup = zpl_shares_lookup, .getattr = zpl_shares_getattr, }; diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 970db4a8b73a..ea6993ffa4b0 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1,1069 +1,1069 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ #ifdef CONFIG_COMPAT #include #endif #include #include #include #include #include #include /* * When using fallocate(2) to preallocate space, inflate the requested * capacity check by 10% to account for the required metadata blocks. */ unsigned int zfs_fallocate_reserve_percent = 110; static int zpl_open(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; error = generic_file_open(ip, filp); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_release(struct inode *ip, struct file *filp) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); if (ITOZ(ip)->z_atime_dirty) zfs_mark_inode_dirty(ip); crhold(cr); error = -zfs_close(ip, filp->f_flags, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_iterate(struct file *filp, zpl_dir_context_t *ctx) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_readdir(file_inode(filp), ctx, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) static int zpl_readdir(struct file *filp, void *dirent, filldir_t filldir) { zpl_dir_context_t ctx = ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); int error; error = zpl_iterate(filp, &ctx); filp->f_pos = ctx.pos; return (error); } #endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ #if defined(HAVE_FSYNC_WITHOUT_DENTRY) /* * Linux 2.6.35 - 3.0 API, * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed * redundant. The dentry is still accessible via filp->f_path.dentry, * and we are guaranteed that filp will never be NULL. */ static int zpl_fsync(struct file *filp, int datasync) { struct inode *inode = filp->f_mapping->host; cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(ITOZ(inode), datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { return (zpl_fsync(kiocb->ki_filp, datasync)); } #endif #elif defined(HAVE_FSYNC_RANGE) /* * Linux 3.1 - 3.x API, * As of 3.1 the responsibility to call filemap_write_and_wait_range() has * been pushed down in to the .fsync() vfs hook. Additionally, the i_mutex * lock is no longer held by the caller, for zfs we don't require the lock * to be held so we don't acquire it. */ static int zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { struct inode *inode = filp->f_mapping->host; cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; error = filemap_write_and_wait_range(inode->i_mapping, start, end); if (error) return (error); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_fsync(ITOZ(inode), datasync, cr); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_FILE_AIO_FSYNC static int zpl_aio_fsync(struct kiocb *kiocb, int datasync) { return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync)); } #endif #else #error "Unsupported fops->fsync() implementation" #endif static inline int zfs_io_flags(struct kiocb *kiocb) { int flags = 0; #if defined(IOCB_DSYNC) if (kiocb->ki_flags & IOCB_DSYNC) flags |= O_DSYNC; #endif #if defined(IOCB_SYNC) if (kiocb->ki_flags & IOCB_SYNC) flags |= O_SYNC; #endif #if defined(IOCB_APPEND) if (kiocb->ki_flags & IOCB_APPEND) flags |= O_APPEND; #endif #if defined(IOCB_DIRECT) if (kiocb->ki_flags & IOCB_DIRECT) flags |= O_DIRECT; #endif return (flags); } /* * If relatime is enabled, call file_accessed() if zfs_relatime_need_update() * is true. This is needed since datasets with inherited "relatime" property * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after * `zfs set relatime=...`), which is what relatime test in VFS by * relatime_need_update() is based on. */ static inline void zpl_file_accessed(struct file *filp) { struct inode *ip = filp->f_mapping->host; if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) { if (zfs_relatime_need_update(ip)) file_accessed(filp); } else { file_accessed(filp); } } #if defined(HAVE_VFS_RW_ITERATE) /* * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports * iovecs, kvevs, bvecs and pipes, plus all the required interfaces to * manipulate the iov_iter are available. In which case the full iov_iter * can be attached to the uio and correctly handled in the lower layers. * Otherwise, for older kernels extract the iovec and pass it instead. */ static void zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, loff_t pos, ssize_t count, size_t skip) { #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); #else zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif } static ssize_t zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; ssize_t count = iov_iter_count(to); zfs_uio_t uio; zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0); crhold(cr); cookie = spl_fstrans_mark(); int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (error < 0) return (error); ssize_t read = count - uio.uio_resid; kiocb->ki_pos += read; zpl_file_accessed(filp); return (read); } static inline ssize_t zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from, size_t *countp) { #ifdef HAVE_GENERIC_WRITE_CHECKS_KIOCB ssize_t ret = generic_write_checks(kiocb, from); if (ret <= 0) return (ret); *countp = ret; #else struct file *file = kiocb->ki_filp; struct address_space *mapping = file->f_mapping; struct inode *ip = mapping->host; int isblk = S_ISBLK(ip->i_mode); *countp = iov_iter_count(from); ssize_t ret = generic_write_checks(file, &kiocb->ki_pos, countp, isblk); if (ret) return (ret); #endif return (0); } static ssize_t zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; zfs_uio_t uio; size_t count = 0; ssize_t ret; ret = zpl_generic_write_checks(kiocb, from, &count); if (ret) return (ret); zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos, count, from->iov_offset); crhold(cr); cookie = spl_fstrans_mark(); int error = -zfs_write(ITOZ(ip), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (error < 0) return (error); ssize_t wrote = count - uio.uio_resid; kiocb->ki_pos += wrote; if (wrote > 0) iov_iter_advance(from, wrote); return (wrote); } #else /* !HAVE_VFS_RW_ITERATE */ static ssize_t zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; size_t count; ssize_t ret; ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE); if (ret) return (ret); zfs_uio_t uio; zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); crhold(cr); cookie = spl_fstrans_mark(); int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (error < 0) return (error); ssize_t read = count - uio.uio_resid; kiocb->ki_pos += read; zpl_file_accessed(filp); return (read); } static ssize_t zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov, unsigned long nr_segs, loff_t pos) { cred_t *cr = CRED(); fstrans_cookie_t cookie; struct file *filp = kiocb->ki_filp; struct inode *ip = filp->f_mapping->host; size_t count; ssize_t ret; ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ); if (ret) return (ret); ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode)); if (ret) return (ret); zfs_uio_t uio; zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE, count, 0); crhold(cr); cookie = spl_fstrans_mark(); int error = -zfs_write(ITOZ(ip), &uio, filp->f_flags | zfs_io_flags(kiocb), cr); spl_fstrans_unmark(cookie); crfree(cr); if (error < 0) return (error); ssize_t wrote = count - uio.uio_resid; kiocb->ki_pos += wrote; return (wrote); } #endif /* HAVE_VFS_RW_ITERATE */ #if defined(HAVE_VFS_RW_ITERATE) static ssize_t zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter) { if (rw == WRITE) return (zpl_iter_write(kiocb, iter)); else return (zpl_iter_read(kiocb, iter)); } #if defined(HAVE_VFS_DIRECT_IO_ITER) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) static ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { ASSERT3S(pos, ==, kiocb->ki_pos); return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { ASSERT3S(pos, ==, kiocb->ki_pos); return (zpl_direct_IO_impl(rw, kiocb, iter)); } #else #error "Unknown direct IO interface" #endif #else /* HAVE_VFS_RW_ITERATE */ #if defined(HAVE_VFS_DIRECT_IO_IOVEC) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, loff_t pos, unsigned long nr_segs) { if (rw == WRITE) return (zpl_aio_write(kiocb, iov, nr_segs, pos)); else return (zpl_aio_read(kiocb, iov, nr_segs, pos)); } #elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) static ssize_t zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) { const struct iovec *iovp = iov_iter_iovec(iter); unsigned long nr_segs = iter->nr_segs; ASSERT3S(pos, ==, kiocb->ki_pos); if (rw == WRITE) return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); else return (zpl_aio_read(kiocb, iovp, nr_segs, pos)); } #else #error "Unknown direct IO interface" #endif #endif /* HAVE_VFS_RW_ITERATE */ static loff_t zpl_llseek(struct file *filp, loff_t offset, int whence) { #if defined(SEEK_HOLE) && defined(SEEK_DATA) fstrans_cookie_t cookie; if (whence == SEEK_DATA || whence == SEEK_HOLE) { struct inode *ip = filp->f_mapping->host; loff_t maxbytes = ip->i_sb->s_maxbytes; loff_t error; spl_inode_lock_shared(ip); cookie = spl_fstrans_mark(); error = -zfs_holey(ITOZ(ip), whence, &offset); spl_fstrans_unmark(cookie); if (error == 0) error = lseek_execute(filp, ip, offset, maxbytes); spl_inode_unlock_shared(ip); return (error); } #endif /* SEEK_HOLE && SEEK_DATA */ return (generic_file_llseek(filp, offset, whence)); } /* * It's worth taking a moment to describe how mmap is implemented * for zfs because it differs considerably from other Linux filesystems. * However, this issue is handled the same way under OpenSolaris. * * The issue is that by design zfs bypasses the Linux page cache and * leaves all caching up to the ARC. This has been shown to work * well for the common read(2)/write(2) case. However, mmap(2) * is problem because it relies on being tightly integrated with the * page cache. To handle this we cache mmap'ed files twice, once in * the ARC and a second time in the page cache. The code is careful * to keep both copies synchronized. * * When a file with an mmap'ed region is written to using write(2) * both the data in the ARC and existing pages in the page cache * are updated. For a read(2) data will be read first from the page * cache then the ARC if needed. Neither a write(2) or read(2) will * will ever result in new pages being added to the page cache. * * New pages are added to the page cache only via .readpage() which * is called when the vfs needs to read a page off disk to back the * virtual memory region. These pages may be modified without * notifying the ARC and will be written out periodically via * .writepage(). This will occur due to either a sync or the usual * page aging behavior. Note because a read(2) of a mmap'ed file * will always check the page cache first even when the ARC is out * of date correct data will still be returned. * * While this implementation ensures correct behavior it does have * have some drawbacks. The most obvious of which is that it * increases the required memory footprint when access mmap'ed * files. It also adds additional complexity to the code keeping * both caches synchronized. * * Longer term it may be possible to cleanly resolve this wart by * mapping page cache pages directly on to the ARC buffers. The * Linux address space operations are flexible enough to allow * selection of which pages back a particular index. The trick * would be working out the details of which subsystem is in * charge, the ARC, the page cache, or both. It may also prove * helpful to move the ARC buffers to a scatter-gather lists * rather than a vmalloc'ed region. */ static int zpl_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *ip = filp->f_mapping->host; znode_t *zp = ITOZ(ip); int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); spl_fstrans_unmark(cookie); if (error) return (error); error = generic_file_mmap(filp, vma); if (error) return (error); mutex_enter(&zp->z_lock); zp->z_is_mapped = B_TRUE; mutex_exit(&zp->z_lock); return (error); } /* * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). */ static int zpl_readpage(struct file *filp, struct page *pp) { struct inode *ip; struct page *pl[1]; int error = 0; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); ip = pp->mapping->host; pl[0] = pp; cookie = spl_fstrans_mark(); error = -zfs_getpage(ip, pl, 1); spl_fstrans_unmark(cookie); if (error) { SetPageError(pp); ClearPageUptodate(pp); } else { ClearPageError(pp); SetPageUptodate(pp); flush_dcache_page(pp); } unlock_page(pp); return (error); } /* * Populate a set of pages with data for the Linux page cache. This * function will only be called for read ahead and never for demand * paging. For simplicity, the code relies on read_cache_pages() to * correctly lock each page for IO and call zpl_readpage(). */ static int zpl_readpages(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages) { return (read_cache_pages(mapping, pages, (filler_t *)zpl_readpage, filp)); } static int zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) { struct address_space *mapping = data; fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); ASSERT(!PageWriteback(pp)); cookie = spl_fstrans_mark(); (void) zfs_putpage(mapping->host, pp, wbc); spl_fstrans_unmark(cookie); return (0); } static int zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) { znode_t *zp = ITOZ(mapping->host); zfsvfs_t *zfsvfs = ITOZSB(mapping->host); enum writeback_sync_modes sync_mode; int result; ZPL_ENTER(zfsvfs); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; ZPL_EXIT(zfsvfs); sync_mode = wbc->sync_mode; /* * We don't want to run write_cache_pages() in SYNC mode here, because * that would make putpage() wait for a single page to be committed to * disk every single time, resulting in atrocious performance. Instead * we run it once in non-SYNC mode so that the ZIL gets all the data, * and then we commit it all in one go. */ wbc->sync_mode = WB_SYNC_NONE; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); if (sync_mode != wbc->sync_mode) { ZPL_ENTER(zfsvfs); ZPL_VERIFY_ZP(zp); if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, zp->z_id); ZPL_EXIT(zfsvfs); /* * We need to call write_cache_pages() again (we can't just * return after the commit) because the previous call in * non-SYNC mode does not guarantee that we got all the dirty * pages (see the implementation of write_cache_pages() for * details). That being said, this is a no-op in most cases. */ wbc->sync_mode = sync_mode; result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); } return (result); } /* * Write out dirty pages to the ARC, this function is only required to * support mmap(2). Mapped pages may be dirtied by memory operations * which never call .write(). These dirty pages are kept in sync with * the ARC buffers via this hook. */ static int zpl_writepage(struct page *pp, struct writeback_control *wbc) { if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) wbc->sync_mode = WB_SYNC_ALL; return (zpl_putpage(pp, wbc, pp->mapping)); } /* * The flag combination which matches the behavior of zfs_space() is * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE * flag was introduced in the 2.6.38 kernel. * * The original mode=0 (allocate space) behavior can be reasonably emulated * by checking if enough space exists and creating a sparse file, as real * persistent space reservation is not possible due to COW, snapshots, etc. */ static long zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) { cred_t *cr = CRED(); loff_t olen; fstrans_cookie_t cookie; int error = 0; if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0) return (-EOPNOTSUPP); if (offset < 0 || len <= 0) return (-EINVAL); spl_inode_lock(ip); olen = i_size_read(ip); crhold(cr); cookie = spl_fstrans_mark(); if (mode & FALLOC_FL_PUNCH_HOLE) { flock64_t bf; if (offset > olen) goto out_unmark; if (offset + len > olen) len = olen - offset; bf.l_type = F_WRLCK; bf.l_whence = SEEK_SET; bf.l_start = offset; bf.l_len = len; bf.l_pid = 0; error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr); } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) { unsigned int percent = zfs_fallocate_reserve_percent; struct kstatfs statfs; /* Legacy mode, disable fallocate compatibility. */ if (percent == 0) { error = -EOPNOTSUPP; goto out_unmark; } /* * Use zfs_statvfs() instead of dmu_objset_space() since it * also checks project quota limits, which are relevant here. */ error = zfs_statvfs(ip, &statfs); if (error) goto out_unmark; /* * Shrink available space a bit to account for overhead/races. * We know the product previously fit into availbytes from * dmu_objset_space(), so the smaller product will also fit. */ if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) { error = -ENOSPC; goto out_unmark; } if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen) error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE); } out_unmark: spl_fstrans_unmark(cookie); spl_inode_unlock(ip); crfree(cr); return (error); } static long zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) { return zpl_fallocate_common(file_inode(filp), mode, offset, len); } #define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) #define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) static uint32_t __zpl_ioctl_getflags(struct inode *ip) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; uint32_t ioctl_flags = 0; if (zfs_flags & ZFS_IMMUTABLE) ioctl_flags |= FS_IMMUTABLE_FL; if (zfs_flags & ZFS_APPENDONLY) ioctl_flags |= FS_APPEND_FL; if (zfs_flags & ZFS_NODUMP) ioctl_flags |= FS_NODUMP_FL; if (zfs_flags & ZFS_PROJINHERIT) ioctl_flags |= ZFS_PROJINHERIT_FL; return (ioctl_flags & ZFS_FL_USER_VISIBLE); } /* * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file * attributes common to both Linux and Solaris are mapped. */ static int zpl_ioctl_getflags(struct file *filp, void __user *arg) { uint32_t flags; int err; flags = __zpl_ioctl_getflags(file_inode(filp)); err = copy_to_user(arg, &flags, sizeof (flags)); return (err); } /* * fchange() is a helper macro to detect if we have been asked to change a * flag. This is ugly, but the requirement that we do this is a consequence of * how the Linux file attribute interface was designed. Another consequence is * that concurrent modification of files suffers from a TOCTOU race. Neither * are things we can fix without modifying the kernel-userland interface, which * is outside of our jurisdiction. */ #define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1))) static int __zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) { uint64_t zfs_flags = ITOZ(ip)->z_pflags; xoptattr_t *xoap; if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | ZFS_PROJINHERIT_FL)) return (-EOPNOTSUPP); if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE) return (-EACCES); if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) || fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) && !capable(CAP_LINUX_IMMUTABLE)) return (-EACCES); - if (!inode_owner_or_capable(ip)) + if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EACCES); xva_init(xva); xoap = xva_getxoptattr(xva); XVA_SET_REQ(xva, XAT_IMMUTABLE); if (ioctl_flags & FS_IMMUTABLE_FL) xoap->xoa_immutable = B_TRUE; XVA_SET_REQ(xva, XAT_APPENDONLY); if (ioctl_flags & FS_APPEND_FL) xoap->xoa_appendonly = B_TRUE; XVA_SET_REQ(xva, XAT_NODUMP); if (ioctl_flags & FS_NODUMP_FL) xoap->xoa_nodump = B_TRUE; XVA_SET_REQ(xva, XAT_PROJINHERIT); if (ioctl_flags & ZFS_PROJINHERIT_FL) xoap->xoa_projinherit = B_TRUE; return (0); } static int zpl_ioctl_setflags(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); uint32_t flags; cred_t *cr = CRED(); xvattr_t xva; int err; fstrans_cookie_t cookie; if (copy_from_user(&flags, arg, sizeof (flags))) return (-EFAULT); err = __zpl_ioctl_setflags(ip, flags, &xva); if (err) return (err); crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); spl_fstrans_unmark(cookie); crfree(cr); return (err); } static int zpl_ioctl_getxattr(struct file *filp, void __user *arg) { zfsxattr_t fsx = { 0 }; struct inode *ip = file_inode(filp); int err; fsx.fsx_xflags = __zpl_ioctl_getflags(ip); fsx.fsx_projid = ITOZ(ip)->z_projid; err = copy_to_user(arg, &fsx, sizeof (fsx)); return (err); } static int zpl_ioctl_setxattr(struct file *filp, void __user *arg) { struct inode *ip = file_inode(filp); zfsxattr_t fsx; cred_t *cr = CRED(); xvattr_t xva; xoptattr_t *xoap; int err; fstrans_cookie_t cookie; if (copy_from_user(&fsx, arg, sizeof (fsx))) return (-EFAULT); if (!zpl_is_valid_projid(fsx.fsx_projid)) return (-EINVAL); err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva); if (err) return (err); xoap = xva_getxoptattr(&xva); XVA_SET_REQ(&xva, XAT_PROJID); xoap->xoa_projid = fsx.fsx_projid; crhold(cr); cookie = spl_fstrans_mark(); err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); spl_fstrans_unmark(cookie); crfree(cr); return (err); } static long zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC_GETFLAGS: return (zpl_ioctl_getflags(filp, (void *)arg)); case FS_IOC_SETFLAGS: return (zpl_ioctl_setflags(filp, (void *)arg)); case ZFS_IOC_FSGETXATTR: return (zpl_ioctl_getxattr(filp, (void *)arg)); case ZFS_IOC_FSSETXATTR: return (zpl_ioctl_setxattr(filp, (void *)arg)); default: return (-ENOTTY); } } #ifdef CONFIG_COMPAT static long zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { case FS_IOC32_GETFLAGS: cmd = FS_IOC_GETFLAGS; break; case FS_IOC32_SETFLAGS: cmd = FS_IOC_SETFLAGS; break; default: return (-ENOTTY); } return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg))); } #endif /* CONFIG_COMPAT */ const struct address_space_operations zpl_address_space_operations = { .readpages = zpl_readpages, .readpage = zpl_readpage, .writepage = zpl_writepage, .writepages = zpl_writepages, .direct_IO = zpl_direct_IO, }; const struct file_operations zpl_file_operations = { .open = zpl_open, .release = zpl_release, .llseek = zpl_llseek, #ifdef HAVE_VFS_RW_ITERATE #ifdef HAVE_NEW_SYNC_READ .read = new_sync_read, .write = new_sync_write, #endif .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, #endif #else .read = do_sync_read, .write = do_sync_write, .aio_read = zpl_aio_read, .aio_write = zpl_aio_write, #endif .mmap = zpl_mmap, .fsync = zpl_fsync, #ifdef HAVE_FILE_AIO_FSYNC .aio_fsync = zpl_aio_fsync, #endif .fallocate = zpl_fallocate, .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; const struct file_operations zpl_dir_file_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, #if defined(HAVE_VFS_ITERATE_SHARED) .iterate_shared = zpl_iterate, #elif defined(HAVE_VFS_ITERATE) .iterate = zpl_iterate, #else .readdir = zpl_readdir, #endif .fsync = zpl_fsync, .unlocked_ioctl = zpl_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = zpl_compat_ioctl, #endif }; /* BEGIN CSTYLED */ module_param(zfs_fallocate_reserve_percent, uint, 0644); MODULE_PARM_DESC(zfs_fallocate_reserve_percent, "Percentage of length to use for the available capacity check"); /* END CSTYLED */ diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index 117963f44af3..cf0eab3e8c90 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -1,746 +1,787 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * Copyright (c) 2015 by Chunwei Chen. All rights reserved. */ #include #include #include #include #include #include #include #include static struct dentry * zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { cred_t *cr = CRED(); struct inode *ip; znode_t *zp; int error; fstrans_cookie_t cookie; pathname_t *ppn = NULL; pathname_t pn; int zfs_flags = 0; zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; if (dlen(dentry) >= ZAP_MAXNAMELEN) return (ERR_PTR(-ENAMETOOLONG)); crhold(cr); cookie = spl_fstrans_mark(); /* If we are a case insensitive fs, we need the real name */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { zfs_flags = FIGNORECASE; pn_alloc(&pn); ppn = &pn; } error = -zfs_lookup(ITOZ(dir), dname(dentry), &zp, zfs_flags, cr, NULL, ppn); spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); crfree(cr); spin_lock(&dentry->d_lock); dentry->d_time = jiffies; spin_unlock(&dentry->d_lock); if (error) { /* * If we have a case sensitive fs, we do not want to * insert negative entries, so return NULL for ENOENT. * Fall through if the error is not ENOENT. Also free memory. */ if (ppn) { pn_free(ppn); if (error == -ENOENT) return (NULL); } if (error == -ENOENT) return (d_splice_alias(NULL, dentry)); else return (ERR_PTR(error)); } ip = ZTOI(zp); /* * If we are case insensitive, call the correct function * to install the name. */ if (ppn) { struct dentry *new_dentry; struct qstr ci_name; if (strcmp(dname(dentry), pn.pn_buf) == 0) { new_dentry = d_splice_alias(ip, dentry); } else { ci_name.name = pn.pn_buf; ci_name.len = strlen(pn.pn_buf); new_dentry = d_add_ci(dentry, ip, &ci_name); } pn_free(ppn); return (new_dentry); } else { return (d_splice_alias(ip, dentry)); } } void zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr) { vap->va_mask = ATTR_MODE; vap->va_mode = mode; vap->va_uid = crgetfsuid(cr); if (dir && dir->i_mode & S_ISGID) { vap->va_gid = KGID_TO_SGID(dir->i_gid); if (S_ISDIR(mode)) vap->va_mode |= S_ISGID; } else { vap->va_gid = crgetfsgid(cr); } } static int +#ifdef HAVE_IOPS_CREATE_USERNS +zpl_create(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode, bool flag) +#else zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) +#endif { cred_t *cr = CRED(); znode_t *zp; vattr_t *vap; int error; fstrans_cookie_t cookie; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, mode, cr); cookie = spl_fstrans_mark(); error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, mode, &zp, cr, 0, NULL); if (error == 0) { d_instantiate(dentry, ZTOI(zp)); error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ZTOI(zp), dir); if (error) (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int +#ifdef HAVE_IOPS_MKNOD_USERNS +zpl_mknod(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode, +#else zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, +#endif dev_t rdev) { cred_t *cr = CRED(); znode_t *zp; vattr_t *vap; int error; fstrans_cookie_t cookie; /* * We currently expect Linux to supply rdev=0 for all sockets * and fifos, but we want to know if this behavior ever changes. */ if (S_ISSOCK(mode) || S_ISFIFO(mode)) ASSERT(rdev == 0); crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, mode, cr); vap->va_rdev = rdev; cookie = spl_fstrans_mark(); error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, mode, &zp, cr, 0, NULL); if (error == 0) { d_instantiate(dentry, ZTOI(zp)); error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ZTOI(zp), dir); if (error) (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #ifdef HAVE_TMPFILE static int zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { cred_t *cr = CRED(); struct inode *ip; vattr_t *vap; int error; fstrans_cookie_t cookie; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); /* * The VFS does not apply the umask, therefore it is applied here * when POSIX ACLs are not enabled. */ if (!IS_POSIXACL(dir)) mode &= ~current_umask(); zpl_vap_init(vap, dir, mode, cr); cookie = spl_fstrans_mark(); error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL); if (error == 0) { /* d_tmpfile will do drop_nlink, so we should set it first */ set_nlink(ip, 1); d_tmpfile(dentry, ip); error = zpl_xattr_security_init(ip, dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ip, dir); /* * don't need to handle error here, file is already in * unlinked set. */ } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #endif static int zpl_unlink(struct inode *dir, struct dentry *dentry) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_remove(ITOZ(dir), dname(dentry), cr, 0); /* * For a CI FS we must invalidate the dentry to prevent the * creation of negative entries. */ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) d_invalidate(dentry); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int +#ifdef HAVE_IOPS_MKDIR_USERNS +zpl_mkdir(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, umode_t mode) +#else zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +#endif { cred_t *cr = CRED(); vattr_t *vap; znode_t *zp; int error; fstrans_cookie_t cookie; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, mode | S_IFDIR, cr); cookie = spl_fstrans_mark(); error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL); if (error == 0) { d_instantiate(dentry, ZTOI(zp)); error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error == 0) error = zpl_init_acl(ZTOI(zp), dir); if (error) (void) zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0); } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int zpl_rmdir(struct inode *dir, struct dentry *dentry) { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0); /* * For a CI FS we must invalidate the dentry to prevent the * creation of negative entries. */ if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) d_invalidate(dentry); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int +#ifdef HAVE_USERNS_IOPS_GETATTR +zpl_getattr_impl(struct user_namespace *user_ns, + const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +#else zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) +#endif { int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); /* * XXX request_mask and query_flags currently ignored. */ - error = -zfs_getattr_fast(path->dentry->d_inode, stat); +#ifdef HAVE_USERNS_IOPS_GETATTR + error = -zfs_getattr_fast(user_ns, path->dentry->d_inode, stat); +#else + error = -zfs_getattr_fast(kcred->user_ns, path->dentry->d_inode, stat); +#endif spl_fstrans_unmark(cookie); ASSERT3S(error, <=, 0); return (error); } ZPL_GETATTR_WRAPPER(zpl_getattr); static int +#ifdef HAVE_SETATTR_PREPARE_USERNS +zpl_setattr(struct user_namespace *user_ns, struct dentry *dentry, + struct iattr *ia) +#else zpl_setattr(struct dentry *dentry, struct iattr *ia) +#endif { struct inode *ip = dentry->d_inode; cred_t *cr = CRED(); vattr_t *vap; int error; fstrans_cookie_t cookie; - error = setattr_prepare(dentry, ia); + error = zpl_setattr_prepare(kcred->user_ns, dentry, ia); if (error) return (error); crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; vap->va_mode = ia->ia_mode; vap->va_uid = KUID_TO_SUID(ia->ia_uid); vap->va_gid = KGID_TO_SGID(ia->ia_gid); vap->va_size = ia->ia_size; vap->va_atime = ia->ia_atime; vap->va_mtime = ia->ia_mtime; vap->va_ctime = ia->ia_ctime; if (vap->va_mask & ATTR_ATIME) ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip); cookie = spl_fstrans_mark(); error = -zfs_setattr(ITOZ(ip), vap, 0, cr); if (!error && (ia->ia_valid & ATTR_MODE)) error = zpl_chmod_acl(ip); spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int +#ifdef HAVE_IOPS_RENAME_USERNS +zpl_rename2(struct user_namespace *user_ns, struct inode *sdip, + struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, + unsigned int flags) +#else zpl_rename2(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry, unsigned int flags) +#endif { cred_t *cr = CRED(); int error; fstrans_cookie_t cookie; /* We don't have renameat2(2) support */ if (flags) return (-EINVAL); crhold(cr); cookie = spl_fstrans_mark(); error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip), dname(tdentry), cr, 0); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } -#ifndef HAVE_RENAME_WANTS_FLAGS +#if !defined(HAVE_RENAME_WANTS_FLAGS) && !defined(HAVE_IOPS_RENAME_USERNS) static int zpl_rename(struct inode *sdip, struct dentry *sdentry, struct inode *tdip, struct dentry *tdentry) { return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0)); } #endif static int +#ifdef HAVE_IOPS_SYMLINK_USERNS +zpl_symlink(struct user_namespace *user_ns, struct inode *dir, + struct dentry *dentry, const char *name) +#else zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) +#endif { cred_t *cr = CRED(); vattr_t *vap; znode_t *zp; int error; fstrans_cookie_t cookie; crhold(cr); vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr); cookie = spl_fstrans_mark(); error = -zfs_symlink(ITOZ(dir), dname(dentry), vap, (char *)name, &zp, cr, 0); if (error == 0) { d_instantiate(dentry, ZTOI(zp)); error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); if (error) (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); } spl_fstrans_unmark(cookie); kmem_free(vap, sizeof (vattr_t)); crfree(cr); ASSERT3S(error, <=, 0); return (error); } #if defined(HAVE_PUT_LINK_COOKIE) static void zpl_put_link(struct inode *unused, void *cookie) { kmem_free(cookie, MAXPATHLEN); } #elif defined(HAVE_PUT_LINK_NAMEIDATA) static void zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) { const char *link = nd_get_link(nd); if (!IS_ERR(link)) kmem_free(link, MAXPATHLEN); } #elif defined(HAVE_PUT_LINK_DELAYED) static void zpl_put_link(void *ptr) { kmem_free(ptr, MAXPATHLEN); } #endif static int zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) { fstrans_cookie_t cookie; cred_t *cr = CRED(); int error; crhold(cr); *link = NULL; struct iovec iov; iov.iov_len = MAXPATHLEN; iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP); zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, MAXPATHLEN - 1, 0); cookie = spl_fstrans_mark(); error = -zfs_readlink(ip, &uio, cr); spl_fstrans_unmark(cookie); crfree(cr); if (error) kmem_free(iov.iov_base, MAXPATHLEN); else *link = iov.iov_base; return (error); } #if defined(HAVE_GET_LINK_DELAYED) static const char * zpl_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { char *link = NULL; int error; if (!dentry) return (ERR_PTR(-ECHILD)); error = zpl_get_link_common(dentry, inode, &link); if (error) return (ERR_PTR(error)); set_delayed_call(done, zpl_put_link, link); return (link); } #elif defined(HAVE_GET_LINK_COOKIE) static const char * zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie) { char *link = NULL; int error; if (!dentry) return (ERR_PTR(-ECHILD)); error = zpl_get_link_common(dentry, inode, &link); if (error) return (ERR_PTR(error)); return (*cookie = link); } #elif defined(HAVE_FOLLOW_LINK_COOKIE) static const char * zpl_follow_link(struct dentry *dentry, void **cookie) { char *link = NULL; int error; error = zpl_get_link_common(dentry, dentry->d_inode, &link); if (error) return (ERR_PTR(error)); return (*cookie = link); } #elif defined(HAVE_FOLLOW_LINK_NAMEIDATA) static void * zpl_follow_link(struct dentry *dentry, struct nameidata *nd) { char *link = NULL; int error; error = zpl_get_link_common(dentry, dentry->d_inode, &link); if (error) nd_set_link(nd, ERR_PTR(error)); else nd_set_link(nd, link); return (NULL); } #endif static int zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { cred_t *cr = CRED(); struct inode *ip = old_dentry->d_inode; int error; fstrans_cookie_t cookie; if (ip->i_nlink >= ZFS_LINK_MAX) return (-EMLINK); crhold(cr); ip->i_ctime = current_time(ip); /* Must have an existing ref, so igrab() cannot return NULL */ VERIFY3P(igrab(ip), !=, NULL); cookie = spl_fstrans_mark(); error = -zfs_link(ITOZ(dir), ITOZ(ip), dname(dentry), cr, 0); if (error) { iput(ip); goto out; } d_instantiate(dentry, ip); out: spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } static int #ifdef HAVE_D_REVALIDATE_NAMEIDATA zpl_revalidate(struct dentry *dentry, struct nameidata *nd) { unsigned int flags = (nd ? nd->flags : 0); #else zpl_revalidate(struct dentry *dentry, unsigned int flags) { #endif /* HAVE_D_REVALIDATE_NAMEIDATA */ /* CSTYLED */ zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; int error; if (flags & LOOKUP_RCU) return (-ECHILD); /* * After a rollback negative dentries created before the rollback * time must be invalidated. Otherwise they can obscure files which * are only present in the rolled back dataset. */ if (dentry->d_inode == NULL) { spin_lock(&dentry->d_lock); error = time_before(dentry->d_time, zfsvfs->z_rollback_time); spin_unlock(&dentry->d_lock); if (error) return (0); } /* * The dentry may reference a stale inode if a mounted file system * was rolled back to a point in time where the object didn't exist. */ if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale) return (0); return (1); } const struct inode_operations zpl_inode_operations = { .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif /* HAVE_SET_ACL */ .get_acl = zpl_get_acl, #endif /* CONFIG_FS_POSIX_ACL */ }; const struct inode_operations zpl_dir_inode_operations = { .create = zpl_create, .lookup = zpl_lookup, .link = zpl_link, .unlink = zpl_unlink, .symlink = zpl_symlink, .mkdir = zpl_mkdir, .rmdir = zpl_rmdir, .mknod = zpl_mknod, -#ifdef HAVE_RENAME_WANTS_FLAGS +#if defined(HAVE_RENAME_WANTS_FLAGS) || defined(HAVE_IOPS_RENAME_USERNS) .rename = zpl_rename2, #else .rename = zpl_rename, #endif #ifdef HAVE_TMPFILE .tmpfile = zpl_tmpfile, #endif .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif /* HAVE_SET_ACL */ .get_acl = zpl_get_acl, #endif /* CONFIG_FS_POSIX_ACL */ }; const struct inode_operations zpl_symlink_inode_operations = { #ifdef HAVE_GENERIC_READLINK .readlink = generic_readlink, #endif #if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE) .get_link = zpl_get_link, #elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA) .follow_link = zpl_follow_link, #endif #if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA) .put_link = zpl_put_link, #endif .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, }; const struct inode_operations zpl_special_inode_operations = { .setattr = zpl_setattr, .getattr = zpl_getattr, #ifdef HAVE_GENERIC_SETXATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .removexattr = generic_removexattr, #endif .listxattr = zpl_xattr_list, #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) .set_acl = zpl_set_acl, #endif /* HAVE_SET_ACL */ .get_acl = zpl_get_acl, #endif /* CONFIG_FS_POSIX_ACL */ }; dentry_operations_t zpl_dentry_operations = { .d_revalidate = zpl_revalidate, }; diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c index 83812f2dcba8..971cd6ad031e 100644 --- a/module/os/linux/zfs/zpl_xattr.c +++ b/module/os/linux/zfs/zpl_xattr.c @@ -1,1486 +1,1486 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. * * Extended attributes (xattr) on Solaris are implemented as files * which exist in a hidden xattr directory. These extended attributes * can be accessed using the attropen() system call which opens * the extended attribute. It can then be manipulated just like * a standard file descriptor. This has a couple advantages such * as practically no size limit on the file, and the extended * attributes permissions may differ from those of the parent file. * This interface is really quite clever, but it's also completely * different than what is supported on Linux. It also comes with a * steep performance penalty when accessing small xattrs because they * are not stored with the parent file. * * Under Linux extended attributes are manipulated by the system * calls getxattr(2), setxattr(2), and listxattr(2). They consider * extended attributes to be name/value pairs where the name is a * NULL terminated string. The name must also include one of the * following namespace prefixes: * * user - No restrictions and is available to user applications. * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use. * system - Used for access control lists (system.nfs4_acl, etc). * security - Used by SELinux to store a files security context. * * The value under Linux to limited to 65536 bytes of binary data. * In practice, individual xattrs tend to be much smaller than this * and are typically less than 100 bytes. A good example of this * are the security.selinux xattrs which are less than 100 bytes and * exist for every file when xattr labeling is enabled. * * The Linux xattr implementation has been written to take advantage of * this typical usage. When the dataset property 'xattr=sa' is set, * then xattrs will be preferentially stored as System Attributes (SA). * This allows tiny xattrs (~100 bytes) to be stored with the dnode and * up to 64k of xattrs to be stored in the spill block. If additional * xattr space is required, which is unlikely under Linux, they will * be stored using the traditional directory approach. * * This optimization results in roughly a 3x performance improvement * when accessing xattrs because it avoids the need to perform a seek * for every xattr value. When multiple xattrs are stored per-file * the performance improvements are even greater because all of the * xattrs stored in the spill block will be cached. * * However, by default SA based xattrs are disabled in the Linux port * to maximize compatibility with other implementations. If you do * enable SA based xattrs then they will not be visible on platforms * which do not support this feature. * * NOTE: One additional consequence of the xattr directory implementation * is that when an extended attribute is manipulated an inode is created. * This inode will exist in the Linux inode cache but there will be no * associated entry in the dentry cache which references it. This is * safe but it may result in some confusion. Enabling SA based xattrs * largely avoids the issue except in the overflow case. */ #include #include #include #include #include #include typedef struct xattr_filldir { size_t size; size_t offset; char *buf; struct dentry *dentry; } xattr_filldir_t; static const struct xattr_handler *zpl_xattr_handler(const char *); static int zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len) { static const struct xattr_handler *handler; struct dentry *d = xf->dentry; handler = zpl_xattr_handler(name); if (!handler) return (0); if (handler->list) { #if defined(HAVE_XATTR_LIST_SIMPLE) if (!handler->list(d)) return (0); #elif defined(HAVE_XATTR_LIST_DENTRY) if (!handler->list(d, NULL, 0, name, name_len, 0)) return (0); #elif defined(HAVE_XATTR_LIST_HANDLER) if (!handler->list(handler, d, NULL, 0, name, name_len)) return (0); #endif } return (1); } /* * Determine is a given xattr name should be visible and if so copy it * in to the provided buffer (xf->buf). */ static int zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) { /* Check permissions using the per-namespace list xattr handler. */ if (!zpl_xattr_permission(xf, name, name_len)) return (0); /* When xf->buf is NULL only calculate the required size. */ if (xf->buf) { if (xf->offset + name_len + 1 > xf->size) return (-ERANGE); memcpy(xf->buf + xf->offset, name, name_len); xf->buf[xf->offset + name_len] = '\0'; } xf->offset += (name_len + 1); return (0); } /* * Read as many directory entry names as will fit in to the provided buffer, * or when no buffer is provided calculate the required buffer size. */ static int zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf) { zap_cursor_t zc; zap_attribute_t zap; int error; zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id); while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) { if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { error = -ENXIO; break; } error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name)); if (error) break; zap_cursor_advance(&zc); } zap_cursor_fini(&zc); if (error == -ENOENT) error = 0; return (error); } static ssize_t zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr) { struct inode *ip = xf->dentry->d_inode; struct inode *dxip = NULL; znode_t *dxzp; int error; /* Lookup the xattr directory */ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR, cr, NULL, NULL); if (error) { if (error == -ENOENT) error = 0; return (error); } dxip = ZTOI(dxzp); error = zpl_xattr_readdir(dxip, xf); iput(dxip); return (error); } static ssize_t zpl_xattr_list_sa(xattr_filldir_t *xf) { znode_t *zp = ITOZ(xf->dentry->d_inode); nvpair_t *nvp = NULL; int error = 0; mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = -zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) return (error); ASSERT(zp->z_xattr_cached); while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); error = zpl_xattr_filldir(xf, nvpair_name(nvp), strlen(nvpair_name(nvp))); if (error) return (error); } return (0); } ssize_t zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) { znode_t *zp = ITOZ(dentry->d_inode); zfsvfs_t *zfsvfs = ZTOZSB(zp); xattr_filldir_t xf = { buffer_size, 0, buffer, dentry }; cred_t *cr = CRED(); fstrans_cookie_t cookie; int error = 0; crhold(cr); cookie = spl_fstrans_mark(); ZPL_ENTER(zfsvfs); ZPL_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_READER); if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zpl_xattr_list_sa(&xf); if (error) goto out; } error = zpl_xattr_list_dir(&xf, cr); if (error) goto out; error = xf.offset; out: rw_exit(&zp->z_xattr_lock); ZPL_EXIT(zfsvfs); spl_fstrans_unmark(cookie); crfree(cr); return (error); } static int zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, size_t size, cred_t *cr) { fstrans_cookie_t cookie; struct inode *xip = NULL; znode_t *dxzp = NULL; znode_t *xzp = NULL; int error; /* Lookup the xattr directory */ error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR, cr, NULL, NULL); if (error) goto out; /* Lookup a specific xattr name in the directory */ error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL); if (error) goto out; xip = ZTOI(xzp); if (!size) { error = i_size_read(xip); goto out; } if (size < i_size_read(xip)) { error = -ERANGE; goto out; } struct iovec iov; iov.iov_base = (void *)value; iov.iov_len = size; zfs_uio_t uio; zfs_uio_iovec_init(&uio, &iov, 1, 0, UIO_SYSSPACE, size, 0); cookie = spl_fstrans_mark(); error = -zfs_read(ITOZ(xip), &uio, 0, cr); spl_fstrans_unmark(cookie); if (error == 0) error = size - zfs_uio_resid(&uio); out: if (xzp) zrele(xzp); if (dxzp) zrele(dxzp); return (error); } static int zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size) { znode_t *zp = ITOZ(ip); uchar_t *nv_value; uint_t nv_size; int error = 0; ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = -zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) return (error); ASSERT(zp->z_xattr_cached); error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name, &nv_value, &nv_size); if (error) return (error); if (size == 0 || value == NULL) return (nv_size); if (size < nv_size) return (-ERANGE); memcpy(value, nv_value, nv_size); return (nv_size); } static int __zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zpl_xattr_get_sa(ip, name, value, size); if (error != -ENOENT) goto out; } error = zpl_xattr_get_dir(ip, name, value, size, cr); out: if (error == -ENOENT) error = -ENODATA; return (error); } #define XATTR_NOENT 0x0 #define XATTR_IN_SA 0x1 #define XATTR_IN_DIR 0x2 /* check where the xattr resides */ static int __zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; ASSERT(where); ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); *where = XATTR_NOENT; if (zfsvfs->z_use_sa && zp->z_is_sa) { error = zpl_xattr_get_sa(ip, name, NULL, 0); if (error >= 0) *where |= XATTR_IN_SA; else if (error != -ENOENT) return (error); } error = zpl_xattr_get_dir(ip, name, NULL, 0, cr); if (error >= 0) *where |= XATTR_IN_DIR; else if (error != -ENOENT) return (error); if (*where == (XATTR_IN_SA|XATTR_IN_DIR)) cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\"" " in both SA and dir", ip, name); if (*where == XATTR_NOENT) error = -ENODATA; else error = 0; return (error); } static int zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); cred_t *cr = CRED(); fstrans_cookie_t cookie; int error; crhold(cr); cookie = spl_fstrans_mark(); ZPL_ENTER(zfsvfs); ZPL_VERIFY_ZP(zp); rw_enter(&zp->z_xattr_lock, RW_READER); error = __zpl_xattr_get(ip, name, value, size, cr); rw_exit(&zp->z_xattr_lock); ZPL_EXIT(zfsvfs); spl_fstrans_unmark(cookie); crfree(cr); return (error); } static int zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, size_t size, int flags, cred_t *cr) { znode_t *dxzp = NULL; znode_t *xzp = NULL; vattr_t *vap = NULL; int lookup_flags, error; const int xattr_mode = S_IFREG | 0644; loff_t pos = 0; /* * Lookup the xattr directory. When we're adding an entry pass * CREATE_XATTR_DIR to ensure the xattr directory is created. * When removing an entry this flag is not passed to avoid * unnecessarily creating a new xattr directory. */ lookup_flags = LOOKUP_XATTR; if (value != NULL) lookup_flags |= CREATE_XATTR_DIR; error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, lookup_flags, cr, NULL, NULL); if (error) goto out; /* Lookup a specific xattr name in the directory */ error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL); if (error && (error != -ENOENT)) goto out; error = 0; /* Remove a specific name xattr when value is set to NULL. */ if (value == NULL) { if (xzp) error = -zfs_remove(dxzp, (char *)name, cr, 0); goto out; } /* Lookup failed create a new xattr. */ if (xzp == NULL) { vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); vap->va_mode = xattr_mode; vap->va_mask = ATTR_MODE; vap->va_uid = crgetfsuid(cr); vap->va_gid = crgetfsgid(cr); error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp, cr, 0, NULL); if (error) goto out; } ASSERT(xzp != NULL); error = -zfs_freesp(xzp, 0, 0, xattr_mode, TRUE); if (error) goto out; error = -zfs_write_simple(xzp, value, size, pos, NULL); out: if (error == 0) { ip->i_ctime = current_time(ip); zfs_mark_inode_dirty(ip); } if (vap) kmem_free(vap, sizeof (vattr_t)); if (xzp) zrele(xzp); if (dxzp) zrele(dxzp); if (error == -ENOENT) error = -ENODATA; ASSERT3S(error, <=, 0); return (error); } static int zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, size_t size, int flags, cred_t *cr) { znode_t *zp = ITOZ(ip); nvlist_t *nvl; size_t sa_size; int error = 0; mutex_enter(&zp->z_lock); if (zp->z_xattr_cached == NULL) error = -zfs_sa_get_xattr(zp); mutex_exit(&zp->z_lock); if (error) return (error); ASSERT(zp->z_xattr_cached); nvl = zp->z_xattr_cached; if (value == NULL) { error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); if (error == -ENOENT) error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr); } else { /* Limited to 32k to keep nvpair memory allocations small */ if (size > DXATTR_MAX_ENTRY_SIZE) return (-EFBIG); /* Prevent the DXATTR SA from consuming the entire SA region */ error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); if (error) return (error); if (sa_size > DXATTR_MAX_SA_SIZE) return (-EFBIG); error = -nvlist_add_byte_array(nvl, name, (uchar_t *)value, size); } /* * Update the SA for additions, modifications, and removals. On * error drop the inconsistent cached version of the nvlist, it * will be reconstructed from the ARC when next accessed. */ if (error == 0) error = -zfs_sa_set_xattr(zp); if (error) { nvlist_free(nvl); zp->z_xattr_cached = NULL; } ASSERT3S(error, <=, 0); return (error); } static int zpl_xattr_set(struct inode *ip, const char *name, const void *value, size_t size, int flags) { znode_t *zp = ITOZ(ip); zfsvfs_t *zfsvfs = ZTOZSB(zp); cred_t *cr = CRED(); fstrans_cookie_t cookie; int where; int error; crhold(cr); cookie = spl_fstrans_mark(); ZPL_ENTER(zfsvfs); ZPL_VERIFY_ZP(zp); rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER); /* * Before setting the xattr check to see if it already exists. * This is done to ensure the following optional flags are honored. * * XATTR_CREATE: fail if xattr already exists * XATTR_REPLACE: fail if xattr does not exist * * We also want to know if it resides in sa or dir, so we can make * sure we don't end up with duplicate in both places. */ error = __zpl_xattr_where(ip, name, &where, cr); if (error < 0) { if (error != -ENODATA) goto out; if (flags & XATTR_REPLACE) goto out; /* The xattr to be removed already doesn't exist */ error = 0; if (value == NULL) goto out; } else { error = -EEXIST; if (flags & XATTR_CREATE) goto out; } /* Preferentially store the xattr as a SA for better performance */ if (zfsvfs->z_use_sa && zp->z_is_sa && (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) { error = zpl_xattr_set_sa(ip, name, value, size, flags, cr); if (error == 0) { /* * Successfully put into SA, we need to clear the one * in dir. */ if (where & XATTR_IN_DIR) zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr); goto out; } } error = zpl_xattr_set_dir(ip, name, value, size, flags, cr); /* * Successfully put into dir, we need to clear the one in SA. */ if (error == 0 && (where & XATTR_IN_SA)) zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr); out: rw_exit(&ITOZ(ip)->z_xattr_lock); ZPL_EXIT(zfsvfs); spl_fstrans_unmark(cookie); crfree(cr); ASSERT3S(error, <=, 0); return (error); } /* * Extended user attributes * * "Extended user attributes may be assigned to files and directories for * storing arbitrary additional information such as the mime type, * character set or encoding of a file. The access permissions for user * attributes are defined by the file permission bits: read permission * is required to retrieve the attribute value, and writer permission is * required to change it. * * The file permission bits of regular files and directories are * interpreted differently from the file permission bits of special * files and symbolic links. For regular files and directories the file * permission bits define access to the file's contents, while for * device special files they define access to the device described by * the special file. The file permissions of symbolic links are not * used in access checks. These differences would allow users to * consume filesystem resources in a way not controllable by disk quotas * for group or world writable special files and directories. * * For this reason, extended user attributes are allowed only for * regular files and directories, and access to extended user attributes * is restricted to the owner and to users with appropriate capabilities * for directories with the sticky bit set (see the chmod(1) manual page * for an explanation of the sticky bit)." - xattr(7) * * ZFS allows extended user attributes to be disabled administratively * by setting the 'xattr=off' property on the dataset. */ static int __zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { return (ITOZSB(ip)->z_flags & ZSB_XATTR); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list); static int __zpl_xattr_user_get(struct inode *ip, const char *name, void *value, size_t size) { char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) return (-EOPNOTSUPP); xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get); static int __zpl_xattr_user_set(struct inode *ip, const char *name, const void *value, size_t size, int flags) { char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) return (-EOPNOTSUPP); xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set); xattr_handler_t zpl_xattr_user_handler = { .prefix = XATTR_USER_PREFIX, .list = zpl_xattr_user_list, .get = zpl_xattr_user_get, .set = zpl_xattr_user_set, }; /* * Trusted extended attributes * * "Trusted extended attributes are visible and accessible only to * processes that have the CAP_SYS_ADMIN capability. Attributes in this * class are used to implement mechanisms in user space (i.e., outside * the kernel) which keep information in extended attributes to which * ordinary processes should not have access." - xattr(7) */ static int __zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { return (capable(CAP_SYS_ADMIN)); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list); static int __zpl_xattr_trusted_get(struct inode *ip, const char *name, void *value, size_t size) { char *xattr_name; int error; if (!capable(CAP_SYS_ADMIN)) return (-EACCES); /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); static int __zpl_xattr_trusted_set(struct inode *ip, const char *name, const void *value, size_t size, int flags) { char *xattr_name; int error; if (!capable(CAP_SYS_ADMIN)) return (-EACCES); /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set); xattr_handler_t zpl_xattr_trusted_handler = { .prefix = XATTR_TRUSTED_PREFIX, .list = zpl_xattr_trusted_list, .get = zpl_xattr_trusted_get, .set = zpl_xattr_trusted_set, }; /* * Extended security attributes * * "The security attribute namespace is used by kernel security modules, * such as Security Enhanced Linux, and also to implement file * capabilities (see capabilities(7)). Read and write access * permissions to security attributes depend on the policy implemented * for each security attribute by the security module. When no security * module is loaded, all processes have read access to extended security * attributes, and write access is limited to processes that have the * CAP_SYS_ADMIN capability." - xattr(7) */ static int __zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { return (1); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list); static int __zpl_xattr_security_get(struct inode *ip, const char *name, void *value, size_t size) { char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); error = zpl_xattr_get(ip, xattr_name, value, size); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); static int __zpl_xattr_security_set(struct inode *ip, const char *name, const void *value, size_t size, int flags) { char *xattr_name; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") == 0) return (-EINVAL); #endif xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); error = zpl_xattr_set(ip, xattr_name, value, size, flags); kmem_strfree(xattr_name); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set); static int zpl_xattr_security_init_impl(struct inode *ip, const struct xattr *xattrs, void *fs_info) { const struct xattr *xattr; int error = 0; for (xattr = xattrs; xattr->name != NULL; xattr++) { error = __zpl_xattr_security_set(ip, xattr->name, xattr->value, xattr->value_len, 0); if (error < 0) break; } return (error); } int zpl_xattr_security_init(struct inode *ip, struct inode *dip, const struct qstr *qstr) { return security_inode_init_security(ip, dip, qstr, &zpl_xattr_security_init_impl, NULL); } /* * Security xattr namespace handlers. */ xattr_handler_t zpl_xattr_security_handler = { .prefix = XATTR_SECURITY_PREFIX, .list = zpl_xattr_security_list, .get = zpl_xattr_security_get, .set = zpl_xattr_security_set, }; /* * Extended system attributes * * "Extended system attributes are used by the kernel to store system * objects such as Access Control Lists. Read and write access permissions * to system attributes depend on the policy implemented for each system * attribute implemented by filesystems in the kernel." - xattr(7) */ #ifdef CONFIG_FS_POSIX_ACL #ifndef HAVE_SET_ACL static #endif int zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) { char *name, *value = NULL; int error = 0; size_t size = 0; if (S_ISLNK(ip->i_mode)) return (-EOPNOTSUPP); switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; if (acl) { umode_t mode = ip->i_mode; error = posix_acl_equiv_mode(acl, &mode); if (error < 0) { return (error); } else { /* * The mode bits will have been set by * ->zfs_setattr()->zfs_acl_chmod_setattr() * using the ZFS ACL conversion. If they * differ from the Posix ACL conversion dirty * the inode to write the Posix mode bits. */ if (ip->i_mode != mode) { ip->i_mode = mode; ip->i_ctime = current_time(ip); zfs_mark_inode_dirty(ip); } if (error == 0) acl = NULL; } } break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; if (!S_ISDIR(ip->i_mode)) return (acl ? -EACCES : 0); break; default: return (-EINVAL); } if (acl) { size = posix_acl_xattr_size(acl->a_count); value = kmem_alloc(size, KM_SLEEP); error = zpl_acl_to_xattr(acl, value, size); if (error < 0) { kmem_free(value, size); return (error); } } error = zpl_xattr_set(ip, name, value, size, 0); if (value) kmem_free(value, size); if (!error) { if (acl) zpl_set_cached_acl(ip, type, acl); else zpl_forget_cached_acl(ip, type); } return (error); } struct posix_acl * zpl_get_acl(struct inode *ip, int type) { struct posix_acl *acl; void *value = NULL; char *name; int size; /* * As of Linux 3.14, the kernel get_acl will check this for us. * Also as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong * as the kernel get_acl will set it to temporary sentinel value. */ #ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE acl = get_cached_acl(ip, type); if (acl != ACL_NOT_CACHED) return (acl); #endif switch (type) { case ACL_TYPE_ACCESS: name = XATTR_NAME_POSIX_ACL_ACCESS; break; case ACL_TYPE_DEFAULT: name = XATTR_NAME_POSIX_ACL_DEFAULT; break; default: return (ERR_PTR(-EINVAL)); } size = zpl_xattr_get(ip, name, NULL, 0); if (size > 0) { value = kmem_alloc(size, KM_SLEEP); size = zpl_xattr_get(ip, name, value, size); } if (size > 0) { acl = zpl_acl_from_xattr(value, size); } else if (size == -ENODATA || size == -ENOSYS) { acl = NULL; } else { acl = ERR_PTR(-EIO); } if (size > 0) kmem_free(value, size); /* As of Linux 4.7, the kernel get_acl will set this for us */ #ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE if (!IS_ERR(acl)) zpl_set_cached_acl(ip, type, acl); #endif return (acl); } int zpl_init_acl(struct inode *ip, struct inode *dir) { struct posix_acl *acl = NULL; int error = 0; if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (!S_ISLNK(ip->i_mode)) { acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (!acl) { ip->i_mode &= ~current_umask(); ip->i_ctime = current_time(ip); zfs_mark_inode_dirty(ip); return (0); } } if (acl) { umode_t mode; if (S_ISDIR(ip->i_mode)) { error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT); if (error) goto out; } mode = ip->i_mode; error = __posix_acl_create(&acl, GFP_KERNEL, &mode); if (error >= 0) { ip->i_mode = mode; zfs_mark_inode_dirty(ip); if (error > 0) error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); } } out: zpl_posix_acl_release(acl); return (error); } int zpl_chmod_acl(struct inode *ip) { struct posix_acl *acl; int error; if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (S_ISLNK(ip->i_mode)) return (-EOPNOTSUPP); acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); if (IS_ERR(acl) || !acl) return (PTR_ERR(acl)); error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode); if (!error) error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); zpl_posix_acl_release(acl); return (error); } static int __zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS); if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (list && xattr_size <= list_size) memcpy(list, xattr_name, xattr_size); return (xattr_size); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access); static int __zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size, const char *name, size_t name_len) { char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT); if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (0); if (list && xattr_size <= list_size) memcpy(list, xattr_name, xattr_size); return (xattr_size); } ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default); static int __zpl_xattr_acl_get_access(struct inode *ip, const char *name, void *buffer, size_t size) { struct posix_acl *acl; int type = ACL_TYPE_ACCESS; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); acl = zpl_get_acl(ip, type); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (acl == NULL) return (-ENODATA); error = zpl_acl_to_xattr(acl, buffer, size); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access); static int __zpl_xattr_acl_get_default(struct inode *ip, const char *name, void *buffer, size_t size) { struct posix_acl *acl; int type = ACL_TYPE_DEFAULT; int error; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); acl = zpl_get_acl(ip, type); if (IS_ERR(acl)) return (PTR_ERR(acl)); if (acl == NULL) return (-ENODATA); error = zpl_acl_to_xattr(acl, buffer, size); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default); static int __zpl_xattr_acl_set_access(struct inode *ip, const char *name, const void *value, size_t size, int flags) { struct posix_acl *acl; int type = ACL_TYPE_ACCESS; int error = 0; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); - if (!inode_owner_or_capable(ip)) + if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EPERM); if (value) { acl = zpl_acl_from_xattr(value, size); if (IS_ERR(acl)) return (PTR_ERR(acl)); else if (acl) { error = zpl_posix_acl_valid(ip, acl); if (error) { zpl_posix_acl_release(acl); return (error); } } } else { acl = NULL; } error = zpl_set_acl(ip, acl, type); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access); static int __zpl_xattr_acl_set_default(struct inode *ip, const char *name, const void *value, size_t size, int flags) { struct posix_acl *acl; int type = ACL_TYPE_DEFAULT; int error = 0; /* xattr_resolve_name will do this for us if this is defined */ #ifndef HAVE_XATTR_HANDLER_NAME if (strcmp(name, "") != 0) return (-EINVAL); #endif if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIX) return (-EOPNOTSUPP); - if (!inode_owner_or_capable(ip)) + if (!zpl_inode_owner_or_capable(kcred->user_ns, ip)) return (-EPERM); if (value) { acl = zpl_acl_from_xattr(value, size); if (IS_ERR(acl)) return (PTR_ERR(acl)); else if (acl) { error = zpl_posix_acl_valid(ip, acl); if (error) { zpl_posix_acl_release(acl); return (error); } } } else { acl = NULL; } error = zpl_set_acl(ip, acl, type); zpl_posix_acl_release(acl); return (error); } ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default); /* * ACL access xattr namespace handlers. * * Use .name instead of .prefix when available. xattr_resolve_name will match * whole name and reject anything that has .name only as prefix. */ xattr_handler_t zpl_xattr_acl_access_handler = { #ifdef HAVE_XATTR_HANDLER_NAME .name = XATTR_NAME_POSIX_ACL_ACCESS, #else .prefix = XATTR_NAME_POSIX_ACL_ACCESS, #endif .list = zpl_xattr_acl_list_access, .get = zpl_xattr_acl_get_access, .set = zpl_xattr_acl_set_access, #if defined(HAVE_XATTR_LIST_SIMPLE) || \ defined(HAVE_XATTR_LIST_DENTRY) || \ defined(HAVE_XATTR_LIST_HANDLER) .flags = ACL_TYPE_ACCESS, #endif }; /* * ACL default xattr namespace handlers. * * Use .name instead of .prefix when available. xattr_resolve_name will match * whole name and reject anything that has .name only as prefix. */ xattr_handler_t zpl_xattr_acl_default_handler = { #ifdef HAVE_XATTR_HANDLER_NAME .name = XATTR_NAME_POSIX_ACL_DEFAULT, #else .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, #endif .list = zpl_xattr_acl_list_default, .get = zpl_xattr_acl_get_default, .set = zpl_xattr_acl_set_default, #if defined(HAVE_XATTR_LIST_SIMPLE) || \ defined(HAVE_XATTR_LIST_DENTRY) || \ defined(HAVE_XATTR_LIST_HANDLER) .flags = ACL_TYPE_DEFAULT, #endif }; #endif /* CONFIG_FS_POSIX_ACL */ xattr_handler_t *zpl_xattr_handlers[] = { &zpl_xattr_security_handler, &zpl_xattr_trusted_handler, &zpl_xattr_user_handler, #ifdef CONFIG_FS_POSIX_ACL &zpl_xattr_acl_access_handler, &zpl_xattr_acl_default_handler, #endif /* CONFIG_FS_POSIX_ACL */ NULL }; static const struct xattr_handler * zpl_xattr_handler(const char *name) { if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) == 0) return (&zpl_xattr_user_handler); if (strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) == 0) return (&zpl_xattr_trusted_handler); if (strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN) == 0) return (&zpl_xattr_security_handler); #ifdef CONFIG_FS_POSIX_ACL if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0) return (&zpl_xattr_acl_access_handler); if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) return (&zpl_xattr_acl_default_handler); #endif /* CONFIG_FS_POSIX_ACL */ return (NULL); } #if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY) struct acl_rel_struct { struct acl_rel_struct *next; struct posix_acl *acl; clock_t time; }; #define ACL_REL_GRACE (60*HZ) #define ACL_REL_WINDOW (1*HZ) #define ACL_REL_SCHED (ACL_REL_GRACE+ACL_REL_WINDOW) /* * Lockless multi-producer single-consumer fifo list. * Nodes are added to tail and removed from head. Tail pointer is our * synchronization point. It always points to the next pointer of the last * node, or head if list is empty. */ static struct acl_rel_struct *acl_rel_head = NULL; static struct acl_rel_struct **acl_rel_tail = &acl_rel_head; static void zpl_posix_acl_free(void *arg) { struct acl_rel_struct *freelist = NULL; struct acl_rel_struct *a; clock_t new_time; boolean_t refire = B_FALSE; ASSERT3P(acl_rel_head, !=, NULL); while (acl_rel_head) { a = acl_rel_head; if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) { /* * If a is the last node we need to reset tail, but we * need to use cmpxchg to make sure it is still the * last node. */ if (acl_rel_tail == &a->next) { acl_rel_head = NULL; if (cmpxchg(&acl_rel_tail, &a->next, &acl_rel_head) == &a->next) { ASSERT3P(a->next, ==, NULL); a->next = freelist; freelist = a; break; } } /* * a is not last node, make sure next pointer is set * by the adder and advance the head. */ while (READ_ONCE(a->next) == NULL) cpu_relax(); acl_rel_head = a->next; a->next = freelist; freelist = a; } else { /* * a is still in grace period. We are responsible to * reschedule the free task, since adder will only do * so if list is empty. */ new_time = a->time + ACL_REL_SCHED; refire = B_TRUE; break; } } if (refire) taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, NULL, TQ_SLEEP, new_time); while (freelist) { a = freelist; freelist = a->next; kfree(a->acl); kmem_free(a, sizeof (struct acl_rel_struct)); } } void zpl_posix_acl_release_impl(struct posix_acl *acl) { struct acl_rel_struct *a, **prev; a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP); a->next = NULL; a->acl = acl; a->time = ddi_get_lbolt(); /* atomically points tail to us and get the previous tail */ prev = xchg(&acl_rel_tail, &a->next); ASSERT3P(*prev, ==, NULL); *prev = a; /* if it was empty before, schedule the free task */ if (prev == &acl_rel_head) taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED); } #endif