diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 8b0e79afb0f1..b62ab5eec81f 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -1,227 +1,262 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2011, Lawrence Livermore National Security, LLC. 
*/ #ifndef _SYS_ZPL_H #define _SYS_ZPL_H #include #include #include #include #include #include #include #include #include #include #include /* zpl_inode.c */ extern void zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr, zidmap_t *mnt_ns); extern const struct inode_operations zpl_inode_operations; #ifdef HAVE_RENAME2_OPERATIONS_WRAPPER extern const struct inode_operations_wrapper zpl_dir_inode_operations; #else extern const struct inode_operations zpl_dir_inode_operations; #endif extern const struct inode_operations zpl_symlink_inode_operations; extern const struct inode_operations zpl_special_inode_operations; /* zpl_file.c */ extern const struct address_space_operations zpl_address_space_operations; extern const struct file_operations zpl_file_operations; extern const struct file_operations zpl_dir_file_operations; /* zpl_super.c */ extern void zpl_prune_sb(int64_t nr_to_scan, void *arg); extern const struct super_operations zpl_super_operations; extern const struct export_operations zpl_export_operations; extern struct file_system_type zpl_fs_type; /* zpl_xattr.c */ extern ssize_t zpl_xattr_list(struct dentry *dentry, char *buf, size_t size); extern int zpl_xattr_security_init(struct inode *ip, struct inode *dip, const struct qstr *qstr); #if defined(CONFIG_FS_POSIX_ACL) #if defined(HAVE_SET_ACL) #if defined(HAVE_SET_ACL_IDMAP_DENTRY) extern int zpl_set_acl(struct mnt_idmap *idmap, struct dentry *dentry, struct posix_acl *acl, int type); #elif defined(HAVE_SET_ACL_USERNS) extern int zpl_set_acl(struct user_namespace *userns, struct inode *ip, struct posix_acl *acl, int type); #elif defined(HAVE_SET_ACL_USERNS_DENTRY_ARG2) extern int zpl_set_acl(struct user_namespace *userns, struct dentry *dentry, struct posix_acl *acl, int type); #else extern int zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type); #endif /* HAVE_SET_ACL_USERNS */ #endif /* HAVE_SET_ACL */ #if defined(HAVE_GET_ACL_RCU) || defined(HAVE_GET_INODE_ACL) extern struct 
posix_acl *zpl_get_acl(struct inode *ip, int type, bool rcu); #elif defined(HAVE_GET_ACL) extern struct posix_acl *zpl_get_acl(struct inode *ip, int type); #endif extern int zpl_init_acl(struct inode *ip, struct inode *dir); extern int zpl_chmod_acl(struct inode *ip); #else static inline int zpl_init_acl(struct inode *ip, struct inode *dir) { return (0); } static inline int zpl_chmod_acl(struct inode *ip) { return (0); } #endif /* CONFIG_FS_POSIX_ACL */ extern xattr_handler_t *zpl_xattr_handlers[]; /* zpl_ctldir.c */ extern const struct file_operations zpl_fops_root; extern const struct inode_operations zpl_ops_root; extern const struct file_operations zpl_fops_snapdir; extern const struct inode_operations zpl_ops_snapdir; extern const struct file_operations zpl_fops_shares; extern const struct inode_operations zpl_ops_shares; #if defined(HAVE_VFS_ITERATE) || defined(HAVE_VFS_ITERATE_SHARED) #define ZPL_DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ .actor = _actor, \ .pos = _pos, \ } typedef struct dir_context zpl_dir_context_t; #define zpl_dir_emit dir_emit #define zpl_dir_emit_dot dir_emit_dot #define zpl_dir_emit_dotdot dir_emit_dotdot #define zpl_dir_emit_dots dir_emit_dots #else typedef struct zpl_dir_context { void *dirent; const filldir_t actor; loff_t pos; } zpl_dir_context_t; #define ZPL_DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ .dirent = _dirent, \ .actor = _actor, \ .pos = _pos, \ } static inline bool zpl_dir_emit(zpl_dir_context_t *ctx, const char *name, int namelen, uint64_t ino, unsigned type) { return (!ctx->actor(ctx->dirent, name, namelen, ctx->pos, ino, type)); } static inline bool zpl_dir_emit_dot(struct file *file, zpl_dir_context_t *ctx) { return (ctx->actor(ctx->dirent, ".", 1, ctx->pos, file_inode(file)->i_ino, DT_DIR) == 0); } static inline bool zpl_dir_emit_dotdot(struct file *file, zpl_dir_context_t *ctx) { return (ctx->actor(ctx->dirent, "..", 2, ctx->pos, parent_ino(file_dentry(file)), DT_DIR) == 0); } static inline bool 
zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx) { if (ctx->pos == 0) { if (!zpl_dir_emit_dot(file, ctx)) return (false); ctx->pos = 1; } if (ctx->pos == 1) { if (!zpl_dir_emit_dotdot(file, ctx)) return (false); ctx->pos = 2; } return (true); } #endif /* HAVE_VFS_ITERATE */ /* zpl_file_range.c */ /* handlers for file_operations of the same name */ extern ssize_t zpl_copy_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags); extern loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags); extern int zpl_clone_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, uint64_t len); extern int zpl_dedupe_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, uint64_t len); +/* compat for FICLONE/FICLONERANGE/FIDEDUPERANGE ioctls */ +typedef struct { + int64_t fcr_src_fd; + uint64_t fcr_src_offset; + uint64_t fcr_src_length; + uint64_t fcr_dest_offset; +} zfs_ioc_compat_file_clone_range_t; + +typedef struct { + int64_t fdri_dest_fd; + uint64_t fdri_dest_offset; + uint64_t fdri_bytes_deduped; + int32_t fdri_status; + uint32_t fdri_reserved; +} zfs_ioc_compat_dedupe_range_info_t; + +typedef struct { + uint64_t fdr_src_offset; + uint64_t fdr_src_length; + uint16_t fdr_dest_count; + uint16_t fdr_reserved1; + uint32_t fdr_reserved2; + zfs_ioc_compat_dedupe_range_info_t fdr_info[]; +} zfs_ioc_compat_dedupe_range_t; + +#define ZFS_IOC_COMPAT_FICLONE _IOW(0x94, 9, int) +#define ZFS_IOC_COMPAT_FICLONERANGE \ + _IOW(0x94, 13, zfs_ioc_compat_file_clone_range_t) +#define ZFS_IOC_COMPAT_FIDEDUPERANGE \ + _IOWR(0x94, 54, zfs_ioc_compat_dedupe_range_t) + +extern long zpl_ioctl_ficlone(struct file *filp, void *arg); +extern long zpl_ioctl_ficlonerange(struct file *filp, void *arg); +extern long zpl_ioctl_fideduperange(struct file *filp, void *arg); + #if 
defined(HAVE_INODE_TIMESTAMP_TRUNCATE) #define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip) #elif defined(HAVE_INODE_TIMESPEC64_TIMES) #define zpl_inode_timestamp_truncate(ts, ip) \ timespec64_trunc(ts, (ip)->i_sb->s_time_gran) #else #define zpl_inode_timestamp_truncate(ts, ip) \ timespec_trunc(ts, (ip)->i_sb->s_time_gran) #endif #if defined(HAVE_INODE_OWNER_OR_CAPABLE) #define zpl_inode_owner_or_capable(ns, ip) inode_owner_or_capable(ip) #elif defined(HAVE_INODE_OWNER_OR_CAPABLE_USERNS) #define zpl_inode_owner_or_capable(ns, ip) inode_owner_or_capable(ns, ip) #elif defined(HAVE_INODE_OWNER_OR_CAPABLE_IDMAP) #define zpl_inode_owner_or_capable(idmap, ip) inode_owner_or_capable(idmap, ip) #else #error "Unsupported kernel" #endif #if defined(HAVE_SETATTR_PREPARE_USERNS) || defined(HAVE_SETATTR_PREPARE_IDMAP) #define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(ns, dentry, ia) #else /* * Use kernel-provided version, or our own from * linux/vfs_compat.h */ #define zpl_setattr_prepare(ns, dentry, ia) setattr_prepare(dentry, ia) #endif #endif /* _SYS_ZPL_H */ diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 92b603e98a23..87a248af8303 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1,1376 +1,1382 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or https://opensource.org/licenses/CDDL-1.0. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
* If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2011, Lawrence Livermore National Security, LLC.
 * Copyright (c) 2015 by Chunwei Chen. All rights reserved.
 */

/*
 * NOTE(review): every #include directive below appears without a header
 * name in this copy of the file. The operands have to be restored from the
 * authoritative source before this will preprocess.
 */
#ifdef CONFIG_COMPAT
#include
#endif
#include
#include
#include
#include
#include
#include
#include
#if defined(HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS) || \
	defined(HAVE_VFS_FILEMAP_DIRTY_FOLIO)
#include
#endif
#ifdef HAVE_FILE_FADVISE
#include
#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
#include
#endif

/*
 * When using fallocate(2) to preallocate space, inflate the requested
 * capacity check by 10% to account for the required metadata blocks.
 */
static unsigned int zfs_fallocate_reserve_percent = 110;

/*
 * Open a file. generic_file_open() runs first and its error, if any, is
 * returned unchanged. Otherwise zfs_open() is called with a hold on the
 * CRED() credential and with the fstrans mark set. The zfs_open() result
 * is negated, so the return value is asserted to be <= 0.
 */
static int
zpl_open(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	error = generic_file_open(ip, filp);
	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

/*
 * Release a file. If the znode has z_atime_dirty set the inode is marked
 * dirty first (presumably so the deferred atime reaches disk -- confirm
 * against zfs_mark_inode_dirty()). zfs_close() is then called and its
 * negated result returned.
 */
static int
zpl_release(struct inode *ip, struct file *filp)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	cookie = spl_fstrans_mark();
	if (ITOZ(ip)->z_atime_dirty)
		zfs_mark_inode_dirty(ip);

	crhold(cr);
	error = -zfs_close(ip, filp->f_flags, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

/*
 * Directory iteration: hand the directory context to zfs_readdir() for the
 * inode behind filp and return its negated result (asserted <= 0).
 */
static int
zpl_iterate(struct file *filp, zpl_dir_context_t *ctx)
{
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_readdir(file_inode(filp), ctx, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED)
/*
 * Entry point used when neither HAVE_VFS_ITERATE nor
 * HAVE_VFS_ITERATE_SHARED is defined; it adapts the
 * (dirent, filldir, f_pos) arguments to zpl_iterate().
 */
static int
zpl_readdir(struct file *filp, void *dirent, filldir_t filldir)
{
/* Build a directory context that starts at the file's current position. */
	zpl_dir_context_t ctx =
	    ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos);
	int error;

	error = zpl_iterate(filp, &ctx);
	/* Publish the position the iteration stopped at, even on error. */
	filp->f_pos = ctx.pos;

	return (error);
}
#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */

#if defined(HAVE_FSYNC_WITHOUT_DENTRY)
/*
 * Linux 2.6.35 - 3.0 API,
 * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed
 * redundant. The dentry is still accessible via filp->f_path.dentry,
 * and we are guaranteed that filp will never be NULL.
 *
 * Calls zfs_fsync() on the znode behind filp->f_mapping->host and returns
 * its negated result (asserted <= 0).
 */
static int
zpl_fsync(struct file *filp, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_fsync(ITOZ(inode), datasync, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#ifdef HAVE_FILE_AIO_FSYNC
/* AIO variant: forwards to zpl_fsync() on the kiocb's file. */
static int
zpl_aio_fsync(struct kiocb *kiocb, int datasync)
{
	return (zpl_fsync(kiocb->ki_filp, datasync));
}
#endif

#elif defined(HAVE_FSYNC_RANGE)
/*
 * Linux 3.1 API,
 * As of 3.1 the responsibility to call filemap_write_and_wait_range() has
 * been pushed down into the .fsync() vfs hook. Additionally, the i_mutex
 * lock is no longer held by the caller; for zfs we don't require the lock
 * to be held so we don't acquire it.
 *
 * Returns the error from zpl_enter() or filemap_write_and_wait_range() if
 * either fails, otherwise the negated zfs_fsync() result.
 */
static int
zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync)
{
	struct inode *inode = filp->f_mapping->host;
	znode_t *zp = ITOZ(inode);
	zfsvfs_t *zfsvfs = ITOZSB(inode);
	cred_t *cr = CRED();
	int error;
	fstrans_cookie_t cookie;

	/*
	 * The variables z_sync_writes_cnt and z_async_writes_cnt work in
	 * tandem so that sync writes can detect if there are any non-sync
	 * writes going on and vice-versa. The "vice-versa" part to this logic
	 * is located in zfs_putpage() where non-sync writes check if there are
	 * any ongoing sync writes. If any sync and non-sync writes overlap,
	 * we do a commit to complete the non-sync writes since the latter can
	 * potentially take several seconds to complete and thus block sync
	 * writes in the upcoming call to filemap_write_and_wait_range().
	 */
	atomic_inc_32(&zp->z_sync_writes_cnt);
	/*
	 * If the following check does not detect an overlapping non-sync write
	 * (say because it's just about to start), then it is guaranteed that
	 * the non-sync write will detect this sync write. This is because we
	 * always increment z_sync_writes_cnt / z_async_writes_cnt before doing
	 * the check on z_async_writes_cnt / z_sync_writes_cnt here and in
	 * zfs_putpage() respectively.
	 */
	if (atomic_load_32(&zp->z_async_writes_cnt) > 0) {
		if ((error = zpl_enter(zfsvfs, FTAG)) != 0) {
			/* Undo the increment above before bailing out. */
			atomic_dec_32(&zp->z_sync_writes_cnt);
			return (error);
		}
		zil_commit(zfsvfs->z_log, zp->z_id);
		zpl_exit(zfsvfs, FTAG);
	}

	error = filemap_write_and_wait_range(inode->i_mapping, start, end);

	/*
	 * The sync write is not complete yet but we decrement
	 * z_sync_writes_cnt since zfs_fsync() increments and decrements
	 * it internally. If a non-sync write starts just after the decrement
	 * operation but before we call zfs_fsync(), it may not detect this
	 * overlapping sync write but it does not matter since we have already
	 * gone past filemap_write_and_wait_range() and we won't block due to
	 * the non-sync write.
	 */
	atomic_dec_32(&zp->z_sync_writes_cnt);

	if (error)
		return (error);

	crhold(cr);
	cookie = spl_fstrans_mark();
	error = -zfs_fsync(zp, datasync, cr);
	spl_fstrans_unmark(cookie);
	crfree(cr);
	ASSERT3S(error, <=, 0);

	return (error);
}

#ifdef HAVE_FILE_AIO_FSYNC
/*
 * AIO variant: forwards to zpl_fsync() for the range starting at
 * kiocb->ki_pos, passing -1 as the end offset.
 */
static int
zpl_aio_fsync(struct kiocb *kiocb, int datasync)
{
	return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync));
}
#endif

#else
#error "Unsupported fops->fsync() implementation"
#endif

/*
 * Translate the IOCB_* bits set in kiocb->ki_flags into the matching O_*
 * flags. Each mapping is compiled in only when the kernel headers define
 * the corresponding IOCB_* macro; returns 0 when none apply.
 */
static inline int
zfs_io_flags(struct kiocb *kiocb)
{
	int flags = 0;

#if defined(IOCB_DSYNC)
	if (kiocb->ki_flags & IOCB_DSYNC)
		flags |= O_DSYNC;
#endif
#if defined(IOCB_SYNC)
	if (kiocb->ki_flags & IOCB_SYNC)
		flags |= O_SYNC;
#endif
#if defined(IOCB_APPEND)
	if (kiocb->ki_flags & IOCB_APPEND)
		flags |= O_APPEND;
#endif
#if defined(IOCB_DIRECT)
	if (kiocb->ki_flags & IOCB_DIRECT)
		flags |= O_DIRECT;
#endif
	return (flags);
}

/*
 * If relatime is enabled, call file_accessed() if zfs_relatime_need_update()
 * is true. This is needed since datasets with inherited "relatime" property
 * aren't necessarily mounted with the MNT_RELATIME flag (e.g. after
 * `zfs set relatime=...`), which is what relatime test in VFS by
 * relatime_need_update() is based on.
 *
 * In every other case (IS_NOATIME() inode, or z_relatime clear) the call
 * to file_accessed() is made unconditionally.
 */
static inline void
zpl_file_accessed(struct file *filp)
{
	struct inode *ip = filp->f_mapping->host;

	if (!IS_NOATIME(ip) && ITOZSB(ip)->z_relatime) {
		if (zfs_relatime_need_update(ip))
			file_accessed(filp);
	} else {
		file_accessed(filp);
	}
}

#if defined(HAVE_VFS_RW_ITERATE)

/*
 * When HAVE_VFS_IOV_ITER is defined the iov_iter structure supports
 * iovecs, kvecs, bvecs and pipes, plus all the required interfaces to
 * manipulate the iov_iter are available. In which case the full iov_iter
 * can be attached to the uio and correctly handled in the lower layers.
 * Otherwise, for older kernels extract the iovec and pass it instead.
*/
static void
zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to,
    loff_t pos, ssize_t count, size_t skip)
{
	/* Note: kiocb is not referenced by any of the branches below. */
#if defined(HAVE_VFS_IOV_ITER)
	zfs_uio_iov_iter_init(uio, to, pos, count, skip);
#else
	/* ITER_KVEC iterators are tagged UIO_SYSSPACE, others UIO_USERSPACE. */
#ifdef HAVE_IOV_ITER_TYPE
	zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos,
	    iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
	    count, skip);
#else
	zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos,
	    to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE,
	    count, skip);
#endif
#endif
}

/*
 * Read through zfs_read() into the iov_iter, starting at kiocb->ki_pos.
 * On success advances ki_pos by the number of bytes consumed
 * (count - uio_resid), updates atime via zpl_file_accessed() and returns
 * that byte count; on failure returns the negative error.
 */
static ssize_t
zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	ssize_t count = iov_iter_count(to);
	zfs_uio_t uio;

	zpl_uio_init(&uio, kiocb, to, kiocb->ki_pos, count, 0);

	crhold(cr);
	cookie = spl_fstrans_mark();

	int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (error < 0)
		return (error);

	ssize_t read = count - uio.uio_resid;
	kiocb->ki_pos += read;

	zpl_file_accessed(filp);

	return (read);
}

/*
 * Run the kernel's generic_write_checks() using whichever signature the
 * kernel provides and store the permitted byte count in *countp.
 *
 * Returns 0 to continue with the write. With the kiocb-based API a result
 * <= 0 from generic_write_checks() is returned as-is and *countp is left
 * untouched; with the older API any non-zero result is returned.
 */
static inline ssize_t
zpl_generic_write_checks(struct kiocb *kiocb, struct iov_iter *from,
    size_t *countp)
{
#ifdef HAVE_GENERIC_WRITE_CHECKS_KIOCB
	ssize_t ret = generic_write_checks(kiocb, from);
	if (ret <= 0)
		return (ret);

	*countp = ret;
#else
	struct file *file = kiocb->ki_filp;
	struct address_space *mapping = file->f_mapping;
	struct inode *ip = mapping->host;
	int isblk = S_ISBLK(ip->i_mode);

	*countp = iov_iter_count(from);
	ssize_t ret = generic_write_checks(file, &kiocb->ki_pos, countp, isblk);
	if (ret)
		return (ret);
#endif

	return (0);
}

/*
 * Write the iov_iter through zfs_write() at kiocb->ki_pos. On success
 * advances ki_pos by the bytes written (count - uio_resid) and returns
 * that count; otherwise returns the negative error.
 */
static ssize_t
zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	struct inode *ip = filp->f_mapping->host;
	zfs_uio_t uio;
	size_t count = 0;
	ssize_t ret;

	ret = zpl_generic_write_checks(kiocb, from, &count);
	if (ret)
		return (ret);

	/* Resume from wherever the iterator already stands (iov_offset). */
	zpl_uio_init(&uio, kiocb, from, kiocb->ki_pos,
	    count, from->iov_offset);

	crhold(cr);
	cookie = spl_fstrans_mark();

	int error = -zfs_write(ITOZ(ip), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (error < 0)
		return (error);

	ssize_t wrote = count - uio.uio_resid;
	kiocb->ki_pos += wrote;

	return (wrote);
}

#else /* !HAVE_VFS_RW_ITERATE */

/*
 * iovec-based read used when HAVE_VFS_RW_ITERATE is not defined. The
 * segments are validated with generic_segment_checks() and then read via
 * zfs_read() at kiocb->ki_pos; the pos argument itself is not used.
 * Returns the bytes read or a negative error.
 */
static ssize_t
zpl_aio_read(struct kiocb *kiocb, const struct iovec *iov,
    unsigned long nr_segs, loff_t pos)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	size_t count;
	ssize_t ret;

	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_WRITE);
	if (ret)
		return (ret);

	zfs_uio_t uio;
	zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
	    count, 0);

	crhold(cr);
	cookie = spl_fstrans_mark();

	int error = -zfs_read(ITOZ(filp->f_mapping->host), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (error < 0)
		return (error);

	ssize_t read = count - uio.uio_resid;
	kiocb->ki_pos += read;

	zpl_file_accessed(filp);

	return (read);
}

/*
 * iovec-based write used when HAVE_VFS_RW_ITERATE is not defined. After
 * generic_segment_checks() and generic_write_checks() (which may adjust
 * pos and count) the resulting pos is stored in kiocb->ki_pos and the data
 * is written via zfs_write(). Returns the bytes written or a negative
 * error.
 */
static ssize_t
zpl_aio_write(struct kiocb *kiocb, const struct iovec *iov,
    unsigned long nr_segs, loff_t pos)
{
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	struct file *filp = kiocb->ki_filp;
	struct inode *ip = filp->f_mapping->host;
	size_t count;
	ssize_t ret;

	ret = generic_segment_checks(iov, &nr_segs, &count, VERIFY_READ);
	if (ret)
		return (ret);

	ret = generic_write_checks(filp, &pos, &count, S_ISBLK(ip->i_mode));
	if (ret)
		return (ret);

	kiocb->ki_pos = pos;

	zfs_uio_t uio;
	zfs_uio_iovec_init(&uio, iov, nr_segs, kiocb->ki_pos, UIO_USERSPACE,
	    count, 0);

	crhold(cr);
	cookie = spl_fstrans_mark();

	int error = -zfs_write(ITOZ(ip), &uio,
	    filp->f_flags | zfs_io_flags(kiocb), cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	if (error < 0)
		return (error);

	ssize_t wrote = count - uio.uio_resid;
	kiocb->ki_pos += wrote;

	return (wrote);
}
#endif /* HAVE_VFS_RW_ITERATE */

#if defined(HAVE_VFS_RW_ITERATE)
static ssize_t
zpl_direct_IO_impl(int rw,
struct kiocb *kiocb, struct iov_iter *iter)
{
	/* Direct I/O is routed through the regular iter read/write paths. */
	if (rw == WRITE)
		return (zpl_iter_write(kiocb, iter));
	else
		return (zpl_iter_read(kiocb, iter));
}

/*
 * zpl_direct_IO() exists in one of several signatures, chosen by the
 * HAVE_VFS_DIRECT_IO_* configuration macros; every variant forwards to the
 * same read/write implementation.
 */
#if defined(HAVE_VFS_DIRECT_IO_ITER)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter)
{
	return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
static ssize_t
zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
	ASSERT3S(pos, ==, kiocb->ki_pos);
	return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter));
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
	ASSERT3S(pos, ==, kiocb->ki_pos);
	return (zpl_direct_IO_impl(rw, kiocb, iter));
}
#else
#error "Unknown direct IO interface"
#endif

#else /* !HAVE_VFS_RW_ITERATE */

#if defined(HAVE_VFS_DIRECT_IO_IOVEC)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov,
    loff_t pos, unsigned long nr_segs)
{
	if (rw == WRITE)
		return (zpl_aio_write(kiocb, iov, nr_segs, pos));
	else
		return (zpl_aio_read(kiocb, iov, nr_segs, pos));
}
#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET)
static ssize_t
zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos)
{
	const struct iovec *iovp = iov_iter_iovec(iter);
	unsigned long nr_segs = iter->nr_segs;

	ASSERT3S(pos, ==, kiocb->ki_pos);
	if (rw == WRITE)
		return (zpl_aio_write(kiocb, iovp, nr_segs, pos));
	else
		return (zpl_aio_read(kiocb, iovp, nr_segs, pos));
}
#else
#error "Unknown direct IO interface"
#endif

#endif /* HAVE_VFS_RW_ITERATE */

/*
 * Seek within a file. SEEK_DATA and SEEK_HOLE (when the kernel defines
 * them) are resolved by zfs_holey() under the shared inode lock and then
 * applied with lseek_execute(); every other whence value is passed to
 * generic_file_llseek(). Returns the new offset or a negative error.
 */
static loff_t
zpl_llseek(struct file *filp, loff_t offset, int whence)
{
#if defined(SEEK_HOLE) && defined(SEEK_DATA)
	fstrans_cookie_t cookie;

	if (whence == SEEK_DATA || whence == SEEK_HOLE) {
		struct inode *ip = filp->f_mapping->host;
		loff_t maxbytes = ip->i_sb->s_maxbytes;
		loff_t error;

		spl_inode_lock_shared(ip);
		cookie = spl_fstrans_mark();
		/* zfs_holey() rewrites offset in place. */
		error = -zfs_holey(ITOZ(ip), whence, &offset);
		spl_fstrans_unmark(cookie);
		if (error == 0)
			error = lseek_execute(filp, ip, offset, maxbytes);
		spl_inode_unlock_shared(ip);

		return (error);
	}
#endif /* SEEK_HOLE && SEEK_DATA */

	return (generic_file_llseek(filp, offset, whence));
}

/*
 * It's worth taking a moment to describe how mmap is implemented
 * for zfs because it differs considerably from other Linux filesystems.
 * However, this issue is handled the same way under OpenSolaris.
 *
 * The issue is that by design zfs bypasses the Linux page cache and
 * leaves all caching up to the ARC. This has been shown to work
 * well for the common read(2)/write(2) case. However, mmap(2)
 * is a problem because it relies on being tightly integrated with the
 * page cache. To handle this we cache mmap'ed files twice, once in
 * the ARC and a second time in the page cache. The code is careful
 * to keep both copies synchronized.
 *
 * When a file with an mmap'ed region is written to using write(2)
 * both the data in the ARC and existing pages in the page cache
 * are updated. For a read(2) data will be read first from the page
 * cache then the ARC if needed. Neither a write(2) nor a read(2)
 * will ever result in new pages being added to the page cache.
 *
 * New pages are added to the page cache only via .readpage() which
 * is called when the vfs needs to read a page off disk to back the
 * virtual memory region. These pages may be modified without
 * notifying the ARC and will be written out periodically via
 * .writepage(). This will occur due to either a sync or the usual
 * page aging behavior. Note because a read(2) of a mmap'ed file
 * will always check the page cache first even when the ARC is out
 * of date correct data will still be returned.
 *
 * While this implementation ensures correct behavior it does
 * have some drawbacks. The most obvious of which is that it
 * increases the required memory footprint when accessing mmap'ed
 * files. It also adds additional complexity to the code keeping
 * both caches synchronized.
* * Longer term it may be possible to cleanly resolve this wart by * mapping page cache pages directly on to the ARC buffers. The * Linux address space operations are flexible enough to allow * selection of which pages back a particular index. The trick * would be working out the details of which subsystem is in * charge, the ARC, the page cache, or both. It may also prove * helpful to move the ARC buffers to a scatter-gather lists * rather than a vmalloc'ed region. */ static int zpl_mmap(struct file *filp, struct vm_area_struct *vma) { struct inode *ip = filp->f_mapping->host; int error; fstrans_cookie_t cookie; cookie = spl_fstrans_mark(); error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); spl_fstrans_unmark(cookie); if (error) return (error); error = generic_file_mmap(filp, vma); if (error) return (error); #if !defined(HAVE_FILEMAP_RANGE_HAS_PAGE) znode_t *zp = ITOZ(ip); mutex_enter(&zp->z_lock); zp->z_is_mapped = B_TRUE; mutex_exit(&zp->z_lock); #endif return (error); } /* * Populate a page with data for the Linux page cache. This function is * only used to support mmap(2). There will be an identical copy of the * data in the ARC which is kept up to date via .write() and .writepage(). */ static inline int zpl_readpage_common(struct page *pp) { fstrans_cookie_t cookie; ASSERT(PageLocked(pp)); cookie = spl_fstrans_mark(); int error = -zfs_getpage(pp->mapping->host, pp); spl_fstrans_unmark(cookie); unlock_page(pp); return (error); } #ifdef HAVE_VFS_READ_FOLIO static int zpl_read_folio(struct file *filp, struct folio *folio) { return (zpl_readpage_common(&folio->page)); } #else static int zpl_readpage(struct file *filp, struct page *pp) { return (zpl_readpage_common(pp)); } #endif static int zpl_readpage_filler(void *data, struct page *pp) { return (zpl_readpage_common(pp)); } /* * Populate a set of pages with data for the Linux page cache. 
This
 * function will only be called for read ahead and never for demand
 * paging. For simplicity, the code relies on read_cache_pages() to
 * correctly lock each page for IO and call zpl_readpage_filler().
 */
#ifdef HAVE_VFS_READPAGES
static int
zpl_readpages(struct file *filp, struct address_space *mapping,
    struct list_head *pages, unsigned nr_pages)
{
	return (read_cache_pages(mapping, pages, zpl_readpage_filler, NULL));
}
#else
/*
 * Read-ahead variant used when HAVE_VFS_READPAGES is not defined: pull
 * pages from the readahead control one at a time, fill each one, drop the
 * reference taken by readahead_page(), and stop at the first failure.
 */
static void
zpl_readahead(struct readahead_control *ractl)
{
	struct page *page;

	while ((page = readahead_page(ractl)) != NULL) {
		int ret;

		ret = zpl_readpage_filler(NULL, page);
		put_page(page);
		if (ret)
			break;
	}
}
#endif

/*
 * write_cache_pages() callback for a single page. data points at a
 * boolean_t telling zfs_putpage() whether this writeback is for a sync.
 * The zfs_putpage() result is deliberately discarded; 0 is always
 * returned.
 */
static int
zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data)
{
	boolean_t *for_sync = data;
	fstrans_cookie_t cookie;

	ASSERT(PageLocked(pp));
	ASSERT(!PageWriteback(pp));

	cookie = spl_fstrans_mark();
	(void) zfs_putpage(pp->mapping->host, pp, wbc, *for_sync);
	spl_fstrans_unmark(cookie);

	return (0);
}

#ifdef HAVE_WRITEPAGE_T_FOLIO
/* Folio-typed callback: forwards the folio's embedded page to zpl_putpage. */
static int
zpl_putfolio(struct folio *pp, struct writeback_control *wbc, void *data)
{
	(void) zpl_putpage(&pp->page, wbc, data);
	return (0);
}
#endif

/*
 * Call write_cache_pages() with whichever callback type the kernel
 * expects (folio or page) and return its result.
 */
static inline int
zpl_write_cache_pages(struct address_space *mapping,
    struct writeback_control *wbc, void *data)
{
	int result;

#ifdef HAVE_WRITEPAGE_T_FOLIO
	result = write_cache_pages(mapping, wbc, zpl_putfolio, data);
#else
	result = write_cache_pages(mapping, wbc, zpl_putpage, data);
#endif
	return (result);
}

/*
 * Write back the dirty pages of a mapping.
 *
 * The pages are first pushed with sync_mode forced to WB_SYNC_NONE. If the
 * effective mode was different (i.e. a sync was requested, or os_sync is
 * ZFS_SYNC_ALWAYS), the ZIL is committed once and write_cache_pages() is
 * run a second time in the requested mode. The result of the first pass
 * is overwritten by the later steps in that case.
 *
 * NOTE(review): when zpl_enter_verify_zp() fails in the sync branch the
 * function returns with wbc->sync_mode still set to WB_SYNC_NONE.
 */
static int
zpl_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
	znode_t		*zp = ITOZ(mapping->host);
	zfsvfs_t	*zfsvfs = ITOZSB(mapping->host);
	enum writeback_sync_modes sync_mode;
	int result;

	if ((result = zpl_enter(zfsvfs, FTAG)) != 0)
		return (result);
	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;
	zpl_exit(zfsvfs, FTAG);
	sync_mode = wbc->sync_mode;

	/*
	 * We don't want to run write_cache_pages() in SYNC mode here, because
	 * that would make putpage() wait for a single page to be committed to
	 * disk every single time, resulting in atrocious performance. Instead
	 * we run it once in non-SYNC mode so that the ZIL gets all the data,
	 * and then we commit it all in one go.
	 */
	boolean_t for_sync = (sync_mode == WB_SYNC_ALL);
	wbc->sync_mode = WB_SYNC_NONE;
	result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	if (sync_mode != wbc->sync_mode) {
		if ((result = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
			return (result);
		if (zfsvfs->z_log != NULL)
			zil_commit(zfsvfs->z_log, zp->z_id);
		zpl_exit(zfsvfs, FTAG);

		/*
		 * We need to call write_cache_pages() again (we can't just
		 * return after the commit) because the previous call in
		 * non-SYNC mode does not guarantee that we got all the dirty
		 * pages (see the implementation of write_cache_pages() for
		 * details). That being said, this is a no-op in most cases.
		 */
		wbc->sync_mode = sync_mode;
		result = zpl_write_cache_pages(mapping, wbc, &for_sync);
	}
	return (result);
}

/*
 * Write out dirty pages to the ARC, this function is only required to
 * support mmap(2). Mapped pages may be dirtied by memory operations
 * which never call .write(). These dirty pages are kept in sync with
 * the ARC buffers via this hook.
 *
 * When the objset's os_sync is ZFS_SYNC_ALWAYS the writeback is promoted
 * to WB_SYNC_ALL before the page is handed to zpl_putpage().
 */
static int
zpl_writepage(struct page *pp, struct writeback_control *wbc)
{
	if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS)
		wbc->sync_mode = WB_SYNC_ALL;

	boolean_t for_sync = (wbc->sync_mode == WB_SYNC_ALL);

	return (zpl_putpage(pp, wbc, &for_sync));
}

/*
 * The flag combination which matches the behavior of zfs_space() is
 * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE
 * flag was introduced in the 2.6.38 kernel.
 *
 * The original mode=0 (allocate space) behavior can be reasonably emulated
 * by checking if enough space exists and creating a sparse file, as real
 * persistent space reservation is not possible due to COW, snapshots, etc.
*/
/*
 * Summary of the visible flow: validate mode/offset/len, then, holding the
 * inode lock and a reference on the caller's credentials, either
 *   - free the byte range through zfs_space(F_FREESP) when a punch-hole
 *     (or, where available, zero-range) bit is set, or
 *   - run the reserve-percent free-space check and, unless KEEP_SIZE was
 *     requested, extend the file size through zfs_freesp().
 *
 * Returns 0, or an error code; the explicitly produced codes are
 * -EOPNOTSUPP, -EINVAL and -ENOSPC.
 */
static long
zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len)
{
	cred_t *cr = CRED();
	loff_t olen;
	fstrans_cookie_t cookie;
	int error = 0;

	/* Mode bits that select the "free the range" path below. */
	int test_mode = FALLOC_FL_PUNCH_HOLE;
#ifdef HAVE_FALLOC_FL_ZERO_RANGE
	test_mode |= FALLOC_FL_ZERO_RANGE;
#endif

	/* Any bit other than KEEP_SIZE or a test_mode bit is unsupported. */
	if ((mode & ~(FALLOC_FL_KEEP_SIZE | test_mode)) != 0)
		return (-EOPNOTSUPP);

	if (offset < 0 || len <= 0)
		return (-EINVAL);

	spl_inode_lock(ip);
	/* Size sampled once under the inode lock; used for all checks below. */
	olen = i_size_read(ip);

	crhold(cr);
	cookie = spl_fstrans_mark();
	if (mode & (test_mode)) {
		flock64_t bf;

		if (mode & FALLOC_FL_KEEP_SIZE) {
			/* Range starts past EOF: nothing to do, return 0. */
			if (offset > olen)
				goto out_unmark;

			/* Clamp the range so it does not extend past EOF. */
			if (offset + len > olen)
				len = olen - offset;
		}
		bf.l_type = F_WRLCK;
		bf.l_whence = SEEK_SET;
		bf.l_start = offset;
		bf.l_len = len;
		bf.l_pid = 0;

		/* Result is negated before being returned to the caller. */
		error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr);
	} else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) {
		unsigned int percent = zfs_fallocate_reserve_percent;
		struct kstatfs statfs;

		/* Legacy mode, disable fallocate compatibility. */
		if (percent == 0) {
			error = -EOPNOTSUPP;
			goto out_unmark;
		}

		/*
		 * Use zfs_statvfs() instead of dmu_objset_space() since it
		 * also checks project quota limits, which are relevant here.
		 *
		 * NOTE(review): the result is returned without negation;
		 * confirm zfs_statvfs() already yields a negative errno.
		 */
		error = zfs_statvfs(ip, &statfs);
		if (error)
			goto out_unmark;

		/*
		 * Shrink available space a bit to account for overhead/races.
		 * We know the product previously fit into availbytes from
		 * dmu_objset_space(), so the smaller product will also fit.
		 */
		if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) {
			error = -ENOSPC;
			goto out_unmark;
		}
		/*
		 * Only the file size is changed here, and only when the
		 * request reaches past the current EOF without KEEP_SIZE.
		 *
		 * NOTE(review): unlike the zfs_space() call above, this
		 * result is not negated; confirm zfs_freesp()'s sign
		 * convention.
		 */
		if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen)
			error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE);
	}
out_unmark:
	/* Common exit: undo the fstrans mark, inode lock and cred hold. */
	spl_fstrans_unmark(cookie);
	spl_inode_unlock(ip);

	crfree(cr);

	return (error);
}

/*
 * file_operations.fallocate entry point: resolves the inode behind filp and
 * defers to zpl_fallocate_common().
 */
static long
zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len)
{
	return zpl_fallocate_common(file_inode(filp),
	    mode, offset, len);
}

/*
 * FS_IOC_GETVERSION handler: copies the inode's i_generation to user space
 * as a uint32_t.  The return value is whatever copy_to_user() returns.
 */
static int
zpl_ioctl_getversion(struct file *filp, void __user *arg)
{
	uint32_t generation = file_inode(filp)->i_generation;

	return (copy_to_user(arg, &generation, sizeof (generation)));
}

#ifdef HAVE_FILE_FADVISE
/*
 * file_operations.fadvise entry point.
 *
 * Returns -ESPIPE for FIFOs, -EINVAL for a negative offset/len or an unknown
 * advice value, the zpl_enter_verify_zp() error if entering the filesystem
 * fails, and otherwise 0 or the result of generic_fadvise().
 */
static int
zpl_fadvise(struct file *filp, loff_t offset, loff_t len, int advice)
{
	struct inode *ip = file_inode(filp);
	znode_t *zp = ITOZ(ip);
	zfsvfs_t *zfsvfs = ITOZSB(ip);
	objset_t *os = zfsvfs->z_os;
	int error = 0;

	if (S_ISFIFO(ip->i_mode))
		return (-ESPIPE);

	if (offset < 0 || len < 0)
		return (-EINVAL);

	/* Paired with the zfs_exit() at the bottom of the function. */
	if ((error = zpl_enter_verify_zp(zfsvfs, zp, FTAG)) != 0)
		return (error);

	switch (advice) {
	case POSIX_FADV_SEQUENTIAL:
	case POSIX_FADV_WILLNEED:
#ifdef HAVE_GENERIC_FADVISE
		/* Only involve the generic path when data is cached. */
		if (zn_has_cached_data(zp, offset, offset + len - 1))
			error = generic_fadvise(filp, offset, len, advice);
#endif
		/*
		 * Pass on the caller's size directly, but note that
		 * dmu_prefetch_max will effectively cap it.  If there
		 * really is a larger sequential access pattern, perhaps
		 * dmu_zfetch will detect it.
		 */
		/* A zero length means "from offset to the end of the file". */
		if (len == 0)
			len = i_size_read(ip) - offset;

		dmu_prefetch(os, zp->z_id, 0, offset, len,
		    ZIO_PRIORITY_ASYNC_READ);
		break;
	case POSIX_FADV_NORMAL:
	case POSIX_FADV_RANDOM:
	case POSIX_FADV_DONTNEED:
	case POSIX_FADV_NOREUSE:
		/* ignored for now */
		break;
	default:
		error = -EINVAL;
		break;
	}

	zfs_exit(zfsvfs, FTAG);

	return (error);
}
#endif /* HAVE_FILE_FADVISE */

/* Linux flag masks extended with the ZFS project-inherit flag. */
#define	ZFS_FL_USER_VISIBLE	(FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL)
#define	ZFS_FL_USER_MODIFIABLE	(FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL)

/*
 * Translate the znode's z_pflags into the Linux FS_*_FL / ZFS_PROJINHERIT_FL
 * bit set, masked with ZFS_FL_USER_VISIBLE.
 */
static uint32_t
__zpl_ioctl_getflags(struct inode *ip)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	uint32_t ioctl_flags = 0;

	if (zfs_flags & ZFS_IMMUTABLE)
		ioctl_flags |= FS_IMMUTABLE_FL;

	if (zfs_flags & ZFS_APPENDONLY)
		ioctl_flags |= FS_APPEND_FL;

	if (zfs_flags & ZFS_NODUMP)
		ioctl_flags |= FS_NODUMP_FL;

	if (zfs_flags & ZFS_PROJINHERIT)
		ioctl_flags |= ZFS_PROJINHERIT_FL;

	return (ioctl_flags & ZFS_FL_USER_VISIBLE);
}

/*
 * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file
 * attributes common to both Linux and Solaris are mapped.
 *
 * The return value is the result of copy_to_user().
 */
static int
zpl_ioctl_getflags(struct file *filp, void __user *arg)
{
	uint32_t flags;
	int err;

	flags = __zpl_ioctl_getflags(file_inode(filp));
	err = copy_to_user(arg, &flags, sizeof (flags));

	return (err);
}

/*
 * fchange() is a helper macro to detect if we have been asked to change a
 * flag. This is ugly, but the requirement that we do this is a consequence of
 * how the Linux file attribute interface was designed. Another consequence is
 * that concurrent modification of files suffers from a TOCTOU race. Neither
 * are things we can fix without modifying the kernel-userland interface, which
 * is outside of our jurisdiction.
*/
/* True when bit b0 of f0 and bit b1 of f1 disagree (one set, one clear). */
#define	fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1)))

/*
 * Validate a Linux-style flag word against the inode's current z_pflags and
 * fill *xva with the attribute changes it implies.
 *
 * Returns 0 on success, -EOPNOTSUPP for bits outside the four handled flags,
 * -EACCES for bits outside ZFS_FL_USER_MODIFIABLE or when the caller fails
 * zpl_inode_owner_or_capable(), and -EPERM when the immutable/append-only
 * state would change without CAP_LINUX_IMMUTABLE.  *xva is only initialized
 * on the success path.
 */
static int
__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL |
	    ZFS_PROJINHERIT_FL))
		return (-EOPNOTSUPP);

	if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE)
		return (-EACCES);

	if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

/*
 * Request attribute xflag, and record its new value in xfield, only when the
 * requested state of iflag differs from the current state of zflag.
 */
#define	FLAG_CHANGE(iflag, zflag, xflag, xfield)	do {	\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (zflag))) ||	\
	    ((zfs_flags & (zflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));	\
		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
	}	\
} while (0)

	FLAG_CHANGE(FS_IMMUTABLE_FL, ZFS_IMMUTABLE, XAT_IMMUTABLE,
	    xoap->xoa_immutable);
	FLAG_CHANGE(FS_APPEND_FL, ZFS_APPENDONLY, XAT_APPENDONLY,
	    xoap->xoa_appendonly);
	FLAG_CHANGE(FS_NODUMP_FL, ZFS_NODUMP, XAT_NODUMP,
	    xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_PROJINHERIT_FL, ZFS_PROJINHERIT, XAT_PROJINHERIT,
	    xoap->xoa_projinherit);

#undef	FLAG_CHANGE

	return (0);
}

/*
 * FS_IOC_SETFLAGS handler: read a uint32_t flag word from user space, turn
 * it into an xvattr_t via __zpl_ioctl_setflags() and apply it with
 * zfs_setattr().  Returns -EFAULT if the copy-in fails, the validation error
 * if any, otherwise the negated zfs_setattr() result.
 */
static int
zpl_ioctl_setflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint32_t flags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&flags, arg, sizeof (flags)))
		return (-EFAULT);

	err = __zpl_ioctl_setflags(ip, flags, &xva);
	if (err)
		return (err);

	/* Hold the credentials and mark fstrans around the setattr call. */
	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

/*
 * ZFS_IOC_FSGETXATTR handler: report the mapped flag word and the znode's
 * project id in a zero-initialized zfsxattr_t.  The return value is the
 * result of copy_to_user().
 */
static int
zpl_ioctl_getxattr(struct file *filp, void __user *arg)
{
	zfsxattr_t fsx = { 0 };
	struct inode *ip = file_inode(filp);
	int err;

	fsx.fsx_xflags = __zpl_ioctl_getflags(ip);
	fsx.fsx_projid
	    = ITOZ(ip)->z_projid;
	err = copy_to_user(arg, &fsx, sizeof (fsx));

	return (err);
}

/*
 * ZFS_IOC_FSSETXATTR handler: like zpl_ioctl_setflags(), but the flag word
 * comes from fsx_xflags and the project id is always requested as well.
 * Returns -EFAULT on copy-in failure and -EINVAL for a project id rejected
 * by zpl_is_valid_projid().
 */
static int
zpl_ioctl_setxattr(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	zfsxattr_t fsx;
	cred_t *cr = CRED();
	xvattr_t xva;
	xoptattr_t *xoap;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&fsx, arg, sizeof (fsx)))
		return (-EFAULT);

	if (!zpl_is_valid_projid(fsx.fsx_projid))
		return (-EINVAL);

	err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva);
	if (err)
		return (err);

	/* Add the project id to the request built by the helper above. */
	xoap = xva_getxoptattr(&xva);
	XVA_SET_REQ(&xva, XAT_PROJID);
	xoap->xoa_projid = fsx.fsx_projid;

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

/*
 * Expose Additional File Level Attributes of ZFS.
 *
 * Copies z_pflags masked with ZFS_DOS_FL_USER_VISIBLE to user space as a
 * uint64_t; the return value is the result of copy_to_user().
 */
static int
zpl_ioctl_getdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags = ITOZ(ip)->z_pflags;
	dosflags &= ZFS_DOS_FL_USER_VISIBLE;
	int err = copy_to_user(arg, &dosflags, sizeof (dosflags));

	return (err);
}

/*
 * Counterpart of __zpl_ioctl_setflags() for the ZFS-native flag word: the
 * requested bits use the same encoding as z_pflags, so each flag is compared
 * against itself.
 *
 * Returns 0 on success, -EOPNOTSUPP for bits outside
 * ZFS_DOS_FL_USER_VISIBLE, -EPERM when immutable/append-only would change
 * without CAP_LINUX_IMMUTABLE, and -EACCES when the caller fails
 * zpl_inode_owner_or_capable().
 */
static int
__zpl_ioctl_setdosflags(struct inode *ip, uint64_t ioctl_flags, xvattr_t *xva)
{
	uint64_t zfs_flags = ITOZ(ip)->z_pflags;
	xoptattr_t *xoap;

	if (ioctl_flags & (~ZFS_DOS_FL_USER_VISIBLE))
		return (-EOPNOTSUPP);

	if ((fchange(ioctl_flags, zfs_flags, ZFS_IMMUTABLE, ZFS_IMMUTABLE) ||
	    fchange(ioctl_flags, zfs_flags, ZFS_APPENDONLY, ZFS_APPENDONLY)) &&
	    !capable(CAP_LINUX_IMMUTABLE))
		return (-EPERM);

	if (!zpl_inode_owner_or_capable(zfs_init_idmap, ip))
		return (-EACCES);

	xva_init(xva);
	xoap = xva_getxoptattr(xva);

/* Request xflag only when the requested and current state of iflag differ. */
#define	FLAG_CHANGE(iflag, xflag, xfield)	do {	\
	if (((ioctl_flags & (iflag)) && !(zfs_flags & (iflag))) ||	\
	    ((zfs_flags & (iflag)) && !(ioctl_flags & (iflag)))) {	\
		XVA_SET_REQ(xva, (xflag));	\
		(xfield) = ((ioctl_flags & (iflag)) != 0);	\
	}	\
} while (0)

	FLAG_CHANGE(ZFS_IMMUTABLE, XAT_IMMUTABLE, xoap->xoa_immutable);
	FLAG_CHANGE(ZFS_APPENDONLY, XAT_APPENDONLY, xoap->xoa_appendonly);
	FLAG_CHANGE(ZFS_NODUMP, XAT_NODUMP, xoap->xoa_nodump);
	FLAG_CHANGE(ZFS_READONLY, XAT_READONLY, xoap->xoa_readonly);
	FLAG_CHANGE(ZFS_HIDDEN, XAT_HIDDEN, xoap->xoa_hidden);
	FLAG_CHANGE(ZFS_SYSTEM, XAT_SYSTEM, xoap->xoa_system);
	FLAG_CHANGE(ZFS_ARCHIVE, XAT_ARCHIVE, xoap->xoa_archive);
	FLAG_CHANGE(ZFS_NOUNLINK, XAT_NOUNLINK, xoap->xoa_nounlink);
	FLAG_CHANGE(ZFS_REPARSE, XAT_REPARSE, xoap->xoa_reparse);
	FLAG_CHANGE(ZFS_OFFLINE, XAT_OFFLINE, xoap->xoa_offline);
	FLAG_CHANGE(ZFS_SPARSE, XAT_SPARSE, xoap->xoa_sparse);

#undef	FLAG_CHANGE

	return (0);
}

/*
 * Set Additional File Level Attributes of ZFS.
 *
 * Reads a uint64_t flag word from user space (-EFAULT on failure), validates
 * it with __zpl_ioctl_setdosflags() and applies it with zfs_setattr(),
 * returning that call's negated result.
 */
static int
zpl_ioctl_setdosflags(struct file *filp, void __user *arg)
{
	struct inode *ip = file_inode(filp);
	uint64_t dosflags;
	cred_t *cr = CRED();
	xvattr_t xva;
	int err;
	fstrans_cookie_t cookie;

	if (copy_from_user(&dosflags, arg, sizeof (dosflags)))
		return (-EFAULT);

	err = __zpl_ioctl_setdosflags(ip, dosflags, &xva);
	if (err)
		return (err);

	crhold(cr);
	cookie = spl_fstrans_mark();
	err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr, zfs_init_idmap);
	spl_fstrans_unmark(cookie);
	crfree(cr);

	return (err);
}

/*
 * unlocked_ioctl entry point shared by the file and directory operation
 * tables: dispatches on cmd to the handlers above and to the clone/dedupe
 * compatibility handlers; any other command yields -ENOTTY.
 */
static long
zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC_GETVERSION:
		return (zpl_ioctl_getversion(filp, (void *)arg));
	case FS_IOC_GETFLAGS:
		return (zpl_ioctl_getflags(filp, (void *)arg));
	case FS_IOC_SETFLAGS:
		return (zpl_ioctl_setflags(filp, (void *)arg));
	case ZFS_IOC_FSGETXATTR:
		return (zpl_ioctl_getxattr(filp, (void *)arg));
	case ZFS_IOC_FSSETXATTR:
		return (zpl_ioctl_setxattr(filp, (void *)arg));
	case ZFS_IOC_GETDOSFLAGS:
		return (zpl_ioctl_getdosflags(filp, (void *)arg));
	case ZFS_IOC_SETDOSFLAGS:
		return (zpl_ioctl_setdosflags(filp, (void *)arg));
+	case ZFS_IOC_COMPAT_FICLONE:
+		return (zpl_ioctl_ficlone(filp, (void *)arg));
+	case ZFS_IOC_COMPAT_FICLONERANGE:
+		return (zpl_ioctl_ficlonerange(filp, (void *)arg));
+	case ZFS_IOC_COMPAT_FIDEDUPERANGE:
+		return (zpl_ioctl_fideduperange(filp, (void *)arg));
	default:
		return
		    (-ENOTTY);
	}
}

#ifdef CONFIG_COMPAT
/*
 * compat_ioctl entry point: maps the three FS_IOC32_* commands onto their
 * native equivalents and forwards to zpl_ioctl() with the argument converted
 * by compat_ptr().  Every other command, including the ZFS_IOC_* ones
 * handled by zpl_ioctl(), returns -ENOTTY from here.
 */
static long
zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	switch (cmd) {
	case FS_IOC32_GETVERSION:
		cmd = FS_IOC_GETVERSION;
		break;
	case FS_IOC32_GETFLAGS:
		cmd = FS_IOC_GETFLAGS;
		break;
	case FS_IOC32_SETFLAGS:
		cmd = FS_IOC_SETFLAGS;
		break;
	default:
		return (-ENOTTY);
	}
	return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg)));
}
#endif /* CONFIG_COMPAT */

/*
 * Page-cache callbacks.  Which member names are populated depends on the
 * HAVE_VFS_* configure results.
 */
const struct address_space_operations zpl_address_space_operations = {
#ifdef HAVE_VFS_READPAGES
	.readpages	= zpl_readpages,
#else
	.readahead	= zpl_readahead,
#endif
#ifdef HAVE_VFS_READ_FOLIO
	.read_folio	= zpl_read_folio,
#else
	.readpage	= zpl_readpage,
#endif
	.writepage	= zpl_writepage,
	.writepages	= zpl_writepages,
	.direct_IO	= zpl_direct_IO,
#ifdef HAVE_VFS_SET_PAGE_DIRTY_NOBUFFERS
	.set_page_dirty = __set_page_dirty_nobuffers,
#endif
#ifdef HAVE_VFS_FILEMAP_DIRTY_FOLIO
	.dirty_folio	= filemap_dirty_folio,
#endif
};

/*
 * Operations for regular files.  The read/write members and the range
 * (copy/remap/clone/dedupe) members are selected by HAVE_* feature checks.
 */
const struct file_operations zpl_file_operations = {
	.open		= zpl_open,
	.release	= zpl_release,
	.llseek		= zpl_llseek,
#ifdef HAVE_VFS_RW_ITERATE
#ifdef HAVE_NEW_SYNC_READ
	.read		= new_sync_read,
	.write		= new_sync_write,
#endif
	.read_iter	= zpl_iter_read,
	.write_iter	= zpl_iter_write,
#ifdef HAVE_VFS_IOV_ITER
	.splice_read	= generic_file_splice_read,
	.splice_write	= iter_file_splice_write,
#endif
#else
	.read		= do_sync_read,
	.write		= do_sync_write,
	.aio_read	= zpl_aio_read,
	.aio_write	= zpl_aio_write,
#endif
	.mmap		= zpl_mmap,
	.fsync		= zpl_fsync,
#ifdef HAVE_FILE_AIO_FSYNC
	.aio_fsync	= zpl_aio_fsync,
#endif
	.fallocate	= zpl_fallocate,
#ifdef HAVE_VFS_COPY_FILE_RANGE
	.copy_file_range	= zpl_copy_file_range,
#endif
#ifdef HAVE_VFS_REMAP_FILE_RANGE
	.remap_file_range	= zpl_remap_file_range,
#endif
#ifdef HAVE_VFS_CLONE_FILE_RANGE
	.clone_file_range	= zpl_clone_file_range,
#endif
#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
	.dedupe_file_range	= zpl_dedupe_file_range,
#endif
#ifdef HAVE_FILE_FADVISE
	.fadvise	= zpl_fadvise,
#endif
	.unlocked_ioctl	= zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl	= zpl_compat_ioctl,
#endif
};

/* Operations for directories; shares the ioctl handlers with regular files. */
const struct file_operations zpl_dir_file_operations = {
	.llseek		= generic_file_llseek,
	.read		= generic_read_dir,
#if defined(HAVE_VFS_ITERATE_SHARED)
	.iterate_shared	= zpl_iterate,
#elif defined(HAVE_VFS_ITERATE)
	.iterate	= zpl_iterate,
#else
	.readdir	= zpl_readdir,
#endif
	.fsync		= zpl_fsync,
	.unlocked_ioctl = zpl_ioctl,
#ifdef CONFIG_COMPAT
	.compat_ioctl   = zpl_compat_ioctl,
#endif
};

/* CSTYLED */
module_param(zfs_fallocate_reserve_percent, uint, 0644);
MODULE_PARM_DESC(zfs_fallocate_reserve_percent,
	"Percentage of length to use for the available capacity check");
diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c
index db387a748130..aad502a8092e 100644
--- a/module/os/linux/zfs/zpl_file_range.c
+++ b/module/os/linux/zfs/zpl_file_range.c
@@ -1,183 +1,262 @@
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or https://opensource.org/licenses/CDDL-1.0.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2023, Klara Inc.
 */

#ifdef CONFIG_COMPAT
#include
#endif
#include
#include
#include
#include
#include

/*
 * Clone part of a file via block cloning.
 *
 * Note that we are not required to update file offsets; the kernel will take
 * care of that depending on how it was called.
*/
static ssize_t
__zpl_clone_file_range(struct file *src_file, loff_t src_off,
    struct file *dst_file, loff_t dst_off, size_t len)
{
	struct inode *src_i = file_inode(src_file);
	struct inode *dst_i = file_inode(dst_file);
	/* zfs_clone_range() takes offsets and length by pointer, as uint64_t. */
	uint64_t src_off_o = (uint64_t)src_off;
	uint64_t dst_off_o = (uint64_t)dst_off;
	uint64_t len_o = (uint64_t)len;
	cred_t *cr = CRED();
	fstrans_cookie_t cookie;
	int err;

	/* Bail out before taking any lock if block cloning is not enabled. */
	if (!spa_feature_is_enabled(
	    dmu_objset_spa(ITOZSB(dst_i)->z_os), SPA_FEATURE_BLOCK_CLONING))
		return (-EOPNOTSUPP);

	/*
	 * Source is locked shared, destination exclusively.  When both files
	 * are the same inode only the destination lock is taken.
	 */
	if (src_i != dst_i)
		spl_inode_lock_shared(src_i);
	spl_inode_lock(dst_i);

	crhold(cr);
	cookie = spl_fstrans_mark();

	err = -zfs_clone_range(ITOZ(src_i), &src_off_o, ITOZ(dst_i),
	    &dst_off_o, &len_o, cr);

	spl_fstrans_unmark(cookie);
	crfree(cr);

	/* Release in the reverse order of acquisition. */
	spl_inode_unlock(dst_i);
	if (src_i != dst_i)
		spl_inode_unlock_shared(src_i);

	if (err < 0)
		return (err);

	/*
	 * NOTE(review): len_o is passed by pointer above, so presumably it
	 * now holds the number of bytes actually cloned; confirm against
	 * zfs_clone_range().
	 */
	return ((ssize_t)len_o);
}

#ifdef HAVE_VFS_COPY_FILE_RANGE
/*
 * Entry point for copy_file_range(). Copy len bytes from src_off in src_file
 * to dst_off in dst_file. We are permitted to do this however we like, so we
 * try to just clone the blocks, and if we can't support it, fall back to the
 * kernel's generic byte copy function.
 *
 * Any non-zero flags value is rejected with -EINVAL.
 */
ssize_t
zpl_copy_file_range(struct file *src_file, loff_t src_off,
    struct file *dst_file, loff_t dst_off, size_t len, unsigned int flags)
{
	ssize_t ret;

	if (flags != 0)
		return (-EINVAL);

	/* Try to do it via zfs_clone_range() */
	ret =__zpl_clone_file_range(src_file, src_off,
	    dst_file, dst_off, len);

#ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE
	/*
	 * Since Linux 5.3 the filesystem driver is responsible for executing
	 * an appropriate fallback, and a generic fallback function is provided.
	 */
	/* Only "cannot clone" results fall back; other errors are returned. */
	if (ret == -EOPNOTSUPP || ret == -EXDEV)
		ret = generic_copy_file_range(src_file, src_off, dst_file,
		    dst_off, len, flags);
#endif /* HAVE_VFS_GENERIC_COPY_FILE_RANGE */

	return (ret);
}
#endif /* HAVE_VFS_COPY_FILE_RANGE */

#ifdef HAVE_VFS_REMAP_FILE_RANGE
/*
 * Entry point for FICLONE/FICLONERANGE/FIDEDUPERANGE.
*
 * FICLONE and FICLONERANGE are basically the same as copy_file_range(), except
 * that they must clone - they cannot fall back to copying. FICLONE is exactly
 * FICLONERANGE, for the entire file. We don't need to try to tell them apart;
 * the kernel will sort that out for us.
 *
 * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the
 * range in both files and if they're the same, arrange for them to be backed
 * by the same storage.
 *
 * Returns -EINVAL for flag bits other than REMAP_FILE_DEDUP and
 * REMAP_FILE_CAN_SHORTEN, -EOPNOTSUPP for a dedup request, and otherwise the
 * result of __zpl_clone_file_range().
 */
loff_t
zpl_remap_file_range(struct file *src_file, loff_t src_off,
    struct file *dst_file, loff_t dst_off, loff_t len, unsigned int flags)
{
	if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN))
		return (-EINVAL);

	/*
	 * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given
	 * range if we want. It's designed for filesystems that make data past
	 * EOF available, and don't want it to be visible in both files. ZFS
	 * doesn't do that, so we just turn the flag off.
	 */
	flags &= ~REMAP_FILE_CAN_SHORTEN;

	if (flags & REMAP_FILE_DEDUP)
		/* No support for dedup yet */
		return (-EOPNOTSUPP);

	/* Zero length means to clone everything to the end of the file */
	if (len == 0)
		len = i_size_read(file_inode(src_file)) - src_off;

	return (__zpl_clone_file_range(src_file, src_off,
	    dst_file, dst_off, len));
}
#endif /* HAVE_VFS_REMAP_FILE_RANGE */

#ifdef HAVE_VFS_CLONE_FILE_RANGE
/*
 * Entry point for FICLONE and FICLONERANGE, before Linux 4.20.
 *
 * The ssize_t result of __zpl_clone_file_range() is returned through this
 * function's int return type.
 */
int
zpl_clone_file_range(struct file *src_file, loff_t src_off,
    struct file *dst_file, loff_t dst_off, uint64_t len)
{
	/* Zero length means to clone everything to the end of the file */
	if (len == 0)
		len = i_size_read(file_inode(src_file)) - src_off;

	return (__zpl_clone_file_range(src_file, src_off,
	    dst_file, dst_off, len));
}
#endif /* HAVE_VFS_CLONE_FILE_RANGE */

#ifdef HAVE_VFS_DEDUPE_FILE_RANGE
/*
 * Entry point for FIDEDUPERANGE, before Linux 4.20.
*/ int zpl_dedupe_file_range(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, uint64_t len) { /* No support for dedup yet */ return (-EOPNOTSUPP); } #endif /* HAVE_VFS_DEDUPE_FILE_RANGE */ + +/* Entry point for FICLONE, before Linux 4.5. */ +long +zpl_ioctl_ficlone(struct file *dst_file, void *arg) +{ + unsigned long sfd = (unsigned long)arg; + + struct file *src_file = fget(sfd); + if (src_file == NULL) + return (-EBADF); + + if (dst_file->f_op != src_file->f_op) + return (-EXDEV); + + size_t len = i_size_read(file_inode(src_file)); + + ssize_t ret = + __zpl_clone_file_range(src_file, 0, dst_file, 0, len); + + fput(src_file); + + if (ret < 0) { + if (ret == -EOPNOTSUPP) + return (-ENOTTY); + return (ret); + } + + if (ret != len) + return (-EINVAL); + + return (0); +} + +/* Entry point for FICLONERANGE, before Linux 4.5. */ +long +zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) +{ + zfs_ioc_compat_file_clone_range_t fcr; + + if (copy_from_user(&fcr, arg, sizeof (fcr))) + return (-EFAULT); + + struct file *src_file = fget(fcr.fcr_src_fd); + if (src_file == NULL) + return (-EBADF); + + if (dst_file->f_op != src_file->f_op) + return (-EXDEV); + + size_t len = fcr.fcr_src_length; + if (len == 0) + len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; + + ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset, + dst_file, fcr.fcr_dest_offset, len); + + fput(src_file); + + if (ret < 0) { + if (ret == -EOPNOTSUPP) + return (-ENOTTY); + return (ret); + } + + if (ret != len) + return (-EINVAL); + + return (0); +} + +/* Entry point for FIDEDUPERANGE, before Linux 4.5. */ +long +zpl_ioctl_fideduperange(struct file *filp, void *arg) +{ + (void) arg; + + /* No support for dedup yet */ + return (-ENOTTY); +}