Index: head/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h =================================================================== --- head/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h (revision 367344) +++ head/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_vfsops_os.h (revision 367345) @@ -1,240 +1,275 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H #if __FreeBSD_version >= 1300109 #define TEARDOWN_INACTIVE_RMS #endif #include #include #include #include #include #include #ifdef TEARDOWN_INACTIVE_RMS #include #endif #include #ifdef __cplusplus extern "C" { #endif +#define zfs_teardown_lock_t rrmlock_t + #ifdef TEARDOWN_INACTIVE_RMS -typedef struct rmslock zfs_teardown_lock_t; +typedef struct rmslock zfs_teardown_inactive_lock_t; #else -#define zfs_teardown_lock_t krwlock_t +#define zfs_teardown_inactive_lock_t krwlock_t #endif typedef struct zfsvfs zfsvfs_t; struct znode; struct zfsvfs { vfs_t *z_vfs; /* generic fs struct */ zfsvfs_t *z_parent; /* parent fs */ objset_t *z_os; /* objset reference */ uint64_t z_flags; /* super_block flags */ uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ boolean_t z_fuid_dirty; /* need to sync fuid table ? */ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_type; /* type of acl usable on this fs */ uint_t z_acl_mode; /* acl chmod/mode behavior */ uint_t z_acl_inherit; /* acl inheritance behavior */ zfs_case_t z_case; /* case-sense */ boolean_t z_utf8; /* utf8-only */ int z_norm; /* normalization flags */ boolean_t z_atime; /* enable atimes mount option */ boolean_t z_unmounted; /* unmounted */ - rrmlock_t z_teardown_lock; - zfs_teardown_lock_t z_teardown_inactive_lock; + zfs_teardown_lock_t z_teardown_lock; + zfs_teardown_inactive_lock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all vnodes in the fs */ uint64_t z_nr_znodes; /* number of znodes in the fs */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ boolean_t z_replay; /* set during ZIL replay */ boolean_t z_use_sa; /* version allow system attributes */ boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ boolean_t z_use_namecache; /* make use of FreeBSD name cache */ uint8_t z_xattr; /* xattr type in use */ uint64_t z_version; /* ZPL version */ uint64_t z_shares_dir; /* hidden shares dir */ dataset_kstats_t z_kstat; /* fs kstats */ kmutex_t z_lock; uint64_t z_userquota_obj; uint64_t z_groupquota_obj; uint64_t z_userobjquota_obj; uint64_t z_groupobjquota_obj; uint64_t z_projectquota_obj; uint64_t z_projectobjquota_obj; uint64_t z_replay_eof; /* New end of file - replay only */ sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ struct task z_unlinked_drain_task; }; + +#define ZFS_TEARDOWN_INIT(zfsvfs) \ + rrm_init(&(zfsvfs)->z_teardown_lock, B_FALSE) + +#define ZFS_TEARDOWN_DESTROY(zfsvfs) \ + rrm_destroy(&(zfsvfs)->z_teardown_lock) + +#define ZFS_TEARDOWN_TRY_ENTER_READ(zfsvfs) \ + rw_tryenter(&(zfsvfs)->z_teardown_lock, RW_READER) + +#define ZFS_TEARDOWN_ENTER_READ(zfsvfs, tag) \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, tag); + +#define ZFS_TEARDOWN_EXIT_READ(zfsvfs, tag) \ + rrm_exit(&(zfsvfs)->z_teardown_lock, tag) + +#define ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, tag) \ + rrm_enter(&(zfsvfs)->z_teardown_lock, RW_WRITER, tag) + +#define ZFS_TEARDOWN_EXIT_WRITE(zfsvfs) \ + rrm_exit(&(zfsvfs)->z_teardown_lock, tag) + +#define ZFS_TEARDOWN_EXIT(zfsvfs, tag) \ + rrm_exit(&(zfsvfs)->z_teardown_lock, tag) + +#define ZFS_TEARDOWN_READ_HELD(zfsvfs) \ + RRM_READ_HELD(&(zfsvfs)->z_teardown_lock) + +#define ZFS_TEARDOWN_WRITE_HELD(zfsvfs) \ + RRM_WRITE_HELD(&(zfsvfs)->z_teardown_lock) + +#define ZFS_TEARDOWN_HELD(zfsvfs) \ + RRM_LOCK_HELD(&(zfsvfs)->z_teardown_lock) #ifdef TEARDOWN_INACTIVE_RMS #define ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs) \ rms_init(&(zfsvfs)->z_teardown_inactive_lock, "zfs teardown inactive") #define ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs) \ rms_destroy(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs) \ rms_try_rlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs) \ rms_rlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs) \ rms_runlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs) \ rms_wlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs) \ rms_wunlock(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs) \ rms_wowned(&(zfsvfs)->z_teardown_inactive_lock) #else #define ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs) \ rw_init(&(zfsvfs)->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL) #define ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs) \ rw_destroy(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs) \ rw_tryenter(&(zfsvfs)->z_teardown_inactive_lock, RW_READER) #define ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs) \ rw_enter(&(zfsvfs)->z_teardown_inactive_lock, RW_READER) #define ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs) \ rw_exit(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs) \ rw_enter(&(zfsvfs)->z_teardown_inactive_lock, RW_WRITER) #define ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs) \ rw_exit(&(zfsvfs)->z_teardown_inactive_lock) #define ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs) \ RW_WRITE_HELD(&(zfsvfs)->z_teardown_inactive_lock) #endif #define ZSB_XATTR 0x0001 /* Enable user xattrs */ /* * Normal filesystems (those not under .zfs/snapshot) have a total * file ID size limited to 12 bytes (including the length field) due to * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical * reasons, this same limit is being imposed by the Solaris NFSv3 implementation * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It * is not possible to expand beyond 12 bytes without abandoning support * of NFSv2. * * For normal filesystems, we partition up the available space as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * * We reserve only 48 bits for the object number, as this is the limit * currently defined and imposed by the DMU. */ typedef struct zfid_short { uint16_t zf_len; uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ } zfid_short_t; /* * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes * (including the length field). This makes files under .zfs/snapshot * accessible by NFSv3 and NFSv4, but not NFSv2. * * For files under .zfs/snapshot, we partition up the available space * as follows: * 2 bytes fid length (required) * 6 bytes object number (48 bits) * 4 bytes generation number (32 bits) * 6 bytes objset id (48 bits) * 4 bytes[**] currently just zero (32 bits) * * We reserve only 48 bits for the object number and objset id, as these are * the limits currently defined and imposed by the DMU. * * [*] 20 bytes on FreeBSD to fit into the size of struct fid. * [**] 2 bytes on FreeBSD for the above reason. */ typedef struct zfid_long { zfid_short_t z_fid; uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ } zfid_long_t; #define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern uint_t zfs_fsyncer_key; extern int zfs_super_owner; extern void zfs_init(void); extern void zfs_fini(void); extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); extern int zfsvfs_create(const char *name, boolean_t readonly, zfsvfs_t **zfvp); extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); extern void zfsvfs_free(zfsvfs_t *zfsvfs); extern int zfs_check_global_label(const char *dsname, const char *hexsl); extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); extern int zfs_get_temporary_prop(struct dsl_dataset *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint); extern int zfs_busy(void); #ifdef __cplusplus } #endif #endif /* _SYS_FS_ZFS_VFSOPS_H */ Index: head/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h =================================================================== --- head/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h (revision 367344) +++ head/sys/contrib/openzfs/include/os/freebsd/zfs/sys/zfs_znode_impl.h (revision 367345) @@ -1,183 +1,183 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #ifndef _FREEBSD_ZFS_SYS_ZNODE_IMPL_H #define _FREEBSD_ZFS_SYS_ZNODE_IMPL_H #include #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif /* * Directory entry locks control access to directory entries. * They are used to protect creates, deletes, and renames. * Each directory znode has a mutex and a list of locked names. */ #define ZNODE_OS_FIELDS \ struct zfsvfs *z_zfsvfs; \ vnode_t *z_vnode; \ uint64_t z_uid; \ uint64_t z_gid; \ uint64_t z_gen; \ uint64_t z_atime[2]; \ uint64_t z_links; #define ZFS_LINK_MAX UINT64_MAX /* * ZFS minor numbers can refer to either a control device instance or * a zvol. Depending on the value of zss_type, zss_data points to either * a zvol_state_t or a zfs_onexit_t. */ enum zfs_soft_state_type { ZSST_ZVOL, ZSST_CTLDEV }; typedef struct zfs_soft_state { enum zfs_soft_state_type zss_type; void *zss_data; } zfs_soft_state_t; extern minor_t zfsdev_minor_alloc(void); /* * Range locking rules * -------------------- * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole * file range needs to be locked as RL_WRITER. Only then can the pages be * freed etc and zp_size reset. zp_size must be set within range lock. * 2. For writes and punching holes (zfs_write & zfs_space) just the range * being written or freed needs to be locked as RL_WRITER. * Multiple writes at the end of the file must coordinate zp_size updates * to ensure data isn't lost. A compare and swap loop is currently used * to ensure the file size is at least the offset last written. * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being * read needs to be locked as RL_READER. A check against zp_size can then * be made for reading beyond end of file. */ /* * Convert between znode pointers and vnode pointers */ #define ZTOV(ZP) ((ZP)->z_vnode) #define ZTOI(ZP) ((ZP)->z_vnode) #define VTOZ(VP) ((struct znode *)(VP)->v_data) #define VTOZ_SMR(VP) ((znode_t *)vn_load_v_data_smr(VP)) #define ITOZ(VP) ((struct znode *)(VP)->v_data) #define zhold(zp) vhold(ZTOV((zp))) #define zrele(zp) vrele(ZTOV((zp))) #define ZTOZSB(zp) ((zp)->z_zfsvfs) #define ITOZSB(vp) (VTOZ(vp)->z_zfsvfs) #define ZTOTYPE(zp) (ZTOV(zp)->v_type) #define ZTOGID(zp) ((zp)->z_gid) #define ZTOUID(zp) ((zp)->z_uid) #define ZTONLNK(zp) ((zp)->z_links) #define Z_ISBLK(type) ((type) == VBLK) #define Z_ISCHR(type) ((type) == VCHR) #define Z_ISLNK(type) ((type) == VLNK) /* Called on entry to each ZFS vnode and vfs operation */ #define ZFS_ENTER(zfsvfs) \ { \ - rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + ZFS_TEARDOWN_ENTER_READ((zfsvfs), FTAG); \ if (__predict_false((zfsvfs)->z_unmounted)) { \ - ZFS_EXIT(zfsvfs); \ + ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG); \ return (EIO); \ } \ } /* Must be called before exiting the vop */ -#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG) +#define ZFS_EXIT(zfsvfs) ZFS_TEARDOWN_EXIT_READ(zfsvfs, FTAG) /* Verifies the znode is valid */ #define ZFS_VERIFY_ZP(zp) \ if (__predict_false((zp)->z_sa_hdl == NULL)) { \ ZFS_EXIT((zp)->z_zfsvfs); \ return (EIO); \ } \ /* * Macros for dealing with dmu_buf_hold */ #define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) #define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) #define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) #define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) #define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) /* Encode ZFS stored time values from a struct timespec */ #define ZFS_TIME_ENCODE(tp, stmp) \ { \ (stmp)[0] = (uint64_t)(tp)->tv_sec; \ (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ } /* Decode ZFS stored time values to a struct timespec */ #define ZFS_TIME_DECODE(tp, stmp) \ { \ (tp)->tv_sec = (time_t)(stmp)[0]; \ (tp)->tv_nsec = (long)(stmp)[1]; \ } #define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ zfs_tstamp_update_setup_ext(zp, ACCESSED, NULL, NULL, B_FALSE); extern void zfs_tstamp_update_setup_ext(struct znode *, uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx); extern void zfs_znode_free(struct znode *); extern zil_get_data_t zfs_get_data; extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; extern int zfsfstype; extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp, char *buf); #ifdef __cplusplus } #endif #endif /* _FREEBSD_SYS_FS_ZFS_ZNODE_H */ Index: head/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h =================================================================== --- head/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h (revision 367344) +++ head/sys/contrib/openzfs/include/os/linux/zfs/sys/zfs_znode_impl.h (revision 367345) @@ -1,168 +1,168 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ #ifndef _SYS_ZFS_ZNODE_IMPL_H #define _SYS_ZFS_ZNODE_IMPL_H #ifndef _KERNEL #error "no user serviceable parts within" #endif #include #include #include #include #include #include #include #include #include #include #ifdef __cplusplus extern "C" { #endif #define ZNODE_OS_FIELDS \ struct inode z_inode; /* * Convert between znode pointers and inode pointers */ #define ZTOI(znode) (&((znode)->z_inode)) #define ITOZ(inode) (container_of((inode), znode_t, z_inode)) #define ZTOZSB(znode) ((zfsvfs_t *)(ZTOI(znode)->i_sb->s_fs_info)) #define ITOZSB(inode) ((zfsvfs_t *)((inode)->i_sb->s_fs_info)) #define ZTOTYPE(zp) (ZTOI(zp)->i_mode) #define ZTOGID(zp) (ZTOI(zp)->i_gid) #define ZTOUID(zp) (ZTOI(zp)->i_uid) #define ZTONLNK(zp) (ZTOI(zp)->i_nlink) #define Z_ISBLK(type) S_ISBLK(type) #define Z_ISCHR(type) S_ISCHR(type) #define Z_ISLNK(type) S_ISLNK(type) #define Z_ISDEV(type) (S_ISCHR(type) || S_ISBLK(type) || S_ISFIFO(type)) #define zhold(zp) igrab(ZTOI((zp))) #define zrele(zp) iput(ZTOI((zp))) /* Called on entry to each ZFS inode and vfs operation. */ #define ZFS_ENTER_ERROR(zfsvfs, error) \ do { \ - rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + ZFS_TEARDOWN_ENTER_READ(zfsvfs, FTAG); \ if (unlikely((zfsvfs)->z_unmounted)) { \ - ZFS_EXIT(zfsvfs); \ + ZFS_EXIT_READ(zfsvfs, FTAG); \ return (error); \ } \ } while (0) #define ZFS_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) #define ZPL_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, -EIO) /* Must be called before exiting the operation. */ #define ZFS_EXIT(zfsvfs) \ do { \ zfs_exit_fs(zfsvfs); \ - rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG); \ + ZFS_EXIT_READ(zfsvfs, FTAG); \ } while (0) #define ZPL_EXIT(zfsvfs) ZFS_EXIT(zfsvfs) /* Verifies the znode is valid. */ #define ZFS_VERIFY_ZP_ERROR(zp, error) \ do { \ if (unlikely((zp)->z_sa_hdl == NULL)) { \ ZFS_EXIT(ZTOZSB(zp)); \ return (error); \ } \ } while (0) #define ZFS_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) #define ZPL_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, -EIO) /* * Macros for dealing with dmu_buf_hold */ #define ZFS_OBJ_MTX_SZ 64 #define ZFS_OBJ_MTX_MAX (1024 * 1024) #define ZFS_OBJ_HASH(zfsvfs, obj) ((obj) & ((zfsvfs->z_hold_size) - 1)) extern unsigned int zfs_object_mutex_size; /* * Encode ZFS stored time values from a struct timespec / struct timespec64. */ #define ZFS_TIME_ENCODE(tp, stmp) \ do { \ (stmp)[0] = (uint64_t)(tp)->tv_sec; \ (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ } while (0) #if defined(HAVE_INODE_TIMESPEC64_TIMES) /* * Decode ZFS stored time values to a struct timespec64 * 4.18 and newer kernels. */ #define ZFS_TIME_DECODE(tp, stmp) \ do { \ (tp)->tv_sec = (time64_t)(stmp)[0]; \ (tp)->tv_nsec = (long)(stmp)[1]; \ } while (0) #else /* * Decode ZFS stored time values to a struct timespec * 4.17 and older kernels. */ #define ZFS_TIME_DECODE(tp, stmp) \ do { \ (tp)->tv_sec = (time_t)(stmp)[0]; \ (tp)->tv_nsec = (long)(stmp)[1]; \ } while (0) #endif /* HAVE_INODE_TIMESPEC64_TIMES */ struct znode; extern int zfs_sync(struct super_block *, int, cred_t *); extern int zfs_inode_alloc(struct super_block *, struct inode **ip); extern void zfs_inode_destroy(struct inode *); extern void zfs_inode_update(struct znode *); extern void zfs_mark_inode_dirty(struct inode *); extern boolean_t zfs_relatime_need_update(const struct inode *); #if defined(HAVE_UIO_RW) extern caddr_t zfs_map_page(page_t *, enum seg_rw); extern void zfs_unmap_page(page_t *, caddr_t); #endif /* HAVE_UIO_RW */ extern zil_get_data_t zfs_get_data; extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; extern int zfsfstype; #ifdef __cplusplus } #endif #endif /* _SYS_ZFS_ZNODE_IMPL_H */ Index: head/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c =================================================================== --- head/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c (revision 367344) +++ head/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_dir.c (revision 367345) @@ -1,968 +1,968 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2013, 2016 by Delphix. All rights reserved. * Copyright 2017 Nexenta Systems, Inc. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups * of names after deciding which is the appropriate lookup interface. */ static int zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, matchtype_t mt, uint64_t *zoid) { int error; if (zfsvfs->z_norm) { /* * In the non-mixed case we only expect there would ever * be one match, but we need to use the normalizing lookup. */ error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid, mt, NULL, 0, NULL); } else { error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); } *zoid = ZFS_DIRENT_OBJ(*zoid); return (error); } /* * Look up a directory entry under a locked vnode. * dvp being locked gives us a guarantee that there are no concurrent * modification of the directory and, thus, if a node can be found in * the directory, then it must not be unlinked. * * Input arguments: * dzp - znode for directory * name - name of entry to lock * flag - ZNEW: if the entry already exists, fail with EEXIST. * ZEXISTS: if the entry does not exist, fail with ENOENT. * ZXATTR: we want dzp's xattr directory * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) * * Return value: 0 on success or errno on failure. * * NOTE: Always checks for, and rejects, '.' and '..'. */ int zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; matchtype_t mt = 0; uint64_t zoid; int error = 0; if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); *zpp = NULL; /* * Verify that we are not trying to lock '.', '..', or '.zfs' */ if (name[0] == '.' && (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) || (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))) return (SET_ERROR(EEXIST)); /* * Case sensitivity and normalization preferences are set when * the file system is created. These are stored in the * zfsvfs->z_case and zfsvfs->z_norm fields. These choices * affect how we perform zap lookups. * * When matching we may need to normalize & change case according to * FS settings. * * Note that a normalized match is necessary for a case insensitive * filesystem when the lookup request is not exact because normalization * can fold case independent of normalizing code point sequences. * * See the table above zfs_dropname(). */ if (zfsvfs->z_norm != 0) { mt = MT_NORMALIZE; /* * Determine if the match needs to honor the case specified in * lookup, and if so keep track of that so that during * normalization we don't fold case. */ if (zfsvfs->z_case == ZFS_CASE_MIXED) { mt |= MT_MATCH_CASE; } } /* * Only look in or update the DNLC if we are looking for the * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name. * * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE * because in that case MT_EXACT and MT_FIRST should produce exactly * the same result. */ if (dzp->z_unlinked && !(flag & ZXATTR)) return (ENOENT); if (flag & ZXATTR) { error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, sizeof (zoid)); if (error == 0) error = (zoid == 0 ? ENOENT : 0); } else { error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid); } if (error) { if (error != ENOENT || (flag & ZEXISTS)) { return (error); } } else { if (flag & ZNEW) { return (SET_ERROR(EEXIST)); } error = zfs_zget(zfsvfs, zoid, &zp); if (error) return (error); ASSERT(!zp->z_unlinked); *zpp = zp; } return (0); } static int zfs_dd_lookup(znode_t *dzp, znode_t **zpp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; znode_t *zp; uint64_t parent; int error; #ifdef ZFS_DEBUG if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); - ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(ZFS_TEARDOWN_READ_HELD(zfsvfs)); #endif if (dzp->z_unlinked) return (ENOENT); if ((error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) return (error); error = zfs_zget(zfsvfs, parent, &zp); if (error == 0) *zpp = zp; return (error); } int zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) { zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs; znode_t *zp = NULL; int error = 0; #ifdef ZFS_DEBUG if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); - ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(ZFS_TEARDOWN_READ_HELD(zfsvfs)); #endif if (dzp->z_unlinked) return (SET_ERROR(ENOENT)); if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { *zpp = dzp; } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { error = zfs_dd_lookup(dzp, &zp); if (error == 0) *zpp = zp; } else { error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); if (error == 0) { dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ *zpp = zp; } } return (error); } /* * unlinked Set (formerly known as the "delete queue") Error Handling * * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we * don't specify the name of the entry that we will be manipulating. We * also fib and say that we won't be adding any new entries to the * unlinked set, even though we might (this is to lower the minimum file * size that can be deleted in a full filesystem). So on the small * chance that the nlink list is using a fat zap (ie. has more than * 2000 entries), we *may* not pre-read a block that's needed. * Therefore it is remotely possible for some of the assertions * regarding the unlinked set below to fail due to i/o error. On a * nondebug system, this will result in the space being leaked. */ void zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp->z_unlinked); ASSERT(zp->z_links == 0); VERIFY3U(0, ==, zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1); } /* * Clean up any znodes that had no links when we either crashed or * (force) umounted the file system. */ void zfs_unlinked_drain(zfsvfs_t *zfsvfs) { zap_cursor_t zc; zap_attribute_t zap; dmu_object_info_t doi; znode_t *zp; dmu_tx_t *tx; int error; /* * Iterate over the contents of the unlinked set. */ for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); zap_cursor_retrieve(&zc, &zap) == 0; zap_cursor_advance(&zc)) { /* * See what kind of object we have in list */ error = dmu_object_info(zfsvfs->z_os, zap.za_first_integer, &doi); if (error != 0) continue; ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); /* * We need to re-mark these list entries for deletion, * so we pull them back into core and set zp->z_unlinked. */ error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); /* * We may pick up znodes that are already marked for deletion. * This could happen during the purge of an extended attribute * directory. All we need to do is skip over them, since they * are already in the system marked z_unlinked. */ if (error != 0) continue; vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); /* * Due to changes in zfs_rmnode we need to make sure the * link count is set to zero here. */ if (zp->z_links != 0) { tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); error = dmu_tx_assign(tx, TXG_WAIT); if (error != 0) { dmu_tx_abort(tx); vput(ZTOV(zp)); continue; } zp->z_links = 0; VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), &zp->z_links, sizeof (zp->z_links), tx)); dmu_tx_commit(tx); } zp->z_unlinked = B_TRUE; vput(ZTOV(zp)); } zap_cursor_fini(&zc); } /* * Delete the entire contents of a directory. Return a count * of the number of entries that could not be deleted. If we encounter * an error, return a count of at least one so that the directory stays * in the unlinked set. * * NOTE: this function assumes that the directory is inactive, * so there is no need to lock its entries before deletion. * Also, it assumes the directory contents is *only* regular * files. */ static int zfs_purgedir(znode_t *dzp) { zap_cursor_t zc; zap_attribute_t zap; znode_t *xzp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int skipped = 0; int error; for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); (error = zap_cursor_retrieve(&zc, &zap)) == 0; zap_cursor_advance(&zc)) { error = zfs_zget(zfsvfs, ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); if (error) { skipped += 1; continue; } vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); ASSERT((ZTOV(xzp)->v_type == VREG) || (ZTOV(xzp)->v_type == VLNK)); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* Is this really needed ? */ zfs_sa_upgrade_txholds(tx, xzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); vput(ZTOV(xzp)); skipped += 1; continue; } error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); if (error) skipped += 1; dmu_tx_commit(tx); vput(ZTOV(xzp)); } zap_cursor_fini(&zc); if (error != ENOENT) skipped += 1; return (skipped); } extern taskq_t *zfsvfs_taskq; void zfs_rmnode(znode_t *zp) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; uint64_t acl_obj; uint64_t xattr_obj; uint64_t count; int error; ASSERT(zp->z_links == 0); if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); /* * If this is an attribute directory, purge its contents. */ if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && (zp->z_pflags & ZFS_XATTR)) { if (zfs_purgedir(zp) != 0) { /* * Not enough space to delete some xattrs. * Leave it in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } } else { /* * Free up all the data in the file. We don't do this for * XATTR directories because we need truncate and remove to be * in the same tx, like in zfs_znode_delete(). Otherwise, if * we crash here we'll end up with an inconsistent truncated * zap object in the delete queue. Note a truncated file is * harmless since it only contains user data. */ error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); if (error) { /* * Not enough space or we were interrupted by unmount. * Leave the file in the unlinked set. */ zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } } /* * If the file has extended attributes, we're going to unlink * the xattr dir. */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error) xattr_obj = 0; acl_obj = zfs_external_acl(zp); /* * Set up the final transaction. */ tx = dmu_tx_create(os); dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); if (xattr_obj) dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); if (acl_obj) dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { /* * Not enough space to delete the file. Leave it in the * unlinked set, leaking it until the fs is remounted (at * which point we'll call zfs_unlinked_drain() to process it). */ dmu_tx_abort(tx); zfs_znode_dmu_fini(zp); zfs_znode_free(zp); return; } /* * FreeBSD's implementation of zfs_zget requires a vnode to back it. * This means that we could end up calling into getnewvnode while * calling zfs_rmnode as a result of a prior call to getnewvnode * trying to clear vnodes out of the cache. If this repeats we can * recurse enough that we overflow our stack. To avoid this, we * avoid calling zfs_zget on the xattr znode and instead simply add * it to the unlinked set and schedule a call to zfs_unlinked_drain. */ if (xattr_obj) { /* Add extended attribute directory to the unlinked set. */ VERIFY3U(0, ==, zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx)); } mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); /* Remove this znode from the unlinked set */ VERIFY3U(0, ==, zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); } mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); zfs_znode_delete(zp, tx); dmu_tx_commit(tx); if (xattr_obj) { /* * We're using the FreeBSD taskqueue API here instead of * the Solaris taskq API since the FreeBSD API allows for a * task to be enqueued multiple times but executed once. */ taskqueue_enqueue(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task); } } static uint64_t zfs_dirent(znode_t *zp, uint64_t mode) { uint64_t de = zp->z_id; if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) de |= IFTODT(mode) << 60; return (de); } /* * Link zp into dzp. Can only fail if zp has been unlinked. */ int zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; vnode_t *vp = ZTOV(zp); uint64_t value; int zp_is_dir = (vp->v_type == VDIR); sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; int count = 0; int error; if (zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); } if (zp_is_dir) { if (dzp->z_links >= ZFS_LINK_MAX) return (SET_ERROR(EMLINK)); } if (!(flag & ZRENAMING)) { if (zp->z_unlinked) { /* no new links to unlinked zp */ ASSERT(!(flag & (ZNEW | ZEXISTS))); return (SET_ERROR(ENOENT)); } if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) { return (SET_ERROR(EMLINK)); } zp->z_links++; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); } else { ASSERT(zp->z_unlinked == 0); } value = zfs_dirent(zp, zp->z_mode); error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, 8, 1, &value, tx); /* * zap_add could fail to add the entry if it exceeds the capacity of the * leaf-block and zap_leaf_split() failed to help. * The caller of this routine is responsible for failing the transaction * which will rollback the SA updates done above. */ if (error != 0) { if (!(flag & ZRENAMING) && !(flag & ZNEW)) zp->z_links--; return (error); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &dzp->z_id, sizeof (dzp->z_id)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (!(flag & ZNEW)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); } error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(error); dzp->z_size++; dzp->z_links += zp_is_dir; count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &dzp->z_links, sizeof (dzp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT0(error); return (0); } /* * The match type in the code for this function should conform to: * * ------------------------------------------------------------------------ * fs type | z_norm | lookup type | match type * ---------|-------------|-------------|---------------------------------- * CS !norm | 0 | 0 | 0 (exact) * CS norm | formX | 0 | MT_NORMALIZE * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE * CM !norm | upper | ZCILOOK | MT_NORMALIZE * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE * * Abbreviations: * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) * formX = unicode normalization form set on fs creation */ static int zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag) { int error; if (zp->z_zfsvfs->z_norm) { matchtype_t mt = MT_NORMALIZE; if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) { mt |= MT_MATCH_CASE; } error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id, name, mt, tx); } else { error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx); } return (error); } /* * Unlink zp from dzp, and mark zp for deletion if this was the last link. * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. * If it's non-NULL, we use it to indicate whether the znode needs deletion, * and it's the caller's job to do it. */ int zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, int flag, boolean_t *unlinkedp) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; vnode_t *vp = ZTOV(zp); int zp_is_dir = (vp->v_type == VDIR); boolean_t unlinked = B_FALSE; sa_bulk_attr_t bulk[5]; uint64_t mtime[2], ctime[2]; int count = 0; int error; if (zfsvfs->z_replay == B_FALSE) { ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); } if (!(flag & ZRENAMING)) { if (zp_is_dir && !zfs_dirempty(zp)) return (SET_ERROR(ENOTEMPTY)); /* * If we get here, we are going to try to remove the object. * First try removing the name from the directory; if that * fails, return the error. */ error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) { return (error); } if (zp->z_links <= zp_is_dir) { zfs_panic_recover("zfs: link count on vnode %p is %u, " "should be at least %u", zp->z_vnode, (int)zp->z_links, zp_is_dir + 1); zp->z_links = zp_is_dir + 1; } if (--zp->z_links == zp_is_dir) { zp->z_unlinked = B_TRUE; zp->z_links = 0; unlinked = B_TRUE; } else { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); } SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &zp->z_links, sizeof (zp->z_links)); error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); count = 0; ASSERT0(error); } else { ASSERT(zp->z_unlinked == 0); error = zfs_dropname(dzp, name, zp, tx, flag); if (error != 0) return (error); } dzp->z_size--; /* one dirent removed */ dzp->z_links -= zp_is_dir; /* ".." link from zp */ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &dzp->z_links, sizeof (dzp->z_links)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &dzp->z_size, sizeof (dzp->z_size)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, sizeof (ctime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); ASSERT0(error); if (unlinkedp != NULL) *unlinkedp = unlinked; else if (unlinked) zfs_unlinked_add(zp, tx); return (0); } /* * Indicate whether the directory is empty. */ boolean_t zfs_dirempty(znode_t *dzp) { return (dzp->z_size == 2); } int zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; dmu_tx_t *tx; int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t parent __unused; *xvpp = NULL; if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, &acl_ids)) != 0) return (error); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) { zfs_acl_ids_free(&acl_ids); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve_(); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); return (error); } zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); #ifdef ZFS_DEBUG error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); ASSERT(error == 0 && parent == zp->z_id); #endif VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, sizeof (xzp->z_id), tx)); (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, xzp, "", NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); *xvpp = xzp; return (0); } /* * Return a znode for the extended attribute directory for zp. * ** If the directory does not already exist, it is created ** * * IN: zp - znode to obtain attribute directory from * cr - credentials of caller * flags - flags from the VOP_LOOKUP call * * OUT: xzpp - pointer to extended attribute znode * * RETURN: 0 on success * error number on failure */ int zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; znode_t *xzp; vattr_t va; int error; top: error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); if (error) return (error); if (xzp != NULL) { *xzpp = xzp; return (0); } if (!(flags & CREATE_XATTR_DIR)) return (SET_ERROR(ENOATTR)); if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { return (SET_ERROR(EROFS)); } /* * The ability to 'create' files in an attribute * directory comes from the write_xattr permission on the base file. * * The ability to 'search' an attribute directory requires * read_xattr permission on the base file. * * Once in a directory the ability to read/write attributes * is controlled by the permissions on the attribute file. */ va.va_mask = AT_MODE | AT_UID | AT_GID; va.va_type = VDIR; va.va_mode = S_IFDIR | S_ISVTX | 0777; zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); error = zfs_make_xattrdir(zp, &va, xzpp, cr); if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ goto top; } if (error == 0) VOP_UNLOCK1(ZTOV(*xzpp)); return (error); } /* * Decide whether it is okay to remove within a sticky directory. * * In sticky directories, write access is not sufficient; * you can remove entries from a directory only if: * * you own the directory, * you own the entry, * the entry is a plain file and you have write access, * or you are privileged (checked in secpolicy...). * * The function returns 0 if remove access is granted. */ int zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) { uid_t uid; uid_t downer; uid_t fowner; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; if (zdp->z_zfsvfs->z_replay) return (0); if ((zdp->z_mode & S_ISVTX) == 0) return (0); downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); if ((uid = crgetuid(cr)) == downer || uid == fowner || (ZTOV(zp)->v_type == VREG && zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) return (0); else return (secpolicy_vnode_remove(ZTOV(zp), cr)); } Index: head/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c =================================================================== --- head/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c (revision 367344) +++ head/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vfsops.c (revision 367345) @@ -1,2304 +1,2300 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2011 Pawel Jakub Dawidek . * All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Nexenta Systems, Inc. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" #ifndef MNTK_VMSETSIZE_BUG #define MNTK_VMSETSIZE_BUG 0 #endif #ifndef MNTK_NOMSYNC #define MNTK_NOMSYNC 8 #endif /* BEGIN CSTYLED */ struct mtx zfs_debug_mtx; MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); int zfs_super_owner; SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, "File system owner can perform privileged operation on his file systems"); int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); static int zfs_version_acl = ZFS_ACL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, "ZFS_ACL_VERSION"); static int zfs_version_spa = SPA_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, "SPA_VERSION"); static int zfs_version_zpl = ZPL_VERSION; SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, "ZPL_VERSION"); /* END CSTYLED */ static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); static int zfs_mount(vfs_t *vfsp); static int zfs_umount(vfs_t *vfsp, int fflag); static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); static int zfs_sync(vfs_t *vfsp, int waitfor); #if __FreeBSD_version >= 1300098 static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors); #else static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors); #endif static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); static void zfs_freevfs(vfs_t *vfsp); struct vfsops zfs_vfsops = { .vfs_mount = zfs_mount, .vfs_unmount = zfs_umount, #if __FreeBSD_version >= 1300049 .vfs_root = vfs_cache_root, .vfs_cachedroot = zfs_root, #else .vfs_root = zfs_root, #endif .vfs_statfs = zfs_statfs, .vfs_vget = zfs_vget, .vfs_sync = zfs_sync, .vfs_checkexp = zfs_checkexp, .vfs_fhtovp = zfs_fhtovp, .vfs_quotactl = zfs_quotactl, }; VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); /* * We need to keep a count of active fs's. * This is necessary to prevent our module * from being unloaded after a umount -f */ static uint32_t zfs_active_fs_count = 0; int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint) { int error; zfsvfs_t *zfvp; vfs_t *vfsp; objset_t *os; uint64_t tmp = *val; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, &zfvp); if (error != 0) return (error); if (zfvp == NULL) return (ENOENT); vfsp = zfvp->z_vfs; switch (zfs_prop) { case ZFS_PROP_ATIME: if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) tmp = 1; break; case ZFS_PROP_DEVICES: if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) tmp = 1; break; case ZFS_PROP_EXEC: if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) tmp = 1; break; case ZFS_PROP_SETUID: if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) tmp = 1; break; case ZFS_PROP_READONLY: if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) tmp = 1; break; case ZFS_PROP_XATTR: if (zfvp->z_flags & ZSB_XATTR) tmp = zfvp->z_xattr; break; case ZFS_PROP_NBMAND: if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) tmp = 0; if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) tmp = 1; break; default: vfs_unbusy(vfsp); return (ENOENT); } vfs_unbusy(vfsp); if (tmp != *val) { (void) strcpy(setpoint, "temporary"); *val = tmp; } return (0); } static int zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) { int error = 0; char buf[32]; uint64_t usedobj, quotaobj; uint64_t quota, used = 0; timespec_t now; usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; if (quotaobj == 0 || zfsvfs->z_replay) { error = ENOENT; goto done; } (void) sprintf(buf, "%llx", (longlong_t)id); if ((error = zap_lookup(zfsvfs->z_os, quotaobj, buf, sizeof (quota), 1, "a)) != 0) { dprintf("%s(%d): quotaobj lookup failed\n", __FUNCTION__, __LINE__); goto done; } /* * quota(8) uses bsoftlimit as "quoota", and hardlimit as "limit". * So we set them to be the same. */ dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); if (error && error != ENOENT) { dprintf("%s(%d): usedobj failed; %d\n", __FUNCTION__, __LINE__, error); goto done; } dqp->dqb_curblocks = btodb(used); dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; vfs_timestamp(&now); /* * Setting this to 0 causes FreeBSD quota(8) to print * the number of days since the epoch, which isn't * particularly useful. */ dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; done: return (error); } static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) { zfsvfs_t *zfsvfs = vfsp->vfs_data; struct thread *td; int cmd, type, error = 0; int bitsize; zfs_userquota_prop_t quota_type; struct dqblk64 dqblk = { 0 }; td = curthread; cmd = cmds >> SUBCMDSHIFT; type = cmds & SUBCMDMASK; ZFS_ENTER(zfsvfs); if (id == -1) { switch (type) { case USRQUOTA: id = td->td_ucred->cr_ruid; break; case GRPQUOTA: id = td->td_ucred->cr_rgid; break; default: error = EINVAL; if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) vfs_unbusy(vfsp); goto done; } } /* * Map BSD type to: * ZFS_PROP_USERUSED, * ZFS_PROP_USERQUOTA, * ZFS_PROP_GROUPUSED, * ZFS_PROP_GROUPQUOTA */ switch (cmd) { case Q_SETQUOTA: case Q_SETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERQUOTA; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPQUOTA; else error = EINVAL; break; case Q_GETQUOTA: case Q_GETQUOTA32: if (type == USRQUOTA) quota_type = ZFS_PROP_USERUSED; else if (type == GRPQUOTA) quota_type = ZFS_PROP_GROUPUSED; else error = EINVAL; break; } /* * Depending on the cmd, we may need to get * the ruid and domain (see fuidstr_to_sid?), * the fuid (how?), or other information. * Create fuid using zfs_fuid_create(zfsvfs, id, * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? * I think I can use just the id? * * Look at zfs_id_overquota() to look up a quota. * zap_lookup(something, quotaobj, fuidstring, * sizeof (long long), 1, "a) * * See zfs_set_userquota() to set a quota. */ if ((uint32_t)type >= MAXQUOTAS) { error = EINVAL; goto done; } switch (cmd) { case Q_GETQUOTASIZE: bitsize = 64; error = copyout(&bitsize, arg, sizeof (int)); break; case Q_QUOTAON: // As far as I can tell, you can't turn quotas on or off on zfs error = 0; vfs_unbusy(vfsp); break; case Q_QUOTAOFF: error = ENOTSUP; vfs_unbusy(vfsp); break; case Q_SETQUOTA: error = copyin(arg, &dqblk, sizeof (dqblk)); if (error == 0) error = zfs_set_userquota(zfsvfs, quota_type, "", id, dbtob(dqblk.dqb_bhardlimit)); break; case Q_GETQUOTA: error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); if (error == 0) error = copyout(&dqblk, arg, sizeof (dqblk)); break; default: error = EINVAL; break; } done: ZFS_EXIT(zfsvfs); return (error); } boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs) { return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); } /*ARGSUSED*/ static int zfs_sync(vfs_t *vfsp, int waitfor) { /* * Data integrity is job one. We don't want a compromised kernel * writing to the storage pool, so we never sync during panic. */ if (panicstr) return (0); /* * Ignore the system syncher. ZFS already commits async data * at zfs_txg_timeout intervals. */ if (waitfor == MNT_LAZY) return (0); if (vfsp != NULL) { /* * Sync a specific filesystem. */ zfsvfs_t *zfsvfs = vfsp->vfs_data; dsl_pool_t *dp; int error; error = vfs_stdsync(vfsp, waitfor); if (error != 0) return (error); ZFS_ENTER(zfsvfs); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (rebooting && spa_suspended(dp->dp_spa)) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == TRUE) { zfsvfs->z_atime = TRUE; zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); } else { zfsvfs->z_atime = FALSE; zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); } } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == ZFS_XATTR_OFF) { zfsvfs->z_flags &= ~ZSB_XATTR; } else { zfsvfs->z_flags |= ZSB_XATTR; if (newval == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; else zfsvfs->z_xattr_sa = B_FALSE; } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; zfsvfs->z_vfs->mnt_stat.f_iosize = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval) { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); } else { /* XXX locking on vfs_flag? */ zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); } } static void setuid_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); } } static void exec_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); } else { zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); } } /* * The nbmand mount option can be changed at mount time. * We can't allow it to be toggled on live file systems or incorrect * behavior may be seen from cifs clients * * This property isn't registered via dsl_prop_register(), but this callback * will be called when a file system is first mounted */ static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == FALSE) { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); } else { vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); } } static void snapdir_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_show_ctldir = newval; } static void vscan_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_vscan = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_inherit = newval; } static void acl_type_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_type = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; uint64_t nbmand; boolean_t readonly = B_FALSE; boolean_t do_readonly = B_FALSE; boolean_t setuid = B_FALSE; boolean_t do_setuid = B_FALSE; boolean_t exec = B_FALSE; boolean_t do_exec = B_FALSE; boolean_t xattr = B_FALSE; boolean_t atime = B_FALSE; boolean_t do_atime = B_FALSE; boolean_t do_xattr = B_FALSE; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * This function can be called for a snapshot when we update snapshot's * mount point, which isn't really supported. */ if (dmu_objset_is_snapshot(os)) return (EOPNOTSUPP); /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || !spa_writeable(dmu_objset_spa(os))) { readonly = B_TRUE; do_readonly = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { readonly = B_FALSE; do_readonly = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { setuid = B_FALSE; do_setuid = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { setuid = B_TRUE; do_setuid = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { exec = B_FALSE; do_exec = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { exec = B_TRUE; do_exec = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; do_xattr = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; do_xattr = B_TRUE; } if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { atime = B_FALSE; do_atime = B_TRUE; } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { atime = B_TRUE; do_atime = B_TRUE; } /* * We need to enter pool configuration here, so that we can use * dsl_prop_get_int_ds() to handle the special nbmand property below. * dsl_prop_get_integer() can not be used, because it has to acquire * spa_namespace_lock and we can not do that because we already hold * z_teardown_lock. The problem is that spa_write_cachefile() is called * with spa_namespace_lock held and the function calls ZFS vnode * operations to write the cache file and thus z_teardown_lock is * acquired after spa_namespace_lock. */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); /* * nbmand is a special property. It can only be changed at * mount time. * * This is weird, but it is documented to only be changeable * at mount time. */ if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { nbmand = B_FALSE; } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { nbmand = B_TRUE; } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) { dsl_pool_config_exit(dmu_objset_pool(os), FTAG); return (error); } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acl_type_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (do_readonly) readonly_changed_cb(zfsvfs, readonly); if (do_setuid) setuid_changed_cb(zfsvfs, setuid); if (do_exec) exec_changed_cb(zfsvfs, exec); if (do_xattr) xattr_changed_cb(zfsvfs, xattr); if (do_atime) atime_changed_cb(zfsvfs, atime); nbmand_changed_cb(zfsvfs, nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printf("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val); if (error != 0) return (error); zfsvfs->z_acl_type = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); } error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 8, 1, &zfsvfs->z_projectquota_obj); if (error == ENOENT) zfsvfs->z_projectquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 8, 1, &zfsvfs->z_userobjquota_obj); if (error == ENOENT) zfsvfs->z_userobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 8, 1, &zfsvfs->z_groupobjquota_obj); if (error == ENOENT) zfsvfs->z_groupobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 8, 1, &zfsvfs->z_projectobjquota_obj); if (error == ENOENT) zfsvfs->z_projectobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); /* * Only use the name cache if we are looking for a * name on a file system that does not require normalization * or case folding. We can also look there if we happen to be * on a non-normalizing, mixed sensitivity file system IF we * are looking for the exact name (which is always the case on * FreeBSD). */ zfsvfs->z_use_namecache = !zfsvfs->z_norm || ((zfsvfs->z_case == ZFS_CASE_MIXED) && !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); return (0); } taskq_t *zfsvfs_taskq; static void zfsvfs_task_unlinked_drain(void *context, int pending __unused) { zfs_unlinked_drain((zfsvfs_t *)context); } int zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); /* * XXX: Fix struct statfs so this isn't necessary! * * The 'osname' is used as the filesystem's special node, which means * it must fit in statfs.f_mntfromname, or else it can't be * enumerated, so libzfs_mnttab_find() returns NULL, which causes * 'zfs unmount' to think it's not mounted when it is. */ if (strlen(osname) >= MNAMELEN) return (SET_ERROR(ENAMETOOLONG)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } error = zfsvfs_create_impl(zfvp, zfsvfs, os); return (error); } int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) { int error; zfsvfs->z_vfs = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, zfsvfs_task_unlinked_drain, zfsvfs); -#ifdef DIAGNOSTIC - rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); -#else - rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); -#endif + ZFS_TEARDOWN_INIT(zfsvfs); ZFS_TEARDOWN_INACTIVE_INIT(zfsvfs); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); error = zfsvfs_init(zfsvfs, os); if (error != 0) { dmu_objset_disown(os, B_TRUE, zfsvfs); *zfvp = NULL; kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; /* * Check for a bad on-disk format version now since we * lied about owning the dataset readonly before. */ if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) return (SET_ERROR(EROFS)); error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { boolean_t readonly; ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); /* * During replay we remove the read only flag to * allow replays to succeed. */ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; if (readonly != 0) { zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; } else { dsl_dir_t *dd; zap_stats_t zs; if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, &zs) == 0) { dataset_kstats_update_nunlinks_kstat( &zfsvfs->z_kstat, zs.zs_num_entries); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "num_entries in unlinked set: %llu", zs.zs_num_entries); } zfs_unlinked_drain(zfsvfs); dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dd->dd_activity_cancelled = B_FALSE; } /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { boolean_t use_nc = zfsvfs->z_use_namecache; zfsvfs->z_use_namecache = B_FALSE; zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; zfsvfs->z_use_namecache = use_nc; } } /* restore readonly bit */ if (readonly != 0) zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; } /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } void zfsvfs_free(zfsvfs_t *zfsvfs) { int i; zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); ASSERT(zfsvfs->z_nr_znodes == 0); list_destroy(&zfsvfs->z_all_znodes); - rrm_destroy(&zfsvfs->z_teardown_lock); + ZFS_TEARDOWN_DESTROY(zfsvfs); ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) mutex_destroy(&zfsvfs->z_hold_mtx[i]); dataset_kstats_destroy(&zfsvfs->z_kstat); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); if (zfsvfs->z_vfs) { if (zfsvfs->z_use_fuids) { vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } else { vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); } } zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static int zfs_domount(vfs_t *vfsp, char *osname) { uint64_t recordsize, fsid_guid; int error = 0; zfsvfs_t *zfsvfs; ASSERT(vfsp); ASSERT(osname); error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); if (error) return (error); zfsvfs->z_vfs = vfsp; if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL))) goto out; zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; vfsp->vfs_data = zfsvfs; vfsp->mnt_flag |= MNT_LOCAL; vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; /* * This can cause a loss of coherence between ARC and page cache * on ZoF - unclear if the problem is in FreeBSD or ZoF */ vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ vfsp->mnt_kern_flag |= MNTK_NOMSYNC; vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; #if defined(_KERNEL) && !defined(KMEM_DEBUG) vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; #endif /* * The fsid is 64 bits, composed of an 8-bit fs type, which * separates our fsid from any other filesystem types, and a * 56-bit objset unique ID. The objset unique ID is unique to * all objsets open on this system, provided by unique_create(). * The 8-bit fs type must be put in the low bits of fsid[1] * because that's where other Solaris filesystems put it. */ fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); vfsp->vfs_fsid.val[0] = fsid_guid; vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | (vfsp->mnt_vfc->vfc_typenum & 0xFF); /* * Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if ((error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))) goto out; xattr_changed_cb(zfsvfs, pval); if ((error = dsl_prop_get_integer(osname, "acltype", &pval, NULL))) goto out; acl_type_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) goto out; } vfs_mountedfrom(vfsp, osname); if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); out: if (error) { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } else { atomic_inc_32(&zfs_active_fs_count); } return (error); } static void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } static int getpoolname(const char *osname, char *poolname) { char *p; p = strchr(osname, '/'); if (p == NULL) { if (strlen(osname) >= MAXNAMELEN) return (ENAMETOOLONG); (void) strcpy(poolname, osname); } else { if (p - osname >= MAXNAMELEN) return (ENAMETOOLONG); (void) strncpy(poolname, osname, p - osname); poolname[p - osname] = '\0'; } return (0); } /*ARGSUSED*/ static int zfs_mount(vfs_t *vfsp) { kthread_t *td = curthread; vnode_t *mvp = vfsp->mnt_vnodecovered; cred_t *cr = td->td_ucred; char *osname; int error = 0; int canwrite; if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) return (SET_ERROR(EINVAL)); /* * If full-owner-access is enabled and delegated administration is * turned on, we must set nosuid. */ if (zfs_super_owner && dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Check for mount privilege? * * If we don't have privilege then see if * we have local permission to allow it */ error = secpolicy_fs_mount(cr, mvp, vfsp); if (error) { if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) goto out; if (!(vfsp->vfs_flag & MS_REMOUNT)) { vattr_t vattr; /* * Make sure user is the owner of the mount point * or has sufficient privileges. */ vattr.va_mask = AT_UID; vn_lock(mvp, LK_SHARED | LK_RETRY); if (VOP_GETATTR(mvp, &vattr, cr)) { VOP_UNLOCK1(mvp); goto out; } if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { VOP_UNLOCK1(mvp); goto out; } VOP_UNLOCK1(mvp); } secpolicy_fs_mount_clearopts(cr, vfsp); } /* * Refuse to mount a filesystem if we are in a local zone and the * dataset is not visible. */ if (!INGLOBALZONE(curproc) && (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { error = SET_ERROR(EPERM); goto out; } vfsp->vfs_flag |= MNT_NFS4ACLS; /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. */ if (vfsp->vfs_flag & MS_REMOUNT) { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * Refresh mount options with z_teardown_lock blocking I/O while * the filesystem is in an inconsistent state. * The lock also serializes this code with filesystem * manipulations between entry to zfs_suspend_fs() and return * from zfs_resume_fs(). */ - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); zfs_unregister_callbacks(zfsvfs); error = zfs_register_callbacks(vfsp); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); goto out; } /* Initial root mount: try hard to import the requested root pool. */ if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && (vfsp->vfs_flag & MNT_UPDATE) == 0) { char pname[MAXNAMELEN]; error = getpoolname(osname, pname); if (error == 0) error = spa_import_rootpool(pname, false); if (error) goto out; } DROP_GIANT(); error = zfs_domount(vfsp, osname); PICKUP_GIANT(); out: return (error); } static int zfs_statfs(vfs_t *vfsp, struct statfs *statp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; uint64_t refdbytes, availbytes, usedobjs, availobjs; statp->f_version = STATFS_VERSION; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); /* * The underlying storage pool actually uses multiple block sizes. * We report the fragsize as the smallest block size we support, * and we report our blocksize as the filesystem's maximum blocksize. */ statp->f_bsize = SPA_MINBLOCKSIZE; statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; /* * The following report "total" blocks of various kinds in the * file system, but reported in terms of f_frsize - the * "fragment" size. */ statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; statp->f_bfree = availbytes / statp->f_bsize; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of object available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, statp->f_bfree); statp->f_files = statp->f_ffree + usedobjs; /* * We're a zfs filesystem. */ strlcpy(statp->f_fstypename, "zfs", sizeof (statp->f_fstypename)); strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, sizeof (statp->f_mntfromname)); strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, sizeof (statp->f_mntonname)); statp->f_namemax = MAXNAMELEN - 1; ZFS_EXIT(zfsvfs); return (0); } static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *vpp = ZTOV(rootzp); ZFS_EXIT(zfsvfs); if (error == 0) { error = vn_lock(*vpp, flags); if (error != 0) { VN_RELE(*vpp); *vpp = NULL; } } return (error); } /* * Teardown the zfsvfs::z_os. * * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; dsl_dir_t *dd; /* * If someone has not already unmounted this file system, * drain the zrele_taskq to ensure all active references to the * zfsvfs_t have been handled only then can it be safely destroyed. */ if (zfsvfs->z_os) { /* * If we're unmounting we have to wait for the list to * drain completely. * * If we're not unmounting there's no guarantee the list * will drain completely, but zreles run from the taskq * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * * We can safely read z_nr_znodes without locking because the * VFS has already blocked operations which add to the * z_all_znodes list and thus increment z_nr_znodes. */ int round = 0; while (zfsvfs->z_nr_znodes > 0) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) break; } } - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); if (!unmounting) { /* * We purge the parent filesystem's vfsp as the parent * filesystem and all of its snapshots have their vnode's * v_vfsp set to the parent's filesystem's vfsp. Note, * 'z_parent' is self referential for non-snapshots. */ #ifdef FREEBSD_NAMECACHE #if __FreeBSD_version >= 1300117 cache_purgevfs(zfsvfs->z_parent->z_vfs); #else cache_purgevfs(zfsvfs->z_parent->z_vfs, true); #endif #endif } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } ZFS_TEARDOWN_INACTIVE_ENTER_WRITE(zfsvfs); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no vops active, and any new vops will * fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * * Release all holds on dbufs. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) if (zp->z_sa_hdl) { ASSERT(ZTOV(zp)->v_count >= 0); zfs_znode_dmu_fini(zp); } mutex_exit(&zfsvfs->z_znodes_lock); /* * If we are unmounting, set the unmounted flag and let new vops * unblock. zfs_inactive will have the unmounted behavior, and all * other vops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data */ if (!zfs_is_readonly(zfsvfs)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); dmu_objset_evict_dbufs(zfsvfs->z_os); dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); return (0); } /*ARGSUSED*/ static int zfs_umount(vfs_t *vfsp, int fflag) { kthread_t *td = curthread; zfsvfs_t *zfsvfs = vfsp->vfs_data; objset_t *os; cred_t *cr = td->td_ucred; int ret; ret = secpolicy_fs_unmount(cr, vfsp); if (ret) { if (dsl_deleg_access((char *)vfsp->vfs_resource, ZFS_DELEG_PERM_MOUNT, cr)) return (ret); } /* * Unmount any snapshots mounted under .zfs before unmounting the * dataset itself. */ if (zfsvfs->z_ctldir != NULL) { if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) return (ret); } if (fflag & MS_FORCE) { /* * Mark file system as unmounted before calling * vflush(FORCECLOSE). This way we ensure no future vnops * will be called and risk operating on DOOMED vnodes. */ - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); zfsvfs->z_unmounted = B_TRUE; - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * Flush all the files. */ ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); if (ret != 0) return (ret); while (taskqueue_cancel(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task, NULL) != 0) taskqueue_drain(zfsvfs_taskq->tq_queue, &zfsvfs->z_unlinked_drain_task); VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, B_TRUE, zfsvfs); } /* * We can now safely destroy the '.zfs' directory node. */ if (zfsvfs->z_ctldir != NULL) zfsctl_destroy(zfsvfs); zfs_freevfs(vfsp); return (0); } static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; int err; /* * zfs_zget() can't operate on virtual entries like .zfs/ or * .zfs/snapshot/ directories, that's why we return EOPNOTSUPP. * This will make NFS to switch to LOOKUP instead of using VGET. */ if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR || (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir)) return (EOPNOTSUPP); ZFS_ENTER(zfsvfs); err = zfs_zget(zfsvfs, ino, &zp); if (err == 0 && zp->z_unlinked) { vrele(ZTOV(zp)); err = EINVAL; } if (err == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (err == 0) { err = vn_lock(*vpp, flags); if (err != 0) vrele(*vpp); } if (err != 0) *vpp = NULL; return (err); } static int #if __FreeBSD_version >= 1300098 zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, struct ucred **credanonp, int *numsecflavors, int *secflavors) #else zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, struct ucred **credanonp, int *numsecflavors, int **secflavors) #endif { zfsvfs_t *zfsvfs = vfsp->vfs_data; /* * If this is regular file system vfsp is the same as * zfsvfs->z_parent->z_vfs, but if it is snapshot, * zfsvfs->z_parent->z_vfs represents parent file system * which we have to use here, because only this file system * has mnt_export configured. */ return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp, credanonp, numsecflavors, secflavors)); } CTASSERT(SHORT_FID_LEN <= sizeof (struct fid)); CTASSERT(LONG_FID_LEN <= sizeof (struct fid)); static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp) { struct componentname cn; zfsvfs_t *zfsvfs = vfsp->vfs_data; znode_t *zp; vnode_t *dvp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *vpp = NULL; ZFS_ENTER(zfsvfs); /* * On FreeBSD we can get snapshot's mount point or its parent file * system mount point depending if snapshot is already mounted or not. */ if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); ZFS_EXIT(zfsvfs); err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs); if (err) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); } if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * A zero fid_gen means we are in .zfs or the .zfs/snapshot * directory tree. If the object == zfsvfs->z_shares_dir, then * we are in the .zfs/shares directory tree. */ if ((fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { ZFS_EXIT(zfsvfs); VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); if (object == ZFSCTL_INO_SNAPDIR) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN | LOCKLEAF; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else if (object == zfsvfs->z_shares_dir) { /* * XXX This branch must not be taken, * if it is, then the lookup below will * explode. */ cn.cn_nameptr = "shares"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = LOOKUP; cn.cn_flags = ISLASTCN; cn.cn_lkflags = flags; VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); vput(dvp); } else { *vpp = dvp; } return (err); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { ZFS_EXIT(zfsvfs); return (err); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); vrele(ZTOV(zp)); ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); err = vn_lock(*vpp, flags); if (err == 0) vnode_create_vobject(*vpp, zp->z_size, curthread); else *vpp = NULL; return (err); } /* * Block out VOPs and close zfsvfs_t::z_os * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err; znode_t *zp; - ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); /* * Attempt to re-establish all the active znodes with * their dbufs. If a zfs_rezget() fails, then we'll let * any potential callers discover that via ZFS_ENTER_VERIFY_VP * when they try to use their znode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { (void) zfs_rezget(zp); } mutex_exit(&zfsvfs->z_znodes_lock); bail: /* release the VOPs */ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); if (err) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { vfs_ref(zfsvfs->z_vfs); (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); } } return (err); } static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; zfsvfs_free(zfsvfs); atomic_dec_32(&zfs_active_fs_count); } #ifdef __i386__ static int desiredvnodes_backup; #include #include #include #include #include #endif static void zfs_vnodes_adjust(void) { #ifdef __i386__ int newdesiredvnodes; desiredvnodes_backup = desiredvnodes; /* * We calculate newdesiredvnodes the same way it is done in * vntblinit(). If it is equal to desiredvnodes, it means that * it wasn't tuned by the administrator and we can tune it down. */ newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * vm_kmem_size / (5 * (sizeof (struct vm_object) + sizeof (struct vnode)))); if (newdesiredvnodes == desiredvnodes) desiredvnodes = (3 * newdesiredvnodes) / 4; #endif } static void zfs_vnodes_adjust_back(void) { #ifdef __i386__ desiredvnodes = desiredvnodes_backup; #endif } void zfs_init(void) { printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); /* * Initialize .zfs directory structures */ zfsctl_init(); /* * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); /* * Reduce number of vnodes. Originally number of vnodes is calculated * with UFS inode in mind. We reduce it here, because it's too big for * ZFS/i386. */ zfs_vnodes_adjust(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); } void zfs_fini(void) { taskq_destroy(zfsvfs_taskq); zfsctl_fini(); zfs_znode_fini(); zfs_vnodes_adjust_back(); } int zfs_busy(void) { return (zfs_active_fs_count != 0); } /* * Release VOPs and unmount a suspended filesystem. */ int zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { - ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(ZFS_TEARDOWN_INACTIVE_WRITE_HELD(zfsvfs)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); zfsvfs->z_os = os; /* release the VOPs */ ZFS_TEARDOWN_INACTIVE_EXIT_WRITE(zfsvfs); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); /* * Try to force unmount this file system. */ (void) zfs_umount(zfsvfs->z_vfs, 0); zfsvfs->z_unmounted = B_TRUE; return (0); } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %ju to %ju", (uintmax_t)zfsvfs->z_version, (uintmax_t)newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { uint64_t *cached_copy = NULL; /* * Figure out where in the objset_t the cached copy would live, if it * is available for the requested property. */ if (os != NULL) { switch (prop) { case ZFS_PROP_VERSION: cached_copy = &os->os_version; break; case ZFS_PROP_NORMALIZE: cached_copy = &os->os_normalization; break; case ZFS_PROP_UTF8ONLY: cached_copy = &os->os_utf8only; break; case ZFS_PROP_CASE: cached_copy = &os->os_casesensitivity; break; default: break; } } if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { *value = *cached_copy; return (0); } /* * If the property wasn't cached, look up the file system's value for * the property. For the version property, we look up a slightly * different string. */ const char *pname; int error = ENOENT; if (prop == ZFS_PROP_VERSION) { pname = ZPL_VERSION_STR; } else { pname = zfs_prop_to_name(prop); } if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); } if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; case ZFS_PROP_ACLTYPE: *value = ZFS_ACLTYPE_NFSV4; break; default: return (error); } error = 0; } /* * If one of the methods for getting the property value above worked, * copy it into the objset_t's cache. */ if (error == 0 && cached_copy != NULL) { *cached_copy = *value; } return (error); } /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. */ boolean_t zfs_get_vfs_flag_unmounted(objset_t *os) { zfsvfs_t *zfvp; boolean_t unmounted = B_FALSE; ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp != NULL && zfvp->z_vfs != NULL && (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) unmounted = B_TRUE; mutex_exit(&os->os_user_ptr_lock); return (unmounted); } #ifdef _KERNEL void zfsvfs_update_fromname(const char *oldname, const char *newname) { char tmpbuf[MAXPATHLEN]; struct mount *mp; char *fromname; size_t oldlen; oldlen = strlen(oldname); mtx_lock(&mountlist_mtx); TAILQ_FOREACH(mp, &mountlist, mnt_list) { fromname = mp->mnt_stat.f_mntfromname; if (strcmp(fromname, oldname) == 0) { (void) strlcpy(fromname, newname, sizeof (mp->mnt_stat.f_mntfromname)); continue; } if (strncmp(fromname, oldname, oldlen) == 0 && (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", newname, fromname + oldlen); (void) strlcpy(fromname, tmpbuf, sizeof (mp->mnt_stat.f_mntfromname)); continue; } } mtx_unlock(&mountlist_mtx); } #endif Index: head/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c =================================================================== --- head/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c (revision 367344) +++ head/sys/contrib/openzfs/module/os/freebsd/zfs/zfs_vnops.c (revision 367345) @@ -1,6659 +1,6659 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2017 Nexenta Systems, Inc. */ /* Portions Copyright 2007 Jeremy Teo */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #if __FreeBSD_version >= 1300102 #include #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifndef VN_OPEN_INVFS #define VN_OPEN_INVFS 0x0 #endif VFS_SMR_DECLARE; #if __FreeBSD_version >= 1300047 #define vm_page_wire_lock(pp) #define vm_page_wire_unlock(pp) #else #define vm_page_wire_lock(pp) vm_page_lock(pp) #define vm_page_wire_unlock(pp) vm_page_unlock(pp) #endif #ifdef DEBUG_VFS_LOCKS #define VNCHECKREF(vp) \ VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \ ("%s: wrong ref counts", __func__)); #else #define VNCHECKREF(vp) #endif /* * Programming rules. * * Each vnode op performs some logical unit of work. To do this, the ZPL must * properly lock its in-core state, create a DMU transaction, do the work, * record this work in the intent log (ZIL), commit the DMU transaction, * and wait for the intent log to commit if it is a synchronous operation. * Moreover, the vnode ops must work in both normal and log replay context. * The ordering of events is important to avoid deadlocks and references * to freed memory. The example below illustrates the following Big Rules: * * (1) A check must be made in each zfs thread for a mounted file system. * This is done avoiding races using ZFS_ENTER(zfsvfs). * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros * can return EIO from the calling function. * * (2) VN_RELE() should always be the last thing except for zil_commit() * (if necessary) and ZFS_EXIT(). This is for 3 reasons: * First, if it's the last reference, the vnode/znode * can be freed, so the zp may point to freed memory. Second, the last * reference will call zfs_zinactive(), which may induce a lot of work -- * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to * dmu_tx_assign(). This is critical because we don't want to block * while holding locks. * * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This * reduces lock contention and CPU usage when we must wait (note that if * throughput is constrained by the storage, nearly every transaction * must wait). * * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing * to use a non-blocking assign can deadlock the system. The scenario: * * Thread A has grabbed a lock before calling dmu_tx_assign(). * Thread B is in an already-assigned tx, and blocks for this lock. * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() * forever, because the previous txg can't quiesce until B's tx commits. * * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, * then drop all locks, call dmu_tx_wait(), and try again. On subsequent * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, * to indicate that this operation has already called dmu_tx_wait(). * This will ensure that we don't retry forever, waiting a short bit * each time. * * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. * During ZIL replay the zfs_log_* functions will update the sequence * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. * * (7) After dropping all locks, invoke zil_commit(zilog, foid) * to ensure that synchronous semantics are provided when necessary. * * In general, this is how things should be ordered in each vnode op: * * ZFS_ENTER(zfsvfs); // exit if unmounted * top: * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * if (error == ERESTART) { * waited = B_TRUE; * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; * } * dmu_tx_abort(tx); // abort DMU tx * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // really out of space * } * error = do_real_work(); // do whatever this VOP does * if (error == 0) * zfs_log_*(...); // on success, make ZIL entry * dmu_tx_commit(tx); // commit DMU tx -- error or not * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes * zil_commit(zilog, foid); // synchronous when necessary * ZFS_EXIT(zfsvfs); // finished in zfs * return (error); // done, report error */ /* ARGSUSED */ static int zfs_open(vnode_t **vpp, int flag, cred_t *cr) { znode_t *zp = VTOZ(*vpp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & FAPPEND) == 0)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { if (fs_vscan(*vpp, cr, 0) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } } /* Keep a count of the synchronous opens in the znode */ if (flag & (FSYNC | FDSYNC)) atomic_inc_32(&zp->z_sync_cnt); ZFS_EXIT(zfsvfs); return (0); } /* ARGSUSED */ static int zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* Decrement the synchronous opens in the znode */ if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) VERIFY(fs_vscan(vp, cr, 1) == 0); ZFS_EXIT(zfsvfs); return (0); } /* * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter. */ static int zfs_holey(vnode_t *vp, ulong_t cmd, offset_t *off) { znode_t *zp = VTOZ(vp); uint64_t noff = (uint64_t)*off; /* new offset */ uint64_t file_sz; int error; boolean_t hole; file_sz = zp->z_size; if (noff >= file_sz) { return (SET_ERROR(ENXIO)); } if (cmd == _FIO_SEEK_HOLE) hole = B_TRUE; else hole = B_FALSE; error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff); if (error == ESRCH) return (SET_ERROR(ENXIO)); /* file was dirty, so fall back to using generic logic */ if (error == EBUSY) { if (hole) *off = file_sz; return (0); } /* * We could find a hole that begins after the logical end-of-file, * because dmu_offset_next() only works on whole blocks. If the * EOF falls mid-block, then indicate that the "virtual hole" * at the end of the file begins at the logical EOF, rather than * at the end of the last block. */ if (noff > file_sz) { ASSERT(hole); noff = file_sz; } if (noff < *off) return (error); *off = noff; return (error); } /* ARGSUSED */ static int zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred, int *rvalp) { offset_t off; int error; zfsvfs_t *zfsvfs; znode_t *zp; switch (com) { case _FIOFFS: { return (0); /* * The following two ioctls are used by bfu. Faking out, * necessary to avoid bfu errors. */ } case _FIOGDIO: case _FIOSDIO: { return (0); } case _FIO_SEEK_DATA: case _FIO_SEEK_HOLE: { off = *(offset_t *)data; zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* offset parameter is in/out */ error = zfs_holey(vp, com, &off); ZFS_EXIT(zfsvfs); if (error) return (error); *(offset_t *)data = off; return (0); } } return (SET_ERROR(ENOTTY)); } static vm_page_t page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) { vm_object_t obj; vm_page_t pp; int64_t end; /* * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE * aligned boundaries, if the range is not aligned. As a result a * DEV_BSIZE subrange with partially dirty data may get marked as clean. * It may happen that all DEV_BSIZE subranges are marked clean and thus * the whole page would be considered clean despite have some * dirty data. * For this reason we should shrink the range to DEV_BSIZE aligned * boundaries before calling vm_page_clear_dirty. */ end = rounddown2(off + nbytes, DEV_BSIZE); off = roundup2(off, DEV_BSIZE); nbytes = end - off; obj = vp->v_object; zfs_vmobject_assert_wlocked_12(obj); #if __FreeBSD_version < 1300050 for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } vm_page_sbusy(pp); } else if (pp != NULL) { ASSERT(!pp->valid); pp = NULL; } if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } break; } #else vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (pp != NULL) { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_object_pip_add(obj, 1); pmap_remove_write(pp); if (nbytes != 0) vm_page_clear_dirty(pp, off, nbytes); } #endif return (pp); } static void page_unbusy(vm_page_t pp) { vm_page_sunbusy(pp); #if __FreeBSD_version >= 1300041 vm_object_pip_wakeup(pp->object); #else vm_object_pip_subtract(pp->object, 1); #endif } #if __FreeBSD_version > 1300051 static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t m; obj = vp->v_object; vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start), VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | VM_ALLOC_NOBUSY); return (m); } #else static vm_page_t page_hold(vnode_t *vp, int64_t start) { vm_object_t obj; vm_page_t pp; obj = vp->v_object; zfs_vmobject_assert_wlocked(obj); for (;;) { if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && pp->valid) { if (vm_page_xbusied(pp)) { /* * Reference the page before unlocking and * sleeping so that the page daemon is less * likely to reclaim it. */ vm_page_reference(pp); vm_page_lock(pp); zfs_vmobject_wunlock(obj); vm_page_busy_sleep(pp, "zfsmwb", true); zfs_vmobject_wlock(obj); continue; } ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_wire_lock(pp); vm_page_hold(pp); vm_page_wire_unlock(pp); } else pp = NULL; break; } return (pp); } #endif static void page_unhold(vm_page_t pp) { vm_page_wire_lock(pp); #if __FreeBSD_version >= 1300035 vm_page_unwire(pp, PQ_ACTIVE); #else vm_page_unhold(pp); #endif vm_page_wire_unlock(pp); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. */ static void update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, int segflg, dmu_tx_t *tx) { vm_object_t obj; struct sf_buf *sf; caddr_t va; int off; ASSERT(segflg != UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); off = start & PAGEOFFSET; zfs_vmobject_wlock_12(obj); #if __FreeBSD_version >= 1300041 vm_object_pip_add(obj, 1); #endif for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; int nbytes = imin(PAGESIZE - off, len); if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); (void) dmu_read(os, oid, start+off, nbytes, va+off, DMU_READ_PREFETCH); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); page_unbusy(pp); } len -= nbytes; off = 0; } #if __FreeBSD_version >= 1300041 vm_object_pip_wakeup(obj); #else vm_object_pip_wakeupn(obj, 0); #endif zfs_vmobject_wunlock_12(obj); } /* * Read with UIO_NOCOPY flag means that sendfile(2) requests * ZFS to populate a range of page cache pages with data. * * NOTE: this function could be optimized to pre-allocate * all pages in advance, drain exclusive busy on all of them, * map them into contiguous KVA region and populate them * in one single dmu_read() call. */ static int mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); objset_t *os = zp->z_zfsvfs->z_os; struct sf_buf *sf; vm_object_t obj; vm_page_t pp; int64_t start; caddr_t va; int len = nbytes; int error = 0; ASSERT(uio->uio_segflg == UIO_NOCOPY); ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); zfs_vmobject_wlock_12(obj); for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { int bytes = MIN(PAGESIZE, len); pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start), VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); if (vm_page_none_valid(pp)) { zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); error = dmu_read(os, zp->z_id, start, bytes, va, DMU_READ_PREFETCH); if (bytes != PAGESIZE && error == 0) bzero(va + bytes, PAGESIZE - bytes); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); #if __FreeBSD_version >= 1300081 if (error == 0) { vm_page_valid(pp); vm_page_activate(pp); vm_page_do_sunbusy(pp); } else { zfs_vmobject_wlock(obj); if (!vm_page_wired(pp) && pp->valid == 0 && vm_page_busy_tryupgrade(pp)) vm_page_free(pp); else vm_page_sunbusy(pp); zfs_vmobject_wunlock(obj); } #else vm_page_do_sunbusy(pp); vm_page_lock(pp); if (error) { if (pp->wire_count == 0 && pp->valid == 0 && !vm_page_busied(pp)) vm_page_free(pp); } else { pp->valid = VM_PAGE_BITS_ALL; vm_page_activate(pp); } vm_page_unlock(pp); #endif } else { ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); vm_page_do_sunbusy(pp); } if (error) break; uio->uio_resid -= bytes; uio->uio_offset += bytes; len -= bytes; } zfs_vmobject_wunlock_12(obj); return (error); } /* * When a file is memory mapped, we must keep the IO data synchronized * between the DMU cache and the memory mapped pages. What this means: * * On Read: We "read" preferentially from memory mapped pages, * else we default from the dmu buffer. * * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when * the file is memory mapped. */ static int mappedread(vnode_t *vp, int nbytes, uio_t *uio) { znode_t *zp = VTOZ(vp); vm_object_t obj; int64_t start; int len = nbytes; int off; int error = 0; ASSERT(vp->v_mount != NULL); obj = vp->v_object; ASSERT(obj != NULL); start = uio->uio_loffset; off = start & PAGEOFFSET; zfs_vmobject_wlock_12(obj); for (start &= PAGEMASK; len > 0; start += PAGESIZE) { vm_page_t pp; uint64_t bytes = MIN(PAGESIZE - off, len); if ((pp = page_hold(vp, start))) { struct sf_buf *sf; caddr_t va; zfs_vmobject_wunlock_12(obj); va = zfs_map_page(pp, &sf); error = vn_io_fault_uiomove(va + off, bytes, uio); zfs_unmap_page(sf); zfs_vmobject_wlock_12(obj); page_unhold(pp); } else { zfs_vmobject_wunlock_12(obj); error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, bytes); zfs_vmobject_wlock_12(obj); } len -= bytes; off = 0; if (error) break; } zfs_vmobject_wunlock_12(obj); return (error); } offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ /* * Read bytes from specified file into supplied buffer. * * IN: vp - vnode of file to be read from. * uio - structure supplying read location, range info, * and return buffer. * ioflag - SYNC flags; used to provide FRSYNC semantics. * cr - credentials of caller. * ct - caller context * * OUT: uio - updated offset and range, buffer filled. * * RETURN: 0 on success, error code on failure. * * Side Effects: * vp - atime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ssize_t n, nbytes, start_resid; int error = 0; int64_t nread; zfs_locked_range_t *lr; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* We don't copy out anything useful for directories. */ if (vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (zp->z_pflags & ZFS_AV_QUARANTINED) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EACCES)); } /* * Validate file offset */ if (uio->uio_loffset < (offset_t)0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Fasttrack empty reads */ if (uio->uio_resid == 0) { ZFS_EXIT(zfsvfs); return (0); } /* * If we're in FRSYNC mode, sync out this znode before reading it. */ if (zfsvfs->z_log && (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) zil_commit(zfsvfs->z_log, zp->z_id); /* * Lock the range against changes. */ lr = zfs_rangelock_enter(&zp->z_rangelock, uio->uio_loffset, uio->uio_resid, RL_READER); /* * If we are reading past end-of-file we can skip * to the end; but we might still need to set atime. */ if (uio->uio_loffset >= zp->z_size) { error = 0; goto out; } ASSERT(uio->uio_loffset < zp->z_size); n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); start_resid = n; while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); if (uio->uio_segflg == UIO_NOCOPY) error = mappedread_sf(vp, nbytes, uio); else if (vn_has_cached_data(vp)) { error = mappedread(vp, nbytes, uio); } else { error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes); } if (error) { /* convert checksum errors into IO errors */ if (error == ECKSUM) error = SET_ERROR(EIO); break; } n -= nbytes; } nread = start_resid - n; dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); out: zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Write the bytes to a file. * * IN: vp - vnode of file to be written to. * uio - structure supplying write location, range info, * and data buffer. * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is * set if in append mode. * cr - credentials of caller. * ct - caller context (NFS/CIFS fem monitor only) * * OUT: uio - updated offset and range. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime|mtime updated if byte count > 0 */ /* ARGSUSED */ static int zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr) { znode_t *zp = VTOZ(vp); rlim64_t limit = MAXOFFSET_T; ssize_t start_resid = uio->uio_resid; ssize_t tx_bytes; uint64_t end_size; dmu_buf_impl_t *db; dmu_tx_t *tx; zfsvfs_t *zfsvfs = zp->z_zfsvfs; zilog_t *zilog; offset_t woff; ssize_t n, nbytes; zfs_locked_range_t *lr; int max_blksz = zfsvfs->z_max_blksz; int error = 0; arc_buf_t *abuf; iovec_t *aiov = NULL; xuio_t *xuio = NULL; int i_iov = 0; int iovcnt __unused = uio->uio_iovcnt; iovec_t *iovp = uio->uio_iov; int write_eof; int count = 0; sa_bulk_attr_t bulk[4]; uint64_t mtime[2], ctime[2]; uint64_t uid, gid, projid; int64_t nwritten; /* * Fasttrack empty write */ n = start_resid; if (n == 0) return (0); if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) limit = MAXOFFSET_T; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &zp->z_size, 8); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * If immutable or not appending then return EPERM. * Intentionally allow ZFS_READONLY through here. * See zfs_zaccess_common() */ if ((zp->z_pflags & ZFS_IMMUTABLE) || ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && (uio->uio_loffset < zp->z_size))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } zilog = zfsvfs->z_log; /* * Validate file offset */ woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; if (woff < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* * Obtain an appending range lock to guarantee file append * semantics. We reset the write offset once we have the lock. */ lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); woff = lr->lr_offset; if (lr->lr_length == UINT64_MAX) { /* * We overlocked the file because this write will cause * the file block size to increase. * Note that zp_size cannot change with this lock held. */ woff = zp->z_size; } uio->uio_loffset = woff; } else { /* * Note that if the file block size will change as a result of * this write, then this range lock will lock the entire file * so that we can re-write the block safely. */ lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); } if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (EFBIG); } if (woff >= limit) { zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (SET_ERROR(EFBIG)); } if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; /* Will this write extend the file length? */ write_eof = (woff + n > zp->z_size); end_size = MAX(zp->z_size, woff + n); uid = zp->z_uid; gid = zp->z_gid; projid = zp->z_projid; /* * Write the file in reasonable size chunks. Each chunk is written * in a separate transaction; this keeps the intent log records small * and allows us to do more fine-grained space accounting. */ while (n > 0) { woff = uio->uio_loffset; if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || (projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid))) { error = SET_ERROR(EDQUOT); break; } abuf = NULL; if (xuio) { ASSERT(i_iov < iovcnt); aiov = &iovp[i_iov]; abuf = dmu_xuio_arcbuf(xuio, i_iov); dmu_xuio_clear(xuio, i_iov); DTRACE_PROBE3(zfs_cp_write, int, i_iov, iovec_t *, aiov, arc_buf_t *, abuf); ASSERT((aiov->iov_base == abuf->b_data) || ((char *)aiov->iov_base - (char *)abuf->b_data + aiov->iov_len == arc_buf_size(abuf))); i_iov++; } else if (n >= max_blksz && woff >= zp->z_size && P2PHASE(woff, max_blksz) == 0 && zp->z_blksz == max_blksz) { /* * This write covers a full block. "Borrow" a buffer * from the dmu so that we can fill it before we enter * a transaction. This avoids the possibility of * holding up the transaction if the data copy hangs * up on a pagefault (e.g., from an NFS server mapping). */ size_t cbytes; abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), max_blksz); ASSERT(abuf != NULL); ASSERT(arc_buf_size(abuf) == max_blksz); if ((error = uiocopy(abuf->b_data, max_blksz, UIO_WRITE, uio, &cbytes))) { dmu_return_arcbuf(abuf); break; } ASSERT(cbytes == max_blksz); } /* * Start a transaction. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); DB_DNODE_ENTER(db); dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, MIN(n, max_blksz)); DB_DNODE_EXIT(db); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); if (abuf != NULL) dmu_return_arcbuf(abuf); break; } /* * If zfs_range_lock() over-locked we grow the blocksize * and then reduce the lock range. This will only happen * on the first iteration since zfs_range_reduce() will * shrink down r_len to the appropriate size. */ if (lr->lr_length == UINT64_MAX) { uint64_t new_blksz; if (zp->z_blksz > max_blksz) { /* * File's blocksize is already larger than the * "recordsize" property. Only let it grow to * the next power of 2. */ ASSERT(!ISP2(zp->z_blksz)); new_blksz = MIN(end_size, 1 << highbit64(zp->z_blksz)); } else { new_blksz = MIN(end_size, max_blksz); } zfs_grow_blocksize(zp, new_blksz, tx); zfs_rangelock_reduce(lr, woff, n); } /* * XXX - should we really limit each write to z_max_blksz? * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); if (woff + nbytes > zp->z_size) vnode_pager_setsize(vp, woff + nbytes); if (abuf == NULL) { tx_bytes = uio->uio_resid; error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, nbytes, tx); tx_bytes -= uio->uio_resid; } else { tx_bytes = nbytes; ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); /* * If this is not a full block write, but we are * extending the file past EOF and this data starts * block-aligned, use assign_arcbuf(). Otherwise, * write via dmu_write(). */ if (tx_bytes < max_blksz && (!write_eof || aiov->iov_base != abuf->b_data)) { ASSERT(xuio); dmu_write(zfsvfs->z_os, zp->z_id, woff, aiov->iov_len, aiov->iov_base, tx); dmu_return_arcbuf(abuf); xuio_stat_wbuf_copied(); } else { ASSERT(xuio || tx_bytes == max_blksz); dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff, abuf, tx); } ASSERT(tx_bytes <= uio->uio_resid); uioskip(uio, tx_bytes); } if (tx_bytes && vn_has_cached_data(vp)) { update_pages(vp, woff, tx_bytes, zfsvfs->z_os, zp->z_id, uio->uio_segflg, tx); } /* * If we made no progress, we're done. If we made even * partial progress, update the znode and ZIL accordingly. */ if (tx_bytes == 0) { (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), (void *)&zp->z_size, sizeof (uint64_t), tx); dmu_tx_commit(tx); ASSERT(error != 0); break; } /* * Clear Set-UID/Set-GID bits on successful write if not * privileged and at least one of the execute bits is set. * * It would be nice to to this after all writes have * been done, but that would still expose the ISUID/ISGID * to another app after the partial write is committed. * * Note: we don't call zfs_fuid_map_id() here because * user 0 is not an ephemeral uid. */ mutex_enter(&zp->z_acl_lock); if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | (S_IXUSR >> 6))) != 0 && (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && secpolicy_vnode_setid_retain(vp, cr, (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) { uint64_t newmode; zp->z_mode &= ~(S_ISUID | S_ISGID); newmode = zp->z_mode; (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), (void *)&newmode, sizeof (uint64_t), tx); } mutex_exit(&zp->z_acl_lock); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); /* * Update the file size (zp_size) if it has changed; * account for possible concurrent updates. */ while ((end_size = zp->z_size) < uio->uio_loffset) { (void) atomic_cas_64(&zp->z_size, end_size, uio->uio_loffset); ASSERT(error == 0 || error == EFAULT); } /* * If we are replaying and eof is non zero then force * the file size to the specified eof. Note, there's no * concurrency during replay. */ if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) zp->z_size = zfsvfs->z_replay_eof; if (error == 0) error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); else (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, NULL, NULL); dmu_tx_commit(tx); if (error != 0) break; ASSERT(tx_bytes == nbytes); n -= nbytes; } zfs_rangelock_exit(lr); /* * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } /* * EFAULT means that at least one page of the source buffer was not * available. VFS will re-try remaining I/O upon this error. */ if (error == EFAULT) { ZFS_EXIT(zfsvfs); return (error); } if (ioflag & (FSYNC | FDSYNC) || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, zp->z_id); nwritten = start_resid - uio->uio_resid; dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); ZFS_EXIT(zfsvfs); return (0); } int zfs_write_simple(znode_t *zp, const void *data, size_t len, loff_t pos, size_t *presid) { int error = 0; ssize_t resid; error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos, UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread); if (error) { return (SET_ERROR(error)); } else if (presid == NULL) { if (resid != 0) { error = SET_ERROR(EIO); } } else { *presid = resid; } return (error); } static void zfs_get_done(zgd_t *zgd, int error) { znode_t *zp = zgd->zgd_private; objset_t *os = zp->z_zfsvfs->z_os; if (zgd->zgd_db) dmu_buf_rele(zgd->zgd_db, zgd); zfs_rangelock_exit(zgd->zgd_lr); /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os))); kmem_free(zgd, sizeof (zgd_t)); } #ifdef ZFS_DEBUG static int zil_fault_io = 0; #endif /* * Get data to generate a TX_WRITE intent log record. */ int zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) { zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; uint64_t object = lr->lr_foid; uint64_t offset = lr->lr_offset; uint64_t size = lr->lr_length; dmu_buf_t *db; zgd_t *zgd; int error = 0; ASSERT3P(lwb, !=, NULL); ASSERT3P(zio, !=, NULL); ASSERT3U(size, !=, 0); /* * Nothing to do if the file has been removed */ if (zfs_zget(zfsvfs, object, &zp) != 0) return (SET_ERROR(ENOENT)); if (zp->z_unlinked) { /* * Release the vnode asynchronously as we currently have the * txg stopped from syncing. */ VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os))); return (SET_ERROR(ENOENT)); } zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); zgd->zgd_lwb = lwb; zgd->zgd_private = zp; /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the * log record (immediate); for large writes it's cheaper to * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ if (offset >= zp->z_size) { error = SET_ERROR(ENOENT); } else { error = dmu_read(os, object, offset, size, buf, DMU_READ_NO_PREFETCH); } ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ /* * Have to lock the whole block to ensure when it's * written out and its checksum is being calculated * that no one can change the data. We need to re-check * blocksize after we get the lock in case it's changed! */ for (;;) { uint64_t blkoff; size = zp->z_blksz; blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; offset -= blkoff; zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset, size, RL_READER); if (zp->z_blksz == size) break; offset += blkoff; zfs_rangelock_exit(zgd->zgd_lr); } /* test for truncation needs to be done while range locked */ if (lr->lr_offset >= zp->z_size) error = SET_ERROR(ENOENT); #ifdef ZFS_DEBUG if (zil_fault_io) { error = SET_ERROR(EIO); zil_fault_io = 0; } #endif if (error == 0) error = dmu_buf_hold(os, object, offset, zgd, &db, DMU_READ_NO_PREFETCH); if (error == 0) { blkptr_t *bp = &lr->lr_blkptr; zgd->zgd_db = db; zgd->zgd_bp = bp; ASSERT(db->db_offset == offset); ASSERT(db->db_size == size); error = dmu_sync(zio, lr->lr_common.lrc_txg, zfs_get_done, zgd); ASSERT(error || lr->lr_length <= size); /* * On success, we need to wait for the write I/O * initiated by dmu_sync() to complete before we can * release this dbuf. We will finish everything up * in the zfs_get_done() callback. */ if (error == 0) return (0); if (error == EALREADY) { lr->lr_common.lrc_txtype = TX_WRITE2; /* * TX_WRITE2 relies on the data previously * written by the TX_WRITE that caused * EALREADY. We zero out the BP because * it is the old, currently-on-disk BP, * so there's no need to zio_flush() its * vdevs (flushing would needlesly hurt * performance, and doesn't work on * indirect vdevs). */ zgd->zgd_bp = NULL; BP_ZERO(bp); error = 0; } } } zfs_get_done(zgd, error); return (error); } /*ARGSUSED*/ static int zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (flag & V_ACE_MASK) error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); else error = zfs_zaccess_rwx(zp, mode, flag, cr); ZFS_EXIT(zfsvfs); return (error); } static int zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp) { int error; *vpp = arg; error = vn_lock(*vpp, lkflags); if (error != 0) vrele(*vpp); return (error); } static int zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags) { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs; int error; int ltype; if (zfsvfs->z_replay == B_FALSE) ASSERT_VOP_LOCKED(dvp, __func__); #ifdef DIAGNOSTIC if ((zdp->z_pflags & ZFS_XATTR) == 0) - VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock)); + VERIFY(!ZFS_TEARDOWN_HELD(zfsvfs)); #endif if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { ASSERT3P(dvp, ==, vp); vref(dvp); ltype = lkflags & LK_TYPE_MASK; if (ltype != VOP_ISLOCKED(dvp)) { if (ltype == LK_EXCLUSIVE) vn_lock(dvp, LK_UPGRADE | LK_RETRY); else /* if (ltype == LK_SHARED) */ vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); /* * Relock for the "." case could leave us with * reclaimed vnode. */ if (VN_IS_DOOMED(dvp)) { vrele(dvp); return (SET_ERROR(ENOENT)); } } return (0); } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { /* * Note that in this case, dvp is the child vnode, and we * are looking up the parent vnode - exactly reverse from * normal operation. Unlocking dvp requires some rather * tricky unlock/relock dance to prevent mp from being freed; * use vn_vget_ino_gen() which takes care of all that. * * XXX Note that there is a time window when both vnodes are * unlocked. It is possible, although highly unlikely, that * during that window the parent-child relationship between * the vnodes may change, for example, get reversed. * In that case we would have a wrong lock order for the vnodes. * All other filesystems seem to ignore this problem, so we * do the same here. * A potential solution could be implemented as follows: * - using LK_NOWAIT when locking the second vnode and retrying * if necessary * - checking that the parent-child relationship still holds * after locking both vnodes and retrying if it doesn't */ error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp); return (error); } else { error = vn_lock(vp, lkflags); if (error != 0) vrele(vp); return (error); } } /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. * * IN: dvp - vnode of directory to search. * nm - name of entry to lookup. * pnp - full pathname to lookup [UNUSED]. * flags - LOOKUP_XATTR set if looking for an attribute. * rdir - root directory vnode [UNUSED]. * cr - credentials of caller. * ct - caller context * * OUT: vpp - vnode of located entry, NULL if not found. * * RETURN: 0 on success, error code on failure. * * Timestamps: * NA */ /* ARGSUSED */ static int zfs_lookup(vnode_t *dvp, const char *nm, vnode_t **vpp, struct componentname *cnp, int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached) { znode_t *zdp = VTOZ(dvp); znode_t *zp; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; int error = 0; /* * Fast path lookup, however we must skip DNLC lookup * for case folding or normalizing lookups because the * DNLC code only stores the passed in name. This means * creating 'a' and removing 'A' on a case insensitive * file system would work, but DNLC still thinks 'a' * exists and won't let you create it again on the next * pass through fast path. */ if (!(flags & LOOKUP_XATTR)) { if (dvp->v_type != VDIR) { return (SET_ERROR(ENOTDIR)); } else if (zdp->z_sa_hdl == NULL) { return (SET_ERROR(EIO)); } } DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, const char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); *vpp = NULL; if (flags & LOOKUP_XATTR) { /* * If the xattr property is off, refuse the lookup request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOPNOTSUPP)); } /* * We don't allow recursive attributes.. * Maybe someday we will. */ if (zdp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) { ZFS_EXIT(zfsvfs); return (error); } *vpp = ZTOV(zp); /* * Do we have permission to get into attribute directory? */ error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr); if (error) { vrele(ZTOV(zp)); } ZFS_EXIT(zfsvfs); return (error); } /* * Check accessibility of directory if we're not coming in via * VOP_CACHEDLOOKUP. */ if (!cached) { #ifdef NOEXECCHECK if ((cnp->cn_flags & NOEXECCHECK) != 0) { cnp->cn_flags &= ~NOEXECCHECK; } else #endif if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } } if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * First handle the special cases. */ if ((cnp->cn_flags & ISDOTDOT) != 0) { /* * If we are a snapshot mounted under .zfs, return * the vp for the snapshot directory. */ if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) { struct componentname cn; vnode_t *zfsctl_vp; int ltype; ZFS_EXIT(zfsvfs); ltype = VOP_ISLOCKED(dvp); VOP_UNLOCK1(dvp); error = zfsctl_root(zfsvfs->z_parent, LK_SHARED, &zfsctl_vp); if (error == 0) { cn.cn_nameptr = "snapshot"; cn.cn_namelen = strlen(cn.cn_nameptr); cn.cn_nameiop = cnp->cn_nameiop; cn.cn_flags = cnp->cn_flags & ~ISDOTDOT; cn.cn_lkflags = cnp->cn_lkflags; error = VOP_LOOKUP(zfsctl_vp, vpp, &cn); vput(zfsctl_vp); } vn_lock(dvp, ltype | LK_RETRY); return (error); } } if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) { ZFS_EXIT(zfsvfs); if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP) return (SET_ERROR(ENOTSUP)); error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp); return (error); } /* * The loop is retry the lookup if the parent-child relationship * changes during the dot-dot locking complexities. */ for (;;) { uint64_t parent; error = zfs_dirlook(zdp, nm, &zp); if (error == 0) *vpp = ZTOV(zp); ZFS_EXIT(zfsvfs); if (error != 0) break; error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags); if (error != 0) { /* * If we've got a locking error, then the vnode * got reclaimed because of a force unmount. * We never enter doomed vnodes into the name cache. */ *vpp = NULL; return (error); } if ((cnp->cn_flags & ISDOTDOT) == 0) break; ZFS_ENTER(zfsvfs); if (zdp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); } else { error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent)); } if (error != 0) { ZFS_EXIT(zfsvfs); vput(ZTOV(zp)); break; } if (zp->z_id == parent) { ZFS_EXIT(zfsvfs); break; } vput(ZTOV(zp)); } if (error != 0) *vpp = NULL; /* Translate errors and add SAVENAME when needed. */ if (cnp->cn_flags & ISLASTCN) { switch (nameiop) { case CREATE: case RENAME: if (error == ENOENT) { error = EJUSTRETURN; cnp->cn_flags |= SAVENAME; break; } /* FALLTHROUGH */ case DELETE: if (error == 0) cnp->cn_flags |= SAVENAME; break; } } /* Insert name into cache (as non-existent) if appropriate. */ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(dvp, NULL, cnp); /* Insert name into cache if appropriate. */ if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && error == 0 && (cnp->cn_flags & MAKEENTRY)) { if (!(cnp->cn_flags & ISLASTCN) || (nameiop != DELETE && nameiop != RENAME)) { cache_enter(dvp, *vpp, cnp); } } return (error); } /* * Attempt to create a new entry in a directory. If the entry * already exists, truncate the file if permissible, else return * an error. Return the vp of the created or trunc'd file. * * IN: dvp - vnode of directory to put new file entry in. * name - name of new file entry. * vap - attributes of new file. * excl - flag indicating exclusive or non-exclusive mode. * mode - mode to open file with. * cr - credentials of caller. * flag - large file flag [UNUSED]. * ct - caller context * vsecp - ACL to be set * * OUT: vpp - vnode of created or trunc'd entry. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated if new entry created * vp - ctime|mtime always, atime if new */ /* ARGSUSED */ int zfs_create(znode_t *dzp, const char *name, vattr_t *vap, int excl, int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) { znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; objset_t *os; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); uint64_t projid = ZFS_DEFAULT_PROJID; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype; #ifdef DEBUG_VFS_LOCKS vnode_t *dvp = ZTOV(dzp); #endif /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && (vsecp || (vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); os = zfsvfs->z_os; zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } *zpp = NULL; if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) vap->va_mode &= ~S_ISVTX; error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); /* * Create a new file object and update the directory * to reference it. */ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { goto out; } /* * We only support the creation of regular files in * extended attribute directories. */ if ((dzp->z_pflags & ZFS_XATTR) && (vap->va_type != VREG)) { error = SET_ERROR(EINVAL); goto out; } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids)) != 0) goto out; if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) projid = zfs_inherit_projid(dzp); if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { zfs_acl_ids_free(&acl_ids); error = SET_ERROR(EDQUOT); goto out; } getnewvnode_reserve_(); tx = dmu_tx_create(os); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); (void) zfs_link_create(dzp, name, zp, tx, ZNEW); txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, name, vsecp, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); out: VNCHECKREF(dvp); if (error == 0) { *zpp = zp; } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Remove an entry from a directory. * * IN: dvp - vnode of directory to remove entry from. * name - name of entry to remove. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime * vp - ctime (if nlink > 0) */ /*ARGSUSED*/ static int zfs_remove_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp; znode_t *xzp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t xattr_obj; uint64_t obj = 0; dmu_tx_t *tx; boolean_t unlinked; uint64_t txtype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zp = VTOZ(vp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; xattr_obj = 0; xzp = NULL; if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } /* * Need to use rmdir for removing directories. */ if (vp->v_type == VDIR) { error = SET_ERROR(EPERM); goto out; } vnevent_remove(vp, dvp, name, ct); obj = zp->z_id; /* are there any extended attributes? */ error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (error == 0 && xattr_obj) { error = zfs_zget(zfsvfs, xattr_obj, &xzp); ASSERT0(error); } /* * We may delete the znode now, or we may put it in the unlinked set; * it depends on whether we're the last link, and on whether there are * other holds on the vnode. So we dmu_tx_hold() the right things to * allow for either case. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); if (xzp) { dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); } /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); /* * Mark this transaction as typically resulting in a net free of space */ dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } /* * Remove the directory entry. */ error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); if (error) { dmu_tx_commit(tx); goto out; } if (unlinked) { zfs_unlinked_add(zp, tx); vp->v_vflag |= VV_NOSYNC; } /* XXX check changes to linux vnops */ txtype = TX_REMOVE; zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); dmu_tx_commit(tx); out: if (xzp) vrele(ZTOV(xzp)); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int zfs_lookup_internal(znode_t *dzp, const char *name, vnode_t **vpp, struct componentname *cnp, int nameiop) { zfsvfs_t *zfsvfs = dzp->z_zfsvfs; int error; cnp->cn_nameptr = __DECONST(char *, name); cnp->cn_namelen = strlen(name); cnp->cn_nameiop = nameiop; cnp->cn_flags = ISLASTCN | SAVENAME; cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; cnp->cn_cred = kcred; cnp->cn_thread = curthread; if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) { struct vop_lookup_args a; a.a_gen.a_desc = &vop_lookup_desc; a.a_dvp = ZTOV(dzp); a.a_vpp = vpp; a.a_cnp = cnp; error = vfs_cache_lookup(&a); } else { error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, curthread, 0, B_FALSE); } #ifdef ZFS_DEBUG if (error) { printf("got error %d on name %s on op %d\n", error, name, nameiop); kdb_backtrace(); } #endif return (error); } int zfs_remove(znode_t *dzp, const char *name, cred_t *cr, int flags) { vnode_t *vp; int error; struct componentname cn; if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) return (error); error = zfs_remove_(ZTOV(dzp), vp, name, cr); vput(vp); return (error); } /* * Create a new directory and insert it into dvp using the name * provided. Return a pointer to the inserted directory. * * IN: dvp - vnode of directory to add subdir to. * dirname - name of new directory. * vap - attributes of new directory. * cr - credentials of caller. * ct - caller context * flags - case flags * vsecp - ACL to be set * * OUT: vpp - vnode of created directory. * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated * vp - ctime|mtime|atime updated */ /*ARGSUSED*/ int zfs_mkdir(znode_t *dzp, const char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp) { znode_t *zp; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t txtype; dmu_tx_t *tx; int error; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); /* * If we have an ephemeral id, ACL, or XVATTR then * make sure file system is at proper version */ ksid = crgetsid(cr, KSID_OWNER); if (ksid) uid = ksid_getid(ksid); else uid = crgetuid(cr); if (zfsvfs->z_use_fuids == B_FALSE && ((vap->va_mask & AT_XVATTR) || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (dzp->z_pflags & ZFS_XATTR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (zfsvfs->z_utf8 && u8_validate(dirname, strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (vap->va_mask & AT_XVATTR) { if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, crgetuid(cr), cr, vap->va_type)) != 0) { ZFS_EXIT(zfsvfs); return (error); } } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * First make sure the new directory doesn't exist. * * Existence is checked first to make sure we don't return * EACCES instead of EEXIST which can cause some applications * to fail. */ *zpp = NULL; if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } ASSERT3P(zp, ==, NULL); if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } /* * Add a new entry to the directory. */ getnewvnode_reserve_(); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create new node. */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); *zpp = zp; txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, acl_ids.z_fuidp, vap); zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (0); } #if __FreeBSD_version < 1300124 static void cache_vop_rmdir(struct vnode *dvp, struct vnode *vp) { cache_purge(dvp); cache_purge(vp); } #endif /* * Remove a directory subdir entry. If the current working * directory is the same as the subdir to be removed, the * remove will fail. * * IN: dvp - vnode of directory to remove from. * name - name of directory to be removed. * cwd - vnode of current working directory. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rmdir_(vnode_t *dvp, vnode_t *vp, const char *name, cred_t *cr) { znode_t *dzp = VTOZ(dvp); znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; if ((error = zfs_zaccess_delete(dzp, zp, cr))) { goto out; } if (vp->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto out; } vnevent_rmdir(vp, dvp, name, ct); tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); zfs_sa_upgrade_txholds(tx, zp); zfs_sa_upgrade_txholds(tx, dzp); dmu_tx_mark_netfree(tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); if (error == 0) { uint64_t txtype = TX_RMDIR; zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, B_FALSE); } dmu_tx_commit(tx); cache_vop_rmdir(dvp, vp); out: if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } int zfs_rmdir(znode_t *dzp, const char *name, znode_t *cwd, cred_t *cr, int flags) { struct componentname cn; vnode_t *vp; int error; if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) return (error); error = zfs_rmdir_(ZTOV(dzp), vp, name, cr); vput(vp); return (error); } /* * Read as many directory entries as will fit into the provided * buffer from the given directory cursor position (specified in * the uio structure). * * IN: vp - vnode of directory to read. * uio - structure supplying read location, range info, * and return buffer. * cr - credentials of caller. * ct - caller context * flags - case flags * * OUT: uio - updated offset and range, buffer filled. * eofp - set to true if end-of-file detected. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated * * Note that the low 4 bits of the cookie returned by zap is always zero. * This allows us to use the low range for "special" directory entries: * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, * we use the offset 2 for the '.zfs' directory. */ /* ARGSUSED */ static int zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, int *ncookies, ulong_t **cookies) { znode_t *zp = VTOZ(vp); iovec_t *iovp; edirent_t *eodp; dirent64_t *odp; zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os; caddr_t outbuf; size_t bufsize; zap_cursor_t zc; zap_attribute_t zap; uint_t bytes_wanted; uint64_t offset; /* must be unsigned; checks for < 1 */ uint64_t parent; int local_eof; int outcount; int error; uint8_t prefetch; boolean_t check_sysattrs; uint8_t type; int ncooks; ulong_t *cooks = NULL; int flags = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If we are not given an eof variable, * use a local one. */ if (eofp == NULL) eofp = &local_eof; /* * Check for valid iov_len. */ if (uio->uio_iov->iov_len <= 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Quit if directory has been removed (posix) */ if ((*eofp = zp->z_unlinked) != 0) { ZFS_EXIT(zfsvfs); return (0); } error = 0; os = zfsvfs->z_os; offset = uio->uio_loffset; prefetch = zp->z_zn_prefetch; /* * Initialize the iterator cursor. */ if (offset <= 3) { /* * Start iteration from the beginning of the directory. */ zap_cursor_init(&zc, os, zp->z_id); } else { /* * The offset is a serialized cursor. */ zap_cursor_init_serialized(&zc, os, zp->z_id, offset); } /* * Get space to change directory entries into fs independent format. */ iovp = uio->uio_iov; bytes_wanted = iovp->iov_len; if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { bufsize = bytes_wanted; outbuf = kmem_alloc(bufsize, KM_SLEEP); odp = (struct dirent64 *)outbuf; } else { bufsize = bytes_wanted; outbuf = NULL; odp = (struct dirent64 *)iovp->iov_base; } eodp = (struct edirent *)odp; if (ncookies != NULL) { /* * Minimum entry size is dirent size and 1 byte for a file name. */ ncooks = uio->uio_resid / (sizeof (struct dirent) - sizeof (((struct dirent *)NULL)->d_name) + 1); cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK); *cookies = cooks; *ncookies = ncooks; } /* * If this VFS supports the system attribute view interface; and * we're looking at an extended attribute directory; and we care * about normalization conflicts on this vfs; then we must check * for normalization conflicts with the sysattr name space. */ #ifdef TODO check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && (flags & V_RDDIR_ENTFLAGS); #else check_sysattrs = 0; #endif /* * Transform to file-system independent format */ outcount = 0; while (outcount < bytes_wanted) { ino64_t objnum; ushort_t reclen; off64_t *next = NULL; /* * Special case `.', `..', and `.zfs'. */ if (offset == 0) { (void) strcpy(zap.za_name, "."); zap.za_normalization_conflict = 0; objnum = zp->z_id; type = DT_DIR; } else if (offset == 1) { (void) strcpy(zap.za_name, ".."); zap.za_normalization_conflict = 0; objnum = parent; type = DT_DIR; } else if (offset == 2 && zfs_show_ctldir(zp)) { (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); zap.za_normalization_conflict = 0; objnum = ZFSCTL_INO_ROOT; type = DT_DIR; } else { /* * Grab next entry. */ if ((error = zap_cursor_retrieve(&zc, &zap))) { if ((*eofp = (error == ENOENT)) != 0) break; else goto update; } if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { cmn_err(CE_WARN, "zap_readdir: bad directory " "entry, obj = %lld, offset = %lld\n", (u_longlong_t)zp->z_id, (u_longlong_t)offset); error = SET_ERROR(ENXIO); goto update; } objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); /* * MacOS X can extract the object type here such as: * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); */ type = ZFS_DIRENT_TYPE(zap.za_first_integer); if (check_sysattrs && !zap.za_normalization_conflict) { #ifdef TODO zap.za_normalization_conflict = xattr_sysattr_casechk(zap.za_name); #else panic("%s:%u: TODO", __func__, __LINE__); #endif } } if (flags & V_RDDIR_ACCFILTER) { /* * If we have no access at all, don't include * this entry in the returned information */ znode_t *ezp; if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) goto skip_entry; if (!zfs_has_access(ezp, cr)) { vrele(ZTOV(ezp)); goto skip_entry; } vrele(ZTOV(ezp)); } if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else reclen = DIRENT64_RECLEN(strlen(zap.za_name)); /* * Will this entry fit in the buffer? */ if (outcount + reclen > bufsize) { /* * Did we manage to fit anything in the buffer? */ if (!outcount) { error = SET_ERROR(EINVAL); goto update; } break; } if (flags & V_RDDIR_ENTFLAGS) { /* * Add extended flag entry: */ eodp->ed_ino = objnum; eodp->ed_reclen = reclen; /* NOTE: ed_off is the offset for the *next* entry */ next = &(eodp->ed_off); eodp->ed_eflags = zap.za_normalization_conflict ? ED_CASE_CONFLICT : 0; (void) strncpy(eodp->ed_name, zap.za_name, EDIRENT_NAMELEN(reclen)); eodp = (edirent_t *)((intptr_t)eodp + reclen); } else { /* * Add normal entry: */ odp->d_ino = objnum; odp->d_reclen = reclen; odp->d_namlen = strlen(zap.za_name); /* NOTE: d_off is the offset for the *next* entry. */ next = &odp->d_off; strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); odp->d_type = type; dirent_terminate(odp); odp = (dirent64_t *)((intptr_t)odp + reclen); } outcount += reclen; ASSERT(outcount <= bufsize); /* Prefetch znode */ if (prefetch) dmu_prefetch(os, objnum, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); skip_entry: /* * Move to the next entry, fill in the previous offset. */ if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { zap_cursor_advance(&zc); offset = zap_cursor_serialize(&zc); } else { offset += 1; } /* Fill the offset right after advancing the cursor. */ if (next != NULL) *next = offset; if (cooks != NULL) { *cooks++ = offset; ncooks--; KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); } } zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ /* Subtract unused cookies */ if (ncookies != NULL) *ncookies -= ncooks; if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { iovp->iov_base += outcount; iovp->iov_len -= outcount; uio->uio_resid -= outcount; } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, uio))) { /* * Reset the pointer. */ offset = uio->uio_loffset; } update: zap_cursor_fini(&zc); if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) kmem_free(outbuf, bufsize); if (error == ENOENT) error = 0; ZFS_ACCESSTIME_STAMP(zfsvfs, zp); uio->uio_loffset = offset; ZFS_EXIT(zfsvfs); if (error != 0 && cookies != NULL) { free(*cookies, M_TEMP); *cookies = NULL; *ncookies = 0; } return (error); } ulong_t zfs_fsync_sync_cnt = 4; static int zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); } tsd_set(zfs_fsyncer_key, NULL); return (0); } /* * Get the requested file attributes and place them in the provided * vattr structure. * * IN: vp - vnode of file. * vap - va_mask identifies requested attributes. * If AT_XVATTR set, then optional attrs are requested * flags - ATTR_NOACLCHECK (CIFS server context) * cr - credentials of caller. * * OUT: vap - attribute values. * * RETURN: 0 (always succeeds). */ /* ARGSUSED */ static int zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error = 0; uint32_t blksize; u_longlong_t nblocks; uint64_t mtime[2], ctime[2], crtime[2], rdev; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; sa_bulk_attr_t bulk[4]; int count = 0; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); if (vp->v_type == VBLK || vp->v_type == VCHR) SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should * always be allowed to read basic attributes of file. */ if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && (vap->va_uid != crgetuid(cr))) { if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr))) { ZFS_EXIT(zfsvfs); return (error); } } /* * Return all attributes. It's cheaper to provide the answer * than to determine whether we were asked the question. */ vap->va_type = IFTOVT(zp->z_mode); vap->va_mode = zp->z_mode & ~S_IFMT; vn_fsid(vp, vap); vap->va_nodeid = zp->z_id; vap->va_nlink = zp->z_links; if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && zp->z_links < ZFS_LINK_MAX) vap->va_nlink++; vap->va_size = zp->z_size; if (vp->v_type == VBLK || vp->v_type == VCHR) vap->va_rdev = zfs_cmpldev(rdev); vap->va_seq = zp->z_seq; vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ vap->va_filerev = zp->z_seq; /* * Add in any requested optional attributes and the create time. * Also set the corresponding bits in the returned attribute bitmap. */ if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { xoap->xoa_archive = ((zp->z_pflags & ZFS_ARCHIVE) != 0); XVA_SET_RTN(xvap, XAT_ARCHIVE); } if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { xoap->xoa_readonly = ((zp->z_pflags & ZFS_READONLY) != 0); XVA_SET_RTN(xvap, XAT_READONLY); } if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { xoap->xoa_system = ((zp->z_pflags & ZFS_SYSTEM) != 0); XVA_SET_RTN(xvap, XAT_SYSTEM); } if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { xoap->xoa_hidden = ((zp->z_pflags & ZFS_HIDDEN) != 0); XVA_SET_RTN(xvap, XAT_HIDDEN); } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { xoap->xoa_nounlink = ((zp->z_pflags & ZFS_NOUNLINK) != 0); XVA_SET_RTN(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { xoap->xoa_immutable = ((zp->z_pflags & ZFS_IMMUTABLE) != 0); XVA_SET_RTN(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { xoap->xoa_appendonly = ((zp->z_pflags & ZFS_APPENDONLY) != 0); XVA_SET_RTN(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { xoap->xoa_nodump = ((zp->z_pflags & ZFS_NODUMP) != 0); XVA_SET_RTN(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { xoap->xoa_opaque = ((zp->z_pflags & ZFS_OPAQUE) != 0); XVA_SET_RTN(xvap, XAT_OPAQUE); } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { xoap->xoa_av_quarantined = ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { xoap->xoa_av_modified = ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); XVA_SET_RTN(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && vp->v_type == VREG) { zfs_sa_get_scanstamp(zp, xvap); } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); XVA_SET_RTN(xvap, XAT_REPARSE); } if (XVA_ISSET_REQ(xvap, XAT_GEN)) { xoap->xoa_generation = zp->z_gen; XVA_SET_RTN(xvap, XAT_GEN); } if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { xoap->xoa_offline = ((zp->z_pflags & ZFS_OFFLINE) != 0); XVA_SET_RTN(xvap, XAT_OFFLINE); } if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { xoap->xoa_sparse = ((zp->z_pflags & ZFS_SPARSE) != 0); XVA_SET_RTN(xvap, XAT_SPARSE); } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { xoap->xoa_projinherit = ((zp->z_pflags & ZFS_PROJINHERIT) != 0); XVA_SET_RTN(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { xoap->xoa_projid = zp->z_projid; XVA_SET_RTN(xvap, XAT_PROJID); } } ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); ZFS_TIME_DECODE(&vap->va_mtime, mtime); ZFS_TIME_DECODE(&vap->va_ctime, ctime); ZFS_TIME_DECODE(&vap->va_birthtime, crtime); sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); vap->va_blksize = blksize; vap->va_bytes = nblocks << 9; /* nblocks * 512 */ if (zp->z_blksz == 0) { /* * Block size hasn't been set; suggest maximal I/O transfers. */ vap->va_blksize = zfsvfs->z_max_blksz; } ZFS_EXIT(zfsvfs); return (0); } /* * Set the file attributes to the values contained in the * vattr structure. * * IN: zp - znode of file to be modified. * vap - new attribute values. * If AT_XVATTR set, then optional attrs are being set * flags - ATTR_UTIME set if non-default time values provided. * - ATTR_NOACLCHECK (CIFS context only). * cr - credentials of caller. * ct - caller context * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - ctime updated, mtime updated if size changed. */ /* ARGSUSED */ int zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) { vnode_t *vp = ZTOV(zp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zfsvfs->z_os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask = 0; uint64_t saved_mode; int trim_mask = 0; uint64_t new_mode; uint64_t new_uid, new_gid; uint64_t xattr_obj; uint64_t mtime[2], ctime[2]; uint64_t projid = ZFS_INVALID_PROJID; znode_t *attrzp; int need_policy = FALSE; int err, err2; zfs_fuid_info_t *fuidp = NULL; xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ xoptattr_t *xoap; zfs_acl_t *aclp; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; boolean_t fuid_dirtied = B_FALSE; sa_bulk_attr_t bulk[7], xattr_bulk[7]; int count = 0, xattr_count = 0; if (mask == 0) return (0); if (mask & AT_NOSET) return (SET_ERROR(EINVAL)); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); zilog = zfsvfs->z_log; /* * Make sure that if we have ephemeral uid/gid or xvattr specified * that file system is at proper version level */ if (zfsvfs->z_use_fuids == B_FALSE && (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || (mask & AT_XVATTR))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (mask & AT_SIZE && vp->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EISDIR)); } if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * If this is an xvattr_t, then get a pointer to the structure of * optional attributes. If this is NULL, then we have a vattr_t. */ xoap = xva_getxoptattr(xvap); xva_init(&tmpxvattr); /* * Immutable files can only alter immutable bit and atime */ if ((zp->z_pflags & ZFS_IMMUTABLE) && ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* * Note: ZFS_READONLY is handled in zfs_zaccess_common. */ /* * Verify timestamps doesn't overflow 32 bits. * ZFS can handle large timestamps, but 32bit syscalls can't * handle times greater than 2039. This check should be removed * once large timestamps are fully supported. */ if (mask & (AT_ATIME | AT_MTIME)) { if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } } if (xoap != NULL && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) && TIMESPEC_OVERFLOW(&vap->va_birthtime)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOVERFLOW)); } if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { if (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOPNOTSUPP)); } projid = xoap->xoa_projid; if (unlikely(projid == ZFS_INVALID_PROJID)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) projid = ZFS_INVALID_PROJID; else need_policy = TRUE; } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && (!dmu_objset_projectquota_enabled(os) || (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EOPNOTSUPP)); } } attrzp = NULL; aclp = NULL; if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } /* * First validate permissions */ if (mask & AT_SIZE) { /* * XXX - Note, we are not providing any open * mode flags here (like FNDELAY), so we may * block if there are locks present... this * should be addressed in openat(). */ /* XXX - would it be OK to generate a log record here? */ err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); if (err) { ZFS_EXIT(zfsvfs); return (err); } } if (mask & (AT_ATIME|AT_MTIME) || ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || XVA_ISSET_REQ(xvap, XAT_READONLY) || XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || XVA_ISSET_REQ(xvap, XAT_OFFLINE) || XVA_ISSET_REQ(xvap, XAT_SPARSE) || XVA_ISSET_REQ(xvap, XAT_CREATETIME) || XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, skipaclchk, cr); } if (mask & (AT_UID|AT_GID)) { int idmask = (mask & (AT_UID|AT_GID)); int take_owner; int take_group; /* * NOTE: even if a new mode is being set, * we may clear S_ISUID/S_ISGID bits. */ if (!(mask & AT_MODE)) vap->va_mode = zp->z_mode; /* * Take ownership or chgrp to group we are a member of */ take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); take_group = (mask & AT_GID) && zfs_groupmember(zfsvfs, vap->va_gid, cr); /* * If both AT_UID and AT_GID are set then take_owner and * take_group must both be set in order to allow taking * ownership. * * Otherwise, send the check through secpolicy_vnode_setattr() * */ if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || ((idmask == AT_UID) && take_owner) || ((idmask == AT_GID) && take_group)) { if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, skipaclchk, cr) == 0) { /* * Remove setuid/setgid for non-privileged users */ secpolicy_setid_clear(vap, vp, cr); trim_mask = (mask & (AT_UID|AT_GID)); } else { need_policy = TRUE; } } else { need_policy = TRUE; } } oldva.va_mode = zp->z_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { /* * Update xvattr mask to include only those attributes * that are actually changing. * * the bits will be restored prior to actually setting * the attributes so the caller thinks they were set. */ if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { if (xoap->xoa_appendonly != ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_APPENDONLY); XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); } } if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { if (xoap->xoa_projinherit != ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_PROJINHERIT); XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT); } } if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { if (xoap->xoa_nounlink != ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NOUNLINK); XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } } if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { if (xoap->xoa_immutable != ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_IMMUTABLE); XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); } } if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { if (xoap->xoa_nodump != ((zp->z_pflags & ZFS_NODUMP) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_NODUMP); XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); } } if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { if (xoap->xoa_av_modified != ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); } } if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { if ((vp->v_type != VREG && xoap->xoa_av_quarantined) || xoap->xoa_av_quarantined != ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { need_policy = TRUE; } else { XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); } } if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (need_policy == FALSE && (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { need_policy = TRUE; } } if (mask & AT_MODE) { if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { err = secpolicy_setid_setsticky_clear(vp, vap, &oldva, cr); if (err) { ZFS_EXIT(zfsvfs); return (err); } trim_mask |= AT_MODE; } else { need_policy = TRUE; } } if (need_policy) { /* * If trim_mask is set then take ownership * has been granted or write_acl is present and user * has the ability to modify mode. In that case remove * UID|GID and or MODE from mask so that * secpolicy_vnode_setattr() doesn't revoke it. */ if (trim_mask) { saved_mask = vap->va_mask; vap->va_mask &= ~trim_mask; if (trim_mask & AT_MODE) { /* * Save the mode, as secpolicy_vnode_setattr() * will overwrite it with ova.va_mode. */ saved_mode = vap->va_mode; } } err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); if (err) { ZFS_EXIT(zfsvfs); return (err); } if (trim_mask) { vap->va_mask |= saved_mask; if (trim_mask & AT_MODE) { /* * Recover the mode after * secpolicy_vnode_setattr(). */ vap->va_mode = saved_mode; } } } /* * secpolicy_vnode_setattr, or take ownership may have * changed va_mask */ mask = vap->va_mask; if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) { err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xattr_obj, sizeof (xattr_obj)); if (err == 0 && xattr_obj) { err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); if (err == 0) { err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); if (err != 0) vrele(ZTOV(attrzp)); } if (err) goto out2; } if (mask & AT_UID) { new_uid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); if (new_uid != zp->z_uid && zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, new_uid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (mask & AT_GID) { new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp); if (new_gid != zp->z_gid && zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, new_gid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } if (projid != ZFS_INVALID_PROJID && zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { if (attrzp) vput(ZTOV(attrzp)); err = SET_ERROR(EDQUOT); goto out2; } } tx = dmu_tx_create(os); if (mask & AT_MODE) { uint64_t pmode = zp->z_mode; uint64_t acl_obj; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { err = SET_ERROR(EPERM); goto out; } if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) goto out; if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { /* * Are we upgrading ACL from old V0 format * to V1 format? */ if (zfsvfs->z_version >= ZPL_VERSION_FUID && zfs_znode_acl_version(zp) == ZFS_ACL_VERSION_INITIAL) { dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } else { dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); } } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); } else { if (((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID))) dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); else dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); } if (attrzp) { dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); } fuid_dirtied = zfsvfs->z_fuid_dirty; if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err) goto out; count = 0; /* * Set each attribute requested. * We group settings according to the locks they need to acquire. * * Note: you cannot set ctime directly, although it will be * updated as a side-effect of calling this function. */ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { /* * For the existed object that is upgraded from old system, * its on-disk layout has no slot for the project ID attribute. * But quota accounting logic needs to access related slots by * offset directly. So we need to adjust old objects' layout * to make the project ID to some unified and fixed offset. */ if (attrzp) err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); if (err == 0) err = sa_add_projid(zp->z_sa_hdl, tx, projid); if (unlikely(err == EEXIST)) err = 0; else if (err != 0) goto out; else projid = ZFS_INVALID_PROJID; } if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&zp->z_acl_lock); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, sizeof (zp->z_pflags)); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_enter(&attrzp->z_acl_lock); SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, sizeof (attrzp->z_pflags)); if (projid != ZFS_INVALID_PROJID) { attrzp->z_projid = projid; SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, sizeof (attrzp->z_projid)); } } if (mask & (AT_UID|AT_GID)) { if (mask & AT_UID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); zp->z_uid = new_uid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_UID(zfsvfs), NULL, &new_uid, sizeof (new_uid)); attrzp->z_uid = new_uid; } } if (mask & AT_GID) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); zp->z_gid = new_gid; if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_GID(zfsvfs), NULL, &new_gid, sizeof (new_gid)); attrzp->z_gid = new_gid; } } if (!(mask & AT_MODE)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); new_mode = zp->z_mode; } err = zfs_acl_chown_setattr(zp); ASSERT(err == 0); if (attrzp) { err = zfs_acl_chown_setattr(attrzp); ASSERT(err == 0); } } if (mask & AT_MODE) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &new_mode, sizeof (new_mode)); zp->z_mode = new_mode; ASSERT3U((uintptr_t)aclp, !=, 0); err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT0(err); if (zp->z_acl_cached) zfs_acl_free(zp->z_acl_cached); zp->z_acl_cached = aclp; aclp = NULL; } if (mask & AT_ATIME) { ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &zp->z_atime, sizeof (zp->z_atime)); } if (mask & AT_MTIME) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); } if (projid != ZFS_INVALID_PROJID) { zp->z_projid = projid; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, sizeof (zp->z_projid)); } /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? */ if (mask & AT_SIZE && !(mask & AT_MTIME)) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); } else if (mask != 0) { SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); if (attrzp) { SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, sizeof (ctime)); zfs_tstamp_update_setup(attrzp, STATE_CHANGED, mtime, ctime); } } /* * Do this after setting timestamps to prevent timestamp * update from toggling bit */ if (xoap && (mask & AT_XVATTR)) { if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) xoap->xoa_createtime = vap->va_birthtime; /* * restore trimmed off masks * so that return masks can be set for caller. */ if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { XVA_SET_REQ(xvap, XAT_APPENDONLY); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { XVA_SET_REQ(xvap, XAT_NOUNLINK); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { XVA_SET_REQ(xvap, XAT_IMMUTABLE); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { XVA_SET_REQ(xvap, XAT_NODUMP); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { XVA_SET_REQ(xvap, XAT_AV_MODIFIED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); } if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) { XVA_SET_REQ(xvap, XAT_PROJINHERIT); } if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ASSERT(vp->v_type == VREG); zfs_xvattr_set(zp, xvap, tx); } if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&zp->z_acl_lock); if (attrzp) { if (mask & (AT_UID|AT_GID|AT_MODE)) mutex_exit(&attrzp->z_acl_lock); } out: if (err == 0 && attrzp) { err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, xattr_count, tx); ASSERT(err2 == 0); } if (attrzp) vput(ZTOV(attrzp)); if (aclp) zfs_acl_free(aclp); if (fuidp) { zfs_fuid_info_free(fuidp); fuidp = NULL; } if (err) { dmu_tx_abort(tx); } else { err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); dmu_tx_commit(tx); } out2: if (os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (err); } /* * We acquire all but fdvp locks using non-blocking acquisitions. If we * fail to acquire any lock in the path we will drop all held locks, * acquire the new lock in a blocking fashion, and then release it and * restart the rename. This acquire/release step ensures that we do not * spin on a lock waiting for release. On error release all vnode locks * and decrement references the way tmpfs_rename() would do. */ static int zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, struct vnode *tdvp, struct vnode **tvpp, const struct componentname *scnp, const struct componentname *tcnp) { zfsvfs_t *zfsvfs; struct vnode *nvp, *svp, *tvp; znode_t *sdzp, *tdzp, *szp, *tzp; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error; VOP_UNLOCK1(tdvp); if (*tvpp != NULL && *tvpp != tdvp) VOP_UNLOCK1(*tvpp); relock: error = vn_lock(sdvp, LK_EXCLUSIVE); if (error) goto out; sdzp = VTOZ(sdvp); error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK1(sdvp); if (error != EBUSY) goto out; error = vn_lock(tdvp, LK_EXCLUSIVE); if (error) goto out; VOP_UNLOCK1(tdvp); goto relock; } tdzp = VTOZ(tdvp); /* * Before using sdzp and tdzp we must ensure that they are live. * As a porting legacy from illumos we have two things to worry * about. One is typical for FreeBSD and it is that the vnode is * not reclaimed (doomed). The other is that the znode is live. * The current code can invalidate the znode without acquiring the * corresponding vnode lock if the object represented by the znode * and vnode is no longer valid after a rollback or receive operation. * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock * that protects the znodes from the invalidation. */ zfsvfs = sdzp->z_zfsvfs; ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); ZFS_ENTER(zfsvfs); /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { ZFS_EXIT(zfsvfs); VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); error = SET_ERROR(EIO); goto out; } /* * Re-resolve svp to be certain it still exists and fetch the * correct vnode. */ error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); if (error != 0) { /* Source entry invalid or not there. */ ZFS_EXIT(zfsvfs); VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); if ((scnp->cn_flags & ISDOTDOT) != 0 || (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) error = SET_ERROR(EINVAL); goto out; } svp = ZTOV(szp); /* * Re-resolve tvp, if it disappeared we just carry on. */ error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); if (error != 0) { ZFS_EXIT(zfsvfs); VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); vrele(svp); if ((tcnp->cn_flags & ISDOTDOT) != 0) error = SET_ERROR(EINVAL); goto out; } if (tzp != NULL) tvp = ZTOV(tzp); else tvp = NULL; /* * At present the vnode locks must be acquired before z_teardown_lock, * although it would be more logical to use the opposite order. */ ZFS_EXIT(zfsvfs); /* * Now try acquire locks on svp and tvp. */ nvp = svp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); if (tvp != NULL) vrele(tvp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } VOP_UNLOCK1(nvp); /* * Concurrent rename race. * XXX ? */ if (nvp == tdvp) { vrele(nvp); error = SET_ERROR(EINVAL); goto out; } vrele(*svpp); *svpp = nvp; goto relock; } vrele(*svpp); *svpp = nvp; if (*tvpp != NULL) vrele(*tvpp); *tvpp = NULL; if (tvp != NULL) { nvp = tvp; error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); if (error != 0) { VOP_UNLOCK1(sdvp); VOP_UNLOCK1(tdvp); VOP_UNLOCK1(*svpp); if (error != EBUSY) { vrele(nvp); goto out; } error = vn_lock(nvp, LK_EXCLUSIVE); if (error != 0) { vrele(nvp); goto out; } vput(nvp); goto relock; } *tvpp = nvp; } return (0); out: return (error); } /* * Note that we must use VRELE_ASYNC in this function as it walks * up the directory tree and vrele may need to acquire an exclusive * lock if a last reference to a vnode is dropped. */ static int zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) { zfsvfs_t *zfsvfs; znode_t *zp, *zp1; uint64_t parent; int error; zfsvfs = tdzp->z_zfsvfs; if (tdzp == szp) return (SET_ERROR(EINVAL)); if (tdzp == sdzp) return (0); if (tdzp->z_id == zfsvfs->z_root) return (0); zp = tdzp; for (;;) { ASSERT(!zp->z_unlinked); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) break; if (parent == szp->z_id) { error = SET_ERROR(EINVAL); break; } if (parent == zfsvfs->z_root) break; if (parent == sdzp->z_id) break; error = zfs_zget(zfsvfs, parent, &zp1); if (error != 0) break; if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os))); zp = zp1; } if (error == ENOTDIR) panic("checkpath: .. not a directory\n"); if (zp != tdzp) VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os))); return (error); } #if __FreeBSD_version < 1300110 static void cache_vop_rename(struct vnode *fdvp, struct vnode *fvp, struct vnode *tdvp, struct vnode *tvp, struct componentname *fcnp, struct componentname *tcnp) { cache_purge(fvp); if (tvp != NULL) cache_purge(tvp); cache_purge_negative(tdvp); } #endif /* * Move an entry from the provided source directory to the target * directory. Change the entry name as indicated. * * IN: sdvp - Source directory containing the "old entry". * snm - Old entry name. * tdvp - Target directory to contain the "new entry". * tnm - New entry name. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * sdvp,tdvp - ctime|mtime updated */ /*ARGSUSED*/ static int zfs_rename_(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, cred_t *cr, int log) { zfsvfs_t *zfsvfs; znode_t *sdzp, *tdzp, *szp, *tzp; zilog_t *zilog = NULL; dmu_tx_t *tx; const char *snm = scnp->cn_nameptr; const char *tnm = tcnp->cn_nameptr; int error = 0; bool want_seqc_end __maybe_unused = false; /* Reject renames across filesystems. */ if ((*svpp)->v_mount != tdvp->v_mount || ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { error = SET_ERROR(EXDEV); goto out; } if (zfsctl_is_node(tdvp)) { error = SET_ERROR(EXDEV); goto out; } /* * Lock all four vnodes to ensure safety and semantics of renaming. */ error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); if (error != 0) { /* no vnodes are locked in the case of error here */ return (error); } tdzp = VTOZ(tdvp); sdzp = VTOZ(sdvp); zfsvfs = tdzp->z_zfsvfs; zilog = zfsvfs->z_log; /* * After we re-enter ZFS_ENTER() we will have to revalidate all * znodes involved. */ ZFS_ENTER(zfsvfs); if (zfsvfs->z_utf8 && u8_validate(tnm, strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { error = SET_ERROR(EILSEQ); goto unlockout; } /* If source and target are the same file, there is nothing to do. */ if ((*svpp) == (*tvpp)) { error = 0; goto unlockout; } if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && (*tvpp)->v_mountedhere != NULL)) { error = SET_ERROR(EXDEV); goto unlockout; } /* * We can not use ZFS_VERIFY_ZP() here because it could directly return * bypassing the cleanup code in the case of an error. */ if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { error = SET_ERROR(EIO); goto unlockout; } szp = VTOZ(*svpp); tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { error = SET_ERROR(EIO); goto unlockout; } /* * This is to prevent the creation of links into attribute space * by renaming a linked file into/outof an attribute directory. * See the comment in zfs_link() for why this is considered bad. */ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { error = SET_ERROR(EINVAL); goto unlockout; } /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow renames into our tree when the project * IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { error = SET_ERROR(EXDEV); goto unlockout; } /* * Must have write access at the source to remove the old entry * and write access at the target to create the new entry. * Note that if target and source are the same, this can be * done in a single check. */ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) goto unlockout; if ((*svpp)->v_type == VDIR) { /* * Avoid ".", "..", and aliases of "." for obvious reasons. */ if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || sdzp == szp || (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { error = EINVAL; goto unlockout; } /* * Check to make sure rename is valid. * Can't do a move like this: /usr/a/b to /usr/a/b/c/d */ if ((error = zfs_rename_check(szp, sdzp, tdzp))) goto unlockout; } /* * Does target exist? */ if (tzp) { /* * Source and target must be the same type. */ if ((*svpp)->v_type == VDIR) { if ((*tvpp)->v_type != VDIR) { error = SET_ERROR(ENOTDIR); goto unlockout; } else { cache_purge(tdvp); if (sdvp != tdvp) cache_purge(sdvp); } } else { if ((*tvpp)->v_type == VDIR) { error = SET_ERROR(EISDIR); goto unlockout; } } } vn_seqc_write_begin(*svpp); vn_seqc_write_begin(sdvp); if (*tvpp != NULL) vn_seqc_write_begin(*tvpp); if (tdvp != *tvpp) vn_seqc_write_begin(tdvp); #if __FreeBSD_version >= 1300102 want_seqc_end = true; #endif vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); if (tzp) vnevent_rename_dest(*tvpp, tdvp, tnm, ct); /* * notify the target directory if it is not the same * as source directory. */ if (tdvp != sdvp) { vnevent_rename_dest_dir(tdvp, ct); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); if (sdzp != tdzp) { dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tdzp); } if (tzp) { dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, tzp); } zfs_sa_upgrade_txholds(tx, szp); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); goto unlockout; } if (tzp) /* Attempt to remove the existing target */ error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); if (error == 0) { error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); if (error == 0) { szp->z_pflags |= ZFS_AV_MODIFIED; error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), (void *)&szp->z_pflags, sizeof (uint64_t), tx); ASSERT0(error); error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, NULL); if (error == 0) { zfs_log_rename(zilog, tx, TX_RENAME, sdzp, snm, tdzp, tnm, szp); /* * Update path information for the target vnode */ vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); } else { /* * At this point, we have successfully created * the target name, but have failed to remove * the source name. Since the create was done * with the ZRENAMING flag, there are * complications; for one, the link count is * wrong. The easiest way to deal with this * is to remove the newly created target, and * return the original error. This must * succeed; fortunately, it is very unlikely to * fail, since we just created it. */ VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, ZRENAMING, NULL), ==, 0); } } if (error == 0) { cache_vop_rename(sdvp, *svpp, tdvp, *tvpp, scnp, tcnp); } } dmu_tx_commit(tx); unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ ZFS_EXIT(zfsvfs); if (want_seqc_end) { vn_seqc_write_end(*svpp); vn_seqc_write_end(sdvp); if (*tvpp != NULL) vn_seqc_write_end(*tvpp); if (tdvp != *tvpp) vn_seqc_write_end(tdvp); want_seqc_end = false; } VOP_UNLOCK1(*svpp); VOP_UNLOCK1(sdvp); out: /* original two vnodes are locked */ MPASS(!want_seqc_end); if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); if (*tvpp != NULL) VOP_UNLOCK1(*tvpp); if (tdvp != *tvpp) VOP_UNLOCK1(tdvp); return (error); } int zfs_rename(znode_t *sdzp, const char *sname, znode_t *tdzp, const char *tname, cred_t *cr, int flags) { struct componentname scn, tcn; vnode_t *sdvp, *tdvp; vnode_t *svp, *tvp; int error; svp = tvp = NULL; sdvp = ZTOV(sdzp); tdvp = ZTOV(tdzp); error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE); if (sdzp->z_zfsvfs->z_replay == B_FALSE) VOP_UNLOCK1(sdvp); if (error != 0) goto fail; VOP_UNLOCK1(svp); vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME); if (error == EJUSTRETURN) tvp = NULL; else if (error != 0) { VOP_UNLOCK1(tdvp); goto fail; } error = zfs_rename_(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr, 0); fail: if (svp != NULL) vrele(svp); if (tvp != NULL) vrele(tvp); return (error); } /* * Insert the indicated symbolic reference entry into the directory. * * IN: dvp - Directory to contain new symbolic link. * link - Name for new symlink entry. * vap - Attributes of new entry. * cr - credentials of caller. * ct - caller context * flags - case flags * * RETURN: 0 on success, error code on failure. * * Timestamps: * dvp - ctime|mtime updated */ /*ARGSUSED*/ int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, const char *link, znode_t **zpp, cred_t *cr, int flags) { znode_t *zp; dmu_tx_t *tx; zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zilog_t *zilog; uint64_t len = strlen(link); int error; zfs_acl_ids_t acl_ids; boolean_t fuid_dirtied; uint64_t txtype = TX_SYMLINK; ASSERT(vap->va_type == VLNK); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(dzp); zilog = zfsvfs->z_log; if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } if (len > MAXPATHLEN) { ZFS_EXIT(zfsvfs); return (SET_ERROR(ENAMETOOLONG)); } if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)) != 0) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); if (error) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (error); } if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0 /* projid */)) { zfs_acl_ids_free(&acl_ids); ZFS_EXIT(zfsvfs); return (SET_ERROR(EDQUOT)); } getnewvnode_reserve_(); tx = dmu_tx_create(zfsvfs->z_os); fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + ZFS_SA_BASE_ATTR_SIZE + len); dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, acl_ids.z_aclp->z_acl_bytes); } if (fuid_dirtied) zfs_fuid_txhold(zfsvfs, tx); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { zfs_acl_ids_free(&acl_ids); dmu_tx_abort(tx); getnewvnode_drop_reserve(); ZFS_EXIT(zfsvfs); return (error); } /* * Create a new object for the symlink. * for version 4 ZPL datsets the symlink will be an SA attribute */ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); if (fuid_dirtied) zfs_fuid_sync(zfsvfs, tx); if (zp->z_is_sa) error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), __DECONST(void *, link), len, tx); else zfs_sa_symlink(zp, __DECONST(char *, link), len, tx); zp->z_size = len; (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), &zp->z_size, sizeof (zp->z_size), tx); /* * Insert the new object into the directory. */ (void) zfs_link_create(dzp, name, zp, tx, ZNEW); zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); *zpp = zp; zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); getnewvnode_drop_reserve(); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Return, in the buffer contained in the provided uio structure, * the symbolic path referred to by vp. * * IN: vp - vnode of symbolic link. * uio - structure to contain the link path. * cr - credentials of caller. * ct - caller context * * OUT: uio - structure containing the link path. * * RETURN: 0 on success, error code on failure. * * Timestamps: * vp - atime updated */ /* ARGSUSED */ static int zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (zp->z_is_sa) error = sa_lookup_uio(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs), uio); else error = zfs_sa_readlink(zp, uio); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); return (error); } /* * Insert a new entry into directory tdvp referencing svp. * * IN: tdvp - Directory to contain new entry. * svp - vnode of new entry. * name - name of new entry. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * tdvp - ctime|mtime updated * svp - ctime updated */ /* ARGSUSED */ int zfs_link(znode_t *tdzp, znode_t *szp, const char *name, cred_t *cr, int flags) { znode_t *tzp; zfsvfs_t *zfsvfs = tdzp->z_zfsvfs; zilog_t *zilog; dmu_tx_t *tx; int error; uint64_t parent; uid_t owner; ASSERT(ZTOV(tdzp)->v_type == VDIR); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(tdzp); zilog = zfsvfs->z_log; /* * POSIX dictates that we return EPERM here. * Better choices include ENOTSUP or EISDIR. */ if (ZTOV(szp)->v_type == VDIR) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } ZFS_VERIFY_ZP(szp); /* * If we are using project inheritance, means if the directory has * ZFS_PROJINHERIT set, then its descendant directories will inherit * not only the project ID, but also the ZFS_PROJINHERIT flag. Under * such case, we only allow hard link creation in our tree when the * project IDs are the same. */ if (tdzp->z_pflags & ZFS_PROJINHERIT && tdzp->z_projid != szp->z_projid) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EXDEV)); } if (szp->z_pflags & (ZFS_APPENDONLY | ZFS_IMMUTABLE | ZFS_READONLY)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } /* Prevent links to .zfs/shares files */ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), &parent, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } if (parent == zfsvfs->z_shares_dir) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EILSEQ)); } /* * We do not support links between attributes and non-attributes * because of the potential security risk of creating links * into "normal" file space in order to circumvent restrictions * imposed in attribute space. */ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER); if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EPERM)); } if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } /* * Attempt to lock directory; fail if entry already exists. */ error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW); if (error) { ZFS_EXIT(zfsvfs); return (error); } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); zfs_sa_upgrade_txholds(tx, szp); zfs_sa_upgrade_txholds(tx, tdzp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); return (error); } error = zfs_link_create(tdzp, name, szp, tx, 0); if (error == 0) { uint64_t txtype = TX_LINK; zfs_log_link(zilog, tx, txtype, tdzp, szp, name); } dmu_tx_commit(tx); if (error == 0) { vnevent_link(ZTOV(szp), ct); } if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } /* * Free or allocate space in a file. Currently, this function only * supports the `F_FREESP' command. However, this command is somewhat * misnamed, as its functionality includes the ability to allocate as * well as free space. * * IN: ip - inode of file to free data in. * cmd - action to take (only F_FREESP supported). * bfp - section of file to free/alloc. * flag - current file open mode flags. * offset - current file offset. * cr - credentials of caller. * * RETURN: 0 on success, error code on failure. * * Timestamps: * ip - ctime|mtime updated */ /* ARGSUSED */ int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, offset_t offset, cred_t *cr) { zfsvfs_t *zfsvfs = ZTOZSB(zp); uint64_t off, len; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if (cmd != F_FREESP) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Callers might not be able to detect properly that we are read-only, * so check it explicitly here. */ if (zfs_is_readonly(zfsvfs)) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EROFS)); } if (bfp->l_len < 0) { ZFS_EXIT(zfsvfs); return (SET_ERROR(EINVAL)); } /* * Permissions aren't checked on Solaris because on this OS * zfs_space() can only be called with an opened file handle. * On Linux we can get here through truncate_range() which * operates directly on inodes, so we need to check access rights. */ if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { ZFS_EXIT(zfsvfs); return (error); } off = bfp->l_start; len = bfp->l_len; /* 0 means from off to end of file */ error = zfs_freesp(zp, off, len, flag, TRUE); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ static void zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs); if (zp->z_sa_hdl == NULL) { /* * The fs has been unmounted, or we did a * suspend/resume and this file no longer exists. */ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vrecycle(vp); return; } if (zp->z_unlinked) { /* * Fast path to recycle a vnode of a removed file. */ ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vrecycle(vp); return; } if (zp->z_atime_dirty && zp->z_unlinked == 0) { dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&zp->z_atime, sizeof (zp->z_atime), tx); zp->z_atime_dirty = 0; dmu_tx_commit(tx); } } ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); } CTASSERT(sizeof (struct zfid_short) <= sizeof (struct fid)); CTASSERT(sizeof (struct zfid_long) <= sizeof (struct fid)); /*ARGSUSED*/ static int zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; uint32_t gen; uint64_t gen64; uint64_t object = zp->z_id; zfid_short_t *zfid; int size, i, error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &gen64, sizeof (uint64_t))) != 0) { ZFS_EXIT(zfsvfs); return (error); } gen = (uint32_t)gen64; size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; fidp->fid_len = size; zfid = (zfid_short_t *)fidp; zfid->zf_len = size; for (i = 0; i < sizeof (zfid->zf_object); i++) zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); /* Must have a non-zero generation number to distinguish from .zfs */ if (gen == 0) gen = 1; for (i = 0; i < sizeof (zfid->zf_gen); i++) zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); if (size == LONG_FID_LEN) { uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); zfid_long_t *zlfid; zlfid = (zfid_long_t *)fidp; for (i = 0; i < sizeof (zlfid->zf_setid); i++) zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); /* XXX - this should be the generation number for the objset */ for (i = 0; i < sizeof (zlfid->zf_setgen); i++) zlfid->zf_setgen[i] = 0; } ZFS_EXIT(zfsvfs); return (0); } static int zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, caller_context_t *ct) { znode_t *zp; zfsvfs_t *zfsvfs; switch (cmd) { case _PC_LINK_MAX: *valp = MIN(LONG_MAX, ZFS_LINK_MAX); return (0); case _PC_FILESIZEBITS: *valp = 64; return (0); case _PC_MIN_HOLE_SIZE: *valp = (int)SPA_MINBLOCKSIZE; return (0); case _PC_ACL_EXTENDED: #if 0 /* POSIX ACLs are not implemented for ZFS on FreeBSD yet. */ zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = zfsvfs->z_acl_type == ZFSACLTYPE_POSIX ? 1 : 0; ZFS_EXIT(zfsvfs); #else *valp = 0; #endif return (0); case _PC_ACL_NFS4: zp = VTOZ(vp); zfsvfs = zp->z_zfsvfs; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); *valp = zfsvfs->z_acl_type == ZFS_ACLTYPE_NFSV4 ? 1 : 0; ZFS_EXIT(zfsvfs); return (0); case _PC_ACL_PATH_MAX: *valp = ACL_MAX_ENTRIES; return (0); default: return (EOPNOTSUPP); } } /*ARGSUSED*/ static int zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, caller_context_t *ct) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_getacl(zp, vsecp, skipaclchk, cr); ZFS_EXIT(zfsvfs); return (error); } /*ARGSUSED*/ int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) { zfsvfs_t *zfsvfs = zp->z_zfsvfs; int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; zilog_t *zilog = zfsvfs->z_log; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zilog, 0); ZFS_EXIT(zfsvfs); return (error); } static int zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, int *rahead) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; objset_t *os = zp->z_zfsvfs->z_os; zfs_locked_range_t *lr; vm_object_t object; off_t start, end, obj_size; uint_t blksz; int pgsin_b, pgsin_a; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); start = IDX_TO_OFF(ma[0]->pindex); end = IDX_TO_OFF(ma[count - 1]->pindex + 1); /* * Lock a range covering all required and optional pages. * Note that we need to handle the case of the block size growing. */ for (;;) { blksz = zp->z_blksz; lr = zfs_rangelock_tryenter(&zp->z_rangelock, rounddown(start, blksz), roundup(end, blksz) - rounddown(start, blksz), RL_READER); if (lr == NULL) { if (rahead != NULL) { *rahead = 0; rahead = NULL; } if (rbehind != NULL) { *rbehind = 0; rbehind = NULL; } break; } if (blksz == zp->z_blksz) break; zfs_rangelock_exit(lr); } object = ma[0]->object; zfs_vmobject_wlock(object); obj_size = object->un_pager.vnp.vnp_size; zfs_vmobject_wunlock(object); if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { if (lr != NULL) zfs_rangelock_exit(lr); ZFS_EXIT(zfsvfs); return (zfs_vm_pagerret_bad); } pgsin_b = 0; if (rbehind != NULL) { pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); pgsin_b = MIN(*rbehind, pgsin_b); } pgsin_a = 0; if (rahead != NULL) { pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); if (end + IDX_TO_OFF(pgsin_a) >= obj_size) pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); pgsin_a = MIN(*rahead, pgsin_a); } /* * NB: we need to pass the exact byte size of the data that we expect * to read after accounting for the file size. This is required because * ZFS will panic if we request DMU to read beyond the end of the last * allocated block. */ error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, MIN(end, obj_size) - (end - PAGE_SIZE)); if (lr != NULL) zfs_rangelock_exit(lr); ZFS_ACCESSTIME_STAMP(zfsvfs, zp); ZFS_EXIT(zfsvfs); if (error != 0) return (zfs_vm_pagerret_error); VM_CNT_INC(v_vnodein); VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); if (rbehind != NULL) *rbehind = pgsin_b; if (rahead != NULL) *rahead = pgsin_a; return (zfs_vm_pagerret_ok); } #ifndef _SYS_SYSPROTO_H_ struct vop_getpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int *a_rbehind; int *a_rahead; }; #endif static int zfs_freebsd_getpages(struct vop_getpages_args *ap) { return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, ap->a_rahead)); } static int zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, int *rtvals) { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; zfs_locked_range_t *lr; dmu_tx_t *tx; struct sf_buf *sf; vm_object_t object; vm_page_t m; caddr_t va; size_t tocopy; size_t lo_len; vm_ooffset_t lo_off; vm_ooffset_t off; uint_t blksz; int ncount; int pcount; int err; int i; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); object = vp->v_object; pcount = btoc(len); ncount = pcount; KASSERT(ma[0]->object == object, ("mismatching object")); KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); for (i = 0; i < pcount; i++) rtvals[i] = zfs_vm_pagerret_error; off = IDX_TO_OFF(ma[0]->pindex); blksz = zp->z_blksz; lo_off = rounddown(off, blksz); lo_len = roundup(len + (off - lo_off), blksz); lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER); zfs_vmobject_wlock(object); if (len + off > object->un_pager.vnp.vnp_size) { if (object->un_pager.vnp.vnp_size > off) { int pgoff; len = object->un_pager.vnp.vnp_size - off; ncount = btoc(len); if ((pgoff = (int)len & PAGE_MASK) != 0) { /* * If the object is locked and the following * conditions hold, then the page's dirty * field cannot be concurrently changed by a * pmap operation. */ m = ma[ncount - 1]; vm_page_assert_sbusied(m); KASSERT(!pmap_page_is_write_mapped(m), ("zfs_putpages: page %p is not read-only", m)); vm_page_clear_dirty(m, pgoff, PAGE_SIZE - pgoff); } } else { len = 0; ncount = 0; } if (ncount < pcount) { for (i = ncount; i < pcount; i++) { rtvals[i] = zfs_vm_pagerret_bad; } } } zfs_vmobject_wunlock(object); if (ncount == 0) goto out; if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) || zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) || (zp->z_projid != ZFS_DEFAULT_PROJID && zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, zp->z_projid))) { goto out; } tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_write(tx, zp->z_id, off, len); dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); zfs_sa_upgrade_txholds(tx, zp); err = dmu_tx_assign(tx, TXG_WAIT); if (err != 0) { dmu_tx_abort(tx); goto out; } if (zp->z_blksz < PAGE_SIZE) { for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { tocopy = len > PAGE_SIZE ? PAGE_SIZE : len; va = zfs_map_page(ma[i], &sf); dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); zfs_unmap_page(sf); } } else { err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); } if (err == 0) { uint64_t mtime[2], ctime[2]; sa_bulk_attr_t bulk[3]; int count = 0; SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, 8); zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); ASSERT0(err); /* * XXX we should be passing a callback to undirty * but that would make the locking messier */ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0, NULL, NULL); zfs_vmobject_wlock(object); for (i = 0; i < ncount; i++) { rtvals[i] = zfs_vm_pagerret_ok; vm_page_undirty(ma[i]); } zfs_vmobject_wunlock(object); VM_CNT_INC(v_vnodeout); VM_CNT_ADD(v_vnodepgsout, ncount); } dmu_tx_commit(tx); out: zfs_rangelock_exit(lr); if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) zil_commit(zfsvfs->z_log, zp->z_id); ZFS_EXIT(zfsvfs); return (rtvals[0]); } #ifndef _SYS_SYSPROTO_H_ struct vop_putpages_args { struct vnode *a_vp; vm_page_t *a_m; int a_count; int a_sync; int *a_rtvals; }; #endif static int zfs_freebsd_putpages(struct vop_putpages_args *ap) { return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, ap->a_rtvals)); } #ifndef _SYS_SYSPROTO_H_ struct vop_bmap_args { struct vnode *a_vp; daddr_t a_bn; struct bufobj **a_bop; daddr_t *a_bnp; int *a_runp; int *a_runb; }; #endif static int zfs_freebsd_bmap(struct vop_bmap_args *ap) { if (ap->a_bop != NULL) *ap->a_bop = &ap->a_vp->v_bufobj; if (ap->a_bnp != NULL) *ap->a_bnp = ap->a_bn; if (ap->a_runp != NULL) *ap->a_runp = 0; if (ap->a_runb != NULL) *ap->a_runb = 0; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_open_args { struct vnode *a_vp; int a_mode; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_open(struct vop_open_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); int error; error = zfs_open(&vp, ap->a_mode, ap->a_cred); if (error == 0) vnode_create_vobject(vp, zp->z_size, ap->a_td); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_close_args { struct vnode *a_vp; int a_fflag; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_close(struct vop_close_args *ap) { return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_ioctl_args { struct vnode *a_vp; ulong_t a_command; caddr_t a_data; int a_fflag; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_ioctl(struct vop_ioctl_args *ap) { return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, ap->a_fflag, ap->a_cred, NULL)); } static int ioflags(int ioflags) { int flags = 0; if (ioflags & IO_APPEND) flags |= FAPPEND; if (ioflags & IO_NDELAY) flags |= FNONBLOCK; if (ioflags & IO_SYNC) flags |= (FSYNC | FDSYNC | FRSYNC); return (flags); } #ifndef _SYS_SYSPROTO_H_ struct vop_read_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif static int zfs_freebsd_read(struct vop_read_args *ap) { return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_write_args { struct vnode *a_vp; struct uio *a_uio; int a_ioflag; struct ucred *a_cred; }; #endif static int zfs_freebsd_write(struct vop_write_args *ap) { return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag), ap->a_cred)); } #if __FreeBSD_version >= 1300102 /* * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see * the comment above cache_fplookup for details. */ static int zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v) { vnode_t *vp; znode_t *zp; uint64_t pflags; vp = v->a_vp; zp = VTOZ_SMR(vp); if (__predict_false(zp == NULL)) return (EAGAIN); pflags = atomic_load_64(&zp->z_pflags); if (pflags & ZFS_AV_QUARANTINED) return (EAGAIN); if (pflags & ZFS_XATTR) return (EAGAIN); if ((pflags & ZFS_NO_EXECS_DENIED) == 0) return (EAGAIN); return (0); } #endif #ifndef _SYS_SYSPROTO_H_ struct vop_access_args { struct vnode *a_vp; accmode_t a_accmode; struct ucred *a_cred; struct thread *a_td; }; #endif static int zfs_freebsd_access(struct vop_access_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); accmode_t accmode; int error = 0; if (ap->a_accmode == VEXEC) { if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0) return (0); } /* * ZFS itself only knowns about VREAD, VWRITE, VEXEC and VAPPEND, */ accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL); /* * VADMIN has to be handled by vaccess(). */ if (error == 0) { accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND); if (accmode != 0) { #if __FreeBSD_version >= 1300105 error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred); #else error = vaccess(vp->v_type, zp->z_mode, zp->z_uid, zp->z_gid, accmode, ap->a_cred, NULL); #endif } } /* * For VEXEC, ensure that at least one execute bit is set for * non-directories. */ if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { error = EACCES; } return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_lookup_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) { struct componentname *cnp = ap->a_cnp; char nm[NAME_MAX + 1]; ASSERT(cnp->cn_namelen < sizeof (nm)); strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm))); return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, cnp->cn_cred, cnp->cn_thread, 0, cached)); } static int zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) { return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); } #ifndef _SYS_SYSPROTO_H_ struct vop_lookup_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; }; #endif static int zfs_cache_lookup(struct vop_lookup_args *ap) { zfsvfs_t *zfsvfs; zfsvfs = ap->a_dvp->v_mount->mnt_data; if (zfsvfs->z_use_namecache) return (vfs_cache_lookup(ap)); else return (zfs_freebsd_lookup(ap, B_FALSE)); } #ifndef _SYS_SYSPROTO_H_ struct vop_create_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif static int zfs_freebsd_create(struct vop_create_args *ap) { zfsvfs_t *zfsvfs; struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; znode_t *zp = NULL; int rc, mode; ASSERT(cnp->cn_flags & SAVENAME); vattr_init_mask(vap); mode = vap->va_mode & ALLPERMS; zfsvfs = ap->a_dvp->v_mount->mnt_data; *ap->a_vpp = NULL; rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode, &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */); if (rc == 0) *ap->a_vpp = ZTOV(zp); if (zfsvfs->z_use_namecache && rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0) cache_enter(ap->a_dvp, *ap->a_vpp, cnp); return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_remove_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_remove(struct vop_remove_args *ap) { ASSERT(ap->a_cnp->cn_flags & SAVENAME); return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_mkdir_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; }; #endif static int zfs_freebsd_mkdir(struct vop_mkdir_args *ap) { vattr_t *vap = ap->a_vap; znode_t *zp = NULL; int rc; ASSERT(ap->a_cnp->cn_flags & SAVENAME); vattr_init_mask(vap); *ap->a_vpp = NULL; rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp, ap->a_cnp->cn_cred, 0, NULL); if (rc == 0) *ap->a_vpp = ZTOV(zp); return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_rmdir_args { struct vnode *a_dvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_rmdir(struct vop_rmdir_args *ap) { struct componentname *cnp = ap->a_cnp; ASSERT(cnp->cn_flags & SAVENAME); return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_readdir_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; int *a_eofflag; int *a_ncookies; ulong_t **a_cookies; }; #endif static int zfs_freebsd_readdir(struct vop_readdir_args *ap) { return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, ap->a_ncookies, ap->a_cookies)); } #ifndef _SYS_SYSPROTO_H_ struct vop_fsync_args { struct vnode *a_vp; int a_waitfor; struct thread *a_td; }; #endif static int zfs_freebsd_fsync(struct vop_fsync_args *ap) { vop_stdfsync(ap); return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct vop_getattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif static int zfs_freebsd_getattr(struct vop_getattr_args *ap) { vattr_t *vap = ap->a_vap; xvattr_t xvap; ulong_t fflags = 0; int error; xva_init(&xvap); xvap.xva_vattr = *vap; xvap.xva_vattr.va_mask |= AT_XVATTR; /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ XVA_SET_REQ(&xvap, XAT_IMMUTABLE); XVA_SET_REQ(&xvap, XAT_APPENDONLY); XVA_SET_REQ(&xvap, XAT_NOUNLINK); XVA_SET_REQ(&xvap, XAT_NODUMP); XVA_SET_REQ(&xvap, XAT_READONLY); XVA_SET_REQ(&xvap, XAT_ARCHIVE); XVA_SET_REQ(&xvap, XAT_SYSTEM); XVA_SET_REQ(&xvap, XAT_HIDDEN); XVA_SET_REQ(&xvap, XAT_REPARSE); XVA_SET_REQ(&xvap, XAT_OFFLINE); XVA_SET_REQ(&xvap, XAT_SPARSE); error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred); if (error != 0) return (error); /* Convert ZFS xattr into chflags. */ #define FLAG_CHECK(fflag, xflag, xfield) do { \ if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ fflags |= (fflag); \ } while (0) FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHECK(UF_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHECK(UF_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHECK(UF_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHECK(UF_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHECK *vap = xvap.xva_vattr; vap->va_flags = fflags; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_setattr_args { struct vnode *a_vp; struct vattr *a_vap; struct ucred *a_cred; }; #endif static int zfs_freebsd_setattr(struct vop_setattr_args *ap) { vnode_t *vp = ap->a_vp; vattr_t *vap = ap->a_vap; cred_t *cred = ap->a_cred; xvattr_t xvap; ulong_t fflags; uint64_t zflags; vattr_init_mask(vap); vap->va_mask &= ~AT_NOSET; xva_init(&xvap); xvap.xva_vattr = *vap; zflags = VTOZ(vp)->z_pflags; if (vap->va_flags != VNOVAL) { zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; int error; if (zfsvfs->z_use_fuids == B_FALSE) return (EOPNOTSUPP); fflags = vap->va_flags; /* * XXX KDM * We need to figure out whether it makes sense to allow * UF_REPARSE through, since we don't really have other * facilities to handle reparse points and zfs_setattr() * doesn't currently allow setting that attribute anyway. */ if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| UF_OFFLINE|UF_SPARSE)) != 0) return (EOPNOTSUPP); /* * Unprivileged processes are not permitted to unset system * flags, or modify flags if any system flags are set. * Privileged non-jail processes may not modify system flags * if securelevel > 0 and any existing system flags are set. * Privileged jail processes behave like privileged non-jail * processes if the PR_ALLOW_CHFLAGS permission bit is set; * otherwise, they behave like unprivileged processes. */ if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { error = securelevel_gt(cred, 0); if (error != 0) return (error); } } else { /* * Callers may only modify the file flags on * objects they have VADMIN rights for. */ if ((error = VOP_ACCESS(vp, VADMIN, cred, curthread)) != 0) return (error); if (zflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { return (EPERM); } if (fflags & (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { return (EPERM); } } #define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ if (((fflags & (fflag)) && !(zflags & (zflag))) || \ ((zflags & (zflag)) && !(fflags & (fflag)))) { \ XVA_SET_REQ(&xvap, (xflag)); \ (xfield) = ((fflags & (fflag)) != 0); \ } \ } while (0) /* Convert chflags into ZFS-type flags. */ /* XXX: what about SF_SETTABLE?. */ FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, xvap.xva_xoptattrs.xoa_immutable); FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, xvap.xva_xoptattrs.xoa_appendonly); FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, xvap.xva_xoptattrs.xoa_nounlink); FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, xvap.xva_xoptattrs.xoa_archive); FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, xvap.xva_xoptattrs.xoa_nodump); FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, xvap.xva_xoptattrs.xoa_readonly); FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, xvap.xva_xoptattrs.xoa_system); FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, xvap.xva_xoptattrs.xoa_hidden); FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, xvap.xva_xoptattrs.xoa_reparse); FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, xvap.xva_xoptattrs.xoa_offline); FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, xvap.xva_xoptattrs.xoa_sparse); #undef FLAG_CHANGE } if (vap->va_birthtime.tv_sec != VNOVAL) { xvap.xva_vattr.va_mask |= AT_XVATTR; XVA_SET_REQ(&xvap, XAT_CREATETIME); } return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred)); } #ifndef _SYS_SYSPROTO_H_ struct vop_rename_args { struct vnode *a_fdvp; struct vnode *a_fvp; struct componentname *a_fcnp; struct vnode *a_tdvp; struct vnode *a_tvp; struct componentname *a_tcnp; }; #endif static int zfs_freebsd_rename(struct vop_rename_args *ap) { vnode_t *fdvp = ap->a_fdvp; vnode_t *fvp = ap->a_fvp; vnode_t *tdvp = ap->a_tdvp; vnode_t *tvp = ap->a_tvp; int error; ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); error = zfs_rename_(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, ap->a_tcnp, ap->a_fcnp->cn_cred, 1); vrele(fdvp); vrele(fvp); vrele(tdvp); if (tvp != NULL) vrele(tvp); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_symlink_args { struct vnode *a_dvp; struct vnode **a_vpp; struct componentname *a_cnp; struct vattr *a_vap; char *a_target; }; #endif static int zfs_freebsd_symlink(struct vop_symlink_args *ap) { struct componentname *cnp = ap->a_cnp; vattr_t *vap = ap->a_vap; znode_t *zp = NULL; int rc; ASSERT(cnp->cn_flags & SAVENAME); vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ vattr_init_mask(vap); *ap->a_vpp = NULL; rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, ap->a_target, &zp, cnp->cn_cred, 0 /* flags */); if (rc == 0) *ap->a_vpp = ZTOV(zp); return (rc); } #ifndef _SYS_SYSPROTO_H_ struct vop_readlink_args { struct vnode *a_vp; struct uio *a_uio; struct ucred *a_cred; }; #endif static int zfs_freebsd_readlink(struct vop_readlink_args *ap) { return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct vop_link_args { struct vnode *a_tdvp; struct vnode *a_vp; struct componentname *a_cnp; }; #endif static int zfs_freebsd_link(struct vop_link_args *ap) { struct componentname *cnp = ap->a_cnp; vnode_t *vp = ap->a_vp; vnode_t *tdvp = ap->a_tdvp; if (tdvp->v_mount != vp->v_mount) return (EXDEV); ASSERT(cnp->cn_flags & SAVENAME); return (zfs_link(VTOZ(tdvp), VTOZ(vp), cnp->cn_nameptr, cnp->cn_cred, 0)); } #ifndef _SYS_SYSPROTO_H_ struct vop_inactive_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_inactive(struct vop_inactive_args *ap) { vnode_t *vp = ap->a_vp; zfs_inactive(vp, curthread->td_ucred, NULL); return (0); } #if __FreeBSD_version >= 1300042 #ifndef _SYS_SYSPROTO_H_ struct vop_need_inactive_args { struct vnode *a_vp; }; #endif static int zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; int need; if (vn_need_pageq_flush(vp)) return (1); if (!ZFS_TEARDOWN_INACTIVE_TRY_ENTER_READ(zfsvfs)) return (1); need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty); ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); return (need); } #endif #ifndef _SYS_SYSPROTO_H_ struct vop_reclaim_args { struct vnode *a_vp; struct thread *a_td; }; #endif static int zfs_freebsd_reclaim(struct vop_reclaim_args *ap) { vnode_t *vp = ap->a_vp; znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; ASSERT(zp != NULL); #if __FreeBSD_version < 1300042 /* Destroy the vm object and flush associated pages. */ vnode_destroy_vobject(vp); #endif /* * z_teardown_inactive_lock protects from a race with * zfs_znode_dmu_fini in zfsvfs_teardown during * force unmount. */ ZFS_TEARDOWN_INACTIVE_ENTER_READ(zfsvfs); if (zp->z_sa_hdl == NULL) zfs_znode_free(zp); else zfs_zinactive(zp); ZFS_TEARDOWN_INACTIVE_EXIT_READ(zfsvfs); vp->v_data = NULL; return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_fid_args { struct vnode *a_vp; struct fid *a_fid; }; #endif static int zfs_freebsd_fid(struct vop_fid_args *ap) { return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL)); } #ifndef _SYS_SYSPROTO_H_ struct vop_pathconf_args { struct vnode *a_vp; int a_name; register_t *a_retval; } *ap; #endif static int zfs_freebsd_pathconf(struct vop_pathconf_args *ap) { ulong_t val; int error; error = zfs_pathconf(ap->a_vp, ap->a_name, &val, curthread->td_ucred, NULL); if (error == 0) { *ap->a_retval = val; return (error); } if (error != EOPNOTSUPP) return (error); switch (ap->a_name) { case _PC_NAME_MAX: *ap->a_retval = NAME_MAX; return (0); case _PC_PIPE_BUF: if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) { *ap->a_retval = PIPE_BUF; return (0); } return (EINVAL); default: return (vop_stdpathconf(ap)); } } /* * FreeBSD's extended attributes namespace defines file name prefix for ZFS' * extended attribute name: * * NAMESPACE PREFIX * system freebsd:system: * user (none, can be used to access ZFS fsattr(5) attributes * created on Solaris) */ static int zfs_create_attrname(int attrnamespace, const char *name, char *attrname, size_t size) { const char *namespace, *prefix, *suffix; /* We don't allow '/' character in attribute name. */ if (strchr(name, '/') != NULL) return (EINVAL); /* We don't allow attribute names that start with "freebsd:" string. */ if (strncmp(name, "freebsd:", 8) == 0) return (EINVAL); bzero(attrname, size); switch (attrnamespace) { case EXTATTR_NAMESPACE_USER: #if 0 prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_USER_STRING; suffix = ":"; #else /* * This is the default namespace by which we can access all * attributes created on Solaris. */ prefix = namespace = suffix = ""; #endif break; case EXTATTR_NAMESPACE_SYSTEM: prefix = "freebsd:"; namespace = EXTATTR_NAMESPACE_SYSTEM_STRING; suffix = ":"; break; case EXTATTR_NAMESPACE_EMPTY: default: return (EINVAL); } if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix, name) >= size) { return (ENAMETOOLONG); } return (0); } #ifndef _SYS_SYSPROTO_H_ struct vop_getextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif /* * Vnode operating to retrieve a named extended attribute. */ static int zfs_getextattr(struct vop_getextattr_args *ap) { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) { return (SET_ERROR(EOPNOTSUPP)); } error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR, B_FALSE); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FREAD; NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); if (error == ENOENT) error = ENOATTR; return (error); } if (ap->a_size != NULL) { error = VOP_GETATTR(vp, &va, ap->a_cred); if (error == 0) *ap->a_size = (size_t)va.va_size; } else if (ap->a_uio != NULL) error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK1(vp); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_deleteextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif /* * Vnode operation to remove a named attribute. */ static int zfs_deleteextattr(struct vop_deleteextattr_args *ap) { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; vnode_t *xvp = NULL, *vp; int error; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) { return (SET_ERROR(EOPNOTSUPP)); } error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR, B_FALSE); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, UIO_SYSSPACE, attrname, xvp, td); error = namei(&nd); vp = nd.ni_vp; if (error != 0) { ZFS_EXIT(zfsvfs); NDFREE(&nd, NDF_ONLY_PNBUF); if (error == ENOENT) error = ENOATTR; return (error); } error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); if (vp == nd.ni_dvp) vrele(vp); else vput(vp); ZFS_EXIT(zfsvfs); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_setextattr { IN struct vnode *a_vp; IN int a_attrnamespace; IN const char *a_name; INOUT struct uio *a_uio; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif /* * Vnode operation to set a named attribute. */ static int zfs_setextattr(struct vop_setextattr_args *ap) { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrname[255]; struct vattr va; vnode_t *xvp = NULL, *vp; int error, flags; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) { return (SET_ERROR(EOPNOTSUPP)); } error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VWRITE); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, sizeof (attrname)); if (error != 0) return (error); ZFS_ENTER(zfsvfs); error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } flags = FFLAGS(O_WRONLY | O_CREAT); NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, xvp, td); error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, NULL); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } VATTR_NULL(&va); va.va_size = 0; error = VOP_SETATTR(vp, &va, ap->a_cred); if (error == 0) VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); VOP_UNLOCK1(vp); vn_close(vp, flags, ap->a_cred, td); ZFS_EXIT(zfsvfs); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_listextattr { IN struct vnode *a_vp; IN int a_attrnamespace; INOUT struct uio *a_uio; OUT size_t *a_size; IN struct ucred *a_cred; IN struct thread *a_td; }; #endif /* * Vnode operation to retrieve extended attributes on a vnode. */ static int zfs_listextattr(struct vop_listextattr_args *ap) { zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; struct thread *td = ap->a_td; struct nameidata nd; char attrprefix[16]; uint8_t dirbuf[sizeof (struct dirent)]; struct dirent *dp; struct iovec aiov; struct uio auio, *uio = ap->a_uio; size_t *sizep = ap->a_size; size_t plen; vnode_t *xvp = NULL, *vp; int done, error, eof, pos; /* * If the xattr property is off, refuse the request. */ if (!(zfsvfs->z_flags & ZSB_XATTR)) { return (SET_ERROR(EOPNOTSUPP)); } error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, ap->a_cred, ap->a_td, VREAD); if (error != 0) return (error); error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, sizeof (attrprefix)); if (error != 0) return (error); plen = strlen(attrprefix); ZFS_ENTER(zfsvfs); if (sizep != NULL) *sizep = 0; error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, LOOKUP_XATTR, B_FALSE); if (error != 0) { ZFS_EXIT(zfsvfs); /* * ENOATTR means that the EA directory does not yet exist, * i.e. there are no extended attributes there. */ if (error == ENOATTR) error = 0; return (error); } NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, UIO_SYSSPACE, ".", xvp, td); error = namei(&nd); vp = nd.ni_vp; NDFREE(&nd, NDF_ONLY_PNBUF); if (error != 0) { ZFS_EXIT(zfsvfs); return (error); } auio.uio_iov = &aiov; auio.uio_iovcnt = 1; auio.uio_segflg = UIO_SYSSPACE; auio.uio_td = td; auio.uio_rw = UIO_READ; auio.uio_offset = 0; do { uint8_t nlen; aiov.iov_base = (void *)dirbuf; aiov.iov_len = sizeof (dirbuf); auio.uio_resid = sizeof (dirbuf); error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); done = sizeof (dirbuf) - auio.uio_resid; if (error != 0) break; for (pos = 0; pos < done; ) { dp = (struct dirent *)(dirbuf + pos); pos += dp->d_reclen; /* * XXX: Temporarily we also accept DT_UNKNOWN, as this * is what we get when attribute was created on Solaris. */ if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) continue; if (plen == 0 && strncmp(dp->d_name, "freebsd:", 8) == 0) continue; else if (strncmp(dp->d_name, attrprefix, plen) != 0) continue; nlen = dp->d_namlen - plen; if (sizep != NULL) *sizep += 1 + nlen; else if (uio != NULL) { /* * Format of extattr name entry is one byte for * length and the rest for name. */ error = uiomove(&nlen, 1, uio->uio_rw, uio); if (error == 0) { error = uiomove(dp->d_name + plen, nlen, uio->uio_rw, uio); } if (error != 0) break; } } } while (!eof && error == 0); vput(vp); ZFS_EXIT(zfsvfs); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_getacl_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_getacl(struct vop_getacl_args *ap) { int error; vsecattr_t vsecattr; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))) return (error); error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, vsecattr.vsa_aclcnt); if (vsecattr.vsa_aclentp != NULL) kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_setacl_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_setacl(struct vop_setacl_args *ap) { int error; vsecattr_t vsecattr; int aclbsize; /* size of acl list in bytes */ aclent_t *aaclp; if (ap->a_type != ACL_TYPE_NFS4) return (EINVAL); if (ap->a_aclp == NULL) return (EINVAL); if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) return (EINVAL); /* * With NFSv4 ACLs, chmod(2) may need to add additional entries, * splitting every entry into two and appending "canonical six" * entries at the end. Don't allow for setting an ACL that would * cause chmod(2) to run out of ACL entries. */ if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) return (ENOSPC); error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); if (error != 0) return (error); vsecattr.vsa_mask = VSA_ACE; aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t); vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); aaclp = vsecattr.vsa_aclentp; vsecattr.vsa_aclentsz = aclbsize; aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred); kmem_free(aaclp, aclbsize); return (error); } #ifndef _SYS_SYSPROTO_H_ struct vop_aclcheck_args { struct vnode *vp; acl_type_t type; struct acl *aclp; struct ucred *cred; struct thread *td; }; #endif static int zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap) { return (EOPNOTSUPP); } static int zfs_vptocnp(struct vop_vptocnp_args *ap) { vnode_t *covered_vp; vnode_t *vp = ap->a_vp; zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; znode_t *zp = VTOZ(vp); int ltype; int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); /* * If we are a snapshot mounted under .zfs, run the operation * on the covered vnode. */ if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { char name[MAXNAMLEN + 1]; znode_t *dzp; size_t len; error = zfs_znode_parent_and_name(zp, &dzp, name); if (error == 0) { len = strlen(name); if (*ap->a_buflen < len) error = SET_ERROR(ENOMEM); } if (error == 0) { *ap->a_buflen -= len; bcopy(name, ap->a_buf + *ap->a_buflen, len); *ap->a_vpp = ZTOV(dzp); } ZFS_EXIT(zfsvfs); return (error); } ZFS_EXIT(zfsvfs); covered_vp = vp->v_mount->mnt_vnodecovered; #if __FreeBSD_version >= 1300045 enum vgetstate vs = vget_prep(covered_vp); #else vhold(covered_vp); #endif ltype = VOP_ISLOCKED(vp); VOP_UNLOCK1(vp); #if __FreeBSD_version >= 1300045 error = vget_finish(covered_vp, LK_SHARED, vs); #else error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); #endif if (error == 0) { error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_buf, ap->a_buflen); vput(covered_vp); } vn_lock(vp, ltype | LK_RETRY); if (VN_IS_DOOMED(vp)) error = SET_ERROR(ENOENT); return (error); } #ifdef DIAGNOSTIC #ifndef _SYS_SYSPROTO_H_ struct vop_lock1_args { struct vnode *a_vp; int a_flags; char *file; int line; }; #endif static int zfs_lock(struct vop_lock1_args *ap) { vnode_t *vp; znode_t *zp; int err; #if __FreeBSD_version >= 1300064 err = vop_lock(ap); #else err = vop_stdlock(ap); #endif if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { vp = ap->a_vp; zp = vp->v_data; if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) && zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) - VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); + VERIFY(!ZFS_TEARDOWN_HELD(zp->z_zfsvfs)); } return (err); } #endif struct vop_vector zfs_vnodeops; struct vop_vector zfs_fifoops; struct vop_vector zfs_shareops; struct vop_vector zfs_vnodeops = { .vop_default = &default_vnodeops, .vop_inactive = zfs_freebsd_inactive, #if __FreeBSD_version >= 1300042 .vop_need_inactive = zfs_freebsd_need_inactive, #endif .vop_reclaim = zfs_freebsd_reclaim, #if __FreeBSD_version >= 1300102 .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, #endif .vop_access = zfs_freebsd_access, .vop_allocate = VOP_EINVAL, .vop_lookup = zfs_cache_lookup, .vop_cachedlookup = zfs_freebsd_cachedlookup, .vop_getattr = zfs_freebsd_getattr, .vop_setattr = zfs_freebsd_setattr, .vop_create = zfs_freebsd_create, .vop_mknod = (vop_mknod_t *)zfs_freebsd_create, .vop_mkdir = zfs_freebsd_mkdir, .vop_readdir = zfs_freebsd_readdir, .vop_fsync = zfs_freebsd_fsync, .vop_open = zfs_freebsd_open, .vop_close = zfs_freebsd_close, .vop_rmdir = zfs_freebsd_rmdir, .vop_ioctl = zfs_freebsd_ioctl, .vop_link = zfs_freebsd_link, .vop_symlink = zfs_freebsd_symlink, .vop_readlink = zfs_freebsd_readlink, .vop_read = zfs_freebsd_read, .vop_write = zfs_freebsd_write, .vop_remove = zfs_freebsd_remove, .vop_rename = zfs_freebsd_rename, .vop_pathconf = zfs_freebsd_pathconf, .vop_bmap = zfs_freebsd_bmap, .vop_fid = zfs_freebsd_fid, .vop_getextattr = zfs_getextattr, .vop_deleteextattr = zfs_deleteextattr, .vop_setextattr = zfs_setextattr, .vop_listextattr = zfs_listextattr, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, .vop_getpages = zfs_freebsd_getpages, .vop_putpages = zfs_freebsd_putpages, .vop_vptocnp = zfs_vptocnp, #if __FreeBSD_version >= 1300064 #ifdef DIAGNOSTIC .vop_lock1 = zfs_lock, #else .vop_lock1 = vop_lock, #endif .vop_unlock = vop_unlock, .vop_islocked = vop_islocked, #else #ifdef DIAGNOSTIC .vop_lock1 = zfs_lock, #endif #endif }; VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); struct vop_vector zfs_fifoops = { .vop_default = &fifo_specops, .vop_fsync = zfs_freebsd_fsync, #if __FreeBSD_version >= 1300102 .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, #endif .vop_access = zfs_freebsd_access, .vop_getattr = zfs_freebsd_getattr, .vop_inactive = zfs_freebsd_inactive, .vop_read = VOP_PANIC, .vop_reclaim = zfs_freebsd_reclaim, .vop_setattr = zfs_freebsd_setattr, .vop_write = VOP_PANIC, .vop_pathconf = zfs_freebsd_pathconf, .vop_fid = zfs_freebsd_fid, .vop_getacl = zfs_freebsd_getacl, .vop_setacl = zfs_freebsd_setacl, .vop_aclcheck = zfs_freebsd_aclcheck, }; VFS_VOP_VECTOR_REGISTER(zfs_fifoops); /* * special share hidden files vnode operations template */ struct vop_vector zfs_shareops = { .vop_default = &default_vnodeops, #if __FreeBSD_version >= 1300121 .vop_fplookup_vexec = VOP_EAGAIN, #endif .vop_access = zfs_freebsd_access, .vop_inactive = zfs_freebsd_inactive, .vop_reclaim = zfs_freebsd_reclaim, .vop_fid = zfs_freebsd_fid, .vop_pathconf = zfs_freebsd_pathconf, }; VFS_VOP_VECTOR_REGISTER(zfs_shareops); Index: head/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c =================================================================== --- head/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c (revision 367344) +++ head/sys/contrib/openzfs/module/os/linux/zfs/zfs_vfsops.c (revision 367345) @@ -1,2176 +1,2176 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2018 by Delphix. All rights reserved. */ /* Portions Copyright 2010 Robert Milkowski */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_comutil.h" enum { TOKEN_RO, TOKEN_RW, TOKEN_SETUID, TOKEN_NOSETUID, TOKEN_EXEC, TOKEN_NOEXEC, TOKEN_DEVICES, TOKEN_NODEVICES, TOKEN_DIRXATTR, TOKEN_SAXATTR, TOKEN_XATTR, TOKEN_NOXATTR, TOKEN_ATIME, TOKEN_NOATIME, TOKEN_RELATIME, TOKEN_NORELATIME, TOKEN_NBMAND, TOKEN_NONBMAND, TOKEN_MNTPOINT, TOKEN_LAST, }; static const match_table_t zpl_tokens = { { TOKEN_RO, MNTOPT_RO }, { TOKEN_RW, MNTOPT_RW }, { TOKEN_SETUID, MNTOPT_SETUID }, { TOKEN_NOSETUID, MNTOPT_NOSETUID }, { TOKEN_EXEC, MNTOPT_EXEC }, { TOKEN_NOEXEC, MNTOPT_NOEXEC }, { TOKEN_DEVICES, MNTOPT_DEVICES }, { TOKEN_NODEVICES, MNTOPT_NODEVICES }, { TOKEN_DIRXATTR, MNTOPT_DIRXATTR }, { TOKEN_SAXATTR, MNTOPT_SAXATTR }, { TOKEN_XATTR, MNTOPT_XATTR }, { TOKEN_NOXATTR, MNTOPT_NOXATTR }, { TOKEN_ATIME, MNTOPT_ATIME }, { TOKEN_NOATIME, MNTOPT_NOATIME }, { TOKEN_RELATIME, MNTOPT_RELATIME }, { TOKEN_NORELATIME, MNTOPT_NORELATIME }, { TOKEN_NBMAND, MNTOPT_NBMAND }, { TOKEN_NONBMAND, MNTOPT_NONBMAND }, { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" }, { TOKEN_LAST, NULL }, }; static void zfsvfs_vfs_free(vfs_t *vfsp) { if (vfsp != NULL) { if (vfsp->vfs_mntpoint != NULL) kmem_strfree(vfsp->vfs_mntpoint); kmem_free(vfsp, sizeof (vfs_t)); } } static int zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) { switch (token) { case TOKEN_RO: vfsp->vfs_readonly = B_TRUE; vfsp->vfs_do_readonly = B_TRUE; break; case TOKEN_RW: vfsp->vfs_readonly = B_FALSE; vfsp->vfs_do_readonly = B_TRUE; break; case TOKEN_SETUID: vfsp->vfs_setuid = B_TRUE; vfsp->vfs_do_setuid = B_TRUE; break; case TOKEN_NOSETUID: vfsp->vfs_setuid = B_FALSE; vfsp->vfs_do_setuid = B_TRUE; break; case TOKEN_EXEC: vfsp->vfs_exec = B_TRUE; vfsp->vfs_do_exec = B_TRUE; break; case TOKEN_NOEXEC: vfsp->vfs_exec = B_FALSE; vfsp->vfs_do_exec = B_TRUE; break; case TOKEN_DEVICES: vfsp->vfs_devices = B_TRUE; vfsp->vfs_do_devices = B_TRUE; break; case TOKEN_NODEVICES: vfsp->vfs_devices = B_FALSE; vfsp->vfs_do_devices = B_TRUE; break; case TOKEN_DIRXATTR: vfsp->vfs_xattr = ZFS_XATTR_DIR; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_SAXATTR: vfsp->vfs_xattr = ZFS_XATTR_SA; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_XATTR: vfsp->vfs_xattr = ZFS_XATTR_DIR; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_NOXATTR: vfsp->vfs_xattr = ZFS_XATTR_OFF; vfsp->vfs_do_xattr = B_TRUE; break; case TOKEN_ATIME: vfsp->vfs_atime = B_TRUE; vfsp->vfs_do_atime = B_TRUE; break; case TOKEN_NOATIME: vfsp->vfs_atime = B_FALSE; vfsp->vfs_do_atime = B_TRUE; break; case TOKEN_RELATIME: vfsp->vfs_relatime = B_TRUE; vfsp->vfs_do_relatime = B_TRUE; break; case TOKEN_NORELATIME: vfsp->vfs_relatime = B_FALSE; vfsp->vfs_do_relatime = B_TRUE; break; case TOKEN_NBMAND: vfsp->vfs_nbmand = B_TRUE; vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_NONBMAND: vfsp->vfs_nbmand = B_FALSE; vfsp->vfs_do_nbmand = B_TRUE; break; case TOKEN_MNTPOINT: vfsp->vfs_mntpoint = match_strdup(&args[0]); if (vfsp->vfs_mntpoint == NULL) return (SET_ERROR(ENOMEM)); break; default: break; } return (0); } /* * Parse the raw mntopts and return a vfs_t describing the options. */ static int zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) { vfs_t *tmp_vfsp; int error; tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); if (mntopts != NULL) { substring_t args[MAX_OPT_ARGS]; char *tmp_mntopts, *p, *t; int token; tmp_mntopts = t = kmem_strdup(mntopts); if (tmp_mntopts == NULL) return (SET_ERROR(ENOMEM)); while ((p = strsep(&t, ",")) != NULL) { if (!*p) continue; args[0].to = args[0].from = NULL; token = match_token(p, zpl_tokens, args); error = zfsvfs_parse_option(p, token, args, tmp_vfsp); if (error) { kmem_strfree(tmp_mntopts); zfsvfs_vfs_free(tmp_vfsp); return (error); } } kmem_strfree(tmp_mntopts); } *vfsp = tmp_vfsp; return (0); } boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs) { return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY)); } /*ARGSUSED*/ int zfs_sync(struct super_block *sb, int wait, cred_t *cr) { zfsvfs_t *zfsvfs = sb->s_fs_info; /* * Semantically, the only requirement is that the sync be initiated. * The DMU syncs out txgs frequently, so there's nothing to do. */ if (!wait) return (0); if (zfsvfs != NULL) { /* * Sync a specific filesystem. */ dsl_pool_t *dp; ZFS_ENTER(zfsvfs); dp = dmu_objset_pool(zfsvfs->z_os); /* * If the system is shutting down, then skip any * filesystems which may exist on a suspended pool. */ if (spa_suspended(dp->dp_spa)) { ZFS_EXIT(zfsvfs); return (0); } if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, 0); ZFS_EXIT(zfsvfs); } else { /* * Sync all ZFS filesystems. This is what happens when you * run sync(1M). Unlike other filesystems, ZFS honors the * request by waiting for all pools to commit all dirty data. */ spa_sync_allpools(); } return (0); } static void atime_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; /* * Update SB_NOATIME bit in VFS super block. Since atime update is * determined by atime_needs_update(), atime_needs_update() needs to * return false if atime is turned off, and not unconditionally return * false if atime is turned on. */ if (newval) sb->s_flags &= ~SB_NOATIME; else sb->s_flags |= SB_NOATIME; } static void relatime_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_relatime = newval; } static void xattr_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; if (newval == ZFS_XATTR_OFF) { zfsvfs->z_flags &= ~ZSB_XATTR; } else { zfsvfs->z_flags |= ZSB_XATTR; if (newval == ZFS_XATTR_SA) zfsvfs->z_xattr_sa = B_TRUE; else zfsvfs->z_xattr_sa = B_FALSE; } } static void acltype_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; switch (newval) { case ZFS_ACLTYPE_NFSV4: case ZFS_ACLTYPE_OFF: zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; break; case ZFS_ACLTYPE_POSIX: #ifdef CONFIG_FS_POSIX_ACL zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIX; zfsvfs->z_sb->s_flags |= SB_POSIXACL; #else zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; #endif /* CONFIG_FS_POSIX_ACL */ break; default: break; } } static void blksz_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); ASSERT(ISP2(newval)); zfsvfs->z_max_blksz = newval; } static void readonly_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; if (newval) sb->s_flags |= SB_RDONLY; else sb->s_flags &= ~SB_RDONLY; } static void devices_changed_cb(void *arg, uint64_t newval) { } static void setuid_changed_cb(void *arg, uint64_t newval) { } static void exec_changed_cb(void *arg, uint64_t newval) { } static void nbmand_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; struct super_block *sb = zfsvfs->z_sb; if (sb == NULL) return; if (newval == TRUE) sb->s_flags |= SB_MANDLOCK; else sb->s_flags &= ~SB_MANDLOCK; } static void snapdir_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_show_ctldir = newval; } static void vscan_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_vscan = newval; } static void acl_mode_changed_cb(void *arg, uint64_t newval) { zfsvfs_t *zfsvfs = arg; zfsvfs->z_acl_mode = newval; } static void acl_inherit_changed_cb(void *arg, uint64_t newval) { ((zfsvfs_t *)arg)->z_acl_inherit = newval; } static int zfs_register_callbacks(vfs_t *vfsp) { struct dsl_dataset *ds = NULL; objset_t *os = NULL; zfsvfs_t *zfsvfs = NULL; int error = 0; ASSERT(vfsp); zfsvfs = vfsp->vfs_data; ASSERT(zfsvfs); os = zfsvfs->z_os; /* * The act of registering our callbacks will destroy any mount * options we may have. In order to enable temporary overrides * of mount options, we stash away the current values and * restore them after we register the callbacks. */ if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) { vfsp->vfs_do_readonly = B_TRUE; vfsp->vfs_readonly = B_TRUE; } /* * Register property callbacks. * * It would probably be fine to just check for i/o error from * the first prop_register(), but I guess I like to go * overboard... */ ds = dmu_objset_ds(os); dsl_pool_config_enter(dmu_objset_pool(os), FTAG); error = dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); error = error ? error : dsl_prop_register(ds, zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs); dsl_pool_config_exit(dmu_objset_pool(os), FTAG); if (error) goto unregister; /* * Invoke our callbacks to restore temporary mount options. */ if (vfsp->vfs_do_readonly) readonly_changed_cb(zfsvfs, vfsp->vfs_readonly); if (vfsp->vfs_do_setuid) setuid_changed_cb(zfsvfs, vfsp->vfs_setuid); if (vfsp->vfs_do_exec) exec_changed_cb(zfsvfs, vfsp->vfs_exec); if (vfsp->vfs_do_devices) devices_changed_cb(zfsvfs, vfsp->vfs_devices); if (vfsp->vfs_do_xattr) xattr_changed_cb(zfsvfs, vfsp->vfs_xattr); if (vfsp->vfs_do_atime) atime_changed_cb(zfsvfs, vfsp->vfs_atime); if (vfsp->vfs_do_relatime) relatime_changed_cb(zfsvfs, vfsp->vfs_relatime); if (vfsp->vfs_do_nbmand) nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand); return (0); unregister: dsl_prop_unregister_all(ds, zfsvfs); return (error); } /* * Takes a dataset, a property, a value and that value's setpoint as * found in the ZAP. Checks if the property has been changed in the vfs. * If so, val and setpoint will be overwritten with updated content. * Otherwise, they are left unchanged. */ int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, char *setpoint) { int error; zfsvfs_t *zfvp; vfs_t *vfsp; objset_t *os; uint64_t tmp = *val; error = dmu_objset_from_ds(ds, &os); if (error != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) return (EINVAL); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); mutex_exit(&os->os_user_ptr_lock); if (zfvp == NULL) return (ESRCH); vfsp = zfvp->z_vfs; switch (zfs_prop) { case ZFS_PROP_ATIME: if (vfsp->vfs_do_atime) tmp = vfsp->vfs_atime; break; case ZFS_PROP_RELATIME: if (vfsp->vfs_do_relatime) tmp = vfsp->vfs_relatime; break; case ZFS_PROP_DEVICES: if (vfsp->vfs_do_devices) tmp = vfsp->vfs_devices; break; case ZFS_PROP_EXEC: if (vfsp->vfs_do_exec) tmp = vfsp->vfs_exec; break; case ZFS_PROP_SETUID: if (vfsp->vfs_do_setuid) tmp = vfsp->vfs_setuid; break; case ZFS_PROP_READONLY: if (vfsp->vfs_do_readonly) tmp = vfsp->vfs_readonly; break; case ZFS_PROP_XATTR: if (vfsp->vfs_do_xattr) tmp = vfsp->vfs_xattr; break; case ZFS_PROP_NBMAND: if (vfsp->vfs_do_nbmand) tmp = vfsp->vfs_nbmand; break; default: return (ENOENT); } if (tmp != *val) { (void) strcpy(setpoint, "temporary"); *val = tmp; } return (0); } /* * Associate this zfsvfs with the given objset, which must be owned. * This will cache a bunch of on-disk state from the objset in the * zfsvfs. */ static int zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) { int error; uint64_t val; zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; zfsvfs->z_os = os; error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); if (error != 0) return (error); if (zfsvfs->z_version > zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { (void) printk("Can't mount a version %lld file system " "on a version %lld pool\n. Pool must be upgraded to mount " "this file system.\n", (u_longlong_t)zfsvfs->z_version, (u_longlong_t)spa_version(dmu_objset_spa(os))); return (SET_ERROR(ENOTSUP)); } error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); if (error != 0) return (error); zfsvfs->z_norm = (int)val; error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); if (error != 0) return (error); zfsvfs->z_utf8 = (val != 0); error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); if (error != 0) return (error); zfsvfs->z_case = (uint_t)val; if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0) return (error); zfsvfs->z_acl_type = (uint_t)val; /* * Fold case on file systems that are always or sometimes case * insensitive. */ if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || zfsvfs->z_case == ZFS_CASE_MIXED) zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); uint64_t sa_obj = 0; if (zfsvfs->z_use_sa) { /* should either have both of these objects or none */ error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); if (error != 0) return (error); error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); if ((error == 0) && (val == ZFS_XATTR_SA)) zfsvfs->z_xattr_sa = B_TRUE; } error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &zfsvfs->z_root); if (error != 0) return (error); ASSERT(zfsvfs->z_root != 0); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, &zfsvfs->z_unlinkedobj); if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], 8, 1, &zfsvfs->z_userquota_obj); if (error == ENOENT) zfsvfs->z_userquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], 8, 1, &zfsvfs->z_groupquota_obj); if (error == ENOENT) zfsvfs->z_groupquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], 8, 1, &zfsvfs->z_projectquota_obj); if (error == ENOENT) zfsvfs->z_projectquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], 8, 1, &zfsvfs->z_userobjquota_obj); if (error == ENOENT) zfsvfs->z_userobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], 8, 1, &zfsvfs->z_groupobjquota_obj); if (error == ENOENT) zfsvfs->z_groupobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], 8, 1, &zfsvfs->z_projectobjquota_obj); if (error == ENOENT) zfsvfs->z_projectobjquota_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (error == ENOENT) zfsvfs->z_fuid_obj = 0; else if (error != 0) return (error); error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, &zfsvfs->z_shares_dir); if (error == ENOENT) zfsvfs->z_shares_dir = 0; else if (error != 0) return (error); error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, &zfsvfs->z_attr_table); if (error != 0) return (error); if (zfsvfs->z_version >= ZPL_VERSION_SA) sa_register_update_callback(os, zfs_sa_upgrade); return (0); } int zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) { objset_t *os; zfsvfs_t *zfsvfs; int error; boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); if (error != 0) { kmem_free(zfsvfs, sizeof (zfsvfs_t)); return (error); } error = zfsvfs_create_impl(zfvp, zfsvfs, os); if (error != 0) { dmu_objset_disown(os, B_TRUE, zfsvfs); } return (error); } /* * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function * on a failure. Do not pass in a statically allocated zfsvfs. */ int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) { int error; zfsvfs->z_vfs = NULL; zfsvfs->z_sb = NULL; zfsvfs->z_parent = zfsvfs; mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); - rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); + ZFS_TEARDOWN_INIT(zfsvfs); rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1), ZFS_OBJ_MTX_MAX); zfsvfs->z_hold_size = size; zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, KM_SLEEP); zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); for (int i = 0; i != size; i++) { avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); } error = zfsvfs_init(zfsvfs, os); if (error != 0) { *zfvp = NULL; zfsvfs_free(zfsvfs); return (error); } zfsvfs->z_drain_task = TASKQID_INVALID; zfsvfs->z_draining = B_FALSE; zfsvfs->z_drain_cancel = B_TRUE; *zfvp = zfsvfs; return (0); } static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { int error; boolean_t readonly = zfs_is_readonly(zfsvfs); error = zfs_register_callbacks(zfsvfs->z_vfs); if (error) return (error); zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); /* * If we are not mounting (ie: online recv), then we don't * have to worry about replaying the log as we blocked all * operations out since we closed the ZIL. */ if (mounting) { ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); /* * During replay we remove the read only flag to * allow replays to succeed. */ if (readonly != 0) { readonly_changed_cb(zfsvfs, B_FALSE); } else { zap_stats_t zs; if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, &zs) == 0) { dataset_kstats_update_nunlinks_kstat( &zfsvfs->z_kstat, zs.zs_num_entries); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "num_entries in unlinked set: %llu", zs.zs_num_entries); } zfs_unlinked_drain(zfsvfs); dsl_dir_t *dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; dd->dd_activity_cancelled = B_FALSE; } /* * Parse and replay the intent log. * * Because of ziltest, this must be done after * zfs_unlinked_drain(). (Further note: ziltest * doesn't use readonly mounts, where * zfs_unlinked_drain() isn't called.) This is because * ziltest causes spa_sync() to think it's committed, * but actually it is not, so the intent log contains * many txg's worth of changes. * * In particular, if object N is in the unlinked set in * the last txg to actually sync, then it could be * actually freed in a later txg and then reallocated * in a yet later txg. This would write a "create * object N" record to the intent log. Normally, this * would be fine because the spa_sync() would have * written out the fact that object N is free, before * we could write the "create object N" intent log * record. * * But when we are in ziltest mode, we advance the "open * txg" without actually spa_sync()-ing the changes to * disk. So we would see that object N is still * allocated and in the unlinked set, and there is an * intent log record saying to allocate it. */ if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { if (zil_replay_disable) { zil_destroy(zfsvfs->z_log, B_FALSE); } else { zfsvfs->z_replay = B_TRUE; zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); zfsvfs->z_replay = B_FALSE; } } /* restore readonly bit */ if (readonly != 0) readonly_changed_cb(zfsvfs, B_TRUE); } /* * Set the objset user_ptr to track its zfsvfs. */ mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); return (0); } void zfsvfs_free(zfsvfs_t *zfsvfs) { int i, size = zfsvfs->z_hold_size; zfs_fuid_destroy(zfsvfs); mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); - rrm_destroy(&zfsvfs->z_teardown_lock); + ZFS_TEARDOWN_DESTROY(zfsvfs); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); for (i = 0; i != size; i++) { avl_destroy(&zfsvfs->z_hold_trees[i]); mutex_destroy(&zfsvfs->z_hold_locks[i]); } vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); zfsvfs_vfs_free(zfsvfs->z_vfs); dataset_kstats_destroy(&zfsvfs->z_kstat); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } static void zfs_set_fuid_feature(zfsvfs_t *zfsvfs) { zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); } static void zfs_unregister_callbacks(zfsvfs_t *zfsvfs) { objset_t *os = zfsvfs->z_os; if (!dmu_objset_is_snapshot(os)) dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); } #ifdef HAVE_MLSLABEL /* * Check that the hex label string is appropriate for the dataset being * mounted into the global_zone proper. * * Return an error if the hex label string is not default or * admin_low/admin_high. For admin_low labels, the corresponding * dataset must be readonly. */ int zfs_check_global_label(const char *dsname, const char *hexsl) { if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) return (0); if (strcasecmp(hexsl, ADMIN_HIGH) == 0) return (0); if (strcasecmp(hexsl, ADMIN_LOW) == 0) { /* must be readonly */ uint64_t rdonly; if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) return (SET_ERROR(EACCES)); return (rdonly ? 0 : SET_ERROR(EACCES)); } return (SET_ERROR(EACCES)); } #endif /* HAVE_MLSLABEL */ static int zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp, uint32_t bshift) { char buf[20 + DMU_OBJACCT_PREFIX_LEN]; uint64_t offset = DMU_OBJACCT_PREFIX_LEN; uint64_t quota; uint64_t used; int err; strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); err = zfs_id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset, sizeof (buf) - offset, B_FALSE); if (err) return (err); if (zfsvfs->z_projectquota_obj == 0) goto objs; err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj, buf + offset, 8, 1, "a); if (err == ENOENT) goto objs; else if (err) return (err); err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, buf + offset, 8, 1, &used); if (unlikely(err == ENOENT)) { uint32_t blksize; u_longlong_t nblocks; /* * Quota accounting is async, so it is possible race case. * There is at least one object with the given project ID. */ sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); if (unlikely(zp->z_blksz == 0)) blksize = zfsvfs->z_max_blksz; used = blksize * nblocks; } else if (err) { return (err); } statp->f_blocks = quota >> bshift; statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0; statp->f_bavail = statp->f_bfree; objs: if (zfsvfs->z_projectobjquota_obj == 0) return (0); err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj, buf + offset, 8, 1, "a); if (err == ENOENT) return (0); else if (err) return (err); err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT, buf, 8, 1, &used); if (unlikely(err == ENOENT)) { /* * Quota accounting is async, so it is possible race case. * There is at least one object with the given project ID. */ used = 1; } else if (err) { return (err); } statp->f_files = quota; statp->f_ffree = (quota > used) ? (quota - used) : 0; return (0); } int zfs_statvfs(struct inode *ip, struct kstatfs *statp) { zfsvfs_t *zfsvfs = ITOZSB(ip); uint64_t refdbytes, availbytes, usedobjs, availobjs; int err = 0; ZFS_ENTER(zfsvfs); dmu_objset_space(zfsvfs->z_os, &refdbytes, &availbytes, &usedobjs, &availobjs); uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os); /* * The underlying storage pool actually uses multiple block * size. Under Solaris frsize (fragment size) is reported as * the smallest block size we support, and bsize (block size) * as the filesystem's maximum block size. Unfortunately, * under Linux the fragment size and block size are often used * interchangeably. Thus we are forced to report both of them * as the filesystem's maximum block size. */ statp->f_frsize = zfsvfs->z_max_blksz; statp->f_bsize = zfsvfs->z_max_blksz; uint32_t bshift = fls(statp->f_bsize) - 1; /* * The following report "total" blocks of various kinds in * the file system, but reported in terms of f_bsize - the * "preferred" size. */ /* Round up so we never have a filesystem using 0 blocks. */ refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize); statp->f_blocks = (refdbytes + availbytes) >> bshift; statp->f_bfree = availbytes >> bshift; statp->f_bavail = statp->f_bfree; /* no root reservation */ /* * statvfs() should really be called statufs(), because it assumes * static metadata. ZFS doesn't preallocate files, so the best * we can do is report the max that could possibly fit in f_files, * and that minus the number actually used in f_ffree. * For f_ffree, report the smaller of the number of objects available * and the number of blocks (each object will take at least a block). */ statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT); statp->f_files = statp->f_ffree + usedobjs; statp->f_fsid.val[0] = (uint32_t)fsid; statp->f_fsid.val[1] = (uint32_t)(fsid >> 32); statp->f_type = ZFS_SUPER_MAGIC; statp->f_namelen = MAXNAMELEN - 1; /* * We have all of 40 characters to stuff a string here. * Is there anything useful we could/should provide? */ bzero(statp->f_spare, sizeof (statp->f_spare)); if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && dmu_objset_projectquota_present(zfsvfs->z_os)) { znode_t *zp = ITOZ(ip); if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && zpl_is_valid_projid(zp->z_projid)) err = zfs_statfs_project(zfsvfs, zp, statp, bshift); } ZFS_EXIT(zfsvfs); return (err); } static int zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) { znode_t *rootzp; int error; ZFS_ENTER(zfsvfs); error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); if (error == 0) *ipp = ZTOI(rootzp); ZFS_EXIT(zfsvfs); return (error); } /* * Linux kernels older than 3.1 do not support a per-filesystem shrinker. * To accommodate this we must improvise and manually walk the list of znodes * attempting to prune dentries in order to be able to drop the inodes. * * To avoid scanning the same znodes multiple times they are always rotated * to the end of the z_all_znodes list. New znodes are inserted at the * end of the list so we're always scanning the oldest znodes first. */ static int zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) { znode_t **zp_array, *zp; int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); int objects = 0; int i = 0, j = 0; zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); mutex_enter(&zfsvfs->z_znodes_lock); while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { if ((i++ > nr_to_scan) || (j >= max_array)) break; ASSERT(list_link_active(&zp->z_link_node)); list_remove(&zfsvfs->z_all_znodes, zp); list_insert_tail(&zfsvfs->z_all_znodes, zp); /* Skip active znodes and .zfs entries */ if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) continue; if (igrab(ZTOI(zp)) == NULL) continue; zp_array[j] = zp; j++; } mutex_exit(&zfsvfs->z_znodes_lock); for (i = 0; i < j; i++) { zp = zp_array[i]; ASSERT3P(zp, !=, NULL); d_prune_aliases(ZTOI(zp)); if (atomic_read(&ZTOI(zp)->i_count) == 1) objects++; zrele(zp); } kmem_free(zp_array, max_array * sizeof (znode_t *)); return (objects); } /* * The ARC has requested that the filesystem drop entries from the dentry * and inode caches. This can occur when the ARC needs to free meta data * blocks but can't because they are all pinned by entries in these caches. */ int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) { zfsvfs_t *zfsvfs = sb->s_fs_info; int error = 0; struct shrinker *shrinker = &sb->s_shrink; struct shrink_control sc = { .nr_to_scan = nr_to_scan, .gfp_mask = GFP_KERNEL, }; ZFS_ENTER(zfsvfs); #if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ defined(SHRINK_CONTROL_HAS_NID) && \ defined(SHRINKER_NUMA_AWARE) if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { *objects = 0; for_each_online_node(sc.nid) { *objects += (*shrinker->scan_objects)(shrinker, &sc); } } else { *objects = (*shrinker->scan_objects)(shrinker, &sc); } #elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) *objects = (*shrinker->scan_objects)(shrinker, &sc); #elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) *objects = (*shrinker->shrink)(shrinker, &sc); #elif defined(HAVE_D_PRUNE_ALIASES) #define D_PRUNE_ALIASES_IS_DEFAULT *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); #else #error "No available dentry and inode cache pruning mechanism." #endif #if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT) #undef D_PRUNE_ALIASES_IS_DEFAULT /* * Fall back to zfs_prune_aliases if the kernel's per-superblock * shrinker couldn't free anything, possibly due to the inodes being * allocated in a different memcg. */ if (*objects == 0) *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); #endif ZFS_EXIT(zfsvfs); dprintf_ds(zfsvfs->z_os->os_dsl_dataset, "pruning, nr_to_scan=%lu objects=%d error=%d\n", nr_to_scan, *objects, error); return (error); } /* * Teardown the zfsvfs_t. * * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' * and 'z_teardown_inactive_lock' held. */ static int zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) { znode_t *zp; zfs_unlinked_drain_stop_wait(zfsvfs); /* * If someone has not already unmounted this file system, * drain the zrele_taskq to ensure all active references to the * zfsvfs_t have been handled only then can it be safely destroyed. */ if (zfsvfs->z_os) { /* * If we're unmounting we have to wait for the list to * drain completely. * * If we're not unmounting there's no guarantee the list * will drain completely, but iputs run from the taskq * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * * We can safely read z_nr_znodes without locking because the * VFS has already blocked operations which add to the * z_all_znodes list and thus increment z_nr_znodes. */ int round = 0; while (zfsvfs->z_nr_znodes > 0) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) break; } } - rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + ZFS_TEARDOWN_ENTER_WRITE(zfsvfs, FTAG); if (!unmounting) { /* * We purge the parent filesystem's super block as the * parent filesystem and all of its snapshots have their * inode's super block set to the parent's filesystem's * super block. Note, 'z_parent' is self referential * for non-snapshots. */ shrink_dcache_sb(zfsvfs->z_parent->z_sb); } /* * Close the zil. NB: Can't close the zil while zfs_inactive * threads are blocked as zil_close can call zfs_inactive. */ if (zfsvfs->z_log) { zil_close(zfsvfs->z_log); zfsvfs->z_log = NULL; } rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); /* * If we are not unmounting (ie: online recv) and someone already * unmounted this file system while we were doing the switcheroo, * or a reopen of z_os failed then just bail out now. */ if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); return (SET_ERROR(EIO)); } /* * At this point there are no VFS ops active, and any new VFS ops * will fail with EIO since we have z_teardown_lock for writer (only * relevant for forced unmount). * * Release all holds on dbufs. We also grab an extra reference to all * the remaining inodes so that the kernel does not attempt to free * any inodes of a suspended fs. This can cause deadlocks since the * zfs_resume_fs() process may involve starting threads, which might * attempt to free unreferenced inodes to free up memory for the new * thread. */ if (!unmounting) { mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; zp = list_next(&zfsvfs->z_all_znodes, zp)) { if (zp->z_sa_hdl) zfs_znode_dmu_fini(zp); if (igrab(ZTOI(zp)) != NULL) zp->z_suspended = B_TRUE; } mutex_exit(&zfsvfs->z_znodes_lock); } /* * If we are unmounting, set the unmounted flag and let new VFS ops * unblock. zfs_inactive will have the unmounted behavior, and all * other VFS ops will fail with EIO. */ if (unmounting) { zfsvfs->z_unmounted = B_TRUE; rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); } /* * z_os will be NULL if there was an error in attempting to reopen * zfsvfs, so just return as the properties had already been * * unregistered and cached data had been evicted before. */ if (zfsvfs->z_os == NULL) return (0); /* * Unregister properties. */ zfs_unregister_callbacks(zfsvfs); /* * Evict cached data. We must write out any dirty data before * disowning the dataset. */ objset_t *os = zfsvfs->z_os; boolean_t os_dirty = B_FALSE; for (int t = 0; t < TXG_SIZE; t++) { if (dmu_objset_is_dirty(os, t)) { os_dirty = B_TRUE; break; } } if (!zfs_is_readonly(zfsvfs) && os_dirty) { txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); } dmu_objset_evict_dbufs(zfsvfs->z_os); dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; dsl_dir_cancel_waiters(dd); return (0); } #if defined(HAVE_SUPER_SETUP_BDI_NAME) atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0); #endif int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent) { const char *osname = zm->mnt_osname; struct inode *root_inode; uint64_t recordsize; int error = 0; zfsvfs_t *zfsvfs = NULL; vfs_t *vfs = NULL; ASSERT(zm); ASSERT(osname); error = zfsvfs_parse_options(zm->mnt_data, &vfs); if (error) return (error); error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs); if (error) { zfsvfs_vfs_free(vfs); goto out; } if ((error = dsl_prop_get_integer(osname, "recordsize", &recordsize, NULL))) { zfsvfs_vfs_free(vfs); goto out; } vfs->vfs_data = zfsvfs; zfsvfs->z_vfs = vfs; zfsvfs->z_sb = sb; sb->s_fs_info = zfsvfs; sb->s_magic = ZFS_SUPER_MAGIC; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_blocksize = recordsize; sb->s_blocksize_bits = ilog2(recordsize); error = -zpl_bdi_setup(sb, "zfs"); if (error) goto out; sb->s_bdi->ra_pages = 0; /* Set callback operations for the file system. */ sb->s_op = &zpl_super_operations; sb->s_xattr = zpl_xattr_handlers; sb->s_export_op = &zpl_export_operations; sb->s_d_op = &zpl_dentry_operations; /* Set features for file system. */ zfs_set_fuid_feature(zfsvfs); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if ((error = dsl_prop_get_integer(osname, "xattr", &pval, NULL))) goto out; xattr_changed_cb(zfsvfs, pval); if ((error = dsl_prop_get_integer(osname, "acltype", &pval, NULL))) goto out; acltype_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; zfsvfs->z_snap_defer_time = jiffies; mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) goto out; } /* Allocate a root inode for the filesystem. */ error = zfs_root(zfsvfs, &root_inode); if (error) { (void) zfs_umount(sb); goto out; } /* Allocate a root dentry for the filesystem */ sb->s_root = d_make_root(root_inode); if (sb->s_root == NULL) { (void) zfs_umount(sb); error = SET_ERROR(ENOMEM); goto out; } if (!zfsvfs->z_issnap) zfsctl_create(zfsvfs); zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb); out: if (error) { if (zfsvfs != NULL) { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } /* * make sure we don't have dangling sb->s_fs_info which * zfs_preumount will use. */ sb->s_fs_info = NULL; } return (error); } /* * Called when an unmount is requested and certain sanity checks have * already passed. At this point no dentries or inodes have been reclaimed * from their respective caches. We drop the extra reference on the .zfs * control directory to allow everything to be reclaimed. All snapshots * must already have been unmounted to reach this point. */ void zfs_preumount(struct super_block *sb) { zfsvfs_t *zfsvfs = sb->s_fs_info; /* zfsvfs is NULL when zfs_domount fails during mount */ if (zfsvfs) { zfs_unlinked_drain_stop_wait(zfsvfs); zfsctl_destroy(sb->s_fs_info); /* * Wait for zrele_async before entering evict_inodes in * generic_shutdown_super. The reason we must finish before * evict_inodes is when lazytime is on, or when zfs_purgedir * calls zfs_zget, zrele would bump i_count from 0 to 1. This * would race with the i_count check in evict_inodes. This means * it could destroy the inode while we are still using it. * * We wait for two passes. xattr directories in the first pass * may add xattr entries in zfs_purgedir, so in the second pass * we wait for them. We don't use taskq_wait here because it is * a pool wide taskq. Other mounted filesystems can constantly * do zrele_async and there's no guarantee when taskq will be * empty. */ taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); } } /* * Called once all other unmount released tear down has occurred. * It is our responsibility to release any remaining infrastructure. */ /*ARGSUSED*/ int zfs_umount(struct super_block *sb) { zfsvfs_t *zfsvfs = sb->s_fs_info; objset_t *os; if (zfsvfs->z_arc_prune != NULL) arc_remove_prune_callback(zfsvfs->z_arc_prune); VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); os = zfsvfs->z_os; zpl_bdi_destroy(sb); /* * z_os will be NULL if there was an error in * attempting to reopen zfsvfs. */ if (os != NULL) { /* * Unset the objset user_ptr. */ mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ dmu_objset_disown(os, B_TRUE, zfsvfs); } zfsvfs_free(zfsvfs); return (0); } int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm) { zfsvfs_t *zfsvfs = sb->s_fs_info; vfs_t *vfsp; boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os); int error; if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) && !(*flags & SB_RDONLY)) { *flags |= SB_RDONLY; return (EROFS); } error = zfsvfs_parse_options(zm->mnt_data, &vfsp); if (error) return (error); if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY)) txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); zfs_unregister_callbacks(zfsvfs); zfsvfs_vfs_free(zfsvfs->z_vfs); vfsp->vfs_data = zfsvfs; zfsvfs->z_vfs = vfsp; if (!issnap) (void) zfs_register_callbacks(vfsp); return (error); } int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) { zfsvfs_t *zfsvfs = sb->s_fs_info; znode_t *zp; uint64_t object = 0; uint64_t fid_gen = 0; uint64_t gen_mask; uint64_t zp_gen; int i, err; *ipp = NULL; if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { zfid_short_t *zfid = (zfid_short_t *)fidp; for (i = 0; i < sizeof (zfid->zf_object); i++) object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); for (i = 0; i < sizeof (zfid->zf_gen); i++) fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); } else { return (SET_ERROR(EINVAL)); } /* LONG_FID_LEN means snapdirs */ if (fidp->fid_len == LONG_FID_LEN) { zfid_long_t *zlfid = (zfid_long_t *)fidp; uint64_t objsetid = 0; uint64_t setgen = 0; for (i = 0; i < sizeof (zlfid->zf_setid); i++) objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); for (i = 0; i < sizeof (zlfid->zf_setgen); i++) setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); if (objsetid != ZFSCTL_INO_SNAPDIRS - object) { dprintf("snapdir fid: objsetid (%llu) != " "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n", objsetid, ZFSCTL_INO_SNAPDIRS, object); return (SET_ERROR(EINVAL)); } if (fid_gen > 1 || setgen != 0) { dprintf("snapdir fid: fid_gen (%llu) and setgen " "(%llu)\n", fid_gen, setgen); return (SET_ERROR(EINVAL)); } return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp)); } ZFS_ENTER(zfsvfs); /* A zero fid_gen means we are in the .zfs control directories */ if (fid_gen == 0 && (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { *ipp = zfsvfs->z_ctldir; ASSERT(*ipp != NULL); if (object == ZFSCTL_INO_SNAPDIR) { VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, 0, kcred, NULL, NULL) == 0); } else { igrab(*ipp); } ZFS_EXIT(zfsvfs); return (0); } gen_mask = -1ULL >> (64 - 8 * i); dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask); if ((err = zfs_zget(zfsvfs, object, &zp))) { ZFS_EXIT(zfsvfs); return (err); } /* Don't export xattr stuff */ if (zp->z_pflags & ZFS_XATTR) { zrele(zp); ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOENT)); } (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, sizeof (uint64_t)); zp_gen = zp_gen & gen_mask; if (zp_gen == 0) zp_gen = 1; if ((fid_gen == 0) && (zfsvfs->z_root == object)) fid_gen = zp_gen; if (zp->z_unlinked || zp_gen != fid_gen) { dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, fid_gen); zrele(zp); ZFS_EXIT(zfsvfs); return (SET_ERROR(ENOENT)); } *ipp = ZTOI(zp); if (*ipp) zfs_inode_update(ITOZ(*ipp)); ZFS_EXIT(zfsvfs); return (0); } /* * Block out VFS ops and close zfsvfs_t * * Note, if successful, then we return with the 'z_teardown_lock' and * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying * dataset and objset intact so that they can be atomically handed off during * a subsequent rollback or recv operation and the resume thereafter. */ int zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); return (0); } /* * Rebuild SA and release VOPs. Note that ownership of the underlying dataset * is an invariant across any of the operations that can be performed while the * filesystem was suspended. Whether it succeeded or failed, the preconditions * are the same: the relevant objset and associated dataset are owned by * zfsvfs, held, and long held on entry. */ int zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { int err, err2; znode_t *zp; - ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just update the objset_t, as the one we * had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); err = zfsvfs_init(zfsvfs, os); if (err != 0) goto bail; ds->ds_dir->dd_activity_cancelled = B_FALSE; VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); zfs_set_fuid_feature(zfsvfs); zfsvfs->z_rollback_time = jiffies; /* * Attempt to re-establish all the active inodes with their * dbufs. If a zfs_rezget() fails, then we unhash the inode * and mark it stale. This prevents a collision if a new * inode/object is created which must use the same inode * number. The stale inode will be be released when the * VFS prunes the dentry holding the remaining references * on the stale inode. */ mutex_enter(&zfsvfs->z_znodes_lock); for (zp = list_head(&zfsvfs->z_all_znodes); zp; zp = list_next(&zfsvfs->z_all_znodes, zp)) { err2 = zfs_rezget(zp); if (err2) { remove_inode_hash(ZTOI(zp)); zp->z_is_stale = B_TRUE; } /* see comment in zfs_suspend_fs() */ if (zp->z_suspended) { zfs_zrele_async(zp); zp->z_suspended = B_FALSE; } } mutex_exit(&zfsvfs->z_znodes_lock); if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) { /* * zfs_suspend_fs() could have interrupted freeing * of dnodes. We need to restart this freeing so * that we don't "leak" the space. */ zfs_unlinked_drain(zfsvfs); } /* * Most of the time zfs_suspend_fs is used for changing the contents * of the underlying dataset. ZFS rollback and receive operations * might create files for which negative dentries are present in * the cache. Since walking the dcache would require a lot of GPL-only * code duplication, it's much easier on these rather rare occasions * just to flush the whole dcache for the given dataset/filesystem. */ shrink_dcache_sb(zfsvfs->z_sb); bail: if (err != 0) zfsvfs->z_unmounted = B_TRUE; /* release the VFS ops */ rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); if (err != 0) { /* * Since we couldn't setup the sa framework, try to force * unmount this file system. */ if (zfsvfs->z_os) (void) zfs_umount(zfsvfs->z_sb); } return (err); } /* * Release VOPs and unmount a suspended filesystem. */ int zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) { - ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(ZFS_TEARDOWN_WRITE_HELD(zfsvfs)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); /* * We already own this, so just hold and rele it to update the * objset_t, as the one we had before may have been evicted. */ objset_t *os; VERIFY3P(ds->ds_owner, ==, zfsvfs); VERIFY(dsl_dataset_long_held(ds)); dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); dsl_pool_config_enter(dp, FTAG); VERIFY0(dmu_objset_from_ds(ds, &os)); dsl_pool_config_exit(dp, FTAG); zfsvfs->z_os = os; /* release the VOPs */ rw_exit(&zfsvfs->z_teardown_inactive_lock); - rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + ZFS_TEARDOWN_EXIT(zfsvfs, FTAG); /* * Try to force unmount this file system. */ (void) zfs_umount(zfsvfs->z_sb); zfsvfs->z_unmounted = B_TRUE; return (0); } /* * Automounted snapshots rely on periodic revalidation * to defer snapshots from being automatically unmounted. */ inline void zfs_exit_fs(zfsvfs_t *zfsvfs) { if (!zfsvfs->z_issnap) return; if (time_after(jiffies, zfsvfs->z_snap_defer_time + MAX(zfs_expire_snapshot * HZ / 2, HZ))) { zfsvfs->z_snap_defer_time = jiffies; zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa, dmu_objset_id(zfsvfs->z_os), zfs_expire_snapshot); } } int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (SET_ERROR(EINVAL)); if (newvers < zfsvfs->z_version) return (SET_ERROR(EINVAL)); if (zfs_spa_version_map(newvers) > spa_version(dmu_objset_spa(zfsvfs->z_os))) return (SET_ERROR(ENOTSUP)); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, ZFS_SA_ATTRS); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); } error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); return (error); } error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, &newvers, tx); if (error) { dmu_tx_commit(tx); return (error); } if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { uint64_t sa_obj; ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, SPA_VERSION_SA); sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, DMU_OT_NONE, 0, tx); error = zap_add(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); ASSERT0(error); VERIFY(0 == sa_set_sa_object(os, sa_obj)); sa_register_update_callback(os, zfs_sa_upgrade); } spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, "from %llu to %llu", zfsvfs->z_version, newvers); dmu_tx_commit(tx); zfsvfs->z_version = newvers; os->os_version = newvers; zfs_set_fuid_feature(zfsvfs); return (0); } /* * Read a property stored within the master node. */ int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) { uint64_t *cached_copy = NULL; /* * Figure out where in the objset_t the cached copy would live, if it * is available for the requested property. */ if (os != NULL) { switch (prop) { case ZFS_PROP_VERSION: cached_copy = &os->os_version; break; case ZFS_PROP_NORMALIZE: cached_copy = &os->os_normalization; break; case ZFS_PROP_UTF8ONLY: cached_copy = &os->os_utf8only; break; case ZFS_PROP_CASE: cached_copy = &os->os_casesensitivity; break; default: break; } } if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { *value = *cached_copy; return (0); } /* * If the property wasn't cached, look up the file system's value for * the property. For the version property, we look up a slightly * different string. */ const char *pname; int error = ENOENT; if (prop == ZFS_PROP_VERSION) pname = ZPL_VERSION_STR; else pname = zfs_prop_to_name(prop); if (os != NULL) { ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); } if (error == ENOENT) { /* No value set, use the default value */ switch (prop) { case ZFS_PROP_VERSION: *value = ZPL_VERSION; break; case ZFS_PROP_NORMALIZE: case ZFS_PROP_UTF8ONLY: *value = 0; break; case ZFS_PROP_CASE: *value = ZFS_CASE_SENSITIVE; break; case ZFS_PROP_ACLTYPE: *value = ZFS_ACLTYPE_OFF; break; default: return (error); } error = 0; } /* * If one of the methods for getting the property value above worked, * copy it into the objset_t's cache. */ if (error == 0 && cached_copy != NULL) { *cached_copy = *value; } return (error); } /* * Return true if the corresponding vfs's unmounted flag is set. * Otherwise return false. * If this function returns true we know VFS unmount has been initiated. */ boolean_t zfs_get_vfs_flag_unmounted(objset_t *os) { zfsvfs_t *zfvp; boolean_t unmounted = B_FALSE; ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); mutex_enter(&os->os_user_ptr_lock); zfvp = dmu_objset_get_user(os); if (zfvp != NULL && zfvp->z_unmounted) unmounted = B_TRUE; mutex_exit(&os->os_user_ptr_lock); return (unmounted); } /*ARGSUSED*/ void zfsvfs_update_fromname(const char *oldname, const char *newname) { /* * We don't need to do anything here, the devname is always current by * virtue of zfsvfs->z_sb->s_op->show_devname. */ } void zfs_init(void) { zfsctl_init(); zfs_znode_init(); dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); register_filesystem(&zpl_fs_type); } void zfs_fini(void) { /* * we don't use outstanding because zpl_posix_acl_free might add more. */ taskq_wait(system_delay_taskq); taskq_wait(system_taskq); unregister_filesystem(&zpl_fs_type); zfs_znode_fini(); zfsctl_fini(); } #if defined(_KERNEL) EXPORT_SYMBOL(zfs_suspend_fs); EXPORT_SYMBOL(zfs_resume_fs); EXPORT_SYMBOL(zfs_set_version); EXPORT_SYMBOL(zfsvfs_create); EXPORT_SYMBOL(zfsvfs_free); EXPORT_SYMBOL(zfs_is_readonly); EXPORT_SYMBOL(zfs_domount); EXPORT_SYMBOL(zfs_preumount); EXPORT_SYMBOL(zfs_umount); EXPORT_SYMBOL(zfs_remount); EXPORT_SYMBOL(zfs_statvfs); EXPORT_SYMBOL(zfs_vget); EXPORT_SYMBOL(zfs_prune); #endif Index: head/sys/contrib/openzfs/module/zfs/zfs_ioctl.c =================================================================== --- head/sys/contrib/openzfs/module/zfs/zfs_ioctl.c (revision 367344) +++ head/sys/contrib/openzfs/module/zfs/zfs_ioctl.c (revision 367345) @@ -1,7630 +1,7632 @@ /* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Portions Copyright 2011 Martin Matuska * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. * Portions Copyright 2012 Pawel Jakub Dawidek * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. * Copyright 2016 Nexenta Systems, Inc. All rights reserved. * Copyright (c) 2014, Joyent, Inc. All rights reserved. * Copyright (c) 2011, 2020 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013 Steven Hartland. All rights reserved. * Copyright (c) 2014 Integros [integros.com] * Copyright 2016 Toomas Soome * Copyright (c) 2016 Actifio, Inc. All rights reserved. * Copyright (c) 2018, loli10K . All rights reserved. * Copyright 2017 RackTop Systems. * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. * Copyright (c) 2019 Datto Inc. * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. * Copyright (c) 2019, Klara Inc. * Copyright (c) 2019, Allan Jude */ /* * ZFS ioctls. * * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. * * There are two ways that we handle ioctls: the legacy way where almost * all of the logic is in the ioctl callback, and the new way where most * of the marshalling is handled in the common entry point, zfsdev_ioctl(). * * Non-legacy ioctls should be registered by calling * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked * from userland by lzc_ioctl(). * * The registration arguments are as follows: * * const char *name * The name of the ioctl. This is used for history logging. If the * ioctl returns successfully (the callback returns 0), and allow_log * is true, then a history log entry will be recorded with the input & * output nvlists. The log entry can be printed with "zpool history -i". * * zfs_ioc_t ioc * The ioctl request number, which userland will pass to ioctl(2). * We want newer versions of libzfs and libzfs_core to run against * existing zfs kernel modules (i.e. a deferred reboot after an update). * Therefore the ioctl numbers cannot change from release to release. * * zfs_secpolicy_func_t *secpolicy * This function will be called before the zfs_ioc_func_t, to * determine if this operation is permitted. It should return EPERM * on failure, and 0 on success. Checks include determining if the * dataset is visible in this zone, and if the user has either all * zfs privileges in the zone (SYS_MOUNT), or has been granted permission * to do this operation on this dataset with "zfs allow". * * zfs_ioc_namecheck_t namecheck * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool * name, a dataset name, or nothing. If the name is not well-formed, * the ioctl will fail and the callback will not be called. * Therefore, the callback can assume that the name is well-formed * (e.g. is null-terminated, doesn't have more than one '@' character, * doesn't have invalid characters). * * zfs_ioc_poolcheck_t pool_check * This specifies requirements on the pool state. If the pool does * not meet them (is suspended or is readonly), the ioctl will fail * and the callback will not be called. If any checks are specified * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | * POOL_CHECK_READONLY). * * zfs_ioc_key_t *nvl_keys * The list of expected/allowable innvl input keys. This list is used * to validate the nvlist input to the ioctl. * * boolean_t smush_outnvlist * If smush_outnvlist is true, then the output is presumed to be a * list of errors, and it will be "smushed" down to fit into the * caller's buffer, by removing some entries and replacing them with a * single "N_MORE_ERRORS" entry indicating how many were removed. See * nvlist_smush() for details. If smush_outnvlist is false, and the * outnvlist does not fit into the userland-provided buffer, then the * ioctl will fail with ENOMEM. * * zfs_ioc_func_t *func * The callback function that will perform the operation. * * The callback should return 0 on success, or an error number on * failure. If the function fails, the userland ioctl will return -1, * and errno will be set to the callback's return value. The callback * will be called with the following arguments: * * const char *name * The name of the pool or dataset to operate on, from * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the * expected type (pool, dataset, or none). * * nvlist_t *innvl * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or * NULL if no input nvlist was provided. Changes to this nvlist are * ignored. If the input nvlist could not be deserialized, the * ioctl will fail and the callback will not be called. * * nvlist_t *outnvl * The output nvlist, initially empty. The callback can fill it in, * and it will be returned to userland by serializing it into * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization * fails (e.g. because the caller didn't supply a large enough * buffer), then the overall ioctl will fail. See the * 'smush_nvlist' argument above for additional behaviors. * * There are two typical uses of the output nvlist: * - To return state, e.g. property values. In this case, * smush_outnvlist should be false. If the buffer was not large * enough, the caller will reallocate a larger buffer and try * the ioctl again. * * - To return multiple errors from an ioctl which makes on-disk * changes. In this case, smush_outnvlist should be true. * Ioctls which make on-disk modifications should generally not * use the outnvl if they succeed, because the caller can not * distinguish between the operation failing, and * deserialization failing. * * IOCTL Interface Errors * * The following ioctl input errors can be returned: * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "zfs_deleg.h" #include "zfs_comutil.h" #include #include #include kmutex_t zfsdev_state_lock; zfsdev_state_t *zfsdev_state_list; /* * Limit maximum nvlist size. We don't want users passing in insane values * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. * Defaults to 0=auto which is handled by platform code. */ unsigned long zfs_max_nvlist_src_size = 0; uint_t zfs_fsyncer_key; uint_t zfs_allow_log_key; /* DATA_TYPE_ANY is used when zkey_type can vary. */ #define DATA_TYPE_ANY DATA_TYPE_UNKNOWN typedef struct zfs_ioc_vec { zfs_ioc_legacy_func_t *zvec_legacy_func; zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_allow_log; zfs_ioc_poolcheck_t zvec_pool_check; boolean_t zvec_smush_outnvlist; const char *zvec_name; const zfs_ioc_key_t *zvec_nvl_keys; size_t zvec_nvl_key_count; } zfs_ioc_vec_t; /* This array is indexed by zfs_userquota_prop_t */ static const char *userquota_perms[] = { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_PERM_PROJECTOBJQUOTA, }; static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); static int zfs_check_settable(const char *name, nvpair_t *property, cred_t *cr); static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); static void history_str_free(char *buf) { kmem_free(buf, HIS_MAX_RECORD_LEN); } static char * history_str_get(zfs_cmd_t *zc) { char *buf; if (zc->zc_history == 0) return (NULL); buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); if (copyinstr((void *)(uintptr_t)zc->zc_history, buf, HIS_MAX_RECORD_LEN, NULL) != 0) { history_str_free(buf); return (NULL); } buf[HIS_MAX_RECORD_LEN -1] = '\0'; return (buf); } /* * Return non-zero if the spa version is less than requested version. */ static int zfs_earlier_version(const char *name, int version) { spa_t *spa; if (spa_open(name, &spa, FTAG) == 0) { if (spa_version(spa) < version) { spa_close(spa, FTAG); return (1); } spa_close(spa, FTAG); } return (0); } /* * Return TRUE if the ZPL version is less than requested version. */ static boolean_t zpl_earlier_version(const char *name, int version) { objset_t *os; boolean_t rc = B_TRUE; if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele(os, FTAG); return (B_TRUE); } /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; dmu_objset_rele(os, FTAG); } return (rc); } static void zfs_log_history(zfs_cmd_t *zc) { spa_t *spa; char *buf; if ((buf = history_str_get(zc)) == NULL) return; if (spa_open(zc->zc_name, &spa, FTAG) == 0) { if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) (void) spa_history_log(spa, buf); spa_close(spa, FTAG); } history_str_free(buf); } /* * Policy for top-level read operations (list pools). Requires no privileges, * and can be used in the local zone, as there is no associated dataset. */ /* ARGSUSED */ static int zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (0); } /* * Policy for dataset read operations (list children, get statistics). Requires * no privileges, but must be visible in the local zone. */ /* ARGSUSED */ static int zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (INGLOBALZONE(curproc) || zone_dataset_visible(zc->zc_name, NULL)) return (0); return (SET_ERROR(ENOENT)); } static int zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) { int writable = 1; /* * The dataset must be visible by this zone -- check this first * so they don't see EPERM on something they shouldn't know about. */ if (!INGLOBALZONE(curproc) && !zone_dataset_visible(dataset, &writable)) return (SET_ERROR(ENOENT)); if (INGLOBALZONE(curproc)) { /* * If the fs is zoned, only root can access it from the * global zone. */ if (secpolicy_zfs(cr) && zoned) return (SET_ERROR(EPERM)); } else { /* * If we are in a local zone, the 'zoned' property must be set. */ if (!zoned) return (SET_ERROR(EPERM)); /* must be writable by this zone */ if (!writable) return (SET_ERROR(EPERM)); } return (0); } static int zfs_dozonecheck(const char *dataset, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) { uint64_t zoned; if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) return (SET_ERROR(ENOENT)); return (zfs_dozonecheck_impl(dataset, zoned, cr)); } static int zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, const char *perm, cred_t *cr) { int error; error = zfs_dozonecheck_ds(name, ds, cr); if (error == 0) { error = secpolicy_zfs(cr); if (error != 0) error = dsl_deleg_access_impl(ds, perm, cr); } return (error); } static int zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) { int error; dsl_dataset_t *ds; dsl_pool_t *dp; /* * First do a quick check for root in the global zone, which * is allowed to do all write_perms. This ensures that zfs_ioc_* * will get to handle nonexistent datasets. */ if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) return (0); error = dsl_pool_hold(name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * Policy for setting the security label property. * * Returns 0 for success, non-zero for access and other errors. */ static int zfs_set_slabel_policy(const char *name, const char *strval, cred_t *cr) { #ifdef HAVE_MLSLABEL char ds_hexsl[MAXNAMELEN]; bslabel_t ds_sl, new_sl; boolean_t new_default = FALSE; uint64_t zoned; int needed_priv = -1; int error; /* First get the existing dataset label. */ error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), 1, sizeof (ds_hexsl), &ds_hexsl, NULL); if (error != 0) return (SET_ERROR(EPERM)); if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) new_default = TRUE; /* The label must be translatable */ if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) return (SET_ERROR(EINVAL)); /* * In a non-global zone, disallow attempts to set a label that * doesn't match that of the zone; otherwise no other checks * are needed. */ if (!INGLOBALZONE(curproc)) { if (new_default || !blequal(&new_sl, CR_SL(CRED()))) return (SET_ERROR(EPERM)); return (0); } /* * For global-zone datasets (i.e., those whose zoned property is * "off", verify that the specified new label is valid for the * global zone. */ if (dsl_prop_get_integer(name, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) return (SET_ERROR(EPERM)); if (!zoned) { if (zfs_check_global_label(name, strval) != 0) return (SET_ERROR(EPERM)); } /* * If the existing dataset label is nondefault, check if the * dataset is mounted (label cannot be changed while mounted). * Get the zfsvfs_t; if there isn't one, then the dataset isn't * mounted (or isn't a dataset, doesn't exist, ...). */ if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { objset_t *os; static const char *setsl_tag = "setsl_tag"; /* * Try to own the dataset; abort if there is any error, * (e.g., already mounted, in use, or other error). */ error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE, setsl_tag, &os); if (error != 0) return (SET_ERROR(EPERM)); dmu_objset_disown(os, B_TRUE, setsl_tag); if (new_default) { needed_priv = PRIV_FILE_DOWNGRADE_SL; goto out_check; } if (hexstr_to_label(strval, &new_sl) != 0) return (SET_ERROR(EPERM)); if (blstrictdom(&ds_sl, &new_sl)) needed_priv = PRIV_FILE_DOWNGRADE_SL; else if (blstrictdom(&new_sl, &ds_sl)) needed_priv = PRIV_FILE_UPGRADE_SL; } else { /* dataset currently has a default label */ if (!new_default) needed_priv = PRIV_FILE_UPGRADE_SL; } out_check: if (needed_priv != -1) return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); return (0); #else return (SET_ERROR(ENOTSUP)); #endif /* HAVE_MLSLABEL */ } static int zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, cred_t *cr) { char *strval; /* * Check permissions for special properties. */ switch (prop) { default: break; case ZFS_PROP_ZONED: /* * Disallow setting of 'zoned' from within a local zone. */ if (!INGLOBALZONE(curproc)) return (SET_ERROR(EPERM)); break; case ZFS_PROP_QUOTA: case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (!INGLOBALZONE(curproc)) { uint64_t zoned; char setpoint[ZFS_MAX_DATASET_NAME_LEN]; /* * Unprivileged users are allowed to modify the * limit on things *under* (ie. contained by) * the thing they own. */ if (dsl_prop_get_integer(dsname, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) return (SET_ERROR(EPERM)); if (!zoned || strlen(dsname) <= strlen(setpoint)) return (SET_ERROR(EPERM)); } break; case ZFS_PROP_MLSLABEL: if (!is_system_labeled()) return (SET_ERROR(EPERM)); if (nvpair_value_string(propval, &strval) == 0) { int err; err = zfs_set_slabel_policy(dsname, strval, CRED()); if (err != 0) return (err); } break; } return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } /* ARGSUSED */ static int zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; error = zfs_dozonecheck(zc->zc_name, cr); if (error != 0) return (error); /* * permission to set permissions will be evaluated later in * dsl_deleg_can_allow() */ return (0); } /* ARGSUSED */ static int zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_ROLLBACK, cr)); } /* ARGSUSED */ static int zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *ds; const char *cp; int error; /* * Generate the current snapshot name from the given objsetid, then * use that name for the secpolicy/zone checks. */ cp = strchr(zc->zc_name, '@'); if (cp == NULL) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ds, zc->zc_name); error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, ZFS_DELEG_PERM_SEND, cr); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_SEND, cr)); } static int zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (SET_ERROR(ENOTSUP)); } static int zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (SET_ERROR(ENOTSUP)); } static int zfs_get_parent(const char *datasetname, char *parent, int parentsize) { char *cp; /* * Remove the @bla or /bla from the end of the name to get the parent. */ (void) strncpy(parent, datasetname, parentsize); cp = strrchr(parent, '@'); if (cp != NULL) { cp[0] = '\0'; } else { cp = strrchr(parent, '/'); if (cp == NULL) return (SET_ERROR(ENOENT)); cp[0] = '\0'; } return (0); } int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); } /* ARGSUSED */ static int zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } /* * Destroying snapshots with delegated permissions requires * descendant mount and destroy permissions. */ /* ARGSUSED */ static int zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; nvpair_t *pair, *nextpair; int error = 0; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nextpair) { nextpair = nvlist_next_nvpair(snaps, pair); error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); if (error == ENOENT) { /* * Ignore any snapshots that don't exist (we consider * them "already destroyed"). Remove the name from the * nvl here in case the snapshot is created between * now and when we try to destroy it (in which case * we don't want to destroy it since we haven't * checked for permission). */ fnvlist_remove_nvpair(snaps, pair); error = 0; } if (error != 0) break; } return (error); } int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_RENAME, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(from, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); if ((error = zfs_get_parent(to, parentname, sizeof (parentname))) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (error); } /* ARGSUSED */ static int zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); } /* ARGSUSED */ static int zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { dsl_pool_t *dp; dsl_dataset_t *clone; int error; error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_PROMOTE, cr); if (error != 0) return (error); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); if (error == 0) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_t *origin = NULL; dsl_dir_t *dd; dd = clone->ds_dir; error = dsl_dataset_hold_obj(dd->dd_pool, dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); if (error != 0) { dsl_dataset_rele(clone, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(origin, parentname); if (error == 0) { error = zfs_secpolicy_write_perms_ds(parentname, origin, ZFS_DELEG_PERM_PROMOTE, cr); } dsl_dataset_rele(clone, FTAG); dsl_dataset_rele(origin, FTAG); } dsl_pool_rele(dp, FTAG); return (error); } /* ARGSUSED */ static int zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_RECEIVE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_MOUNT, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CREATE, cr)); } /* ARGSUSED */ static int zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_recv(zc, innvl, cr)); } int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_SNAPSHOT, cr)); } /* * Check for permission to create each snapshot in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvlist_t *snaps; int error = 0; nvpair_t *pair; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { char *name = nvpair_name(pair); char *atp = strchr(name, '@'); if (atp == NULL) { error = SET_ERROR(EINVAL); break; } *atp = '\0'; error = zfs_secpolicy_snapshot_perms(name, cr); *atp = '@'; if (error != 0) break; } return (error); } /* * Check for permission to create each bookmark in the nvlist. */ /* ARGSUSED */ static int zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error = 0; for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_BOOKMARK, cr); *hashp = '#'; if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair, *nextpair; int error = 0; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nextpair) { char *name = nvpair_name(pair); char *hashp = strchr(name, '#'); nextpair = nvlist_next_nvpair(innvl, pair); if (hashp == NULL) { error = SET_ERROR(EINVAL); break; } *hashp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr); *hashp = '#'; if (error == ENOENT) { /* * Ignore any filesystems that don't exist (we consider * their bookmarks "already destroyed"). Remove * the name from the nvl here in case the filesystem * is created between now and when we try to destroy * the bookmark (in which case we don't want to * destroy it since we haven't checked for permission). */ fnvlist_remove_nvpair(innvl, pair); error = 0; } if (error != 0) break; } return (error); } /* ARGSUSED */ static int zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * Even root must have a proper TSD so that we know what pool * to log to. */ if (tsd_get(zfs_allow_log_key) == NULL) return (SET_ERROR(EPERM)); return (0); } static int zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { char parentname[ZFS_MAX_DATASET_NAME_LEN]; int error; char *origin; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) return (error); if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && (error = zfs_secpolicy_write_perms(origin, ZFS_DELEG_PERM_CLONE, cr)) != 0) return (error); if ((error = zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_CREATE, cr)) != 0) return (error); return (zfs_secpolicy_write_perms(parentname, ZFS_DELEG_PERM_MOUNT, cr)); } /* * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires * SYS_CONFIG privilege, which is not available in a local zone. */ /* ARGSUSED */ int zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { if (secpolicy_sys_config(cr, B_FALSE) != 0) return (SET_ERROR(EPERM)); return (0); } /* * Policy for object to name lookups. */ /* ARGSUSED */ static int zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int error; if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) return (0); error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); return (error); } /* * Policy for fault injection. Requires all privileges. */ /* ARGSUSED */ static int zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (secpolicy_zinject(cr)); } /* ARGSUSED */ static int zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); if (prop == ZPROP_INVAL) { if (!zfs_prop_user(zc->zc_value)) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { return (zfs_secpolicy_setprop(zc->zc_name, prop, NULL, cr)); } } static int zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); if (zc->zc_value[0] == 0) { /* * They are asking about a posix uid/gid. If it's * themself, allow it. */ if (zc->zc_objset_type == ZFS_PROP_USERUSED || zc->zc_objset_type == ZFS_PROP_USERQUOTA || zc->zc_objset_type == ZFS_PROP_USEROBJUSED || zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) { if (zc->zc_guid == crgetuid(cr)) return (0); } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED || zc->zc_objset_type == ZFS_PROP_GROUPQUOTA || zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED || zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) { if (groupmember(zc->zc_guid, cr)) return (0); } /* else is for project quota/used */ } return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } static int zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { int err = zfs_secpolicy_read(zc, innvl, cr); if (err) return (err); if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); return (zfs_secpolicy_write_perms(zc->zc_name, userquota_perms[zc->zc_objset_type], cr)); } /* ARGSUSED */ static int zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, NULL, cr)); } /* ARGSUSED */ static int zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; nvlist_t *holds; int error; holds = fnvlist_lookup_nvlist(innvl, "holds"); for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_HOLD, cr); if (error != 0) return (error); } return (0); } /* ARGSUSED */ static int zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { nvpair_t *pair; int error; for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; error = dmu_fsname(nvpair_name(pair), fsname); if (error != 0) return (error); error = zfs_secpolicy_write_perms(fsname, ZFS_DELEG_PERM_RELEASE, cr); if (error != 0) return (error); } return (0); } /* * Policy for allowing temporary snapshots to be taken or released */ static int zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { /* * A temporary snapshot is the same as a snapshot, * hold, destroy and release all rolled into one. * Delegated diff alone is sufficient that we allow this. */ int error; if ((error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr)) == 0) return (0); error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr); if (innvl != NULL) { if (error == 0) error = zfs_secpolicy_hold(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_release(zc, innvl, cr); if (error == 0) error = zfs_secpolicy_destroy(zc, innvl, cr); } return (error); } static int zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_LOAD_KEY, cr)); } static int zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) { return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_CHANGE_KEY, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. */ static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; nvlist_t *list = NULL; /* * Read in and unpack the user-supplied nvlist. */ if (size == 0) return (SET_ERROR(EINVAL)); packed = vmem_alloc(size, KM_SLEEP); if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, iflag)) != 0) { vmem_free(packed, size); return (SET_ERROR(EFAULT)); } if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) { vmem_free(packed, size); return (error); } vmem_free(packed, size); *nvp = list; return (0); } /* * Reduce the size of this nvlist until it can be serialized in 'max' bytes. * Entries will be removed from the end of the nvlist, and one int32 entry * named "N_MORE_ERRORS" will be added indicating how many entries were * removed. */ static int nvlist_smush(nvlist_t *errors, size_t max) { size_t size; size = fnvlist_size(errors); if (size > max) { nvpair_t *more_errors; int n = 0; if (max < 1024) return (SET_ERROR(ENOMEM)); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); more_errors = nvlist_prev_nvpair(errors, NULL); do { nvpair_t *pair = nvlist_prev_nvpair(errors, more_errors); fnvlist_remove_nvpair(errors, pair); n++; size = fnvlist_size(errors); } while (size > max); fnvlist_remove_nvpair(errors, more_errors); fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); ASSERT3U(fnvlist_size(errors), <=, max); } return (0); } static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { char *packed = NULL; int error = 0; size_t size; size = fnvlist_size(nvl); if (size > zc->zc_nvlist_dst_size) { error = SET_ERROR(ENOMEM); } else { packed = fnvlist_pack(nvl, &size); if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) error = SET_ERROR(EFAULT); fnvlist_pack_free(packed, size); } zc->zc_nvlist_dst_size = size; zc->zc_nvlist_dst_filled = B_TRUE; return (error); } int getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) { int error = 0; if (dmu_objset_type(os) != DMU_OST_ZFS) { return (SET_ERROR(EINVAL)); } mutex_enter(&os->os_user_ptr_lock); *zfvp = dmu_objset_get_user(os); /* bump s_active only when non-zero to prevent umount race */ error = zfs_vfs_ref(zfvp); mutex_exit(&os->os_user_ptr_lock); return (error); } int getzfsvfs(const char *dsname, zfsvfs_t **zfvp) { objset_t *os; int error; error = dmu_objset_hold(dsname, FTAG, &os); if (error != 0) return (error); error = getzfsvfs_impl(os, zfvp); dmu_objset_rele(os, FTAG); return (error); } /* * Find a zfsvfs_t for a mounted filesystem, or create our own, in which * case its z_sb will be NULL, and it will be opened as the owner. * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, * which prevents all inode ops from running. */ static int zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) { int error = 0; if (getzfsvfs(name, zfvp) != 0) error = zfsvfs_create(name, B_FALSE, zfvp); if (error == 0) { - rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : - RW_READER, tag); + if (writer) + ZFS_TEARDOWN_ENTER_WRITE(*zfvp, tag); + else + ZFS_TEARDOWN_ENTER_READ(*zfvp, tag); if ((*zfvp)->z_unmounted) { /* * XXX we could probably try again, since the unmounting * thread should be just about to disassociate the * objset from the zfsvfs. */ - rrm_exit(&(*zfvp)->z_teardown_lock, tag); + ZFS_TEARDOWN_EXIT(*zfvp, tag); return (SET_ERROR(EBUSY)); } } return (error); } static void zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) { - rrm_exit(&zfsvfs->z_teardown_lock, tag); + ZFS_TEARDOWN_EXIT(zfsvfs, tag); if (zfs_vfs_held(zfsvfs)) { zfs_vfs_rele(zfsvfs); } else { dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); zfsvfs_free(zfsvfs); } } static int zfs_ioc_pool_create(zfs_cmd_t *zc) { int error; nvlist_t *config, *props = NULL; nvlist_t *rootprops = NULL; nvlist_t *zplprops = NULL; dsl_crypto_params_t *dcp = NULL; const char *spa_name = zc->zc_name; boolean_t unload_wkey = B_TRUE; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (props) { nvlist_t *nvl = NULL; nvlist_t *hidden_args = NULL; uint64_t version = SPA_VERSION; char *tname; (void) nvlist_lookup_uint64(props, zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); if (!SPA_VERSION_IS_SUPPORTED(version)) { error = SET_ERROR(EINVAL); goto pool_props_bad; } (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); if (nvl) { error = nvlist_dup(nvl, &rootprops, KM_SLEEP); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); } (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, &hidden_args); error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, rootprops, hidden_args, &dcp); if (error != 0) goto pool_props_bad; (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops_root(version, rootprops, zplprops, NULL); if (error != 0) goto pool_props_bad; if (nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) spa_name = tname; } error = spa_create(zc->zc_name, config, props, zplprops, dcp); /* * Set the remaining root properties */ if (!error && (error = zfs_set_prop_nvlist(spa_name, ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) { (void) spa_destroy(spa_name); unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */ } pool_props_bad: nvlist_free(rootprops); nvlist_free(zplprops); nvlist_free(config); nvlist_free(props); dsl_crypto_params_free(dcp, unload_wkey && !!error); return (error); } static int zfs_ioc_pool_destroy(zfs_cmd_t *zc) { int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { nvlist_t *config, *props = NULL; uint64_t guid; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { nvlist_free(config); return (error); } if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != zc->zc_guid) error = SET_ERROR(EINVAL); else error = spa_import(zc->zc_name, config, props, zc->zc_cookie); if (zc->zc_nvlist_dst != 0) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; } nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); error = spa_export(zc->zc_name, NULL, force, hardforce); return (error); } static int zfs_ioc_pool_configs(zfs_cmd_t *zc) { nvlist_t *configs; int error; if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) return (SET_ERROR(EEXIST)); error = put_nvlist(zc, configs); nvlist_free(configs); return (error); } /* * inputs: * zc_name name of the pool * * outputs: * zc_cookie real errno * zc_nvlist_dst config nvlist * zc_nvlist_dst_size size of config nvlist */ static int zfs_ioc_pool_stats(zfs_cmd_t *zc) { nvlist_t *config; int error; int ret = 0; error = spa_get_stats(zc->zc_name, &config, zc->zc_value, sizeof (zc->zc_value)); if (config != NULL) { ret = put_nvlist(zc, config); nvlist_free(config); /* * The config may be present even if 'error' is non-zero. * In this case we return success, and preserve the real errno * in 'zc_cookie'. */ zc->zc_cookie = error; } else { ret = error; } return (ret); } /* * Try to import the given pool, returning pool stats as appropriate so that * user land knows which devices are available and overall pool health. */ static int zfs_ioc_pool_tryimport(zfs_cmd_t *zc) { nvlist_t *tryconfig, *config = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); nvlist_free(tryconfig); if (config == NULL) return (SET_ERROR(EINVAL)); error = put_nvlist(zc, config); nvlist_free(config); return (error); } /* * inputs: * zc_name name of the pool * zc_cookie scan func (pool_scan_func_t) * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) */ static int zfs_ioc_pool_scan(zfs_cmd_t *zc) { spa_t *spa; int error; if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_flags == POOL_SCRUB_PAUSE) error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); else if (zc->zc_cookie == POOL_SCAN_NONE) error = spa_scan_stop(spa); else error = spa_scan(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_freeze(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { spa_freeze(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_pool_upgrade(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (zc->zc_cookie < spa_version(spa) || !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { spa_close(spa, FTAG); return (SET_ERROR(EINVAL)); } spa_upgrade(spa, zc->zc_cookie); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_history(zfs_cmd_t *zc) { spa_t *spa; char *hist_buf; uint64_t size; int error; if ((size = zc->zc_history_len) == 0) return (SET_ERROR(EINVAL)); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } hist_buf = vmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { error = ddi_copyout(hist_buf, (void *)(uintptr_t)zc->zc_history, zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); vmem_free(hist_buf, size); return (error); } static int zfs_ioc_pool_reguid(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error == 0) { error = spa_change_guid(spa); spa_close(spa, FTAG); } return (error); } static int zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) { return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_value name of object */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_obj object to find * * outputs: * zc_stat stats on object * zc_value path to object */ static int zfs_ioc_obj_to_stats(zfs_cmd_t *zc) { objset_t *os; int error; /* XXX reading from objset not owned */ if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os)) != 0) return (error); if (dmu_objset_type(os) != DMU_OST_ZFS) { dmu_objset_rele_flags(os, B_TRUE, FTAG); return (SET_ERROR(EINVAL)); } error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, sizeof (zc->zc_value)); dmu_objset_rele_flags(os, B_TRUE, FTAG); return (error); } static int zfs_ioc_vdev_add(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *config; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config); if (error == 0) { error = spa_vdev_add(spa, config); nvlist_free(config); } spa_close(spa, FTAG); return (error); } /* * inputs: * zc_name name of the pool * zc_guid guid of vdev to remove * zc_cookie cancel removal */ static int zfs_ioc_vdev_remove(zfs_cmd_t *zc) { spa_t *spa; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); if (zc->zc_cookie != 0) { error = spa_vdev_remove_cancel(spa); } else { error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_set_state(zfs_cmd_t *zc) { spa_t *spa; int error; vdev_state_t newstate = VDEV_STATE_UNKNOWN; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); switch (zc->zc_cookie) { case VDEV_STATE_ONLINE: error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); break; case VDEV_STATE_OFFLINE: error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_FAULTED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL && zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && zc->zc_obj != VDEV_AUX_EXTERNAL) zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: error = SET_ERROR(EINVAL); } zc->zc_cookie = newstate; spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_attach(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config; int replacing = zc->zc_cookie; int rebuild = zc->zc_simple; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, rebuild); nvlist_free(config); } spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_detach(zfs_cmd_t *zc) { spa_t *spa; int error; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; nvlist_t *config, *props = NULL; int error; boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &config))) { spa_close(spa, FTAG); return (error); } if (zc->zc_nvlist_src_size != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) { spa_close(spa, FTAG); nvlist_free(config); return (error); } error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); spa_close(spa, FTAG); nvlist_free(config); nvlist_free(props); return (error); } static int zfs_ioc_vdev_setpath(zfs_cmd_t *zc) { spa_t *spa; const char *path = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setpath(spa, guid, path); spa_close(spa, FTAG); return (error); } static int zfs_ioc_vdev_setfru(zfs_cmd_t *zc) { spa_t *spa; const char *fru = zc->zc_value; uint64_t guid = zc->zc_guid; int error; error = spa_open(zc->zc_name, &spa, FTAG); if (error != 0) return (error); error = spa_vdev_setfru(spa, guid, fru); spa_close(spa, FTAG); return (error); } static int zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) { int error = 0; nvlist_t *nv; dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... * XXX reading without owning */ if (!zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZVOL) { error = zvol_get_stats(os, nv); if (error == EIO) { nvlist_free(nv); return (error); } VERIFY0(error); } if (error == 0) error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_objset_stats(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error == 0) { error = zfs_ioc_objset_stats_impl(zc, os); dmu_objset_rele(os, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_nvlist_dst received property nvlist * zc_nvlist_dst_size size of received property nvlist * * Gets received properties (distinct from local properties on or after * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from * local property values. */ static int zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) { int error = 0; nvlist_t *nv; /* * Without this check, we would return local property values if the * caller has not already received properties on or after * SPA_VERSION_RECVD_PROPS. */ if (!dsl_prop_get_hasrecvd(zc->zc_name)) return (SET_ERROR(ENOTSUP)); if (zc->zc_nvlist_dst != 0 && (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { error = put_nvlist(zc, nv); nvlist_free(nv); } return (error); } static int nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) { uint64_t value; int error; /* * zfs_get_zplprop() will either find a value or give us * the default value (if there is one). */ if ((error = zfs_get_zplprop(os, prop, &value)) != 0) return (error); VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); return (0); } /* * inputs: * zc_name name of filesystem * zc_nvlist_dst_size size of buffer for zpl property nvlist * * outputs: * zc_nvlist_dst zpl property nvlist * zc_nvlist_dst_size size of zpl property nvlist */ static int zfs_ioc_objset_zplprops(zfs_cmd_t *zc) { objset_t *os; int err; /* XXX reading without owning */ if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os))) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); /* * NB: nvl_add_zplprop() will read the objset contents, * which we aren't supposed to do with a DS_MODE_USER * hold, because it could be inconsistent. */ if (zc->zc_nvlist_dst != 0 && !zc->zc_objset_stats.dds_inconsistent && dmu_objset_type(os) == DMU_OST_ZFS) { nvlist_t *nv; VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) err = put_nvlist(zc, nv); nvlist_free(nv); } else { err = SET_ERROR(ENOENT); } dmu_objset_rele(os, FTAG); return (err); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_dst_size size of buffer for property nvlist * * outputs: * zc_name name of next filesystem * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_dataset_list_next(zfs_cmd_t *zc) { objset_t *os; int error; char *p; size_t orig_len = strlen(zc->zc_name); top: if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) { if (error == ENOENT) error = SET_ERROR(ESRCH); return (error); } p = strrchr(zc->zc_name, '/'); if (p == NULL || p[1] != '\0') (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = SET_ERROR(ESRCH); } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name)); dmu_objset_rele(os, FTAG); /* * If it's an internal dataset (ie. with a '$' in its name), * don't try to get stats for it, otherwise we'll return ENOENT. */ if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ if (error == ENOENT) { /* We lost a race with destroy, get the next one. */ zc->zc_name[orig_len] = '\0'; goto top; } } return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_nvlist_src iteration range nvlist * zc_nvlist_src_size size of iteration range nvlist * * outputs: * zc_name name of next snapshot * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist */ static int zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) { int error; objset_t *os, *ossnap; dsl_dataset_t *ds; uint64_t min_txg = 0, max_txg = 0; if (zc->zc_nvlist_src_size != 0) { nvlist_t *props = NULL; error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props); if (error != 0) return (error); (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, &min_txg); (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, &max_txg); nvlist_free(props); } error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) { return (error == ENOENT ? SET_ERROR(ESRCH) : error); } /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= ZFS_MAX_DATASET_NAME_LEN) { dmu_objset_rele(os, FTAG); return (SET_ERROR(ESRCH)); } while (error == 0) { if (issig(JUSTLOOKING) && issig(FORREAL)) { error = SET_ERROR(EINTR); break; } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), &zc->zc_obj, &zc->zc_cookie, NULL); if (error == ENOENT) { error = SET_ERROR(ESRCH); break; } else if (error != 0) { break; } error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj, FTAG, &ds); if (error != 0) break; if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) || (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) { dsl_dataset_rele(ds, FTAG); /* undo snapshot name append */ *(strchr(zc->zc_name, '@') + 1) = '\0'; /* skip snapshot */ continue; } if (zc->zc_simple) { dsl_dataset_rele(ds, FTAG); break; } if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) { dsl_dataset_rele(ds, FTAG); break; } dsl_dataset_rele(ds, FTAG); break; } dmu_objset_rele(os, FTAG); /* if we failed, undo the @ that we tacked on to zc_name */ if (error != 0) *strchr(zc->zc_name, '@') = '\0'; return (error); } static int zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { const char *propname = nvpair_name(pair); uint64_t *valary; unsigned int vallen; const char *dash, *domain; zfs_userquota_prop_t type; uint64_t rid; uint64_t quota; zfsvfs_t *zfsvfs; int err; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) != 0) return (SET_ERROR(EINVAL)); } /* * A correctly constructed propname is encoded as * userquota@-. */ if ((dash = strchr(propname, '-')) == NULL || nvpair_value_uint64_array(pair, &valary, &vallen) != 0 || vallen != 3) return (SET_ERROR(EINVAL)); domain = dash + 1; type = valary[0]; rid = valary[1]; quota = valary[2]; err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE); if (err == 0) { err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); zfsvfs_rele(zfsvfs, FTAG); } return (err); } /* * If the named property is one that has a special function to set its value, * return 0 on success and a positive error code on failure; otherwise if it is * not one of the special properties handled by this function, return -1. * * XXX: It would be better for callers of the property interface if we handled * these special cases in dsl_prop.c (in the dsl layer). */ static int zfs_prop_set_special(const char *dsname, zprop_source_t source, nvpair_t *pair) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval = 0; const char *strval = NULL; int err = -1; if (prop == ZPROP_INVAL) { if (zfs_prop_userquota(propname)) return (zfs_prop_set_userquota(dsname, pair)); return (-1); } if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* all special properties are numeric except for keylocation */ if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { strval = fnvpair_value_string(pair); } else { intval = fnvpair_value_uint64(pair); } switch (prop) { case ZFS_PROP_QUOTA: err = dsl_dir_set_quota(dsname, source, intval); break; case ZFS_PROP_REFQUOTA: err = dsl_dataset_set_refquota(dsname, source, intval); break; case ZFS_PROP_FILESYSTEM_LIMIT: case ZFS_PROP_SNAPSHOT_LIMIT: if (intval == UINT64_MAX) { /* clearing the limit, just do it */ err = 0; } else { err = dsl_dir_activate_fs_ss_limit(dsname); } /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_KEYLOCATION: err = dsl_crypto_can_set_keylocation(dsname, strval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_RESERVATION: err = dsl_dir_set_reservation(dsname, source, intval); break; case ZFS_PROP_REFRESERVATION: err = dsl_dataset_set_refreservation(dsname, source, intval); break; case ZFS_PROP_COMPRESSION: err = dsl_dataset_set_compression(dsname, source, intval); /* * Set err to -1 to force the zfs_set_prop_nvlist code down the * default path to set the value in the nvlist. */ if (err == 0) err = -1; break; case ZFS_PROP_VOLSIZE: err = zvol_set_volsize(dsname, intval); break; case ZFS_PROP_SNAPDEV: err = zvol_set_snapdev(dsname, source, intval); break; case ZFS_PROP_VOLMODE: err = zvol_set_volmode(dsname, source, intval); break; case ZFS_PROP_VERSION: { zfsvfs_t *zfsvfs; if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) break; err = zfs_set_version(zfsvfs, intval); zfsvfs_rele(zfsvfs, FTAG); if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { zfs_cmd_t *zc; zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dsname, sizeof (zc->zc_name)); (void) zfs_ioc_userspace_upgrade(zc); (void) zfs_ioc_id_quota_upgrade(zc); kmem_free(zc, sizeof (zfs_cmd_t)); } break; } default: err = -1; } return (err); } /* * This function is best effort. If it fails to set any of the given properties, * it continues to set as many as it can and returns the last error * encountered. If the caller provides a non-NULL errlist, it will be filled in * with the list of names of all the properties that failed along with the * corresponding error numbers. * * If every property is set successfully, zero is returned and errlist is not * modified. */ int zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, nvlist_t *errlist) { nvpair_t *pair; nvpair_t *propval; int rv = 0; uint64_t intval; const char *strval; nvlist_t *genericnvl = fnvlist_alloc(); nvlist_t *retrynvl = fnvlist_alloc(); retry: pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); zfs_prop_t prop = zfs_name_to_prop(propname); int err = 0; /* decode the property value */ propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &propval) != 0) err = SET_ERROR(EINVAL); } /* Validate value type */ if (err == 0 && source == ZPROP_SRC_INHERITED) { /* inherited properties are expected to be booleans */ if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) err = SET_ERROR(EINVAL); } else if (err == 0 && prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if (nvpair_type(propval) != DATA_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (zfs_prop_userquota(propname)) { if (nvpair_type(propval) != DATA_TYPE_UINT64_ARRAY) err = SET_ERROR(EINVAL); } else { err = SET_ERROR(EINVAL); } } else if (err == 0) { if (nvpair_type(propval) == DATA_TYPE_STRING) { if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) err = SET_ERROR(EINVAL); } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; intval = fnvpair_value_uint64(propval); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: err = SET_ERROR(EINVAL); break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) err = SET_ERROR(ZFS_ERR_BADPROP); break; default: cmn_err(CE_PANIC, "unknown property type"); } } else { err = SET_ERROR(EINVAL); } } /* Validate permissions */ if (err == 0) err = zfs_check_settable(dsname, pair, CRED()); if (err == 0) { if (source == ZPROP_SRC_INHERITED) err = -1; /* does not need special handling */ else err = zfs_prop_set_special(dsname, source, pair); if (err == -1) { /* * For better performance we build up a list of * properties to set in a single transaction. */ err = nvlist_add_nvpair(genericnvl, pair); } else if (err != 0 && nvl != retrynvl) { /* * This may be a spurious error caused by * receiving quota and reservation out of order. * Try again in a second pass. */ err = nvlist_add_nvpair(retrynvl, pair); } } if (err != 0) { if (errlist != NULL) fnvlist_add_int32(errlist, propname, err); rv = err; } } if (nvl != retrynvl && !nvlist_empty(retrynvl)) { nvl = retrynvl; goto retry; } if (!nvlist_empty(genericnvl) && dsl_props_set(dsname, source, genericnvl) != 0) { /* * If this fails, we still want to set as many properties as we * can, so try setting them individually. */ pair = NULL; while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { const char *propname = nvpair_name(pair); int err = 0; propval = pair; if (nvpair_type(pair) == DATA_TYPE_NVLIST) { nvlist_t *attrs; attrs = fnvpair_value_nvlist(pair); propval = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); } if (nvpair_type(propval) == DATA_TYPE_STRING) { strval = fnvpair_value_string(propval); err = dsl_prop_set_string(dsname, propname, source, strval); } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { err = dsl_prop_inherit(dsname, propname, source); } else { intval = fnvpair_value_uint64(propval); err = dsl_prop_set_int(dsname, propname, source, intval); } if (err != 0) { if (errlist != NULL) { fnvlist_add_int32(errlist, propname, err); } rv = err; } } } nvlist_free(genericnvl); nvlist_free(retrynvl); return (rv); } /* * Check that all the properties are valid user properties. */ static int zfs_check_userprops(nvlist_t *nvl) { nvpair_t *pair = NULL; while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { const char *propname = nvpair_name(pair); if (!zfs_prop_user(propname) || nvpair_type(pair) != DATA_TYPE_STRING) return (SET_ERROR(EINVAL)); if (strlen(propname) >= ZAP_MAXNAMELEN) return (SET_ERROR(ENAMETOOLONG)); if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) return (SET_ERROR(E2BIG)); } return (0); } static void props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) { nvpair_t *pair; VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); pair = NULL; while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { if (nvlist_exists(skipped, nvpair_name(pair))) continue; VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); } } static int clear_received_props(const char *dsname, nvlist_t *props, nvlist_t *skipped) { int err = 0; nvlist_t *cleared_props = NULL; props_skip(props, skipped, &cleared_props); if (!nvlist_empty(cleared_props)) { /* * Acts on local properties until the dataset has received * properties at least once on or after SPA_VERSION_RECVD_PROPS. */ zprop_source_t flags = (ZPROP_SRC_NONE | (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); } nvlist_free(cleared_props); return (err); } /* * inputs: * zc_name name of filesystem * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply * zc_cookie received properties flag * * outputs: * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_RECEIVED : ZPROP_SRC_LOCAL); nvlist_t *errors; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &nvl)) != 0) return (error); if (received) { nvlist_t *origprops; if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { (void) clear_received_props(zc->zc_name, origprops, nvl); nvlist_free(origprops); } error = dsl_prop_set_hasrecvd(zc->zc_name); } errors = fnvlist_alloc(); if (error == 0) error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); if (zc->zc_nvlist_dst != 0 && errors != NULL) { (void) put_nvlist(zc, errors); } nvlist_free(errors); nvlist_free(nvl); return (error); } /* * inputs: * zc_name name of filesystem * zc_value name of property to inherit * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { const char *propname = zc->zc_value; zfs_prop_t prop = zfs_name_to_prop(propname); boolean_t received = zc->zc_cookie; zprop_source_t source = (received ? ZPROP_SRC_NONE /* revert to received value, if any */ : ZPROP_SRC_INHERITED); /* explicitly inherit */ nvlist_t *dummy; nvpair_t *pair; zprop_type_t type; int err; if (!received) { /* * Only check this in the non-received case. We want to allow * 'inherit -S' to revert non-inheritable properties like quota * and reservation to the received or default values even though * they are not considered inheritable. */ if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) return (SET_ERROR(EINVAL)); } if (prop == ZPROP_INVAL) { if (!zfs_prop_user(propname)) return (SET_ERROR(EINVAL)); type = PROP_TYPE_STRING; } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { return (SET_ERROR(EINVAL)); } else { type = zfs_prop_get_type(prop); } /* * zfs_prop_set_special() expects properties in the form of an * nvpair with type info. */ dummy = fnvlist_alloc(); switch (type) { case PROP_TYPE_STRING: VERIFY(0 == nvlist_add_string(dummy, propname, "")); break; case PROP_TYPE_NUMBER: case PROP_TYPE_INDEX: VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); break; default: err = SET_ERROR(EINVAL); goto errout; } pair = nvlist_next_nvpair(dummy, NULL); if (pair == NULL) { err = SET_ERROR(EINVAL); } else { err = zfs_prop_set_special(zc->zc_name, source, pair); if (err == -1) /* property is not "special", needs handling */ err = dsl_prop_inherit(zc->zc_name, zc->zc_value, source); } errout: nvlist_free(dummy); return (err); } static int zfs_ioc_pool_set_props(zfs_cmd_t *zc) { nvlist_t *props; spa_t *spa; int error; nvpair_t *pair; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &props))) return (error); /* * If the only property is the configfile, then just do a spa_lookup() * to handle the faulted case. */ pair = nvlist_next_nvpair(props, NULL); if (pair != NULL && strcmp(nvpair_name(pair), zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && nvlist_next_nvpair(props, pair) == NULL) { mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) { spa_configfile_set(spa, props, B_FALSE); spa_write_cachefile(spa, B_FALSE, B_TRUE); } mutex_exit(&spa_namespace_lock); if (spa != NULL) { nvlist_free(props); return (0); } } if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); } error = spa_prop_set(spa, props); nvlist_free(props); spa_close(spa, FTAG); return (error); } static int zfs_ioc_pool_get_props(zfs_cmd_t *zc) { spa_t *spa; int error; nvlist_t *nvp = NULL; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { /* * If the pool is faulted, there may be properties we can still * get (such as altroot and cachefile), so attempt to get them * anyway. */ mutex_enter(&spa_namespace_lock); if ((spa = spa_lookup(zc->zc_name)) != NULL) error = spa_prop_get(spa, &nvp); mutex_exit(&spa_namespace_lock); } else { error = spa_prop_get(spa, &nvp); spa_close(spa, FTAG); } if (error == 0 && zc->zc_nvlist_dst != 0) error = put_nvlist(zc, nvp); else error = SET_ERROR(EFAULT); nvlist_free(nvp); return (error); } /* * inputs: * zc_name name of filesystem * zc_nvlist_src{_size} nvlist of delegated permissions * zc_perm_action allow/unallow flag * * outputs: none */ static int zfs_ioc_set_fsacl(zfs_cmd_t *zc) { int error; nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &fsaclnv)) != 0) return (error); /* * Verify nvlist is constructed correctly */ if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { nvlist_free(fsaclnv); return (SET_ERROR(EINVAL)); } /* * If we don't have PRIV_SYS_MOUNT, then validate * that user is allowed to hand out each permission in * the nvlist(s) */ error = secpolicy_zfs(CRED()); if (error != 0) { if (zc->zc_perm_action == B_FALSE) { error = dsl_deleg_can_allow(zc->zc_name, fsaclnv, CRED()); } else { error = dsl_deleg_can_unallow(zc->zc_name, fsaclnv, CRED()); } } if (error == 0) error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); nvlist_free(fsaclnv); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_nvlist_src{_size} nvlist of delegated permissions */ static int zfs_ioc_get_fsacl(zfs_cmd_t *zc) { nvlist_t *nvp; int error; if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { error = put_nvlist(zc, nvp); nvlist_free(nvp); } return (error); } /* ARGSUSED */ static void zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { zfs_creat_t *zct = arg; zfs_create_fs(os, cr, zct->zct_zplprops, tx); } #define ZFS_PROP_UNDEFINED ((uint64_t)-1) /* * inputs: * os parent objset pointer (NULL if root fs) * fuids_ok fuids allowed in this version of the spa? * sa_ok SAs allowed in this version of the spa? * createprops list of properties requested by creator * * outputs: * zplprops values for the zplprops we attach to the master node object * is_ci true if requested file system will be purely case-insensitive * * Determine the settings for utf8only, normalization and * casesensitivity. Specific values may have been requested by the * creator and/or we can inherit values from the parent dataset. If * the file system is of too early a vintage, a creator can not * request settings for these properties, even if the requested * setting is the default value. We don't actually want to create dsl * properties for these, so remove them from the source nvlist after * processing. */ static int zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; int error; ASSERT(zplprops != NULL); /* parent dataset must be a filesystem */ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS) return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); /* * Pull out creator prop choices, if any. */ if (createprops) { (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_VERSION), &zplver); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); (void) nvlist_lookup_uint64(createprops, zfs_prop_to_name(ZFS_PROP_CASE), &sense); (void) nvlist_remove_all(createprops, zfs_prop_to_name(ZFS_PROP_CASE)); } /* * If the zpl version requested is whacky or the file system * or pool is version is too "young" to support normalization * and the creator tried to set a value for one of the props, * error out. */ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) || (zplver >= ZPL_VERSION_FUID && !fuids_ok) || (zplver >= ZPL_VERSION_SA && !sa_ok) || (zplver < ZPL_VERSION_NORMALIZATION && (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED || sense != ZFS_PROP_UNDEFINED))) return (SET_ERROR(ENOTSUP)); /* * Put the version in the zplprops */ VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0); if (norm == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0); /* * If we're normalizing, names must always be valid UTF-8 strings. */ if (norm) u8 = 1; if (u8 == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); if (sense == ZFS_PROP_UNDEFINED && (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) return (error); VERIFY(nvlist_add_uint64(zplprops, zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); if (is_ci) *is_ci = (sense == ZFS_CASE_INSENSITIVE); return (0); } static int zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok, sa_ok; uint64_t zplver = ZPL_VERSION; objset_t *os = NULL; char parentname[ZFS_MAX_DATASET_NAME_LEN]; spa_t *spa; uint64_t spa_vers; int error; zfs_get_parent(dataset, parentname, sizeof (parentname)); if ((error = spa_open(dataset, &spa, FTAG)) != 0) return (error); spa_vers = spa_version(spa); spa_close(spa, FTAG); zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); /* * Open parent object set so we can inherit zplprop values. */ if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); dmu_objset_rele(os, FTAG); return (error); } static int zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { boolean_t fuids_ok; boolean_t sa_ok; uint64_t zplver = ZPL_VERSION; int error; zplver = zfs_zpl_version_map(spa_vers); fuids_ok = (zplver >= ZPL_VERSION_FUID); sa_ok = (zplver >= ZPL_VERSION_SA); error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, createprops, zplprops, is_ci); return (error); } /* * innvl: { * "type" -> dmu_objset_type_t (int32) * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_create[] = { {"type", DATA_TYPE_INT32, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; zfs_creat_t zct = { 0 }; nvlist_t *nvprops = NULL; nvlist_t *hidden_args = NULL; void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); dmu_objset_type_t type; boolean_t is_insensitive = B_FALSE; dsl_crypto_params_t *dcp = NULL; type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); switch (type) { case DMU_OST_ZFS: cbfunc = zfs_create_cb; break; case DMU_OST_ZVOL: cbfunc = zvol_create_cb; break; default: cbfunc = NULL; break; } if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); zct.zct_props = nvprops; if (cbfunc == NULL) return (SET_ERROR(EINVAL)); if (type == DMU_OST_ZVOL) { uint64_t volsize, volblocksize; if (nvprops == NULL) return (SET_ERROR(EINVAL)); if (nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) return (SET_ERROR(EINVAL)); if ((error = nvlist_lookup_uint64(nvprops, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize)) != 0 && error != ENOENT) return (SET_ERROR(EINVAL)); if (error != 0) volblocksize = zfs_prop_default_numeric( ZFS_PROP_VOLBLOCKSIZE); if ((error = zvol_check_volblocksize(fsname, volblocksize)) != 0 || (error = zvol_check_volsize(volsize, volblocksize)) != 0) return (error); } else if (type == DMU_OST_ZFS) { int error; /* * We have to have normalization and * case-folding flags correct when we do the * file system creation, so go figure them out * now. */ VERIFY(nvlist_alloc(&zct.zct_zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); error = zfs_fill_zplprops(fsname, nvprops, zct.zct_zplprops, &is_insensitive); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } } error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, hidden_args, &dcp); if (error != 0) { nvlist_free(zct.zct_zplprops); return (error); } error = dmu_objset_create(fsname, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); nvlist_free(zct.zct_zplprops); dsl_crypto_params_free(dcp, !!error); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) { spa_t *spa; int error2; /* * Volumes will return EBUSY and cannot be destroyed * until all asynchronous minor handling (e.g. from * setting the volmode property) has completed. Wait for * the spa_zvol_taskq to drain then retry. */ error2 = dsl_destroy_head(fsname); while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { error2 = spa_open(fsname, &spa, FTAG); if (error2 == 0) { taskq_wait(spa->spa_zvol_taskq); spa_close(spa, FTAG); } error2 = dsl_destroy_head(fsname); } } } return (error); } /* * innvl: { * "origin" -> name of origin snapshot * (optional) "props" -> { prop -> value } * (optional) "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * } * * outputs: * outnvl: propname -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_clone[] = { {"origin", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { int error = 0; nvlist_t *nvprops = NULL; const char *origin_name; origin_name = fnvlist_lookup_string(innvl, "origin"); (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); if (strchr(fsname, '@') || strchr(fsname, '%')) return (SET_ERROR(EINVAL)); if (dataset_namecheck(origin_name, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); error = dmu_objset_clone(fsname, origin_name); /* * It would be nice to do this atomically. */ if (error == 0) { error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, nvprops, outnvl); if (error != 0) (void) dsl_destroy_head(fsname); } return (error); } static const zfs_ioc_key_t zfs_keys_remap[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { /* This IOCTL is no longer supported. */ return (0); } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional) "props" -> { prop -> value (string) } * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_snapshot[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *snaps; nvlist_t *props = NULL; int error, poollen; nvpair_t *pair; (void) nvlist_lookup_nvlist(innvl, "props", &props); if (!nvlist_empty(props) && zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) return (SET_ERROR(ENOTSUP)); if ((error = zfs_check_userprops(props)) != 0) return (error); snaps = fnvlist_lookup_nvlist(innvl, "snaps"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); char *cp = strchr(name, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The snap must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); /* * Check for permission to set the properties on the fs. */ if (!nvlist_empty(props)) { *cp = '\0'; error = zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_USERPROP, CRED()); *cp = '@'; if (error != 0) return (error); } /* This must be the only snap of this fs. */ for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { if (strncmp(name, nvpair_name(pair2), cp - name + 1) == 0) { return (SET_ERROR(EXDEV)); } } } error = dsl_dataset_snapshot(snaps, props, outnvl); return (error); } /* * innvl: "message" -> string */ static const zfs_ioc_key_t zfs_keys_log_history[] = { {"message", DATA_TYPE_STRING, 0}, }; /* ARGSUSED */ static int zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) { const char *message; char *poolname; spa_t *spa; int error; /* * The poolname in the ioctl is not set, we get it from the TSD, * which was set at the end of the last successful ioctl that allows * logging. The secpolicy func already checked that it is set. * Only one log ioctl is allowed after each successful ioctl, so * we clear the TSD here. */ poolname = tsd_get(zfs_allow_log_key); if (poolname == NULL) return (SET_ERROR(EINVAL)); (void) tsd_set(zfs_allow_log_key, NULL); error = spa_open(poolname, &spa, FTAG); kmem_strfree(poolname); if (error != 0) return (error); message = fnvlist_lookup_string(innvl, "message"); if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } error = spa_history_log(spa, message); spa_close(spa, FTAG); return (error); } /* * This ioctl is used to set the bootenv configuration on the current * pool. This configuration is stored in the second padding area of the label, * and it is used by the bootloader(s) to store the bootloader and/or system * specific data. * The data is stored as nvlist data stream, and is protected by * an embedded checksum. * The version can have two possible values: * VB_RAW: nvlist should have key GRUB_ENVMAP, value DATA_TYPE_STRING. * VB_NVLIST: nvlist with arbitrary pairs. */ static const zfs_ioc_key_t zfs_keys_set_bootenv[] = { {"version", DATA_TYPE_UINT64, 0}, {"", DATA_TYPE_ANY, ZK_OPTIONAL | ZK_WILDCARDLIST}, }; static int zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int error; spa_t *spa; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_write_bootenv(spa->spa_root_vdev, innvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } static const zfs_ioc_key_t zfs_keys_get_bootenv[] = { /* no nvl keys */ }; static int zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; if ((error = spa_open(name, &spa, FTAG)) != 0) return (error); spa_vdev_state_enter(spa, SCL_ALL); error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl); (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (error); } /* * The dp_config_rwlock must not be held when calling this, because the * unmount may need to write out data. * * This function is best-effort. Callers must deal gracefully if it * remains mounted (or is remounted after this call). * * Returns 0 if the argument is not a snapshot, or it is not currently a * filesystem, or we were able to unmount it. Returns error code otherwise. */ void zfs_unmount_snap(const char *snapname) { if (strchr(snapname, '@') == NULL) return; (void) zfsctl_snapshot_unmount(snapname, MNT_FORCE); } /* ARGSUSED */ static int zfs_unmount_snap_cb(const char *snapname, void *arg) { zfs_unmount_snap(snapname); return (0); } /* * When a clone is destroyed, its origin may also need to be destroyed, * in which case it must be unmounted. This routine will do that unmount * if necessary. */ void zfs_destroy_unmount_origin(const char *fsname) { int error; objset_t *os; dsl_dataset_t *ds; error = dmu_objset_hold(fsname, FTAG, &os); if (error != 0) return; ds = dmu_objset_ds(os); if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) { char originname[ZFS_MAX_DATASET_NAME_LEN]; dsl_dataset_name(ds->ds_prev, originname); dmu_objset_rele(os, FTAG); zfs_unmount_snap(originname); } else { dmu_objset_rele(os, FTAG); } } /* * innvl: { * "snaps" -> { snapshot1, snapshot2 } * (optional boolean) "defer" * } * * outnvl: snapshot -> error code (int32) */ static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = { {"snaps", DATA_TYPE_NVLIST, 0}, {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int poollen; nvlist_t *snaps; nvpair_t *pair; boolean_t defer; spa_t *spa; snaps = fnvlist_lookup_nvlist(innvl, "snaps"); defer = nvlist_exists(innvl, "defer"); poollen = strlen(poolname); for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { const char *name = nvpair_name(pair); /* * The snap must be in the specified pool to prevent the * invalid removal of zvol minors below. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '@')) return (SET_ERROR(EXDEV)); zfs_unmount_snap(nvpair_name(pair)); if (spa_open(name, &spa, FTAG) == 0) { zvol_remove_minors(spa, name, B_TRUE); spa_close(spa, FTAG); } } return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl)); } /* * Create bookmarks. The bookmark names are of the form #. * All bookmarks and snapshots must be in the same pool. * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail. * * innvl: { * new_bookmark1 -> existing_snapshot, * new_bookmark2 -> existing_bookmark, * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_bookmark[] = { {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST}, }; /* ARGSUSED */ static int zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_bookmark_create(innvl, outnvl)); } /* * innvl: { * property 1, property 2, ... * } * * outnvl: { * bookmark name 1 -> { property 1, property 2, ... }, * bookmark name 2 -> { property 1, property 2, ... } * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL}, }; static int zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { return (dsl_get_bookmarks(fsname, innvl, outnvl)); } /* * innvl is not used. * * outnvl: { * property 1, property 2, ... * } * */ static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl, nvlist_t *outnvl) { char fsname[ZFS_MAX_DATASET_NAME_LEN]; char *bmname; bmname = strchr(bookmark, '#'); if (bmname == NULL) return (SET_ERROR(EINVAL)); bmname++; (void) strlcpy(fsname, bookmark, sizeof (fsname)); *(strchr(fsname, '#')) = '\0'; return (dsl_get_bookmark_props(fsname, bmname, outnvl)); } /* * innvl: { * bookmark name 1, bookmark name 2 * } * * outnvl: bookmark -> error code (int32) * */ static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = { {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST}, }; static int zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { int error, poollen; poollen = strlen(poolname); for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { const char *name = nvpair_name(pair); const char *cp = strchr(name, '#'); /* * The bookmark name must contain an #, and the part after it * must contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); /* * The bookmark must be in the specified pool. */ if (strncmp(name, poolname, poollen) != 0 || (name[poollen] != '/' && name[poollen] != '#')) return (SET_ERROR(EXDEV)); } error = dsl_bookmark_destroy(innvl, outnvl); return (error); } static const zfs_ioc_key_t zfs_keys_channel_program[] = { {"program", DATA_TYPE_STRING, 0}, {"arg", DATA_TYPE_ANY, 0}, {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { char *program; uint64_t instrlimit, memlimit; boolean_t sync_flag; nvpair_t *nvarg = NULL; program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { sync_flag = B_TRUE; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { instrlimit = ZCP_DEFAULT_INSTRLIMIT; } if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { memlimit = ZCP_DEFAULT_MEMLIMIT; } nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) return (SET_ERROR(EINVAL)); if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) return (SET_ERROR(EINVAL)); return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, nvarg, outnvl)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint(poolname)); } /* * innvl: unused * outnvl: empty */ static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { return (spa_checkpoint_discard(poolname)); } /* * inputs: * zc_name name of dataset to destroy * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; int err; err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); if (ost == DMU_OST_ZFS) zfs_unmount_snap(zc->zc_name); if (strchr(zc->zc_name, '@')) { err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); } else { err = dsl_destroy_head(zc->zc_name); if (err == EEXIST) { /* * It is possible that the given DS may have * hidden child (%recv) datasets - "leftovers" * resulting from the previously interrupted * 'zfs receive'. * * 6 extra bytes for /%recv */ char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6]; if (snprintf(namebuf, sizeof (namebuf), "%s/%s", zc->zc_name, recv_clone_name) >= sizeof (namebuf)) return (SET_ERROR(EINVAL)); /* * Try to remove the hidden child (%recv) and after * that try to remove the target dataset. * If the hidden child (%recv) does not exist * the original error (EEXIST) will be returned */ err = dsl_destroy_head(namebuf); if (err == 0) err = dsl_destroy_head(zc->zc_name); else if (err == ENOENT) err = SET_ERROR(EEXIST); } } return (err); } /* * innvl: { * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64) * "initialize_vdevs": { -> guids to initialize (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * } * * outnvl: { * "initialize_vdevs": { -> initialization errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. */ static const zfs_ioc_key_t zfs_keys_pool_initialize[] = { {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0} }; static int zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND, &cmd_type) != 0) { return (SET_ERROR(EINVAL)); } if (!(cmd_type == POOL_INITIALIZE_CANCEL || cmd_type == POOL_INITIALIZE_START || cmd_type == POOL_INITIALIZE_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS, &vdev_guids) != 0) { return (SET_ERROR(EINVAL)); } for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) { fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS, vdev_errlist); } fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? EINVAL : 0); } /* * innvl: { * "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64) * "trim_vdevs": { -> guids to TRIM (nvlist) * "vdev_path_1": vdev_guid_1, (uint64), * "vdev_path_2": vdev_guid_2, (uint64), * ... * }, * "trim_rate" -> Target TRIM rate in bytes/sec. * "trim_secure" -> Set to request a secure TRIM. * } * * outnvl: { * "trim_vdevs": { -> TRIM errors (nvlist) * "vdev_path_1": errno, see function body for possible errnos (uint64) * "vdev_path_2": errno, ... (uint64) * ... * } * } * * EINVAL is returned for an unknown commands or if any of the provided vdev * guids have be specified with a type other than uint64. */ static const zfs_ioc_key_t zfs_keys_pool_trim[] = { {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0}, {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0}, {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL}, {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; static int zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) { uint64_t cmd_type; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0) return (SET_ERROR(EINVAL)); if (!(cmd_type == POOL_TRIM_CANCEL || cmd_type == POOL_TRIM_START || cmd_type == POOL_TRIM_SUSPEND)) { return (SET_ERROR(EINVAL)); } nvlist_t *vdev_guids; if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0) return (SET_ERROR(EINVAL)); for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { uint64_t vdev_guid; if (nvpair_value_uint64(pair, &vdev_guid) != 0) { return (SET_ERROR(EINVAL)); } } /* Optional, defaults to maximum rate when not provided */ uint64_t rate; if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0) rate = 0; /* Optional, defaults to standard TRIM when not provided */ boolean_t secure; if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE, &secure) != 0) { secure = B_FALSE; } spa_t *spa; int error = spa_open(poolname, &spa, FTAG); if (error != 0) return (error); nvlist_t *vdev_errlist = fnvlist_alloc(); int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type, rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist); if (fnvlist_size(vdev_errlist) > 0) fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist); fnvlist_free(vdev_errlist); spa_close(spa, FTAG); return (total_errors > 0 ? EINVAL : 0); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a tag is provided, it identifies a particular instance of an activity to * wait for. Currently, this is only valid for use with 'initialize', because * that is the only activity for which there can be multiple instances running * concurrently. In the case of 'initialize', the tag corresponds to the guid of * the vdev on which to wait. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * (optional) "wait_tag" -> uint64_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_pool_wait[] = { {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; uint64_t tag; boolean_t waited; int error; if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) return (EINVAL); if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) error = spa_wait_tag(name, activity, tag, &waited); else error = spa_wait(name, activity, &waited); if (error == 0) fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); return (error); } /* * This ioctl waits for activity of a particular type to complete. If there is * no activity of that type in progress, it returns immediately, and the * returned value "waited" is false. If there is activity in progress, and no * tag is passed in, the ioctl blocks until all activity of that type is * complete, and then returns with "waited" set to true. * * If a thread waiting in the ioctl receives a signal, the call will return * immediately, and the return value will be EINTR. * * innvl: { * "wait_activity" -> int32_t * } * * outnvl: "waited" -> boolean_t */ static const zfs_ioc_key_t zfs_keys_fs_wait[] = { {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, }; static int zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) { int32_t activity; boolean_t waited = B_FALSE; int error; dsl_pool_t *dp; dsl_dir_t *dd; dsl_dataset_t *ds; if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) return (SET_ERROR(EINVAL)); if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) return (SET_ERROR(EINVAL)); if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) return (error); if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { dsl_pool_rele(dp, FTAG); return (error); } dd = ds->ds_dir; mutex_enter(&dd->dd_activity_lock); dd->dd_activity_waiters++; /* * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t * aren't evicted while we're waiting. Normally this is prevented by * holding the pool, but we can't do that while we're waiting since * that would prevent TXGs from syncing out. Some of the functionality * of long-holds (e.g. preventing deletion) is unnecessary for this * case, since we would cancel the waiters before proceeding with a * deletion. An alternative mechanism for keeping the dataset around * could be developed but this is simpler. */ dsl_dataset_long_hold(ds, FTAG); dsl_pool_rele(dp, FTAG); error = dsl_dir_wait(dd, ds, activity, &waited); dsl_dataset_long_rele(ds, FTAG); dd->dd_activity_waiters--; if (dd->dd_activity_waiters == 0) cv_signal(&dd->dd_activity_cv); mutex_exit(&dd->dd_activity_lock); dsl_dataset_rele(ds, FTAG); if (error == 0) fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); return (error); } /* * fsname is name of dataset to rollback (to most recent snapshot) * * innvl may contain name of expected target snapshot * * outnvl: "target" -> name of most recent snapshot * } */ static const zfs_ioc_key_t zfs_keys_rollback[] = { {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { zfsvfs_t *zfsvfs; zvol_state_handle_t *zv; char *target = NULL; int error; (void) nvlist_lookup_string(innvl, "target", &target); if (target != NULL) { const char *cp = strchr(target, '@'); /* * The snap name must contain an @, and the part after it must * contain only valid characters. */ if (cp == NULL || zfs_component_namecheck(cp + 1, NULL, NULL) != 0) return (SET_ERROR(EINVAL)); } if (getzfsvfs(fsname, &zfsvfs) == 0) { dsl_dataset_t *ds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; error = dsl_dataset_rollback(fsname, target, zfsvfs, outnvl); resume_err = zfs_resume_fs(zfsvfs, ds); error = error ? error : resume_err; } zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(fsname)) != NULL) { error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), outnvl); zvol_resume(zv); } else { error = dsl_dataset_rollback(fsname, target, NULL, outnvl); } return (error); } static int recursive_unmount(const char *fsname, void *arg) { const char *snapname = arg; char *fullname; fullname = kmem_asprintf("%s@%s", fsname, snapname); zfs_unmount_snap(fullname); kmem_strfree(fullname); return (0); } /* * * snapname is the snapshot to redact. * innvl: { * "bookname" -> (string) * shortname of the redaction bookmark to generate * "snapnv" -> (nvlist, values ignored) * snapshots to redact snapname with respect to * } * * outnvl is unused */ /* ARGSUSED */ static const zfs_ioc_key_t zfs_keys_redact[] = { {"bookname", DATA_TYPE_STRING, 0}, {"snapnv", DATA_TYPE_NVLIST, 0}, }; static int zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { nvlist_t *redactnvl = NULL; char *redactbook = NULL; if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) return (SET_ERROR(EINVAL)); if (fnvlist_num_pairs(redactnvl) == 0) return (SET_ERROR(ENXIO)); if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) return (SET_ERROR(EINVAL)); return (dmu_redact_snap(snapname, redactnvl, redactbook)); } /* * inputs: * zc_name old name of dataset * zc_value new name of dataset * zc_cookie recursive flag (only valid for snapshots) * * outputs: none */ static int zfs_ioc_rename(zfs_cmd_t *zc) { objset_t *os; dmu_objset_type_t ost; boolean_t recursive = zc->zc_cookie & 1; boolean_t nounmount = !!(zc->zc_cookie & 2); char *at; int err; /* "zfs rename" from and to ...%recv datasets should both fail */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); err = dmu_objset_hold(zc->zc_name, FTAG, &os); if (err != 0) return (err); ost = dmu_objset_type(os); dmu_objset_rele(os, FTAG); at = strchr(zc->zc_name, '@'); if (at != NULL) { /* snaps must be in same fs */ int error; if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) return (SET_ERROR(EXDEV)); *at = '\0'; if (ost == DMU_OST_ZFS && !nounmount) { error = dmu_objset_find(zc->zc_name, recursive_unmount, at + 1, recursive ? DS_FIND_CHILDREN : 0); if (error != 0) { *at = '@'; return (error); } } error = dsl_dataset_rename_snapshot(zc->zc_name, at + 1, strchr(zc->zc_value, '@') + 1, recursive); *at = '@'; return (error); } else { return (dsl_dir_rename(zc->zc_name, zc->zc_value)); } } static int zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) { const char *propname = nvpair_name(pair); boolean_t issnap = (strchr(dsname, '@') != NULL); zfs_prop_t prop = zfs_name_to_prop(propname); uint64_t intval, compval; int err; if (prop == ZPROP_INVAL) { if (zfs_prop_user(propname)) { if ((err = zfs_secpolicy_write_perms(dsname, ZFS_DELEG_PERM_USERPROP, cr))) return (err); return (0); } if (!issnap && zfs_prop_userquota(propname)) { const char *perm = NULL; const char *uq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; const char *gq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; const char *uiq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA]; const char *giq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA]; const char *pq_prefix = zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA]; const char *piq_prefix = zfs_userquota_prop_prefixes[\ ZFS_PROP_PROJECTOBJQUOTA]; if (strncmp(propname, uq_prefix, strlen(uq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USERQUOTA; } else if (strncmp(propname, uiq_prefix, strlen(uiq_prefix)) == 0) { perm = ZFS_DELEG_PERM_USEROBJQUOTA; } else if (strncmp(propname, gq_prefix, strlen(gq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPQUOTA; } else if (strncmp(propname, giq_prefix, strlen(giq_prefix)) == 0) { perm = ZFS_DELEG_PERM_GROUPOBJQUOTA; } else if (strncmp(propname, pq_prefix, strlen(pq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTQUOTA; } else if (strncmp(propname, piq_prefix, strlen(piq_prefix)) == 0) { perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA; } else { /* {USER|GROUP|PROJECT}USED are read-only */ return (SET_ERROR(EINVAL)); } if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) return (err); return (0); } return (SET_ERROR(EINVAL)); } if (issnap) return (SET_ERROR(EINVAL)); if (nvpair_type(pair) == DATA_TYPE_NVLIST) { /* * dsl_prop_get_all_impl() returns properties in this * format. */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &pair) == 0); } /* * Check that this value is valid for this pool version */ switch (prop) { case ZFS_PROP_COMPRESSION: /* * If the user specified gzip compression, make sure * the SPA supports it. We ignore any errors here since * we'll catch them later. */ if (nvpair_value_uint64(pair, &intval) == 0) { compval = ZIO_COMPRESS_ALGO(intval); if (compval >= ZIO_COMPRESS_GZIP_1 && compval <= ZIO_COMPRESS_GZIP_9 && zfs_earlier_version(dsname, SPA_VERSION_GZIP_COMPRESSION)) { return (SET_ERROR(ENOTSUP)); } if (compval == ZIO_COMPRESS_ZLE && zfs_earlier_version(dsname, SPA_VERSION_ZLE_COMPRESSION)) return (SET_ERROR(ENOTSUP)); if (compval == ZIO_COMPRESS_LZ4) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } if (compval == ZIO_COMPRESS_ZSTD) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } } break; case ZFS_PROP_COPIES: if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_VOLBLOCKSIZE: case ZFS_PROP_RECORDSIZE: /* Record sizes above 128k need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval > SPA_OLD_MAXBLOCKSIZE) { spa_t *spa; /* * We don't allow setting the property above 1MB, * unless the tunable has been changed. */ if (intval > zfs_max_recordsize || intval > SPA_MAXBLOCKSIZE) return (SET_ERROR(ERANGE)); if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_DNODESIZE: /* Dnode sizes above 512 need the feature to be enabled */ if (nvpair_value_uint64(pair, &intval) == 0 && intval != ZFS_DNSIZE_LEGACY) { spa_t *spa; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; case ZFS_PROP_SPECIAL_SMALL_BLOCKS: /* * This property could require the allocation classes * feature to be active for setting, however we allow * it so that tests of settable properties succeed. * The CLI will issue a warning in this case. */ break; case ZFS_PROP_SHARESMB: if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) return (SET_ERROR(ENOTSUP)); break; case ZFS_PROP_ACLINHERIT: if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { if (intval == ZFS_ACL_PASSTHROUGH_X && zfs_earlier_version(dsname, SPA_VERSION_PASSTHROUGH_X)) return (SET_ERROR(ENOTSUP)); } break; case ZFS_PROP_CHECKSUM: case ZFS_PROP_DEDUP: { spa_feature_t feature; spa_t *spa; int err; /* dedup feature version checks */ if (prop == ZFS_PROP_DEDUP && zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) return (SET_ERROR(ENOTSUP)); if (nvpair_type(pair) == DATA_TYPE_UINT64 && nvpair_value_uint64(pair, &intval) == 0) { /* check prop value is enabled in features */ feature = zio_checksum_to_feature( intval & ZIO_CHECKSUM_MASK); if (feature == SPA_FEATURE_NONE) break; if ((err = spa_open(dsname, &spa, FTAG)) != 0) return (err); if (!spa_feature_is_enabled(spa, feature)) { spa_close(spa, FTAG); return (SET_ERROR(ENOTSUP)); } spa_close(spa, FTAG); } break; } default: break; } return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); } /* * Removes properties from the given props list that fail permission checks * needed to clear them and to restore them in case of a receive error. For each * property, make sure we have both set and inherit permissions. * * Returns the first error encountered if any permission checks fail. If the * caller provides a non-NULL errlist, it also gives the complete list of names * of all the properties that failed a permission check along with the * corresponding error numbers. The caller is responsible for freeing the * returned errlist. * * If every property checks out successfully, zero is returned and the list * pointed at by errlist is NULL. */ static int zfs_check_clearable(const char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; nvpair_t *pair, *next_pair; nvlist_t *errors; int err, rv = 0; if (props == NULL) return (0); VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { next_pair = nvlist_next_nvpair(props, pair); (void) strlcpy(zc->zc_value, nvpair_name(pair), sizeof (zc->zc_value)); if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { VERIFY(nvlist_remove_nvpair(props, pair) == 0); VERIFY(nvlist_add_int32(errors, zc->zc_value, err) == 0); } pair = next_pair; } kmem_free(zc, sizeof (zfs_cmd_t)); if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { nvlist_free(errors); errors = NULL; } else { VERIFY(nvpair_value_int32(pair, &rv) == 0); } if (errlist == NULL) nvlist_free(errors); else *errlist = errors; return (rv); } static boolean_t propval_equals(nvpair_t *p1, nvpair_t *p2) { if (nvpair_type(p1) == DATA_TYPE_NVLIST) { /* dsl_prop_get_all_impl() format */ nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p1) == 0); } if (nvpair_type(p2) == DATA_TYPE_NVLIST) { nvlist_t *attrs; VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, &p2) == 0); } if (nvpair_type(p1) != nvpair_type(p2)) return (B_FALSE); if (nvpair_type(p1) == DATA_TYPE_STRING) { char *valstr1, *valstr2; VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); return (strcmp(valstr1, valstr2) == 0); } else { uint64_t intval1, intval2; VERIFY(nvpair_value_uint64(p1, &intval1) == 0); VERIFY(nvpair_value_uint64(p2, &intval2) == 0); return (intval1 == intval2); } } /* * Remove properties from props if they are not going to change (as determined * by comparison with origprops). Remove them from origprops as well, since we * do not need to clear or restore properties that won't change. */ static void props_reduce(nvlist_t *props, nvlist_t *origprops) { nvpair_t *pair, *next_pair; if (origprops == NULL) return; /* all props need to be received */ pair = nvlist_next_nvpair(props, NULL); while (pair != NULL) { const char *propname = nvpair_name(pair); nvpair_t *match; next_pair = nvlist_next_nvpair(props, pair); if ((nvlist_lookup_nvpair(origprops, propname, &match) != 0) || !propval_equals(pair, match)) goto next; /* need to set received value */ /* don't clear the existing received value */ (void) nvlist_remove_nvpair(origprops, match); /* don't bother receiving the property */ (void) nvlist_remove_nvpair(props, pair); next: pair = next_pair; } } /* * Extract properties that cannot be set PRIOR to the receipt of a dataset. * For example, refquota cannot be set until after the receipt of a dataset, * because in replication streams, an older/earlier snapshot may exceed the * refquota. We want to receive the older/earlier snapshot, but setting * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent * the older/earlier snapshot from being received (with EDQUOT). * * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario. * * libzfs will need to be judicious handling errors encountered by props * extracted by this function. */ static nvlist_t * extract_delay_props(nvlist_t *props) { nvlist_t *delayprops; nvpair_t *nvp, *tmp; static const zfs_prop_t delayable[] = { ZFS_PROP_REFQUOTA, ZFS_PROP_KEYLOCATION, 0 }; int i; VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL; nvp = nvlist_next_nvpair(props, nvp)) { /* * strcmp() is safe because zfs_prop_to_name() always returns * a bounded string. */ for (i = 0; delayable[i] != 0; i++) { if (strcmp(zfs_prop_to_name(delayable[i]), nvpair_name(nvp)) == 0) { break; } } if (delayable[i] != 0) { tmp = nvlist_prev_nvpair(props, nvp); VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0); VERIFY(nvlist_remove_nvpair(props, nvp) == 0); nvp = tmp; } } if (nvlist_empty(delayprops)) { nvlist_free(delayprops); delayprops = NULL; } return (delayprops); } static void zfs_allow_log_destroy(void *arg) { char *poolname = arg; if (poolname != NULL) kmem_strfree(poolname); } #ifdef ZFS_DEBUG static boolean_t zfs_ioc_recv_inject_err; #endif /* * nvlist 'errors' is always allocated. It will contain descriptions of * encountered errors, if any. It's the callers responsibility to free. */ static int zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops, nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force, boolean_t resumable, int input_fd, dmu_replay_record_t *begin_record, uint64_t *read_bytes, uint64_t *errflags, nvlist_t **errors) { dmu_recv_cookie_t drc; int error = 0; int props_error = 0; offset_t off, noff; nvlist_t *local_delayprops = NULL; nvlist_t *recv_delayprops = NULL; nvlist_t *origprops = NULL; /* existing properties */ nvlist_t *origrecvd = NULL; /* existing received properties */ boolean_t first_recvd_props = B_FALSE; boolean_t tofs_was_redacted; zfs_file_t *input_fp; *read_bytes = 0; *errflags = 0; *errors = fnvlist_alloc(); off = 0; if ((error = zfs_file_get(input_fd, &input_fp))) return (error); noff = off = zfs_file_off(input_fp); error = dmu_recv_begin(tofs, tosnap, begin_record, force, resumable, localprops, hidden_args, origin, &drc, input_fp, &off); if (error != 0) goto out; tofs_was_redacted = dsl_get_redacted(drc.drc_ds); /* * Set properties before we receive the stream so that they are applied * to the new data. Note that we must call dmu_recv_stream() if * dmu_recv_begin() succeeds. */ if (recvprops != NULL && !drc.drc_newfs) { if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >= SPA_VERSION_RECVD_PROPS && !dsl_prop_get_hasrecvd(tofs)) first_recvd_props = B_TRUE; /* * If new received properties are supplied, they are to * completely replace the existing received properties, * so stash away the existing ones. */ if (dsl_prop_get_received(tofs, &origrecvd) == 0) { nvlist_t *errlist = NULL; /* * Don't bother writing a property if its value won't * change (and avoid the unnecessary security checks). * * The first receive after SPA_VERSION_RECVD_PROPS is a * special case where we blow away all local properties * regardless. */ if (!first_recvd_props) props_reduce(recvprops, origrecvd); if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0) (void) nvlist_merge(*errors, errlist, 0); nvlist_free(errlist); if (clear_received_props(tofs, origrecvd, first_recvd_props ? NULL : recvprops) != 0) *errflags |= ZPROP_ERR_NOCLEAR; } else { *errflags |= ZPROP_ERR_NOCLEAR; } } /* * Stash away existing properties so we can restore them on error unless * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which * case "origrecvd" will take care of that. */ if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) { objset_t *os; if (dmu_objset_hold(tofs, FTAG, &os) == 0) { if (dsl_prop_get_all(os, &origprops) != 0) { *errflags |= ZPROP_ERR_NOCLEAR; } dmu_objset_rele(os, FTAG); } else { *errflags |= ZPROP_ERR_NOCLEAR; } } if (recvprops != NULL) { props_error = dsl_prop_set_hasrecvd(tofs); if (props_error == 0) { recv_delayprops = extract_delay_props(recvprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recvprops, *errors); } } if (localprops != NULL) { nvlist_t *oprops = fnvlist_alloc(); nvlist_t *xprops = fnvlist_alloc(); nvpair_t *nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) { /* -x property */ const char *name = nvpair_name(nvp); zfs_prop_t prop = zfs_name_to_prop(name); if (prop != ZPROP_INVAL) { if (!zfs_prop_inheritable(prop)) continue; } else if (!zfs_prop_user(name)) continue; fnvlist_add_boolean(xprops, name); } else { /* -o property=value */ fnvlist_add_nvpair(oprops, nvp); } } local_delayprops = extract_delay_props(oprops); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, oprops, *errors); (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, xprops, *errors); nvlist_free(oprops); nvlist_free(xprops); } error = dmu_recv_stream(&drc, &off); if (error == 0) { zfsvfs_t *zfsvfs = NULL; zvol_state_handle_t *zv = NULL; if (getzfsvfs(tofs, &zfsvfs) == 0) { /* online recv */ dsl_dataset_t *ds; int end_err; boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( begin_record->drr_u.drr_begin. drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); /* * If the suspend fails, then the recv_end will * likely also fail, and clean up after itself. */ end_err = dmu_recv_end(&drc, zfsvfs); /* * If the dataset was not redacted, but we received a * redacted stream onto it, we need to unmount the * dataset. Otherwise, resume the filesystem. */ if (error == 0 && !drc.drc_newfs && stream_is_redacted && !tofs_was_redacted) { error = zfs_end_fs(zfsvfs, ds); } else if (error == 0) { error = zfs_resume_fs(zfsvfs, ds); } error = error ? error : end_err; zfs_vfs_rele(zfsvfs); } else if ((zv = zvol_suspend(tofs)) != NULL) { error = dmu_recv_end(&drc, zvol_tag(zv)); zvol_resume(zv); } else { error = dmu_recv_end(&drc, NULL); } /* Set delayed properties now, after we're done receiving. */ if (recv_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, recv_delayprops, *errors); } if (local_delayprops != NULL && error == 0) { (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, local_delayprops, *errors); } } /* * Merge delayed props back in with initial props, in case * we're DEBUG and zfs_ioc_recv_inject_err is set (which means * we have to make sure clear_received_props() includes * the delayed properties). * * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, * using ASSERT() will be just like a VERIFY. */ if (recv_delayprops != NULL) { ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0); nvlist_free(recv_delayprops); } if (local_delayprops != NULL) { ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); nvlist_free(local_delayprops); } *read_bytes = off - noff; #ifdef ZFS_DEBUG if (zfs_ioc_recv_inject_err) { zfs_ioc_recv_inject_err = B_FALSE; error = 1; } #endif /* * On error, restore the original props. */ if (error != 0 && recvprops != NULL && !drc.drc_newfs) { if (clear_received_props(tofs, recvprops, NULL) != 0) { /* * We failed to clear the received properties. * Since we may have left a $recvd value on the * system, we can't clear the $hasrecvd flag. */ *errflags |= ZPROP_ERR_NORESTORE; } else if (first_recvd_props) { dsl_prop_unset_hasrecvd(tofs); } if (origrecvd == NULL && !drc.drc_newfs) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; } /* * dsl_props_set() will not convert RECEIVED to LOCAL on or * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL * explicitly if we're restoring local properties cleared in the * first new-style receive. */ if (origrecvd != NULL && zfs_set_prop_nvlist(tofs, (first_recvd_props ? ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), origrecvd, NULL) != 0) { /* * We stashed the original properties but failed to * restore them. */ *errflags |= ZPROP_ERR_NORESTORE; } } if (error != 0 && localprops != NULL && !drc.drc_newfs && !first_recvd_props) { nvlist_t *setprops; nvlist_t *inheritprops; nvpair_t *nvp; if (origprops == NULL) { /* We failed to stash the original properties. */ *errflags |= ZPROP_ERR_NORESTORE; goto out; } /* Restore original props */ setprops = fnvlist_alloc(); inheritprops = fnvlist_alloc(); nvp = NULL; while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { const char *name = nvpair_name(nvp); const char *source; nvlist_t *attrs; if (!nvlist_exists(origprops, name)) { /* * Property was not present or was explicitly * inherited before the receive, restore this. */ fnvlist_add_boolean(inheritprops, name); continue; } attrs = fnvlist_lookup_nvlist(origprops, name); source = fnvlist_lookup_string(attrs, ZPROP_SOURCE); /* Skip received properties */ if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) continue; if (strcmp(source, tofs) == 0) { /* Property was locally set */ fnvlist_add_nvlist(setprops, name, attrs); } else { /* Property was implicitly inherited */ fnvlist_add_boolean(inheritprops, name); } } if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops, NULL) != 0) *errflags |= ZPROP_ERR_NORESTORE; nvlist_free(setprops); nvlist_free(inheritprops); } out: zfs_file_put(input_fd); nvlist_free(origrecvd); nvlist_free(origprops); if (error == 0) error = props_error; return (error); } /* * inputs: * zc_name name of containing filesystem (unused) * zc_nvlist_src{_size} nvlist of properties to apply * zc_nvlist_conf{_size} nvlist of properties to exclude * (DATA_TYPE_BOOLEAN) and override (everything else) * zc_value name of snapshot to create * zc_string name of clone origin (if DRR_FLAG_CLONE) * zc_cookie file descriptor to recv from * zc_begin_record the BEGIN record of the stream (not byteswapped) * zc_guid force flag * * outputs: * zc_cookie number of bytes read * zc_obj zprop_errflags_t * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_recv(zfs_cmd_t *zc) { dmu_replay_record_t begin_record; nvlist_t *errors = NULL; nvlist_t *recvdprops = NULL; nvlist_t *localprops = NULL; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; int error = 0; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || strchr(zc->zc_value, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; if (zc->zc_nvlist_src != 0 && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &recvdprops)) != 0) return (error); if (zc->zc_nvlist_conf != 0 && (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, zc->zc_iflags, &localprops)) != 0) return (error); if (zc->zc_string[0]) origin = zc->zc_string; begin_record.drr_type = DRR_BEGIN; begin_record.drr_payloadlen = 0; begin_record.drr_u.drr_begin = zc->zc_begin_record; error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record, &zc->zc_cookie, &zc->zc_obj, &errors); nvlist_free(recvdprops); nvlist_free(localprops); /* * Now that all props, initial and delayed, are set, report the prop * errors to the caller. */ if (zc->zc_nvlist_dst_size != 0 && errors != NULL && (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || put_nvlist(zc, errors) != 0)) { /* * Caller made zc->zc_nvlist_dst less than the minimum expected * size or supplied an invalid address. */ error = SET_ERROR(EINVAL); } nvlist_free(errors); return (error); } /* * innvl: { * "snapname" -> full name of the snapshot to create * (optional) "props" -> received properties to set (nvlist) * (optional) "localprops" -> override and exclude properties (nvlist) * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) * "begin_record" -> non-byteswapped dmu_replay_record_t * "input_fd" -> file descriptor to read stream from (int32) * (optional) "force" -> force flag (value ignored) * (optional) "resumable" -> resumable flag (value ignored) * (optional) "cleanup_fd" -> unused * (optional) "action_handle" -> unused * (optional) "hidden_args" -> { "wkeydata" -> value } * } * * outnvl: { * "read_bytes" -> number of bytes read * "error_flags" -> zprop_errflags_t * "errors" -> error for each unapplied received property (nvlist) * } */ static const zfs_ioc_key_t zfs_keys_recv_new[] = { {"snapname", DATA_TYPE_STRING, 0}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"origin", DATA_TYPE_STRING, ZK_OPTIONAL}, {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, {"input_fd", DATA_TYPE_INT32, 0}, {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; static int zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) { dmu_replay_record_t *begin_record; uint_t begin_record_size; nvlist_t *errors = NULL; nvlist_t *recvprops = NULL; nvlist_t *localprops = NULL; nvlist_t *hidden_args = NULL; char *snapname; char *origin = NULL; char *tosnap; char tofs[ZFS_MAX_DATASET_NAME_LEN]; boolean_t force; boolean_t resumable; uint64_t read_bytes = 0; uint64_t errflags = 0; int input_fd = -1; int error; snapname = fnvlist_lookup_string(innvl, "snapname"); if (dataset_namecheck(snapname, NULL, NULL) != 0 || strchr(snapname, '@') == NULL || strchr(snapname, '%')) return (SET_ERROR(EINVAL)); (void) strlcpy(tofs, snapname, sizeof (tofs)); tosnap = strchr(tofs, '@'); *tosnap++ = '\0'; error = nvlist_lookup_string(innvl, "origin", &origin); if (error && error != ENOENT) return (error); error = nvlist_lookup_byte_array(innvl, "begin_record", (uchar_t **)&begin_record, &begin_record_size); if (error != 0 || begin_record_size != sizeof (*begin_record)) return (SET_ERROR(EINVAL)); input_fd = fnvlist_lookup_int32(innvl, "input_fd"); force = nvlist_exists(innvl, "force"); resumable = nvlist_exists(innvl, "resumable"); /* we still use "props" here for backwards compatibility */ error = nvlist_lookup_nvlist(innvl, "props", &recvprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); if (error && error != ENOENT) return (error); error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); if (error && error != ENOENT) return (error); error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, hidden_args, force, resumable, input_fd, begin_record, &read_bytes, &errflags, &errors); fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); fnvlist_add_uint64(outnvl, "error_flags", errflags); fnvlist_add_nvlist(outnvl, "errors", errors); nvlist_free(errors); nvlist_free(recvprops); nvlist_free(localprops); return (error); } typedef struct dump_bytes_io { zfs_file_t *dbi_fp; caddr_t dbi_buf; int dbi_len; int dbi_err; } dump_bytes_io_t; static void dump_bytes_cb(void *arg) { dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg; zfs_file_t *fp; caddr_t buf; fp = dbi->dbi_fp; buf = dbi->dbi_buf; dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL); } static int dump_bytes(objset_t *os, void *buf, int len, void *arg) { dump_bytes_io_t dbi; dbi.dbi_fp = arg; dbi.dbi_buf = buf; dbi.dbi_len = len; #if defined(HAVE_LARGE_STACKS) dump_bytes_cb(&dbi); #else /* * The vn_rdwr() call is performed in a taskq to ensure that there is * always enough stack space to write safely to the target filesystem. * The ZIO_TYPE_FREE threads are used because there can be a lot of * them and they are used in vdev_file.c for a similar purpose. */ spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE, ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP); #endif /* HAVE_LARGE_STACKS */ return (dbi.dbi_err); } /* * inputs: * zc_name name of snapshot to send * zc_cookie file descriptor to send stream to * zc_obj fromorigin flag (mutually exclusive with zc_fromobj) * zc_sendobj objsetid of snapshot to send * zc_fromobj objsetid of incremental fromsnap (may be zero) * zc_guid if set, estimate size of stream only. zc_cookie is ignored. * output size in zc_objset_type. * zc_flags lzc_send_flags * * outputs: * zc_objset_type estimated size, if zc_guid is set * * NOTE: This is no longer the preferred interface, any new functionality * should be added to zfs_ioc_send_new() instead. */ static int zfs_ioc_send(zfs_cmd_t *zc) { int error; offset_t off; boolean_t estimate = (zc->zc_guid != 0); boolean_t embedok = (zc->zc_flags & 0x1); boolean_t large_block_ok = (zc->zc_flags & 0x2); boolean_t compressok = (zc->zc_flags & 0x4); boolean_t rawok = (zc->zc_flags & 0x8); boolean_t savedok = (zc->zc_flags & 0x10); if (zc->zc_obj != 0) { dsl_pool_t *dp; dsl_dataset_t *tosnap; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (dsl_dir_is_clone(tosnap->ds_dir)) zc->zc_fromobj = dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj; dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } if (estimate) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (zc->zc_fromobj != 0) { error = dsl_dataset_hold_obj(dp, zc->zc_fromobj, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } } error = dmu_send_estimate_fast(tosnap, fromsnap, NULL, compressok || rawok, savedok, &zc->zc_objset_type); if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } else { zfs_file_t *fp; dmu_send_outparams_t out = {0}; if ((error = zfs_file_get(zc->zc_cookie, &fp))) return (error); off = zfs_file_off(fp); out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send_obj(zc->zc_name, zc->zc_sendobj, zc->zc_fromobj, embedok, large_block_ok, compressok, rawok, savedok, zc->zc_cookie, &off, &out); zfs_file_put(zc->zc_cookie); } return (error); } /* * inputs: * zc_name name of snapshot on which to report progress * zc_cookie file descriptor of send stream * * outputs: * zc_cookie number of bytes written in send stream thus far * zc_objset_type logical size of data traversed by send thus far */ static int zfs_ioc_send_progress(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds; dmu_sendstatus_t *dsp = NULL; int error; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } mutex_enter(&ds->ds_sendstream_lock); /* * Iterate over all the send streams currently active on this dataset. * If there's one which matches the specified file descriptor _and_ the * stream was started by the current process, return the progress of * that stream. */ for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; dsp = list_next(&ds->ds_sendstreams, dsp)) { if (dsp->dss_outfd == zc->zc_cookie && zfs_proc_is_caller(dsp->dss_proc)) break; } if (dsp != NULL) { zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, 0, 0); /* This is the closest thing we have to atomic_read_64. */ zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); } else { error = SET_ERROR(ENOENT); } mutex_exit(&ds->ds_sendstream_lock); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } static int zfs_ioc_inject_fault(zfs_cmd_t *zc) { int id, error; error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, &zc->zc_inject_record); if (error == 0) zc->zc_guid = (uint64_t)id; return (error); } static int zfs_ioc_clear_fault(zfs_cmd_t *zc) { return (zio_clear_fault((int)zc->zc_guid)); } static int zfs_ioc_inject_list_next(zfs_cmd_t *zc) { int id = (int)zc->zc_guid; int error; error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), &zc->zc_inject_record); zc->zc_guid = id; return (error); } static int zfs_ioc_error_log(zfs_cmd_t *zc) { spa_t *spa; int error; size_t count = (size_t)zc->zc_nvlist_dst_size; if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, &count); if (error == 0) zc->zc_nvlist_dst_size = count; else zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); spa_close(spa, FTAG); return (error); } static int zfs_ioc_clear(zfs_cmd_t *zc) { spa_t *spa; vdev_t *vd; int error; /* * On zpool clear we also fix up missing slogs */ mutex_enter(&spa_namespace_lock); spa = spa_lookup(zc->zc_name); if (spa == NULL) { mutex_exit(&spa_namespace_lock); return (SET_ERROR(EIO)); } if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ spa_set_log_state(spa, SPA_LOG_CLEAR); } spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); if (zc->zc_cookie & ZPOOL_NO_REWIND) { error = spa_open(zc->zc_name, &spa, FTAG); } else { nvlist_t *policy; nvlist_t *config = NULL; if (zc->zc_nvlist_src == 0) return (SET_ERROR(EINVAL)); if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { error = spa_open_rewind(zc->zc_name, &spa, FTAG, policy, &config); if (config != NULL) { int err; if ((err = put_nvlist(zc, config)) != 0) error = err; nvlist_free(config); } nvlist_free(policy); } } if (error != 0) return (error); /* * If multihost is enabled, resuming I/O is unsafe as another * host may have imported the pool. */ if (spa_multihost(spa) && spa_suspended(spa)) return (SET_ERROR(EINVAL)); spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; } else { vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE); if (vd == NULL) { error = SET_ERROR(ENODEV); (void) spa_vdev_state_exit(spa, NULL, error); spa_close(spa, FTAG); return (error); } } vdev_clear(spa, vd); (void) spa_vdev_state_exit(spa, spa_suspended(spa) ? NULL : spa->spa_root_vdev, 0); /* * Resume any suspended I/Os. */ if (zio_resume(spa) != 0) error = SET_ERROR(EIO); spa_close(spa, FTAG); return (error); } /* * Reopen all the vdevs associated with the pool. * * innvl: { * "scrub_restart" -> when true and scrub is running, allow to restart * scrub as the side effect of the reopen (boolean). * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_reopen[] = { {"scrub_restart", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl) { spa_t *spa; int error; boolean_t rc, scrub_restart = B_TRUE; if (innvl) { error = nvlist_lookup_boolean_value(innvl, "scrub_restart", &rc); if (error == 0) scrub_restart = rc; } error = spa_open(pool, &spa, FTAG); if (error != 0) return (error); spa_vdev_state_enter(spa, SCL_NONE); /* * If the scrub_restart flag is B_FALSE and a scrub is already * in progress then set spa_scrub_reopen flag to B_TRUE so that * we don't restart the scrub as a side effect of the reopen. * Otherwise, let vdev_open() decided if a resilver is required. */ spa->spa_scrub_reopen = (!scrub_restart && dsl_scan_scrubbing(spa->spa_dsl_pool)); vdev_reopen(spa->spa_root_vdev); spa->spa_scrub_reopen = B_FALSE; (void) spa_vdev_state_exit(spa, NULL, 0); spa_close(spa, FTAG); return (0); } /* * inputs: * zc_name name of filesystem * * outputs: * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) { dsl_pool_t *dp; dsl_dataset_t *ds, *ods; char origin[ZFS_MAX_DATASET_NAME_LEN]; char *cp; int error; zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || strchr(zc->zc_name, '%')) return (SET_ERROR(EINVAL)); error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dir_is_clone(ds->ds_dir)) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } error = dsl_dataset_hold_obj(dp, dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods); if (error != 0) { dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); return (error); } dsl_dataset_name(ods, origin); dsl_dataset_rele(ods, FTAG); dsl_dataset_rele(ds, FTAG); dsl_pool_rele(dp, FTAG); /* * We don't need to unmount *all* the origin fs's snapshots, but * it's easier. */ cp = strchr(origin, '@'); if (cp) *cp = '\0'; (void) dmu_objset_find(origin, zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); } /* * Retrieve a single {user|group|project}{used|quota}@... property. * * inputs: * zc_name name of filesystem * zc_objset_type zfs_userquota_prop_t * zc_value domain name (eg. "S-1-234-567-89") * zc_guid RID/UID/GID * * outputs: * zc_cookie property value */ static int zfs_ioc_userspace_one(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int error; if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) return (SET_ERROR(EINVAL)); error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); error = zfs_userspace_one(zfsvfs, zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_cookie zap cursor * zc_objset_type zfs_userquota_prop_t * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) * * outputs: * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) * zc_cookie zap cursor */ static int zfs_ioc_userspace_many(zfs_cmd_t *zc) { zfsvfs_t *zfsvfs; int bufsize = zc->zc_nvlist_dst_size; if (bufsize <= 0) return (SET_ERROR(ENOMEM)); int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); if (error != 0) return (error); void *buf = vmem_alloc(bufsize, KM_SLEEP); error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, buf, &zc->zc_nvlist_dst_size); if (error == 0) { error = xcopyout(buf, (void *)(uintptr_t)zc->zc_nvlist_dst, zc->zc_nvlist_dst_size); } vmem_free(buf, bufsize); zfsvfs_rele(zfsvfs, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) { objset_t *os; int error = 0; zfsvfs_t *zfsvfs; if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { /* * If userused is not enabled, it may be because the * objset needs to be closed & reopened (to grow the * objset_phys_t). Suspend/resume the fs will do that. */ dsl_dataset_t *ds, *newds; ds = dmu_objset_ds(zfsvfs->z_os); error = zfs_suspend_fs(zfsvfs); if (error == 0) { dmu_objset_refresh_ownership(ds, &newds, B_TRUE, zfsvfs); error = zfs_resume_fs(zfsvfs, newds); } } if (error == 0) error = dmu_objset_userspace_upgrade(zfsvfs->z_os); zfs_vfs_rele(zfsvfs); } else { /* XXX kind of reading contents without owning */ error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); error = dmu_objset_userspace_upgrade(os); dmu_objset_rele_flags(os, B_TRUE, FTAG); } return (error); } /* * inputs: * zc_name name of filesystem * * outputs: * none */ static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) { objset_t *os; int error; error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); if (error != 0) return (error); if (dmu_objset_userobjspace_upgradable(os) || dmu_objset_projectquota_upgradable(os)) { mutex_enter(&os->os_upgrade_lock); if (os->os_upgrade_id == 0) { /* clear potential error code and retry */ os->os_upgrade_status = 0; mutex_exit(&os->os_upgrade_lock); dmu_objset_id_quota_upgrade(os); } else { mutex_exit(&os->os_upgrade_lock); } dsl_pool_rele(dmu_objset_pool(os), FTAG); taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); error = os->os_upgrade_status; } else { dsl_pool_rele(dmu_objset_pool(os), FTAG); } dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); return (error); } static int zfs_ioc_share(zfs_cmd_t *zc) { return (SET_ERROR(ENOSYS)); } ace_t full_access[] = { {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} }; /* * inputs: * zc_name name of containing filesystem * zc_obj object # beyond which we want next in-use object # * * outputs: * zc_obj next in-use object # */ static int zfs_ioc_next_obj(zfs_cmd_t *zc) { objset_t *os = NULL; int error; error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error != 0) return (error); error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); dmu_objset_rele(os, FTAG); return (error); } /* * inputs: * zc_name name of filesystem * zc_value prefix name for snapshot * zc_cleanup_fd cleanup-on-exit file descriptor for calling process * * outputs: * zc_value short name of new snapshot */ static int zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) { char *snap_name; char *hold_name; int error; minor_t minor; error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); if (error != 0) return (error); snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, (u_longlong_t)ddi_get_lbolt64()); hold_name = kmem_asprintf("%%%s", zc->zc_value); error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, hold_name); if (error == 0) (void) strlcpy(zc->zc_value, snap_name, sizeof (zc->zc_value)); kmem_strfree(snap_name); kmem_strfree(hold_name); zfs_onexit_fd_rele(zc->zc_cleanup_fd); return (error); } /* * inputs: * zc_name name of "to" snapshot * zc_value name of "from" snapshot * zc_cookie file descriptor to write diff data on * * outputs: * dmu_diff_record_t's to the file descriptor */ static int zfs_ioc_diff(zfs_cmd_t *zc) { zfs_file_t *fp; offset_t off; int error; if ((error = zfs_file_get(zc->zc_cookie, &fp))) return (error); off = zfs_file_off(fp); error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); zfs_file_put(zc->zc_cookie); return (error); } static int zfs_ioc_smb_acl(zfs_cmd_t *zc) { return (SET_ERROR(ENOTSUP)); } /* * innvl: { * "holds" -> { snapname -> holdname (string), ... } * (optional) "cleanup_fd" -> fd (int32) * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_hold[] = { {"holds", DATA_TYPE_NVLIST, 0}, {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) { nvpair_t *pair; nvlist_t *holds; int cleanup_fd = -1; int error; minor_t minor = 0; holds = fnvlist_lookup_nvlist(args, "holds"); /* make sure the user didn't pass us any invalid (empty) tags */ for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; pair = nvlist_next_nvpair(holds, pair)) { char *htag; error = nvpair_value_string(pair, &htag); if (error != 0) return (SET_ERROR(error)); if (strlen(htag) == 0) return (SET_ERROR(EINVAL)); } if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { error = zfs_onexit_fd_hold(cleanup_fd, &minor); if (error != 0) return (SET_ERROR(error)); } error = dsl_dataset_user_hold(holds, minor, errlist); if (minor != 0) zfs_onexit_fd_rele(cleanup_fd); return (SET_ERROR(error)); } /* * innvl is not used. * * outnvl: { * holdname -> time added (uint64 seconds since epoch) * ... * } */ static const zfs_ioc_key_t zfs_keys_get_holds[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) { return (dsl_dataset_get_holds(snapname, outnvl)); } /* * innvl: { * snapname -> { holdname, ... } * ... * } * * outnvl: { * snapname -> error value (int32) * ... * } */ static const zfs_ioc_key_t zfs_keys_release[] = { {"...", DATA_TYPE_NVLIST, ZK_WILDCARDLIST}, }; /* ARGSUSED */ static int zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist) { return (dsl_dataset_user_release(holds, errlist)); } /* * inputs: * zc_guid flags (ZEVENT_NONBLOCK) * zc_cleanup_fd zevent file descriptor * * outputs: * zc_nvlist_dst next nvlist event * zc_cookie dropped events since last get */ static int zfs_ioc_events_next(zfs_cmd_t *zc) { zfs_zevent_t *ze; nvlist_t *event = NULL; minor_t minor; uint64_t dropped = 0; int error; error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (error != 0) return (error); do { error = zfs_zevent_next(ze, &event, &zc->zc_nvlist_dst_size, &dropped); if (event != NULL) { zc->zc_cookie = dropped; error = put_nvlist(zc, event); nvlist_free(event); } if (zc->zc_guid & ZEVENT_NONBLOCK) break; if ((error == 0) || (error != ENOENT)) break; error = zfs_zevent_wait(ze); if (error != 0) break; } while (1); zfs_zevent_fd_rele(zc->zc_cleanup_fd); return (error); } /* * outputs: * zc_cookie cleared events count */ static int zfs_ioc_events_clear(zfs_cmd_t *zc) { int count; zfs_zevent_drain_all(&count); zc->zc_cookie = count; return (0); } /* * inputs: * zc_guid eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END * zc_cleanup zevent file descriptor */ static int zfs_ioc_events_seek(zfs_cmd_t *zc) { zfs_zevent_t *ze; minor_t minor; int error; error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze); if (error != 0) return (error); error = zfs_zevent_seek(ze, zc->zc_guid); zfs_zevent_fd_rele(zc->zc_cleanup_fd); return (error); } /* * inputs: * zc_name name of later filesystem or snapshot * zc_value full name of old snapshot or bookmark * * outputs: * zc_cookie space in bytes * zc_objset_type compressed space in bytes * zc_perm_action uncompressed space in bytes */ static int zfs_ioc_space_written(zfs_cmd_t *zc) { int error; dsl_pool_t *dp; dsl_dataset_t *new; error = dsl_pool_hold(zc->zc_name, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } if (strchr(zc->zc_value, '#') != NULL) { zfs_bookmark_phys_t bmp; error = dsl_bookmark_lookup(dp, zc->zc_value, new, &bmp); if (error == 0) { error = dsl_dataset_space_written_bookmark(&bmp, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); } } else { dsl_dataset_t *old; error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old); if (error == 0) { error = dsl_dataset_space_written(old, new, &zc->zc_cookie, &zc->zc_objset_type, &zc->zc_perm_action); dsl_dataset_rele(old, FTAG); } } dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } /* * innvl: { * "firstsnap" -> snapshot name * } * * outnvl: { * "used" -> space in bytes * "compressed" -> compressed space in bytes * "uncompressed" -> uncompressed space in bytes * } */ static const zfs_ioc_key_t zfs_keys_space_snaps[] = { {"firstsnap", DATA_TYPE_STRING, 0}, }; static int zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl) { int error; dsl_pool_t *dp; dsl_dataset_t *new, *old; char *firstsnap; uint64_t used, comp, uncomp; firstsnap = fnvlist_lookup_string(innvl, "firstsnap"); error = dsl_pool_hold(lastsnap, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); if (error == 0 && !new->ds_is_snapshot) { dsl_dataset_rele(new, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); if (error == 0 && !old->ds_is_snapshot) { dsl_dataset_rele(old, FTAG); error = SET_ERROR(EINVAL); } if (error != 0) { dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); return (error); } error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); dsl_dataset_rele(old, FTAG); dsl_dataset_rele(new, FTAG); dsl_pool_rele(dp, FTAG); fnvlist_add_uint64(outnvl, "used", used); fnvlist_add_uint64(outnvl, "compressed", comp); fnvlist_add_uint64(outnvl, "uncompressed", uncomp); return (error); } /* * innvl: { * "fd" -> file descriptor to write stream to (int32) * (optional) "fromsnap" -> full snap name to send an incremental from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "savedok" -> (value ignored) * presence indicates we should send a partially received snapshot * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "redactbook" -> (string) * if present, use this bookmark's redaction list to generate a redacted * send stream * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_send_new[] = { {"fd", DATA_TYPE_INT32, 0}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { int error; offset_t off; char *fromname = NULL; int fd; zfs_file_t *fp; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t resumeobj = 0; uint64_t resumeoff = 0; char *redactbook = NULL; fd = fnvlist_lookup_int32(innvl, "fd"); (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); if ((error = zfs_file_get(fd, &fp))) return (error); off = zfs_file_off(fp); dmu_send_outparams_t out = {0}; out.dso_outfunc = dump_bytes; out.dso_arg = fp; out.dso_dryrun = B_FALSE; error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactbook, fd, &off, &out); zfs_file_put(fd); return (error); } /* ARGSUSED */ static int send_space_sum(objset_t *os, void *buf, int len, void *arg) { uint64_t *size = arg; *size += len; return (0); } /* * Determine approximately how large a zfs send stream will be -- the number * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). * * innvl: { * (optional) "from" -> full snap or bookmark name to send an incremental * from * (optional) "largeblockok" -> (value ignored) * indicates that blocks > 128KB are permitted * (optional) "embedok" -> (value ignored) * presence indicates DRR_WRITE_EMBEDDED records are permitted * (optional) "compressok" -> (value ignored) * presence indicates compressed DRR_WRITE records are permitted * (optional) "rawok" -> (value ignored) * presence indicates raw encrypted records should be used. * (optional) "resume_object" and "resume_offset" -> (uint64) * if present, resume send stream from specified object and offset. * (optional) "fd" -> file descriptor to use as a cookie for progress * tracking (int32) * } * * outnvl: { * "space" -> bytes of space (uint64) * } */ static const zfs_ioc_key_t zfs_keys_send_space[] = { {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, }; static int zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) { dsl_pool_t *dp; dsl_dataset_t *tosnap; dsl_dataset_t *fromsnap = NULL; int error; char *fromname = NULL; char *redactlist_book = NULL; boolean_t largeblockok; boolean_t embedok; boolean_t compressok; boolean_t rawok; boolean_t savedok; uint64_t space = 0; boolean_t full_estimate = B_FALSE; uint64_t resumeobj = 0; uint64_t resumeoff = 0; uint64_t resume_bytes = 0; int32_t fd = -1; zfs_bookmark_phys_t zbm = {0}; error = dsl_pool_hold(snapname, FTAG, &dp); if (error != 0) return (error); error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); if (error != 0) { dsl_pool_rele(dp, FTAG); return (error); } (void) nvlist_lookup_int32(innvl, "fd", &fd); largeblockok = nvlist_exists(innvl, "largeblockok"); embedok = nvlist_exists(innvl, "embedok"); compressok = nvlist_exists(innvl, "compressok"); rawok = nvlist_exists(innvl, "rawok"); savedok = nvlist_exists(innvl, "savedok"); boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", &redactlist_book) == 0); (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); if (altbook) { full_estimate = B_TRUE; } else if (from) { if (strchr(fromname, '#')) { error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); /* * dsl_bookmark_lookup() will fail with EXDEV if * the from-bookmark and tosnap are at the same txg. * However, it's valid to do a send (and therefore, * a send estimate) from and to the same time point, * if the bookmark is redacted (the incremental send * can change what's redacted on the target). In * this case, dsl_bookmark_lookup() fills in zbm * but returns EXDEV. Ignore this error. */ if (error == EXDEV && zbm.zbm_redaction_obj != 0 && zbm.zbm_guid == dsl_dataset_phys(tosnap)->ds_guid) error = 0; if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & ZBM_FLAG_HAS_FBN)) { full_estimate = B_TRUE; } } else if (strchr(fromname, '@')) { error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); if (error != 0) { dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (error); } if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { full_estimate = B_TRUE; dsl_dataset_rele(fromsnap, FTAG); } } else { /* * from is not properly formatted as a snapshot or * bookmark */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); return (SET_ERROR(EINVAL)); } } if (full_estimate) { dmu_send_outparams_t out = {0}; offset_t off = 0; out.dso_outfunc = send_space_sum; out.dso_arg = &space; out.dso_dryrun = B_TRUE; /* * We have to release these holds so dmu_send can take them. It * will do all the error checking we need. */ dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); error = dmu_send(snapname, fromname, embedok, largeblockok, compressok, rawok, savedok, resumeobj, resumeoff, redactlist_book, fd, &off, &out); } else { error = dmu_send_estimate_fast(tosnap, fromsnap, (from && strchr(fromname, '#') != NULL ? &zbm : NULL), compressok || rawok, savedok, &space); space -= resume_bytes; if (fromsnap != NULL) dsl_dataset_rele(fromsnap, FTAG); dsl_dataset_rele(tosnap, FTAG); dsl_pool_rele(dp, FTAG); } fnvlist_add_uint64(outnvl, "space", space); return (error); } /* * Sync the currently open TXG to disk for the specified pool. * This is somewhat similar to 'zfs_sync()'. * For cases that do not result in error this ioctl will wait for * the currently open TXG to commit before returning back to the caller. * * innvl: { * "force" -> when true, force uberblock update even if there is no dirty data. * In addition this will cause the vdev configuration to be written * out including updating the zpool cache file. (boolean_t) * } * * onvl is unused */ static const zfs_ioc_key_t zfs_keys_pool_sync[] = { {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, }; /* ARGSUSED */ static int zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) { int err; boolean_t force = B_FALSE; spa_t *spa; if ((err = spa_open(pool, &spa, FTAG)) != 0) return (err); if (innvl) force = fnvlist_lookup_boolean_value(innvl, "force"); if (force) { spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); vdev_config_dirty(spa->spa_root_vdev); spa_config_exit(spa, SCL_CONFIG, FTAG); } txg_wait_synced(spa_get_dsl(spa), 0); spa_close(spa, FTAG); return (err); } /* * Load a user's wrapping key into the kernel. * innvl: { * "hidden_args" -> { "wkeydata" -> value } * raw uint8_t array of encryption wrapping key data (32 bytes) * (optional) "noop" -> (value ignored) * presence indicated key should only be verified, not loaded * } */ static const zfs_ioc_key_t zfs_keys_load_key[] = { {"hidden_args", DATA_TYPE_NVLIST, 0}, {"noop", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret; dsl_crypto_params_t *dcp = NULL; nvlist_t *hidden_args; boolean_t noop = nvlist_exists(innvl, "noop"); if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = SET_ERROR(EINVAL); goto error; } hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS); ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_load_wkey(dsname, dcp, noop); if (ret != 0) goto error; dsl_crypto_params_free(dcp, noop); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } /* * Unload a user's wrapping key from the kernel. * Both innvl and outnvl are unused. */ static const zfs_ioc_key_t zfs_keys_unload_key[] = { /* no nvl keys */ }; /* ARGSUSED */ static int zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret = 0; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto out; } ret = spa_keystore_unload_wkey(dsname); if (ret != 0) goto out; out: return (ret); } /* * Changes a user's wrapping key used to decrypt a dataset. The keyformat, * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified * here to change how the key is derived in userspace. * * innvl: { * "hidden_args" (optional) -> { "wkeydata" -> value } * raw uint8_t array of new encryption wrapping key data (32 bytes) * "props" (optional) -> { prop -> value } * } * * outnvl is unused */ static const zfs_ioc_key_t zfs_keys_change_key[] = { {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, }; /* ARGSUSED */ static int zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) { int ret; uint64_t cmd = DCP_CMD_NONE; dsl_crypto_params_t *dcp = NULL; nvlist_t *args = NULL, *hidden_args = NULL; if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { ret = (SET_ERROR(EINVAL)); goto error; } (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); (void) nvlist_lookup_nvlist(innvl, "props", &args); (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); if (ret != 0) goto error; ret = spa_keystore_change_key(dsname, dcp); if (ret != 0) goto error; dsl_crypto_params_free(dcp, B_FALSE); return (0); error: dsl_crypto_params_free(dcp, B_TRUE); return (ret); } static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; static void zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); vec->zvec_legacy_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_allow_log = log_history; vec->zvec_pool_check = pool_check; } /* * See the block comment at the beginning of this file for details on * each argument to this function. */ void zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) { zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; ASSERT3U(ioc, >=, ZFS_IOC_FIRST); ASSERT3U(ioc, <, ZFS_IOC_LAST); ASSERT3P(vec->zvec_legacy_func, ==, NULL); ASSERT3P(vec->zvec_func, ==, NULL); /* if we are logging, the name must be valid */ ASSERT(!allow_log || namecheck != NO_NAME); vec->zvec_name = name; vec->zvec_func = func; vec->zvec_secpolicy = secpolicy; vec->zvec_namecheck = namecheck; vec->zvec_pool_check = pool_check; vec->zvec_smush_outnvlist = smush_outnvlist; vec->zvec_allow_log = allow_log; vec->zvec_nvl_keys = nvl_keys; vec->zvec_nvl_key_count = num_keys; } static void zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, boolean_t log_history, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, POOL_NAME, log_history, pool_check); } void zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, pool_check); } static void zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, NO_NAME, B_FALSE, POOL_CHECK_NONE); } static void zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); } static void zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) { zfs_ioctl_register_dataset_read_secpolicy(ioc, func, zfs_secpolicy_read); } static void zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) { zfs_ioctl_register_legacy(ioc, func, secpolicy, DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); } static void zfs_ioctl_init(void) { zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, zfs_ioc_send_space, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space)); zfs_ioctl_register("create", ZFS_IOC_CREATE, zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); zfs_ioctl_register("clone", ZFS_IOC_CLONE, zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); zfs_ioctl_register("remap", ZFS_IOC_REMAP, zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); zfs_ioctl_register("hold", ZFS_IOC_HOLD, zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); zfs_ioctl_register("release", ZFS_IOC_RELEASE, zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, ARRAY_SIZE(zfs_keys_get_bookmark_props)); zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_destroy_bookmarks, ARRAY_SIZE(zfs_keys_destroy_bookmarks)); zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, zfs_ioc_load_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key)); zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, zfs_ioc_unload_key, zfs_secpolicy_load_key, DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key)); zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, zfs_ioc_change_key, zfs_secpolicy_change_key, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_change_key, ARRAY_SIZE(zfs_keys_change_key)); zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, zfs_ioc_channel_program, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_channel_program, ARRAY_SIZE(zfs_keys_channel_program)); zfs_ioctl_register("redact", ZFS_IOC_REDACT, zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); zfs_ioctl_register("zpool_discard_checkpoint", ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_discard_checkpoint, ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM, zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); zfs_ioctl_register("wait", ZFS_IOC_WAIT, zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); /* IOCTLS that use the legacy function signature */ zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, zfs_ioc_pool_scan); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, zfs_ioc_pool_upgrade); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, zfs_ioc_vdev_add); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, zfs_ioc_vdev_remove); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, zfs_ioc_vdev_set_state); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, zfs_ioc_vdev_attach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, zfs_ioc_vdev_detach); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, zfs_ioc_vdev_setpath); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, zfs_ioc_vdev_setfru); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, zfs_ioc_pool_set_props); zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, zfs_ioc_vdev_split); zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, zfs_ioc_pool_reguid); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, zfs_ioc_pool_configs, zfs_secpolicy_none); zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, zfs_ioc_pool_tryimport, zfs_secpolicy_config); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, zfs_ioc_inject_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, zfs_ioc_clear_fault, zfs_secpolicy_inject); zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, zfs_ioc_inject_list_next, zfs_secpolicy_inject); /* * pool destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_export * does the logging of those commands. */ zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, zfs_ioc_dsobj_to_dsname, zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, zfs_ioc_pool_get_history, zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, zfs_ioc_space_written); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, zfs_ioc_objset_recvd_props); zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, zfs_ioc_next_obj); zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, zfs_ioc_get_fsacl); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, zfs_ioc_objset_stats); zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, zfs_ioc_objset_zplprops); zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, zfs_ioc_dataset_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, zfs_ioc_snapshot_list_next); zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, zfs_ioc_send_progress); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, zfs_ioc_diff, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, zfs_ioc_obj_to_stats, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, zfs_ioc_obj_to_path, zfs_secpolicy_diff); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, zfs_ioc_send, zfs_secpolicy_send); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, zfs_secpolicy_none); zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, zfs_secpolicy_destroy); zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, zfs_secpolicy_rename); zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, zfs_secpolicy_recv); zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, zfs_secpolicy_promote); zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, zfs_secpolicy_set_fsacl); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, zfs_secpolicy_share, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, POOL_CHECK_NONE); zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); zfs_ioctl_init_os(); } /* * Verify that for non-legacy ioctls the input nvlist * pairs match against the expected input. * * Possible errors are: * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair */ static int zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) { const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; boolean_t required_keys_found = B_FALSE; /* * examine each input pair */ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { char *name = nvpair_name(pair); data_type_t type = nvpair_type(pair); boolean_t identified = B_FALSE; /* * check pair against the documented names and type */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { /* if not a wild card name, check for an exact match */ if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && strcmp(nvl_keys[k].zkey_name, name) != 0) continue; identified = B_TRUE; if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && nvl_keys[k].zkey_type != type) { return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); } if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; required_keys_found = B_TRUE; break; } /* allow an 'optional' key, everything else is invalid */ if (!identified && (strcmp(name, "optional") != 0 || type != DATA_TYPE_NVLIST)) { return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); } } /* verify that all required keys were found */ for (int k = 0; k < vec->zvec_nvl_key_count; k++) { if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) continue; if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { /* at least one non-optional key is expected here */ if (!required_keys_found) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); continue; } if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); } return (0); } static int pool_status_check(const char *name, zfs_ioc_namecheck_t type, zfs_ioc_poolcheck_t check) { spa_t *spa; int error; ASSERT(type == POOL_NAME || type == DATASET_NAME || type == ENTITY_NAME); if (check & POOL_CHECK_NONE) return (0); error = spa_open(name, &spa, FTAG); if (error == 0) { if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) error = SET_ERROR(EAGAIN); else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) error = SET_ERROR(EROFS); spa_close(spa, FTAG); } return (error); } int zfsdev_getminor(int fd, minor_t *minorp) { zfsdev_state_t *zs, *fpd; zfs_file_t *fp; int rc; ASSERT(!MUTEX_HELD(&zfsdev_state_lock)); if ((rc = zfs_file_get(fd, &fp))) return (rc); fpd = zfs_file_private(fp); if (fpd == NULL) return (SET_ERROR(EBADF)); mutex_enter(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == -1) continue; if (fpd == zs) { *minorp = fpd->zs_minor; mutex_exit(&zfsdev_state_lock); return (0); } } mutex_exit(&zfsdev_state_lock); return (SET_ERROR(EBADF)); } static void * zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which) { zfsdev_state_t *zs; for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { if (zs->zs_minor == minor) { smp_rmb(); switch (which) { case ZST_ONEXIT: return (zs->zs_onexit); case ZST_ZEVENT: return (zs->zs_zevent); case ZST_ALL: return (zs); } } } return (NULL); } void * zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) { void *ptr; ptr = zfsdev_get_state_impl(minor, which); return (ptr); } /* * Find a free minor number. The zfsdev_state_list is expected to * be short since it is only a list of currently open file handles. */ minor_t zfsdev_minor_alloc(void) { static minor_t last_minor = 0; minor_t m; ASSERT(MUTEX_HELD(&zfsdev_state_lock)); for (m = last_minor + 1; m != last_minor; m++) { if (m > ZFSDEV_MAX_MINOR) m = 1; if (zfsdev_get_state_impl(m, ZST_ALL) == NULL) { last_minor = m; return (m); } } return (0); } long zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) { int error, cmd; const zfs_ioc_vec_t *vec; char *saved_poolname = NULL; uint64_t max_nvlist_src_size; size_t saved_poolname_len = 0; nvlist_t *innvl = NULL; fstrans_cookie_t cookie; cmd = vecnum; error = 0; if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); vec = &zfs_ioc_vec[vecnum]; /* * The registered ioctl list may be sparse, verify that either * a normal or legacy handler are registered. */ if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); zc->zc_iflags = flag & FKIOCTL; max_nvlist_src_size = zfs_max_nvlist_src_size_os(); if (zc->zc_nvlist_src_size > max_nvlist_src_size) { /* * Make sure the user doesn't pass in an insane value for * zc_nvlist_src_size. We have to check, since we will end * up allocating that much memory inside of get_nvlist(). This * prevents a nefarious user from allocating tons of kernel * memory. * * Also, we return EINVAL instead of ENOMEM here. The reason * being that returning ENOMEM from an ioctl() has a special * connotation; that the user's size value is too small and * needs to be expanded to hold the nvlist. See * zcmd_expand_dst_nvlist() for details. */ error = SET_ERROR(EINVAL); /* User's size too big */ } else if (zc->zc_nvlist_src_size != 0) { error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, zc->zc_iflags, &innvl); if (error != 0) goto out; } /* * Ensure that all pool/dataset names are valid before we pass down to * the lower layers. */ zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; switch (vec->zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = SET_ERROR(EINVAL); else error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); break; case ENTITY_NAME: if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { error = SET_ERROR(EINVAL); } else { error = pool_status_check(zc->zc_name, vec->zvec_namecheck, vec->zvec_pool_check); } break; case NO_NAME: break; } /* * Ensure that all input pairs are valid before we pass them down * to the lower layers. * * The vectored functions can use fnvlist_lookup_{type} for any * required pairs since zfs_check_input_nvpairs() confirmed that * they exist and are of the correct type. */ if (error == 0 && vec->zvec_func != NULL) { error = zfs_check_input_nvpairs(innvl, vec); if (error != 0) goto out; } if (error == 0) { cookie = spl_fstrans_mark(); error = vec->zvec_secpolicy(zc, innvl, CRED()); spl_fstrans_unmark(cookie); } if (error != 0) goto out; /* legacy ioctls can modify zc_name */ /* * Can't use kmem_strdup() as we might truncate the string and * kmem_strfree() would then free with incorrect size. */ saved_poolname_len = strlen(zc->zc_name) + 1; saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; if (vec->zvec_func != NULL) { nvlist_t *outnvl; int puterror = 0; spa_t *spa; nvlist_t *lognv = NULL; ASSERT(vec->zvec_legacy_func == NULL); /* * Add the innvl to the lognv before calling the func, * in case the func changes the innvl. */ if (vec->zvec_allow_log) { lognv = fnvlist_alloc(); fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, vec->zvec_name); if (!nvlist_empty(innvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, innvl); } } outnvl = fnvlist_alloc(); cookie = spl_fstrans_mark(); error = vec->zvec_func(zc->zc_name, innvl, outnvl); spl_fstrans_unmark(cookie); /* * Some commands can partially execute, modify state, and still * return an error. In these cases, attempt to record what * was modified. */ if ((error == 0 || (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && vec->zvec_allow_log && spa_open(zc->zc_name, &spa, FTAG) == 0) { if (!nvlist_empty(outnvl)) { fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, outnvl); } if (error != 0) { fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, error); } (void) spa_history_log_nvl(spa, lognv); spa_close(spa, FTAG); } fnvlist_free(lognv); if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { int smusherror = 0; if (vec->zvec_smush_outnvlist) { smusherror = nvlist_smush(outnvl, zc->zc_nvlist_dst_size); } if (smusherror == 0) puterror = put_nvlist(zc, outnvl); } if (puterror != 0) error = puterror; nvlist_free(outnvl); } else { cookie = spl_fstrans_mark(); error = vec->zvec_legacy_func(zc); spl_fstrans_unmark(cookie); } out: nvlist_free(innvl); if (error == 0 && vec->zvec_allow_log) { char *s = tsd_get(zfs_allow_log_key); if (s != NULL) kmem_strfree(s); (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); } if (saved_poolname != NULL) kmem_free(saved_poolname, saved_poolname_len); return (error); } int zfs_kmod_init(void) { int error; if ((error = zvol_init()) != 0) return (error); spa_init(SPA_MODE_READ | SPA_MODE_WRITE); zfs_init(); zfs_ioctl_init(); mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); zfsdev_state_list->zs_minor = -1; if ((error = zfsdev_attach()) != 0) goto out; tsd_create(&zfs_fsyncer_key, NULL); tsd_create(&rrw_tsd_key, rrw_tsd_destroy); tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); return (0); out: zfs_fini(); spa_fini(); zvol_fini(); return (error); } void zfs_kmod_fini(void) { zfsdev_state_t *zs, *zsnext = NULL; zfsdev_detach(); mutex_destroy(&zfsdev_state_lock); for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) { zsnext = zs->zs_next; if (zs->zs_onexit) zfs_onexit_destroy(zs->zs_onexit); if (zs->zs_zevent) zfs_zevent_destroy(zs->zs_zevent); kmem_free(zs, sizeof (zfsdev_state_t)); } zfs_ereport_taskq_fini(); /* run before zfs_fini() on Linux */ zfs_fini(); spa_fini(); zvol_fini(); tsd_destroy(&zfs_fsyncer_key); tsd_destroy(&rrw_tsd_key); tsd_destroy(&zfs_allow_log_key); } /* BEGIN CSTYLED */ ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); /* END CSTYLED */